# LinkedInt # Scrapes LinkedIn without using LinkedIn API # Original scraper by @DisK0nn3cT (https://github.com/DisK0nn3cT/linkedin-gatherer) # Modified by @vysecurity # - Additions: # --- UI Updates # --- Constrain to company filters # --- Addition of Hunter for e-mail prediction #!/usr/bin/python import sys import re import time import requests import subprocess import json import argparse try: import http.cookiejar as cookielib from configparser import RawConfigParser from urllib.parse import quote_plus except ImportError: import cookielib from ConfigParser import RawConfigParser from urllib import quote_plus try: input = raw_input except NameError: pass import os import math import string from bs4 import BeautifulSoup #from thready import threaded try: reload(sys) sys.setdefaultencoding('utf-8') except Exception: pass """ Setup Argument Parameters """ parser = argparse.ArgumentParser(description='Discovery LinkedIn') parser.add_argument('-u', '--keywords', help='Keywords to search') parser.add_argument('-o', '--output', help='Output file (do not include extentions)') args = parser.parse_args() config = RawConfigParser() config.read('LinkedInt.cfg') api_key = config.get('API_KEYS', 'hunter') username = config.get('CREDS', 'linkedin_username') password = config.get('CREDS', 'linkedin_password') def login(): URL = 'https://www.linkedin.com' s = requests.Session() rv = s.get(URL + '/uas/login?trk=guest_homepage-basic_nav-header-signin') p = BeautifulSoup(rv.content, "html.parser") csrf = p.find(attrs={'name' : 'loginCsrfParam'})['value'] csrf_token = p.find(attrs={'name':'csrfToken'})['value'] sid_str = p.find(attrs={'name':'sIdString'})['value'] postdata = {'csrfToken':csrf_token, 'loginCsrfParam':csrf, 'sIdString':sid_str, 'session_key':username, 'session_password':password, } rv = s.post(URL + '/checkpoint/lg/login-submit', data=postdata) try: cookie = requests.utils.dict_from_cookiejar(s.cookies) cookie = cookie['li_at'] except: print("[!] Cannot log in") sys.exit(0) return cookie def get_search(): body = "" csv = [] css = """ """ header = """
""" # Do we want to automatically get the company ID? if bCompany: if bAuto: # Automatic # Grab from the URL companyID = 0 url = "https://www.linkedin.com/voyager/api/typeahead/hits?q=blended&query={}".format(search) headers = {'Csrf-Token':'ajax:0397788525211216808', 'X-RestLi-Protocol-Version':'2.0.0'} cookies['JSESSIONID'] = 'ajax:0397788525211216808' r = requests.get(url, cookies=cookies, headers=headers) content = json.loads(r.text) firstID = 0 for i in range(0,len(content['elements'])): try: companyID = content['elements'][i]['hitInfo']['com.linkedin.voyager.typeahead.TypeaheadCompany']['id'] if firstID == 0: firstID = companyID print("[Notice] Found company ID: {}".format(companyID)) except: continue companyID = firstID if companyID == 0: print("[WARNING] No valid company ID found in auto, please restart and find your own") else: # Don't auto, use the specified ID companyID = bSpecific print() print("[*] Using company ID: {}".format(companyID)) # Fetch the initial page to get results/page counts if bCompany == False: url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords={}&origin=OTHER&q=guided&start=0".format(search) else: url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->{})&origin=OTHER&q=guided&start=0".format(companyID) print(url) headers = {'Csrf-Token':'ajax:0397788525211216808', 'X-RestLi-Protocol-Version':'2.0.0'} cookies['JSESSIONID'] = 'ajax:0397788525211216808' r = requests.get(url, cookies=cookies, headers=headers) content = json.loads(r.text) data_total = content['elements'][0]['total'] # Calculate pages off final results at 40 results/page pages = int(math.ceil(data_total / 40.0)) if pages == 0: pages = 1 if data_total % 40 == 0: # Becuase we count 0... Subtract a page if there are no left over results on the last page pages = pages - 1 if pages == 0: print("[!] Try to use quotes in the search name") sys.exit(0) print("[*] {} Results Found".format(data_total)) if data_total > 1000: pages = 25 print("[*] LinkedIn only allows 1000 results. Refine keywords to capture all data") print("[*] Fetching {} Pages".format(pages)) print() for p in range(pages): # Request results for each page using the start offset if bCompany == False: url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords={}&origin=OTHER&q=guided&start={}".format(search, p*40) else: url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->{})&origin=OTHER&q=guided&start={}".format(companyID, p*40) r = requests.get(url, cookies=cookies, headers=headers) content = r.text.encode('UTF-8') content = json.loads(content) print("[*] Fetching page {} with {} results".format((p),len(content['elements'][0]['elements']))) for c in content['elements'][0]['elements']: if 'com.linkedin.voyager.search.SearchProfile' in c['hitInfo'] and c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['headless'] == False: try: data_industry = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['industry'] except: data_industry = "" data_firstname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['firstName'] data_lastname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['lastName'] data_slug = "https://www.linkedin.com/in/{}".format(c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['publicIdentifier']) data_occupation = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['occupation'] data_location = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['location'] try: data_picture = "{}{}".format(c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['picture']['com.linkedin.common.VectorImage']['rootUrl'],c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['picture']['com.linkedin.common.VectorImage']['artifacts'][2]['fileIdentifyingUrlPathSegment']) except: print("[*] No picture found for {} {}, {}".format(data_firstname, data_lastname, data_occupation)) data_picture = "" # incase the last name is multi part, we will split it down parts = data_lastname.split() name = data_firstname + " " + data_lastname fname = "" mname = "" lname = "" if len(parts) == 1: fname = data_firstname mname = '?' lname = parts[0] elif len(parts) == 2: fname = data_firstname mname = parts[0] lname = parts[1] elif len(parts) >= 3: fname = data_firstname lname = parts[0] else: fname = data_firstname lname = '?' fname = re.sub('[^A-Za-z]+', '', fname) mname = re.sub('[^A-Za-z]+', '', mname) lname = re.sub('[^A-Za-z]+', '', lname) if len(fname) == 0 or len(lname) == 0: # invalid user, let's move on, this person has a weird name continue #come here if prefix == 'full': user = '{}{}{}'.format(fname, mname, lname) if prefix == 'firstlast': user = '{}{}'.format(fname, lname) if prefix == 'firstmlast': if len(mname) == 0: user = '{}{}{}'.format(fname, mname, lname) else: user = '{}{}{}'.format(fname, mname[0], lname) if prefix == 'flast': user = '{}{}'.format(fname[0], lname) if prefix == 'firstl': user = '{}{}'.format(fname,lname[0]) if prefix == 'first.last': user = '{}.{}'.format(fname, lname) if prefix == 'fmlast': if len(mname) == 0: user = '{}{}{}'.format(fname[0], mname, lname) else: user = '{}{}{}'.format(fname[0], mname[0], lname) if prefix == 'lastfirst': user = '{}{}'.format(lname, fname) email = '{}@{}'.format(user, suffix) body += "" \ "" \ "" \ "" \ "" \ "" \ "".format(data_slug, data_picture, data_slug, name, email, data_occupation, data_location) csv.append('"{}","{}","{}","{}","{}", "{}"'.format(data_firstname, data_lastname, name, email, data_occupation, data_location.replace(",",";"))) foot = "
Photo Name Possible Email: Job Location
{}{}{}{}
" f = open('{}.html'.format(outfile), 'w') f.write(css) f.write(header) f.write(body) f.write(foot) f.close() f = open('{}.csv'.format(outfile), 'w') f.writelines('\n'.join(csv)) f.close() else: print("[!] Headless profile found. Skipping") print() def banner(): with open('banner.txt', 'r') as f: data = f.read() print("\033[1;31m{}\033[0;0m".format(data)) print("\033[1;34mProviding you with Linkedin Intelligence") print("\033[1;32mAuthor: Vincent Yiu (@vysec, @vysecurity)\033[0;0m") print("\033[1;32mOriginal version by @DisK0nn3cT\033[0;0m") def authenticate(): try: a = login() print(a) session = a if len(session) == 0: sys.exit("[!] Unable to login to LinkedIn.com") print("[*] Obtained new session: {}".format(session)) cookies = dict(li_at=session) except Exception as e: sys.exit("[!] Could not authenticate to linkedin. {}".format(e)) return cookies if __name__ == '__main__': banner() # Prompt user for data variables search = args.keywords if args.keywords!=None else input("[*] Enter search Keywords (use quotes for more precise results)\n") print() outfile = args.output if args.output!=None else input("[*] Enter filename for output (exclude file extension)\n") print() while True: bCompany = input("[*] Filter by Company? (Y/N): \n") if bCompany.lower() == "y" or bCompany.lower() == "n": break else: print("[!] Incorrect choice") if bCompany.lower() == "y": bCompany = True else: bCompany = False bAuto = True bSpecific = 0 prefix = "" suffix = "" print() if bCompany: while True: bSpecific = input("[*] Specify a Company ID (Provide ID or leave blank to automate): \n") if bSpecific != "": bAuto = False if bSpecific != 0: try: int(bSpecific) break except: print("[!] Incorrect choice, the ID either has to be a number or blank") else: print("[!] Incorrect choice, the ID either has to be a number or blank") else: bAuto = True break print() while True: suffix = input("[*] Enter e-mail domain suffix (eg. contoso.com): \n") suffix = suffix.lower() if "." in suffix: break else: print("[!] Incorrect e-mail? There's no dot") print() while True: prefix = input("[*] Select a prefix for e-mail generation (auto,full,firstlast,firstmlast,flast,firstl,first.last,fmlast,lastfirst): \n") prefix = prefix.lower() print() if prefix == "full" or prefix == "firstlast" or prefix == "firstmlast" or prefix == "flast" or prefix == "firstl" or prefix =="first" or prefix == "first.last" or prefix == "fmlast" or prefix == "lastfirst": break elif prefix == "auto": #if auto prefix then we want to use hunter IO to find it. print("[*] Automatically using Hunter IO to determine best Prefix") url = "https://hunter.io/trial/v2/domain-search?offset=0&domain={}&format=json".format(suffix) r = requests.get(url) content = json.loads(r.text) if "status" in content: print("[!] Rate limited by Hunter IO trial") url = "https://api.hunter.io/v2/domain-search?domain={}&api_key={}".format(suffix, api_key) r = requests.get(url) content = json.loads(r.text) if "status" in content: print("[!] Rate limited by Hunter IO Key") continue #print content prefix = content['data']['pattern'] print("[!] {}".format(prefix)) if prefix: prefix = prefix.replace("{","").replace("}", "") if prefix == "full" or prefix == "firstlast" or prefix == "firstmlast" or prefix == "flast" or prefix == "firstl" or prefix =="first" or prefix == "first.last" or prefix == "fmlast" or prefix == "lastfirst": print("[+] Found {} prefix".format(prefix)) break else: print("[!] Automatic prefix search failed, please insert a manual choice") continue else: print("[!] Automatic prefix search failed, please insert a manual choice") continue else: print("[!] Incorrect choice, please select a value from (auto,full,firstlast,firstmlast,flast,firstl,first.last,fmlast)") print() # URL Encode for the querystring search = quote_plus(search) cookies = authenticate() # Initialize Scraping get_search() print("[+] Complete")