From 39b1ab20ed881bda67a18466cc3c0c24866d98e0 Mon Sep 17 00:00:00 2001
From: vysec
Date: Thu, 8 Mar 2018 21:22:02 +0000
Subject: [PATCH] push

---
 LinkedInt.py | 426 +++++++++++++++++++++++++++++++++++++++++++++++++++
 banner.txt   |   7 +
 2 files changed, 433 insertions(+)
 create mode 100644 LinkedInt.py
 create mode 100644 banner.txt

diff --git a/LinkedInt.py b/LinkedInt.py
new file mode 100644
index 0000000..515f088
--- /dev/null
+++ b/LinkedInt.py
@@ -0,0 +1,426 @@
+#!/usr/bin/python
+# LinkedInt
+# Scrapes LinkedIn without using the LinkedIn API
+# Original scraper by @DisK0nn3cT (https://github.com/DisK0nn3cT/linkedin-gatherer)
+# Modified by @vysecurity
+# - Additions:
+# --- UI updates
+# --- Constrain to company filters
+# --- Addition of Hunter for e-mail prediction
+
+import sys
+import re
+import time
+import requests
+import subprocess
+import json
+import argparse
+import cookielib
+import os
+import urllib
+import math
+import urllib2
+import string
+from bs4 import BeautifulSoup
+from thready import threaded
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+""" Setup Argument Parameters """
+parser = argparse.ArgumentParser(description='LinkedIn discovery')
+parser.add_argument('-u', '--keywords', help='Keywords to search')
+parser.add_argument('-o', '--output', help='Output file (do not include extensions)')
+args = parser.parse_args()
+api_key = ""   # Hunter API key
+username = ""  # LinkedIn username
+password = ""  # LinkedIn password
+
+def login():
+    cookie_filename = "cookies.txt"
+    cookiejar = cookielib.MozillaCookieJar(cookie_filename)
+    opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(), urllib2.HTTPHandler(debuglevel=0), urllib2.HTTPSHandler(debuglevel=0), urllib2.HTTPCookieProcessor(cookiejar))
+    page = loadPage(opener, "https://www.linkedin.com/")
+    parse = BeautifulSoup(page, "html.parser")
+
+    csrf = parse.find(id="loginCsrfParam-login")['value']
+
+    login_data = urllib.urlencode({'session_key': username, 'session_password': password, 'loginCsrfParam': csrf})
+    page = loadPage(opener, "https://www.linkedin.com/uas/login-submit", login_data)
+
+    parse = BeautifulSoup(page, "html.parser")
+    cookie = ""
+
+    try:
+        # li_at is the authenticated LinkedIn session cookie
+        cookie = cookiejar._cookies['.www.linkedin.com']['/']['li_at'].value
+    except:
+        sys.exit("[!] Login failed: no li_at session cookie was set")
+
+    cookiejar.save()
+    os.remove(cookie_filename)
+    return cookie
+
+def loadPage(client, url, data=None):
+    # POST when form data is supplied, otherwise GET
+    try:
+        if data is not None:
+            response = client.open(url, data)
+        else:
+            response = client.open(url)
+        return ''.join(response.readlines())
+    except:
+        print "[!] Cannot load page: %s" % url
+        sys.exit(0)
+
+def get_search():
+
+    body = ""
+    csv = []
+    css = """
+
+    """
+
+    header = """<center><table>
+        <tr>
+        <th>Photo</th>
+        <th>Name</th>
+        <th>Possible Email:</th>
+        <th>Job</th>
+        <th>Location</th>
+        </tr>
+        """
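+    # NOTE: the company-ID auto-detection below queries LinkedIn's internal
+    # Voyager typeahead endpoint with the search keywords; the numeric ID of
+    # the first company hit is reused later as the facetCurrentCompany
+    # filter in the people search.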
+    # Do we want to automatically get the company ID?
+    if bCompany:
+        if bAuto:
+            # Automatic: resolve the company ID from the search keywords
+            companyID = 0
+            url = "https://www.linkedin.com/voyager/api/typeahead/hits?q=blended&query=%s" % search
+            headers = {'Csrf-Token': 'ajax:0397788525211216808', 'X-RestLi-Protocol-Version': '2.0.0'}
+            cookies['JSESSIONID'] = 'ajax:0397788525211216808'
+            r = requests.get(url, cookies=cookies, headers=headers)
+            content = json.loads(r.text)
+            firstID = 0
+            for i in range(0, len(content['elements'])):
+                try:
+                    companyID = content['elements'][i]['hitInfo']['com.linkedin.voyager.typeahead.TypeaheadCompany']['id']
+                    if firstID == 0:
+                        firstID = companyID
+                    print "[Notice] Found company ID: %s" % companyID
+                except:
+                    continue
+            # Keep the first ID that was returned; later hits are informational
+            companyID = firstID
+            if companyID == 0:
+                print "[WARNING] No valid company ID found in auto mode, please restart and provide your own"
+        else:
+            # Don't auto, use the specified ID
+            companyID = bSpecific
+
+        print
+        print "[*] Using company ID: %s" % companyID
+
+    # Fetch the initial page to get the result/page counts
+    if not bCompany:
+        url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=OTHER&q=guided&start=0" % search
+    else:
+        url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->%s)&origin=OTHER&q=guided&start=0" % companyID
+
+    headers = {'Csrf-Token': 'ajax:0397788525211216808', 'X-RestLi-Protocol-Version': '2.0.0'}
+    cookies['JSESSIONID'] = 'ajax:0397788525211216808'
+    r = requests.get(url, cookies=cookies, headers=headers)
+    content = json.loads(r.text)
+    data_total = content['elements'][0]['total']
+
+    # Calculate the number of pages at 40 results/page
+    pages = int(math.ceil(data_total / 40.0))
+
+    if pages == 0:
+        print "[!] No results found. Try using quotes around the search name"
+        sys.exit(0)
+
+    print "[*] %i Results Found" % data_total
+    if data_total > 1000:
+        pages = 25
+        print "[*] LinkedIn only allows 1000 results. Refine keywords to capture all data"
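+    # LinkedIn's guided search exposes at most 1000 results, i.e. 25 pages
+    # of 40, which is why pages is clamped to 25 above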
+    print "[*] Fetching %i Pages" % pages
+    print
+
+    for p in range(pages):
+        # Request results for each page using the start offset
+        if not bCompany:
+            url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=OTHER&q=guided&start=%i" % (search, p * 40)
+        else:
+            url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->%s)&origin=OTHER&q=guided&start=%i" % (companyID, p * 40)
+        r = requests.get(url, cookies=cookies, headers=headers)
+        content = r.text.encode('UTF-8')
+        content = json.loads(content)
+        print "[*] Fetching page %i with %i results" % (p + 1, len(content['elements'][0]['elements']))
+        for c in content['elements'][0]['elements']:
+            if 'com.linkedin.voyager.search.SearchProfile' in c['hitInfo'] and c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['headless'] == False:
+                try:
+                    data_industry = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['industry']
+                except:
+                    data_industry = ""
+                data_firstname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['firstName']
+                data_lastname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['lastName']
+                data_slug = "https://www.linkedin.com/in/%s" % c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['publicIdentifier']
+                data_occupation = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['occupation']
+                data_location = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['location']
+                try:
+                    data_picture = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" % c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['picture']['com.linkedin.voyager.common.MediaProcessorImage']['id']
+                except:
+                    print "[*] No picture found for %s %s, %s" % (data_firstname, data_lastname, data_occupation)
+                    data_picture = ""
+
+                # In case the last name is multi-part, split it down
+                parts = data_lastname.split()
+
+                name = data_firstname + " " + data_lastname
+                fname = ""
+                mname = ""
+                lname = ""
+
+                if len(parts) == 1:
+                    fname = data_firstname
+                    mname = '?'
+                    lname = parts[0]
+                elif len(parts) == 2:
+                    fname = data_firstname
+                    mname = parts[0]
+                    lname = parts[1]
+                elif len(parts) >= 3:
+                    fname = data_firstname
+                    lname = parts[0]
+                else:
+                    fname = data_firstname
+                    lname = '?'
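+                # How the split feeds e-mail generation: one part keeps it
+                # as the last name; two parts, e.g. "Smith Jones", give
+                # mname "Smith" and lname "Jones"; three or more parts keep
+                # only the first as the last name and drop the rest.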
+                fname = re.sub('[^A-Za-z]+', '', fname)
+                mname = re.sub('[^A-Za-z]+', '', mname)
+                lname = re.sub('[^A-Za-z]+', '', lname)
+
+                if len(fname) == 0 or len(lname) == 0:
+                    # Invalid user; skip profiles with unusable names
+                    continue
+
+                if prefix == 'full':
+                    user = '{}{}{}'.format(fname, mname, lname)
+                if prefix == 'first':
+                    # 'first' can also come back from Hunter's pattern detection
+                    user = fname
+                if prefix == 'firstlast':
+                    user = '{}{}'.format(fname, lname)
+                if prefix == 'firstmlast':
+                    if len(mname) == 0:
+                        user = '{}{}{}'.format(fname, mname, lname)
+                    else:
+                        user = '{}{}{}'.format(fname, mname[0], lname)
+                if prefix == 'flast':
+                    user = '{}{}'.format(fname[0], lname)
+                if prefix == 'first.last':
+                    user = '{}.{}'.format(fname, lname)
+                if prefix == 'fmlast':
+                    if len(mname) == 0:
+                        user = '{}{}{}'.format(fname[0], mname, lname)
+                    else:
+                        user = '{}{}{}'.format(fname[0], mname[0], lname)
+                if prefix == 'lastfirst':
+                    user = '{}{}'.format(lname, fname)
+
+                email = '{}@{}'.format(user, suffix)
+
+                body += "<tr>" \
+                    "<td><a href='%s'><img src='%s'></a></td>" \
+                    "<td><a href='%s'>%s</a></td>" \
+                    "<td>%s</td>" \
+                    "<td>%s</td>" \
+                    "<td>%s</td>" \
+                    "</tr>" % (data_slug, data_picture, data_slug, name, email, data_occupation, data_location)
+
+                csv.append('"%s","%s","%s","%s","%s","%s"' % (data_firstname, data_lastname, name, email, data_occupation, data_location.replace(",", ";")))
+
+                foot = "</table></center>"
+
+                # Rewrite the report files after every profile so partial
+                # results are kept if the scrape is interrupted
+                f = open('{}.html'.format(outfile), 'wb')
+                f.write(css)
+                f.write(header)
+                f.write(body)
+                f.write(foot)
+                f.close()
+                f = open('{}.csv'.format(outfile), 'wb')
+                f.writelines('\n'.join(csv))
+                f.close()
+            else:
+                print "[!] Headless profile found. Skipping"
+    print
+
+def banner():
+    with open('banner.txt', 'r') as f:
+        data = f.read()
+
+    print "\033[1;31m%s\033[0;0m" % data
+    print "\033[1;34mProviding you with LinkedIn Intelligence\033[0;0m"
+    print "\033[1;32mAuthor: Vincent Yiu (@vysec, @vysecurity)\033[0;0m"
+    print "\033[1;32mOriginal version by @DisK0nn3cT\033[0;0m"
+
+def authenticate():
+    try:
+        session = login()
+        if len(session) == 0:
+            sys.exit("[!] Unable to login to LinkedIn.com")
+        print "[*] Obtained new session: %s" % session
+        cookies = dict(li_at=session)
+    except Exception, e:
+        sys.exit("[!] Could not authenticate to LinkedIn. %s" % e)
+    return cookies
+
+if __name__ == '__main__':
+    banner()
+    # Prompt the user for the data variables
+    search = args.keywords if args.keywords is not None else raw_input("[*] Enter search keywords (use quotes for more precise results)\n")
+    print
+    outfile = args.output if args.output is not None else raw_input("[*] Enter filename for output (exclude file extension)\n")
+    print
+    while True:
+        bCompany = raw_input("[*] Filter by company? (Y/N): \n")
+        if bCompany.lower() == "y" or bCompany.lower() == "n":
+            break
+        else:
+            print "[!] Incorrect choice"
+
+    bCompany = bCompany.lower() == "y"
+
+    bAuto = True
+    bSpecific = 0
+    prefix = ""
+    suffix = ""
+
+    print
+
+    if bCompany:
+        while True:
+            bSpecific = raw_input("[*] Specify a company ID (provide an ID or leave blank to automate): \n")
+            if bSpecific != "":
+                try:
+                    int(bSpecific)
+                    bAuto = False
+                    break
+                except ValueError:
+                    print "[!] Incorrect choice, the ID has to be a number or blank"
+            else:
+                bAuto = True
+                break
+
+    print
+
+    while True:
+        suffix = raw_input("[*] Enter e-mail domain suffix (e.g. contoso.com): \n")
+        suffix = suffix.lower()
+        if "." in suffix:
+            break
+        else:
+            print "[!] Invalid e-mail domain: there is no dot"
+
+    print
+
+    while True:
+        prefix = raw_input("[*] Select a prefix for e-mail generation (auto,full,first,firstlast,firstmlast,flast,first.last,fmlast,lastfirst): \n")
+        prefix = prefix.lower()
+        print
+        if prefix in ("full", "first", "firstlast", "firstmlast", "flast", "first.last", "fmlast", "lastfirst"):
+            break
+        elif prefix == "auto":
+            # If auto, use Hunter to determine the best prefix
+            print "[*] Automatically using Hunter to determine the best prefix"
+            url = "https://hunter.io/trial/v2/domain-search?offset=0&domain=%s&format=json" % suffix
+            r = requests.get(url)
+            content = json.loads(r.text)
+            if "status" in content:
+                print "[!] Rate limited by the Hunter trial, falling back to the API key"
+                url = "https://api.hunter.io/v2/domain-search?domain=%s&api_key=%s" % (suffix, api_key)
+                r = requests.get(url)
+                content = json.loads(r.text)
+                if "status" in content:
+                    print "[!] Rate limited by the Hunter API key"
+                    continue
+            prefix = content['data']['pattern']
+            print "[*] Hunter pattern: %s" % prefix
+            # Hunter reports patterns in brace form, e.g. "{first}.{last}";
+            # stripping the braces maps them onto the prefix names above
+            if prefix:
+                prefix = prefix.replace("{", "").replace("}", "")
+                if prefix in ("full", "first", "firstlast", "firstmlast", "flast", "first.last", "fmlast", "lastfirst"):
+                    print "[+] Found %s prefix" % prefix
+                    break
+                else:
+                    print "[!] Automatic prefix search failed, please enter a manual choice"
+                    continue
+            else:
+                print "[!] Automatic prefix search failed, please enter a manual choice"
+                continue
+        else:
+            print "[!] Incorrect choice, please select a value from (auto,full,first,firstlast,firstmlast,flast,first.last,fmlast,lastfirst)"
+
+    print
+
+    # URL-encode the querystring
+    search = urllib.quote_plus(search)
+    cookies = authenticate()
+
+    # Initialize scraping
+    get_search()
+
+    print "[+] Complete"
\ No newline at end of file
diff --git a/banner.txt b/banner.txt
new file mode 100644
index 0000000..f4397ab
--- /dev/null
+++ b/banner.txt
@@ -0,0 +1,7 @@
+██╗     ██╗███╗   ██╗██╗  ██╗███████╗██████╗ ██╗███╗   ██╗████████╗
+██║     ██║████╗  ██║██║ ██╔╝██╔════╝██╔══██╗██║████╗  ██║╚══██╔══╝
+██║     ██║██╔██╗ ██║█████╔╝ █████╗  ██║  ██║██║██╔██╗ ██║   ██║
+██║     ██║██║╚██╗██║██╔═██╗ ██╔══╝  ██║  ██║██║██║╚██╗██║   ██║
+███████╗██║██║ ╚████║██║  ██╗███████╗██████╔╝██║██║ ╚████║   ██║
+╚══════╝╚═╝╚═╝  ╚═══╝╚═╝  ╚═╝╚══════╝╚═════╝ ╚═╝╚═╝  ╚═══╝   ╚═╝
+
\ No newline at end of file