Mirror of https://github.com/vysecurity/LinkedInt.git (synced 2025-12-10 02:43:33 +01:00)
Added baseDir variable to external file paths so the script can be run outside of the working directory
431 lines
16 KiB
Python
Executable file
#!/usr/bin/python
# LinkedInt
# Scrapes LinkedIn without using the LinkedIn API
# Original scraper by @DisK0nn3cT (https://github.com/DisK0nn3cT/linkedin-gatherer)
# Modified by @vysecurity
# - Additions:
# --- UI Updates
# --- Constrain to company filters
# --- Addition of Hunter for e-mail prediction

import sys
import re
import time
import requests
import subprocess
import json
import argparse
import cookielib
import ConfigParser
import os
import urllib
import math
import urllib2
import string
from bs4 import BeautifulSoup
from thready import threaded

reload(sys)
sys.setdefaultencoding('utf-8')
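
# NOTE: this is Python 2 code: cookielib, urllib2, ConfigParser, raw_input
# and the print statements used throughout were all renamed or removed in
# Python 3.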
""" Setup Argument Parameters """
|
|
parser = argparse.ArgumentParser(description='Discovery LinkedIn')
|
|
parser.add_argument('-u', '--keywords', help='Keywords to search')
|
|
parser.add_argument('-o', '--output', help='Output file (do not include extentions)')
|
|
args = parser.parse_args()
|
|
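
# Example invocation (both flags are optional; the script prompts for any
# value that is omitted):
#   ./LinkedInt.py -u "contoso" -o contoso_employees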

baseDir = os.path.dirname(os.path.realpath(sys.argv[0])) + os.path.sep
config = ConfigParser.RawConfigParser()
config.read(baseDir + 'LinkedInt.cfg')
api_key = config.get('API_KEYS', 'hunter')
username = config.get('CREDS', 'linkedin_username')
password = config.get('CREDS', 'linkedin_password')
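
# LinkedInt.cfg lives next to the script. Based on the keys read above, a
# minimal config looks like this (values are placeholders, not real creds):
#
#   [API_KEYS]
#   hunter = your_hunter_io_api_key
#
#   [CREDS]
#   linkedin_username = you@example.com
#   linkedin_password = your_password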

def login():
    cookie_filename = baseDir + "cookies.txt"
    cookiejar = cookielib.MozillaCookieJar(cookie_filename)
    opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(), urllib2.HTTPHandler(debuglevel=0), urllib2.HTTPSHandler(debuglevel=0), urllib2.HTTPCookieProcessor(cookiejar))
    page = loadPage(opener, "https://www.linkedin.com/")
    parse = BeautifulSoup(page, "html.parser")

    csrf = parse.find(id="loginCsrfParam-login")['value']

    login_data = urllib.urlencode({'session_key': username, 'session_password': password, 'loginCsrfParam': csrf})
    page = loadPage(opener, "https://www.linkedin.com/uas/login-submit", login_data)

    cookie = ""
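    # li_at is LinkedIn's long-lived session cookie; if it is present in the
    # jar after the login POST, authentication succeeded.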
    try:
        cookie = cookiejar._cookies['.www.linkedin.com']['/']['li_at'].value
    except:
        print "[!] Cannot log in"
        sys.exit(0)

    cookiejar.save()
    os.remove(cookie_filename)
    return cookie


def loadPage(client, url, data=None):
    # POST when form data is supplied, otherwise plain GET
    try:
        if data is not None:
            response = client.open(url, data)
        else:
            response = client.open(url)
        return ''.join(response.readlines())
    except:
        print "[!] Cannot load page %s" % url
        sys.exit(0)

def get_search():
    body = ""
    csv = []
    css = """<style>
#employees {
    font-family: "Trebuchet MS", Arial, Helvetica, sans-serif;
    border-collapse: collapse;
    width: 100%;
}

#employees td, #employees th {
    border: 1px solid #ddd;
    padding: 8px;
}

#employees tr:nth-child(even){background-color: #f2f2f2;}

#employees tr:hover {background-color: #ddd;}

#employees th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #4CAF50;
    color: white;
}
</style>

"""

    header = """<center><table id="employees">
<tr>
<th>Photo</th>
<th>Name</th>
<th>Possible Email</th>
<th>Job</th>
<th>Location</th>
</tr>
"""

    # Do we want to automatically get the company ID?
    if bCompany:
        if bAuto:
            # Automatic
            # Grab from the URL
            companyID = 0
            url = "https://www.linkedin.com/voyager/api/typeahead/hits?q=blended&query=%s" % search
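            # Voyager API calls must send a Csrf-Token header whose value
            # matches the JSESSIONID cookie; the script pins an arbitrary
            # hardcoded value for both, which LinkedIn accepts.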
            headers = {'Csrf-Token': 'ajax:0397788525211216808', 'X-RestLi-Protocol-Version': '2.0.0'}
            cookies['JSESSIONID'] = 'ajax:0397788525211216808'
            r = requests.get(url, cookies=cookies, headers=headers)
            content = json.loads(r.text)
            firstID = 0
            for i in range(0, len(content['elements'])):
                try:
                    companyID = content['elements'][i]['hitInfo']['com.linkedin.voyager.typeahead.TypeaheadCompany']['id']
                    if firstID == 0:
                        firstID = companyID
                    print "[Notice] Found company ID: %s" % companyID
                except:
                    continue
            companyID = firstID

            if companyID == 0:
                print "[WARNING] No valid company ID found in auto, please restart and find your own"
        else:
            # Don't auto, use the specified ID
            companyID = bSpecific

        print
        print "[*] Using company ID: %s" % companyID

    # Fetch the initial page to get result/page counts
    if bCompany == False:
        url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=OTHER&q=guided&start=0" % search
    else:
        url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->%s)&origin=OTHER&q=guided&start=0" % (companyID)

    print url

    headers = {'Csrf-Token': 'ajax:0397788525211216808', 'X-RestLi-Protocol-Version': '2.0.0'}
    cookies['JSESSIONID'] = 'ajax:0397788525211216808'
    r = requests.get(url, cookies=cookies, headers=headers)
    content = json.loads(r.text)
    data_total = content['elements'][0]['total']

    # Calculate pages from the total at 40 results/page; math.ceil already
    # rounds up exactly, including totals that are exact multiples of 40.
    if data_total == 0:
        print "[!] No results. Try to use quotes in the search name"
        sys.exit(0)

    pages = int(math.ceil(data_total / 40.0))
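    # e.g. 85 results -> ceil(85/40.0) = 3 pages fetched at offsets 0, 40
    # and 80, while exactly 80 results yields 2 pages.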
print "[*] %i Results Found" % data_total
|
|
if data_total > 1000:
|
|
pages = 25
|
|
print "[*] LinkedIn only allows 1000 results. Refine keywords to capture all data"
|
|
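        # 25 pages x 40 results/page = the 1000-result visibility cap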
print "[*] Fetching %i Pages" % pages
|
|
print
|
|
|
|
    for p in range(pages):
        # Request results for each page using the start offset
        if bCompany == False:
            url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=OTHER&q=guided&start=%i" % (search, p * 40)
        else:
            url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List(v->PEOPLE,facetCurrentCompany->%s)&origin=OTHER&q=guided&start=%i" % (companyID, p * 40)
        r = requests.get(url, cookies=cookies, headers=headers)
        content = r.text.encode('UTF-8')
        content = json.loads(content)
        print "[*] Fetching page %i with %i results" % (p, len(content['elements'][0]['elements']))
        for c in content['elements'][0]['elements']:
            if 'com.linkedin.voyager.search.SearchProfile' in c['hitInfo'] and c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['headless'] == False:
                profile = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']
                mini = profile['miniProfile']
                try:
                    data_industry = profile['industry']
                except:
                    data_industry = ""
                data_firstname = mini['firstName']
                data_lastname = mini['lastName']
                data_slug = "https://www.linkedin.com/in/%s" % mini['publicIdentifier']
                data_occupation = mini['occupation']
                data_location = profile['location']
                try:
                    image = mini['picture']['com.linkedin.common.VectorImage']
                    data_picture = "%s%s" % (image['rootUrl'], image['artifacts'][2]['fileIdentifyingUrlPathSegment'])
                except:
                    print "[*] No picture found for %s %s, %s" % (data_firstname, data_lastname, data_occupation)
                    data_picture = ""

                # In case the last name is multi-part, split it down; a
                # two-part last name is treated as middle name + last name.
                parts = data_lastname.split()

                name = data_firstname + " " + data_lastname
                fname = ""
                mname = ""
                lname = ""

                if len(parts) == 1:
                    fname = data_firstname
                    mname = '?'
                    lname = parts[0]
                elif len(parts) == 2:
                    fname = data_firstname
                    mname = parts[0]
                    lname = parts[1]
                elif len(parts) >= 3:
                    fname = data_firstname
                    lname = parts[0]
                else:
                    fname = data_firstname
                    lname = '?'

                fname = re.sub('[^A-Za-z]+', '', fname)
                mname = re.sub('[^A-Za-z]+', '', mname)
                lname = re.sub('[^A-Za-z]+', '', lname)

                if len(fname) == 0 or len(lname) == 0:
                    # invalid user, let's move on, this person has a weird name
                    continue

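                # Build the mailbox name from the chosen pattern, e.g. for
                # "John Smith" at contoso.com: flast -> jsmith@contoso.com,
                # first.last -> john.smith@contoso.com, lastfirst -> smithjohn@contoso.com.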
                if prefix == 'full':
                    user = '{}{}{}'.format(fname, mname, lname)
                if prefix == 'first':
                    user = fname
                if prefix == 'firstlast':
                    user = '{}{}'.format(fname, lname)
                if prefix == 'firstmlast':
                    if len(mname) == 0:
                        user = '{}{}{}'.format(fname, mname, lname)
                    else:
                        user = '{}{}{}'.format(fname, mname[0], lname)
                if prefix == 'flast':
                    user = '{}{}'.format(fname[0], lname)
                if prefix == 'firstl':
                    user = '{}{}'.format(fname, lname[0])
                if prefix == 'first.last':
                    user = '{}.{}'.format(fname, lname)
                if prefix == 'fmlast':
                    if len(mname) == 0:
                        user = '{}{}{}'.format(fname[0], mname, lname)
                    else:
                        user = '{}{}{}'.format(fname[0], mname[0], lname)
                if prefix == 'lastfirst':
                    user = '{}{}'.format(lname, fname)

                email = '{}@{}'.format(user, suffix)

body += "<tr>" \
|
|
"<td><a href=\"%s\"><img src=\"%s\" width=200 height=200></a></td>" \
|
|
"<td><a href=\"%s\">%s</a></td>" \
|
|
"<td>%s</td>" \
|
|
"<td>%s</td>" \
|
|
"<td>%s</td>" \
|
|
"<a>" % (data_slug, data_picture, data_slug, name, email, data_occupation, data_location)
|
|
|
|
csv.append('"%s","%s","%s","%s","%s", "%s"' % (data_firstname, data_lastname, name, email, data_occupation, data_location.replace(",",";")))
|
|
foot = "</table></center>"
|
|
f = open('{}.html'.format(outfile), 'wb')
|
|
f.write(css)
|
|
f.write(header)
|
|
f.write(body)
|
|
f.write(foot)
|
|
f.close()
|
|
f = open('{}.csv'.format(outfile), 'wb')
|
|
f.writelines('\n'.join(csv))
|
|
f.close()
|
|
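                # Note: the HTML and CSV files are rewritten from scratch after
                # every profile, so partial results survive an interrupted run.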
            else:
                print "[!] Headless profile found. Skipping"
    print

def banner():
    with open(baseDir + 'banner.txt', 'r') as f:
        data = f.read()

    print "\033[1;31m%s\033[0;0m" % data
    print "\033[1;34mProviding you with LinkedIn Intelligence\033[0;0m"
    print "\033[1;32mAuthor: Vincent Yiu (@vysec, @vysecurity)\033[0;0m"
    print "\033[1;32mOriginal version by @DisK0nn3cT\033[0;0m"

def authenticate():
    try:
        session = login()
        if len(session) == 0:
            sys.exit("[!] Unable to login to LinkedIn.com")
        print "[*] Obtained new session: %s" % session
        cookies = dict(li_at=session)
    except Exception, e:
        sys.exit("[!] Could not authenticate to linkedin. %s" % e)
    return cookies

if __name__ == '__main__':
    banner()
    # Prompt the user for the data variables
    search = args.keywords if args.keywords is not None else raw_input("[*] Enter search Keywords (use quotes for more precise results)\n")
    print
    outfile = args.output if args.output is not None else raw_input("[*] Enter filename for output (exclude file extension)\n")
    print
    while True:
        bCompany = raw_input("[*] Filter by Company? (Y/N): \n")
        if bCompany.lower() == "y" or bCompany.lower() == "n":
            break
        else:
            print "[!] Incorrect choice"

    if bCompany.lower() == "y":
        bCompany = True
    else:
        bCompany = False

    bAuto = True
    bSpecific = 0
    prefix = ""
    suffix = ""

    print

    if bCompany:
        while True:
            bSpecific = raw_input("[*] Specify a Company ID (Provide ID or leave blank to automate): \n")
            if bSpecific != "":
                bAuto = False
                try:
                    int(bSpecific)
                    break
                except:
                    print "[!] Incorrect choice, the ID either has to be a number or blank"
            else:
                bAuto = True
                break

        print

    while True:
        suffix = raw_input("[*] Enter e-mail domain suffix (eg. contoso.com): \n")
        suffix = suffix.lower()
        if "." in suffix:
            break
        else:
            print "[!] That does not look like a domain (no dot found)"

    print

    valid_prefixes = ("full", "firstlast", "firstmlast", "flast", "firstl", "first", "first.last", "fmlast", "lastfirst")

    while True:
        prefix = raw_input("[*] Select a prefix for e-mail generation (auto,full,firstlast,firstmlast,flast,firstl,first.last,fmlast,lastfirst): \n")
        prefix = prefix.lower()
        print
        if prefix in valid_prefixes:
            break
        elif prefix == "auto":
            # If the prefix is "auto", use Hunter IO to find it.
            print "[*] Automatically using Hunter IO to determine best Prefix"
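            # Try Hunter's free trial endpoint first; if that is rate limited,
            # retry against the authenticated API using the key from LinkedInt.cfg.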
url = "https://hunter.io/trial/v2/domain-search?offset=0&domain=%s&format=json" % suffix
|
|
r = requests.get(url)
|
|
content = json.loads(r.text)
|
|
if "status" in content:
|
|
print "[!] Rate limited by Hunter IO trial"
|
|
url = "https://api.hunter.io/v2/domain-search?domain=%s&api_key=%s" % (suffix, api_key)
|
|
#print url
|
|
r = requests.get(url)
|
|
content = json.loads(r.text)
|
|
if "status" in content:
|
|
print "[!] Rate limited by Hunter IO Key"
|
|
continue
|
|
#print content
|
|
prefix = content['data']['pattern']
|
|
print "[!] %s" % prefix
|
|
if prefix:
|
|
prefix = prefix.replace("{","").replace("}", "")
|
|
if prefix == "full" or prefix == "firstlast" or prefix == "firstmlast" or prefix == "flast" or prefix == "firstl" or prefix =="first" or prefix == "first.last" or prefix == "fmlast" or prefix == "lastfirst":
|
|
print "[+] Found %s prefix" % prefix
|
|
break
|
|
else:
|
|
print "[!] Automatic prefix search failed, please insert a manual choice"
|
|
continue
|
|
else:
|
|
print "[!] Automatic prefix search failed, please insert a manual choice"
|
|
continue
|
|
else:
|
|
print "[!] Incorrect choice, please select a value from (auto,full,firstlast,firstmlast,flast,firstl,first.last,fmlast)"
|
|
|
|
print
|
|
|
|
|
|
|
|
    # URL-encode the keywords for the query string
    search = urllib.quote_plus(search)
    cookies = authenticate()

    # Initialize scraping
    get_search()

    print "[+] Complete"