Skip to content

Instantly share code, notes, and snippets.

@ixtli
Last active June 19, 2018 23:04
Show Gist options
  • Save ixtli/7a39f3db24e868ad1b82356d6ac976d4 to your computer and use it in GitHub Desktop.
Save ixtli/7a39f3db24e868ad1b82356d6ac976d4 to your computer and use it in GitHub Desktop.
A Python program to download all public LinkedIn info about members of ICE. All of this was made by https://twitter.com/sam_lavigne
# HTTP headers sent with every request the scraper makes: the main script
# reads this as `headers.headers` and passes it as `headers=` to requests.get.
# Empty placeholder here -- presumably to be filled in locally; TODO confirm.
headers = {}
import time
import json
import csv
import os
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
import headers
# Search facet tables. get_company runs one paged search per (facet key, id)
# pair listed below; get_page sends the key as an extra `facet` value and the
# id as the `facet.<key>` parameter.
# NOTE(review): the meaning of the individual ids is not documented in this
# file -- presumably LinkedIn's own codes; confirm before editing.

# these represent different job functions
FUNCTION_FACETS = [17, 18, 14, 2, 4, 20, 5, 13, 12, 26] # facet key 'FA'
# presumably seniority levels (going by the name) -- TODO confirm
SENIORITY_FACETS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] # facet key 'SE'
# presumably geographic regions (going by the name and 'us:' prefix) -- TODO confirm
LOCATION_FACETS = [ # facet key 'G'
'us:8-2-0-1-2',
'us:97',
'us:va',
'us:dc',
'us:tx',
'us:ca',
'us:md',
'us:70',
'us:31',
'us:ny',
'us:8-8-0-8-1',
'us:8-8-0-3-1',
'us:ga',
'us:52',
'us:7',
'us:8-8-0-95-11',
'us:nj',
'us:3-2-0-31-1',
]
# (facet key, list of ids) pairs, iterated in this order by get_company.
FACETS = [
('FA', FUNCTION_FACETS),
('SE', SENIORITY_FACETS),
('G', LOCATION_FACETS)
]
def download_file(url, local_filename=None):
    '''Streams the resource at url into a local file

    Adapted from: https://stackoverflow.com/a/16696317

    Args:
        url: address to fetch.
        local_filename: destination path; when None, the last path
            segment of url is used.

    Returns:
        The path the content was written to.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
            Nothing is written to disk in that case.
    '''
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    # Using the response as a context manager releases the connection even
    # when writing fails part-way through the stream.
    with requests.get(url, stream=True) as r:
        # Check the status before opening the file so an error page is never
        # stored under the target name (callers treat an existing file as
        # "already downloaded").
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    return local_filename
def get_page(company_id, facet=None, facet_id=None, start=0, count=50):
    '''Fetches one page of search results for a company

    The query always filters on the company (facet 'CC'). When both facet
    and facet_id are given, that facet is added as a second filter.
    start and count are forwarded as the paging parameters.

    Returns the decoded JSON body of the response.
    '''
    has_extra_facet = not (facet is None or facet_id is None)
    query = {
        'facet': ['CC', facet] if has_extra_facet else ['CC'],
        'facet.CC': company_id,
        'count': count,
        'start': start,
    }
    if has_extra_facet:
        query['facet.' + facet] = facet_id
    return requests.get(
        'https://www.linkedin.com/sales/search/results',
        headers=headers.headers,
        params=query,
    ).json()
def get_company(company_id, outname):
    '''Collects search results for a company across every entry in FACETS

    For each (facet, facet_id) pair the first page is fetched to learn the
    total, then the remaining pages are fetched 50 results at a time with a
    one second pause before each. All results are concatenated (duplicates
    included) and written to outname as indented JSON.

    Returns outname.
    '''
    page_size = 50
    collected = []
    for facet, facet_ids in FACETS:
        for facet_id in facet_ids:
            print('getting facet', facet, facet_id, 'for company', company_id)
            first_page = get_page(company_id, facet, facet_id)
            total = first_page['pagination']['total']
            collected.extend(first_page['searchResults'])
            # Remaining pages start at offsets 50, 100, ... below total.
            for offset in range(page_size, total, page_size):
                print('getting', offset, 'of', total)
                time.sleep(1)
                page = get_page(company_id, facet, facet_id, offset)
                collected.extend(page['searchResults'])
    with open(outname, 'w') as outfile:
        json.dump(collected, outfile, indent=2)
    return outname
def get_images(datafile):
    '''Downloads profile images for the members listed in datafile

    datafile is a JSON list whose entries each carry a 'member' record.
    For every member that has a 'vectorImage', the widest artifact is saved
    to images/<memberId>.jpg. Files that already exist are skipped, so an
    interrupted run can be resumed. Pauses one second after each download.
    '''
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    members = [p['member'] for p in people]
    # open() does not create parent directories; without this the first
    # download fails on a fresh checkout.
    os.makedirs('images', exist_ok=True)
    for member in members:
        if 'vectorImage' not in member:
            continue
        outname = 'images/{}.jpg'.format(member['memberId'])
        if os.path.exists(outname):
            print('skipping')
            continue
        image = member['vectorImage']
        # Sort ascending by width and take the last entry, i.e. the widest.
        widest = sorted(image['artifacts'], key=lambda x: x['width'])[-1]
        url = image['rootUrl'] + widest['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)
def get_profile(pid):
    '''Downloads one profile page and caches its embedded profile JSON

    The page's <code> elements are scanned for the first one whose text
    parses as a JSON object containing a 'contactInfo' key; that object (or
    an empty dict when none is found) is written to profiles/<pid>.json.
    If that file already exists nothing is fetched. Pauses one second after
    a fetch.

    Returns the path of the cached file.
    '''
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    out = {}
    for c in soup.select('code'):
        try:
            d = json.loads(c.text)
        except ValueError:
            # Not JSON at all (json.JSONDecodeError subclasses ValueError).
            continue
        # Only JSON objects can be the profile blob; this also avoids a
        # membership test against numbers, strings or lists.
        if isinstance(d, dict) and 'contactInfo' in d:
            out = d
            break
    # open() does not create parent directories.
    os.makedirs('profiles', exist_ok=True)
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname
def get_profiles(datafile):
    '''Runs get_profile for every entry in datafile

    datafile is a JSON list; each entry's ['member']['profileId'] is passed
    to get_profile in file order.
    '''
    with open(datafile, 'r') as source:
        records = json.load(source)
    # Generator keeps the lookup lazy: an entry is only inspected right
    # before its profile is requested.
    profile_ids = (record['member']['profileId'] for record in records)
    for profile_id in profile_ids:
        get_profile(profile_id)
def clean_and_parse(datafile, outname):
    '''Outputs csv, json and html from employee listings

    Reads the JSON list in datafile, keeps the first entry seen for each
    memberId, and writes:
      * <outname>.json -- the cleaned records, indented
      * <outname>.csv  -- the same records with a header row
      * index.html     -- template.html rendered with people=<records>

    Each record has the keys name, title, img, company, location, id and
    linkedin. img is the local images/<memberId>.jpg path, or None when
    that file does not exist.
    '''
    # Fixed column order, so the CSV header can be written even when there
    # are no records at all.
    fieldnames = ['name', 'title', 'img', 'company', 'location', 'id', 'linkedin']

    with open(datafile, 'r') as infile:
        data = json.load(infile)

    out = []
    seen = set()  # constant-time duplicate check
    for d in data:
        member = d['member']
        mid = member['memberId']
        if mid in seen:
            continue
        seen.add(mid)
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        out.append({
            'name': member.get('formattedName', ''),
            'title': member.get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': member.get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + member['profileId'],
        })

    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)

    # The csv module requires newline='' so it controls line endings itself;
    # an explicit encoding keeps non-ASCII values from failing on platforms
    # whose default codec is not UTF-8.
    with open(outname + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(out)

    with open('template.html', 'r', encoding='utf-8') as templatefile:
        # The rendered values come from downloaded data, so escape them
        # instead of letting them inject markup into the page.
        template = Template(templatefile.read(), autoescape=True)
    html = template.render(people=out)
    with open('index.html', 'w', encoding='utf-8') as htmlout:
        htmlout.write(html)
if __name__ == '__main__':
    # Company id sent as the `facet.CC` search filter.
    ICE = '533534'
    # get_company returns the path it wrote; every later stage reads it.
    datafile = get_company(ICE, 'ice_raw.json')
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')
beautifulsoup4==4.6.0
certifi==2018.4.16
chardet==3.0.4
idna==2.7
Jinja2==2.10
MarkupSafe==1.0
requests==2.19.1
urllib3==1.23
{#
  Jinja2 template rendered by clean_and_parse as
  template.render(people=...).
  `people` is a list of records; this template reads the keys img,
  linkedin, name, title and location from each one and emits one table
  row per record. The image cell stays empty when img is falsy.
  The trailing "-" below strips the newline after this comment, so the
  rendered output still starts with the html tag.
-#}
<html>
<head>
<title>ICE @ LinkedIn</title>
<style>
body, table {
font: 14px sans-serif;
}
#container {
max-width: 1100px;
margin: auto;
}
table {
border-collapse: collapse;
width: 100%;
}
th {
text-align: left;
}
td {
padding: 3px;
border: 1px solid #ccc;
}
img {
max-width: 50px;
display: block;
}
a {
color: #000;
}
</style>
</head>
<body>
<div id="container">
<h1>People on LinkedIn who work for ICE</h1>
<p>collected by <a href="http://lav.io">Sam Lavigne</a></p>
<p><a href="https://raw.githubusercontent.com/antiboredom/ice-linkedin/master/ice.csv">Download as a CSV</a>, or explore <a href="https://github.com/antiboredom/ice-linkedin">the more detailed dataset here</a>.</p>
<table>
<tr>
<th></th>
<th>Name</th>
<th>Title</th>
<th>Location</th>
</tr>
{% for p in people %}
<tr>
<td>
{% if p.img %}
<a href="{{p.linkedin}}"><img src="{{p.img}}"></a> {% endif %}
</td>
<td><a href="{{p.linkedin}}">{{p.name}}</a></td>
<td>{{p.title}}</td>
<td>{{p.location}}</td>
</tr>
{% endfor %}
</table>
</div>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment