@vitalyp
Created March 14, 2015 02:29
Scraping LinkedIn Public Profiles for Fun and Profit
#!/usr/bin/python
#
# Copyright (C) 2012 Itzik Kotler
#
# scraper.py is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# scraper.py is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with scraper.py. If not, see <http://www.gnu.org/licenses/>.
"""Simple LinkedIn public profiles scraper that uses Google Custom Search"""
import urllib
import simplejson
GOOGLE_API_KEY = "<YOUR GOOGLE API KEY>"
CX = "<YOUR GOOGLE SEARCH ENGINE CX>"
BASE_URL = "https://www.googleapis.com/customsearch/v1?key="+GOOGLE_API_KEY+"&cx="+CX
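# For reference (a sketch inferred from the parsing code below, not copied from
# Google's documentation): each Custom Search response is expected to look roughly like
#   {"items": [{"pagemap": {"hcard": [{"fn": "Jane Doe", "title": "Engineer at ..."}]}}],
#    "queries": {"nextPage": [{"startIndex": 11}]}}
# Only the keys shown are touched; everything else in the response is ignored.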
def __get_all_hcards_from_query(query, index=0, hcards=None):
    # Collect hCard name/title pairs from one Custom Search query, following
    # the 'nextPage' link recursively until the results run out.
    if hcards is None:  # avoid a shared mutable default leaking results between calls
        hcards = {}
    url = query
    if index != 0:
        url = url + '&start=%d' % (index)
    json = simplejson.loads(urllib.urlopen(url).read())
    if json.has_key('error'):
        print "Stopping at %s due to Error!" % (url)
        print json
    else:
        for item in json['items']:
            try:
                hcards[item['pagemap']['hcard'][0]['fn']] = item['pagemap']['hcard'][0]['title']
            except KeyError:
                pass
        if json['queries'].has_key('nextPage'):
            return __get_all_hcards_from_query(query, json['queries']['nextPage'][0]['startIndex'], hcards)
    return hcards
def get_all_employees_by_company_via_linkedin(company):
    # Two query shapes cover both linkedin.com/in/... and linkedin.com/pub/... profiles.
    queries = ['"at %s" inurl:"in"', '"at %s" inurl:"pub"']
    result = {}
    for query in queries:
        _query = query % company
        # URL-encode the query so its spaces and quotes survive the GET request.
        result.update(__get_all_hcards_from_query(BASE_URL + '&q=' + urllib.quote(_query)))
    return list(result)
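
A minimal usage sketch (not part of the original gist): it assumes real GOOGLE_API_KEY and CX values have been filled in above, and a Python 2 interpreter, since the script relies on urllib.urlopen, has_key, and print statements.

if __name__ == '__main__':
    import sys
    # Hypothetical entry point: take a company name from the command line
    # and print every hCard name the scraper finds for it.
    company = sys.argv[1] if len(sys.argv) > 1 else 'Example Corp'
    for name in get_all_employees_by_company_via_linkedin(company):
        print name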