Skip to content

Instantly share code, notes, and snippets.

@AkshayAgarwal007
Last active November 26, 2016 00:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AkshayAgarwal007/46d2715292165f60f54657849502cccf to your computer and use it in GitHub Desktop.
Save AkshayAgarwal007/46d2715292165f60f54657849502cccf to your computer and use it in GitHub Desktop.
Queries Google for Jeff Weiner's LinkedIn profile, then scrapes the results and finds the best-matching profile URL using Selenium.
import json
from difflib import get_close_matches

try:  # Python 3
    from urllib.parse import quote_plus
except ImportError:  # Python 2
    from urllib import quote_plus

from selenium import webdriver
def urlMatch(titles_urls, name):
    """Return the LinkedIn profile URL whose result title best matches ``name``.

    Only entries whose URL contains ``linkedin.com/in/`` are considered. Each
    title is reduced to the text before its first ``|`` (stripped of
    surrounding whitespace) before being compared with ``name``.

    Args:
        titles_urls: Iterable of ``(title, url)`` pairs.
        name: The name to compare against the cleaned-up titles.

    Returns:
        The URL of the single profile link if there is exactly one, otherwise
        the URL whose title is the closest match to ``name``. ``None`` when
        there are no profile links, or when several exist but none of their
        titles is close enough to ``name``.
    """
    candidates = []
    for item in titles_urls:
        if "linkedin.com/in/" in item[1]:
            candidates.append((item[0].split('|')[0].strip(), item[1]))

    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0][1]

    titles = [title for title, _ in candidates]
    best_match = get_close_matches(name, titles, n=1)
    # get_close_matches returns an empty list when no title reaches its
    # similarity cutoff; report "no match" instead of raising IndexError.
    if not best_match:
        return None
    return candidates[titles.index(best_match[0])][1]
name = "Jeff Weiner"
position = "CEO"
company = "LinkedIn"
# Google search operators must be written without a space between the
# operator and its value, otherwise "site:" is not applied as a restriction.
operator = 'site:www.linkedin.com/in'
query = " ".join([operator, name, position, company])
# The following URL asks Google for the first 5 search results
# (start=0, num=5) corresponding to the query. The query is percent-encoded
# so that spaces and reserved characters such as '&' or '#' in a name cannot
# corrupt the URL.
url = 'https://www.google.com/webhp?#num=5&start=0&q=' + quote_plus(query)
browser = webdriver.Firefox()
try:
    # The query is passed in the URL fragment, so the result list is
    # presumably rendered client-side after the page loads; let element
    # lookups poll for up to 10 seconds instead of failing immediately.
    browser.implicitly_wait(10)
    browser.get(url)
    # The XPath selects <a> nodes that carry an href attribute and are direct
    # children of <h3> tags whose class is 'r'.
    links = browser.find_elements_by_xpath("//h3[@class='r']/a[@href]")
    # Read title and href while the browser session is still alive; the
    # element handles are backed by the live session.
    results = [
        (link.text.encode('utf8'), link.get_attribute('href'))
        for link in links
    ]
finally:
    # Always shut the browser down, even if navigation or scraping fails,
    # so no Firefox process is left running.
    browser.quit()
for result in results:
title = result[0]
url = result[1]
print '[+]', title, '--', url
with open('urls.json','wb') as outputfile:
json.dump(results,outputfile)
# Pick the scraped profile whose title is closest to the requested name.
best_match = urlMatch(results, name)
if best_match is None:
    print('''No best match found or maybe the required person is possibly
not on Linkedin or we may be searching with the wrong details''')
else:
    # Only announce a match when one exists; previously "Best Match" and
    # "[+] None" were printed right after the not-found message.
    print('\nBest Match')
    print('[+] %s' % best_match)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment