Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Queries Google for Jeff Weiner's LinkedIn profile, then scrapes the results and finds the best-matching profile URL using Selenium.
from selenium import webdriver
import json
from difflib import get_close_matches
def urlMatch(titles_urls, name):
    """Return the LinkedIn profile URL whose title best matches ``name``.

    Only entries whose URL contains ``"linkedin.com/in/"`` are considered.
    For each of them, the part of the title before the first ``'|'`` is
    compared against ``name``.

    Args:
        titles_urls: Iterable of items where ``item[0]`` is the result title
            and ``item[1]`` is the result URL.
        name: The person's name to match the titles against.

    Returns:
        The matching URL, or ``None`` when there is no LinkedIn profile link
        at all or when none of several candidate titles is close enough to
        ``name`` (``difflib.get_close_matches`` default cutoff).
    """
    links = []
    titles = []
    for item in titles_urls:
        title, link = item[0], item[1]
        if "linkedin.com/in/" in link:
            # Keep only the text before the first '|' (titles presumably
            # look like "Name | LinkedIn" -- verify against real results).
            titles.append(title.split('|')[0].strip())
            links.append(link)

    if not titles:
        return None
    if len(titles) == 1:
        # A single candidate is returned without any similarity check.
        return links[0]

    best_match = get_close_matches(name, titles, n=1)
    if not best_match:
        # get_close_matches returns [] when nothing passes its cutoff;
        # report "no match" instead of raising IndexError.
        return None
    return links[titles.index(best_match[0])]
name = "Jeff Weiner"
position = "CEO"
company = "LinkedIn"
operator = 'site: www.linkedin.com/in'
query = " ".join([operator, name, position, company])
#The following url will give you the first 5 (start=0, num=5) google search results
#corresponding to your query.
url = 'https://www.google.com/webhp?#num=5&start=0'+'&q=' + query
browser = webdriver.Firefox()
browser.get(url)
#XPath will find a subnode of h3, a[@href] specifies that we only want <a> nodes with
# any href attribute that are subnodes of <h3> tags that have a class of ‘r’
links = browser.find_elements_by_xpath("//h3[@class='r']/a[@href]")
results = []
for link in links:
title = link.text.encode('utf8')
url = link.get_attribute('href')
title_url = (title, url)
results.append(title_url)
for result in results:
title = result[0]
url = result[1]
print '[+]', title, '--', url
with open('urls.json','wb') as outputfile:
json.dump(results,outputfile)
best_match = urlMatch (results, name)
if best_match is None:
print ('''No best match found or maybe the required person is possibly
not on Linkedin or we may be searching with the wrong details''')
print ('\nBest Match')
print '[+]', best_match
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment