Last active
November 26, 2016 00:06
-
-
Save AkshayAgarwal007/46d2715292165f60f54657849502cccf to your computer and use it in GitHub Desktop.
Queries Google for Jeff Weiner's LinkedIn profile, then scrapes the results and finds the best-matching profile URL using Selenium.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
import json | |
from difflib import get_close_matches | |
def urlMatch(titles_urls, name):
    """Pick the LinkedIn profile URL whose result title best matches ``name``.

    Args:
        titles_urls: Iterable of ``(title, url)`` pairs.
        name: Person name the candidate titles are compared against.

    Returns:
        The URL of the selected profile, or ``None`` when no pair points at a
        ``linkedin.com/in/`` URL, or when several candidates exist but none
        of their titles is close enough to ``name`` for
        ``difflib.get_close_matches`` (default cutoff) to return a match.
        A single candidate is returned as-is, without a similarity check.
    """
    links = []
    titles = []
    for item in titles_urls:
        title, url = item[0], item[1]
        if "linkedin.com/in/" in url:
            # Compare only the text before the first '|' (presumably titles
            # look like "Name | LinkedIn" -- verify against real results).
            titles.append(title.split('|')[0].strip())
            links.append(url)

    if not titles:
        return None
    if len(titles) == 1:
        return links[0]

    best_match = get_close_matches(name, titles, n=1)
    if not best_match:
        # get_close_matches returns an empty list when nothing is similar
        # enough; indexing it unconditionally raised IndexError before.
        return None
    return links[titles.index(best_match[0])]
name = "Jeff Weiner"
position = "CEO"
company = "LinkedIn"
# No space after "site:" -- Google only treats it as an operator when the
# domain follows the colon directly.
operator = 'site:www.linkedin.com/in'
query = " ".join([operator, name, position, company])

# The following url will give you the first 5 (start=0, num=5) google search
# results corresponding to your query.
url = 'https://www.google.com/webhp?#num=5&start=0' + '&q=' + query

browser = webdriver.Firefox()
try:
    browser.get(url)
    # Select every <a> that has an href attribute and is a direct child of
    # an <h3> whose class is 'r'.
    # NOTE(review): this selector depends on Google's result-page markup --
    # confirm it still matches.
    # The generic find_elements(by, value) form is used because the
    # find_elements_by_* helpers no longer exist in recent Selenium releases.
    links = browser.find_elements("xpath", "//h3[@class='r']/a[@href]")
    # Read everything needed from the elements while the session is alive,
    # and keep titles as text so they can be split, compared and serialized
    # without byte/str mixing.
    results = [(link.text, link.get_attribute('href')) for link in links]
finally:
    # Always close the browser, even if loading or scraping fails.
    browser.quit()

for title, result_url in results:
    print('[+] %s -- %s' % (title, result_url))

# Text mode: json.dump writes str, which a binary-mode file rejects on
# Python 3.
with open('urls.json', 'w') as outputfile:
    json.dump(results, outputfile)

best_match = urlMatch(results, name)
if best_match is None:
    print('''No best match found or maybe the required person is possibly
not on Linkedin or we may be searching with the wrong details''')
else:
    # Only announce a best match when there actually is one.
    print('\nBest Match')
    print('[+] %s' % best_match)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment