# AkshayAgarwal007/LinkedInUrlScrapper.py

## LinkedInUrlScrapper.py
from selenium import webdriver
import json
from difflib import get_close_matches

def urlMatch(titles_urls, name):
    """Pick the LinkedIn profile URL whose result title best matches ``name``.

    Only entries whose URL contains ``linkedin.com/in/`` are considered.
    For those, the part of the title before the first ``|`` is compared
    against ``name``.

    Args:
        titles_urls: Sequence of ``(title, url)`` pairs.
        name: Name to look for among the titles.

    Returns:
        The URL of the only profile entry if there is exactly one; otherwise
        the URL whose title is the closest match to ``name``.  ``None`` when
        there is no profile entry at all, or when several exist but none of
        their titles is close enough to ``name``.
    """
    candidates = [
        (item[0].split('|')[0].strip(), item[1])
        for item in titles_urls
        if "linkedin.com/in/" in item[1]
    ]
    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0][1]

    titles = [candidate[0] for candidate in candidates]
    best_match = get_close_matches(name, titles, n=1)
    # get_close_matches returns an empty list when no title clears its
    # similarity cutoff; report "no match" instead of raising IndexError.
    if not best_match:
        return None
    return candidates[titles.index(best_match[0])][1]

name = "Jeff Weiner"
position = "CEO"
company = "LinkedIn"
operator = 'site: www.linkedin.com/in'
query = " ".join([operator, name, position, company])
#The following url will give you the first 5 (start=0, num=5) google search results
#corresponding to your query.
url = 'https://www.google.com/webhp?#num=5&start=0'+'&q=' + query

browser = webdriver.Firefox()
browser.get(url)

#XPath will find a subnode of h3, a[@href] specifies that we only want <a> nodes with
# any href attribute that are subnodes of <h3> tags that have a class of ‘r’
links = browser.find_elements_by_xpath("//h3[@class='r']/a[@href]")
results = []
for link in links:
        title = link.text.encode('utf8')
        url = link.get_attribute('href')
        title_url = (title, url)
        results.append(title_url)

for result in results:
        title = result[0]
        url = result[1]
        print '[+]', title, '--', url

with open('urls.json','wb') as outputfile:
    json.dump(results,outputfile)

best_match = urlMatch (results, name)
if best_match is None:
    print ('''No best match found or maybe the required person is possibly
          not on Linkedin or we may be searching with the wrong details''')

print ('\nBest Match')
print '[+]', best_match
	from selenium import webdriver
	import json
	from difflib import get_close_matches

	def urlMatch(titles_urls, name):
		"""Pick the LinkedIn profile URL whose result title best matches ``name``.

		Only entries whose URL contains ``linkedin.com/in/`` are considered.
		For those, the part of the title before the first ``|`` is compared
		against ``name``.

		Args:
			titles_urls: Sequence of ``(title, url)`` pairs.
			name: Name to look for among the titles.

		Returns:
			The URL of the only profile entry if there is exactly one;
			otherwise the URL whose title is the closest match to ``name``.
			``None`` when there is no profile entry at all, or when several
			exist but none of their titles is close enough to ``name``.
		"""
		links = []
		titles = []
		for item in titles_urls:
			if "linkedin.com/in/" in item[1]:
				# Split on a plain '|'; the escaped '\|' form is a two-character
				# separator that does not occur in result titles.
				titles.append(item[0].split('|')[0].strip())
				links.append(item[1])
		if not titles:
			return None
		if len(titles) == 1:
			return links[0]
		best_match = get_close_matches(name, titles, n=1)
		# get_close_matches returns an empty list when no title clears its
		# similarity cutoff; report "no match" instead of raising IndexError.
		if not best_match:
			return None
		return links[titles.index(best_match[0])]

	name = "Jeff Weiner"
	position = "CEO"
	company = "LinkedIn"
	operator = 'site: www.linkedin.com/in'
	query = " ".join([operator, name, position, company])
	#The following url will give you the first 5 (start=0, num=5) google search results
	#corresponding to your query.
	url = 'https://www.google.com/webhp?#num=5&start=0'+'&q=' + query

	browser = webdriver.Firefox()
	browser.get(url)

	#XPath will find a subnode of h3, a[@href] specifies that we only want <a> nodes with
	# any href attribute that are subnodes of <h3> tags that have a class of ‘r’
	links = browser.find_elements_by_xpath("//h3[@class='r']/a[@href]")
	results = []
	for link in links:
	title = link.text.encode('utf8')
	url = link.get_attribute('href')
	title_url = (title, url)
	results.append(title_url)

	for result in results:
	title = result[0]
	url = result[1]
	print '[+]', title, '--', url

	with open('urls.json','wb') as outputfile:
	json.dump(results,outputfile)

	best_match = urlMatch (results, name)
	if best_match is None:
	print ('''No best match found or maybe the required person is possibly
	not on Linkedin or we may be searching with the wrong details''')

	print ('\nBest Match')
	print '[+]', best_match