Skip to content

Instantly share code, notes, and snippets.

@scrapehero
Last active December 6, 2021 18:13
Show Gist options
  • Save scrapehero/7fb058f67a2988cba74957ef2ec013d1 to your computer and use it in GitHub Desktop.
Save scrapehero/7fb058f67a2988cba74957ef2ec013d1 to your computer and use it in GitHub Desktop.
Python script to scrape company details from a public company page on LinkedIn.com. Written as part of the "How to Scrape" educational post - https://www.scrapehero.com/tutorial-scraping-linkedin-for-public-company-data/
from lxml import html
import csv, os, json
import requests
from exceptions import ValueError
from time import sleep
# Number of fetch attempts made for a URL before giving up.
_MAX_ATTEMPTS = 5
# Pause between attempts so a captcha/login wall is not hit back-to-back.
_RETRY_DELAY_SECONDS = 2
# Upper bound on one HTTP request; without it requests.get() can block forever.
_REQUEST_TIMEOUT_SECONDS = 30
# Responses shorter than this are treated as a captcha/login stub.
_MIN_PAGE_BYTES = 2000
# Marker that shows up in the URL when LinkedIn redirects to its login page.
_LOGIN_REDIRECT_MARKER = "trk=login_reg_redirect"


def _extract_embedded_json(page_bytes):
    """Return the JSON text of the promo-top-bar ``<code>`` element, or None."""
    # Local import: lxml is already a dependency of this file; only the
    # parser's error type is needed here.
    from lxml import etree

    # The payload appears to be wrapped in an HTML comment inside the <code>
    # element; dropping the comment markers lets XPath see it as plain text.
    # Byte literals keep this working on both Python 2 and Python 3.
    uncommented = page_bytes.replace(b'<!--', b'').replace(b'-->', b'')
    try:
        doc = html.fromstring(uncommented)
    except etree.LxmlError:
        # e.g. an empty body; the caller decides whether to retry.
        return None
    text_nodes = doc.xpath(
        '//code[@id="stream-promo-top-bar-embed-id-content"]//text()')
    return text_nodes[0] if text_nodes else None


def _build_company_record(company_json, url):
    """Flatten the decoded company JSON into the flat output record."""
    # A missing or null "headquarters" yields None for every address field.
    headquarters = company_json.get('headquarters') or {}

    # Join only the street parts that are present: concatenating a missing
    # part (None) used to raise TypeError and discard the whole record.
    street_parts = [
        part
        for part in (headquarters.get('street1'), headquarters.get('street2'))
        if part
    ]
    street = ', '.join(street_parts) if street_parts else None

    return {
        'company_name': company_json.get('companyName'),
        'size': company_json.get('size'),
        'industry': company_json.get('industry'),
        'description': company_json.get('description'),
        'follower_count': company_json.get('followerCount'),
        'founded': company_json.get('yearFounded'),
        'website': company_json.get('website'),
        'type': company_json.get('companyType'),
        'specialities': company_json.get('specialties'),
        'city': headquarters.get('city'),
        'country': headquarters.get('country'),
        'state': headquarters.get('state'),
        'street': street,
        'zip': headquarters.get('zip'),
        'url': url,
    }


def linkedin_companies_parser(url):
    """Fetch a public LinkedIn company page and extract its details.

    The page is requested up to ``_MAX_ATTEMPTS`` times. An attempt is
    retried when the request fails, when the response looks like a
    captcha/login page, or when no company data is found in it.

    Args:
        url: Address of the company page to scrape.

    Returns:
        A dict with the keys ``company_name``, ``size``, ``industry``,
        ``description``, ``follower_count``, ``founded``, ``website``,
        ``type``, ``specialities``, ``city``, ``country``, ``state``,
        ``street``, ``zip`` and ``url``; fields absent from the page are
        None. Returns None when the page is reported as not found or when
        every attempt fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    for attempt in range(_MAX_ATTEMPTS):
        if attempt:
            sleep(_RETRY_DELAY_SECONDS)
        print("Fetching : %s" % url)
        try:
            # NOTE(review): verify=False disables TLS certificate checks and
            # is kept only to preserve existing behaviour - confirm whether
            # it is still required.
            response = requests.get(
                url,
                headers=headers,
                verify=False,
                timeout=_REQUEST_TIMEOUT_SECONDS,
            )
        except requests.exceptions.RequestException:
            print("retrying : %s" % url)
            continue

        embedded_json = _extract_embedded_json(response.content)
        if embedded_json:
            try:
                return _build_company_record(json.loads(embedded_json), url)
            except (ValueError, TypeError, AttributeError):
                # Malformed JSON or an unexpected payload shape.
                print("cant parse page %s" % url)

        # Retry in case of captcha or login page redirection. The final URL
        # is checked as well because a redirect never changes ``url`` itself.
        redirected_to_login = (
            _LOGIN_REDIRECT_MARKER in url
            or _LOGIN_REDIRECT_MARKER in response.url
        )
        if len(response.content) < _MIN_PAGE_BYTES or redirected_to_login:
            if response.status_code == 404:
                # A missing page will not appear on a later attempt.
                print("linkedin page not found")
                return None
            print("retrying : %s" % url)
    return None
def readurls(companyurls=None, output_path='data.json'):
    """Scrape each company URL and dump the results to a JSON file.

    Args:
        companyurls: Iterable of LinkedIn company page URLs. Defaults to the
            single sample URL this script was written around.
        output_path: File the JSON list is written to; defaults to
            ``data.json`` in the current working directory.

    The file receives one entry per URL, in order. An entry is whatever
    ``linkedin_companies_parser`` returned for that URL, so a page that could
    not be scraped is stored as ``null``.
    """
    if companyurls is None:
        companyurls = ['https://www.linkedin.com/company/tata-consultancy-services']
    extracted_data = [linkedin_companies_parser(url) for url in companyurls]
    # The context manager closes (and flushes) the file even if dumping fails;
    # the original left the handle open.
    with open(output_path, 'w') as output_file:
        json.dump(extracted_data, output_file, indent=4)
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == '__main__':
    readurls()
@totolo
Copy link

totolo commented Mar 6, 2019

I tried the original script and it wasn't working for me.

@vicky002
Copy link

Just found this site which seems to be very useful and it has more than 20+ automation scripts. They are providing this for free.
You can use this script to pull company Data: TexAu - LinkedInCompanyInfo You may have to create an account.

Copy link

ghost commented May 17, 2021

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment