

@datahutrepo
Created December 6, 2016 12:58
# -*- coding: utf-8 -*-
# LinkedIn company directory scraper (Python 2).
import json
import re
import sys

import lxml.html
import requests

# Python 2 shim so unicode company names can be printed and written
# without UnicodeEncodeError; unnecessary (and unavailable) in Python 3.
reload(sys)
sys.setdefaultencoding('utf8')

# Browser-like headers so LinkedIn serves the full HTML page instead of
# rejecting the request as an obvious bot.
HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, sdch',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
}

# Start the output file with an opening bracket; records are appended
# below, and the closing bracket must be added after the run finishes.
out = open('company_data.json', 'w')
out.write('[')
out.close()

COUNT = 0


def increment():
    global COUNT
    COUNT = COUNT + 1

def fetch_request(url):
    # Try the request up to three times (the original used three nested
    # try/except blocks); return '' if every attempt fails.
    for _ in range(3):
        try:
            return requests.get(url, headers=HEADERS)
        except requests.RequestException:
            continue
    return ''

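# Note: fetch_request() yields a requests.Response on success or '' after
# all retries fail, so the callers below gate on simple truthiness before
# touching .content or .status_code.
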
def parse_company_urls(company_url):
    if not company_url:
        return
    if '/company/' in company_url:
        # Already a company profile page; scrape it directly.
        parse_company_data(company_url)
    else:
        # Directory/category page: collect its links and recurse, so
        # nested directory levels are followed until company pages appear.
        parent_url = company_url
        fetch_company_url = fetch_request(company_url)
        if fetch_company_url:
            sel = lxml.html.fromstring(fetch_company_url.content)
            COMPANIES_XPATH = '//div[@class="section last"]/div/ul/li/a/@href'
            companies_urls = sel.xpath(COMPANIES_XPATH)
            if companies_urls:
                if '/company/' in companies_urls[0]:
                    print 'Parsing From Category ', parent_url
                    print '-------------------------------------------------------------------------------------'
                for company_url in companies_urls:
                    parse_company_urls(company_url)

def parse_company_data(company_data_url):
    if not company_data_url:
        return
    fetch_company_data = fetch_request(company_data_url)
    if fetch_company_data and fetch_company_data.status_code == 200:
        try:
            source = fetch_company_data.content.decode('utf-8')
            sel = lxml.html.fromstring(source)
            # LinkedIn embeds page data as JSON inside HTML comments in
            # <code> elements; the top-bar block holds the logo, name and
            # follower count.
            code_text = sel.get_element_by_id(
                'stream-promo-top-bar-embed-id-content')
            if len(code_text) > 0:
                code_text = re.findall(r'<!--(.*)-->', str(code_text[0]))
                code_text = code_text[0].strip() if code_text else '{}'
                json_data = json.loads(code_text)
                if json_data.get('squareLogo', ''):
                    company_pic = ('https://media.licdn.com/mpr/mpr/shrink_200_200' +
                                   json_data.get('squareLogo', ''))
                elif json_data.get('legacyLogo', ''):
                    company_pic = ('https://media.licdn.com/media' +
                                   json_data.get('legacyLogo', ''))
                else:
                    company_pic = ''
                company_name = json_data.get('companyName', '')
                followers = str(json_data.get('followerCount', ''))
                # The about-section block carries the industry field.
                code_text = sel.get_element_by_id(
                    'stream-about-section-embed-id-content')
                if len(code_text) > 0:
                    code_text = re.findall(r'<!--(.*)-->', str(code_text[0]))
                    code_text = code_text[0].strip() if code_text else '{}'
                    json_data = json.loads(code_text)
                    company_industry = json_data.get('industry', '')
                    item = {'company_name': company_name,
                            'followers': followers,
                            'company_industry': company_industry,
                            'logo_url': company_pic,
                            'url': company_data_url}
                    increment()
                    print item
                    # json.dumps keeps the file valid JSON; the original
                    # wrote the dict repr, which uses single quotes.
                    out = open('company_data.json', 'a')
                    out.write(json.dumps(item) + ',\n')
                    out.close()
        except Exception:
            # Pages whose embedded JSON is missing or malformed are skipped.
            pass

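# Each record appended to company_data.json has the shape:
#   {"company_name": ..., "followers": ..., "company_industry": ...,
#    "logo_url": ..., "url": ...}
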
# Entry point: fetch the A-Z company directory, then recurse through each
# category page found in it.
fetch_company_dir = fetch_request('https://www.linkedin.com/directory/companies/')
if fetch_company_dir:
    print 'Starting Company Url Scraping'
    print '-----------------------------'
    sel = lxml.html.fromstring(fetch_company_dir.content)
    SUB_PAGES_XPATH = '//div[@class="bucket-list-container"]/ol/li/a/@href'
    sub_pages = sel.xpath(SUB_PAGES_XPATH)
    print 'Company Category URL list'
    print '--------------------------'
    print sub_pages
    for sub_page in sub_pages:
        parse_company_urls(sub_page)
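
The script leaves company_data.json as an unterminated JSON array: it opens with '[' and each record ends with ',\n', but the closing ']' is never written. A minimal post-processing sketch for finishing and loading the file after a run completes (this helper is not part of the original gist and assumes records were written with json.dumps as above):

import json

with open('company_data.json') as f:
    raw = f.read()
# Strip the comma after the last record, close the array, then parse.
companies = json.loads(raw.rstrip().rstrip(',') + ']')
print len(companies), 'companies scraped'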
@prasathprashuu commented:

This code is outdated!
