import json
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
# Point Selenium at a local chromedriver binary and open the Glassdoor search
# page pre-filled with the "data scientist" keyword.
driver = webdriver.Chrome(executable_path='/Users/williamzhou/Desktop/chromedriver')
driver.get("https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=data+scientist&sc.keyword=data+scientist&locT=&locId=&jobType=")
# Target locations to search, one Glassdoor query per entry.
location = [
    'Atlanta, GA', 'Austin, TX', 'Boston, MA',
    'Cambridge, MA', 'Chicago, IL', 'Los Angeles, CA',
    'New York, NY', 'Palo Alto, CA', 'Philadelphia, PA',
    'San Diego, CA', 'San Francisco, CA', 'San Jose, CA',
    'Seattle, WA', 'Washington, DC', 'TX',
]
# Phase 1: for each location, page through the search results and collect the
# link, posting location, and salary estimate of every job card.
jobname = driver.find_element_by_name('sc.keyword')
jobname.clear()
jobname.send_keys('data scientist')
joblk = []
for loc in location:
    locsc = driver.find_element_by_id('sc.location')
    locsc.clear()
    locsc.send_keys(loc)
    driver.find_element_by_xpath('//*[@id="SiteSrchTop"]/form/button').click()
    time.sleep(3)
    for i in range(33):  # walk through up to 33 result pages per location
        try:
            for ele in driver.find_elements_by_xpath('//*[@id="MainCol"]/ul/li'):
                salalk = {}
                salalk['link'] = ele.find_element_by_xpath('./div[2]/div[1]/div[1]/a').get_attribute('href')
                salalk['location'] = ele.find_element_by_xpath('//*[@id="HeroHeaderModule"]/div[3]/div[2]/div/div[1]/span[2]').text
                try:
                    salalk['salary'] = ele.find_element_by_xpath('./div[2]/div[3]/div[1]/span').text
                except NoSuchElementException:
                    # Not every posting shows a salary estimate.
                    salalk['salary'] = None
                joblk.append(salalk)
            try:
                # Click "Next" in the footer pagination.
                driver.find_element_by_xpath('//*[@id="FooterPageNav"]/div/ul/li[7]/a').click()
            except WebDriverException:
                # A sign-up modal sometimes blocks the click; dismiss it.
                driver.find_element_by_xpath('//*[@id="JAModal"]/div/div/div[2]/div/button').click()
            print('scraping %s' % loc)
            print('page %d' % (i + 1))
        except (NoSuchElementException, WebDriverException):
            try:
                driver.find_element_by_xpath('//*[@id="JAModal"]/div/div/div[2]/div/button').click()
            except NoSuchElementException:
                # No modal and no more results: this location is done.
                print('%s finished' % loc)
    time.sleep(2)
    # Checkpoint the (cumulative) link list after each location.
    with open('/Users/williamzhou/Desktop/glassdoor/%s.json' % loc.replace(', ', ''), 'w') as fp:
        json.dump(joblk, fp)
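# A minimal recovery sketch, assuming the per-location dumps above were written
# to disk; load_latest_checkpoint is a hypothetical helper, not part of the
# original run. Because joblk is cumulative, each dump is a superset of the
# previous one, so the newest file already holds every link collected so far
# and can be reloaded instead of re-scraping.
import glob
import os

def load_latest_checkpoint(folder='/Users/williamzhou/Desktop/glassdoor'):
    # Pick the most recently written dump; raises ValueError if none exist.
    paths = glob.glob(os.path.join(folder, '*.json'))
    latest = max(paths, key=os.path.getmtime)
    with open(latest) as fp:
        return json.load(fp)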
# Phase 2: open each collected link in a fresh browser session and scrape the
# job description plus the employer overview fields.
links = [job['link'] for job in joblk]
info = []
page = webdriver.Chrome(executable_path='/Users/williamzhou/Desktop/chromedriver')
i = 0
for lk in links:
    try:
        page.get(lk)
        time.sleep(4)
        info_detail = {}
        info_detail['i'] = i
        info_detail['link'] = lk
        info_detail['jobtitle'] = [x.text for x in page.find_elements_by_xpath('//*[@id="HeroHeaderModule"]/div[3]/div[2]/h2')]
        info_detail['employername'] = [x.text for x in page.find_elements_by_xpath('//*[@id="HeroHeaderModule"]/div[3]/div[2]/span[1]')]
        info_detail['employerlocation'] = [x.text for x in page.find_elements_by_xpath('//*[@id="HeroHeaderModule"]/div[3]/div[2]/span[2]')]
        info_detail['jobdescp'] = [x.text for x in page.find_elements_by_xpath('//*[@id="JobDescContainer"]')]
        info_detail['hq'] = [x.text for x in page.find_elements_by_xpath('//*[@id="EmpBasicInfo"]/div[1]/div[2]/span')]
        info_detail['companysize'] = [x.text for x in page.find_elements_by_xpath('//*[@id="EmpBasicInfo"]/div[1]/div[3]/span')]
        info_detail['founded'] = [x.text for x in page.find_elements_by_xpath('//*[@id="EmpBasicInfo"]/div[1]/div[4]/span')]
        info_detail['companytype'] = [x.text for x in page.find_elements_by_xpath('//*[@id="EmpBasicInfo"]/div[1]/div[5]/span')]
        info_detail['industry'] = [x.text for x in page.find_elements_by_xpath('//*[@id="EmpBasicInfo"]/div[1]/div[6]/span')]
        info_detail['revenue'] = [x.text for x in page.find_elements_by_xpath('//*[@id="EmpBasicInfo"]/div[1]/div[7]/span')]
        info.append(info_detail)
        time.sleep(2)
        print(i + 1)
        i = i + 1
    except Exception:
        # Checkpoint everything scraped so far if a page fails to load or parse.
        with open('/Users/williamzhou/Desktop/glassdoor/scrape_to_%d.json' % (i + 1), 'w') as fp:
            json.dump(info, fp)
# Final dump of all job details.
with open('/Users/williamzhou/Desktop/glassdoor/scrape_to_%d.json' % (i + 1), 'w') as fp:
    json.dump(info, fp)
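# A quick sanity check, assuming the final dump above succeeded: reload it into
# pandas (imported above) and eyeball a few records before any downstream
# text analysis.
df = pd.read_json('/Users/williamzhou/Desktop/glassdoor/scrape_to_%d.json' % (i + 1))
print(df.head())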