Skip to content

Instantly share code, notes, and snippets.

@michaelHL
Created August 19, 2018 09:58
Show Gist options
  • Save michaelHL/1e0e30b869a147cef182fb9e566d3872 to your computer and use it in GitHub Desktop.
Save michaelHL/1e0e30b869a147cef182fb9e566d3872 to your computer and use it in GitHub Desktop.
Scrape job listings from Lagou.com with Selenium (selenium 爬取拉勾网内容)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
import time
def lagou_search_url(keyword, city, district=None):
    """Build a Lagou job-search URL for *keyword* in *city*.

    If *district* is given, it is appended as an extra filter parameter.
    Components are interpolated verbatim — no URL-encoding is applied.
    """
    url = f'https://www.lagou.com/jobs/list_{keyword}?px=default&city={city}'
    if district is None:
        return url
    return f'{url}&district={district}'
# Script body: crawl Lagou search results for _keyword in _city, district by
# district, page by page, and dump the collected rows to a JSON file.
_keyword = '算法工程师'
_city = '上海'
_id = 0  # running row id, used as the key in the output dict
_outfile_name = 'lagou_alg_sh.json'
res = {}

# Headless-browser options (no visible window, no GPU).
# NOTE(review): `chrome_options=` and `find_element(s)_by_*` are the legacy
# Selenium 3 API; Selenium 4 removed them — pin selenium<4 or migrate to
# `options=` / `find_element(By..., ...)`.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=chrome_options)
print('[INFO] 浏览器已开启 ...')

# Open the city+keyword search page and collect the district filter links
# (anchors without a class attribute inside the district filter bar).
driver.get(lagou_search_url(_keyword, _city))
districts_elem = driver.find_elements_by_xpath(
    '//div[@data-type="district"]/a[not(@class)]')
districts = [x.text for x in districts_elem]
print('[INFO] 城市+关键词搜索页面打开完成 ...')
print()

for dist in districts:
    print(f'[INFO] 当前地区:{dist}')
    driver.get(lagou_search_url(_keyword, _city, dist))
    # Total page count shown by the paginator.
    max_page = int(driver.find_element_by_css_selector('.span.totalNum').text)
    if max_page == 0:
        # Empty search result for this district — skip it.
        continue
    cur_page = 1
    # Walk every result page of the current district.
    while True:
        print(f'[INFO] 当前页面:{cur_page}/{max_page}')
        names_elem = driver.find_elements_by_css_selector('.company_name > a')
        industry_elem = driver.find_elements_by_css_selector('div.industry')
        jobs_elem = driver.find_elements_by_xpath(
            '//h3[@style="max-width: 180px;"]')
        money_elem = driver.find_elements_by_css_selector('.money')
        names = [x.text for x in names_elem]
        industry = [x.text for x in industry_elem]
        jobs = [x.text for x in jobs_elem]
        money = [x.text for x in money_elem]
        # One output row per (company, industry, job title, salary) tuple.
        for n, i, j, m in zip(names, industry, jobs, money):
            _id += 1
            res[_id] = {}
            res[_id]['公司名'] = n
            res[_id]['公司描述'] = i
            res[_id]['职位'] = j
            res[_id]['薪资'] = m
        if cur_page < max_page:
            # Click the site's own "next page" button via jQuery — the page
            # updates in place, so no driver.get() is needed here.
            driver.execute_script('$(".pager_next").click()')
            cur_page += 1
            time.sleep(5)  # crude rate limit / wait for the page to refresh
        else:
            break

# Fix: use a context manager so the output file is flushed and closed
# (the original passed a bare open() to json.dump and leaked the handle).
with open(_outfile_name, 'w', encoding='utf-8') as fp:
    json.dump(res, fp, ensure_ascii=False, indent=4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment