Web scrapers (爬虫)
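A set of standalone Python scrapers for Chinese job boards — 597.com (Guangdong and multi-city), chinajsjob.com, hbggzp.cn, job5156.com, qlrc.com, sdbys.com, and ynzp.com (云南招聘网) — collecting job and company records into Redis, JSON, and CSV.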
# 597广东 PC版 (597.com Guangdong, desktop site)
import requests
from bs4 import BeautifulSoup as bf
import re
import json
import time
from city_list import searchCitys
from multiprocessing.dummy import Pool
import redis

p_url = 'https://gd.597.com/zhaopin/?page={}'
m_url = 'https://gd.597.com/job-{}.html'
job_url = 'https://gd.597.com/com-{}/'
total_company_datas = {}
total_job_datas = {}
session = requests.session()
session.cookies['city'] = 'www'
r = redis.Redis(host='127.0.0.1', port=6379, db=4)
# r.flushdb()
JOB_NAME = '597_gd_job_'
COMPANY_NAME = '597_gd_company_'

# Text normalisation: strip non-breaking spaces/newlines and replace commas
# so values can be written into comma-separated CSV safely.
def format_text(text):
    try:
        text = str(text).replace('\xa0', '').replace('\n', '').replace(',', '.')
        return text.strip()
    except Exception:
        return ''

def format_url(url):
    # Protocol-relative links ('//...') need a scheme before requests can fetch them.
    if url.startswith('//'):
        url = f"https:{url}"
    return url

def get_id(link):
    return link.split('/')[-1].split('.')[0]

def get_num(text):
    match = re.match(r'\d+', text)
    if match:
        return int(match.group())
    return None

def get_city_area(job_info):
    # '.job_request' looks like '城市-区县 | 经验 | 学历'; split out city and area.
    job_request = format_text(job_info.select('.job_request')[0].text)
    job_req = job_request.split('|')
    ad = format_text(job_req[0]) if job_req else ''
    ads = ad.split('-')
    city = ads[0] if ads else ''
    area = ads[1] if len(ads) > 1 else ''
    return city, area

def set_gongshang_info(data=None):
    # Business-registration summary; the key is 'faren' (the original looked
    # up the misspelling 'farean' and always got '').
    res = f'公司名称:{data.get("com_name", "")}. 法人: {data.get("faren", "")}.注资: {data.get("zibenjin", "")}.官网: {data.get("website", "")}'
    return res

GD_COMPANYS = {}
GD_JOBS = {}

def page_list(url):
    resp = requests.get(url)
    soup = bf(resp.text, 'html.parser')
    job_items = soup.select('.firm-item')  # find_all() does not take CSS selectors
    for i, job_item in enumerate(job_items):
        if i < 1:
            continue  # skip the first item (presumably a non-job header row)
        job_url = job_item.select('.des_title')[0]['href']
        job_id = get_id(job_url)
        job_name = job_item.select('.des_title')[0].text
        company_url = job_item.select('.firm_md')[0]['href']
        company_id = get_id(company_url)
        company_name = job_item.select('.firm_md')[0].text
        salary = job_item.find_all('li')[2].text
        add = job_item.find_all('li')[3].text
        city, area = add.split('-', 1)
        exp = job_item.find_all('li')[4].text
        GD_JOBS[job_id] = {
            'job_id': job_id,
            'job_name': job_name,
            'job_url': job_url,
            'company_id': company_id,
            'salary': salary,
            'city': city,
            'area': area,
            'exp': exp
        }
        if company_id not in GD_COMPANYS:
            GD_COMPANYS[company_id] = {  # was misspelled GD_COMPNAYS, a NameError
                'company_id': company_id,
                'company_name': company_name,
                'company_url': company_url,
                'city': city,
                'area': area
            }

url = 'https://gd.597.com/zhaopin/?page=67'
page_list(url)
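
# A sketch of driving page_list over the full listing rather than the single
# hard-coded page above. The stop condition and the 1-second delay are
# assumptions about the site's behaviour, not something the original encodes.
def crawl_all_pages(max_pages=200):
    for page in range(1, max_pages + 1):
        before = len(GD_JOBS)
        page_list(p_url.format(page))
        time.sleep(1)                      # be polite to the server
        if len(GD_JOBS) == before:         # no new jobs: likely past the last page
            break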
# ============================ sr.597.com scraper ============================
# https://sr.597.com/zhaopin/?page=1
import requests
from bs4 import BeautifulSoup as bf
import re
import json
import time
from city_list import searchCitys
from multiprocessing.dummy import Pool
import redis

p_url = 'https://sr.597.com/zhaopin/?page={}'
m_url = 'https://m.597.com/companyList/famous/?page={}'
job_url = 'https://m.597.com/companyjob.html?page={}'
total_company_datas = {}
total_job_datas = {}
session = requests.session()
session.cookies['city'] = 'www'
# The original never created the Redis connection it uses below; a local
# instance matching the companion 597 script is assumed here.
r = redis.Redis(host='127.0.0.1', port=6379, db=4)
# Text normalisation for CSV output.
def format_text(text):
    try:
        text = str(text).replace('\xa0', '').replace('\n', '').replace(',', '.')
        return text.strip()
    except Exception:
        return ''

def format_url(url):
    if url.startswith('//'):
        url = f"https:{url}"
    return url

def get_id(link):
    return link.split('/')[-1].split('.')[0]

def get_num(text):
    match = re.match(r'\d+', text)
    if match:
        return int(match.group())
    return None

def get_city_area(job_info):
    job_request = format_text(job_info.select('.job_request')[0].text)
    job_req = job_request.split('|')
    ad = format_text(job_req[0]) if job_req else ''
    ads = ad.split('-')
    city = ads[0] if ads else ''
    area = ads[1] if len(ads) > 1 else ''
    return city, area

def set_gongshang_info(data=None):
    # get_company_page() stores the key as 'faren'; the original looked up the
    # misspelling 'farean' and always got ''.
    res = f'公司名称:{data.get("com_name", "")}. 法人: {data.get("faren", "")}.注资: {data.get("zibenjin", "")}.官网: {data.get("website", "")}'
    return res
def set_company_info(company):
    # One CSV row aligned with the 12-column company_headers below; the
    # original had a stray company.get('-') that produced a 13th column.
    data = ['',
            company.get('city_name', '-'),
            company.get('area', '-'),
            company.get('address', '-'),
            company.get('com_name', '-'),
            company.get('com_type', '-'),
            company.get('desc_more', '-'),
            company.get('com_str', '-'),
            set_gongshang_info(data=company),
            company.get('address', '-'),
            company.get('num', '-'),
            company.get('job_names', '-')]
    return data
def set_job_info(job, city_name):
    data = [
        job.get('com_name'),
        job.get('job_name'),
        job.get('job_salary'),
        city_name,
        job.get('job_years'),
        job.get('job_edu'),
        job.get('job_salary'),
        job.get('job_name'),
        job.get('job_desc_txt'),
        job.get('ellipsis_comUser'),
        job.get('address')
    ]
    return data
def format_select(soup, xclass, index):
    # Safe indexed CSS select: '' when there are fewer than index+1 matches.
    ses = soup.select(xclass)
    if len(ses) <= index:
        return ''
    return format_text(ses[index].text)

def get_cache(type=None, id=None):
    # JSON blobs cached in Redis under '597_job_<id>' / '597_company_<id>'.
    if type == 'job':
        name = f'597_job_{id}'
    else:
        name = f'597_company_{id}'
    res = r.get(name)
    if res:
        return json.loads(res)
    r.set(name, json.dumps({}))
    return {}

def set_cache(type=None, id=None, data=None):
    # Read-modify-write: merge new fields into whatever is already cached.
    if type == 'job':
        name = f'597_job_{id}'
    else:
        name = f'597_company_{id}'
    r_data = get_cache(type=type, id=id)
    r_data.update(data)
    r.set(name, json.dumps(r_data))
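
# Usage sketch for the cache helpers above (assumes the local Redis from the
# top of this file; the id and fields are made-up examples):
#
#   set_cache(type='company', id='c-1001', data={'name': 'ACME'})
#   set_cache(type='company', id='c-1001', data={'contact': '张三'})
#   get_cache(type='company', id='c-1001')
#   # -> {'name': 'ACME', 'contact': '张三'}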
def get_job_url(url, job_data=None):
    # Fetch the job detail page and fill in age/requirement/description fields.
    url = format_url(url)
    resp = session.get(url)
    soup = bf(resp.text, 'html.parser')
    job_data['job_age'] = format_text(soup.select('.job_age')[0].text)
    job_data['job_req'] = format_text(soup.select('.job_req')[0].text)
    worktimeinfo = soup.select('.worktimeinfo')
    job_data['worktimeinfo'] = format_text(worktimeinfo[0].text) if worktimeinfo else ''
    job_data['job_desc_txt'] = format_text(soup.select('.job_desc_txt')[0].text)
    print(job_data)
    return job_data

def get_job_list_datas(url, city_name=None):
    resp = session.get(url)
    time.sleep(1)
    soup = bf(resp.text, 'html.parser')
    job_infos = soup.select('.job_info')
    if not job_infos:
        print('no job info')
        return False
    print(len(job_infos))
    for job_info in job_infos:
        link = job_info.find('a')['href']
        link = format_url(link)
        job_id = get_id(link)
        job_name = format_text(job_info.select('.job_name')[0].text)
        job_salary = format_text(job_info.select('.job_salary')[0].text)
        job_request = format_text(job_info.select('.job_request')[0].text)
        job_req = job_request.split('|')
        ad = format_text(job_req[0]) if job_req else ''
        exp = format_text(job_req[1]) if len(job_req) > 1 else ''
        edu = format_text(job_req[2]) if len(job_req) > 2 else ''
        ads = ad.split('-')
        city = ads[0] if ads else ''
        area = ads[1] if len(ads) > 1 else ''
        tipjs = [format_text(t.text) for t in job_info.select('.tipj')]
        tips = '/'.join(tipjs) if tipjs else ''
        job_ana = job_info.select('.job_ana')[0]
        company_link = job_ana.find('a')['href']
        company_link = format_url(company_link)
        company_id = get_id(company_link)
        contact = format_text(job_ana.select('.ellipsis_comUser')[0].text)
        cname = format_text(job_ana.select('.ellipsis_cname')[0].find('a').text)
        com_data = {
            'name': cname,
            'link': company_link,
            'contact': contact,
            'city_name': city,
            'area': area
        }
        set_cache(type='company', id=company_id, data=com_data)
        job_data = {
            'company_id': company_id,
            'link': link,
            'company_name': cname,
            'job_name': job_name,
            'job_salary': job_salary,
            'exp': exp,
            'edu': edu,
            'city': city,
            'area': area,
            'tips': tips
        }
        job_data = get_job_url(link, job_data=job_data)
        set_cache(type='job', id=job_id, data=job_data)
    return True
def get_company_page(url, company=None, job_datas=None):
    url = format_url(url)
    resp = session.get(url)
    soup = bf(resp.text, 'html.parser')
    com_infos = soup.select('.com_info')
    if not com_infos:
        # Return the pair unchanged; the original's bare return made the
        # caller's tuple unpack raise TypeError on empty pages.
        return company, job_datas
    com_name = format_select(soup, '.com_name', 0)
    company['com_name'] = com_name
    company['com_type'] = format_select(soup, '.com_type', 0)
    company['com_gm'] = format_select(soup, '.com_gm', 0)
    company['com_xz'] = format_select(soup, '.com_xz', 0)
    company['com_str'] = format_select(soup, '.com_str', 0)
    desc_more = format_select(soup, '.txt_more_box', 0)
    company['desc_more'] = desc_more.replace('\xa0', '').replace('\n', '').strip()
    address = format_select(soup, '.comAddresstxt', 0)
    company['address'] = address
    com_item = soup.select('.com_gs_item_desc')
    company['faren'] = format_text(com_item[1].text) if len(com_item) > 1 else ''
    company['zibenjin'] = format_text(com_item[2].text) if len(com_item) > 2 else ''
    company['website'] = format_text(com_item[3].text) if len(com_item) > 3 else ''
    num_txt = format_select(soup, '.tab_item', 1)
    num = get_num(num_txt)
    if not num:
        num = num_txt.replace('职位', '').replace('(', '').replace(')', '').strip()
    company['num'] = num
    job_names = [i.text for i in soup.select('.job_name')]
    company['job_names'] = '/'.join(job_names)
    job_infos = soup.select('.job_info')
    for job_info in job_infos:
        job_url = job_info.find('a')['href']
        job_url = format_url(job_url)
        job_id = get_id(link=job_url)
        jdata = {
            'link': job_url,
            'com_name': com_name,
            'job_name': format_select(job_info, '.job_name', 0),
            'job_salary': format_select(job_info, '.job_salary', 0),
            'job_are': format_select(job_info, '.job_are', 0),
            'job_years': format_select(job_info, '.job_years', 0),
            'job_edu': format_select(job_info, '.job_edu', 0),
            'ellipsis_comUser': format_select(job_info, '.ellipsis_comUser', 0),
            'address': address
        }
        jdata = get_job_url(job_url, job_data=jdata)
        job_datas[job_id] = jdata
    return company, job_datas
# * List search: company ID, company name, contact
# * Company profile, job list
# * Job details
def get_comp_data(url=None, com_datas=None, city_name=None):
    resp = session.get(url)
    soup = bf(resp.text, 'html.parser')
    job_infos = soup.select('.job_info')
    if not job_infos:
        print('no job info')
        return False, com_datas
    for job_info in job_infos:
        job_ana = job_info.select('.job_ana')[0]
        company_link = job_ana.find('a')['href']
        company_link = format_url(company_link)
        company_id = get_id(company_link)  # was misspelled 'compnay_id', a NameError
        if get_cache(type='company', id=company_id):
            continue
        contact = format_select(job_ana, '.ellipsis_comUser', 0)
        cname = format_text(job_ana.select('.ellipsis_cname')[0].find('a').text)
        city, area = get_city_area(job_info=job_info)
        com_datas[company_id] = {
            'name': cname,
            'link': company_link,
            'contact': contact,
            'city_name': city_name,
            'city': city,
            'area': area
        }
    return True, com_datas
def get_company_list_datas(data=None):
    ct, city_name = data
    print(city_name)
    com_datas = {}
    job_datas = {}
    session.cookies['city'] = ct
    page = 1
    while True:
        url = job_url.format(page)
        page += 1
        status, com_datas = get_comp_data(url=url, city_name=city_name, com_datas=com_datas)
        if not status:
            break
        time.sleep(1)
    print(f'company_length: {len(com_datas)}')
    with open(f'company/companys_{city_name}.json', 'w') as f:
        json.dump(com_datas, f)
    company_headers = ['省份', '地级市', '区县', '详细地址', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称']
    job_headers = ['公司名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
    com_result = []
    job_result = []
    if com_datas:
        for cid, company in com_datas.items():
            url = company.get('link')
            if url:
                company, job_datas = get_company_page(url, company=company, job_datas=job_datas)
                comp_data = set_company_info(company=company)
                com_result.append(comp_data)
    print(job_datas)
    print(type(job_datas))
    if job_datas:
        for j, job in job_datas.items():
            data = set_job_info(job, city_name)
            print(data)
            job_result.append(data)
    # Earlier CSV-writing variant, kept for reference:
    # with open(f'company/company_info_{city_name}.csv', 'w') as f1, open(f'city/city_info_{city_name}.csv', 'w') as f2:
    #     f1.write(','.join(company_headers) + '\n')
    #     f2.write(','.join(job_headers) + '\n')
    #     for cid, company in com_datas.items():
    #         url = company.get('link')
    #         if url:
    #             company, job_datas = get_company_page(url, company=company, job_datas=job_datas)
    #             comp_data = set_company_info(company=company)
    #             com_result.append(comp_data)
    #             print(comp_data)
    #             f1.write(','.join(comp_data) + '\n')
    #     if job_datas:
    #         for job in job_datas:
    #             data = set_job_info(job, city_name)
    #             print(data)
    #             job_result.append(data)
    #             f2.write(','.join(data) + '\n')
    print(f'RESULT_COMPANY: {len(com_result)}')
    print(f'JOB_RESULT: {len(job_result)}')
    if com_result:
        with open(f'company/company_result_{city_name}.json', 'w') as f1:
            json.dump(com_result, f1)
    if job_result:
        with open(f'job/job_result_{city_name}.json', 'w') as f2:
            json.dump(job_result, f2)
    return True
def get_job_list(data=None):
    ct, city_name = data
    print(city_name)
    com_datas = {}
    job_datas = {}
    session.cookies['city'] = ct
    page = 1
    while True:
        print(page)
        url = job_url.format(page)
        page += 1
        # The original called get_company_list_datas(url=..., city_name=...),
        # which takes a (city_code, city_name) tuple; the per-page list
        # fetcher with this signature is get_job_list_datas.
        status = get_job_list_datas(url, city_name=city_name)
        if not status:
            break
        print(len(com_datas))
        print(len(job_datas))
        print('-' * 50)
    with open(f'json/company-sample_{city_name}.json', 'w') as x1:
        json.dump(com_datas, x1)
    with open(f'json/job-sample_{city_name}.json', 'w') as x2:
        json.dump(job_datas, x2)
def save_company():
    print('Saving company info')
    headers = ['省份', '地级市', '区县', '详细地址', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称']
    with open('company.csv', 'w') as f:
        f.write(','.join(headers) + '\n')
        for key in r.scan_iter(match='597_company*'):
            res = json.loads(r.get(key))
            print(res)
            # set_company_info() already matches the header order; the original
            # used field names from a sibling script's schema and re-joined
            # every cached job's name for every single company.
            data = [str(i) if i is not None else '-' for i in set_company_info(company=res)]
            f.write(','.join(data) + '\n')

def save_job():
    print('Saving jobs')
    headers = ['公司名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
    # Keys as actually stored by get_job_list_datas/get_job_url; the contact
    # and address columns come from the cached company record.
    items = ('company_name', 'job_name', 'job_salary', 'city', 'exp', 'edu',
             'job_salary', 'tips', 'job_desc_txt')
    with open('jobs.csv', 'w') as f:
        f.write(','.join(headers) + '\n')
        for key in r.scan_iter(match='597_job*'):
            res = json.loads(r.get(key))
            print(res)
            company = get_cache(type='company', id=res.get('company_id'))
            data = [str(res.get(item, '-')) for item in items]
            data.append(str(company.get('contact', '-')))
            data.append(str(company.get('address', '-')))
            f.write(','.join(data) + '\n')
def get_city_list():
    # searchCitys entries are '城市名|code' pairs; the code goes into the
    # 'city' cookie ('www' selects the nationwide site).
    yield ('www', '全国')
    for i in searchCitys:
        xs = i.split('|')
        city_name = xs[0] if xs else ''
        ct = xs[1] if len(xs) > 1 else ''
        if ct:
            yield (ct, city_name)

def main():
    # with Pool(4) as p:
    #     res = p.map(get_company_list_datas, get_city_list())
    for i in get_city_list():
        get_company_list_datas(i)
    # get_job_list()

main()
# Offline merge of the per-city JSON results into two CSVs, kept for reference:
# comp_headers = ['省份', '地级市', '区县', '详细地址', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称']
# job_headers = ['公司名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
# import os
# comp_path = '/home/tonytan/Downloads/jobs_search/597网站/company'
# # 'company/company_result_{city_name}.json'
# with open('company_all.csv', 'w') as f:
#     f.write(','.join(comp_headers) + '\n')
#     for file in os.listdir(comp_path):
#         if 'json' in file:
#             print(file)
#             filepath = os.path.join(comp_path, file)
#             with open(filepath, 'r') as f1:
#                 datas = json.load(f1)
#             print(datas)
#             if datas:
#                 for data in datas:
#                     if data:
#                         print(data)
#                         data = [i or '-' for i in data]
#                         f.write(','.join(data) + '\n')
# job_path = '/home/tonytan/Downloads/jobs_search/597网站/job'
# # 'job/job_result_{city_name}.json'
# with open('job_all.csv', 'w') as f:
#     f.write(','.join(job_headers) + '\n')
#     for file in os.listdir(job_path):
#         if 'json' in file:
#             print(file)
#             filepath = os.path.join(job_path, file)
#             with open(filepath, 'r') as f1:
#                 datas = json.load(f1)
#             print(datas)
#             if datas:
#                 for data in datas:
#                     if data:
#                         print(data)
#                         data = [i or '-' for i in data]
#                         f.write(','.join(data) + '\n')
# ============================ chinajsjob.com scraper ============================
import requests
from bs4 import BeautifulSoup as bf
import re
import json
import time
from city_list import searchCitys
from multiprocessing.dummy import Pool
import redis

page_url = 'http://www.chinajsjob.com/job/sort528_px0_p{}.shtml'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'ASP.NET_SessionId=dfzkw1zgek2jobo1cmb1135m; SECKEY_ABVK=br9hg1wn5hFZWVreAzXQVijHrwJN07ofrJDzeaMRCA0%3D; BMAP_SECKEY=Ftud9bx-REipUcl-ZQmatvyH_fYD-3IJUtW5VsGJFAZmTN8vQbak_7LfYDzS-2L0BU4wMDQ-Pwd-ZpmwYKAulQ6VzBm5eaZubn2dDuNnZNkrsDFRIKzHg5PcgPTp5qOEE7jrjHh7Quv1TSKl3h0aGOsqEhV0lxsj3DTA7o7rroA',
    'Host': 'www.chinajsjob.com',
    'Referer': 'http://www.chinajsjob.com/job/sort528_px0_p1.shtml',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}

# Text normalisation for CSV output.
def format_text(text):
    try:
        text = str(text)
        text = text.replace('\xa0', '').replace('\n', '').replace(',', '.').replace('\t', '').replace('\r', '')
        return text.strip()
    except Exception:
        return ''

def format_url(url):
    if url.startswith('//'):
        url = f"https:{url}"
    return url

def get_id(link):
    return link.split('/')[-1].split('.')[0]

def get_num(text):
    match = re.match(r'\d+', text)
    if match:
        return int(match.group())
    return None

href = 'http://www.chinajsjob.com'
header = ['省份', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
def job_page(page):
    # Job detail page: description keywords, description text, work address.
    if href not in page:
        page = f'{href}{page}'
    resp = requests.get(page, headers=headers)
    soup = bf(resp.text, 'html.parser')
    data = {}
    keyword_items = soup.select('.jjText')
    keywords = [format_text(i.text) for i in keyword_items]
    data['keyword'] = '/'.join(keywords)
    data['job_desc'] = format_text(soup.select('.zwmsCon')[0].text)
    data['address'] = format_text(soup.select('.gzddCon')[0].select('span')[0].text)
    return data

def company_page(page):
    # Company detail page: number of open positions and company description.
    if href not in page:
        page = f'{href}{page}'
    resp = requests.get(page, headers=headers)
    soup = bf(resp.text, 'html.parser')
    data = {}
    job_txt = format_text(soup.select('.zzzw')[0].text)
    job_txt = job_txt.split('等')[-1]
    data['job_num'] = format_text(job_txt)
    data['company_desc'] = format_text(soup.select('.des')[0].text)
    return data

def list_item_page(item):
    # Tag.href is not a BeautifulSoup attribute; hrefs live in the attrs dict.
    job_url = item.select('.zwTop')[0].select('a')[0].get('href', '')
    job_name = format_text(item.select('.zwbt')[0].text)
    salary = format_text(item.select('.price')[0].text)
    city = format_text(item.select('.cs')[0].text)
    exp = format_text(item.select('.yq')[0].select('span')[2].text)
    edu = format_text(item.select('.yq')[0].select('span')[3].text)
    company_name = format_text(item.select('.comName')[0].text)
    company_url = item.select('.comName')[0].get('href', '')
    industry = format_text(item.select('.comsm')[0].select('span')[2].text)
    contact = format_text(item.select('.lxrcon')[0].text)
    job_page_data = job_page(job_url)
    company_page_data = company_page(company_url)
    data = {
        'province': '江苏省',
        'company_name': company_name,
        'industry': industry,
        'company_desc': company_page_data.get('company_desc'),
        'product_desc': '',
        'gongshang': f'公司名称: {company_name}',
        'address': job_page_data.get('address'),
        'job_num': company_page_data.get('job_num', 1),
        'job_names': '',
        'job_name': job_name,  # was misspelled 'job_nmae' (also in key_items below)
        'salary': salary,
        'city': city,
        'exp': exp,
        'edu': edu,
        'salary1': salary,
        'keyword': job_page_data.get('keyword'),
        'job_desc': job_page_data.get('job_desc'),
        'contact': contact,
    }
    return data
datas = []

def list_page(page):
    try:
        url = page_url.format(page)
        resp = requests.get(url, headers=headers)
        soup = bf(resp.text, 'html.parser')
        job_items = soup.select('.zwLeft')[0].find_all('li')
        for item in job_items:
            data = list_item_page(item)
            datas.append(data)
        return True
    except Exception:
        # Any failure (including running past the last page) ends the crawl.
        return False

page = 1
while True:
    res = list_page(page)
    if not res:
        break
    page += 1
    print(len(datas))

key_items = ('province', 'company_name', 'industry', 'company_desc', 'product_desc', 'gongshang', 'address', 'job_num', 'job_names', 'job_name', 'salary', 'city', 'exp', 'edu', 'salary1', 'keyword', 'job_desc', 'contact', 'address')
with open('jobs.csv', 'w') as f:
    f.write(','.join(header) + '\n')
    for data in datas:
        # str() guards against the int default for job_num and None values.
        d = [str(data.get(i) or '') for i in key_items]
        f.write(','.join(d) + '\n')
# ============================ hbggzp.cn API scraper ============================
import requests
import random
import time  # used by the retry loops below; the original never imported it

# The original defined these as a dict whose 'https' key repeated thirteen
# times, so only the last proxy survived. A list allows random selection.
proxy_list = [
    '36.6.144.239:8089',
    '36.6.145.194:8089',
    '39.98.197.238:80',
    '36.6.144.156:8089',
    '183.239.68.61:4780',
    '1.15.156.141:7890',
    '36.6.145.46:8089',
    '36.6.144.114:8089',
    '114.106.171.45:8089',
    '223.247.47.100:8089',
    '111.225.153.56:8089',
    '36.6.144.131:8089',
    '36.6.144.192:8089',
]
headers_list = [
    {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 10; SM-G981B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.109 Safari/537.36 CrKey/1.54.248666'
    }, {
        'user-agent': 'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.188 Safari/537.36 CrKey/1.54.250320'
    }, {
        'user-agent': 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/10.0.9.2372 Mobile Safari/537.10+'
    }, {
        'user-agent': 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/7.2.1.0 Safari/536.2+'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)'
    }, {
        'user-agent': 'Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 11; Pixel 3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
    }, {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
    }, {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
    }, {
        'user-agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
    }
]
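
# The three dicts below document the hbggzp.cn API schema: list_items maps the
# site's opaque field codes (AAB004, ACB217, ...) to readable names, while
# company_items and job_items are sample response payloads kept for reference.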
list_items = {
    "UCB001": 'job_id',
    "AAB004": 'company_id',
    "AAB299REMARK": 'city',
    "AAE006": 'address',
    "ACB200": 'company_id',
    "ACB215REMARK": 'hangye',
    "ACB217": 'job_name',
    "ACB21AREMARK": 'salary',
    "ACB244": 'tags',
    "AAE004": 'contact',
    "AAE005": 'mobile',
}
company_items = {
    "AAB004": "鹤峰县铭创餐饮服务有限公司",
    "AAB020REMARK": "有限责任(公司)",
    "AAB022REMARK": "餐饮业",
    "AAB299REMARK": "鹤峰县",
    "AAE006": "鹤峰县后坝路27号",
    "AAE004": "刘选明",
    "AAC003_CJ": "黄恩",
    "AAB004_GL": "湖北省劳动就业服务中心",
    "AppletsLink": "https://www.hbggzp.cn/xcx/enterprise?detailId=558364dadb964771b2090555f2a21fb2"
}
job_items = {
    "AAB004": "鹤峰县铭创餐饮服务有限公司",
    "AAB004_GL": "湖北省劳动就业服务中心",
    "AAB022REMARK": "餐饮业",
    "AAB299REMARK": "鹤峰县",
    "AAB301REMARK": "鹤峰县",
    "AAE004": "鑫鑫",
    "AAE005": "15586626667",
    "AAE006": "鹤峰县后坝路27号",
    "ACB202": "容美镇后坝",
    "AAB092": "简介",
    "ACB215REMARK": "洗碗工",
    "ACB216": "服从管理,吃苦耐劳 ,身体健康,能吃苦耐劳,遵纪守法,",
    "ACB217": "洗碗工",
    "ACB21AREMARK": "2000-3000元",
    "ACB239REMARK": "全职",
    "UCE465": "服从管理,吃苦耐劳 ,身体健康,能吃苦耐劳,遵纪守法,",
}
list_page_url = 'https://www.hbggzp.cn/PER/JA/COMMON/COMPANY/JOB/V1/QUERY?KEY=&AAB301=420000&ACB215=&UCB005=&ACB21A=&ACB239=&UCE466E=&ACC217=&AAC011=&PAGE={}&LINAGE={}'
job_page_url = 'https://www.hbggzp.cn/CACHE/PER/JA/COMMON/COMPANY/JOB/V1/GETINFO?ACB200={}'
company_page_url = 'https://www.hbggzp.cn/CACHE/PER/JA/COMMON/COMPANY/BASE/V1/GETINFO?UCB001={}'
headers = {
    'Referer': 'https://www.hbggzp.cn/views/PostSearch/homeIndex.html?t=2',
    # 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Host': 'www.hbggzp.cn'
}
job_datas = {}
company_datas = {}

def list_request(url):
    user_agent = random.choice(headers_list)
    headers.update(user_agent)
    # resp = requests.get(url, headers=headers, proxies={'https': random.choice(proxy_list)}, timeout=20)
    resp = requests.get(url)
    res = resp.json()
    print(res)
    result = res.get('Result', {}).get('Rows', [])
    for row in result:
        data = {j: row.get(k, '') for k, j in list_items.items()}
        job_id = data.get('job_id')
        company_id = data.get('company_id')
        job_datas[job_id] = data
        company_datas[company_id] = data

pages = []
size = 20
# 5158 was the total number of postings at crawl time.
length = int(5158 / size) + 1
for i in range(1, length + 1):
    print(i)
    if i in pages:
        continue
    url = list_page_url.format(i, size)
    list_request(url)
    pages.append(i)
# Note the deliberate swap: the company detail endpoint is keyed by UCB001
# (the job id) and the job detail endpoint by ACB200 (the company id), per
# the list_items mapping above, so the detail loops index by the opposite id.
companys = job_datas
jobs = company_datas

def company_request(url, key):
    rtimes = 0
    while rtimes < 4:
        try:
            # 'times' in the original is not a requests argument; timeout is meant.
            resp = requests.get(url=url, timeout=20)
            res = resp.json()
            print(res)
            result = res.get('Result', {})
            try:
                companys[key].update(result)
            except KeyError:
                companys[key] = result
            return
        except Exception:
            rtimes += 1
            time.sleep(1)

t = 1
for i, d in companys.items():
    if 'AAB019REMARK' in d:
        continue
    url = company_page_url.format(i)
    company_request(url, i)  # pass the key explicitly instead of the loop global
    print(t)
    t += 1

def job_request(url, key):
    rtimes = 0
    while rtimes < 4:
        try:
            resp = requests.get(url=url, timeout=20)
            res = resp.json()
            print(res)
            result = res.get('Result', {})
            try:
                jobs[key].update(result)
            except KeyError:
                jobs[key] = result
            return
        except Exception:
            rtimes += 1
            time.sleep(1)

t = 1
for i, d in jobs.items():
    if 'AAB019REMARK' in d:
        continue
    url = job_page_url.format(i)
    job_request(url, i)
    print(t)
    t += 1
# ============================ job5156.com scraper ============================
import requests
from bs4 import BeautifulSoup as bf
import re
import json
import time
from multiprocessing.dummy import Pool

# Text normalisation for CSV output.
def format_text(text):
    try:
        text = str(text).replace('\xa0', '').replace('\n', '').replace(',', '.')
        return text.strip()
    except Exception:
        return ''

def format_url(url):
    if url.startswith('//'):
        url = f"https:{url}"
    return url

def get_id(link):
    return link.split('/')[-1].split('.')[0]

def get_num(text):
    match = re.match(r'\d+', text)
    if match:
        return int(match.group())
    return None

def get_city_area(job_info):
    job_request = format_text(job_info.select('.job_request')[0].text)
    job_req = job_request.split('|')
    ad = format_text(job_req[0]) if job_req else ''
    ads = ad.split('-')
    city = ads[0] if ads else ''
    area = ads[1] if len(ads) > 1 else ''
    return city, area

def set_gongshang_info(data=None):
    res = f'公司名称:{data.get("com_name", "")}. 法人: {data.get("faren", "")}.注资: {data.get("zibenjin", "")}.官网: {data.get("website", "")}'
    return res
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'Client': 'PC',
    'Content-Type': 'application/json;charset=UTF-8',
    'Cookie': 'uuid=0eed4eb033203d4625f8a345a66ebe03; uuid.sig=7dq3I2MyMVLywODf5wNpFT0M8ynyrT0g8N3qmIoYNQ0; cf99fb1bff6494fb3dc3e887223abd55=c7d4c9a5a963820712513e1d100d3aab; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188fab415f3c88-08c4cb1d4988be8-13462c6c-2073600-188fab415f486c%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg4ZmFiNDE1ZjNjODgtMDhjNGNiMWQ0OTg4YmU4LTEzNDYyYzZjLTIwNzM2MDAtMTg4ZmFiNDE1ZjQ4NmMifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22188fab415f3c88-08c4cb1d4988be8-13462c6c-2073600-188fab415f486c%22%7D; uvcookie=ffdc40e600b04eddba474be1b31f22f3; jscookie=d8fe9c5c47b33eb5c1713ef89736ea6a; orginflag=1; accessflag=1; 8aa932cc6211ccbe21af0c963b3e3415=3706ba89e1b89e7d7fb3138eb20e5eb2; Hm_lvt_6cc160b93b871a4884a8cd8dc4addcd3=1687833287; d87a445a1f22d65370f406844dc1e56a=be9bce0c00d065fb057cdc512f037a7f; sensorsTrackObj={%22from%22:%22%E6%90%9C%E7%B4%A2%22%2C%22index_number%22:9%2C%22page_number%22:6}; keyCityCode=14012700; keyTypeCode=10030010%2C10030010; Hm_lpvt_6cc160b93b871a4884a8cd8dc4addcd3=1687846607; pvcount="targetUrl=https%3A%2F%2Fwww.job5156.com%2Fs%2Fsearch%2F%3FkeywordType%3D%26keyword%3D~fromSourceType=2"',
    'Referer': 'https://www.job5156.com/s/search/?keywordType=&keyword=',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Linux"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
}
data = {
    't': 1687833340449,
    'pn': 6,
    'keyword': '',
    'keywordType': 0,
    'posTypeList': '',
    'locationList': '',
    'taoLabelList': '',
    'degreeFrom': '',
    'propertyList': '',
    'industryList': '',
    'sortBy': 0,
    'urgentFlag': '',
    'comIdList': '',
    'locationAddrStr': '',
    'salary': '',
}
p_url = 'https://www.job5156.com/s/result'
job_url = 'https://api.qlrc.com/personal/Job?JobId={}'
job_ids = []
datas = []

def list_page(page):
    # One search-result page. The original sent the payload via
    # requests.get(..., data=...); given the JSON Content-Type header the
    # endpoint presumably expects a POST body, so that is assumed here.
    data['pn'] = str(page)
    data['t'] = str(int(time.time() * 1000))
    resp = requests.post(p_url, json=data, headers=headers)
    res = resp.json()
    job_list = res.get('posData', {}).get('posItems', [])
    print(len(job_list))
    for item in job_list:
        # 'posId' is an assumed field name for the job id in the list payload;
        # the original never extracted ids at all, so job_ids stayed empty.
        job_id = item.get('posId')
        if job_id:
            job_ids.append(job_id)
    return bool(job_list)

def job_page(job_id):
    url = job_url.format(job_id)
    resp = requests.get(url, headers=headers)
    res = resp.json()
    job = res.get('job', {})
    comp = job.get('cpMain', {})
    ca_main = job.get('caMain', {})
    data = ['山东省', comp.get('name', '-'), comp.get('industry', ''), comp.get('brief', '-'), '-',
            comp.get('name', '-'), comp.get('address', '-'), comp.get('jobCount', 1), '-',
            job.get('name', '-'), job.get('salary', '-'), job.get('city', '-'),
            job.get('experience', '-'), job.get('degree', '-'), job.get('salary', '-'),
            job.get('jobKeyWord', '-'), job.get('demand', '-'), ca_main.get('name', '-'),
            comp.get('address', '-')]
    data = [format_text(i) for i in data]
    print(data)
    datas.append(data)

header = ['省份', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
for i in range(1, 101):
    if not list_page(i):
        break
print(job_ids)
with Pool(4) as p:
    p.map(job_page, job_ids)
# Write the collected rows once; the original wrote only the header and then
# scraped every job a second time in a redundant sequential loop.
with open('job.csv', 'w') as f:
    f.write(','.join(header) + '\n')
    for row in datas:
        f.write(','.join(row) + '\n')
# ============================ qlrc.com scraper ============================
import requests
from bs4 import BeautifulSoup as bf
import re
import json
import time
from multiprocessing.dummy import Pool

# Text normalisation for CSV output.
def format_text(text):
    try:
        text = str(text).replace('\xa0', '').replace('\n', '').replace(',', '.')
        return text.strip()
    except Exception:
        return ''

def format_url(url):
    if url.startswith('//'):
        url = f"https:{url}"
    return url

def get_id(link):
    return link.split('/')[-1].split('.')[0]

def get_num(text):
    match = re.match(r'\d+', text)
    if match:
        return int(match.group())
    return None

def get_city_area(job_info):
    job_request = format_text(job_info.select('.job_request')[0].text)
    job_req = job_request.split('|')
    ad = format_text(job_req[0]) if job_req else ''
    ads = ad.split('-')
    city = ads[0] if ads else ''
    area = ads[1] if len(ads) > 1 else ''
    return city, area

def set_gongshang_info(data=None):
    res = f'公司名称:{data.get("com_name", "")}. 法人: {data.get("faren", "")}.注资: {data.get("zibenjin", "")}.官网: {data.get("website", "")}'
    return res
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'Client': 'PC',
    'Content-Type': 'application/json;charset=UTF-8',
    'Origin': 'https://www.qlrc.com',
    'Referer': 'https://www.qlrc.com/',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Linux"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'Subsiteinfo': '%7B%22id%22%3A32%2C%22provinceID%22%3A32%2C%22subSiteName%22%3A%22%E9%BD%90%E9%B2%81%E4%BA%BA%E6%89%8D%E7%BD%91%22%2C%22subSiteCity%22%3A%22%E5%B1%B1%E4%B8%9C%22%2C%22subSiteUrl%22%3A%22https%3A%2F%2Fwww.qlrc.com%22%2C%22isSecond%22%3Afalse%2C%22pcUrl%22%3A%22www.qlrc.com%22%2C%22h5Url%22%3A%22m.qlrc.com%22%2C%22isWechatValid%22%3Atrue%2C%22beian%22%3A%22%E4%BA%ACICP%E5%A4%8712005109%E5%8F%B7-17%22%2C%22pinyin%22%3A%22shandong%22%2C%22servicePhone%22%3A%220531-68961040%22%2C%22dataWay%22%3A%22%E7%BC%93%E5%AD%98%E6%95%B0%E6%8D%AE%22%2C%22isMobile%22%3Afalse%2C%22isIE%22%3Afalse%2C%22searchServer%22%3A%224%22%7D'
}
data = {
    "OrderBy": 0,
    "jobTypeId": "",
    "dcMajorID": "",
    "regionId": "32",
    "industryId": "",
    "mapPlaceId": 0,
    "minSalary": 0,
    "maxSalary": 0,
    "minSalaryvalue": "",
    "maxSalaryvalue": "",
    "experienceId": "",
    "replyRate": 0,
    "autoReplyDay": 0,
    "isNegotiable": False,
    "educationId": "",
    "employType": "",
    "keyWord": "",
    "Page": 2,
    "companySizeId": "",
    "welfare": "",
    "isOnline": False,
    "distance": 0,
    "onlyEnv": False,
    "selectRegionId": "32",
    "filterWelfare": [],
    "filterJobType": None,
    "filterRegion": None,
    "filterEducation": None,
    "filterSalary": {},
    "filterExperience": None,
    "filterEmployType": None,
    "filterCompanySize": None,
    "filterReplyRate": None,
    "serverNo": "4",
    "city": {
        "id": "",
        "value": ""
    },
    "district": {
        "id": "",
        "value": ""
    },
    "place": {
        "id": "",
        "value": ""
    },
    "firstJob": {
        "id": "",
        "value": ""
    },
    "secondJob": {
        "id": "",
        "value": ""
    },
    "regionType": 1,
    "jobType": 1,
    "showWords": ""
}
p_url = 'https://so.qlrc.com/job/'
job_url = 'https://api.qlrc.com/personal/Job?JobId={}'
job_datas = {}
company_datas = {}
job_ids = set()

def list_page(page):
    # The page number belongs in the search payload, not the HTTP headers
    # (the original set headers['Page']).
    data['Page'] = page
    resp = requests.post(p_url, json=data, headers=headers)
    res = resp.json()
    job_list = res.get('jobList', [])
    for item in job_list:
        job_id = item.get('jobSecondId')
        job_ids.add(job_id)

datas = {}

def job_page(job_id):
    resp = requests.get(job_url.format(job_id), headers=headers)
    res = resp.json()
    job = res.get('job', {})
    comp = job.get('cpMain', {})
    ca_main = job.get('caMain', {})
    data = ['山东省', comp.get('name', '-'), comp.get('industry', ''), comp.get('brief', '-'), '-',
            comp.get('name', '-'), comp.get('address', '-'), comp.get('jobCount', 1), '-',
            job.get('name', '-'), job.get('salary', '-'), job.get('city', '-'),
            job.get('experience', '-'),
            job.get('degree', '-'),  # was 'jon.get', a typo
            job.get('salary', '-'), job.get('jobKeyWord', '-'), job.get('demand', '-'),
            ca_main.get('name', '-'), comp.get('address', '-')]
    data = [format_text(i) for i in data]
    datas[job_id] = data
    with open('job.csv', 'a') as f:
        f.write(','.join(data) + '\n')

header = ['省份', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
with open('job.csv', 'a') as f:
    f.write(','.join(header) + '\n')
# Collect ids first; the original never invoked list_page, leaving job_ids
# empty. 100 pages is an assumed upper bound.
for page in range(1, 101):
    list_page(page)
with Pool(4) as p:
    p.map(job_page, job_ids)
# ============================ sdbys.com scraper (Selenium) ============================
import redis
import json
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Canonical location of Options (the original imported it indirectly
# via selenium.webdriver.chrome.webdriver).
from selenium.webdriver.chrome.options import Options
from multiprocessing.dummy import Pool

page_url = 'http://www.sdbys.com/job/search?page={}'
job_url = 'http://www.sdbys.com/job/view/id/{}'
company_url = 'http://www.sdbys.com/companydetail/view/id/{}'
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
r.flushdb()

def format_text(text):
    try:
        text = str(text).replace('\xa0', '').replace(',', '.').replace('\n', '')
        return text.strip()
    except Exception:
        return ''

def format_url(url):
    if url.startswith('//'):
        url = f"https:{url}"
    return url

def get_id(link):
    return link.split('/')[-1].split('.')[0]
def get_cache(type=None, id=None):
    if type == 'job':
        name = f'sdpys_job_{id}'
    else:
        name = f'sdpys_company_{id}'
    res = r.get(name)
    if res:
        return json.loads(res)
    r.set(name, json.dumps({}))
    return {}

def set_cache(type=None, id=None, data=None):
    if type == 'job':
        name = f'sdpys_job_{id}'
    else:
        name = f'sdpys_company_{id}'
    r_data = get_cache(type=type, id=id)
    r_data.update(data)
    r.set(name, json.dumps(r_data))

def selenium_options():
    options = Options()
    # Chinese locale
    options.add_argument('lang=zh_CN.UTF-8')
    options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"')
    options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
    options.add_argument('--headless')
    prefs = {
        'profile.default_content_setting_values': {
            'notifications': 2  # block notification prompts
        }}
    options.add_experimental_option('prefs', prefs)
    options.add_argument('--no-sandbox')
    return options

options = selenium_options()
driver = webdriver.Chrome(options=options)
def list_page_datas(page=None):
    print(page)
    driver.get(page)
    time.sleep(3)
    items = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div/div/div[1]/ul/li')
    for item in items:
        company_name = item.find_element(By.XPATH, './div[2]/div/div[2]/a').text
        company_name = format_text(company_name)
        company_url = item.find_element(By.XPATH, './div[2]/div/div[2]/a').get_attribute('href')
        company_id = get_id(company_url)
        job_name = item.find_element(By.XPATH, './div[2]/div/div[3]/a').text
        job_name = format_text(job_name)
        job_url = item.find_element(By.XPATH, './div[2]/div/div[3]/a').get_attribute('href')
        job_id = get_id(job_url)
        salary = item.find_element(By.XPATH, './div[2]/div/div[3]/span').text
        salary = format_text(salary)
        add = item.find_element(By.XPATH, './div[2]/div/div[4]/ul/li[1]').text
        print(add)
        adds = add.split('-')  # str.split always returns a list
        print(adds)
        province = format_text(adds[0]) if adds else ''
        city = format_text(adds[1]) if len(adds) > 1 else ''
        edu = item.find_element(By.XPATH, './div[2]/div/div[4]/ul/li[3]').text
        edu = format_text(edu)
        company_data = get_cache(type='company', id=company_id)
        if company_data:
            job_names = company_data.get('job_names', '')
            if job_name not in job_names:
                job_names += ',' + job_name
            company_data['job_names'] = job_names
            set_cache(type='company', id=company_id, data=company_data)
        else:
            company_data['id'] = company_id
            company_data['name'] = company_name
            company_data['url'] = company_url
            company_data['provice'] = province
            company_data['city'] = city
            company_data['job_names'] = job_name
            set_cache(type='company', id=company_id, data=company_data)
        job_data = get_cache(type='job', id=job_id)
        if job_data:
            continue  # the original returned here, abandoning the rest of the page
        job_data['id'] = job_id
        job_data['name'] = job_name
        job_data['company_name'] = company_name
        job_data['company_id'] = company_id
        job_data['job_url'] = job_url
        job_data['salary'] = salary
        job_data['province'] = province
        job_data['city'] = city
        job_data['edu'] = edu
        set_cache(type='job', id=job_id, data=job_data)
        print(job_data)
def job_page(job_data):
    print('job page')
    id = job_data.get('id')
    print(id)
    job_url = job_data.get('job_url')
    print(job_url)
    driver.get(job_url)
    time.sleep(2)
    try:
        # EC.visibility_of expects a single WebElement, not a find_elements list.
        select = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[5]/span')
        contact = WebDriverWait(driver, 10, 0.2).until(EC.visibility_of(select)).text
        contact = format_text(contact)
    except Exception:
        contact = ''
    try:
        tags = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[7]/span').text
        tags = format_text(tags)
    except Exception:
        tags = ''
    try:
        desc = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[2]/div/div').text
        desc = format_text(desc)
    except Exception:
        desc = ''
    try:
        exp = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[3]/span').text
        exp = format_text(exp)
    except Exception:
        exp = ''
    try:
        guanjianzi = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[1]/span').text
        guanjianzi = format_text(guanjianzi)
    except Exception:
        guanjianzi = ''
    data = {
        'contact': contact,
        'tags': tags,
        'desc': desc,
        'exp': exp,
        'gjz': guanjianzi
    }
    set_cache(type='job', id=id, data=data)
    print(data)
def company_page(company_data):
    id = company_data.get('id')
    print(id)
    company_url = company_data.get('url')
    driver.get(company_url)
    time.sleep(1)
    try:
        num = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[3]/div/div[1]/div/div/div[1]/div/div[2]/ul/li[1]/p[1]').text
        num = format_text(num)
    except Exception:
        num = ''
    try:
        content = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[3]/div/div[2]/div[2]/div[1]/div/div').text
        content = format_text(content)
    except Exception:
        content = ''
    try:
        address = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[3]/div/div[2]/div[2]/div[3]/div/p').text
        address = format_text(address)
    except Exception:
        address = ''
    try:
        hangye = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[3]/div/div[3]/div/div/div/div[2]/span').text
        hangye = format_text(hangye)
    except Exception:
        hangye = ''
    data = {
        'num': num,
        'content': content,
        'address': address,
        'hangye': hangye
    }
    set_cache(type='company', id=id, data=data)
    print(data)
def company_item():
    for key in r.scan_iter(match='sdpys_company*'):
        try:
            data = json.loads(r.get(key))
            print(data)
            yield data
        except Exception:
            continue

def job_item():
    for key in r.scan_iter(match='sdpys_job*'):
        try:
            data = json.loads(r.get(key))
            print(data)
            yield data
        except Exception:
            continue

def handle():
    # Phase 1: walk the paginated search results into the Redis cache.
    for i in range(1, 5260):
        try:
            page = page_url.format(i)
            list_page_datas(page=page)
        except Exception:
            continue
    # Phase 2: enrich each cached company, then each cached job.
    print('company ...')
    for i in company_item():
        try:
            company_page(i)
        except Exception:
            continue
    print('job...')
    for i in job_item():
        try:
            job_page(i)
        except Exception:
            continue
def save_company():
    print('Saving company info')
    headers = ['省份', '地级市', '区县', '详细地址', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称']
    items = ('provice', 'city', 'city', 'address', 'name', 'hangye', 'content', 'content', 'name', 'address', 'num')
    with open('companyx.csv', 'w') as f:
        f.write(','.join(headers) + '\n')
        for key in r.scan_iter(match='sdpys_company*'):
            res = json.loads(r.get(key))
            data = [str(res.get(item, '-')) for item in items]
            # 职位名称 column: use the names cached per company; the original
            # left this column blank after commenting out an O(n^2) job scan.
            data.append(str(res.get('job_names', '-')))
            f.write(','.join(data) + '\n')
    print('finished')

def save_job():
    print('Saving jobs')
    headers = ['公司名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
    items = ('company_name', 'name', 'salary', 'city', 'exp', 'edu', 'salary', 'gjz', 'desc', 'contact')
    with open('jobs.csv', 'w') as f:
        f.write(','.join(headers) + '\n')
        for key in r.scan_iter(match='sdpys_job*'):
            res = json.loads(r.get(key))
            company_id = res.get('company_id')
            company = get_cache(type='company', id=company_id)
            address = company.get('address') or '-'
            data = [str(res.get(item, '-')) for item in items]
            data.append(address)
            f.write(','.join(data) + '\n')
    print('finished')

handle()
save_company()
save_job()
# --- post-processing: parse locations out of the CSVs with jionlp ---
import csv
import jionlp as jio  # the original imported jio after its first use

# 'file' was never defined in the original; the jobs CSV written above
# (column 10 = work address) is the natural input.
file = 'jobs.csv'
with open(file, 'r') as f1, open('jobs1.csv', 'w') as f2:
    for i in csv.reader(f1):
        add = i[10]
        print(add)
        res = {}
        if add:
            res = jio.parse_location(add)
        province = res.get('province')
        city = res.get('city', '-')
        print(city)
        i.append(str(city))
        txt = ','.join(i) + '\n'
        f2.write(txt)

with open('jobs1.csv', 'r') as f1, open('jobs2.csv', 'w') as f2:
    times = 1
    for i in csv.reader(f1):
        print(times)
        times += 1
        add = i[0]
        print(add)
        res = {}
        if add:
            res = jio.parse_location(add)
        province = res.get('province')
        city = res.get('city', '-')
        print(city)
        i.append(str(city))
        txt = ','.join(i) + '\n'
        f2.write(txt)

with open('companyx.csv', 'r') as f1, open('company1.csv', 'w') as f2:
    times = 1
    for i in csv.reader(f1):
        print(times)
        times += 1
        add = i[3]
        print(add)
        res = {}
        if add:
            res = jio.parse_location(add)
        province = res.get('province', '-')
        city = res.get('city', '-')
        county = res.get('county', '-')  # jionlp's key is 'county' (区县), not 'country'
        i.extend([str(province), str(city), str(county)])
        txt = ','.join(i) + '\n'
        f2.write(txt)
# ============================ ynzp.com scraper (Selenium) ============================
# 云南招聘网 | |
import redis
import json
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from multiprocessing.dummy import Pool

page_url = 'https://www.ynzp.com/search/offer_search_result.aspx?jcity1Hidden=330000&page={}'
job_url = 'https://www.ynzp.com/job/{}.html'
company_url = 'https://www.ynzp.com/ent/{}.html'

r = redis.Redis(host='127.0.0.1', port=6379, db=3)
# r.flushdb()
JOB_NAME = 'ynzp_21_job_'
COMPANY_NAME = 'ynzp_21_company_'
# Normalize scraped text: strip NBSPs and newlines, and replace commas with
# periods so the hand-rolled CSV writers below stay well-formed.
def format_text(text):
    try:
        text = str(text).replace('\xa0', '').replace(',', '.').replace('\n', '')
        return text.strip()
    except Exception:
        return ''

def format_url(url):
    if url.startswith('//'):
        url = f"https:{url}"
    return url

def get_id(link):
    # e.g. 'https://www.ynzp.com/job/123456.html' -> '123456'
    id = link.split('/')[-1].split('.')[0]
    return id
# Read a cached job/company record from Redis; a miss seeds an empty record.
def get_cache(type=None, id=None):
    if type == 'job':
        name = f'{JOB_NAME}{id}'
    else:
        name = f'{COMPANY_NAME}{id}'
    res = r.get(name)
    if res:
        return json.loads(res)
    r.set(name, json.dumps({}))
    return {}

# Merge new fields into the cached record (read-modify-write).
def set_cache(type=None, id=None, data=None):
    if type == 'job':
        name = f'{JOB_NAME}{id}'
    else:
        name = f'{COMPANY_NAME}{id}'
    r_data = get_cache(type=type, id=id)
    r_data.update(data)
    r.set(name, json.dumps(r_data))
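# Example of the cache round trip (hypothetical id '123456'): set_cache()
# merges into the stored record rather than overwriting it, so partial
# scrapes of the same job accumulate into one JSON blob:
# set_cache(type='job', id='123456', data={'name': '测试岗位'})
# set_cache(type='job', id='123456', data={'salary': '5000-8000'})
# get_cache(type='job', id='123456')
# -> {'name': '测试岗位', 'salary': '5000-8000'}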
def selenium_options():
    options = Options()
    # request a Chinese locale
    options.add_argument('lang=zh_CN.UTF-8')
    options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"')
    # skip image downloads and run headless to speed up crawling
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_argument('--headless')
    prefs = {
        'profile.default_content_setting_values': {
            'notifications': 2
        }}
    options.add_experimental_option('prefs', prefs)
    options.add_argument('--no-sandbox')
    return options

options = selenium_options()
driver = webdriver.Chrome(options=options)
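# The three page functions below each recreate the global driver inline when
# driver.get() fails. A minimal sketch of that pattern factored into one
# helper (safe_get is a hypothetical name, not part of the original script;
# the page functions below keep their inline form):
def safe_get(url):
    global driver
    try:
        driver.get(url)
    except Exception:
        # the session died (crash/timeout); start a fresh browser and retry once
        driver = webdriver.Chrome(options=options)
        driver.get(url)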
def list_page_datas(page=None):
    global driver  # reassigned below if the browser session has died
    print(page)
    try:
        driver.get(page)
    except Exception:
        driver = webdriver.Chrome(options=options)
        driver.get(page)
    time.sleep(1)
    items = driver.find_elements(By.CLASS_NAME, 'V1Item')
    print(len(items))
    for i, item in enumerate(items):
        try:
            # re-fetch the node in case the DOM has been re-rendered
            item = driver.find_elements(By.CLASS_NAME, 'V1Item')[i]
            # the ASP.NET repeater numbers its controls ctl00, ctl01, ...
            company_name_node = f'//*[@id="ctl00_ContentPlaceHolder1_repJob_ctl{str(i).zfill(2)}_EntUrl"]'
            company_name = driver.find_element(By.XPATH, company_name_node).text
            company_name = format_text(company_name)
            company_url = driver.find_element(By.XPATH, company_name_node).get_attribute('href')
            company_id = get_id(company_url)
            job_name_node = f'//*[@id="ctl00_ContentPlaceHolder1_repJob_ctl{str(i).zfill(2)}_JobName"]'
            job_name = driver.find_element(By.XPATH, job_name_node).text
            job_name = format_text(job_name)
            job_url = driver.find_element(By.XPATH, job_name_node).get_attribute('href')
            job_id = get_id(job_url)
            try:
                salary = driver.find_elements(By.XPATH, '//*[@class="salaryStyle"]')[i].text
                salary = format_text(salary)
                print(salary)
            except Exception:
                salary = ''
            # each listing renders two JobInfo blocks, hence the i*2 stride
            node = driver.find_elements(By.CLASS_NAME, 'JobInfo')[i * 2]
            area = node.find_elements(By.TAG_NAME, 'span')[0].text
            print(area)
            edu = node.find_elements(By.TAG_NAME, 'span')[1].text
            exp = node.find_elements(By.TAG_NAME, 'span')[2].text
            company_data = get_cache(type='company', id=company_id)
            if company_data:
                job_names = company_data.get('job_names') or ''
                job_names = f'{job_names},{job_name}' if job_names else job_name
            else:
                company_data['id'] = company_id
                company_data['name'] = company_name
                company_data['url'] = company_url
                # company_data['provice'] = province
                company_data['city'] = area
                company_data['area'] = area
                job_names = job_name
            company_data['job_names'] = job_names
            print(company_data)
            set_cache(type='company', id=company_id, data=company_data)
            job_data = get_cache(type='job', id=job_id)
            job_data['id'] = job_id
            job_data['name'] = job_name
            job_data['company_name'] = company_name
            job_data['company_id'] = company_id
            job_data['job_url'] = job_url
            job_data['salary'] = salary
            # job_data['province'] = province
            # job_data['city'] = city
            job_data['edu'] = edu
            job_data['exp'] = exp
            set_cache(type='job', id=job_id, data=job_data)
            print(job_data)
        except Exception:
            # skip a malformed listing instead of abandoning the whole page
            continue
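# For reference, the zfill(2) above yields repeater control ids such as
#   ctl00_ContentPlaceHolder1_repJob_ctl00_JobName  (i = 0)
#   ctl00_ContentPlaceHolder1_repJob_ctl07_EntUrl   (i = 7)
# i.e. the ASP.NET list page numbers each row's links ctl00, ctl01, ...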
def job_page(job_data):
    global driver  # reassigned below if the browser session has died
    print('job page')
    id = job_data.get('id')
    print(id)
    page = job_data.get('job_url')
    print(page)
    try:
        driver.get(page)
    except Exception:
        # driver.quit()
        driver = webdriver.Chrome(options=options)
        driver.get(page)
    time.sleep(2)
    try:
        contact = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_plEntContact"]/ul/li[1]/p').text
        # contact = WebDriverWait(driver, 10, 0.2).until(EC.visibility_of(select)).text
        # contact = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[5]/span').text
        contact = format_text(contact)
    except Exception:
        contact = ''
    try:
        tags = driver.find_element(By.XPATH, '//*[@id="divEntWant"]').text
        tags = format_text(tags)
    except Exception:
        tags = ''
    try:
        desc = driver.find_element(By.XPATH, '//*[@id="ShowJobContent"]/div[2]/div[2]/div[2]/ul/li[2]/div').text
        desc = format_text(desc)
    except Exception:
        desc = ''
    # try:
    #     exp = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[3]/span').text
    #     exp = format_text(exp)
    # except Exception:
    #     exp = ''
    try:
        # keyword tags share the divEntWant node with `tags` above
        guanjianzi = driver.find_element(By.XPATH, '//*[@id="divEntWant"]').text
        guanjianzi = format_text(guanjianzi)
    except Exception:
        guanjianzi = ''
    try:
        hangye = driver.find_element(By.XPATH, '//*[@id="cell1"]').text
        hangye = format_text(hangye)
    except Exception:
        hangye = ''
    try:
        company_url = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_hlEntName"]').get_attribute('href')
        company_id = get_id(company_url)
    except Exception:
        company_id = ''
    data = {
        'contact': contact,
        'tags': tags,
        'desc': desc,
        # 'exp': exp,
        'gjz': guanjianzi,
        'hangye': hangye
    }
    set_cache(type='job', id=id, data=data)
    if company_id and hangye:
        # the industry field lives on the job page, so push it to the company too
        set_cache(type='company', id=company_id, data={'hangye': hangye})
    print(data)
def company_page(company_data):
    global driver  # reassigned below if the browser session has died
    # company_data = get_cache(type='company', id=id)
    id = company_data.get('id')
    print(id)
    page = company_data.get('url')
    try:
        driver.get(page)
    except Exception:
        driver = webdriver.Chrome(options=options)
        driver.get(page)
    time.sleep(1)
    try:
        num = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_V3ucentNav_hlJobs"]').text
        num = format_text(num)
    except Exception:
        num = ''
    try:
        content = driver.find_element(By.ID, 'EntIntro').find_elements(By.CLASS_NAME, 'content')[0].text
        content = format_text(content)
    except Exception:
        content = ''
    try:
        address = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_panNormal"]/div[1]/div[2]/div[1]/div[2]/ul/li[2]/div/ul/li[6]/p/span[2]').text
        address = format_text(address)
    except Exception:
        address = ''
    data = {
        'num': num,
        'content': content,
        'address': address,
    }
    set_cache(type='company', id=id, data=data)
    print(data)
# Yield cached company records one by one, skipping unreadable entries.
def company_item():
    company_keys = r.scan_iter(match=f'{COMPANY_NAME}*')
    times = 1
    for key in company_keys:
        try:
            d = r.get(key)
            data = json.loads(d)
            print(f'count: {times}')
            times += 1
            yield data
        except Exception:
            continue

# Same pattern for cached job records.
def job_item():
    job_keys = r.scan_iter(match=f'{JOB_NAME}*')
    times = 1
    for key in job_keys:
        try:
            d = r.get(key)
            data = json.loads(d)
            print(f'count: {times}')
            times += 1
            yield data
        except Exception:
            continue
def page_item():
    for i in range(1, 51):
        yield page_url.format(i)

# Threaded variant of the crawl, disabled because every worker would share
# the single global Chrome driver, which is not thread-safe; handle() below
# performs the same work serially.
# with Pool(4) as p:
#     p.map(company_page, company_item())
# with Pool(4) as p:
#     p.map(job_page, job_item())
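# A minimal sketch of how the pooled variant could be made safe, assuming one
# Chrome instance per worker thread via threading.local (worker_driver is a
# hypothetical helper, not part of the original script; the page functions
# would need to call it instead of touching the shared global driver):
import threading

_local = threading.local()

def worker_driver():
    # lazily start one headless Chrome per pool thread; multiprocessing.dummy
    # pools are thread pools, so threading.local gives per-worker storage
    if not hasattr(_local, 'driver'):
        _local.driver = webdriver.Chrome(options=selenium_options())
    return _local.driver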
def handle():
    for i in range(1, 51):
        page = page_url.format(i)
        list_page_datas(page=page)
    print('company ...')
    for i in company_item():
        company_page(i)
    print('job...')
    for i in job_item():
        job_page(i)
def save_company():
    print('Saving company info')
    headers = ['省份', '地级市', '区县', '详细地址', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称']
    # 'provice' is kept misspelled to match the key used when the records were cached
    items = ('provice', 'city', 'city', 'address', 'name', 'hangye', 'content', 'content', 'name', 'address', 'num')
    with open('company.csv', 'w') as f:
        f.write(','.join(headers) + '\n')
        for key in r.scan_iter(match=f'{COMPANY_NAME}*'):
            d = r.get(key)
            res = json.loads(d)
            print(res)
            # collect the names of this company's cached jobs (keys carry the
            # ynzp_21_job_ prefix, and only jobs pointing back at this company count)
            job_name_list = []
            for i in r.scan_iter(match=f'{JOB_NAME}*'):
                x = r.get(i)
                if x:
                    job = json.loads(x)
                    if job.get('company_id') == res.get('id'):
                        job_name_list.append(job.get('name') or '')
            job_names = '/'.join(job_name_list)
            data = [str(res.get(item, '-') or '-') for item in items]
            data.append(job_names)
            f.write(','.join(data) + '\n')
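# The nested scan above re-reads every cached job once per company. A minimal
# sketch of the same lookup built in a single pass (jobs_by_company is a
# hypothetical helper, not part of the original script):
from collections import defaultdict

def jobs_by_company():
    index = defaultdict(list)
    for key in r.scan_iter(match=f'{JOB_NAME}*'):
        raw = r.get(key)
        if not raw:
            continue
        job = json.loads(raw)
        if job.get('company_id'):
            index[job['company_id']].append(job.get('name') or '')
    return index

# save_company() could then build the index once and read
# index.get(res.get('id'), []) per company.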
def save_job():
    print('Saving jobs')
    headers = ['公司名称','岗位名称','薪资范围','工作城市','工作经验','学历要求','薪资待遇','职位描述关键词','职位描述','招聘者','工作地址']
    items = ('company_name', 'name', 'salary', 'city', 'exp', 'edu', 'salary', 'gjz', 'desc', 'contact')
    with open('jobs.csv', 'w') as f:
        f.write(','.join(headers) + '\n')
        for key in r.scan_iter(match=f'{JOB_NAME}*'):
            d = r.get(key)
            res = json.loads(d)
            print(res)
            company_id = res.get('company_id')
            company = get_cache(type='company', id=company_id)
            address = company.get('address', '-') or '-'
            data = [str(res.get(item, '-') or '-') for item in items]
            data.append(address)
            f.write(','.join(data) + '\n')
handle()
# save_company()
# save_job()