@U-Recommend
Last active June 27, 2023 08:35
Web crawlers
# 597.com Guangdong, PC version
import requests
from bs4 import BeautifulSoup as bf
import re
import json
import time
from city_list import searchCitys
from multiprocessing.dummy import Pool
import redis
p_url = 'https://gd.597.com/zhaopin/?page={}'
m_url = 'https://gd.597.com/job-{}.html'
job_url = 'https://gd.597.com/com-{}/'
total_company_datas = {}
total_job_datas = {}
session = requests.session()
session.cookies['city'] = 'www'
r = redis.Redis(host='127.0.0.1',port=6379,db=4)
# r.flushdb()
JOB_NAME = '597_gd_job_'
COMPANY_NAME = '597_gd_company_'
# Text formatting
def format_text(text):
try:
text = str(text).replace('\xa0', '').replace('\n','').replace(',','.')
return text.strip()
except:
return ''
def format_url(url):
if url.startswith('//'):
url = f"https:{url}"
return url
def get_id(link):
id = link.split('/')[-1].split('.')[0].replace('.html','')
return id
def get_num(text):
match = re.match(r'\d+', text)
if match:
number = int(match.group())
return number
return
def get_city_area(job_info):
job_request = format_text(job_info.select('.job_request')[0].text)
job_req = job_request.split('|')
ad = format_text(job_req[0]) if job_req else ''
ads = ad.split('-')
city = ads[0] if ads else ''
area = ads[1] if len(ads) > 1 else ''
return city, area
def set_gongshang_info(data=None):
res = f'公司名称:{data.get("com_name", "")}. 法人: {data.get("faren", "")}.注资: {data.get("zibenjin", "")}.官网: {data.get("website", "")}'
return res
GD_COMPANYS = {}
GD_JOBS = {}
def page_list(url):
resp = requests.get(url)
soup = bf(resp.text, 'html.parser')
job_items = soup.select('.firm-item')
for i, job_item in enumerate(job_items):
if i < 1:
continue
job_url = job_item.select('.des_title')[0]['href']
job_id = get_id(job_url)
job_name = job_item.select('.des_title')[0].text
company_url = job_item.select('.firm_md')[0]['href']
company_id = get_id(company_url)
company_name = job_item.select('.firm_md')[0].text
salary = job_item.find_all('li')[2].text
add = job_item.find_all('li')[3].text
city,area = add.split('-', 1)
exp = job_item.find_all('li')[4].text
GD_JOBS[job_id] = {
'job_id': job_id,
'job_name': job_name,
'job_url': job_url,
'company_id': company_id,
'salary': salary,
'city': city,
'area': area,
'exp': exp
}
if company_id not in GD_COMPANYS:
GD_COMPANYS[company_id] = {
'company_id': company_id,
'company_name': company_name,
'company_url': company_url,
'city': city,
'area': area
}
url = 'https://gd.597.com/zhaopin/?page=67'
page_list(url)
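# A minimal sketch of driving page_list over the whole listing via the otherwise
# unused p_url template and dumping the collected dicts; the page bound and the
# output file names here are assumptions, not part of the original script.
def crawl_all_pages(max_page=100):
    for page in range(1, max_page + 1):
        page_list(p_url.format(page))
        time.sleep(1)  # be gentle with the site
    with open('gd_jobs.json', 'w') as f:
        json.dump(GD_JOBS, f, ensure_ascii=False)
    with open('gd_companys.json', 'w') as f:
        json.dump(GD_COMPANYS, f, ensure_ascii=False)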
# https://sr.597.com/zhaopin/?page=1
import requests
from bs4 import BeautifulSoup as bf
import re
import json
import time
from city_list import searchCitys
from multiprocessing.dummy import Pool
import redis
p_url = 'https://sr.597.com/zhaopin/?page={}'
m_url = 'https://m.597.com/companyList/famous/?page={}'
job_url = 'https://m.597.com/companyjob.html?page={}'
total_company_datas = {}
total_job_datas = {}
session = requests.session()
session.cookies['city'] = 'www'
# Redis connection is missing in this script; assumed to match the one above (db=4)
r = redis.Redis(host='127.0.0.1', port=6379, db=4)
# Text formatting
def format_text(text):
try:
text = str(text).replace('\xa0', '').replace('\n','').replace(',','.')
return text.strip()
except:
return ''
def format_url(url):
if url.startswith('//'):
url = f"https:{url}"
return url
def get_id(link):
id = link.split('/')[-1].split('.')[0]
return id
def get_num(text):
match = re.match(r'\d+', text)
if match:
number = int(match.group())
return number
return
def get_city_area(job_info):
job_request = format_text(job_info.select('.job_request')[0].text)
job_req = job_request.split('|')
ad = format_text(job_req[0]) if job_req else ''
ads = ad.split('-')
city = ads[0] if ads else ''
area = ads[1] if len(ads) > 1 else ''
return city, area
def set_gongshang_info(data=None):
res = f'公司名称:{data.get("com_name", "")}. 法人: {data.get("faren", "")}.注资: {data.get("zibenjin", "")}.官网: {data.get("website", "")}'
return res
def set_company_info(company):
data = ['',
company.get('city_name','-'),
company.get('area','-'),
company.get('address','-'),
company.get('com_name','-'),
company.get('com_type','-'),
company.get('desc_more','-'),
company.get('com_str','-'),
set_gongshang_info(data=company),
company.get('address','-'),
company.get('num','-'),
company.get('job_names','-')]
return data
def set_job_info(job, city_name):
data = [
job.get('com_name'),
job.get('job_name'),
job.get('job_salary'),
city_name,
job.get('job_years'),
job.get('job_edu'),
job.get('job_salary'),
job.get('job_name'),
job.get('job_desc_txt'),
job.get('ellipsis_comUser'),
job.get('address')
]
return data
def format_select(soup, xclass, index, type='text'):
ses = soup.select(xclass)
if not len(ses) > index:
return ''
return format_text(ses[index].text)
# company['com_str'] = format_text(soup.select('.com_str')[0].text)
def get_cache(type=None, id=None):
if type == 'job':
name = f'597_job_{id}'
else:
name = f'597_company_{id}'
res = r.get(name)
if res:
return json.loads(res)
r.set(name, json.dumps({}))
return {}
def set_cache(type=None, id=None, data=None):
if type == 'job':
name = f'597_job_{id}'
else:
name = f'597_company_{id}'
r_data = get_cache(type=type, id=id)
r_data.update(data)
r.set(name, json.dumps(r_data))
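# A minimal usage sketch of the Redis cache helpers above (assumes the same local
# Redis connection `r`); the 'demo' id and fields are made up for illustration only.
def _cache_demo():
    set_cache(type='company', id='demo', data={'name': 'Example Co'})
    set_cache(type='company', id='demo', data={'contact': 'Ms. Zhang'})
    return get_cache(type='company', id='demo')  # -> both fields merged into one dict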
def get_job_url(url, job_data=None):
url = format_url(url)
resp = session.get(url)
soup = bf(resp.text, 'html.parser')
job_data['job_age'] = format_text(soup.select('.job_age')[0].text)
job_data['job_req'] = format_text(soup.select('.job_req')[0].text)
worktimeinfo = soup.select('.worktimeinfo')
job_data['worktimeinfo'] = format_text(worktimeinfo[0].text) if worktimeinfo else ''
job_data['job_desc_txt'] = format_text(soup.select('.job_desc_txt')[0].text)
print(job_data)
return job_data
def get_job_list_datas(url, city_name=None):
resp = session.get(url)
time.sleep(1)
soup = bf(resp.text, 'html.parser')
job_infos = soup.select('.job_info')
if not job_infos:
print('no job info')
return False
print(len(job_infos))
for job_info in job_infos:
link = job_info.find('a')['href']
link = format_url(link)
job_id = get_id(link)
job_name = format_text(job_info.select('.job_name')[0].text)
job_salary = format_text(job_info.select('.job_salary')[0].text)
job_request = format_text(job_info.select('.job_request')[0].text)
job_req = job_request.split('|')
ad = format_text(job_req[0]) if job_req else ''
exp = format_text(job_req[1]) if len(job_req) > 1 else ''
edu = format_text(job_req[2]) if len(job_req) > 2 else ''
ads = ad.split('-')
city = ads[0] if ads else ''
area = ads[1] if len(ads) > 1 else ''
tipjs = [format_text(t.text) for t in job_info.select('.tipj')]
tips = '/'.join(tipjs) if tipjs else ''
job_ana = job_info.select('.job_ana')[0]
company_link = job_ana.find('a')['href']
company_link = format_url(company_link)
company_id = get_id(company_link)
contact = format_text(job_ana.select('.ellipsis_comUser')[0].text)
cname = format_text(job_ana.select('.ellipsis_cname')[0].find('a').text)
com_data = {
'name': cname,
'link': company_link,
'contact': contact,
'city_name': city,
'area': area
}
set_cache(type='company', id=company_id, data=com_data)
job_data = {
'company_id': company_id,
'link': link,
'company_name': cname,
'job_name': job_name,
'job_salary': job_salary,
'exp': exp,
'edu': edu,
'city': city,
'area': area,
'tips': tips
}
job_data = get_job_url(link, job_data=job_data)
set_cache(type='job', id=job_id, data=job_data)
return True
def get_company_page(url, company=None, job_datas=None):
url = format_url(url)
resp = session.get(url)
soup = bf(resp.text, 'html.parser')
com_infos = soup.select('.com_info')
if not com_infos:
return
com_name = format_select(soup, '.com_name', 0)
# com_name = format_text(soup.select('.com_name')[0].text)
company['com_name'] = com_name
company['com_type'] = format_select(soup, '.com_type', 0)
company['com_gm'] = format_select(soup, '.com_gm', 0)
company['com_xz'] = format_select(soup, '.com_xz', 0)
company['com_str'] = format_select(soup, '.com_str', 0)
desc_more = format_select(soup, '.txt_more_box', 0)
company['desc_more'] = desc_more.replace('\xa0','').replace('\n', '').strip()
address = format_select(soup, '.comAddresstxt', 0)
company['address'] = address
com_item = soup.select('.com_gs_item_desc')
company['faren'] = format_text(com_item[1].text) if len(com_item) > 1 else ''
company['zibenjin'] = format_text(com_item[2].text) if len(com_item) > 2 else ''
company['website'] = format_text(com_item[3].text) if len(com_item) > 3 else ''
num_txt = format_select(soup, '.tab_item', 1)
num = get_num(num_txt)
if not num:
num = num_txt.replace('职位', '').replace('(', '').replace(')', '').strip()
company['num'] = num
job_names = [i.text for i in soup.select('.job_name')]
company['job_names'] = '/'.join(job_names)
job_infos = soup.select('.job_info')
for job_info in job_infos:
job_url = job_info.find('a')['href']
job_url = format_url(job_url)
job_id = get_id(link=job_url)
job_name = format_select(job_info, '.job_name', 0)
job_salary = format_select(job_info, '.job_salary', 0)
job_are = format_select(job_info, '.job_are', 0)
job_years = format_select(job_info, '.job_years', 0)
job_edu = format_select(job_info, '.job_edu', 0)
ellipsis_comUser = format_select(job_info, '.ellipsis_comUser', 0)
job_url = format_url(job_url)
jdata = {}
jdata['link'] = job_url
jdata['com_name'] = com_name
jdata['job_name'] = job_name
jdata['job_salary'] = job_salary
jdata['job_are'] = job_are
jdata['job_years'] = job_years
jdata['job_edu'] = job_edu
jdata['ellipsis_comUser'] = ellipsis_comUser
jdata['address'] = address
jdata = get_job_url(job_url, job_data=jdata)
job_datas[job_id] = jdata
return company, job_datas
# * List search: company ID, company name, contact
# * Company profile, job list
# * Job details
def get_comp_data(url=None, com_datas=None, city_name=None):
resp = session.get(url)
soup = bf(resp.text, 'html.parser')
job_infos = soup.select('.job_info')
if not job_infos:
print('no job info')
return False, com_datas
for job_info in job_infos:
job_ana = job_info.select('.job_ana')[0]
company_link = job_ana.find('a')['href']
company_link = format_url(company_link)
company_id = get_id(company_link)
if get_cache(type='company', id=company_id):
continue
contact = format_select(job_ana, '.ellipsis_comUser', 0)
cname = format_text(job_ana.select('.ellipsis_cname')[0].find('a').text)
city, area = get_city_area(job_info=job_info)
com_datas[company_id] = {
'name': cname,
'link': company_link,
'contact': contact,
'city_name': city_name,
'city': city,
'area': area
}
return True, com_datas
def get_company_list_datas(data=None):
ct, city_name = data
print(city_name)
com_datas={}
job_datas={}
session.cookies['city'] = ct
page = 1
while True:
url = job_url.format(page)
page += 1
status, com_datas = get_comp_data(url=url, city_name=city_name, com_datas=com_datas)
if not status:
break
time.sleep(1)
print(f'company_length: {len(com_datas)}')
with open(f'company/companys_{city_name}.json', 'w') as f:
json.dump(com_datas, f)
company_headers = ['省份', '地级市', '区县', '详细地址', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称']
job_headers = ['公司名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
com_result = []
job_result = []
if com_datas:
for cid, company in com_datas.items():
url = company.get('link')
if url:
company, job_datas = get_company_page(url, company=company, job_datas=job_datas)
comp_data = set_company_info(company=company)
com_result.append(comp_data)
print(job_datas)
print(type(job_datas))
if job_datas:
for j, job in job_datas.items():
data = set_job_info(job, city_name)
print(data)
job_result.append(data)
# with open(f'company/company_info_{city_name}.csv', 'w') as f1, open(f'city/city_info_{city_name}.csv', 'w') as f2:
# f1.write(','.join(company_headers)+'\n')
# f2.write(','.join(job_headers)+'\n')
# for cid, company in com_datas.items():
# url = company.get('link')
# if url:
# company, job_datas = get_company_page(url, company=company, job_datas=job_datas)
# comp_data = set_company_info(company=company)
# com_result.append(comp_data)
# print(comp_data)
# f1.write(','.join(comp_data)+'\n')
# if job_datas:
# for job in job_datas:
# data = set_job_info(job, city_name)
# print(data)
# job_result.append(data)
# f2.write(','.join(data)+'\n')
print(f'RESULT_COMPANY: {len(com_result)}')
print(f'JOB_RESULT: {len(job_result)}')
if com_result:
with open(f'company/company_result_{city_name}.json', 'w') as f1:
json.dump(com_result, f1)
if job_result:
with open(f'job/job_result_{city_name}.json', 'w') as f2:
json.dump(job_result, f2)
return True
def get_job_list(data=None):
ct, city_name = data
print(city_name)
com_datas={}
job_datas={}
session.cookies['city'] = ct
page = 1
while True:
print(page)
url = job_url.format(page)
page += 1
status, com_datas = get_comp_data(url=url, com_datas=com_datas, city_name=city_name)
if not status:
break
print(len(com_datas))
print(len(job_datas))
print('-' * 50)
with open(f'json/company-sample_{city_name}.json', 'w') as x1:
json.dump(com_datas, x1)
with open(f'json/job-sample_{city_name}.json', 'w') as x2:
json.dump(job_datas, x2)
def save_company():
print('保存公司信息')
headers = ['省份', '地级市', '区县', '详细地址', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称']
items = ('provice', 'city', 'city', 'address', 'name', 'hangye', 'content', 'content', 'name', 'address', 'num')
with open('company.csv', 'w') as f:
f.write(','.join(headers) + '\n')
for key in r.scan_iter(match='597_company*'):
d = r.get(key)
res = json.loads(d)
print(res)
job_name_list = []
for i in r.scan_iter(match='597_job*'):
x = r.get(i)
if x:
job = json.loads(x)
job_name_list.append(job.get('name'))
job_names = '/'.join(job_name_list)
data = [res.get(item, '-') for item in items]
data.append(job_names)
f.write(','.join(data) + '\n')
def save_job():
print('保存职务')
headers = ['公司名称','岗位名称','薪资范围','工作城市','工作经验','学历要求','薪资待遇','职位描述关键词','职位描述','招聘者','工作地址']
items = ('company_name', 'name', 'salary', 'city', 'exp', 'edu', 'salary', 'gjz', 'desc', 'contact')
with open('jobs.csv', 'w') as f:
f.write(','.join(headers) + '\n')
for key in r.scan_iter(match='597_job*'):
d = r.get(key)
res = json.loads(d)
print(res)
company_id = res.get('company_id')
company = get_cache(type='company', id=company_id)
address = company.get('address')
data = [res.get(item, '-') for item in items]
data.append(address or '-')
f.write(','.join(data) + '\n')
def get_city_list():
yield ('www', '全国')
for i in searchCitys:
xs = i.split('|')
city_name = xs[0] if xs else ''
ct = xs[1] if len(xs) > 1 else ''
if ct:
yield (ct, city_name)
def main():
# with Pool(4) as p:
# res = p.map(get_company_list_datas, get_city_list())
for i in get_city_list():
get_company_list_datas(i)
# get_job_list()
main()
# comp_headers = ['省份', '地级市', '区县', '详细地址', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称']
# job_headers = ['公司名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
# import os
# comp_path = '/home/tonytan/Downloads/jobs_search/597网站/company'
# # 'company/company_result_{city_name}.json'
# with open('company_all.csv', 'w') as f:
# f.write(','.join(comp_headers) + '\n')
# for file in os.listdir(comp_path):
# if 'json' in file:
# print(file)
# filepath = os.path.join(comp_path, file)
# with open(filepath, 'r') as f1:
# datas = json.load(f1)
# print(datas)
# if datas:
# for data in datas:
# if data:
# print(data)
# data = [i or '-' for i in data]
# f.write(','.join(data) + '\n')
# job_path = '/home/tonytan/Downloads/jobs_search/597网站/job'
# # 'job/job_result_{city_name}.json'
# with open('job_all.csv', 'w') as f:
# f.write(','.join(job_headers) + '\n')
# for file in os.listdir(job_path):
# if 'json' in file:
# print(file)
# filepath = os.path.join(job_path, file)
# with open(filepath, 'r') as f1:
# datas = json.load(f1)
# print(datas)
# if datas:
# for data in datas:
# if data:
# print(data)
# data = [i or '-' for i in data]
# f.write(','.join(data) + '\n')
import requests
from bs4 import BeautifulSoup as bf
import re
import json
import time
from city_list import searchCitys
from multiprocessing.dummy import Pool
import redis
page_url = 'http://www.chinajsjob.com/job/sort528_px0_p{}.shtml'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'ASP.NET_SessionId=dfzkw1zgek2jobo1cmb1135m; SECKEY_ABVK=br9hg1wn5hFZWVreAzXQVijHrwJN07ofrJDzeaMRCA0%3D; BMAP_SECKEY=Ftud9bx-REipUcl-ZQmatvyH_fYD-3IJUtW5VsGJFAZmTN8vQbak_7LfYDzS-2L0BU4wMDQ-Pwd-ZpmwYKAulQ6VzBm5eaZubn2dDuNnZNkrsDFRIKzHg5PcgPTp5qOEE7jrjHh7Quv1TSKl3h0aGOsqEhV0lxsj3DTA7o7rroA',
'Host': 'www.chinajsjob.com',
'Referer': 'http://www.chinajsjob.com/job/sort528_px0_p1.shtml',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
# Text formatting
def format_text(text):
try:
text = str(text)
text = text.replace('\xa0', '').replace('\n','').replace(',','.').replace('\t', '').replace('\r', '').replace('\r\n', '')
return text.strip()
except:
return ''
def format_url(url):
if url.startswith('//'):
url = f"https:{url}"
return url
def get_id(link):
id = link.split('/')[-1].split('.')[0].replace('.html','')
return id
def get_num(text):
match = re.match(r'\d+', text)
if match:
number = int(match.group())
return number
return
href = 'http://www.chinajsjob.com'
header = ['省份','公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
def job_page(page):
if href not in page:
page = f'{href}{page}'
resp = requests.get(page, headers=headers)
soup = bf(resp.text, 'html.parser')
data = {}
keyword_items = soup.select('.jjText')
keywords = [format_text(i.text) for i in keyword_items]
data['keyword'] = '/'.join(keywords)
data['job_desc'] = format_text(soup.select('.zwmsCon')[0].text)
data['address'] = format_text(soup.select('.gzddCon')[0].select('span')[0].text)
return data
def company_page(page):
if href not in page:
page = f'{href}{page}'
resp = requests.get(page, headers=headers)
soup = bf(resp.text, 'html.parser')
data = {}
job_txt = format_text(soup.select('.zzzw')[0].text)
job_txt = job_txt.split('等')[-1]
data['job_num'] = format_text(job_txt)
data['company_desc'] = format_text(soup.select('.des')[0].text)
return data
def list_item_page(item):
job_url = item.select('.zwTop')[0].select('a')[0].get('href', '')
job_name = format_text(item.select('.zwbt')[0].text)
salary = format_text(item.select('.price')[0].text)
city = format_text(item.select('.cs')[0].text)
exp = format_text(item.select('.yq')[0].select('span')[2].text)
edu = format_text(item.select('.yq')[0].select('span')[3].text)
company_name = format_text(item.select('.comName')[0].text)
company_url = format_text(item.select('.comName')[0].get('href', ''))
industry = format_text(item.select('.comsm')[0].select('span')[2].text)
contact = format_text(item.select('.lxrcon')[0].text)
job_page_data = job_page(job_url)
company_page_data = company_page(company_url)
data = {
'province': '江苏省',
'company_name': format_text(company_name),
'industry': format_text(industry),
'company_desc': company_page_data.get('company_desc'),
'product_desc': '',
'gongshang': f'公司名称: {company_name}',
'address': job_page_data.get('address'),
'job_num': company_page_data.get('job_num', 1),
'job_names': '',
'job_name': format_text(job_name),
'salary': format_text(salary),
'city': format_text(city),
'exp': format_text(exp),
'edu': format_text(edu),
'salary1': format_text(salary),
'keyword': job_page_data.get('keyword'),
'job_desc': job_page_data.get('job_desc'),
'contact': format_text(contact),
}
return data
datas = []
def list_page(page):
try:
url = page_url.format(page)
resp = requests.get(url, headers=headers)
soup = bf(resp.text, 'html.parser')
job_items = soup.select('.zwLeft')[0].find_all('li')
for item in job_items:
data = list_item_page(item)
datas.append(data)
return True
except:
return False
page = 1
while True:
res = list_page(page)
if not res:
break
page += 1
print(len(datas))
key_items = ('province','company_name','industry','company_desc','product_desc','gongshang','address','job_num','job_names','job_name','salary','city','exp','edu','salary1','keyword','job_desc','contact','address')
with open('jobs.csv', 'w') as f:
f.write(','.join(header) + '\n')
for data in datas:
d = [str(data.get(i, '-')) for i in key_items]
f.write(','.join(d) + '\n')
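# Optional variant (an assumption, not in the original): csv.writer handles quoting,
# so commas would not need to be stripped out of every field by format_text.
import csv
def write_csv(path, rows):
    with open(path, 'w', newline='') as fh:
        w = csv.writer(fh)
        w.writerow(header)
        for row in rows:
            w.writerow([str(row.get(k, '-')) for k in key_items])
# write_csv('jobs_quoted.csv', datas)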
import requests
import random
# A dict literal with repeated 'https' keys keeps only the last entry, so keep the
# candidate proxies as a list and build a per-request dict from it when needed.
proxy_pool = [
    '36.6.144.239:8089',
    '36.6.145.194:8089',
    '39.98.197.238:80',
    '36.6.144.156:8089',
    '183.239.68.61:4780',
    '1.15.156.141:7890',
    '36.6.145.46:8089',
    '36.6.144.114:8089',
    '114.106.171.45:8089',
    '223.247.47.100:8089',
    '111.225.153.56:8089',
    '36.6.144.131:8089',
    '36.6.144.192:8089',
]
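# Sketch of how the pool above could be used per request (the helper name is made up;
# requests expects a scheme-keyed dict, which is why the duplicate-key literal was dropped).
def pick_proxies():
    return {'https': random.choice(proxy_pool)}
# e.g. requests.get(url, headers=headers, proxies=pick_proxies(), timeout=20)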
headers_list = [
{
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 10; SM-G981B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.109 Safari/537.36 CrKey/1.54.248666'
}, {
'user-agent': 'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.188 Safari/537.36 CrKey/1.54.250320'
}, {
'user-agent': 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/10.0.9.2372 Mobile Safari/537.10+'
}, {
'user-agent': 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/7.2.1.0 Safari/536.2+'
}, {
'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
}, {
'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
}, {
'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true'
}, {
'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)'
}, {
'user-agent': 'Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 11; Pixel 3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36'
}, {
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
}, {
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
}, {
'user-agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
}
]
list_items ={
"UCB001": 'job_id',
"AAB004": 'company_id',
"AAB299REMARK": 'city',
"AAE006": 'address',
"ACB200": 'company_id',
"ACB215REMARK": 'hangye',
"ACB217": 'job_name',
"ACB21AREMARK": 'salary',
"ACB244": 'tags',
"AAE004": 'contact',
"AAE005": 'mobile',
}
company_items = {
"AAB004": "鹤峰县铭创餐饮服务有限公司",
"AAB020REMARK": "有限责任(公司)",
"AAB022REMARK": "餐饮业",
"AAB299REMARK": "鹤峰县",
"AAE006": "鹤峰县后坝路27号",
"AAE004": "刘选明",
"AAC003_CJ": "黄恩",
"AAB004_GL": "湖北省劳动就业服务中心",
"AppletsLink": "https://www.hbggzp.cn/xcx/enterprise?detailId=558364dadb964771b2090555f2a21fb2"
}
job_items = {
"AAB004": "鹤峰县铭创餐饮服务有限公司",
"AAB004_GL": "湖北省劳动就业服务中心",
"AAB022REMARK": "餐饮业",
"AAB299REMARK": "鹤峰县",
"AAB301REMARK": "鹤峰县",
"AAE004": "鑫鑫",
"AAE005": "15586626667",
"AAE006": "鹤峰县后坝路27号",
"ACB202": "容美镇后坝",
"AAB092": "简介",
"ACB215REMARK": "洗碗工",
"ACB216": "服从管理,吃苦耐劳 ,身体健康,能吃苦耐劳,遵纪守法,",
"ACB217": "洗碗工",
"ACB21AREMARK": "2000-3000元",
"ACB239REMARK": "全职",
"UCE465": "服从管理,吃苦耐劳 ,身体健康,能吃苦耐劳,遵纪守法,",
}
list_page_url = 'https://www.hbggzp.cn/PER/JA/COMMON/COMPANY/JOB/V1/QUERY?KEY=&AAB301=420000&ACB215=&UCB005=&ACB21A=&ACB239=&UCE466E=&ACC217=&AAC011=&PAGE={}&LINAGE={}'
job_page_url = 'https://www.hbggzp.cn/CACHE/PER/JA/COMMON/COMPANY/JOB/V1/GETINFO?ACB200={}'
company_page_url = 'https://www.hbggzp.cn/CACHE/PER/JA/COMMON/COMPANY/BASE/V1/GETINFO?UCB001={}'
headers = {
'Referer': 'https://www.hbggzp.cn/views/PostSearch/homeIndex.html?t=2',
# 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
'Host': 'www.hbggzp.cn'
}
job_datas = {}
company_datas ={}
def list_request(url):
user_agents = random.choice(headers_list)
headers.update(user_agents)
# resp = requests.get(url, headers=headers, proxies={'https': random.choice(proxy_pool)}, timeout=20)
resp = requests.get(url)
res = resp.json()
print(res)
result = res.get('Result',{}).get('Rows',[])
for row in result:
data = {j: row.get(k, '') for k,j in list_items.items()}
job_id = data.get('job_id')
company_id = data.get('company_id')
job_datas[job_id] = data
company_datas[company_id] = data
pages = []
size = 20
length = int(5158/size) + 1
for i in range(1, length+1):
print(i)
if i in pages:
continue
url = list_page_url.format(i, size)
list_request(url)
pages.append(i)
companys = job_datas
jobs = company_datas
def company_request(url):
rtimes = 0
while rtimes < 4:
try:
resp = requests.get(url=url, timeout=20)
res = resp.json()
print(res)
result = res.get('Result', {})
try:
companys[i].update(result)
except:
companys[i] = result
return
except:
rtimes += 1
time.sleep(1)
t = 1
for i, d in companys.items():
if 'AAB019REMARK' in d:
continue
url = company_page_url.format(i)
company_request(url)
print(t)
t += 1
def job_request(url):
rtimes = 0
while rtimes < 4:
try:
resp = requests.get(url=url, timeout=20)
res = resp.json()
print(res)
result = res.get('Result', {})
try:
jobs[i].update(result)
except:
jobs[i] = result
return
except:
rtimes += 1
time.sleep(1)
t = 1
for i, d in jobs.items():
if 'AAB019REMARK' in d:
continue
url = job_page_url.format(i)
job_request(url)
print(t)
t += 1
import requests
from bs4 import BeautifulSoup as bf
import re
import json
import time
from multiprocessing.dummy import Pool
# Text formatting
def format_text(text):
try:
text = str(text).replace('\xa0', '').replace('\n','').replace(',','.')
return text.strip()
except:
return ''
def format_url(url):
if url.startswith('//'):
url = f"https:{url}"
return url
def get_id(link):
id = link.split('/')[-1].split('.')[0].replace('.html','')
return id
def get_num(text):
match = re.match(r'\d+', text)
if match:
number = int(match.group())
return number
return
def get_city_area(job_info):
job_request = format_text(job_info.select('.job_request')[0].text)
job_req = job_request.split('|')
ad = format_text(job_req[0]) if job_req else ''
ads = ad.split('-')
city = ads[0] if ads else ''
area = ads[1] if len(ads) > 1 else ''
return city, area
def set_gongshang_info(data=None):
res = f'公司名称:{data.get("com_name", "")}. 法人: {data.get("faren", "")}.注资: {data.get("zibenjin", "")}.官网: {data.get("website", "")}'
return res
headers = {
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
'Client':'PC',
'Content-Type':'application/json;charset=UTF-8',
'Cookie':'uuid=0eed4eb033203d4625f8a345a66ebe03; uuid.sig=7dq3I2MyMVLywODf5wNpFT0M8ynyrT0g8N3qmIoYNQ0; cf99fb1bff6494fb3dc3e887223abd55=c7d4c9a5a963820712513e1d100d3aab; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188fab415f3c88-08c4cb1d4988be8-13462c6c-2073600-188fab415f486c%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg4ZmFiNDE1ZjNjODgtMDhjNGNiMWQ0OTg4YmU4LTEzNDYyYzZjLTIwNzM2MDAtMTg4ZmFiNDE1ZjQ4NmMifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22188fab415f3c88-08c4cb1d4988be8-13462c6c-2073600-188fab415f486c%22%7D; uvcookie=ffdc40e600b04eddba474be1b31f22f3; jscookie=d8fe9c5c47b33eb5c1713ef89736ea6a; orginflag=1; accessflag=1; 8aa932cc6211ccbe21af0c963b3e3415=3706ba89e1b89e7d7fb3138eb20e5eb2; Hm_lvt_6cc160b93b871a4884a8cd8dc4addcd3=1687833287; d87a445a1f22d65370f406844dc1e56a=be9bce0c00d065fb057cdc512f037a7f; sensorsTrackObj={%22from%22:%22%E6%90%9C%E7%B4%A2%22%2C%22index_number%22:9%2C%22page_number%22:6}; keyCityCode=14012700; keyTypeCode=10030010%2C10030010; Hm_lpvt_6cc160b93b871a4884a8cd8dc4addcd3=1687846607; pvcount="targetUrl=https%3A%2F%2Fwww.job5156.com%2Fs%2Fsearch%2F%3FkeywordType%3D%26keyword%3D~fromSourceType=2"',
'Referer':'https://www.job5156.com/s/search/?keywordType=&keyword=',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
'Sec-Ch-Ua':'"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
'Sec-Ch-Ua-Mobile':'?0',
'Sec-Ch-Ua-Platform':'"Linux"',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-site',
}
data = {
't': 1687833340449,
'pn': 6,
'keyword': '',
'keywordType': 0,
'posTypeList': '',
'locationList': '',
'taoLabelList': '',
'degreeFrom': '',
'propertyList': '',
'industryList': '',
'sortBy': 0,
'urgentFlag': '',
'comIdList': '',
'locationAddrStr': '',
'salary': '',
}
p_url = 'https://www.job5156.com/s/result'
job_url = 'https://api.qlrc.com/personal/Job?JobId={}'
job_datas = []
company_datas = {}
def list_page():
page = 1
while True:
data['pn'] = str(page)
data['t'] = str(int(time.time()*1000))
resp = requests.get(p_url, data=data, headers=headers)
res = resp.json()
job_list = res.get('posData', {}).get('posItems', [])
print(len(job_list))
if not job_list:
break
page += 1
job_datas.extend(job_list)
job_ids = []
datas = []
def job_page(job_id):
url = job_url.format(job_id)
resp = requests.get(url, headers=headers)
res = resp.json()
job = res.get('job', {})
comp = job.get('cpMain', {})
ca_main = job.get('caMain', {})
data = ['山东省', comp.get('name', '-'), comp.get('industry', ''), comp.get('brief','-'), '-', comp.get('name', '-'), comp.get('address', '-'),comp.get('jobCount', 1), '-', job.get('name', '-'),job.get('salary', '-'), job.get('city', '-'), job.get('experience', '-'), job.get('degree', '-'), job.get('salary', '-'), job.get('jobKeyWord', '-'), job.get('demand', '-'), ca_main.get('name', '-'), comp.get('address', '-')]
data = [format_text(i) for i in data]
print(data)
datas.append(data)
header = ['省份','公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
with open('job.csv', 'w') as f:
f.write(','.join(header) + '\n')
list_page()
# NOTE: job_ids is never populated from the list response above, so the job_page fan-out below has no work yet.
print(job_ids)
with Pool(4) as p:
p.map(job_page, job_ids)
for job_id in job_ids:
job_page(job_id)
import requests
from bs4 import BeautifulSoup as bf
import re
import json
import time
from multiprocessing.dummy import Pool
# Text formatting
def format_text(text):
try:
text = str(text).replace('\xa0', '').replace('\n','').replace(',','.')
return text.strip()
except:
return ''
def format_url(url):
if url.startswith('//'):
url = f"https:{url}"
return url
def get_id(link):
id = link.split('/')[-1].split('.')[0].replace('.html','')
return id
def get_num(text):
match = re.match(r'\d+', text)
if match:
number = int(match.group())
return number
return
def get_city_area(job_info):
job_request = format_text(job_info.select('.job_request')[0].text)
job_req = job_request.split('|')
ad = format_text(job_req[0]) if job_req else ''
ads = ad.split('-')
city = ads[0] if ads else ''
area = ads[1] if len(ads) > 1 else ''
return city, area
def set_gongshang_info(data=None):
res = f'公司名称:{data.get("com_name", "")}. 法人: {data.get("faren", "")}.注资: {data.get("zibenjin", "")}.官网: {data.get("website", "")}'
return res
headers = {
'Accept':'application/json, text/plain, */*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
'Client':'PC',
'Content-Type':'application/json;charset=UTF-8',
'Origin':'https://www.qlrc.com',
'Referer':'https://www.qlrc.com/',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
'Sec-Ch-Ua':'"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
'Sec-Ch-Ua-Mobile':'?0',
'Sec-Ch-Ua-Platform':'"Linux"',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-site',
'Subsiteinfo':'%7B%22id%22%3A32%2C%22provinceID%22%3A32%2C%22subSiteName%22%3A%22%E9%BD%90%E9%B2%81%E4%BA%BA%E6%89%8D%E7%BD%91%22%2C%22subSiteCity%22%3A%22%E5%B1%B1%E4%B8%9C%22%2C%22subSiteUrl%22%3A%22https%3A%2F%2Fwww.qlrc.com%22%2C%22isSecond%22%3Afalse%2C%22pcUrl%22%3A%22www.qlrc.com%22%2C%22h5Url%22%3A%22m.qlrc.com%22%2C%22isWechatValid%22%3Atrue%2C%22beian%22%3A%22%E4%BA%ACICP%E5%A4%8712005109%E5%8F%B7-17%22%2C%22pinyin%22%3A%22shandong%22%2C%22servicePhone%22%3A%220531-68961040%22%2C%22dataWay%22%3A%22%E7%BC%93%E5%AD%98%E6%95%B0%E6%8D%AE%22%2C%22isMobile%22%3Afalse%2C%22isIE%22%3Afalse%2C%22searchServer%22%3A%224%22%7D'
}
data = {
"OrderBy": 0,
"jobTypeId": "",
"dcMajorID": "",
"regionId": "32",
"industryId": "",
"mapPlaceId": 0,
"minSalary": 0,
"maxSalary": 0,
"minSalaryvalue": "",
"maxSalaryvalue": "",
"experienceId": "",
"replyRate": 0,
"autoReplyDay": 0,
"isNegotiable": False,
"educationId": "",
"employType": "",
"keyWord": "",
"Page": 2,
"companySizeId": "",
"welfare": "",
"isOnline": False,
"distance": 0,
"onlyEnv": False,
"selectRegionId": "32",
"filterWelfare": [],
"filterJobType": None,
"filterRegion": None,
"filterEducation": None,
"filterSalary": {},
"filterExperience": None,
"filterEmployType": None,
"filterCompanySize": None,
"filterReplyRate": None,
"serverNo": "4",
"city": {
"id": "",
"value": ""
},
"district": {
"id": "",
"value": ""
},
"place": {
"id": "",
"value": ""
},
"firstJob": {
"id": "",
"value": ""
},
"secondJob": {
"id": "",
"value": ""
},
"regionType": 1,
"jobType": 1,
"showWords": ""
}
p_url = 'https://so.qlrc.com/job/'
job_url = 'https://api.qlrc.com/personal/Job?JobId={}'
job_datas = {}
company_datas = {}
job_ids = set()
def list_page(page):
data['Page'] = page
resp = requests.post(p_url, json=data, headers=headers)
res = resp.json()
job_list = res.get('jobList', [])
for item in job_list:
job_id = item.get('jobSecondId')
job_ids.add(job_id)
datas = {}
def job_page(job_id):
resp = requests.get(job_url.format(job_id), headers=headers)
res = resp.json()
job = res.get('job', {})
comp = job.get('cpMain', {})
ca_main = job.get('caMain', {})
data = ['山东省', comp.get('name', '-'), comp.get('industry', ''), comp.get('brief','-'), '-', comp.get('name', '-'), comp.get('address', '-'),comp.get('jobCount', 1), '-', job.get('name', '-'),job.get('salary', '-'), job.get('city', '-'), job.get('experience', '-'), job.get('degree', '-'), job.get('salary', '-'), job.get('jobKeyWord', '-'), job.get('demand', '-'), ca_main.get('name', '-'), comp.get('address', '-')]
data = [format_text(i) for i in data]
datas[job_id] = data
with open('job.csv', 'a') as f:
f.write(','.join(data) + '\n')
header = ['省份','公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称', '岗位名称', '薪资范围', '工作城市', '工作经验', '学历要求', '薪资待遇', '职位描述关键词', '职位描述', '招聘者', '工作地址']
with open('job.csv', 'a') as f:
f.write(','.join(header) + '\n')
for pn in range(1, 101):  # list_page is never called above; the page count here is a guess
list_page(pn)
with Pool(4) as p:
p.map(job_page, job_ids)
import redis
import json
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from multiprocessing.dummy import Pool
page_url = 'http://www.sdbys.com/job/search?page={}'
job_url = 'http://www.sdbys.com/job/view/id/{}'
company_url = 'http://www.sdbys.com/companydetail/view/id/{}'
r = redis.Redis(host='127.0.0.1',port=6379,db=0)
r.flushdb()
def format_text(text):
try:
text = str(text).replace('\xa0', '').replace(',','.').replace('\n','')
return text.strip()
except:
return ''
def format_url(url):
if url.startswith('//'):
url = f"https:{url}"
return url
def get_id(link):
id = link.split('/')[-1].split('.')[0]
return id
def get_cache(type=None, id=None):
if type == 'job':
name = f'sdpys_job_{id}'
else:
name = f'sdpys_company_{id}'
res = r.get(name)
if res:
return json.loads(res)
r.set(name, json.dumps({}))
return {}
def set_cache(type=None, id=None, data=None):
if type == 'job':
name = f'sdpys_job_{id}'
else:
name = f'sdpys_company_{id}'
r_data = get_cache(type=type, id=id)
r_data.update(data)
r.set(name, json.dumps(r_data))
def selenium_options():
options = Options()
# Set Chinese language/encoding
options.add_argument('lang=zh_CN.UTF-8')
options.add_argument(
'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"')
options.add_argument('blink-settings=imagesEnabled=false')
options.add_argument('--headless')
prefs = {
'profile.default_content_setting_values': {
'notifications': 2
}}
options.add_experimental_option('prefs', prefs)
options.add_argument('--no-sandbox')
return options
options = selenium_options()
driver = webdriver.Chrome(options=options)
def list_page_datas(page=None):
print(page)
driver.get(page)
time.sleep(3)
items = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div/div/div[1]/ul/li')
for item in items:
company_name = item.find_element(By.XPATH, './div[2]/div/div[2]/a').text
company_name = format_text(company_name)
company_url = item.find_element(By.XPATH, './div[2]/div/div[2]/a').get_attribute('href')
company_id = get_id(company_url)
job_name = item.find_element(By.XPATH, './div[2]/div/div[3]/a').text
job_name = format_text(job_name)
job_url = item.find_element(By.XPATH, './div[2]/div/div[3]/a').get_attribute('href')
job_id = get_id(job_url)
salary = item.find_element(By.XPATH, './div[2]/div/div[3]/span').text
salary = format_text(salary)
add = item.find_element(By.XPATH, './div[2]/div/div[4]/ul/li[1]').text
print(add)
adds = add.split('-')
print(adds)
adds = adds if isinstance(adds, list) else [adds]
province = format_text(adds[0]) if adds else ''
city = format_text(adds[1]) if adds and len(adds) > 1 else ''
edu = item.find_element(By.XPATH, './div[2]/div/div[4]/ul/li[3]').text
edu = format_text(edu)
company_data = get_cache(type='company', id=company_id)
if company_data:
job_names = company_data.get('job_names')
if not job_name in job_names:
job_names += ',' + job_name
company_data['job_names'] = job_names
set_cache(type='company', id=company_id, data=company_data)
else:
company_data['id'] = company_id
company_data['name'] = company_name
company_data['url'] = company_url
company_data['provice'] = province
company_data['city'] = city
job_names = job_name
company_data['job_names'] = job_names
set_cache(type='company', id=company_id, data=company_data)
job_data = get_cache(type='job', id=job_id)
if job_data:
continue
job_data['id'] = job_id
job_data['name'] = job_name
job_data['company_name'] = company_name
job_data['company_id'] = company_id
job_data['job_url'] = job_url
job_data['salary'] = salary
job_data['province'] = province
job_data['city'] = city
job_data['edu'] = edu
set_cache(type='job', id=job_id, data=job_data)
print(job_data)
def job_page(job_data):
# job_data = get_cache(type='job', id=id)
print('job page')
id = job_data.get('id')
print(id)
job_url = job_data.get('job_url')
print(job_url)
driver.get(job_url)
time.sleep(2)
try:
select = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[5]/span')
contact=WebDriverWait(driver,10,0.2).until(EC.visibility_of(select)).text
# contact = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[5]/span').text
contact = format_text(contact)
except:
contact = ''
try:
tags = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[7]/span').text
tags = format_text(tags)
except:
tags = ''
try:
desc = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[2]/div/div').text
desc = format_text(desc)
except:
desc = ''
try:
exp = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[3]/span').text
exp = format_text(exp)
except:
exp = ''
try:
guanjianzi = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[1]/span').text
guanjianzi = format_text(guanjianzi)
except:
guanjianzi = ''
data = {
'contact': contact,
'tags': tags,
'desc': desc,
'exp': exp,
'gjz': guanjianzi
}
job_data = set_cache(type='job', id=id, data=data)
print(data)
def company_page(company_data):
# company_data = get_cache(type='company', id=id)
id = company_data.get('id')
print(id)
company_url = company_data.get('url')
driver.get(company_url)
time.sleep(1)
try:
num = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[3]/div/div[1]/div/div/div[1]/div/div[2]/ul/li[1]/p[1]').text
num = format_text(num)
except:
num = ''
try:
content = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[3]/div/div[2]/div[2]/div[1]/div/div').text
content = format_text(content)
except:
content = ''
try:
address = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[3]/div/div[2]/div[2]/div[3]/div/p').text
address = format_text(address)
except:
address = ''
try:
hangye = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[3]/div/div[3]/div/div/div/div[2]/span').text
hangye = format_text(hangye)
except:
hangye = ''
data = {
'num': num,
'content': content,
'address': address,
'hangye': hangye
}
set_cache(type='company', id=id, data=data)
print(data)
def company_item():
company_keys = r.scan_iter(match='sdpys_company*')
for key in company_keys:
try:
d = r.get(key)
data = json.loads(d)
print(data)
yield data
except:
continue
def job_item():
job_keys = r.scan_iter(match='sdpys_job*')
for key in job_keys:
try:
d = r.get(key)
data = json.loads(d)
print(data)
yield data
except:
continue
def handle():
for i in range(1, 5260):
try:
page = page_url.format(i)
list_page_datas(page=page)
except:
continue
print('company ...')
for i in company_item():
try:
company_page(i)
except:
continue
print('job...')
for i in job_item():
try:
job_page(i)
except:
continue
def save_company():
print('保存公司信息')
headers = ['省份', '地级市', '区县', '详细地址', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称']
items = ('provice', 'city', 'city', 'address', 'name', 'hangye', 'content', 'content', 'name', 'address', 'num')
with open('companyx.csv', 'w') as f:
f.write(','.join(headers) + '\n')
for key in r.scan_iter(match='sdpys_company*'):
d = r.get(key)
res = json.loads(d)
job_name_list = []
# for i in r.scan_iter(match='sdpys_job*'):
# x = r.get(i)
# if x:
# job = json.loads(x)
# job_name_list.append(job.get('name'))
# job_names = '/'.join(job_name_list)
data = [res.get(item, '-') for item in items]
# data.append(job_names)
f.write(','.join(data) + '\n')
print('finished')
def save_job():
print('保存职务')
headers = ['公司名称','岗位名称','薪资范围','工作城市','工作经验','学历要求','薪资待遇','职位描述关键词','职位描述','招聘者','工作地址']
items = ('company_name', 'name', 'salary', 'city', 'exp', 'edu', 'salary', 'gjz', 'desc', 'contact')
with open('jobs.csv', 'w') as f:
f.write(','.join(headers) + '\n')
for key in r.scan_iter(match='sdpys_job*'):
d = r.get(key)
res = json.loads(d)
company_id = res.get('company_id')
company = get_cache(type='company', id=company_id)
address = company.get('address')
data = [res.get(item, '-') for item in items]
data.append(address or '-')
f.write(','.join(data) + '\n')
print('finished')
handle()
save_company()
save_job()
import csv
import jionlp as jio
file = 'jobs.csv'  # assumption: post-process the jobs.csv written above
with open(file, 'r') as f1, open('jobs1.csv', 'w') as f2:
for i in csv.reader(f1):
add = i[10]
print(add)
res = {}
if add:
res = jio.parse_location(add)
province = res.get('province')
city = res.get('city', '-')
print(city)
i.append(str(city))
txt = ','.join(i) + '\n'
f2.write(txt)
import jionlp as jio
with open('jobs1.csv', 'r') as f1, open('jobs2.csv', 'w') as f2:
times = 1
for i in csv.reader(f1):
print(times)
times += 1
add = i[0]
print(add)
res = {}
if add:
res = jio.parse_location(add)
province = res.get('province')
city = res.get('city', '-')
print(city)
i.append(str(city))
txt = ','.join(i) + '\n'
f2.write(txt)
with open('companyx.csv', 'r') as f1, open('company1.csv', 'w') as f2:
times = 1
for i in csv.reader(f1):
print(times)
times += 1
add = i[3]
print(add)
res = {}
if add:
res = jio.parse_location(add)
province=res.get('province', '-')
city = res.get('city', '-')
county = res.get('county', '-')  # jionlp uses 'county' for the district level
i.extend([str(province), str(city), str(county)])
txt = ','.join(i) + '\n'
f2.write(txt)
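# Tiny illustration of the jio.parse_location call used above; the sample address is
# made up, and only the 'province'/'city' keys these loops rely on are shown.
def _parse_location_demo():
    res = jio.parse_location('山东省济南市历下区经十路1号')
    return res.get('province'), res.get('city')  # e.g. ('山东省', '济南市')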
# ynzp.com (Yunnan recruitment site)
import redis
import json
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from multiprocessing.dummy import Pool
page_url = 'https://www.ynzp.com/search/offer_search_result.aspx?jcity1Hidden=330000&page={}'
job_url = 'https://www.ynzp.com/job/{}.html'
company_url = 'https://www.ynzp.com/ent/{}.html'
r = redis.Redis(host='127.0.0.1',port=6379,db=3)
# r.flushdb()
JOB_NAME = 'ynzp_21_job_'
COMPANY_NAME = 'ynzp_21_company_'
def format_text(text):
try:
text = str(text).replace('\xa0', '').replace(',','.').replace('\n','')
return text.strip()
except:
return ''
def format_url(url):
if url.startswith('//'):
url = f"https:{url}"
return url
def get_id(link):
id = link.split('/')[-1].split('.')[0]
return id
def get_cache(type=None, id=None):
if type == 'job':
name = f'{JOB_NAME}{id}'
else:
name = f'{COMPANY_NAME}{id}'
res = r.get(name)
if res:
return json.loads(res)
r.set(name, json.dumps({}))
return {}
def set_cache(type=None, id=None, data=None):
if type == 'job':
name = f'{JOB_NAME}{id}'
else:
name = f'{COMPANY_NAME}{id}'
r_data = get_cache(type=type, id=id)
r_data.update(data)
r.set(name, json.dumps(r_data))
def selenium_options():
options = Options()
# Set Chinese language/encoding
options.add_argument('lang=zh_CN.UTF-8')
options.add_argument(
'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"')
options.add_argument('blink-settings=imagesEnabled=false')
options.add_argument('--headless')
prefs = {
'profile.default_content_setting_values': {
'notifications': 2
}}
options.add_experimental_option('prefs', prefs)
options.add_argument('--no-sandbox')
return options
options = selenium_options()
driver = webdriver.Chrome(options=options)
def list_page_datas(page=None):
global driver  # the driver is re-created below if the current session has died
print(page)
try:
driver.get(page)
except:
driver = webdriver.Chrome(options=options)
driver.get(page)
time.sleep(1)
items = driver.find_elements(By.CLASS_NAME, 'V1Item')
print(len(items))
for i, item in enumerate(items):
try:
item = driver.find_elements(By.CLASS_NAME, 'V1Item')[i]
company_name_node = f'//*[@id="ctl00_ContentPlaceHolder1_repJob_ctl{str(i).zfill(2)}_EntUrl"]'
company_name = driver.find_element(By.XPATH, company_name_node).text
company_name = format_text(company_name)
company_url = driver.find_element(By.XPATH, company_name_node).get_attribute('href')
company_id = get_id(company_url)
job_name_node = f'//*[@id="ctl00_ContentPlaceHolder1_repJob_ctl{str(i).zfill(2)}_JobName"]'
job_name = driver.find_element(By.XPATH, job_name_node).text
job_name = format_text(job_name)
job_url = driver.find_element(By.XPATH, job_name_node).get_attribute('href')
job_id = get_id(job_url)
try:
salary = driver.find_elements(By.XPATH, '//*[@class="salaryStyle"]')[i].text
salary = format_text(salary)
print(salary)
except:
salary = ''
node = driver.find_elements(By.CLASS_NAME, 'JobInfo')[i*2]
area = node.find_elements(By.TAG_NAME, 'span')[0].text
print(area)
edu = node.find_elements(By.TAG_NAME, 'span')[1].text
exp = node.find_elements(By.TAG_NAME, 'span')[2].text
company_data = get_cache(type='company', id=company_id)
if company_data:
job_names = company_data.get('job_names')
job_names += ',' + job_name
else:
company_data['id'] = company_id
company_data['name'] = company_name
company_data['url'] = company_url
# company_data['provice'] = province
company_data['city'] = area
company_data['area'] = area
job_names = job_name
company_data['job_names'] = job_names
print(company_data)
set_cache(type='company', id=company_id, data=company_data)
job_data = get_cache(type='job', id=job_id)
job_data['id'] = job_id
job_data['name'] = job_name
job_data['company_name'] = company_name
job_data['company_id'] = company_id
job_data['job_url'] = job_url
job_data['salary'] = salary
# job_data['province'] = province
# job_data['city'] = city
job_data['edu'] = edu
job_data['exp'] = exp
set_cache(type='job', id=job_id, data=job_data)
print(job_data)
except:
return
def job_page(job_data):
global driver  # re-created below if the current session has died
print('job page')
id = job_data.get('id')
print(id)
page = job_data.get('job_url')
print(page)
try:
driver.get(page)
except:
# driver.quit()
driver = webdriver.Chrome(options=options)
driver.get(page)
time.sleep(2)
try:
contact = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_plEntContact"]/ul/li[1]/p').text
# contact=WebDriverWait(driver,10,0.2).until(EC.visibility_of(select)).text
# contact = driver.find_elements(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[5]/span').text
contact = format_text(contact)
except:
contact = ''
try:
tags = driver.find_element(By.XPATH, '//*[@id="divEntWant"]').text
tags = format_text(tags)
except:
tags = ''
try:
desc = driver.find_element(By.XPATH, '//*[@id="ShowJobContent"]/div[2]/div[2]/div[2]/ul/li[2]/div').text
desc = format_text(desc)
except:
desc = ''
# try:
# exp = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]/ul/li[3]/span').text
# exp = format_text(exp)
# except:
# exp = ''
try:
guanjianzi = driver.find_element(By.XPATH, '//*[@id="divEntWant"]').text
guanjianzi = format_text(guanjianzi)
except:
guanjianzi = ''
try:
hangye = driver.find_element(By.XPATH, '//*[@id="cell1"]').text
hangye = format_text(hangye)
except:
hangye = ''
try:
company_url = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_hlEntName"]').get_attribute('href')
company_id = get_id(company_url)
except:
company_id = ''
data = {
'contact': contact,
'tags': tags,
'desc': desc,
# 'exp': exp,
'gjz': guanjianzi,
'hangye': hangye
}
job_data = set_cache(type='job', id=id, data=data)
if company_id and hangye:
set_cache(type='company', id=company_id, data={'hangye': hangye})
print(data)
def company_page(company_data):
global driver  # re-created below if the current session has died
# company_data = get_cache(type='company', id=id)
id = company_data.get('id')
print(id)
page = company_data.get('url')
try:
driver.get(page)
except:
driver = webdriver.Chrome(options=options)
driver.get(page)
time.sleep(1)
try:
num = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_V3ucentNav_hlJobs"]').text
num = format_text(num)
except:
num = ''
try:
content = driver.find_element(By.ID, 'EntIntro').find_elements(By.CLASS_NAME, 'content')[0].text
content = format_text(content)
except:
content = ''
try:
address = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_panNormal"]/div[1]/div[2]/div[1]/div[2]/ul/li[2]/div/ul/li[6]/p/span[2]').text
address = format_text(address)
except:
address = ''
data = {
'num': num,
'content': content,
'address': address,
}
set_cache(type='company', id=id, data=data)
print(data)
def company_item():
company_keys = r.scan_iter(match=f'{COMPANY_NAME}*')
times = 1
for key in company_keys:
try:
d = r.get(key)
data = json.loads(d)
print(f'数量:{times}')
times += 1
yield data
except:
continue
def job_item():
job_keys = r.scan_iter(match=f'{JOB_NAME}*')
times = 1
for key in job_keys:
try:
d = r.get(key)
data = json.loads(d)
print(f'数量: {times}')
times += 1
yield data
except:
continue
def page_item():
for i in range(1, 51):
yield page_url.format(i)
# The block below duplicates handle() and drives a single shared WebDriver from a
# thread pool, which is not thread-safe; handle() at the bottom covers the same steps.
# for i in range(1, 51):
#     page = page_url.format(i)
#     list_page_datas(page=page)
# with Pool(4) as p:
#     p.map(company_page, company_item())
# with Pool(4) as p:
#     p.map(job_page, job_item())
def handle():
for i in range(1, 51):
page = page_url.format(i)
list_page_datas(page=page)
print('company ...')
for i in company_item():
company_page(i)
print('job...')
for i in job_item():
job_page(i)
def save_company():
print('保存公司信息')
headers = ['省份', '地级市', '区县', '详细地址', '公司名称', '公司行业', '公司简介', '产品介绍', '工商信息', '公司地址', '招聘职位数量', '职位名称']
items = ('provice', 'city', 'city', 'address', 'name', 'hangye', 'content', 'content', 'name', 'address', 'num')
with open('company.csv', 'w') as f:
f.write(','.join(headers) + '\n')
for key in r.scan_iter(match=f'{COMPANY_NAME}*'):
d = r.get(key)
res = json.loads(d)
print(res)
job_name_list = []
for i in r.scan_iter(match=f'{JOB_NAME}*'):
x = r.get(i)
if x:
job = json.loads(x)
job_name_list.append(job.get('name'))
job_names = '/'.join(job_name_list)
data = [res.get(item, '-') for item in items]
data.append(job_names)
f.write(','.join(data) + '\n')
def save_job():
print('保存职务')
headers = ['公司名称','岗位名称','薪资范围','工作城市','工作经验','学历要求','薪资待遇','职位描述关键词','职位描述','招聘者','工作地址']
items = ('company_name', 'name', 'salary', 'city', 'exp', 'edu', 'salary', 'gjz', 'desc', 'contact')
with open('jobs.csv', 'w') as f:
f.write(','.join(headers) + '\n')
for key in r.scan_iter(match=f'{JOB_NAME}*'):
d = r.get(key)
res = json.loads(d)
print(res)
company_id = res.get('company_id')
company = get_cache(type='company', id=company_id)
address = company.get('address')
data = [res.get(item, '-') for item in items]
data.append(address or '-')
f.write(','.join(data) + '\n')
handle()
# save_company()
# save_job()