Skip to content

Instantly share code, notes, and snippets.

@louisom
Created March 26, 2017 05:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save louisom/6a9448ce288bcb9c430cba95c21965cc to your computer and use it in GitHub Desktop.
Save louisom/6a9448ce288bcb9c430cba95c21965cc to your computer and use it in GitHub Desktop.
國藝會爬蟲
# -*- coding: utf-8 -*-
import collections
import csv
import requests
from lxml import etree
# URL template for one page of the job listing; fetch_all_id() fills in {page}.
JOB_PAGE_URL = 'http://www.ncafroc.org.tw/platform/jobs.aspx?nowPage={page}&id=&msg2=new#A1'
# URL template for a single job posting; fetch_info_by_id() fills in {id}.
JOB_ID_URL = 'http://www.ncafroc.org.tw/platform/jobs-single.aspx?id={id}'
# Hard-coded job ids that the __main__ block iterates over when writing the CSV.
# NOTE(review): presumably a saved result of fetch_all_id() -- confirm before refreshing.
ID_LIST = ['43439', '43460', '43932', '43933', '43934', '43935', '43909', '43910', '43911', '43918', '43920', '43921', '43922', '43923', '43924', '43925', '43877', '43878', '43879', '43881', '43463', '43445', '43447', '43471', '43473', '43864', '43895', '43904', '43905', '43906', '43890', '43891', '43562', '43577', '43666', '43668', '43883', '43884', '43886', '43888', '43889', '43854', '43862', '43863', '43595', '43567', '42571', '43199', '43866', '43867', '43872', '43874', '43875', '43876', '43855', '43836', '43839', '43840', '43841', '43847', '43849', '43852', '43853', '43712', '43775', '43787', '43813', '43814', '43817', '43822', '43823', '43824', '43825', '43826', '43827', '43828', '43830', '43832', '43833', '43834', '43538', '43539', '43540', '43553', '43111', '42670', '43796', '43801', '43811', '43782', '43786', '43788', '43790', '43794', '43795', '43777', '43778', '43779', '43780', '43781', '43756', '43757', '43758', '43761', '43766', '43772', '43773', '43672', '42935', '43221', '43523', '43524', '43301', '43465', '43749', '43750', '43736', '43738', '43739', '43740', '43741', '43743', '43745', '43746', '43747', '43748', '43568', '43614', '43240', '43702', '43704', '43705', '43706', '43707', '43708', '43709', '43710', '43711', '43714', '43715', '43722', '43724', '43725', '43691', '43693', '43694', '43695', '43696', '43697', '43698', '43699', '43651', '43340', '43294', '43682', '43683', '43650', '43673', '43674', '43675', '43678', '43679', '43680', '43681', '43671', '43660', '43661', '43662', '42566', '43670', '43667', '42527', '43624', '43628', '43633', '43638', '43645', '43646', '43653', '43654', '43655', '43658']
def fetch_all_id(end_page=10, timeout=30):
    """Collect job ids from the paginated job listing.

    Requests listing pages 1 through ``end_page`` (inclusive) and, for every
    ``<a class="blacktext yellowlink">`` anchor found, takes the text after
    the last ``=`` of its ``href`` as the job id.

    Args:
        end_page: Number of the last listing page to fetch.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking indefinitely.

    Returns:
        A list of id strings in the order encountered; duplicates are kept.
    """
    id_list = []
    for page in range(1, end_page + 1):
        resp = requests.get(JOB_PAGE_URL.format(page=page), timeout=timeout)
        root = etree.HTML(resp.text)
        for link in root.xpath('//a[@class="blacktext yellowlink"]'):
            href = link.get('href')
            # An anchor without an href carries no id; skip it rather than
            # fail on None.split().
            if href is None:
                continue
            id_list.append(href.split('=')[-1])
    return id_list
def fetch_info_by_id(pid, timeout=30):
    """Fetch one job posting and return its fields as an ordered mapping.

    The "basic" values are the ``.text`` of every
    ``<div class="borderyellow largeml-20">``; the "advanced" values are the
    joined text content of every element whose class contains ``mt30``.
    Both groups are matched to their column names by position.

    Args:
        pid: Job id substituted into ``JOB_ID_URL``.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking indefinitely.

    Returns:
        ``collections.OrderedDict`` with exactly these keys, in order:
        from, title, type, publish_date, place, salary, pre_requirements,
        job_details, others, welfare, how, url. Fields the page did not
        provide are filled with an empty string, and ``url`` is always the
        address that was requested.
    """
    basic_row = ['from', 'title', 'type', 'publish_date', 'place', 'salary']
    adv_row = ['pre_requirements', 'job_details', 'others', 'welfare', 'how', 'url']

    url = JOB_ID_URL.format(id=pid)
    resp = requests.get(url, timeout=timeout)
    root = etree.HTML(resp.text)

    # Basic info
    basic = [node.text for node in root.xpath('//div[@class="borderyellow largeml-20"]')]

    # Advance info: strip line breaks/tabs from each text fragment, then join
    # the fragments of one element with single spaces.
    adv = []
    for node in root.xpath('//*[contains(@class, "mt30")]'):
        fragments = [fragment.strip('\r\n\t') for fragment in node.itertext()]
        adv.append(' '.join(fragments).strip(' '))

    # Keys and values are paired positionally, so each group is forced to its
    # fixed width. Without this, a page with a missing or extra element would
    # shift every later value (including the url) under the wrong key.
    basic_width = len(basic_row)
    adv_width = len(adv_row) - 1  # the last advanced column is the url
    basic = (basic + [''] * basic_width)[:basic_width]
    adv = (adv + [''] * adv_width)[:adv_width]
    adv.append(url)

    return collections.OrderedDict(zip(basic_row + adv_row, basic + adv))
if __name__ == '__main__':
    # Script entry point: scrape every job in ID_LIST and write one CSV row
    # per job to output.csv, preceded by a header row.
    # NOTE(review): the ids were presumably obtained once via
    # print(fetch_all_id(19)) -- confirm before regenerating ID_LIST.

    # Column names in the same order as the mapping returned by
    # fetch_info_by_id(). The original was missing the comma between
    # 'job_details' and 'others', which merged them into a single
    # 'job_detailsothers' cell and left the header one column short.
    header = [
        'from', 'title', 'type', 'publish_date', 'place', 'salary',
        'pre_requirements', 'job_details', 'others', 'welfare', 'how', 'url',
    ]
    # newline='' lets the csv module control line endings itself.
    with open('output.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(header)
        for job_id in ID_LIST:
            writer.writerow(fetch_info_by_id(job_id).values())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment