# louisom/ncafroc.py

## ncafroc.py
# -*- coding: utf-8 -*-

import collections
import csv
import requests
from lxml import etree


# Listing-page URL template; fetch_all_id() fills {page} with the page number.
JOB_PAGE_URL = 'http://www.ncafroc.org.tw/platform/jobs.aspx?nowPage={page}&id=&msg2=new#A1'
# Single-posting URL template; fetch_info_by_id() fills {id} with a job id.
JOB_ID_URL = 'http://www.ncafroc.org.tw/platform/jobs-single.aspx?id={id}'

# Job ids iterated by the __main__ block, one CSV row per id.
# NOTE(review): presumably pasted from an earlier fetch_all_id() run (see the
# commented-out call in the __main__ block) -- confirm it is still current.
ID_LIST = ['43439', '43460', '43932', '43933', '43934', '43935', '43909', '43910', '43911', '43918', '43920', '43921', '43922', '43923', '43924', '43925', '43877', '43878', '43879', '43881', '43463', '43445', '43447', '43471', '43473', '43864', '43895', '43904', '43905', '43906', '43890', '43891', '43562', '43577', '43666', '43668', '43883', '43884', '43886', '43888', '43889', '43854', '43862', '43863', '43595', '43567', '42571', '43199', '43866', '43867', '43872', '43874', '43875', '43876', '43855', '43836', '43839', '43840', '43841', '43847', '43849', '43852', '43853', '43712', '43775', '43787', '43813', '43814', '43817', '43822', '43823', '43824', '43825', '43826', '43827', '43828', '43830', '43832', '43833', '43834', '43538', '43539', '43540', '43553', '43111', '42670', '43796', '43801', '43811', '43782', '43786', '43788', '43790', '43794', '43795', '43777', '43778', '43779', '43780', '43781', '43756', '43757', '43758', '43761', '43766', '43772', '43773', '43672', '42935', '43221', '43523', '43524', '43301', '43465', '43749', '43750', '43736', '43738', '43739', '43740', '43741', '43743', '43745', '43746', '43747', '43748', '43568', '43614', '43240', '43702', '43704', '43705', '43706', '43707', '43708', '43709', '43710', '43711', '43714', '43715', '43722', '43724', '43725', '43691', '43693', '43694', '43695', '43696', '43697', '43698', '43699', '43651', '43340', '43294', '43682', '43683', '43650', '43673', '43674', '43675', '43678', '43679', '43680', '43681', '43671', '43660', '43661', '43662', '42566', '43670', '43667', '42527', '43624', '43628', '43633', '43638', '43645', '43646', '43653', '43654', '43655', '43658']


def fetch_all_id(end_page=10, timeout=30):
    """Collect job ids from listing pages 1 through ``end_page``.

    Each listing page is downloaded from ``JOB_PAGE_URL``; every anchor whose
    class attribute is exactly ``"blacktext yellowlink"`` contributes the
    text after the last ``=`` of its ``href``.

    Args:
        end_page: Last listing page to fetch (inclusive). Values below 1
            fetch nothing.
        timeout: Seconds to wait for each HTTP response before ``requests``
            gives up.

    Returns:
        A list of id strings, in page order and then document order.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    id_list = []
    for page in range(1, end_page + 1):
        # Without a timeout a stalled server would block the crawl forever.
        resp = requests.get(JOB_PAGE_URL.format(page=page), timeout=timeout)
        root = etree.HTML(resp.text)
        if root is None:
            # lxml may hand back no root element when the body has no markup.
            continue

        for link in root.xpath('//a[@class="blacktext yellowlink"]'):
            href = link.get('href')
            # An anchor without href used to crash on None.split(); skip it.
            if href is not None:
                id_list.append(href.split('=')[-1])

    return id_list


def fetch_info_by_id(pid, timeout=30):
    """Download one job posting and return its fields as an ordered mapping.

    The first six keys take the ``.text`` of each ``div`` whose class is
    exactly ``"borderyellow largeml-20"``, in document order. The next five
    take the space-joined text of the first five elements whose class
    contains ``"mt30"``. The last key, ``'url'``, is the page address.

    Both groups are padded with ``''`` or truncated to their fixed width, so
    every key is always present and a page with missing or extra elements
    cannot shift values onto a neighbouring key.

    Args:
        pid: Job id substituted into ``JOB_ID_URL``.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up.

    Returns:
        ``collections.OrderedDict`` with exactly twelve keys, in the order
        ``basic_row + adv_row``.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    basic_row = ['from', 'title', 'type', 'publish_date', 'place', 'salary']
    adv_row = ['pre_requirements', 'job_details', 'others', 'welfare', 'how', 'url']

    url = JOB_ID_URL.format(id=pid)
    # Without a timeout a stalled server would block the crawl forever.
    resp = requests.get(url, timeout=timeout)
    root = etree.HTML(resp.text)

    basic = []
    adv = []
    # lxml may hand back no root element when the body has no markup.
    if root is not None:
        # Basic info
        basic = [div.text for div in root.xpath('//div[@class="borderyellow largeml-20"]')]

        # Advance info: strip line breaks/tabs from each text fragment, then
        # join the fragments of one element into a single cell.
        for node in root.xpath('//*[contains(@class, "mt30")]'):
            pieces = [piece.strip('\r\n\t') for piece in node.itertext()]
            adv.append(' '.join(pieces).strip(' '))

    # Force each group to its header width so zip() below stays aligned.
    basic_width = len(basic_row)
    adv_width = len(adv_row) - 1  # the final 'url' slot is appended below
    basic = (basic + [''] * basic_width)[:basic_width]
    adv = (adv + [''] * adv_width)[:adv_width]
    adv.append(url)

    return collections.OrderedDict(zip(basic_row + adv_row, basic + adv))


if __name__ == '__main__':
    # Script entry point: fetch every posting in ID_LIST and dump the fields
    # to output.csv, one fully-quoted row per job id.
    # print(fetch_all_id(19))

    # Twelve columns, matching the key order of fetch_info_by_id(). The
    # original was missing the comma between 'job_details' and 'others',
    # which fused them into one header cell and left the header a column short.
    header = [
        'from', 'title', 'type', 'publish_date', 'place', 'salary',
        'pre_requirements', 'job_details', 'others', 'welfare', 'how', 'url',
    ]

    # Explicit UTF-8: the scraped text is non-ASCII and the platform default
    # encoding may not be able to represent it.
    with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)

        writer.writerow(header)
        for job_id in ID_LIST:
            writer.writerow(fetch_info_by_id(job_id).values())
	# -*- coding: utf-8 -*-

	import collections
	import csv
	import requests
	from lxml import etree


	# Listing-page URL template; fetch_all_id() fills {page} with the page number.
	JOB_PAGE_URL = 'http://www.ncafroc.org.tw/platform/jobs.aspx?nowPage={page}&id=&msg2=new#A1'
	# Single-posting URL template; fetch_info_by_id() fills {id} with a job id.
	JOB_ID_URL = 'http://www.ncafroc.org.tw/platform/jobs-single.aspx?id={id}'

	# Job ids iterated by the __main__ block, one CSV row per id.
	# NOTE(review): presumably pasted from an earlier fetch_all_id() run -- confirm
	# it is still current.
	ID_LIST = ['43439', '43460', '43932', '43933', '43934', '43935', '43909', '43910', '43911', '43918', '43920', '43921', '43922', '43923', '43924', '43925', '43877', '43878', '43879', '43881', '43463', '43445', '43447', '43471', '43473', '43864', '43895', '43904', '43905', '43906', '43890', '43891', '43562', '43577', '43666', '43668', '43883', '43884', '43886', '43888', '43889', '43854', '43862', '43863', '43595', '43567', '42571', '43199', '43866', '43867', '43872', '43874', '43875', '43876', '43855', '43836', '43839', '43840', '43841', '43847', '43849', '43852', '43853', '43712', '43775', '43787', '43813', '43814', '43817', '43822', '43823', '43824', '43825', '43826', '43827', '43828', '43830', '43832', '43833', '43834', '43538', '43539', '43540', '43553', '43111', '42670', '43796', '43801', '43811', '43782', '43786', '43788', '43790', '43794', '43795', '43777', '43778', '43779', '43780', '43781', '43756', '43757', '43758', '43761', '43766', '43772', '43773', '43672', '42935', '43221', '43523', '43524', '43301', '43465', '43749', '43750', '43736', '43738', '43739', '43740', '43741', '43743', '43745', '43746', '43747', '43748', '43568', '43614', '43240', '43702', '43704', '43705', '43706', '43707', '43708', '43709', '43710', '43711', '43714', '43715', '43722', '43724', '43725', '43691', '43693', '43694', '43695', '43696', '43697', '43698', '43699', '43651', '43340', '43294', '43682', '43683', '43650', '43673', '43674', '43675', '43678', '43679', '43680', '43681', '43671', '43660', '43661', '43662', '42566', '43670', '43667', '42527', '43624', '43628', '43633', '43638', '43645', '43646', '43653', '43654', '43655', '43658']


	def fetch_all_id(end_page=10, timeout=30):
	    """Collect job ids from listing pages 1 through ``end_page``.

	    Each listing page is downloaded from ``JOB_PAGE_URL``; every anchor whose
	    class attribute is exactly ``"blacktext yellowlink"`` contributes the
	    text after the last ``=`` of its ``href``.

	    Args:
	        end_page: Last listing page to fetch (inclusive). Values below 1
	            fetch nothing.
	        timeout: Seconds to wait for each HTTP response before ``requests``
	            gives up.

	    Returns:
	        A list of id strings, in page order and then document order.

	    Raises:
	        requests.RequestException: On connection failure or timeout.
	    """
	    id_list = []
	    for page in range(1, end_page + 1):
	        # Without a timeout a stalled server would block the crawl forever.
	        resp = requests.get(JOB_PAGE_URL.format(page=page), timeout=timeout)
	        root = etree.HTML(resp.text)
	        if root is None:
	            # lxml may hand back no root element when the body has no markup.
	            continue

	        for link in root.xpath('//a[@class="blacktext yellowlink"]'):
	            href = link.get('href')
	            # An anchor without href used to crash on None.split(); skip it.
	            if href is not None:
	                id_list.append(href.split('=')[-1])

	    return id_list


	def fetch_info_by_id(pid, timeout=30):
	    """Download one job posting and return its fields as an ordered mapping.

	    The first six keys take the ``.text`` of each ``div`` whose class is
	    exactly ``"borderyellow largeml-20"``, in document order. The next five
	    take the space-joined text of the first five elements whose class
	    contains ``"mt30"``. The last key, ``'url'``, is the page address.

	    Both groups are padded with ``''`` or truncated to their fixed width, so
	    every key is always present and a page with missing or extra elements
	    cannot shift values onto a neighbouring key.

	    Args:
	        pid: Job id substituted into ``JOB_ID_URL``.
	        timeout: Seconds to wait for the HTTP response before ``requests``
	            gives up.

	    Returns:
	        ``collections.OrderedDict`` with exactly twelve keys, in the order
	        ``basic_row + adv_row``.

	    Raises:
	        requests.RequestException: On connection failure or timeout.
	    """
	    basic_row = ['from', 'title', 'type', 'publish_date', 'place', 'salary']
	    adv_row = ['pre_requirements', 'job_details', 'others', 'welfare', 'how', 'url']

	    url = JOB_ID_URL.format(id=pid)
	    # Without a timeout a stalled server would block the crawl forever.
	    resp = requests.get(url, timeout=timeout)
	    root = etree.HTML(resp.text)

	    basic = []
	    adv = []
	    # lxml may hand back no root element when the body has no markup.
	    if root is not None:
	        # Basic info
	        basic = [div.text for div in root.xpath('//div[@class="borderyellow largeml-20"]')]

	        # Advance info: strip line breaks/tabs from each text fragment, then
	        # join the fragments of one element into a single cell.
	        for node in root.xpath('//*[contains(@class, "mt30")]'):
	            pieces = [piece.strip('\r\n\t') for piece in node.itertext()]
	            adv.append(' '.join(pieces).strip(' '))

	    # Force each group to its header width so zip() below stays aligned.
	    basic_width = len(basic_row)
	    adv_width = len(adv_row) - 1  # the final 'url' slot is appended below
	    basic = (basic + [''] * basic_width)[:basic_width]
	    adv = (adv + [''] * adv_width)[:adv_width]
	    adv.append(url)

	    return collections.OrderedDict(zip(basic_row + adv_row, basic + adv))


	if __name__ == '__main__':
	    # Script entry point: fetch every posting in ID_LIST and dump the fields
	    # to output.csv, one fully-quoted row per job id.
	    # print(fetch_all_id(19))

	    # Twelve columns, matching the key order of fetch_info_by_id(). The
	    # original was missing the comma between 'job_details' and 'others',
	    # which fused them into one header cell and left the header a column short.
	    header = [
	        'from', 'title', 'type', 'publish_date', 'place', 'salary',
	        'pre_requirements', 'job_details', 'others', 'welfare', 'how', 'url',
	    ]

	    # Explicit UTF-8: the scraped text is non-ASCII and the platform default
	    # encoding may not be able to represent it.
	    with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
	        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)

	        writer.writerow(header)
	        for job_id in ID_LIST:
	            writer.writerow(fetch_info_by_id(job_id).values())