# linnil1/twsf.py

## twsf.py
import requests
from lxml import html
from pprint import pprint
import json
import re

def getLists():
    """Scrape the paginated article index and save every (title, link) pair.

    Requests listing pages ``p=1`` through ``p=6`` of
    ``https://twsf.ntsec.gov.tw/Article.aspx?a=40&lang=1``. For each element
    whose class is ``PicTextRight`` it records the ``title`` and ``href``
    attributes of that element's first direct ``<a>`` child. The collected
    pairs are printed and written to ``twsf_lists.json``.

    Returns:
        list: ``(title, href)`` tuples in the order they were encountered.

    Raises:
        IndexError: if a ``PicTextRight`` element has no direct ``<a>`` child.
    """
    alllinks = []
    for page in range(1, 7):
        url = "https://twsf.ntsec.gov.tw/Article.aspx?a=40&lang=1&p=" + str(page)
        res = requests.get(url)
        xml = html.document_fromstring(res.content.decode())
        for node in xml.xpath("//*[@class='PicTextRight']"):
            alink = node.xpath("a")[0]
            alllinks.append((alink.get('title'), alink.get('href')))

    # The previous body called getLists() again at this point, which recursed
    # without end and never reached the dump; use the scraped list directly.
    print(alllinks)
    with open("twsf_lists.json", 'w') as f:
        json.dump(alllinks, f)
    return alllinks


def get38Works():
    """Collect the ``<h3>`` headings of the last 38 listed pages.

    Reads the ``(title, href)`` pairs stored in ``twsf_lists.json`` and, for
    each of the last 38 entries, downloads the linked page, decodes it as
    Big5 and gathers the text of every ``<h3>`` element. Results are keyed by
    the first run of digits found in the entry's title and merged into the
    mapping already stored in ``twsf_38_works.json`` (an empty mapping is
    used when that file does not exist yet).

    Returns:
        dict: mapping of title number (str) to the list of heading texts.

    Raises:
        FileNotFoundError: if ``twsf_lists.json`` is missing.
        IndexError: if an entry's title contains no digits.
    """
    # NOTE(review): original author's note read "no 1, 10, 11" -- presumably
    # the entries that yield no data; confirm before relying on it.
    with open("twsf_lists.json") as f:
        lists = json.load(f)

    # Continue from earlier results when present; start empty on a first run.
    try:
        with open("twsf_38_works.json") as f:
            allworks = json.load(f)
    except FileNotFoundError:
        allworks = {}

    for entry in lists[-38:]:
        title = re.findall(r'\d+', entry[0])[0]
        res = requests.get(entry[1])
        res.encoding = 'big5'
        xml = html.document_fromstring(res.text)

        allworks[title] = [h3.text_content() for h3 in xml.xpath("//h3")]

        # Rewritten after every page, as before -- presumably so partial
        # progress is kept if a later request fails.
        with open("twsf_38_works.json", 'w') as f:
            json.dump(allworks, f)

    return allworks

# Load the scraped index and the per-page headings produced by the functions
# above; both files must already exist.
with open("twsf_lists.json") as f:
    lists = json.load(f)
with open("twsf_38_works.json") as f:
    allworks = json.load(f)

# find it
# Text to look for inside each heading. It is left empty here, and an empty
# string is contained in every string, so every heading is printed until a
# real search term is filled in.
keyword = ""
for title, works in allworks.items():
    for work in works:
        if keyword in work:
            print(title)
            print(work)
	import requests
	from lxml import html
	from pprint import pprint
	import json
	import re

	def getLists():
		"""Scrape the paginated article index and save every (title, link) pair.

		Requests listing pages ``p=1`` through ``p=6`` of
		``https://twsf.ntsec.gov.tw/Article.aspx?a=40&lang=1``. For each element
		whose class is ``PicTextRight`` it records the ``title`` and ``href``
		attributes of that element's first direct ``<a>`` child. The collected
		pairs are printed and written to ``twsf_lists.json``.

		Returns:
			list: ``(title, href)`` tuples in the order they were encountered.

		Raises:
			IndexError: if a ``PicTextRight`` element has no direct ``<a>`` child.
		"""
		alllinks = []
		for page in range(1, 7):
			url = "https://twsf.ntsec.gov.tw/Article.aspx?a=40&lang=1&p=" + str(page)
			res = requests.get(url)
			xml = html.document_fromstring(res.content.decode())
			for node in xml.xpath("//*[@class='PicTextRight']"):
				alink = node.xpath("a")[0]
				alllinks.append((alink.get('title'), alink.get('href')))

		# The previous text called getLists() again at this point; inside the
		# function that recurses without end, so use the scraped list directly.
		print(alllinks)
		with open("twsf_lists.json", 'w') as f:
			json.dump(alllinks, f)
		return alllinks


	def get38Works():
		"""Collect the ``<h3>`` headings of the last 38 listed pages.

		Reads the ``(title, href)`` pairs stored in ``twsf_lists.json`` and, for
		each of the last 38 entries, downloads the linked page, decodes it as
		Big5 and gathers the text of every ``<h3>`` element. Results are keyed by
		the first run of digits found in the entry's title and merged into the
		mapping already stored in ``twsf_38_works.json`` (an empty mapping is
		used when that file does not exist yet).

		Returns:
			dict: mapping of title number (str) to the list of heading texts.

		Raises:
			FileNotFoundError: if ``twsf_lists.json`` is missing.
			IndexError: if an entry's title contains no digits.
		"""
		# NOTE(review): original author's note read "no 1, 10, 11" -- presumably
		# the entries that yield no data; confirm before relying on it.
		with open("twsf_lists.json") as f:
			lists = json.load(f)

		# Continue from earlier results when present; start empty on a first run.
		try:
			with open("twsf_38_works.json") as f:
				allworks = json.load(f)
		except FileNotFoundError:
			allworks = {}

		for entry in lists[-38:]:
			title = re.findall(r'\d+', entry[0])[0]
			res = requests.get(entry[1])
			res.encoding = 'big5'
			xml = html.document_fromstring(res.text)

			allworks[title] = [h3.text_content() for h3 in xml.xpath("//h3")]

			# Rewritten after every page, as before -- presumably so partial
			# progress is kept if a later request fails.
			with open("twsf_38_works.json", 'w') as f:
				json.dump(allworks, f)

		return allworks

	# Load the scraped index and the per-page headings; both files must
	# already exist.
	with open("twsf_lists.json") as f:
		lists = json.load(f)
	with open("twsf_38_works.json") as f:
		allworks = json.load(f)

	# find it
	# Text to look for inside each heading. It is left empty here, and an empty
	# string is contained in every string, so every heading is printed until a
	# real search term is filled in.
	keyword = ""
	for title, works in allworks.items():
		for work in works:
			if keyword in work:
				print(title)
				print(work)