Skip to content

Instantly share code, notes, and snippets.

@linnil1
Created April 30, 2018 14:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save linnil1/1b2825bf37b5b3e2c7ffffb64a90e7f1 to your computer and use it in GitHub Desktop.
Save linnil1/1b2825bf37b5b3e2c7ffffb64a90e7f1 to your computer and use it in GitHub Desktop.
A spider for 全國中小學科學展覽會
import requests
from lxml import html
from pprint import pprint
import json
import re
def getLists():
    """Collect (title, href) pairs from the paginated article listing.

    Requests listing pages 1 through 6 of the twsf.ntsec.gov.tw article
    index and, for every element whose class is ``PicTextRight``, reads the
    ``title`` and ``href`` attributes of its first direct ``<a>`` child.

    Returns:
        list: One ``(title, href)`` tuple per listing entry, in the order
        the pages and entries were visited.

    Raises:
        IndexError: If a ``PicTextRight`` element has no direct ``<a>`` child.
    """
    alllinks = []
    for page in range(1, 7):
        url = "https://twsf.ntsec.gov.tw/Article.aspx?a=40&lang=1&p=" + str(page)
        res = requests.get(url)
        # Decode the raw bytes with the default codec (UTF-8) before parsing.
        xml = html.document_fromstring(res.content.decode())
        for node in xml.xpath("//*[@class='PicTextRight']"):
            alink = node.xpath("a")[0]
            title = alink.get('title')
            href = alink.get('href')
            alllinks.append((title, href))
    # Without this return the caller receives None instead of the links.
    return alllinks
# Scrape the article listing once and cache it on disk as JSON.
alllinks = getLists()
print(alllinks)
# A context manager guarantees the file is flushed and closed after writing.
with open("twsf_lists.json", 'w') as f:
    json.dump(alllinks, f)
def get38Works():
    """Download the ``<h3>`` headings of the last 38 listed pages.

    Reads the cached listing from ``twsf_lists.json``, fetches the page
    behind each of its last 38 entries (decoded as Big5), collects the text
    of every ``<h3>`` element, and stores the result in
    ``twsf_38_works.json`` keyed by the first run of digits found in the
    entry title. Entries already present in ``twsf_38_works.json`` are kept
    unless overwritten by a key fetched in this run.

    Returns:
        None. The result is written to ``twsf_38_works.json``.

    Raises:
        FileNotFoundError: If ``twsf_lists.json`` does not exist.
        IndexError: If an entry title contains no digits.
    """
    # Original author's note: "no 1, 10, 11" -- presumably those numbered
    # entries are not available; TODO confirm against the site.
    with open("twsf_lists.json") as f:
        lists = json.load(f)

    # Start from the existing cache when there is one; on a first run the
    # file does not exist yet, so begin with an empty mapping.
    try:
        with open("twsf_38_works.json") as f:
            allworks = json.load(f)
    except FileNotFoundError:
        allworks = {}

    for entry in lists[-38:]:
        # entry is a [title, href] pair as stored in twsf_lists.json.
        title = re.findall(r'\d+', entry[0])[0]
        res = requests.get(entry[1])
        # Force Big5 decoding of the response body before reading res.text.
        res.encoding = 'big5'
        xml = html.document_fromstring(res.text)
        allworks[title] = [heading.text_content() for heading in xml.xpath("//h3")]

    with open("twsf_38_works.json", 'w') as f:
        json.dump(allworks, f)
# Load the cached listing and the cached per-entry headings from disk,
# closing each file as soon as it has been parsed.
with open("twsf_lists.json") as f:
    lists = json.load(f)
with open("twsf_38_works.json") as f:
    allworks = json.load(f)
# find it
# NOTE(review): the search term below is the empty string, which is a
# substring of every string, so every heading is printed. Presumably this is
# a placeholder to be replaced with the text being searched for -- confirm.
for i in allworks:
    for j in allworks[i]:
        if "" in j:
            print(i)
            print(j)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment