@Linusp
Created February 28, 2022 05:55
Scrape the "Did you know" (你知道吗) section of the Wikipedia main page
"""抓取维基百科中文站的新条目推荐"""
import re
import json
import argparse
from urllib.parse import unquote_plus

import requests
from lxml import html

ROOT = 'https://zh.wikipedia.org'
URL = 'https://zh.wikipedia.org/zh-cn/Wikipedia:%E6%96%B0%E6%9D%A1%E7%9B%AE%E6%8E%A8%E8%8D%90'
PROXIES = {
    'http': 'http://localhost:1235',
    'https': 'http://localhost:1235',
}
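# NOTE: assumes a local HTTP proxy listening on port 1235; set PROXIES = None
# to fetch directly if you do not need one.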


def get_page(link, proxies=None, timeout=10):
    resp = requests.get(link, proxies=proxies, timeout=timeout)
    if resp.status_code != 200:
        raise ValueError(resp.reason)

    content = html.fromstring(resp.text)
    content.make_links_absolute(ROOT)
    return content


def get_links():
    """Collect links to the monthly DYK archive pages from the index page."""
    content = get_page(URL, proxies=PROXIES)
    links = set()
    for link in content.xpath('//p/a/@href'):
        link = unquote_plus(link)
        # Keep only archive links that mention 新条目推荐 and end with a
        # year-month suffix such as 2022年2月.
        if link.find('新条目推荐') > 0 and re.findall(r'\d{4}年\d{1,2}月$', link):
            if not link.startswith('https://zh.wikipedia.org'):
                link = 'https://zh.wikipedia.org' + link
            link = link.replace('zh.wikipedia.org/wiki', 'zh.wikipedia.org/zh-cn')
            links.add(link)
    return links
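# A link that passes the filter above would look like this (the month value
# is illustrative):
#   https://zh.wikipedia.org/zh-cn/Wikipedia:新条目推荐/2022年2月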


def extract_questions(link):
    """Parse one archive page and return its question items."""
    questions = []
    content = get_page(link, proxies=PROXIES)
    for item in content.xpath('//div[@id="bodyContent"]//ul/li'):
        question_text = item.text_content().strip()
        cur = {
            'text': question_text,
            'focus': {},
            'entities': []
        }
        invalid, last_offset = False, 0
        for child in item.iterchildren():
            child_text = child.text_content().strip()
            offset = question_text.find(child_text, last_offset)
            if offset >= 0:
                # Advance past this match so repeated surface forms map to
                # successive occurrences instead of all hitting the first one.
                last_offset = offset + len(child_text)
            if child.tag == 'a':
                # A plain link is a mentioned entity.
                cur['entities'].append({
                    'text': child_text,
                    'offset': offset,
                    'url': unquote_plus(child.get('href')),
                })
            elif child.tag == 'b':
                # The bolded link is the focus (answer) of the question;
                # expect exactly one per item, wrapping the whole bold text.
                focus_links = list(child.iter('a'))
                if len(focus_links) != 1:
                    invalid = True
                    break
                focus = focus_links[0]
                if focus.text_content() != child_text:
                    invalid = True
                    break
                if cur.get('focus'):
                    invalid = True
                    break
                focus_url = focus.get('href')
                focus_name = focus.get('title')
                cur['focus'] = {
                    'text': child_text,
                    'offset': offset,
                    'url': unquote_plus(focus_url),
                    'answer': focus_name
                }
        if invalid:
            continue
        questions.append(cur)
    return questions
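# Illustrative shape of one extracted record (values are hypothetical; the
# keys mirror the dict built above, plus the "source" field added in main):
# {"text": "……哪……？",
#  "focus": {"text": "……", "offset": 12,
#            "url": "https://zh.wikipedia.org/wiki/……", "answer": "……"},
#  "entities": [{"text": "……", "offset": 3,
#                "url": "https://zh.wikipedia.org/wiki/……"}],
#  "source": "https://zh.wikipedia.org/zh-cn/……"}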


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--outfile", required=True)
    args = parser.parse_args()

    # Write one JSON object per line (JSON Lines), tagging each question
    # with the archive page it came from.
    with open(args.outfile, 'w', encoding='utf-8') as fout:
        for link in get_links():
            for item in extract_questions(link):
                item['source'] = link
                print(json.dumps(item, ensure_ascii=False), file=fout)


if __name__ == '__main__':
    main()
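# Example invocation (the file name is hypothetical; requires the proxy
# configured in PROXIES to be running, or PROXIES = None for direct access):
#   python fetch_dyk.py -o dyk.jsonl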