Skip to content

Instantly share code, notes, and snippets.

Created February 28, 2022 05:55
Show Gist options
  • Save Linusp/321101c9afe58827347769ec2135e137 to your computer and use it in GitHub Desktop.
import re
import json
import argparse
from urllib.parse import unquote_plus
import requests
from lxml import html
# URL is the index page fetched by get_links().
# NOTE(review): ROOT and URL are both empty strings here -- presumably the
# real addresses (site root / index page) were lost; fill them in before
# running and confirm against the intended site.
ROOT = ''
URL = ''

# Proxy mapping passed to every get_page() call made by the scraper.
PROXIES = {
    'http': 'localhost:1235',
    'https': 'localhost:1235',
}
def get_page(link, proxies=None, timeout=10):
    """Fetch *link* over HTTP and return the body parsed as an lxml HTML tree.

    Args:
        link: URL to request.
        proxies: Optional proxy mapping forwarded to ``requests.get``.
        timeout: Timeout forwarded to ``requests.get``.

    Returns:
        The element produced by ``lxml.html.fromstring`` for the response text.

    Raises:
        ValueError: If the response status code is not 200; the message is
            the response's reason phrase.
    """
    resp = requests.get(link, proxies=proxies, timeout=timeout)
    if resp.status_code != 200:
        raise ValueError(resp.reason)
    return html.fromstring(resp.text)
def get_links():
    """Collect archive-page links from the index page at ``URL``.

    Every ``href`` under ``//p/a`` is URL-decoded and kept only when it
    contains '新条目推荐' (not at position 0) and ends with a
    ``<4 digits>年<1-2 digits>月`` suffix.

    Returns:
        A set of the matching links, prefixed with ``ROOT`` when they do not
        already start with it.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape.
    month_suffix = re.compile(r'\d{4}年\d{1,2}月$')

    content = get_page(URL, proxies=PROXIES)
    links = set()
    for link in content.xpath('//p/a/@href'):
        link = unquote_plus(link)
        if link.find('新条目推荐') > 0 and month_suffix.search(link):
            # Make relative links absolute. With ROOT == '' this is a no-op,
            # exactly like the previous hard-coded empty prefix.
            if not link.startswith(ROOT):
                link = ROOT + link
            # NOTE(review): both arguments are empty, so this call changes
            # nothing as written -- presumably the original substrings were
            # lost; confirm what was meant to be replaced.
            link = link.replace('', '')
            # Without this the function always returned an empty set.
            links.add(link)
    return links
def extract_questions(link):
    """Extract question records from the list items of the page at *link*.

    Each ``<li>`` under ``//div[@id="bodyContent"]//ul`` becomes one record:

    * ``text``     -- stripped text content of the list item;
    * ``entities`` -- one ``{'text', 'offset', 'url'}`` dict per direct
      ``<a>`` child;
    * ``focus``    -- ``{'text', 'offset', 'url', 'answer'}`` built from the
      single ``<a>`` inside a direct ``<b>`` child, where ``answer`` is that
      link's ``title`` attribute.

    A list item is dropped when a ``<b>`` child does not wrap exactly one
    link, when that link's text differs from the ``<b>`` text, or when a
    second focus is encountered.

    Args:
        link: URL of the page to scrape.

    Returns:
        A list of record dicts, in document order.
    """
    questions = []
    content = get_page(link, proxies=PROXIES)
    for item in content.xpath('//div[@id="bodyContent"]//ul/li'):
        question_text = item.text_content().strip()
        cur = {
            'text': question_text,
            'focus': {},
            'entities': [],
        }
        invalid, last_offset = False, 0
        for child in item.iterchildren():
            child_text = child.text_content().strip()
            offset = question_text.find(child_text, last_offset)
            if child.tag == 'a':
                cur['entities'].append({
                    'text': child_text,
                    'offset': offset,
                    # An <a> without href would make unquote_plus raise.
                    'url': unquote_plus(child.get('href') or ''),
                })
            elif child.tag == 'b':
                focus_links = list(child.iter('a'))
                # Stop at the first violation: indexing focus_links[0] below
                # would raise IndexError on an empty list.
                if len(focus_links) != 1:
                    invalid = True
                    break
                focus = focus_links[0]
                if focus.text_content() != child_text:
                    invalid = True
                    break
                if cur.get('focus'):
                    invalid = True
                    break
                focus_url = focus.get('href')
                focus_name = focus.get('title')
                cur['focus'] = {
                    'text': child_text,
                    'offset': offset,
                    'url': unquote_plus(focus_url or ''),
                    'answer': focus_name,
                }
            # Advance the search start so a repeated child text resolves to
            # its own occurrence rather than the first one.
            # NOTE(review): last_offset was initialised but never updated in
            # the visible code; confirm this matches the intended behaviour.
            if offset >= 0:
                last_offset = offset + len(child_text)
        if invalid:
            continue
        questions.append(cur)
    return questions
def main():
    """Scrape every archive page and write the questions as JSON lines.

    Command-line arguments:
        -o / --outfile (required): path of the output file. One JSON object
        is written per line; each record gets a ``source`` key holding the
        link of the page it came from.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--outfile", required=True)
    args = parser.parse_args()

    # Explicit UTF-8: records are dumped with ensure_ascii=False, so the
    # platform default encoding may be unable to represent the text.
    with open(args.outfile, 'w', encoding='utf-8') as fout:
        for link in get_links():
            for item in extract_questions(link):
                item['source'] = link
                print(json.dumps(item, ensure_ascii=False), file=fout)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment