import requests
from bs4 import BeautifulSoup
from lxml import etree
import re
import html
import pprint
# Baidu News scraper: fetches one search-result page for the keyword in
# ``base_url`` and pretty-prints a list of {'url', 'title', 'author', 'time'}
# dicts, one per result entry that carries a headline link.

pp = pprint.PrettyPrinter(indent=4)
# Kept so the module still exposes this name; titles are now extracted with
# XPath ``string()`` rather than by regex-stripping serialized markup.
TAG_RE = re.compile(r'<[^>]+>')
base_url = 'http://news.baidu.com/ns?rn=20&ie=utf-8&cl=2&ct=0&rsv_bp=1&sr=0&f=8&prevct=1&tn=newstitle&word=ethereum'
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def _first(nodes):
    """Return the first XPath match, or None when the result list is empty."""
    return nodes[0] if nodes else None


def _parse_item(item):
    """Extract one news entry from a result ``<div>`` element.

    Args:
        item: lxml element selected by the result XPath.

    Returns:
        A dict with the keys 'url', 'title', 'author' and 'time', or None
        when the element has no ``./h3/a`` link. The class match used to
        select candidates is a substring match, so it can also pick up
        divs that are not news entries; those are skipped instead of
        aborting the whole run with an IndexError.
        'author' and 'time' are None when the byline is missing or has
        fewer than two non-empty lines.
    """
    source = _first(item.xpath('./h3/a/@href'))
    if source is None:
        return None

    # string() concatenates every descendant text node of the first <h3>,
    # already entity-decoded, so no tag stripping or fixed-offset slicing
    # of a bytes repr is needed.
    title = item.xpath('string(./h3)').strip()

    byline = _first(
        item.xpath('./div[contains(@class, "c-title-author")]/text()')
    ) or ''
    # The byline text is split on newlines; blank fragments are dropped.
    parts = [piece.strip() for piece in byline.split('\n') if piece.strip()]
    author = parts[0] if parts else None
    published = parts[1] if len(parts) > 1 else None

    return {
        'url': str(source),
        'title': title,
        'author': author,
        'time': published,
    }


r = requests.get(base_url, timeout=REQUEST_TIMEOUT)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
r.raise_for_status()
print(r.url)

selector = etree.HTML(r.content)
result = []
# etree.HTML may yield None for a document with no parsable content.
if selector is not None:
    for item in selector.xpath("//div[contains(@class,'result')]"):
        entry = _parse_item(item)
        if entry is not None:
            result.append(entry)

pp.pprint(result)
Last active
October 10, 2018 06:48
-
-
Save watsy0007/5426b3740f8f4711c382a66eb7b6299b to your computer and use it in GitHub Desktop.
百度新闻爬虫
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment