Skip to content

Instantly share code, notes, and snippets.

@watsy0007
Last active October 10, 2018 06:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save watsy0007/5426b3740f8f4711c382a66eb7b6299b to your computer and use it in GitHub Desktop.
Save watsy0007/5426b3740f8f4711c382a66eb7b6299b to your computer and use it in GitHub Desktop.
百度新闻爬虫
import requests
from bs4 import BeautifulSoup
from lxml import etree
import re
import html
import pprint
# Pretty-printer used to dump the scraped records in a readable layout.
pp = pprint.PrettyPrinter(indent=4)

# Matches a single HTML/XML tag such as <a href="..."> or </em>.
TAG_RE = re.compile(r'<[^>]+>')

# Baidu News search endpoint; the `word` query parameter carries the search
# keyword. NOTE(review): the other parameters are passed through unchanged and
# their exact meaning is not documented here -- presumably `rn` is the page
# size; confirm before changing them.
base_url = 'http://news.baidu.com/ns?rn=20&ie=utf-8&cl=2&ct=0&rsv_bp=1&sr=0&f=8&prevct=1&tn=newstitle&word=ethereum'

# Seconds to wait for the server. Without an explicit timeout requests.get()
# can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

r = requests.get(base_url, timeout=REQUEST_TIMEOUT)
# Fail loudly on 4xx/5xx instead of parsing an error page as search results.
r.raise_for_status()
# Echo the URL the response was actually served from (useful when debugging
# redirects).
print(r.url)


def _clean_text(node):
    """Return the text inside *node* with markup dropped and whitespace collapsed.

    Reading the text nodes directly avoids serialising the element to bytes
    and trimming the result by fixed offsets, which breaks as soon as the
    markup around the headline changes.
    """
    return ' '.join(''.join(node.itertext()).split())


def _parse_item(item):
    """Extract one search hit from a result ``<div>`` element.

    Returns a dict with the keys ``url``, ``title``, ``author`` and ``time``,
    or ``None`` when the element has no ``<h3>`` headline link or no
    ``c-title-author`` text, i.e. it is not an article entry.
    """
    links = item.xpath('./h3/a/@href')
    headings = item.xpath('./h3')
    author_texts = item.xpath('./div[contains(@class, "c-title-author")]/text()')
    if not (links and headings and author_texts):
        return None

    # The first text node of the author line is split on newlines; blank
    # fragments are discarded. The first fragment is taken as the source and
    # whatever follows as the timestamp, so a missing or extra fragment no
    # longer aborts the whole run.
    # NOTE(review): assumes the source name precedes the timestamp -- confirm
    # against the live markup.
    parts = [part.strip() for part in author_texts[0].split('\n') if part.strip()]
    author = parts[0] if parts else ''
    published = ' '.join(parts[1:])

    return {
        'url': links[0],
        'title': _clean_text(headings[0]),
        'author': author,
        'time': published,
    }


# Parse the response body so the result entries can be located with XPath.
selector = etree.HTML(r.content)
result = []
# Guard against a parse that yields no root element (e.g. an empty body).
items = selector.xpath("//div[contains(@class,'result')]") if selector is not None else []
for item in items:
    # The class test is a substring match, so it can also hit divs that are
    # not article entries; those are skipped rather than raising IndexError.
    record = _parse_item(item)
    if record is not None:
        result.append(record)
pp.pprint(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment