watsy0007/baidu_news.md

## baidu_news.md

      
    Raw
  

              baidu_news.md
            
          
    import requests
from bs4 import BeautifulSoup
from lxml import etree
import re
import html
import pprint
# Pretty-printer used to dump the scraped records at the end of the script.
pp = pprint.PrettyPrinter(indent=4)

# Matches a single HTML/XML tag: '<', one or more non-'>' characters, then '>'.
TAG_RE = re.compile(r'<[^>]+>')

# Baidu News search URL. Presumably `word` is the search keyword and `rn` the
# number of results per page -- confirm against Baidu's query parameters.
base_url = 'http://news.baidu.com/ns?rn=20&ie=utf-8&cl=2&ct=0&rsv_bp=1&sr=0&f=8&prevct=1&tn=newstitle&word=ethereum'

# Without a timeout, requests waits forever on a stalled connection.
r = requests.get(base_url, timeout=10)
# Fail loudly on 4xx/5xx instead of scraping an error page as if it were results.
r.raise_for_status()
# Show the URL that was actually fetched (after any redirects).
print(r.url)


# Parse the raw response bytes into an lxml element tree.
selector = etree.HTML(r.content)

# One dict per news entry, with keys 'url', 'title', 'author' and 'time'.
result = []
for item in selector.xpath("//div[contains(@class,'result')]"):
    links = item.xpath('./h3/a/@href')
    headings = item.xpath('./h3')
    author_texts = item.xpath('./div[contains(@class, "c-title-author")]/text()')

    # The class match is loose, so it can also select divs that are not news
    # entries; skip anything missing the pieces we need instead of crashing.
    if not (links and headings and author_texts):
        continue

    source = links[0]

    # itertext() yields the heading's decoded text across nested tags
    # (e.g. highlighted keywords), so no tag stripping, entity unescaping or
    # fixed-offset slicing of a bytes repr is required.
    title = ''.join(headings[0].itertext()).strip()

    # The first text node of the author line is split on newlines; blank
    # lines are dropped and each remaining piece is trimmed.
    parts = [piece.strip() for piece in author_texts[0].split('\n') if piece.strip()]
    # The first piece is taken as the author and the second as the time.
    # Missing pieces become empty strings rather than raising ValueError on
    # unpacking; any extra pieces are ignored.
    author, time = (parts + ['', ''])[:2]

    result.append({
        'url': source,
        'title': title,
        'author': author,
        'time': time,
    })
pp.pprint(result)