Skip to content

Instantly share code, notes, and snippets.

@dengshilong
Created November 21, 2013 09:27
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dengshilong/7578553 to your computer and use it in GitHub Desktop.
Save dengshilong/7578553 to your computer and use it in GitHub Desktop.
抓取百度风云榜的热词。在urls.txt中写入 http://top.baidu.com/buzz?b=18等需要抓取的二级分类链接
# -*- encoding: UTF-8 -*-
import urllib2
import re
from datetime import date
def get_page(url):
"""得到一个网页的内容"""
try:
print "crawling %s" % url
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",\
"Referer": 'http://www.baidu.com'}
req = urllib2.Request(url, headers=headers) #设置头部
#req.set_proxy("125.216.144.199:8080",'http') #设置代理
return urllib2.urlopen(req).read()
except Exception,e:
print e
print "can't get page %s" % url
return None
def get_sub_categorys(url):
    """Extract sub-category links from a top-level category page.

    Input:  a first-level category link such as http://top.baidu.com/category?c=1
    Output: a list of (absolute_link, category_name) tuples for second-level
            pages such as http://top.baidu.com/buzz?b=338, or None when the
            page could not be fetched or the link list was not found.
    """
    content = get_page(url)
    # get_page returns None on failure; without this guard re.search(None)
    # would raise TypeError.
    if content is None:
        return None
    m = re.search(r'<div id="flist"([\s\S]+?)</div>', content)
    if not m:
        return None
    # Each match is (relative path, visible category name).
    links = re.findall(r'(buzz\?b=[\d]+)[^>]+>([^<]+)', m.group(1))
    root = 'http://top.baidu.com/'
    return [(root + path, name) for path, name in links]
def get_buzz_word(url):
    """Scrape the hot-word list from a buzz page.

    Input:  a second-level link such as http://top.baidu.com/buzz?b=18
    Output: the list of hot words on that page; an empty list when the
            page could not be fetched (previously this crashed, because
            re.findall received None).
    """
    content = get_page(url)
    if content is None:
        return []
    return re.findall(
        r'<a class="list-title" target="_blank" href=".+?">(.+?)</a>', content)
if __name__ == "__main__":
    # For every sub-category URL listed in urls.txt, write its hot words
    # as one tab-separated line to words.txt.
    # `with` guarantees both files are closed even if a crawl raises
    # (the original left words.txt open on error).
    with open('urls.txt', 'r') as url_file:
        with open('words.txt', 'w') as out_file:
            for line in url_file:
                url = line.strip()
                if not url:
                    continue  # skip blank lines instead of crawling ""
                words = get_buzz_word(url)
                out_file.write('\t'.join(words) + '\n')
http://top.baidu.com/buzz?b=258
http://top.baidu.com/buzz?b=3
http://top.baidu.com/buzz?b=22
http://top.baidu.com/buzz?b=18
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment