Skip to content

Instantly share code, notes, and snippets.

@dengshilong
Created November 21, 2013 09:27
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dengshilong/7578553 to your computer and use it in GitHub Desktop.
Save dengshilong/7578553 to your computer and use it in GitHub Desktop.
抓取百度风云榜的热词。在urls.txt中写入 http://top.baidu.com/buzz?b=18等需要抓取的二级分类链接
# -*- encoding: UTF-8 -*-
import urllib2
import re
from datetime import date
def get_page(url):
"""得到一个网页的内容"""
try:
print "crawling %s" % url
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",\
"Referer": 'http://www.baidu.com'}
req = urllib2.Request(url, headers=headers) #设置头部
#req.set_proxy("125.216.144.199:8080",'http') #设置代理
return urllib2.urlopen(req).read()
except Exception,e:
print e
print "can't get page %s" % url
return None
def get_sub_categorys(url):
    """Extract sub-category links from a top-level category page.

    Input:  a first-level category link such as http://top.baidu.com/category?c=1
    Output: a list of (absolute_link, category_name) tuples for second-level
            pages such as http://top.baidu.com/buzz?b=338, or None when the
            page could not be fetched or the link list was not found.
    """
    content = get_page(url)
    # get_page returns None on failure; without this guard re.search(None)
    # would raise TypeError.
    if content is None:
        return None
    m = re.search(r'<div id="flist"([\s\S]+?)</div>', content)
    if not m:
        return None
    # Each match is (relative path, visible category name).
    links = re.findall(r'(buzz\?b=[\d]+)[^>]+>([^<]+)', m.group(1))
    root = 'http://top.baidu.com/'
    return [(root + path, name) for path, name in links]
def get_buzz_word(url):
    """Scrape the hot-word list from a buzz page.

    Input:  a second-level link such as http://top.baidu.com/buzz?b=18
    Output: the list of hot words on that page; an empty list when the
            page could not be fetched (previously this crashed, because
            re.findall received None).
    """
    content = get_page(url)
    if content is None:
        return []
    return re.findall(
        r'<a class="list-title" target="_blank" href=".+?">(.+?)</a>', content)
if __name__ == "__main__":
    # For every sub-category URL listed in urls.txt, write its hot words
    # as one tab-separated line to words.txt.
    # `with` guarantees both files are closed even if a crawl raises
    # (the original left words.txt open on error).
    with open('urls.txt', 'r') as url_file:
        with open('words.txt', 'w') as out_file:
            for line in url_file:
                url = line.strip()
                if not url:
                    continue  # skip blank lines instead of crawling ""
                words = get_buzz_word(url)
                out_file.write('\t'.join(words) + '\n')
http://top.baidu.com/buzz?b=258
http://top.baidu.com/buzz?b=3
http://top.baidu.com/buzz?b=22
http://top.baidu.com/buzz?b=18
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment