Skip to content

Instantly share code, notes, and snippets.

@ibigbug
Created April 9, 2013 17:05
Show Gist options
  • Save ibigbug/5347450 to your computer and use it in GitHub Desktop.
Baidu Tieba Crawler
#!/usr/bin/env python2
#coding:utf-8
import re
import time
import os
import urllib
import requests
from pyquery import PyQuery as pquery
HOST = 'http://tieba.baidu.com'
def mkdir(name=None):
    """Create directory *name* (relative to the current working
    directory) if it does not already exist, and return its
    absolute path.

    NOTE(review): the default ``name=None`` is kept for interface
    compatibility, but passing None would crash os.path.exists —
    callers always pass a string.
    """
    # The original chdir(os.path.curdir) was a no-op (chdir to '.')
    # and has been removed.
    if not os.path.exists(name):
        os.makedirs(name)
    return os.path.abspath(name)
class TiebaCrawler(object):
def __init__(self, kw):
self.kw = kw
self.url = HOST + '/f?kw=' + kw
self.charset = 'GBK'
self.post_count = 0
self.pic_count = 0
def prepare(self):
"""
get total page number
make folder
"""
self.save_to = mkdir(self.kw)
os.chdir(self.save_to)
req = requests.get(self.url)
pager_pattern = re.compile(r'<a href=".+?pn=(.+?)" class="last">',
re.M)
last_page = pager_pattern.findall(req.content)
return int(last_page[0])
def generate_link(self):
total_page = self.prepare()
raw_post_pattern = r'<a href="(.+?)" title="(.+?)".+?class="j_th_tit"'
post_pattern = re.compile(raw_post_pattern, re.M)
for page in xrange(0, total_page, 50):
url = self.url + '&pn=' + str(page)
req = requests.get(url)
links = post_pattern.findall(req.content)
for link in links:
time.sleep(1)
self.post_count += 1
yield link
def consume(self):
q = self.generate_link()
while True:
try:
post = q.next()
self.handle_post(post)
time.sleep(1)
except StopIteration:
q.send(None)
def handle_post(self, post, page=1, count=0):
query, title = post
url = HOST + query + '?see_lz=1&pn=%s' % page
req = requests.get(url)
doc = pquery(req.text)
img_tags = doc('img.BDE_Image')
if img_tags:
post_dir = mkdir(title.decode(self.charset))
os.chdir(post_dir)
for img in img_tags:
try:
img_src = img.attrib.get('src')
count += 1
self.pic_count += 1
img_name = img_src.split('/')[-1]
urllib.urlretrieve(img_src, img_name)
except:
continue
os.chdir(self.save_to)
print '在帖子"%s"中第%d页中共搜集到%d张福利、、' % \
(title.decode(self.charset).encode('utf-8'), page, count)
pager_pattern = re.compile(u'>下一页', re.M)
has_next = pager_pattern.findall(req.text)
if has_next:
self.handle_post(post, page+1, count)
def run(self):
print '正在进入贴吧 %s' % self.kw
print '正在启动只看楼主模式...'
print '正在抓取...'
self.consume()
print """
总共帮你浏览了%d张帖子,收集到福利%d张^_^
""" % (self.post_count, self.pic_count)
if __name__ == '__main__':
b = TiebaCrawler('姐脱')
try:
b.run()
except KeyboardInterrupt:
print """
总共帮你浏览了%d张帖子,收集到福利%d张^_^
""" % (b.post_count, b.pic_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment