Baidu Tieba Crawler
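A small Python 2 script that crawls a Baidu Tieba forum: it walks the thread list page by page, opens each thread in author-only mode (see_lz=1), and downloads every inline image into a sub-folder named after the thread.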
#!/usr/bin/env python2
# coding: utf-8
import re
import time
import os
import urllib

import requests
from pyquery import PyQuery as pquery

HOST = 'http://tieba.baidu.com'
def mkdir(name=None):
    """Create the directory if it does not exist and return its absolute path."""
    if not os.path.exists(name):
        os.makedirs(name)
    return os.path.abspath(name)
class TiebaCrawler(object):

    def __init__(self, kw):
        self.kw = kw
        self.url = HOST + '/f?kw=' + kw
        self.charset = 'GBK'  # Tieba forum pages are served as GBK
        self.post_count = 0
        self.pic_count = 0

    def prepare(self):
        """
        Get the total page number and create the download folder.
        """
        self.save_to = mkdir(self.kw)
        os.chdir(self.save_to)
        req = requests.get(self.url)
        pager_pattern = re.compile(r'<a href=".+?pn=(.+?)" class="last">',
                                    re.M)
        last_page = pager_pattern.findall(req.content)
        return int(last_page[0])
    def generate_link(self):
        """Yield (link, title) tuples for every thread in the forum."""
        total_page = self.prepare()
        raw_post_pattern = r'<a href="(.+?)" title="(.+?)".+?class="j_th_tit"'
        post_pattern = re.compile(raw_post_pattern, re.M)
        # Tieba paginates the thread list with a "pn" offset in steps of 50
        for page in xrange(0, total_page, 50):
            url = self.url + '&pn=' + str(page)
            req = requests.get(url)
            links = post_pattern.findall(req.content)
            for link in links:
                time.sleep(1)  # be polite to the server
                self.post_count += 1
                yield link
    def consume(self):
        q = self.generate_link()
        while True:
            try:
                post = q.next()
                self.handle_post(post)
                time.sleep(1)
            except StopIteration:
                # generator is exhausted; stop instead of poking it again
                break
    def handle_post(self, post, page=1, count=0):
        query, title = post
        # see_lz=1 switches the thread to author-only view
        url = HOST + query + '?see_lz=1&pn=%s' % page
        req = requests.get(url)
        doc = pquery(req.text)
        img_tags = doc('img.BDE_Image')
        if img_tags:
            post_dir = mkdir(title.decode(self.charset))
            os.chdir(post_dir)
            for img in img_tags:
                try:
                    img_src = img.attrib.get('src')
                    count += 1
                    self.pic_count += 1
                    img_name = img_src.split('/')[-1]
                    urllib.urlretrieve(img_src, img_name)
                except Exception:
                    # skip images that fail to download
                    continue
            os.chdir(self.save_to)
        print 'In thread "%s", page %d: %d images collected so far' % \
            (title.decode(self.charset).encode('utf-8'), page, count)
        # "下一页" is the literal "next page" link text on Tieba
        pager_pattern = re.compile(u'>下一页', re.M)
        has_next = pager_pattern.findall(req.text)
        if has_next:
            self.handle_post(post, page + 1, count)
    def run(self):
        print 'Entering Tieba forum %s' % self.kw
        print 'Enabling author-only (see_lz) mode...'
        print 'Crawling...'
        self.consume()
        print """
        Browsed %d threads and collected %d images for you ^_^
        """ % (self.post_count, self.pic_count)
if __name__ == '__main__':
    b = TiebaCrawler('姐脱')
    try:
        b.run()
    except KeyboardInterrupt:
        print """
        Browsed %d threads and collected %d images for you ^_^
        """ % (b.post_count, b.pic_count)