@clouds56 · Last active June 18, 2017 07:03
tieba_crawler
#!scrapy runspider -L INFO
import scrapy
import logging
import html2text
import urllib.parse
import json
import time


class BlogSpider(scrapy.Spider):
    thread_id = 2501134746
    name = 'blogspider'
    start_urls = ['https://tieba.baidu.com/p/%d' % thread_id]
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'FEED_URI': 'thread_%d.json' % thread_id,
        # Crawl breadth-first: with FIFO queues, pages are fetched in the
        # order they were scheduled.
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'FEED_FORMAT': 'jsonlines',
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    # html2text converter used to strip markup from scraped fragments.
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True

    def parse(self, response):
        # Total page count of the thread, taken from the reply-count bar.
        reply_page_num = int(response.css("div.l_thread_info > ul > li.l_reply_num > span:nth-child(2)::text").extract()[0])
        logging.info("max page number: %d" % reply_page_num)
        url_parts = urllib.parse.urlparse(response.url)
        # Request every page of the thread, last page first, by rewriting
        # the 'pn' query parameter of the thread URL.
        for i in range(reply_page_num, 0, -1):
            url_query = urllib.parse.parse_qs(url_parts.query)
            url_query.update({'pn': i})
            url_parts_dict = url_parts._asdict()
            url_parts_dict['query'] = urllib.parse.urlencode(url_query)
            request_url = urllib.parse.urlunparse(url_parts_dict.values())
            yield scrapy.Request(request_url, callback=self.parse_page)

    def parse_page(self, response):
        if response.status != 200:
            logging.warning("crawled %d '%s'" % (response.status, response.url))
        # Current page number, read from the highlighted item in the pager.
        page_num = int(self.h.handle(response.css('li.l_pager > span.tP').extract_first()).strip())
        posts = response.css('#j_p_postlist > div.l_post')
        for post in posts:
            # Each post carries its metadata (author, post id, ...) as JSON
            # in its data-field attribute.
            data = json.loads(post.xpath('@data-field').extract_first())
            # author = self.h.handle(post.css('div.d_author li.d_name .p_author_name').extract_first()).strip()
            level = int(self.h.handle(post.css('div.d_author li.l_badge .d_badge_lv').extract_first()).strip())
            # order = re.search(r"(\d+)" + "\u697c", self.h.handle(post.css('div.d_post_content_main div.core_reply'))).group(1)
            # lzl = posts.css('div.core_reply_wrapper li.lzl_single_post')
            data['author']['level'] = level
            data['timestamp'] = time.time()
            data['page_num'] = page_num
            yield data
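For reference, the pagination in parse() works by round-tripping the response URL through urllib.parse and overwriting the pn query parameter. A standalone sketch of the same round trip, using namedtuple _replace in place of the _asdict()/values() pair (equivalent here):

    import urllib.parse

    # The spider's start URL has no query string, so parse_qs returns {}.
    parts = urllib.parse.urlparse('https://tieba.baidu.com/p/2501134746')
    query = urllib.parse.parse_qs(parts.query)
    query.update({'pn': 3})  # page number to request
    rebuilt = parts._replace(query=urllib.parse.urlencode(query))
    print(urllib.parse.urlunparse(rebuilt))
    # -> https://tieba.baidu.com/p/2501134746?pn=3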

TiebaCrawler

crawler.py

Change thread_id in the code and run via scrapy runspider -L INFO crawler.py.
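Alternatively, the spider can be launched from a script with Scrapy's CrawlerProcess rather than editing and rerunning crawler.py by hand. A minimal sketch, assuming crawler.py is importable from the working directory (the run_thread.py name is made up):

    # run_thread.py: drive the spider from a script instead of `scrapy runspider`.
    from scrapy.crawler import CrawlerProcess
    from crawler import BlogSpider

    thread_id = 2501134746  # pick the thread you want to crawl

    # Override the class attributes before the crawl starts.
    BlogSpider.thread_id = thread_id
    BlogSpider.start_urls = ['https://tieba.baidu.com/p/%d' % thread_id]
    BlogSpider.custom_settings = dict(BlogSpider.custom_settings,
                                      FEED_URI='thread_%d.json' % thread_id)

    process = CrawlerProcess()
    process.crawl(BlogSpider)
    process.start()  # blocks until the crawl finishes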

result.ipynb

  1. Change the filebase and the "dump" cell (see the # write your own query here comment); a minimal loading sketch follows this list.
  2. Run all cells.
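For orientation, here is roughly what the loading step of result.ipynb amounts to: reading the jsonlines feed into pandas. The filename follows FEED_URI in crawler.py; the level filter at the end is a made-up example of a query you might write:

    import json
    import pandas as pd

    filebase = 'thread_2501134746'  # matches FEED_URI in crawler.py

    # Each line of the feed is one JSON post record yielded by parse_page().
    with open(filebase + '.json', encoding='utf-8') as f:
        posts = [json.loads(line) for line in f]

    df = pd.DataFrame(posts)
    df['level'] = df['author'].map(lambda a: a['level'])

    # write your own query here -- e.g. posts by authors of level 10 or higher
    print(df[df['level'] >= 10])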