Change `thread_id` in the code and run via `scrapy runspider -L INFO crawler.py`.
Alternatively, change the filebase and the "dump" cell (see the
`# write your own query here` marker), then run all cells.
#!scrapy runspider -L INFO
import json
import logging
import time
import urllib.parse

import html2text
import scrapy
class BlogSpider(scrapy.Spider):
    """Crawl every page of a Baidu Tieba thread and emit one JSON object per post.

    Each yielded item is the post's embedded ``data-field`` JSON enriched with
    the author's badge level, a crawl timestamp, and the thread page number the
    post was found on. Output goes to ``thread_<id>.json`` as JSON lines.
    """

    # Numeric id of the Tieba thread to crawl; edit this before running.
    thread_id = 2501134746
    name = 'blogspider'
    start_urls = ['https://tieba.baidu.com/p/%d' % thread_id]
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'FEED_URI': 'thread_%d.json' % thread_id,
        # Positive DEPTH_PRIORITY + FIFO queues = breadth-first crawl order.
        'DEPTH_PRIORITY': 1,
        # FIX: the queue module has been 'scrapy.squeues' (plural) since
        # Scrapy 1.0; 'scrapy.squeue' fails to import on modern Scrapy.
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'FEED_FORMAT': 'jsonlines',
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    # Shared HTML-to-text converter used to strip markup from scraped cells.
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True

    def parse(self, response):
        """Read the thread's total reply-page count, then request every page."""
        reply_page_num = int(response.css(
            "div.l_thread_info > ul > li.l_reply_num > span:nth-child(2)::text"
        ).extract()[0])
        logging.info("max page number: %d" % reply_page_num)
        url_parts = urllib.parse.urlparse(response.url)
        # Walk backwards so the newest pages are scheduled first.
        for i in range(reply_page_num, 0, -1):
            url_query = urllib.parse.parse_qs(url_parts.query)
            url_query['pn'] = i  # Tieba paginates via the 'pn' query param.
            url_parts_dict = url_parts._asdict()
            url_parts_dict['query'] = urllib.parse.urlencode(url_query)
            request_url = urllib.parse.urlunparse(url_parts_dict.values())
            yield scrapy.Request(request_url, callback=self.parse_page)

    def parse_page(self, response):
        """Extract every post on one thread page as a JSON-serializable dict."""
        if response.status != 200:
            logging.warning("crawled %d '%s'" % (response.status, response.url))
            # FIX: a non-200 page lacks the selectors below — bail out instead
            # of crashing on the int()/strip() chain over a missing element.
            return
        # Current page number, taken from the highlighted pager element.
        page_num = int(self.h.handle(
            response.css('li.l_pager > span.tP').extract_first()).strip())
        posts = response.css('#j_p_postlist > div.l_post')
        for post in posts:
            # Tieba embeds post metadata (author, post id, ...) as JSON in
            # the element's 'data-field' attribute.
            data = json.loads(post.xpath('@data-field').extract_first())
            level = int(self.h.handle(
                post.css('div.d_author li.l_badge .d_badge_lv').extract_first()
            ).strip())
            data['author']['level'] = level
            data['timestamp'] = time.time()
            data['page_num'] = page_num
            yield data