Change `thread_id` in the code and run via `scrapy runspider -L INFO crawler.py`.
Alternatively, change the filebase and the "dump" cell (see the
`# write your own query here` marker), then run all cells.
#!scrapy runspider -L INFO
import json
import logging
import time
import urllib.parse

import html2text
import scrapy
class BlogSpider(scrapy.Spider):
    """Crawl every page of a Baidu Tieba thread and emit one JSON object per post.

    Each yielded item is the post's embedded ``data-field`` JSON enriched with
    the author's badge level, a crawl timestamp, and the thread page number the
    post was found on. Output goes to ``thread_<id>.json`` as JSON lines.
    """

    # Numeric id of the Tieba thread to crawl; edit this before running.
    thread_id = 2501134746
    name = 'blogspider'
    start_urls = ['https://tieba.baidu.com/p/%d' % thread_id]
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'FEED_URI': 'thread_%d.json' % thread_id,
        # Positive DEPTH_PRIORITY + FIFO queues = breadth-first crawl order.
        'DEPTH_PRIORITY': 1,
        # FIX: the queue module has been 'scrapy.squeues' (plural) since
        # Scrapy 1.0; 'scrapy.squeue' fails to import on modern Scrapy.
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'FEED_FORMAT': 'jsonlines',
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    # Shared HTML-to-text converter used to strip markup from scraped cells.
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True

    def parse(self, response):
        """Read the thread's total reply-page count, then request every page."""
        reply_page_num = int(response.css(
            "div.l_thread_info > ul > li.l_reply_num > span:nth-child(2)::text"
        ).extract()[0])
        logging.info("max page number: %d" % reply_page_num)
        url_parts = urllib.parse.urlparse(response.url)
        # Walk backwards so the newest pages are scheduled first.
        for i in range(reply_page_num, 0, -1):
            url_query = urllib.parse.parse_qs(url_parts.query)
            url_query['pn'] = i  # Tieba paginates via the 'pn' query param.
            url_parts_dict = url_parts._asdict()
            url_parts_dict['query'] = urllib.parse.urlencode(url_query)
            request_url = urllib.parse.urlunparse(url_parts_dict.values())
            yield scrapy.Request(request_url, callback=self.parse_page)

    def parse_page(self, response):
        """Extract every post on one thread page as a JSON-serializable dict."""
        if response.status != 200:
            logging.warning("crawled %d '%s'" % (response.status, response.url))
            # FIX: a non-200 page lacks the selectors below — bail out instead
            # of crashing on the int()/strip() chain over a missing element.
            return
        # Current page number, taken from the highlighted pager element.
        page_num = int(self.h.handle(
            response.css('li.l_pager > span.tP').extract_first()).strip())
        posts = response.css('#j_p_postlist > div.l_post')
        for post in posts:
            # Tieba embeds post metadata (author, post id, ...) as JSON in
            # the element's 'data-field' attribute.
            data = json.loads(post.xpath('@data-field').extract_first())
            level = int(self.h.handle(
                post.css('div.d_author li.l_badge .d_badge_lv').extract_first()
            ).strip())
            data['author']['level'] = level
            data['timestamp'] = time.time()
            data['page_num'] = page_num
            yield data