@clouds56
Last active June 18, 2017 07:03
tieba_crawler
#!scrapy runspider -L INFO
import scrapy
import logging
import html2text
import urllib.parse
import json
import time


class BlogSpider(scrapy.Spider):
    thread_id = 2501134746
    name = 'blogspider'
    start_urls = ['https://tieba.baidu.com/p/%d' % thread_id]
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'FEED_URI': 'thread_%d.json' % thread_id,
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'FEED_FORMAT': 'jsonlines',
        'FEED_EXPORT_ENCODING': 'utf-8',
    }
    # html2text converter used to strip markup from extracted snippets
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True

    def parse(self, response):
        # First page of the thread: read the total number of reply pages,
        # then request every page explicitly via the 'pn' query parameter.
        reply_page_num = int(response.css("div.l_thread_info > ul > li.l_reply_num > span:nth-child(2)::text").extract()[0])
        logging.info("max page number: %d" % reply_page_num)
        url_parts = urllib.parse.urlparse(response.url)
        for i in range(reply_page_num, 0, -1):
            url_query = urllib.parse.parse_qs(url_parts.query)
            url_query.update({'pn': i})
            url_parts_dict = url_parts._asdict()
            url_parts_dict['query'] = urllib.parse.urlencode(url_query)
            request_url = urllib.parse.urlunparse(url_parts_dict.values())
            yield scrapy.Request(request_url, callback=self.parse_page)

    def parse_page(self, response):
        if response.status != 200:
            logging.warning("crawled %d '%s'" % (response.status, response.url))
        # Current page number, taken from the highlighted entry of the pager.
        page_num = int(self.h.handle(response.css('li.l_pager > span.tP').extract_first()).strip())
        posts = response.css('#j_p_postlist > div.l_post')
        for post in posts:
            # Each post carries its metadata as JSON in its data-field attribute.
            data = json.loads(post.xpath('@data-field').extract_first())
            # author = self.h.handle(post.css('div.d_author li.d_name .p_author_name').extract_first()).strip()
            level = int(self.h.handle(post.css('div.d_author li.l_badge .d_badge_lv').extract_first()).strip())
            # order = re.search(r"(\d+)" + "\u697c", self.h.handle(post.css('div.d_post_content_main div.core_reply'))).group(1)
            # lzl = posts.css('div.core_reply_wrapper li.lzl_single_post')
            data['author']['level'] = level
            data['timestamp'] = time.time()
            data['page_num'] = page_num
            yield data
result.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"import html2text\n",
"\n",
"hh = html2text.HTML2Text()\n",
"hh.ignore_images = True\n",
"hh.ignore_links = True"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"filebase = \"thread_2501134746\"\n",
"url_template = \"https://tieba.baidu.com/p/{0}?pid={1}&cid=0#{1}\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open(\"%s.json\"%filebase, encoding='utf-8') as f:\n",
" posts = [json.loads(ln) for ln in f]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def fetch_key(content, key):\n",
" content = hh.handle(content)\n",
" return [line.strip() for line in content.splitlines() if key(line)]\n",
"\n",
"def unique_by(l, key=None):\n",
" a = set()\n",
" for x in l:\n",
" k = x\n",
" if key is not None:\n",
" k = key(k)\n",
" if not k in a:\n",
" a.add(k)\n",
" yield x\n",
"\n",
"def get_all(posts, key):\n",
" all_posts = [(i['content']['post_no'],\n",
" i['author']['user_name'],\n",
" fetch_key(i['content']['content'], key),\n",
" url_template.format(i['content']['thread_id'], i['content']['post_id']))\n",
" for i in posts if key(i['content']['content'])]\n",
" all_posts.sort(reverse=True)\n",
" return list(unique_by(all_posts, key=lambda x:x[1]))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def key_func(keys):\n",
" def func(content):\n",
" value = 0\n",
" for i, v in keys.items():\n",
" if i in content:\n",
" value += v\n",
" return value > 0\n",
" return func\n",
"\n",
"def dump(posts, keys, filename):\n",
" if isinstance(keys, str):\n",
" key = lambda x: keys in x\n",
" elif isinstance(keys, dict):\n",
" key = key_func(keys)\n",
" else:\n",
" key = key_func({i:1 for i in keys})\n",
" result = get_all(posts, key)\n",
" with open(filename, \"w\", encoding=\"utf8\") as fout:\n",
" for x in result:\n",
" print(*x, file=fout)\n",
" return len(result)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"24"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dump(posts, {'text':1}, \"%s_text.txt\"%filebase)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"28"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dump(posts, {'a':1, 'b':1}, \"%s_ab.txt\"%filebase)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"19"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dump(posts, {'text':1, 'nota':-100, 'notb':-100}, \"%s_text-ab.txt\"%filebase)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"17"
]
},
"execution_count": 0,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dump(posts, {'t1':1, 't2':1, 'a':-100, 'b':-100}, \"%s_t12-ab.txt\"%filebase)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

TiebaCrawler

crawler.py

change thread_id in the code and run via scrapy runspider -L INFO crawler.py
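Once the crawl finishes, the feed can be checked quickly from Python. This is a minimal sketch, assuming the default thread_id above (2501134746), so the output file is thread_2501134746.json as set by FEED_URI, one JSON object per line (jsonlines):

import json

# load every crawled post record from the jsonlines feed
with open("thread_2501134746.json", encoding="utf-8") as f:
    posts = [json.loads(line) for line in f]

print(len(posts), "posts crawled")
# page_num is added by parse_page to every record
print("pages fetched:", sorted({p["page_num"] for p in posts}))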

result.ipynb

  1. change the filebase and the dump(...) cells to write your own queries (see the example below)
  2. run all cells
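For reference, dump accepts a dict mapping a substring to a weight; a post is written out when the summed weight of the substrings found in its content is positive, so a large negative weight excludes posts containing that word. A hypothetical query along these lines (the keywords are placeholders):

# keep posts mentioning 'keyword' unless they also mention 'spam'
dump(posts, {'keyword': 1, 'spam': -100}, "%s_keyword.txt" % filebase)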