|
{ |
|
"cells": [ |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 1, |
|
"metadata": { |
|
"collapsed": true |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"import json\n", |
|
"import html2text\n", |
|
"\n", |
|
"# Shared converter used by fetch_key; configured to ignore images and links.\n", |
|
"hh = html2text.HTML2Text()\n", |
|
"hh.ignore_images = True\n", |
|
"hh.ignore_links = True" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 2, |
|
"metadata": { |
|
"collapsed": true |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"# Base name of the input JSON file; also used as the prefix of the output files.\n", |
|
"filebase = \"thread_2501134746\"\n", |
|
"# Post URL pattern: get_all fills {0} with the thread id and {1} with the post id.\n", |
|
"url_template = \"https://tieba.baidu.com/p/{0}?pid={1}&cid=0#{1}\"" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 3, |
|
"metadata": { |
|
"collapsed": true |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"# Parse every line of the input file as a separate JSON document.\n", |
|
"with open(\"%s.json\"%filebase, encoding='utf-8') as f:\n", |
|
" posts = [json.loads(ln) for ln in f]" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 4, |
|
"metadata": { |
|
"collapsed": true |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"def fetch_key(content, key):\n", |
|
"    \"\"\"Return the stripped lines of the converted content that key accepts.\n", |
|
"\n", |
|
"    content is first passed through the module-level converter hh; every\n", |
|
"    line of the result is tested with key, and accepted lines are returned\n", |
|
"    with surrounding whitespace removed.\n", |
|
"    \"\"\"\n", |
|
"    text = hh.handle(content)\n", |
|
"    kept = []\n", |
|
"    for raw_line in text.splitlines():\n", |
|
"        # key sees the unstripped line; only the stored copy is stripped.\n", |
|
"        if key(raw_line):\n", |
|
"            kept.append(raw_line.strip())\n", |
|
"    return kept\n", |
|
"\n", |
|
"def unique_by(l, key=None):\n", |
|
"    \"\"\"Yield the items of l in order, skipping any whose key was already seen.\n", |
|
"\n", |
|
"    When key is None the item itself is the key; otherwise key(item) is.\n", |
|
"    Keys are stored in a set, so they must be hashable.\n", |
|
"    \"\"\"\n", |
|
"    seen = set()\n", |
|
"    for item in l:\n", |
|
"        marker = item if key is None else key(item)\n", |
|
"        if marker in seen:\n", |
|
"            continue\n", |
|
"        seen.add(marker)\n", |
|
"        yield item\n", |
|
"\n", |
|
"def get_all(posts, key):\n", |
|
"    \"\"\"Collect, per author, the matching post that sorts first in descending order.\n", |
|
"\n", |
|
"    A post is selected when key accepts its raw content field. Each selected\n", |
|
"    post becomes a tuple (post_no, user_name, matching lines, url). The\n", |
|
"    tuples are sorted in descending order and only the first tuple for each\n", |
|
"    user_name is kept.\n", |
|
"    \"\"\"\n", |
|
"    rows = []\n", |
|
"    for post in posts:\n", |
|
"        body = post['content']\n", |
|
"        if not key(body['content']):\n", |
|
"            continue\n", |
|
"        rows.append((\n", |
|
"            body['post_no'],\n", |
|
"            post['author']['user_name'],\n", |
|
"            fetch_key(body['content'], key),\n", |
|
"            url_template.format(body['thread_id'], body['post_id']),\n", |
|
"        ))\n", |
|
"    # Descending tuple order puts the highest post_no first, so the\n", |
|
"    # de-duplication below keeps each author's highest-numbered post.\n", |
|
"    ranked = sorted(rows, reverse=True)\n", |
|
"    return list(unique_by(ranked, key=lambda row: row[1]))" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 5, |
|
"metadata": { |
|
"collapsed": true |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"def key_func(keys):\n", |
|
"    \"\"\"Build a predicate from a mapping of keyword to weight.\n", |
|
"\n", |
|
"    The returned function adds up the weights of every keyword that occurs\n", |
|
"    in its argument and reports whether that total is greater than zero.\n", |
|
"    \"\"\"\n", |
|
"    def func(content):\n", |
|
"        score = sum(weight for word, weight in keys.items() if word in content)\n", |
|
"        return score > 0\n", |
|
"    return func\n", |
|
"\n", |
|
"def dump(posts, keys, filename):\n", |
|
"    \"\"\"Write the posts selected by keys to filename and return how many were written.\n", |
|
"\n", |
|
"    keys may be a single string (substring match), a dict of keyword to\n", |
|
"    weight (see key_func), or any other iterable of keywords, each of which\n", |
|
"    is given weight 1.\n", |
|
"    \"\"\"\n", |
|
"    if isinstance(keys, str):\n", |
|
"        def key(x):\n", |
|
"            return keys in x\n", |
|
"    else:\n", |
|
"        weights = keys if isinstance(keys, dict) else {i:1 for i in keys}\n", |
|
"        key = key_func(weights)\n", |
|
"    result = get_all(posts, key)\n", |
|
"    with open(filename, \"w\", encoding=\"utf8\") as fout:\n", |
|
"        for row in result:\n", |
|
"            # One print() call per post; the tuple fields are space-separated.\n", |
|
"            print(*row, file=fout)\n", |
|
"    return len(result)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 6, |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"text/plain": [ |
|
"24" |
|
] |
|
}, |
|
"execution_count": 6, |
|
"metadata": {}, |
|
"output_type": "execute_result" |
|
} |
|
], |
|
"source": [ |
|
"# Select posts whose content contains 'text'.\n", |
|
"dump(posts, {'text':1}, \"%s_text.txt\"%filebase)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 7, |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"text/plain": [ |
|
"28" |
|
] |
|
}, |
|
"execution_count": 7, |
|
"metadata": {}, |
|
"output_type": "execute_result" |
|
} |
|
], |
|
"source": [ |
|
"# Select posts whose content contains 'a' or 'b'.\n", |
|
"dump(posts, {'a':1, 'b':1}, \"%s_ab.txt\"%filebase)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 8, |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"text/plain": [ |
|
"19" |
|
] |
|
}, |
|
"execution_count": 8, |
|
"metadata": {}, |
|
"output_type": "execute_result" |
|
} |
|
], |
|
"source": [ |
|
"# Select posts containing 'text'; the -100 weights reject posts that also contain 'nota' or 'notb'.\n", |
|
"dump(posts, {'text':1, 'nota':-100, 'notb':-100}, \"%s_text-ab.txt\"%filebase)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": 9, |
|
"metadata": {}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"text/plain": [ |
|
"17" |
|
] |
|
}, |
|
"execution_count": 9, |
|
"metadata": {}, |
|
"output_type": "execute_result" |
|
} |
|
], |
|
"source": [ |
|
"# Select posts containing 't1' or 't2'; the -100 weights reject posts that also contain 'a' or 'b'.\n", |
|
"dump(posts, {'t1':1, 't2':1, 'a':-100, 'b':-100}, \"%s_t12-ab.txt\"%filebase)" |
|
] |
|
} |
|
], |
|
"metadata": { |
|
"kernelspec": { |
|
"display_name": "Python 3", |
|
"language": "python", |
|
"name": "python3" |
|
}, |
|
"language_info": { |
|
"codemirror_mode": { |
|
"name": "ipython", |
|
"version": 3 |
|
}, |
|
"file_extension": ".py", |
|
"mimetype": "text/x-python", |
|
"name": "python", |
|
"nbconvert_exporter": "python", |
|
"pygments_lexer": "ipython3", |
|
"version": "3.6.1" |
|
} |
|
}, |
|
"nbformat": 4, |
|
"nbformat_minor": 2 |
|
} |