Last active
June 30, 2020 09:09
-
-
Save 40sjg34si/78d52127986116b046cd75e4b9c0d81f to your computer and use it in GitHub Desktop.
Playaround/B站合集视频弹幕分布.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# B站视频弹幕分布(剧、视频合集)" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## import必需的包" | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-30T09:05:39.743879Z", | |
"end_time": "2020-06-30T09:05:39.876798Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import requests\nimport aiohttp\nimport asyncio\nimport nest_asyncio\nfrom tqdm import tqdm\nimport math\n\nimport re\nimport nest_asyncio\nfrom collections import Counter\nfrom scipy.signal import find_peaks\n\nnest_asyncio.apply()", | |
"execution_count": 59, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## 根据用户hex值,每个用户所发弹幕按如下公式算分数,避免个别水怪刷弹幕" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "$W = \\log(C) + 1$" | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-29T21:25:48.670733Z", | |
"end_time": "2020-06-29T21:25:48.677725Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": [ | |
"def normalize(x):\n", | |
"    \"\"\"Weight of a user who sent `x` danmaku: W = ln(x) + 1.\n", | |
"\n", | |
"    The logarithm damps heavy posters so that a single spammer cannot\n", | |
"    dominate a part's score. `x` must be positive; math.log raises\n", | |
"    ValueError otherwise.\n", | |
"    \"\"\"\n", | |
"    weight = 1 + math.log(x)\n", | |
"    return weight" | |
], | |
"execution_count": 47, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## 异步请求函数" | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-29T21:25:52.630113Z", | |
"end_time": "2020-06-29T21:25:52.709062Z" | |
}, | |
"scrolled": false, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": [ | |
"async def fetch(client, url):\n", | |
"    \"\"\"Download one danmaku segment and return its weighted score.\n", | |
"\n", | |
"    Args:\n", | |
"        client: an open aiohttp.ClientSession.\n", | |
"        url: URL of the segment to download.\n", | |
"\n", | |
"    Returns:\n", | |
"        The sum of normalize(count) over every sender token found in the\n", | |
"        body, or 0 when none is found.\n", | |
"\n", | |
"    Raises:\n", | |
"        aiohttp.ClientResponseError: the server answered with a 4xx/5xx status.\n", | |
"    \"\"\"\n", | |
"    async with client.get(url) as resp:\n", | |
"        # Fail loudly on HTTP errors: an error page contains no sender tokens,\n", | |
"        # so it would otherwise score 0 and be mistaken for the end of the video.\n", | |
"        resp.raise_for_status()\n", | |
"        data = await resp.read()\n", | |
"\n", | |
"    # Every 8-character [a-z0-9] run followed by ':' in the textual form of\n", | |
"    # the raw bytes is counted as one danmaku from that sender hash.\n", | |
"    pattern = r'[a-z0-9]{8}:'\n", | |
"    per_user = Counter(re.findall(pattern, str(data)))\n", | |
"    return sum(map(normalize, per_user.values()))\n", | |
"\n", | |
"\n", | |
"async def main(oid, max_retries=5):\n", | |
"    \"\"\"Score every danmaku segment of one video part.\n", | |
"\n", | |
"    Segments are requested in order, starting at segment_index=1, until one\n", | |
"    of them scores 0; that segment is taken as the end of the video.\n", | |
"\n", | |
"    Args:\n", | |
"        oid: the part's cid, used as the `oid` query parameter.\n", | |
"        max_retries: extra attempts per segment after a network failure.\n", | |
"\n", | |
"    Returns:\n", | |
"        List of per-segment scores, including the terminating 0.\n", | |
"\n", | |
"    Raises:\n", | |
"        aiohttp.ClientError, asyncio.TimeoutError: a segment still fails\n", | |
"            after max_retries retries. Any other exception propagates at\n", | |
"            once instead of being retried silently forever.\n", | |
"    \"\"\"\n", | |
"    attempts = max(int(max_retries), 0) + 1\n", | |
"    results = []\n", | |
"\n", | |
"    async with aiohttp.ClientSession() as client:\n", | |
"        segment_index = 1\n", | |
"        while True:\n", | |
"            url = f'https://api.bilibili.com/x/v2/dm/web/seg.so?type=1&oid={oid}&segment_index={segment_index}'\n", | |
"\n", | |
"            # Retry only the failing segment so finished segments are kept.\n", | |
"            for attempt in range(attempts):\n", | |
"                try:\n", | |
"                    score = await fetch(client=client, url=url)\n", | |
"                    break\n", | |
"                except (aiohttp.ClientError, asyncio.TimeoutError):\n", | |
"                    if attempt == attempts - 1:\n", | |
"                        raise\n", | |
"                    # Exponential back-off (capped) instead of a hot retry loop.\n", | |
"                    await asyncio.sleep(min(2 ** attempt, 30))\n", | |
"\n", | |
"            results.append(score)\n", | |
"            if not score:\n", | |
"                return results\n", | |
"            segment_index += 1" | |
], | |
"execution_count": 48, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## 普通视频合集和番剧的url不一样,用对应的url和参数" | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-29T21:25:57.550685Z", | |
"end_time": "2020-06-29T21:25:57.635634Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": [ | |
"def _fetch_json(url, params):\n", | |
"    \"\"\"GET a Bilibili JSON endpoint and return the decoded payload.\n", | |
"\n", | |
"    Raises requests.HTTPError on a 4xx/5xx status and RuntimeError when the\n", | |
"    payload carries a non-zero `code` field.\n", | |
"    \"\"\"\n", | |
"    # A timeout keeps a stalled connection from hanging the notebook forever.\n", | |
"    r = requests.get(url, params=params, timeout=10)\n", | |
"    r.raise_for_status()\n", | |
"    payload = r.json()\n", | |
"\n", | |
"    # NOTE(review): assumes a non-zero `code` means failure, as in Bilibili's\n", | |
"    # web API; a payload without `code` is accepted unchanged.\n", | |
"    code = payload.get('code', 0)\n", | |
"    if code != 0:\n", | |
"        message = payload.get('message')\n", | |
"        raise RuntimeError(f'{url} returned code {code}: {message}')\n", | |
"    return payload\n", | |
"\n", | |
"\n", | |
"def _score_parts(parts, make_key):\n", | |
"    \"\"\"Run main() for every part and map make_key(part) to its total score.\"\"\"\n", | |
"    loop = asyncio.get_event_loop()\n", | |
"    danmaku_count_dict = {}\n", | |
"    for part in tqdm(parts):\n", | |
"        results = loop.run_until_complete(main(part['cid']))\n", | |
"        danmaku_count_dict[make_key(part)] = sum(results)\n", | |
"    return danmaku_count_dict\n", | |
"\n", | |
"\n", | |
"def get_danmaku_bv(bv_id):\n", | |
"    \"\"\"Score every part of a multi-part video.\n", | |
"\n", | |
"    Args:\n", | |
"        bv_id: the video's BV id, sent as the `bvid` query parameter.\n", | |
"\n", | |
"    Returns:\n", | |
"        Dict mapping (page, part) to the summed danmaku score of that part.\n", | |
"    \"\"\"\n", | |
"    page_lists = _fetch_json('https://api.bilibili.com/x/player/pagelist',\n", | |
"                             {'bvid': bv_id, 'jsonp': 'jsonp'})\n", | |
"    return _score_parts(page_lists['data'],\n", | |
"                        lambda part: (part['page'], part['part']))\n", | |
"\n", | |
"\n", | |
"def get_danmaku_season(season_id):\n", | |
"    \"\"\"Score every main-section episode of a season.\n", | |
"\n", | |
"    Args:\n", | |
"        season_id: the season id, sent as the `season_id` query parameter.\n", | |
"\n", | |
"    Returns:\n", | |
"        Dict mapping (title, long_title) to the summed danmaku score of\n", | |
"        that episode.\n", | |
"    \"\"\"\n", | |
"    page_lists = _fetch_json('https://api.bilibili.com/pgc/web/season/section',\n", | |
"                             {'season_id': season_id})\n", | |
"    return _score_parts(page_lists['result']['main_section']['episodes'],\n", | |
"                        lambda part: (part['title'], part['long_title']))" | |
], | |
"execution_count": 49, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## pretty print结果" | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-29T21:26:03.414288Z", | |
"end_time": "2020-06-29T21:26:03.449266Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": [ | |
"def save_results(file_path, results, length=40):\n", | |
"    \"\"\"Write a text bar chart of the scores, one line per part.\n", | |
"\n", | |
"    Each line holds a right-aligned bar of '=' ending in the integer score,\n", | |
"    a '*' marker when the part is a local peak, then the page and title.\n", | |
"\n", | |
"    Args:\n", | |
"        file_path: output path; the file is overwritten.\n", | |
"        results: dict mapping (page, title) to a numeric score, in the order\n", | |
"            the lines should be written.\n", | |
"        length: width of the bar column; the highest score fills it.\n", | |
"    \"\"\"\n", | |
"    scores = list(results.values())\n", | |
"    # default=0 keeps an empty result set from raising ValueError.\n", | |
"    max_score = max(scores, default=0)\n", | |
"\n", | |
"    # find_peaks returns an ndarray; a set gives O(1) membership tests below.\n", | |
"    peak_indices = set(find_peaks(scores)[0].tolist()) if scores else set()\n", | |
"\n", | |
"    with open(file_path, 'w', encoding='utf-8') as f:\n", | |
"        for index, ((page, title), score) in enumerate(results.items()):\n", | |
"            label = str(int(score))\n", | |
"            # When every score is 0 there is nothing to scale against.\n", | |
"            filled = int(score * length / max_score) if max_score else 0\n", | |
"            # The score label takes the place of the last '=' characters.\n", | |
"            bar = '=' * max(filled - len(label), 0) + label\n", | |
"\n", | |
"            is_peak = '*' if index in peak_indices else ' '\n", | |
"            f.write(f'{bar:>{length}} {is_peak} P{page} {title}\\n')" | |
], | |
"execution_count": 50, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## 例子" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Running Man 2010 ~ 2020" | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-29T21:26:07.847283Z", | |
"end_time": "2020-06-29T21:26:07.853283Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "bv_id = 'BV1SJ411F7y6'", | |
"execution_count": 51, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-29T19:09:53.423984Z", | |
"end_time": "2020-06-29T19:57:16.721613Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "results = get_danmaku_bv(bv_id)", | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "100%|████████████████████████████████████████| 304/304 [47:22<00:00, 9.35s/it]\n", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-30T06:46:34.146217Z", | |
"end_time": "2020-06-30T06:46:34.167201Z" | |
}, | |
"scrolled": true, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "save_results(file_path='runningman.txt', results=results)", | |
"execution_count": 58, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### 名侦探柯南" | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-29T21:26:59.863085Z", | |
"end_time": "2020-06-29T21:26:59.868082Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "season_id = 33378", | |
"execution_count": 54, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-29T21:27:04.067984Z", | |
"end_time": "2020-06-29T22:19:01.677878Z" | |
}, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "conan_results = get_danmaku_season(season_id)", | |
"execution_count": 55, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "100%|████████████████████████████████████████| 979/979 [51:53<00:00, 3.18s/it]\n", | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2020-06-30T06:44:47.924822Z", | |
"end_time": "2020-06-30T06:44:47.972793Z" | |
}, | |
"scrolled": true, | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "save_results(file_path='conan.txt', results=conan_results)", | |
"execution_count": 57, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"_draft": { | |
"nbviewer_url": "https://gist.github.com/78d52127986116b046cd75e4b9c0d81f" | |
}, | |
"gist": { | |
"id": "78d52127986116b046cd75e4b9c0d81f", | |
"data": { | |
"description": "Playaround/B站合集视频弹幕分布.ipynb", | |
"public": true | |
} | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.6.8", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"varInspector": { | |
"window_display": false, | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"library": "var_list.py", | |
"delete_cmd_prefix": "del ", | |
"delete_cmd_postfix": "", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"library": "var_list.r", | |
"delete_cmd_prefix": "rm(", | |
"delete_cmd_postfix": ") ", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
] | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment