40sjg34si/B站合集视频弹幕分布.ipynb

## B站合集视频弹幕分布.ipynb
{
  "cells": [
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "# B站视频弹幕分布（剧、视频合集）"
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "## import必需的包"
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-30T09:05:39.743879Z",
          "end_time": "2020-06-30T09:05:39.876798Z"
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "import requests\nimport aiohttp\nimport asyncio\nimport nest_asyncio\nfrom tqdm import tqdm\nimport math\n\nimport re\nimport nest_asyncio\nfrom collections import Counter\nfrom scipy.signal import find_peaks\n\nnest_asyncio.apply()",
      "execution_count": 59,
      "outputs": []
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "## 根据用户hex值，每个用户所发弹幕按如下公式算分数，避免个别水怪刷弹幕"
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "$W=log(C) + 1$"
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-29T21:25:48.670733Z",
          "end_time": "2020-06-29T21:25:48.677725Z"
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "def normalize(x):\n    return math.log(x) + 1",
      "execution_count": 47,
      "outputs": []
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "## 异步请求函数"
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-29T21:25:52.630113Z",
          "end_time": "2020-06-29T21:25:52.709062Z"
        },
        "scrolled": false,
        "trusted": true
      },
      "cell_type": "code",
      "source": "async def fetch(client, url):\n    async with client.get(url) as resp:\n        data = await resp.read()\n        pattern = r'[a-z0-9]{8}:'\n        a = re.findall(pattern, str(data))\n        p = Counter(a)\n\n        return sum(map(normalize, p.values()))\n\n\nasync def main(oid):\n    semaphore = asyncio.Semaphore(200)\n\n    async with semaphore:\n\n        async with aiohttp.ClientSession() as client:\n            while True:\n                try:\n                    results = []\n                    segment_index = response = 1\n                    while response:\n                        url = f'https://api.bilibili.com/x/v2/dm/web/seg.so?type=1&oid={oid}&segment_index={segment_index}'\n                        response = await fetch(client=client, url=url)\n\n                        results.append(response)\n                        segment_index += 1\n\n                    return results\n                except Exception as e:\n                    continue",
      "execution_count": 48,
      "outputs": []
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "## 普通视频合集和番剧的url不一样，用对应的url和参数"
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-29T21:25:57.550685Z",
          "end_time": "2020-06-29T21:25:57.635634Z"
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "def get_danmaku_bv(bv_id):\n    page_urls = f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp'\n\n    r = requests.get(page_urls)\n    page_lists = r.json()\n\n    danmaku_count_dict = {}\n    for part in tqdm(page_lists['data']):\n        oid = part['cid']\n        loop = asyncio.get_event_loop()\n        results = loop.run_until_complete(main(oid))\n        count = sum(results)\n        danmaku_count_dict[(part['page'], part['part'])] = count\n\n    return danmaku_count_dict\n\n\ndef get_danmaku_season(season_id):\n\n    page_urls = f'https://api.bilibili.com/pgc/web/season/section?season_id={season_id}'\n    r = requests.get(page_urls)\n    page_lists = r.json()\n\n    danmaku_count_dict = {}\n    for part in tqdm(page_lists['result']['main_section']['episodes']):\n        oid = part['cid']\n        loop = asyncio.get_event_loop()\n        results = loop.run_until_complete(main(oid))\n        count = sum(results)\n        danmaku_count_dict[(part['title'], part['long_title'])] = count\n\n    return danmaku_count_dict",
      "execution_count": 49,
      "outputs": []
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "## pretty print结果"
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-29T21:26:03.414288Z",
          "end_time": "2020-06-29T21:26:03.449266Z"
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "def save_results(file_path, results, length=40):\n    max_score = max(results.values())\n\n    peaks, _ = find_peaks(list(results.values()))\n\n    with open(file_path, 'w+', encoding='utf-8') as f:\n        for index, ((page, title), score) in enumerate(results.items()):\n\n            score_len = len(str(int(score)))\n            bar = '=' * max(int(score * length / max_score) - score_len,\n                            0) + str(int(score))\n\n            is_peak = '*' if index in peaks else ' '\n            page = f'P{page}'\n            f.write(f'{bar:>{length}} {is_peak} {page} {title}\\n')",
      "execution_count": 50,
      "outputs": []
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "## 例子"
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "### Running Man 2010 ~ 2020"
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-29T21:26:07.847283Z",
          "end_time": "2020-06-29T21:26:07.853283Z"
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "bv_id = 'BV1SJ411F7y6'",
      "execution_count": 51,
      "outputs": []
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-29T19:09:53.423984Z",
          "end_time": "2020-06-29T19:57:16.721613Z"
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "results = get_danmaku_bv(bv_id)",
      "execution_count": 34,
      "outputs": [
        {
          "output_type": "stream",
          "text": "100%|████████████████████████████████████████| 304/304 [47:22<00:00,  9.35s/it]\n",
          "name": "stderr"
        }
      ]
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-30T06:46:34.146217Z",
          "end_time": "2020-06-30T06:46:34.167201Z"
        },
        "scrolled": true,
        "trusted": true
      },
      "cell_type": "code",
      "source": "save_results(file_path='runningman.txt', results=results)",
      "execution_count": 58,
      "outputs": []
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "### 名侦探柯南"
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-29T21:26:59.863085Z",
          "end_time": "2020-06-29T21:26:59.868082Z"
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "season_id = 33378",
      "execution_count": 54,
      "outputs": []
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-29T21:27:04.067984Z",
          "end_time": "2020-06-29T22:19:01.677878Z"
        },
        "trusted": true
      },
      "cell_type": "code",
      "source": "conan_results = get_danmaku_season(season_id)",
      "execution_count": 55,
      "outputs": [
        {
          "output_type": "stream",
          "text": "100%|████████████████████████████████████████| 979/979 [51:53<00:00,  3.18s/it]\n",
          "name": "stderr"
        }
      ]
    },
    {
      "metadata": {
        "ExecuteTime": {
          "start_time": "2020-06-30T06:44:47.924822Z",
          "end_time": "2020-06-30T06:44:47.972793Z"
        },
        "scrolled": true,
        "trusted": true
      },
      "cell_type": "code",
      "source": "save_results(file_path='conan.txt', results=conan_results)",
      "execution_count": 57,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "_draft": {
      "nbviewer_url": "https://gist.github.com/78d52127986116b046cd75e4b9c0d81f"
    },
    "gist": {
      "id": "78d52127986116b046cd75e4b9c0d81f",
      "data": {
        "description": "Playaround/B站合集视频弹幕分布.ipynb",
        "public": true
      }
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3",
      "language": "python"
    },
    "language_info": {
      "name": "python",
      "version": "3.6.8",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "varInspector": {
      "window_display": false,
      "cols": {
        "lenName": 16,
        "lenType": 16,
        "lenVar": 40
      },
      "kernels_config": {
        "python": {
          "library": "var_list.py",
          "delete_cmd_prefix": "del ",
          "delete_cmd_postfix": "",
          "varRefreshCmd": "print(var_dic_list())"
        },
        "r": {
          "library": "var_list.r",
          "delete_cmd_prefix": "rm(",
          "delete_cmd_postfix": ") ",
          "varRefreshCmd": "cat(var_dic_list()) "
        }
      },
      "types_to_exclude": [
        "module",
        "function",
        "builtin_function_or_method",
        "instance",
        "_Feature"
      ]
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
	{
	"cells": [
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "# B站视频弹幕分布（剧、视频合集）"
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "## import必需的包"
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-30T09:05:39.743879Z",
	"end_time": "2020-06-30T09:05:39.876798Z"
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "import requests\nimport aiohttp\nimport asyncio\nimport nest_asyncio\nfrom tqdm import tqdm\nimport math\n\nimport re\nimport nest_asyncio\nfrom collections import Counter\nfrom scipy.signal import find_peaks\n\nnest_asyncio.apply()",
	"execution_count": 59,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "## 根据用户hex值，每个用户所发弹幕按如下公式算分数，避免个别水怪刷弹幕"
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "$W=log(C) + 1$"
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-29T21:25:48.670733Z",
	"end_time": "2020-06-29T21:25:48.677725Z"
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "def normalize(x):\n return math.log(x) + 1",
	"execution_count": 47,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "## 异步请求函数"
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-29T21:25:52.630113Z",
	"end_time": "2020-06-29T21:25:52.709062Z"
	},
	"scrolled": false,
	"trusted": true
	},
	"cell_type": "code",
	"source": "async def fetch(client, url):\n async with client.get(url) as resp:\n data = await resp.read()\n pattern = r'[a-z0-9]{8}:'\n a = re.findall(pattern, str(data))\n p = Counter(a)\n\n return sum(map(normalize, p.values()))\n\n\nasync def main(oid):\n semaphore = asyncio.Semaphore(200)\n\n async with semaphore:\n\n async with aiohttp.ClientSession() as client:\n while True:\n try:\n results = []\n segment_index = response = 1\n while response:\n url = f'https://api.bilibili.com/x/v2/dm/web/seg.so?type=1&oid={oid}&segment_index={segment_index}'\n response = await fetch(client=client, url=url)\n\n results.append(response)\n segment_index += 1\n\n return results\n except Exception as e:\n continue",
	"execution_count": 48,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "## 普通视频合集和番剧的url不一样，用对应的url和参数"
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-29T21:25:57.550685Z",
	"end_time": "2020-06-29T21:25:57.635634Z"
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "def get_danmaku_bv(bv_id):\n page_urls = f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp'\n\n r = requests.get(page_urls)\n page_lists = r.json()\n\n danmaku_count_dict = {}\n for part in tqdm(page_lists['data']):\n oid = part['cid']\n loop = asyncio.get_event_loop()\n results = loop.run_until_complete(main(oid))\n count = sum(results)\n danmaku_count_dict[(part['page'], part['part'])] = count\n\n return danmaku_count_dict\n\n\ndef get_danmaku_season(season_id):\n\n page_urls = f'https://api.bilibili.com/pgc/web/season/section?season_id={season_id}'\n r = requests.get(page_urls)\n page_lists = r.json()\n\n danmaku_count_dict = {}\n for part in tqdm(page_lists['result']['main_section']['episodes']):\n oid = part['cid']\n loop = asyncio.get_event_loop()\n results = loop.run_until_complete(main(oid))\n count = sum(results)\n danmaku_count_dict[(part['title'], part['long_title'])] = count\n\n return danmaku_count_dict",
	"execution_count": 49,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "## pretty print结果"
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-29T21:26:03.414288Z",
	"end_time": "2020-06-29T21:26:03.449266Z"
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "def save_results(file_path, results, length=40):\n max_score = max(results.values())\n\n peaks, _ = find_peaks(list(results.values()))\n\n with open(file_path, 'w+', encoding='utf-8') as f:\n for index, ((page, title), score) in enumerate(results.items()):\n\n score_len = len(str(int(score)))\n bar = '=' * max(int(score * length / max_score) - score_len,\n 0) + str(int(score))\n\n is_peak = '*' if index in peaks else ' '\n page = f'P{page}'\n f.write(f'{bar:>{length}} {is_peak} {page} {title}\\n')",
	"execution_count": 50,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "## 例子"
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "### Running Man 2010 ~ 2020"
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-29T21:26:07.847283Z",
	"end_time": "2020-06-29T21:26:07.853283Z"
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "bv_id = 'BV1SJ411F7y6'",
	"execution_count": 51,
	"outputs": []
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-29T19:09:53.423984Z",
	"end_time": "2020-06-29T19:57:16.721613Z"
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "results = get_danmaku_bv(bv_id)",
	"execution_count": 34,
	"outputs": [
	{
	"output_type": "stream",
	"text": "100%\|████████████████████████████████████████\| 304/304 [47:22<00:00, 9.35s/it]\n",
	"name": "stderr"
	}
	]
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-30T06:46:34.146217Z",
	"end_time": "2020-06-30T06:46:34.167201Z"
	},
	"scrolled": true,
	"trusted": true
	},
	"cell_type": "code",
	"source": "save_results(file_path='runningman.txt', results=results)",
	"execution_count": 58,
	"outputs": []
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "### 名侦探柯南"
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-29T21:26:59.863085Z",
	"end_time": "2020-06-29T21:26:59.868082Z"
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "season_id = 33378",
	"execution_count": 54,
	"outputs": []
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-29T21:27:04.067984Z",
	"end_time": "2020-06-29T22:19:01.677878Z"
	},
	"trusted": true
	},
	"cell_type": "code",
	"source": "conan_results = get_danmaku_season(season_id)",
	"execution_count": 55,
	"outputs": [
	{
	"output_type": "stream",
	"text": "100%\|████████████████████████████████████████\| 979/979 [51:53<00:00, 3.18s/it]\n",
	"name": "stderr"
	}
	]
	},
	{
	"metadata": {
	"ExecuteTime": {
	"start_time": "2020-06-30T06:44:47.924822Z",
	"end_time": "2020-06-30T06:44:47.972793Z"
	},
	"scrolled": true,
	"trusted": true
	},
	"cell_type": "code",
	"source": "save_results(file_path='conan.txt', results=conan_results)",
	"execution_count": 57,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"_draft": {
	"nbviewer_url": "https://gist.github.com/78d52127986116b046cd75e4b9c0d81f"
	},
	"gist": {
	"id": "78d52127986116b046cd75e4b9c0d81f",
	"data": {
	"description": "Playaround/B站合集视频弹幕分布.ipynb",
	"public": true
	}
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3",
	"language": "python"
	},
	"language_info": {
	"name": "python",
	"version": "3.6.8",
	"mimetype": "text/x-python",
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"pygments_lexer": "ipython3",
	"nbconvert_exporter": "python",
	"file_extension": ".py"
	},
	"varInspector": {
	"window_display": false,
	"cols": {
	"lenName": 16,
	"lenType": 16,
	"lenVar": 40
	},
	"kernels_config": {
	"python": {
	"library": "var_list.py",
	"delete_cmd_prefix": "del ",
	"delete_cmd_postfix": "",
	"varRefreshCmd": "print(var_dic_list())"
	},
	"r": {
	"library": "var_list.r",
	"delete_cmd_prefix": "rm(",
	"delete_cmd_postfix": ") ",
	"varRefreshCmd": "cat(var_dic_list()) "
	}
	},
	"types_to_exclude": [
	"module",
	"function",
	"builtin_function_or_method",
	"instance",
	"_Feature"
	]
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}