ikedaosushi/slack-analytics.ipynb

## slack-analytics.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline\n",
    "from IPython.display import Image\n",
    "\n",
    "import os, sys, re, datetime, time, copy\n",
    "from pathlib import Path\n",
    "\n",
    "pj_dir = Path(os.getcwd()).parents[1]\n",
    "data_dir = pj_dir/'data'\n",
    "img_dir = pj_dir/'images'\n",
    "src_dir = pj_dir/'src'\n",
    "sys.path.append(str(src_dir))\n",
    "\n",
    "from matplotlib import pyplot as plt\n",
    "import jpholiday\n",
    "from tqdm import tqdm_notebook\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "import seaborn as sns\n",
    "plt.style.use(\"bmh\")\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import dask.dataframe as dd\n",
    "import requests\n",
    "\n",
    "import MeCab\n",
    "from sklearn.manifold import TSNE\n",
    "from wordcloud import WordCloud\n",
    "\n",
    "from gensim import models\n",
    "from gensim.models.doc2vec import TaggedDocument"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "import matplotlib\n",
    "\n",
    "# Default figure size and font family for every plot in this notebook.\n",
    "matplotlib.rcParams.update({\"figure.figsize\": (16, 4), \"font.family\": \"IPAexGothic\"})\n",
    "\n",
    "# Emit log records at INFO level and above.\n",
    "logging.basicConfig(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show up to 100 rows when a DataFrame is displayed.\n",
    "pd.options.display.max_rows = 100\n",
    "\n",
    "# Read environment variables from the project-level .env file.\n",
    "load_dotenv(pj_dir / '.env')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Slackデータの取得"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Slack API token from the environment; None when SLACK_TOKEN is not set.\n",
    "token = os.getenv('SLACK_TOKEN')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# HTTP headers sent with every Slack API request below.\n",
    "headers = dict([\n",
    "    (\"Content-type\", \"application/json\"),\n",
    "    (\"Authorization\", \"Bearer {}\".format(token)),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_messages_by_channel(channel_id):\n",
    "    \"\"\"Download the message history of one Slack channel.\n",
    "\n",
    "    Requests ``channels.history`` page by page, passing the ``ts`` of the\n",
    "    last message of each page as the next ``latest`` bound, and stops when\n",
    "    Slack reports no more pages, a page is empty, or the page boundary is\n",
    "    older than 2017-12-31.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    channel_id : str\n",
    "        ID of the channel to download.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per message returned by the API, plus a ``channel_id`` column.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    RuntimeError\n",
    "        If the API response does not carry ``ok: true``.\n",
    "    \"\"\"\n",
    "    oldest_ts = None\n",
    "    one_year_ago = pd.to_datetime('2017-12-31')\n",
    "    endpoint = 'https://slack.com/api/channels.history'\n",
    "\n",
    "    ls_messages = []\n",
    "    while True:\n",
    "        payload = {\n",
    "            'channel': channel_id,\n",
    "            'latest': oldest_ts,  # None on the first page\n",
    "            'count': 1000\n",
    "        }\n",
    "\n",
    "        data = requests.get(endpoint, headers=headers, params=payload).json()\n",
    "        # Report Slack's own error code instead of failing on a missing key.\n",
    "        if not data.get('ok'):\n",
    "            raise RuntimeError(f\"Slack API error for {channel_id}: {data.get('error')}\")\n",
    "        messages = data.get('messages', [])\n",
    "        ls_messages.extend(messages)\n",
    "\n",
    "        # An empty page leaves nothing to derive the next bound from.\n",
    "        if not data.get('has_more') or not messages:\n",
    "            break\n",
    "\n",
    "        time.sleep(1)\n",
    "        oldest_ts = messages[-1]['ts']\n",
    "        oldest_datetime = pd.to_datetime(float(oldest_ts), unit='s')\n",
    "        sys.stdout.write(f\"\\r{oldest_datetime}\")\n",
    "        sys.stdout.flush()\n",
    "        if oldest_datetime < one_year_ago:\n",
    "            sys.stdout.write(\"\\rfinish!\" + ' '*50)\n",
    "            break\n",
    "    df = pd.DataFrame(ls_messages)\n",
    "    df['channel_id'] = channel_id\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: df_channel must already exist when this cell runs; in this notebook it is\n",
    "# created by the channels.list cell (or the load cell) further down.\n",
    "ls_df = []\n",
    "ls_err_channel_id = []\n",
    "for _, row in tqdm_notebook(df_channel.iterrows(), total=len(df_channel)):\n",
    "    channel_id = row['id']\n",
    "    try:\n",
    "        df = fetch_messages_by_channel(channel_id)\n",
    "    except Exception as e:\n",
    "        # Keep going after a per-channel failure but report the cause;\n",
    "        # unlike a bare except, this does not swallow KeyboardInterrupt.\n",
    "        print(f\"Error on {row['name']}: {e!r}\")\n",
    "        ls_err_channel_id.append(channel_id)\n",
    "    else:\n",
    "        ls_df.append(df)\n",
    "    time.sleep(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Stack the per-channel frames into a single message table.\n",
    "df = pd.concat(ls_df, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the channel list.\n",
    "endpoint = 'https://slack.com/api/channels.list'\n",
    "payload = {}\n",
    "\n",
    "data = requests.get(endpoint, headers=headers, params=payload).json()\n",
    "# Report Slack's error code instead of failing below with a KeyError.\n",
    "if not data.get('ok'):\n",
    "    raise RuntimeError(f\"channels.list failed: {data.get('error')}\")\n",
    "df_channel = pd.DataFrame(data['channels'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the member list.\n",
    "endpoint = 'https://slack.com/api/users.list'\n",
    "payload = {}\n",
    "\n",
    "data = requests.get(endpoint, headers=headers, params=payload).json()\n",
    "# Report Slack's error code instead of failing below with a KeyError.\n",
    "if not data.get('ok'):\n",
    "    raise RuntimeError(f\"users.list failed: {data.get('error')}\")\n",
    "df_member = pd.DataFrame(data['members'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 保存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the three tables to the paths the load cell reads back.\n",
    "slack_dir = data_dir / 'kaizen_slack'\n",
    "for frame, fname in [(df_channel, 'channels.pickle'), (df_member, 'members.pickle'), (df, 'messages.pickle')]:\n",
    "    frame.to_pickle(slack_dir / fname)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ロード"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the tables from disk.\n",
    "# NOTE(review): all_messages.pickle is not written by the save cell above - confirm where it comes from.\n",
    "slack_dir = data_dir / 'kaizen_slack'\n",
    "dfall = pd.read_pickle(slack_dir / 'all_messages.pickle')\n",
    "df_channel = pd.read_pickle(slack_dir / 'channels.pickle')\n",
    "df_member = pd.read_pickle(slack_dir / 'members.pickle')\n",
    "df = pd.read_pickle(slack_dir / 'messages.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# mapping作成"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lookup table: member id -> member name.\n",
    "user_id_name_map = dict(zip(df_member['id'], df_member['name']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lookup table: channel id -> channel name (keys are ids, despite the variable name).\n",
    "channel_name_id_map = dict(zip(df_channel['id'], df_channel['name']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 前処理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the epoch-second timestamp to datetime.\n",
    "# Cast to float first: parsing strings together with unit= is deprecated in recent pandas\n",
    "# (presumably 'ts' arrives as text from the API - the cast is a no-op if it is already numeric).\n",
    "df['dt'] = pd.to_datetime(df['ts'].astype(float), unit='s')\n",
    "\n",
    "# Keep only messages newer than the cut-off date.\n",
    "# .copy() makes the result an independent frame so the column assignments below\n",
    "# do not write into a slice (SettingWithCopyWarning).\n",
    "one_year_ago = pd.to_datetime('2017-12-31')\n",
    "df = df.query('@one_year_ago < dt').copy()\n",
    "\n",
    "# Map user id -> user name.\n",
    "df['username'] = df['user'].map(user_id_name_map)\n",
    "\n",
    "# Map channel id -> channel name.\n",
    "df['channel_name'] = df['channel_id'].map(channel_name_id_map)\n",
    "\n",
    "# Remove bots: rows carrying a bot_id, plus two bot-like user names.\n",
    "df = df[df['bot_id'].isnull()]\n",
    "df = df.query('username != \"cronbot\"').query('username != \"slackbot\"').copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# メッセージだけに絞る"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Subtypes to exclude so that only actual messages remain.\n",
    "not_message_types = [\n",
    "    'channel_join',\n",
    "    'channel_leave',\n",
    "    'channel_topic',\n",
    "    'channel_archive',\n",
    "    'channel_purpose',\n",
    "    'sh_room_created',\n",
    "    'channel_name',\n",
    "    'pinned_item',\n",
    "    'reminder_add',\n",
    "    'app_conversation_join',\n",
    "]\n",
    "df = df.loc[~df['subtype'].isin(not_message_types)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# @されてるユーザー"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First user mentioned in each message (later mentions in the same message are ignored).\n",
    "# A mention is written <@USERID> or <@USERID|label>; capture only the id, whatever its\n",
    "# length, rather than up to 9 arbitrary characters before '>'.\n",
    "# expand=False returns a Series, which is what a single-column assignment expects.\n",
    "df['at_user'] = df['text'].str.extract(r'<@([A-Z0-9]+)', expand=False)\n",
    "df['at_username'] = df['at_user'].map(user_id_name_map)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 発言数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Five users with the most messages.\n",
    "s = df['username'].value_counts().head(5)\n",
    "\n",
    "n = s.shape[0]\n",
    "fig = plt.figure(figsize=(16,1*n))\n",
    "\n",
    "# Name the index explicitly: value_counts() leaves it unnamed on older pandas but names it\n",
    "# after the column on pandas >= 2.0, so renaming an 'index' column is not reliable.\n",
    "tmp_df = s.rename_axis('name').reset_index(name='value')\n",
    "ax = sns.barplot(x='value', y='name', palette=\"autumn\", data=tmp_df)\n",
    "max_ = tmp_df['value'].max()\n",
    "\n",
    "# Print each count just right of its bar.\n",
    "for i, (_, row) in enumerate(tmp_df.iterrows()):\n",
    "    ax.text(row['value'] + max_*.05, i+0.1, row['value'], color='black', ha=\"center\", fontsize=20)\n",
    "\n",
    "for spine in ax.spines.values():\n",
    "    spine.set_visible(False)\n",
    "ax.tick_params(bottom=False, left=False, labelbottom=False)\n",
    "ax.tick_params(axis='y', labelsize=20)\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')\n",
    "ax.set_title('2018年 発言数 TOP5', fontsize=30)\n",
    "ax.patch.set_facecolor('white')\n",
    "\n",
    "ax.patch.set_alpha(0)\n",
    "plt.grid(False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# @された数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Five users who were mentioned most often.\n",
    "s = df['at_username'].value_counts().head(5)\n",
    "\n",
    "n = s.shape[0]\n",
    "fig = plt.figure(figsize=(16,1*n))\n",
    "\n",
    "# Name the index explicitly: value_counts() leaves it unnamed on older pandas but names it\n",
    "# after the column on pandas >= 2.0, so renaming an 'index' column is not reliable.\n",
    "tmp_df = s.rename_axis('name').reset_index(name='value')\n",
    "ax = sns.barplot(x='value', y='name', data=tmp_df, palette=\"autumn\")\n",
    "max_ = tmp_df['value'].max()\n",
    "\n",
    "# Print each count just right of its bar.\n",
    "for i, (_, row) in enumerate(tmp_df.iterrows()):\n",
    "    ax.text(row['value'] + max_*.05, i+0.1, row['value'], color='black', ha=\"center\", fontsize=20)\n",
    "\n",
    "for spine in ax.spines.values():\n",
    "    spine.set_visible(False)\n",
    "ax.tick_params(bottom=False, left=False, labelbottom=False)\n",
    "ax.tick_params(axis='y', labelsize=20)\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')\n",
    "ax.set_title('2018年 @された数 TOP5', fontsize=30)\n",
    "ax.patch.set_facecolor('white')\n",
    "\n",
    "ax.patch.set_alpha(0)\n",
    "plt.grid(False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 一番使われたリアクションは？"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total number of uses per reaction name, over all messages.\n",
    "count_reaction = defaultdict(int)\n",
    "for reactions in tqdm_notebook(df['reactions'].dropna()):\n",
    "    for reaction in reactions:\n",
    "        # Prefer Slack's own 'count': per the Slack docs the 'users' list may not contain\n",
    "        # every reacting user. Fall back to len(users) when 'count' is absent.\n",
    "        if 'count' in reaction:\n",
    "            count_reaction[reaction['name']] += reaction['count']\n",
    "        else:\n",
    "            count_reaction[reaction['name']] += len(reaction['users'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Five most used reactions.\n",
    "s = pd.Series(count_reaction).sort_values(ascending=False).head(5)\n",
    "\n",
    "n = s.shape[0]\n",
    "fig = plt.figure(figsize=(16,1*n))\n",
    "\n",
    "tmp_df = s.rename_axis('name').reset_index(name='value')\n",
    "ax = sns.barplot(x='value', y='name', data=tmp_df, palette=\"autumn\")\n",
    "max_ = tmp_df['value'].max()\n",
    "\n",
    "# Print each count just right of its bar.\n",
    "for i, (_, row) in enumerate(tmp_df.iterrows()):\n",
    "    ax.text(row['value'] + max_*.05, i+0.1, row['value'], color='black', ha=\"center\", fontsize=20)\n",
    "\n",
    "for spine in ax.spines.values():\n",
    "    spine.set_visible(False)\n",
    "ax.tick_params(bottom=False, left=False, labelbottom=False)\n",
    "ax.tick_params(axis='y', labelsize=20)\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')\n",
    "# The chart shows five rows, so the title says TOP5 (it previously said TOP10).\n",
    "ax.set_title('2018年 使われたリアクション TOP5', fontsize=30)\n",
    "ax.patch.set_facecolor('white')\n",
    "\n",
    "ax.patch.set_alpha(0)\n",
    "plt.grid(False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 時系列で見た発言数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Days from 2018-01-01 to 2018-12-16 that fall on Monday-Friday and are not a jpholiday holiday.\n",
    "is_weekday_date = []\n",
    "for day in pd.date_range('2018-1-1', '2018-12-16', freq='1D'):\n",
    "    if day.weekday() > 4:\n",
    "        continue\n",
    "    if jpholiday.is_holiday(day.date()):\n",
    "        continue\n",
    "    is_weekday_date.append(day)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(16, 4))\n",
    "ax = fig.add_subplot(1, 1, 1)\n",
    "# Count per day once, then align to the business-day list. reindex() yields NaN for listed\n",
    "# days that lie outside the data, where label-list indexing raises KeyError on pandas >= 1.0.\n",
    "daily_count = df.groupby(pd.Grouper(key='dt', freq='1D')).size().reindex(is_weekday_date)\n",
    "ax = daily_count.plot(linewidth=2, linestyle='--', ax=ax)\n",
    "ax = daily_count.rolling(window=5).mean().plot(linewidth=5, ax=ax)\n",
    "ax.set_title('1日あたりの発言数(休日祝日は除く) ※破線が実数、実線が周期5の移動平均', fontsize=20)\n",
    "ax.tick_params(axis='both', labelsize='xx-large')\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Channel数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Channel creation time as datetime (source column is in epoch seconds).\n",
    "created_seconds = df_channel['created']\n",
    "df_channel['created_dt'] = pd.to_datetime(created_seconds, unit='s')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Channels created per week.\n",
    "s_added = df_channel.groupby(pd.Grouper(key='created_dt', freq='1W')).size()\n",
    "\n",
    "# channel_archive events per week, taken from the dfall table.\n",
    "archive_events = dfall.query('subtype == \"channel_archive\"')\n",
    "s_archived = archive_events.groupby(pd.Grouper(key='datetime', freq='1W')).size()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# transitionデータ作成"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Weekly created / archived counts side by side; weeks missing from either series count as 0.\n",
    "df_transition_channel = (\n",
    "    pd.concat([s_added, s_archived], axis=1)\n",
    "    .rename(columns={0: 'added', 1: 'archived'})\n",
    "    .fillna(0)\n",
    ")\n",
    "# Running total of (created - archived).\n",
    "df_transition_channel['count_channel'] = df_transition_channel['added'].sub(df_transition_channel['archived']).cumsum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Totals of created and archived channels for the weeks after one_year_ago.\n",
    "recent = df_transition_channel[df_transition_channel.index > one_year_ago]\n",
    "recent['added'].sum(), recent['archived'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(16, 8))\n",
    "ax = fig.add_subplot(1, 1, 1)\n",
    "\n",
    "# Left axis: running total of channels.\n",
    "s = df_transition_channel['count_channel']\n",
    "ax = s.plot(linewidth=5, linestyle='-', ax=ax, label='総Channel数')\n",
    "ax.set_title('左軸: Channel数   右軸: 作成/アーカイブ数', fontsize=20)\n",
    "ax.tick_params(axis='both', labelsize=20)\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')\n",
    "ax.legend(fontsize=20, loc='lower right')\n",
    "\n",
    "# Right axis: created / archived counts. The rows of df_transition_channel are weekly bins\n",
    "# (freq='1W'), so rolling(window=7) is a 7-week moving average of per-week counts;\n",
    "# the legend therefore says /week rather than /day.\n",
    "ax2 = ax.twinx()\n",
    "s = df_transition_channel['added'].rolling(window=7).mean()\n",
    "s.plot(linewidth=3, linestyle='--', ax=ax2, label='作成数/week', color='C1')\n",
    "s = df_transition_channel['archived'].rolling(window=7).mean()\n",
    "s.plot(linewidth=3, linestyle='--', ax=ax2, label='アーカイブ数/week', color='C3')\n",
    "ax2.tick_params(axis='both', labelsize=15)\n",
    "ax2.legend(fontsize=20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 時系列細かく"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# weekday number (0 = Monday ... 6 = Sunday) -> one-character Japanese day name.\n",
    "weekday_str_map = dict(enumerate('月火水木金土日'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Messages per business day. reindex() yields NaN for listed days that lie outside the\n",
    "# data, where label-list indexing raises KeyError on pandas >= 1.0.\n",
    "df_daily = df.groupby(pd.Grouper(key='dt', freq='1D')).size().reindex(is_weekday_date).to_frame('count')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag every row with its weekday name and its day of the month.\n",
    "df_daily['weekday'] = [weekday_str_map[d] for d in df_daily.index.weekday]\n",
    "df_daily['day_in_month'] = df_daily.index.day"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(16, 4))\n",
    "ax = fig.add_subplot(1, 1, 1)\n",
    "# Give the box order explicitly: sorting the kanji labels orders them by code point\n",
    "# (月, 木, 水, 火, 金), not Monday to Friday.\n",
    "ax = sns.boxplot(data=df_daily, x='weekday', y='count', order=['月', '火', '水', '木', '金'], ax=ax)\n",
    "ax.set_title('曜日による発言数の分布', fontsize=20)\n",
    "ax.tick_params(axis='both', labelsize='x-large')\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distribution of daily message counts for each day of the month.\n",
    "fig, ax = plt.subplots(figsize=(16, 4))\n",
    "by_day_in_month = df_daily.sort_values('day_in_month')\n",
    "ax = sns.boxplot(data=by_day_in_month, x='day_in_month', y='count')\n",
    "ax.set_title('日付による発言数の分布', fontsize=20)\n",
    "ax.tick_params(axis='both', labelsize='x-large')\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row mask: message sent on Monday-Friday and not on a jpholiday holiday.\n",
    "on_weekday = df['dt'].dt.weekday.isin(list(range(5)))\n",
    "on_holiday = df['dt'].dt.date.apply(jpholiday.is_holiday)\n",
    "is_weekday = on_weekday & ~on_holiday"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_hourly = df[is_weekday].groupby(pd.Grouper(key='dt', freq='1h')).size().to_frame('count')\n",
    "\n",
    "# The Grouper emits a row for every hour between the first and last message, so the\n",
    "# weekends and holidays filtered out above come back as zero-count rows.\n",
    "# Drop them again, otherwise they pull every box towards zero.\n",
    "hour_day = df_hourly.index.normalize()\n",
    "business_days = [d for d in hour_day.unique() if d.weekday() < 5 and not jpholiday.is_holiday(d.date())]\n",
    "df_hourly = df_hourly[hour_day.isin(business_days)].copy()\n",
    "\n",
    "# Shift the hour by +9 (presumably UTC -> JST - confirm) and wrap around midnight.\n",
    "df_hourly['hour'] = (df_hourly.index.hour + 9) % 24\n",
    "work_hours = list(range(9, 20))\n",
    "\n",
    "fig = plt.figure(figsize=(16, 4))\n",
    "ax = fig.add_subplot(1, 1, 1)\n",
    "ax = sns.boxplot(data=df_hourly.query('hour in @work_hours').sort_values('hour'), x='hour', y='count')\n",
    "ax.set_title('時間帯による発言数の分布', fontsize=20)\n",
    "ax.tick_params(axis='both', labelsize='x-large')\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Five channels with the most messages.\n",
    "s = df.groupby('channel_name').size().sort_values(ascending=False)[:5]\n",
    "\n",
    "n = s.shape[0]\n",
    "fig = plt.figure(figsize=(16,1*n))\n",
    "\n",
    "tmp_df = s.rename_axis('name').reset_index(name='value')\n",
    "ax = sns.barplot(x='value', y='name', data=tmp_df, palette=\"autumn\")\n",
    "max_ = tmp_df['value'].max()\n",
    "\n",
    "# Print each count just right of its bar.\n",
    "for i, (_, row) in enumerate(tmp_df.iterrows()):\n",
    "    text = ax.text(row['value'] + max_*.05, i+0.1, row['value'], color='black', ha=\"center\", fontsize=20)\n",
    "\n",
    "for spine in ax.spines.values():\n",
    "    spine.set_visible(False)\n",
    "ax.tick_params(bottom=False, left=False, labelbottom=False)\n",
    "ax.tick_params(axis='y', labelsize=20)\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')\n",
    "ax.set_title('2018年 発言が多かったChannel TOP5', fontsize=30)\n",
    "ax.patch.set_facecolor('white')\n",
    "\n",
    "ax.patch.set_alpha(0)\n",
    "plt.grid(False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(16, 4))\n",
    "ax = fig.add_subplot(1, 1, 1)\n",
    "# Count per day once, then align to the business-day list. reindex() yields NaN for listed\n",
    "# days outside this channel's date range, where label-list indexing raises KeyError on pandas >= 1.0.\n",
    "daily_count = df.query('channel_name == \"ad-cs\"').groupby(pd.Grouper(key='dt', freq='1D')).size().reindex(is_weekday_date)\n",
    "ax = daily_count.plot(linewidth=2, linestyle='--', ax=ax)\n",
    "ax = daily_count.rolling(window=5).mean().plot(linewidth=5, ax=ax)\n",
    "ax.set_title('ad-csの1日あたりの発言数(休日祝日は除く) ※破線が実数、実線が周期5の移動平均', fontsize=20)\n",
    "ax.tick_params(axis='both', labelsize='xx-large')\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(16, 4))\n",
    "ax = fig.add_subplot(1, 1, 1)\n",
    "# Count per day once, then align to the business-day list. reindex() yields NaN for listed\n",
    "# days outside this channel's date range, where label-list indexing raises KeyError on pandas >= 1.0.\n",
    "daily_count = df.query('channel_name == \"times_ikedayu\"').groupby(pd.Grouper(key='dt', freq='1D')).size().reindex(is_weekday_date)\n",
    "ax = daily_count.plot(linewidth=2, linestyle='--', ax=ax)\n",
    "ax = daily_count.rolling(window=5).mean().plot(linewidth=5, ax=ax)\n",
    "ax.set_title('times_ikedayuの1日あたりの発言数(休日祝日は除く) ※破線が実数、実線が周期5の移動平均', fontsize=20)\n",
    "ax.tick_params(axis='both', labelsize='xx-large')\n",
    "ax.set_xlabel('')\n",
    "ax.set_ylabel('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# s = df[df['channel_name'].str.contains('times_')].groupby('channel_name').size().sort_values(ascending=False)[:5]\n",
    "\n",
    "# n = s.shape[0]\n",
    "# fig = plt.figure(figsize=(16,1*n))\n",
    "\n",
    "# tmp_df = s.to_frame('value').reset_index().rename(columns={'channel_name': 'name'})\n",
    "# ax = sns.barplot(x='value', y='name', data=tmp_df, palette=\"autumn\")\n",
    "# max_ = tmp_df['value'].max()\n",
    "\n",
    "# for i, (_, row) in enumerate(tmp_df.iterrows()):\n",
    "#     text = ax.text(row['value'] + max_*.05, i+0.1, row['value'], color='black', ha=\"center\", fontsize=20)\n",
    "\n",
    "# [spine.set_visible(False) for spine in ax.spines.values()]\n",
    "# ax.tick_params(bottom=False, left=False, labelbottom=False)\n",
    "# ax.tick_params(axis='y', labelsize=20)\n",
    "# ax.set_xlabel('')\n",
    "# ax.set_ylabel('')\n",
    "# ax.set_title('2018年 発言が多かったtimes TOP5', fontsize=30)\n",
    "# ax.patch.set_facecolor('white') \n",
    "\n",
    "# ax.patch.set_alpha(0)\n",
    "# plt.grid(False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_hist(s, title, bins=30, xlabel='', ylabel=''):\n",
    "    \"\"\"Draw a histogram with a KDE curve and print summary statistics on it.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    s : pandas.Series\n",
    "        Values to plot; must provide ``mean()``, ``median()`` and ``std()``.\n",
    "    title : str\n",
    "        Figure title.\n",
    "    bins : int, default 30\n",
    "        Number of histogram bins passed to seaborn.\n",
    "    xlabel, ylabel : str, default ''\n",
    "        Axis labels.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "        The figure is drawn on a new 16x4 matplotlib figure.\n",
    "    \"\"\"\n",
    "    from matplotlib.ticker import FuncFormatter\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(16, 4))\n",
    "\n",
    "    # No pre-rounding needed: the label below formats with two decimals.\n",
    "    mean = s.mean()\n",
    "    median = s.median()\n",
    "    std = s.std()\n",
    "\n",
    "    sns.distplot(s, ax=ax, bins=bins, kde_kws={\"color\": \"k\", \"lw\": 3})\n",
    "    ax.set_title(title, fontsize=20)\n",
    "    ax.tick_params(axis='x', which='major', labelsize=20)\n",
    "    ax.set_xlabel(xlabel, fontsize=20)\n",
    "    ax.set_ylabel(ylabel, fontsize=20)\n",
    "    # Format y ticks through a formatter instead of freezing the current tick labels:\n",
    "    # fixed labels without fixed tick positions can drift out of sync with the ticks.\n",
    "    ax.yaxis.set_major_formatter(FuncFormatter(lambda v, _pos: '{:,.2%}'.format(v)))\n",
    "    ax.text( 0.99, 0.99, f\"平均値: {mean:.2f} \\n 中央値: {median:.2f} \\n 標準偏差: {std:.2f}\", horizontalalignment='right', verticalalignment='top', transform=ax.transAxes, fontsize=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Message count per channel, largest first, keeping only channels with at least one message.\n",
    "s = (\n",
    "    df.groupby('channel_name')\n",
    "    .size()\n",
    "    .sort_values(ascending=False)\n",
    "    .loc[lambda counts: counts > 0]\n",
    ")\n",
    "plot_hist(s, 'チャンネルごとの発言数のヒストグラム', bins=100, xlabel='発言数')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Message count per channel, largest first, with running total and running share in percent.\n",
    "s = df.groupby('channel_name').size().sort_values(ascending=False)\n",
    "sum_ = s.sum()\n",
    "tmp_df = pd.concat([s, s.cumsum(), 100*s.cumsum()/sum_], axis=1)\n",
    "tmp_df.columns = ['number', 'cumsum', 'cumsum_percent']\n",
    "\n",
    "fig = plt.figure(figsize=(16, 4))\n",
    "ax = fig.add_subplot(1, 1, 1)\n",
    "\n",
    "n = tmp_df.shape[0]\n",
    "x = np.arange(0, n)\n",
    "ax.plot(x, tmp_df['cumsum_percent'], linewidth=5)\n",
    "ax.tick_params(axis='both', labelsize=20)\n",
    "ax.set_xlabel('Channel数', fontsize=20)\n",
    "ax.set_ylabel('発言数の累積％', fontsize=20)\n",
    "# Title typo fixed: 'Slcak' -> 'Slack'.\n",
    "ax.set_title('Slack Channelと発言数のパレート図', fontsize=30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 自然言語処理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip <...> tokens, :emoji: codes and line breaks from the message text.\n",
    "# regex=True is required: on pandas >= 2.0 str.replace treats the pattern as a literal\n",
    "# string by default, which would leave the text untouched.\n",
    "df['text_trimed'] = (\n",
    "    df['text']\n",
    "    .str.replace(r'<\\S+>', '', regex=True)\n",
    "    .str.replace(r':\\S+:', '', regex=True)\n",
    "    .str.replace('\\n', '', regex=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Doc2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Channels ranked by message count; keep the names of the top 100 / 20 / 10.\n",
    "tmp_df = df.groupby('channel_name').size().sort_values(ascending=False)\n",
    "ranked_channels = tmp_df.index.tolist()\n",
    "top100_channel = ranked_channels[:100]\n",
    "top20_channel = ranked_channels[:20]\n",
    "top10_channel = ranked_channels[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# NOTE: split_into_words is defined in the cell after this one; it must be run first.\n",
    "# For each of the top-100 channels: concatenate all cleaned messages and tokenise them.\n",
    "grouped = df.query('channel_name in @top100_channel').groupby('channel_name')\n",
    "channel_words = dict()\n",
    "for channel_name, tmp_df in tqdm_notebook(grouped):\n",
    "    texts = tmp_df['text_trimed'].values.tolist()\n",
    "    doc = ''.join(texts)\n",
    "    channel_words[channel_name] = split_into_words(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def split_into_words(doc):\n",
    "    \"\"\"Tokenise ``doc`` with MeCab and return the selected tokens.\n",
    "\n",
    "    Each output line of the tagger is split on tabs. A token (first column) is kept\n",
    "    when the fourth column starts with 動詞, 形容詞 or 名詞, except for 名詞-数.\n",
    "    Lines with fewer than four columns are skipped.\n",
    "    \"\"\"\n",
    "    tagger = MeCab.Tagger(\"-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd\")\n",
    "    kept = []\n",
    "    for row in tqdm_notebook(tagger.parse(doc).splitlines()):\n",
    "        fields = row.split('\\t')\n",
    "        if len(fields) <= 3:\n",
    "            continue\n",
    "        pos = fields[3]\n",
    "        if pos.startswith('名詞-数'):\n",
    "            continue\n",
    "        if pos.startswith(('動詞', '形容詞', '名詞')):\n",
    "            kept.append(fields[0])\n",
    "    return kept"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokens passed to WordCloud as stopwords in the cells below.\n",
    "stop_words = [\n",
    "    'てる', 'いる', 'なる', 'れる', 'する', 'ある', 'こと', 'これ', 'さん', 'して',\n",
    "    'くれる', 'やる', 'くださる', 'そう', 'せる', 'した', '思う',\n",
    "    'それ', 'ここ', 'ちゃん', 'くん', '', 'て', 'に', 'を', 'は', 'の', 'が', 'と', 'た', 'し', 'で',\n",
    "    'ない', 'も', 'な', 'い', 'か', 'ので', 'よう', '', '思い', 'なっ', 'でき', 'いい', 'もの', 'あり', 'なり', 'ところ',\n",
    "    'こちら', '本日', 'おり', 'ください', 'お願い', 'いたし', 'ため', 'いただき', 'gt', 'commented', 'on', '思っ', '行っ',\n",
    "    'しまっ', 'やっ', '行き', 'とき', 'できる', '自分', '書い', 'あと',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "channel_name = 'general'\n",
    "\n",
    "# Concatenate every cleaned message of the channel and tokenise it.\n",
    "tmp_df = df.query(f'channel_name == \"{channel_name}\"')\n",
    "tmp_doc = ''.join(tmp_df['text_trimed'].values.tolist())\n",
    "tmp_words = split_into_words(tmp_doc)\n",
    "\n",
    "# Build the word cloud from the space-joined tokens.\n",
    "fpath = \"/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc\"\n",
    "wordcloud = WordCloud(\n",
    "    background_color=\"white\",\n",
    "    width=900,\n",
    "    height=500,\n",
    "    font_path=fpath,\n",
    "    stopwords=stop_words,\n",
    ").generate(\" \".join(tmp_words))\n",
    "\n",
    "# Show it without frame, ticks or grid.\n",
    "fig, ax = plt.subplots(figsize=(16, 10))\n",
    "ax.imshow(wordcloud)\n",
    "for spine in ax.spines.values():\n",
    "    spine.set_visible(False)\n",
    "ax.set_yticklabels([])\n",
    "ax.set_xticklabels([])\n",
    "ax.grid(False)\n",
    "ax.set_title(f'#{channel_name}', fontsize=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "channel_name = 'random'\n",
    "\n",
    "# Concatenate every cleaned message of the channel and tokenise it.\n",
    "tmp_df = df.query(f'channel_name == \"{channel_name}\"')\n",
    "tmp_doc = ''.join(tmp_df['text_trimed'].values.tolist())\n",
    "tmp_words = split_into_words(tmp_doc)\n",
    "\n",
    "# Build the word cloud from the space-joined tokens.\n",
    "fpath = \"/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc\"\n",
    "wordcloud = WordCloud(\n",
    "    background_color=\"white\",\n",
    "    width=900,\n",
    "    height=500,\n",
    "    font_path=fpath,\n",
    "    stopwords=stop_words,\n",
    ").generate(\" \".join(tmp_words))\n",
    "\n",
    "# Show it without frame, ticks or grid.\n",
    "fig, ax = plt.subplots(figsize=(16, 10))\n",
    "ax.imshow(wordcloud)\n",
    "for spine in ax.spines.values():\n",
    "    spine.set_visible(False)\n",
    "ax.set_yticklabels([])\n",
    "ax.set_xticklabels([])\n",
    "ax.grid(False)\n",
    "ax.set_title(f'#{channel_name}', fontsize=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "channel_name = 'times_ikedayu'\n",
    "\n",
    "# Concatenate every cleaned message of the channel and tokenise it.\n",
    "tmp_df = df.query(f'channel_name == \"{channel_name}\"')\n",
    "tmp_doc = ''.join(tmp_df['text_trimed'].values.tolist())\n",
    "tmp_words = split_into_words(tmp_doc)\n",
    "\n",
    "# Build the word cloud from the space-joined tokens.\n",
    "fpath = \"/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc\"\n",
    "wordcloud = WordCloud(\n",
    "    background_color=\"white\",\n",
    "    width=900,\n",
    "    height=500,\n",
    "    font_path=fpath,\n",
    "    stopwords=stop_words,\n",
    ").generate(\" \".join(tmp_words))\n",
    "\n",
    "# Show it without frame, ticks or grid.\n",
    "fig, ax = plt.subplots(figsize=(16, 10))\n",
    "ax.imshow(wordcloud)\n",
    "for spine in ax.spines.values():\n",
    "    spine.set_visible(False)\n",
    "ax.set_yticklabels([])\n",
    "ax.set_xticklabels([])\n",
    "ax.grid(False)\n",
    "ax.set_title(f'#{channel_name}', fontsize=20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Doc2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One TaggedDocument per channel: its word list, tagged with the channel name.\n",
    "sentences = [\n",
    "    TaggedDocument(words=words, tags=[channel_name])\n",
    "    for channel_name, words in channel_words.items()\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train in a single call: build the vocabulary, then let gensim run every epoch and decay\n",
    "# the learning rate from alpha to min_alpha on its own. Calling train() repeatedly in a\n",
    "# loop while adjusting alpha by hand is discouraged by the gensim documentation.\n",
    "# epochs=100 and min_alpha=.0001 approximate the amount of training and the final\n",
    "# learning rate of the manual 20-round loop this replaces.\n",
    "model = models.Doc2Vec(dm=0, vector_size=300, window=15, alpha=.025, min_alpha=.0001, min_count=1, sample=1e-6, epochs=100)\n",
    "model.build_vocab(sentences)\n",
    "\n",
    "print('\\n訓練開始')\n",
    "model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the trained model next to the pickled tables.\n",
    "d2v_path = data_dir / 'kaizen_slack' / 'd2v.model'\n",
    "model.save(str(d2v_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the saved model back from disk.\n",
    "d2v_path = data_dir / 'kaizen_slack' / 'd2v.model'\n",
    "model = models.Doc2Vec.load(str(d2v_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# For each top-20 channel, record its three most similar channels as \"name(similarity)\".\n",
    "ls_similar_doc = []\n",
    "for channel_name in top20_channel:\n",
    "    neighbours = model.docvecs.most_similar(channel_name, topn=3)\n",
    "    similar_doc = {'channel_name': channel_name}\n",
    "    for rank, (cname, value) in enumerate(neighbours, start=1):\n",
    "        similar_doc[f'{rank}位'] = f\"{cname}({value:.2f})\"\n",
    "    ls_similar_doc.append(similar_doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Similarity table with a fixed column order.\n",
    "sim_columns = ['channel_name', '1位', '2位', '3位']\n",
    "df_sim = pd.DataFrame(ls_similar_doc).loc[:, sim_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Copy the rows of channels whose name contains 'times_' to the clipboard, ';'-separated.\n",
    "is_times = df_sim['channel_name'].str.contains('times_')\n",
    "df_sim.loc[is_times].to_clipboard(sep=';')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# t-SNE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Document vectors of the top-20 channels, leaving out names containing 'kz-'.\n",
    "target_channels = [c for c in top20_channel if 'kz-' not in c]\n",
    "X = np.stack([model.docvecs[cname] for cname in target_channels])\n",
    "# Fix the seed: t-SNE is stochastic, and the plotting cell that follows hard-codes axis\n",
    "# limits and label offsets, which can only match one particular embedding.\n",
    "# NOTE(review): those hard-coded limits were tuned for an earlier, unseeded run and will\n",
    "# need re-tuning once for this seed.\n",
    "X_embedded = TSNE(n_components=2, n_iter=100000, learning_rate=4, random_state=0).fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(16, 8))\n",
    "\n",
    "ax.scatter(X_embedded.T[0], X_embedded.T[1])\n",
    "ax.set_xlim(-0.105, -0.094)\n",
    "ax.set_ylim(-0.075, -0.063)\n",
    "fontsize=19\n",
    "\n",
    "# Hand-tuned (dx, dy) label offsets for specific channels; all others use the default.\n",
    "label_offsets = {\n",
    "    'ad-cs': (0.0001, 0.0005),\n",
    "    'support-tech': (0.0001, -0.0005),\n",
    "    'cs-engineering': (0.0001, -0.0005),\n",
    "    'prd-random': (0.0001, -0.0005),\n",
    "    'ad-dev-qa': (-0.001, 0.0005),\n",
    "    'prd-team-sre': (0.0001, -0.0007),\n",
    "}\n",
    "default_offset = (0.0001, 0.0001)\n",
    "\n",
    "for i, c_name in enumerate(target_channels):\n",
    "    dx, dy = label_offsets.get(c_name, default_offset)\n",
    "    ax.annotate(c_name, (X_embedded[i][0]+dx, X_embedded[i][1]+dy), fontsize=fontsize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}