uni-3/ginza_mrph.ipynb

## ginza_mrph.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.6"
    },
    "colab": {
      "name": "ginza_mrph.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/uni-3/8f743303ffb71ecf824859ab9a7258bb/ginza_mrph.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_7Te7T2t9NGL",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import spacy\n",
        "import pandas as pd"
      ],
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gXZpJ_O49NGf",
        "colab_type": "text"
      },
      "source": [
        "#### init ginza"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "outputs_hidden": true
        },
        "id": "XMa-PMst9NGh",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "%pip install -U ginza"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bO0709P89NGx",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 61
        },
        "outputId": "c14ad25e-31af-4f74-d4ef-7d2455d1843a"
      },
      "source": [
        "# GiNZA documentation: https://megagonlabs.github.io/ginza/\n",
        "# One-off GiNZA setup step; the recorded output suggests it extracts the\n",
        "# Sudachi system dictionary (TODO confirm against the installed GiNZA version).\n",
        "!ginza -i"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "system.dic is already extracted: /usr/local/lib/python3.6/dist-packages/ja_ginza_dict/sudachidict/system.dic.tar.xz\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "cr0n-Phh9NG_",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "class ginza_nlp():\n",
        "    \"\"\"Wrap a GiNZA (spaCy) pipeline and turn its analysis into token-level DataFrames.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        # Load the model once per instance; every mrph() call reuses it.\n",
        "        self.nlp = spacy.load('ja_ginza')\n",
        "\n",
        "    def mrph(self, text: str) -> pd.DataFrame:\n",
        "        \"\"\"\n",
        "        Morphologically analyse one text and return one row per token.\n",
        "\n",
        "        Parameters\n",
        "        ----------\n",
        "        text : str\n",
        "            Text to analyse.\n",
        "\n",
        "        Returns\n",
        "        -------\n",
        "        pd.DataFrame\n",
        "            One row per token. Columns are positional (0..10) and hold, in order:\n",
        "            whole text, number of tokens, token length, surface form, lemma,\n",
        "            entity type, pos1, pos2, pos3, list of 'token->child' dependency\n",
        "            arcs, depth of the dependency tree of the text.\n",
        "            An empty frame is returned for empty or non-string input.\n",
        "        \"\"\"\n",
        "        # Missing cells (None / NaN) are not strings; skip them instead of\n",
        "        # letting the pipeline raise in the middle of a batch.\n",
        "        if not isinstance(text, str) or not text:\n",
        "            return pd.DataFrame()\n",
        "\n",
        "        doc = self.nlp(text)\n",
        "        if len(doc) == 0:\n",
        "            return pd.DataFrame()\n",
        "\n",
        "        sent = doc[:]\n",
        "        sent_text = sent.text\n",
        "        n_tokens = len(sent)\n",
        "        # The tree depth is the same for every token of the text: compute it once.\n",
        "        depth = self.tree_depth(sent.root)\n",
        "\n",
        "        rows = []\n",
        "        for token in sent:\n",
        "            pos1, pos2, pos3 = self.ex_pos(token.tag_.split('-'))\n",
        "            arcs = [token.text + '->' + child.text for child in token.children]\n",
        "            rows.append([sent_text, n_tokens, len(token),\n",
        "                         token.orth_, token.lemma_,\n",
        "                         token.ent_type_, pos1, pos2, pos3,\n",
        "                         arcs, depth])\n",
        "\n",
        "        return pd.DataFrame(rows)\n",
        "\n",
        "    def ex_pos(self, pos: list) -> tuple:\n",
        "        \"\"\"\n",
        "        Split part-of-speech information into exactly three levels.\n",
        "\n",
        "        Parameters\n",
        "        ----------\n",
        "        pos : list\n",
        "            Part-of-speech levels, e.g. the result of tag_.split('-').\n",
        "\n",
        "        Returns\n",
        "        -------\n",
        "        tuple\n",
        "            (pos1, pos2, pos3); levels that are missing are None.\n",
        "        \"\"\"\n",
        "        levels = list(pos[:3]) if pos else []\n",
        "        # Pad short tags so that callers can always unpack three values.\n",
        "        levels += [None] * (3 - len(levels))\n",
        "        return tuple(levels)\n",
        "\n",
        "    def tree_depth(self, token: 'spacy.tokens.Token') -> int:\n",
        "        \"\"\"\n",
        "        Depth of the dependency subtree rooted at token.\n",
        "\n",
        "        A token without children has depth 1; otherwise the depth is one more\n",
        "        than the depth of its deepest child.\n",
        "        \"\"\"\n",
        "        children = list(token.children)\n",
        "        if not children:\n",
        "            return 1\n",
        "        # Recurse through self: a bare tree_depth() is not defined at module level.\n",
        "        return 1 + max(self.tree_depth(child) for child in children)\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "O0UYtN5M9NHH",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import concurrent.futures\n",
        "\n",
        "def mrph_df(df: pd.DataFrame, target=None, mrph_cols=('sent', 'word_num', 'word_len', 'word', 'orig'\n",
        "    , 'ent_type', 'pos1', 'pos2', 'pos3', '係り受け', '木の深さ')) -> tuple:\n",
        "    \"\"\"\n",
        "    Analyse every text in df[target] and attach the token rows to df.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    df : pd.DataFrame\n",
        "        Source table.\n",
        "    target : str\n",
        "        Name of the column that holds the text to analyse.\n",
        "    mrph_cols : sequence of str\n",
        "        Column names given to the token-level table.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    (df_nlp, df_m) : tuple of pd.DataFrame\n",
        "        df_nlp repeats each row of df once per token of its own text, with the\n",
        "        token columns appended; df_m is the token-level table alone.\n",
        "    \"\"\"\n",
        "    mrph_cols = list(mrph_cols)\n",
        "    # Internal join key: the position of the source row in df.\n",
        "    row_key = '__mrph_row__'\n",
        "\n",
        "    gn = ginza_nlp()\n",
        "    # NOTE(review): the bound method gn.mrph is handed to worker processes, so the\n",
        "    # instance presumably has to be picklable there - confirm on your platform.\n",
        "    with concurrent.futures.ProcessPoolExecutor() as executor:\n",
        "        mrph_list = list(executor.map(gn.mrph, df[target]))\n",
        "\n",
        "    frames = []\n",
        "    for row_no, frame in enumerate(mrph_list):\n",
        "        if frame.empty:\n",
        "            continue  # nothing was tokenised for this row\n",
        "        frame = frame.copy()\n",
        "        frame.columns = mrph_cols\n",
        "        frame[row_key] = row_no\n",
        "        frames.append(frame)\n",
        "\n",
        "    if frames:\n",
        "        tokens = pd.concat(frames, ignore_index=True)\n",
        "    else:\n",
        "        # pd.concat raises on an empty list; build an empty table instead.\n",
        "        tokens = pd.DataFrame(columns=mrph_cols + [row_key])\n",
        "    tokens[row_key] = tokens[row_key].astype('int64')\n",
        "\n",
        "    df_m = tokens.drop(columns=row_key)\n",
        "    # Join on row position, not on the text itself: joining on the text pairs a row\n",
        "    # with the tokens of every other row that happens to have the same text.\n",
        "    positions = pd.Series(range(len(df)), index=df.index, dtype='int64')\n",
        "    df_nlp = df.assign(**{row_key: positions}).merge(tokens, on=row_key).drop(columns=row_key)\n",
        "    return df_nlp, df_m"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dXmOhumJ9NHR",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gQRz01Sz9NHa",
        "colab_type": "text"
      },
      "source": [
        "#### load data"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xfSz33XD9NHc",
        "colab_type": "code",
        "colab": {},
        "outputId": "90f64402-a76d-4dc3-c856-d45f55c94308"
      },
      "source": [
        "# Column labels applied to booklog.csv through names= below.\n",
        "cols = [\"サービスID\", \"アイテムID\", \"13桁ISBN\", \"カテゴリ\",\n",
        "        \"評価\", \"読書状況\", \"レビュー\", \"タグ\", \"読書メモ(非公開)\",\n",
        "        \"登録日時\", \"読了日\", \"タイトル\", \"作者名\", \"出版社名\", \"発行年\", \"ジャンル\", \"ページ数\"]\n",
        "# The file is read as Shift-JIS; the path is relative to the working directory.\n",
        "df_raw = pd.read_csv('./booklog.csv', encoding=\"shift-jis\", names=cols)\n",
        "df_raw.shape"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(252, 17)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 17
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7m5ow5iU9NHj",
        "colab_type": "code",
        "colab": {},
        "outputId": "02822b22-043a-4b3d-c1c2-39cd84b33408"
      },
      "source": [
        "df_raw.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>サービスID</th>\n",
              "      <th>アイテムID</th>\n",
              "      <th>13桁ISBN</th>\n",
              "      <th>カテゴリ</th>\n",
              "      <th>評価</th>\n",
              "      <th>読書状況</th>\n",
              "      <th>レビュー</th>\n",
              "      <th>タグ</th>\n",
              "      <th>読書メモ(非公開)</th>\n",
              "      <th>登録日時</th>\n",
              "      <th>読了日</th>\n",
              "      <th>タイトル</th>\n",
              "      <th>作者名</th>\n",
              "      <th>出版社名</th>\n",
              "      <th>発行年</th>\n",
              "      <th>ジャンル</th>\n",
              "      <th>ページ数</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>4794221657</td>\n",
              "      <td>9.784794e+12</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>読みたい</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2016-05-14 01:35:03</td>\n",
              "      <td>NaN</td>\n",
              "      <td>コネクトーム:脳の配線はどのように「わたし」をつくり出すのか</td>\n",
              "      <td>セバスチャン・スン</td>\n",
              "      <td>草思社</td>\n",
              "      <td>2015.0</td>\n",
              "      <td>本</td>\n",
              "      <td>504.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1</td>\n",
              "      <td>4535558140</td>\n",
              "      <td>9.784536e+12</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>読みたい</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2016-05-14 01:35:23</td>\n",
              "      <td>NaN</td>\n",
              "      <td>マーケット進化論 経済が解き明かす日本の歴史</td>\n",
              "      <td>横山 和輝</td>\n",
              "      <td>日本評論社</td>\n",
              "      <td>2016.0</td>\n",
              "      <td>本</td>\n",
              "      <td>263.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>1</td>\n",
              "      <td>415209611X</td>\n",
              "      <td>9.784152e+12</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>読みたい</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2016-05-14 01:35:58</td>\n",
              "      <td>NaN</td>\n",
              "      <td>貨幣の「新」世界史――ハンムラビ法典からビットコインまで</td>\n",
              "      <td>カビール セガール</td>\n",
              "      <td>早川書房</td>\n",
              "      <td>2016.0</td>\n",
              "      <td>本</td>\n",
              "      <td>400.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>1</td>\n",
              "      <td>4326504005</td>\n",
              "      <td>9.784327e+12</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>読みたい</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2016-05-14 01:41:48</td>\n",
              "      <td>NaN</td>\n",
              "      <td>父が息子に語るマクロ経済学</td>\n",
              "      <td>齊藤 誠</td>\n",
              "      <td>勁草書房</td>\n",
              "      <td>2014.0</td>\n",
              "      <td>本</td>\n",
              "      <td>358.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>1</td>\n",
              "      <td>4492314776</td>\n",
              "      <td>9.784492e+12</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>読みたい</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2016-05-14 01:45:58</td>\n",
              "      <td>NaN</td>\n",
              "      <td>ヤバすぎる経済学</td>\n",
              "      <td>スティーヴン・D・レヴィット</td>\n",
              "      <td>東洋経済新報社</td>\n",
              "      <td>2016.0</td>\n",
              "      <td>本</td>\n",
              "      <td>436.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   サービスID      アイテムID       13桁ISBN  カテゴリ  評価  読書状況  レビュー  タグ 読書メモ(非公開)  \\\n",
              "0       1  4794221657  9.784794e+12   NaN NaN  読みたい   NaN NaN       NaN   \n",
              "1       1  4535558140  9.784536e+12   NaN NaN  読みたい   NaN NaN       NaN   \n",
              "2       1  415209611X  9.784152e+12   NaN NaN  読みたい   NaN NaN       NaN   \n",
              "3       1  4326504005  9.784327e+12   NaN NaN  読みたい   NaN NaN       NaN   \n",
              "4       1  4492314776  9.784492e+12   NaN NaN  読みたい   NaN NaN       NaN   \n",
              "\n",
              "                  登録日時  読了日                            タイトル             作者名  \\\n",
              "0  2016-05-14 01:35:03  NaN  コネクトーム:脳の配線はどのように「わたし」をつくり出すのか       セバスチャン・スン   \n",
              "1  2016-05-14 01:35:23  NaN          マーケット進化論 経済が解き明かす日本の歴史           横山 和輝   \n",
              "2  2016-05-14 01:35:58  NaN    貨幣の「新」世界史――ハンムラビ法典からビットコインまで       カビール セガール   \n",
              "3  2016-05-14 01:41:48  NaN                   父が息子に語るマクロ経済学            齊藤 誠   \n",
              "4  2016-05-14 01:45:58  NaN                        ヤバすぎる経済学  スティーヴン・D・レヴィット   \n",
              "\n",
              "      出版社名     発行年 ジャンル   ページ数  \n",
              "0      草思社  2015.0    本  504.0  \n",
              "1    日本評論社  2016.0    本  263.0  \n",
              "2     早川書房  2016.0    本  400.0  \n",
              "3     勁草書房  2014.0    本  358.0  \n",
              "4  東洋経済新報社  2016.0    本  436.0  "
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 18
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0-9KT0Y99NHr",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "target = 'タイトル'"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "AQytYR3z9NHz",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Two frames: the source rows joined with their token info, and the token-level table on its own\n",
        "df_nlp, df_mrph = mrph_df(df_raw, target=target)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VyPbNoHQ9NH7",
        "colab_type": "code",
        "colab": {},
        "outputId": "232b105a-0e7e-4ed0-afcb-580da7b5f525"
      },
      "source": [
        "df_nlp.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>サービスID</th>\n",
              "      <th>アイテムID</th>\n",
              "      <th>13桁ISBN</th>\n",
              "      <th>カテゴリ</th>\n",
              "      <th>評価</th>\n",
              "      <th>読書状況</th>\n",
              "      <th>レビュー</th>\n",
              "      <th>タグ</th>\n",
              "      <th>読書メモ(非公開)</th>\n",
              "      <th>登録日時</th>\n",
              "      <th>...</th>\n",
              "      <th>word_num</th>\n",
              "      <th>word_len</th>\n",
              "      <th>word</th>\n",
              "      <th>orig</th>\n",
              "      <th>ent_type</th>\n",
              "      <th>pos1</th>\n",
              "      <th>pos2</th>\n",
              "      <th>pos3</th>\n",
              "      <th>係り受け</th>\n",
              "      <th>木の深さ</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>4794221657</td>\n",
              "      <td>9.784794e+12</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>読みたい</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2016-05-14 01:35:03</td>\n",
              "      <td>...</td>\n",
              "      <td>16</td>\n",
              "      <td>6</td>\n",
              "      <td>コネクトーム</td>\n",
              "      <td>コネクトーム</td>\n",
              "      <td></td>\n",
              "      <td>名詞</td>\n",
              "      <td>普通名詞</td>\n",
              "      <td>一般</td>\n",
              "      <td>[]</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1</td>\n",
              "      <td>4794221657</td>\n",
              "      <td>9.784794e+12</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>読みたい</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2016-05-14 01:35:03</td>\n",
              "      <td>...</td>\n",
              "      <td>16</td>\n",
              "      <td>1</td>\n",
              "      <td>:</td>\n",
              "      <td>：</td>\n",
              "      <td></td>\n",
              "      <td>補助記号</td>\n",
              "      <td>一般</td>\n",
              "      <td>None</td>\n",
              "      <td>[]</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>1</td>\n",
              "      <td>4794221657</td>\n",
              "      <td>9.784794e+12</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>読みたい</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2016-05-14 01:35:03</td>\n",
              "      <td>...</td>\n",
              "      <td>16</td>\n",
              "      <td>1</td>\n",
              "      <td>脳</td>\n",
              "      <td>脳</td>\n",
              "      <td></td>\n",
              "      <td>名詞</td>\n",
              "      <td>普通名詞</td>\n",
              "      <td>一般</td>\n",
              "      <td>[]</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>1</td>\n",
              "      <td>4794221657</td>\n",
              "      <td>9.784794e+12</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>読みたい</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2016-05-14 01:35:03</td>\n",
              "      <td>...</td>\n",
              "      <td>16</td>\n",
              "      <td>1</td>\n",
              "      <td>の</td>\n",
              "      <td>の</td>\n",
              "      <td></td>\n",
              "      <td>助詞</td>\n",
              "      <td>格助詞</td>\n",
              "      <td>None</td>\n",
              "      <td>[]</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>1</td>\n",
              "      <td>4794221657</td>\n",
              "      <td>9.784794e+12</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>読みたい</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2016-05-14 01:35:03</td>\n",
              "      <td>...</td>\n",
              "      <td>16</td>\n",
              "      <td>2</td>\n",
              "      <td>配線</td>\n",
              "      <td>配線</td>\n",
              "      <td></td>\n",
              "      <td>名詞</td>\n",
              "      <td>普通名詞</td>\n",
              "      <td>サ変可能</td>\n",
              "      <td>[]</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>5 rows × 28 columns</p>\n",
              "</div>"
            ],
            "text/plain": [
              "   サービスID      アイテムID       13桁ISBN  カテゴリ  評価  読書状況  レビュー  タグ 読書メモ(非公開)  \\\n",
              "0       1  4794221657  9.784794e+12   NaN NaN  読みたい   NaN NaN       NaN   \n",
              "1       1  4794221657  9.784794e+12   NaN NaN  読みたい   NaN NaN       NaN   \n",
              "2       1  4794221657  9.784794e+12   NaN NaN  読みたい   NaN NaN       NaN   \n",
              "3       1  4794221657  9.784794e+12   NaN NaN  読みたい   NaN NaN       NaN   \n",
              "4       1  4794221657  9.784794e+12   NaN NaN  読みたい   NaN NaN       NaN   \n",
              "\n",
              "                  登録日時  ... word_num word_len    word    orig  ent_type  pos1  \\\n",
              "0  2016-05-14 01:35:03  ...       16        6  コネクトーム  コネクトーム              名詞   \n",
              "1  2016-05-14 01:35:03  ...       16        1       :       ：            補助記号   \n",
              "2  2016-05-14 01:35:03  ...       16        1       脳       脳              名詞   \n",
              "3  2016-05-14 01:35:03  ...       16        1       の       の              助詞   \n",
              "4  2016-05-14 01:35:03  ...       16        2      配線      配線              名詞   \n",
              "\n",
              "   pos2  pos3  係り受け  木の深さ  \n",
              "0  普通名詞    一般    []     1  \n",
              "1    一般  None    []     1  \n",
              "2  普通名詞    一般    []     1  \n",
              "3   格助詞  None    []     1  \n",
              "4  普通名詞  サ変可能    []     1  \n",
              "\n",
              "[5 rows x 28 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 26
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Oimfehhf9NID",
        "colab_type": "code",
        "colab": {},
        "outputId": "23b8164c-2368-4726-994c-90c97081ae2a"
      },
      "source": [
        "df_mrph.shape"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(2760, 11)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 27
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1KuTDBhu9NIJ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "df_mrph.to_csv('mrph_booklog.csv', index=False)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "F40Zhhes9NIS",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.6"
	},
	"colab": {
	"name": "ginza_mrph.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"include_colab_link": true
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/uni-3/8f743303ffb71ecf824859ab9a7258bb/ginza_mrph.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "_7Te7T2t9NGL",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"import spacy\n",
	"import pandas as pd"
	],
	"execution_count": 1,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "gXZpJ_O49NGf",
	"colab_type": "text"
	},
	"source": [
	"#### init ginza"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"collapsed": true,
	"jupyter": {
	"outputs_hidden": true
	},
	"id": "XMa-PMst9NGh",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
	"%pip install -U ginza"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "bO0709P89NGx",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 61
	},
	"outputId": "c14ad25e-31af-4f74-d4ef-7d2455d1843a"
	},
	"source": [
	"# GiNZA documentation: https://megagonlabs.github.io/ginza/\n",
	"# One-off GiNZA setup step; the recorded output suggests it extracts the\n",
	"# Sudachi system dictionary (TODO confirm against the installed GiNZA version).\n",
	"!ginza -i"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"system.dic is already extracted: /usr/local/lib/python3.6/dist-packages/ja_ginza_dict/sudachidict/system.dic.tar.xz\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "cr0n-Phh9NG_",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"class ginza_nlp():\n",
	"    \"\"\"Wrap a GiNZA (spaCy) pipeline and turn its analysis into token-level DataFrames.\"\"\"\n",
	"\n",
	"    def __init__(self):\n",
	"        # Load the model once per instance; every mrph() call reuses it.\n",
	"        self.nlp = spacy.load('ja_ginza')\n",
	"\n",
	"    def mrph(self, text: str) -> pd.DataFrame:\n",
	"        \"\"\"\n",
	"        Morphologically analyse one text and return one row per token.\n",
	"\n",
	"        Parameters\n",
	"        ----------\n",
	"        text : str\n",
	"            Text to analyse.\n",
	"\n",
	"        Returns\n",
	"        -------\n",
	"        pd.DataFrame\n",
	"            One row per token. Columns are positional (0..10) and hold, in order:\n",
	"            whole text, number of tokens, token length, surface form, lemma,\n",
	"            entity type, pos1, pos2, pos3, list of 'token->child' dependency\n",
	"            arcs, depth of the dependency tree of the text.\n",
	"            An empty frame is returned for empty or non-string input.\n",
	"        \"\"\"\n",
	"        # Missing cells (None / NaN) are not strings; skip them instead of\n",
	"        # letting the pipeline raise in the middle of a batch.\n",
	"        if not isinstance(text, str) or not text:\n",
	"            return pd.DataFrame()\n",
	"\n",
	"        doc = self.nlp(text)\n",
	"        if len(doc) == 0:\n",
	"            return pd.DataFrame()\n",
	"\n",
	"        sent = doc[:]\n",
	"        sent_text = sent.text\n",
	"        n_tokens = len(sent)\n",
	"        # The tree depth is the same for every token of the text: compute it once.\n",
	"        depth = self.tree_depth(sent.root)\n",
	"\n",
	"        rows = []\n",
	"        for token in sent:\n",
	"            pos1, pos2, pos3 = self.ex_pos(token.tag_.split('-'))\n",
	"            arcs = [token.text + '->' + child.text for child in token.children]\n",
	"            rows.append([sent_text, n_tokens, len(token),\n",
	"                         token.orth_, token.lemma_,\n",
	"                         token.ent_type_, pos1, pos2, pos3,\n",
	"                         arcs, depth])\n",
	"\n",
	"        return pd.DataFrame(rows)\n",
	"\n",
	"    def ex_pos(self, pos: list) -> tuple:\n",
	"        \"\"\"\n",
	"        Split part-of-speech information into exactly three levels.\n",
	"\n",
	"        Parameters\n",
	"        ----------\n",
	"        pos : list\n",
	"            Part-of-speech levels, e.g. the result of tag_.split('-').\n",
	"\n",
	"        Returns\n",
	"        -------\n",
	"        tuple\n",
	"            (pos1, pos2, pos3); levels that are missing are None.\n",
	"        \"\"\"\n",
	"        levels = list(pos[:3]) if pos else []\n",
	"        # Pad short tags so that callers can always unpack three values.\n",
	"        levels += [None] * (3 - len(levels))\n",
	"        return tuple(levels)\n",
	"\n",
	"    def tree_depth(self, token: 'spacy.tokens.Token') -> int:\n",
	"        \"\"\"\n",
	"        Depth of the dependency subtree rooted at token.\n",
	"\n",
	"        A token without children has depth 1; otherwise the depth is one more\n",
	"        than the depth of its deepest child.\n",
	"        \"\"\"\n",
	"        children = list(token.children)\n",
	"        if not children:\n",
	"            return 1\n",
	"        # Recurse through self: a bare tree_depth() is not defined at module level.\n",
	"        return 1 + max(self.tree_depth(child) for child in children)\n"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "O0UYtN5M9NHH",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"import concurrent.futures\n",
	"\n",
	"def mrph_df(df: pd.DataFrame, target=None, mrph_cols=('sent', 'word_num', 'word_len', 'word', 'orig'\n",
	"    , 'ent_type', 'pos1', 'pos2', 'pos3', '係り受け', '木の深さ')) -> tuple:\n",
	"    \"\"\"\n",
	"    Analyse every text in df[target] and attach the token rows to df.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    df : pd.DataFrame\n",
	"        Source table.\n",
	"    target : str\n",
	"        Name of the column that holds the text to analyse.\n",
	"    mrph_cols : sequence of str\n",
	"        Column names given to the token-level table.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    (df_nlp, df_m) : tuple of pd.DataFrame\n",
	"        df_nlp repeats each row of df once per token of its own text, with the\n",
	"        token columns appended; df_m is the token-level table alone.\n",
	"    \"\"\"\n",
	"    mrph_cols = list(mrph_cols)\n",
	"    # Internal join key: the position of the source row in df.\n",
	"    row_key = '__mrph_row__'\n",
	"\n",
	"    gn = ginza_nlp()\n",
	"    # NOTE(review): the bound method gn.mrph is handed to worker processes, so the\n",
	"    # instance presumably has to be picklable there - confirm on your platform.\n",
	"    with concurrent.futures.ProcessPoolExecutor() as executor:\n",
	"        mrph_list = list(executor.map(gn.mrph, df[target]))\n",
	"\n",
	"    frames = []\n",
	"    for row_no, frame in enumerate(mrph_list):\n",
	"        if frame.empty:\n",
	"            continue  # nothing was tokenised for this row\n",
	"        frame = frame.copy()\n",
	"        frame.columns = mrph_cols\n",
	"        frame[row_key] = row_no\n",
	"        frames.append(frame)\n",
	"\n",
	"    if frames:\n",
	"        tokens = pd.concat(frames, ignore_index=True)\n",
	"    else:\n",
	"        # pd.concat raises on an empty list; build an empty table instead.\n",
	"        tokens = pd.DataFrame(columns=mrph_cols + [row_key])\n",
	"    tokens[row_key] = tokens[row_key].astype('int64')\n",
	"\n",
	"    df_m = tokens.drop(columns=row_key)\n",
	"    # Join on row position, not on the text itself: joining on the text pairs a row\n",
	"    # with the tokens of every other row that happens to have the same text.\n",
	"    positions = pd.Series(range(len(df)), index=df.index, dtype='int64')\n",
	"    df_nlp = df.assign(**{row_key: positions}).merge(tokens, on=row_key).drop(columns=row_key)\n",
	"    return df_nlp, df_m"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "dXmOhumJ9NHR",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	""
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "gQRz01Sz9NHa",
	"colab_type": "text"
	},
	"source": [
	"#### load data"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "xfSz33XD9NHc",
	"colab_type": "code",
	"colab": {},
	"outputId": "90f64402-a76d-4dc3-c856-d45f55c94308"
	},
	"source": [
	"# Column labels applied to booklog.csv through names= below.\n",
	"cols = [\"サービスID\", \"アイテムID\", \"13桁ISBN\", \"カテゴリ\",\n",
	" \"評価\", \"読書状況\", \"レビュー\", \"タグ\", \"読書メモ(非公開)\",\n",
	" \"登録日時\", \"読了日\", \"タイトル\", \"作者名\", \"出版社名\", \"発行年\", \"ジャンル\", \"ページ数\"]\n",
	"# The file is read as Shift-JIS; the path is relative to the working directory.\n",
	"df_raw = pd.read_csv('./booklog.csv', encoding=\"shift-jis\", names=cols)\n",
	"df_raw.shape"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"(252, 17)"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 17
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "7m5ow5iU9NHj",
	"colab_type": "code",
	"colab": {},
	"outputId": "02822b22-043a-4b3d-c1c2-39cd84b33408"
	},
	"source": [
	"df_raw.head()"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>サービスID</th>\n",
	" <th>アイテムID</th>\n",
	" <th>13桁ISBN</th>\n",
	" <th>カテゴリ</th>\n",
	" <th>評価</th>\n",
	" <th>読書状況</th>\n",
	" <th>レビュー</th>\n",
	" <th>タグ</th>\n",
	" <th>読書メモ(非公開)</th>\n",
	" <th>登録日時</th>\n",
	" <th>読了日</th>\n",
	" <th>タイトル</th>\n",
	" <th>作者名</th>\n",
	" <th>出版社名</th>\n",
	" <th>発行年</th>\n",
	" <th>ジャンル</th>\n",
	" <th>ページ数</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>4794221657</td>\n",
	" <td>9.784794e+12</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>読みたい</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2016-05-14 01:35:03</td>\n",
	" <td>NaN</td>\n",
	" <td>コネクトーム:脳の配線はどのように「わたし」をつくり出すのか</td>\n",
	" <td>セバスチャン・スン</td>\n",
	" <td>草思社</td>\n",
	" <td>2015.0</td>\n",
	" <td>本</td>\n",
	" <td>504.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>4535558140</td>\n",
	" <td>9.784536e+12</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>読みたい</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2016-05-14 01:35:23</td>\n",
	" <td>NaN</td>\n",
	" <td>マーケット進化論経済が解き明かす日本の歴史</td>\n",
	" <td>横山和輝</td>\n",
	" <td>日本評論社</td>\n",
	" <td>2016.0</td>\n",
	" <td>本</td>\n",
	" <td>263.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>1</td>\n",
	" <td>415209611X</td>\n",
	" <td>9.784152e+12</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>読みたい</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2016-05-14 01:35:58</td>\n",
	" <td>NaN</td>\n",
	" <td>貨幣の「新」世界史――ハンムラビ法典からビットコインまで</td>\n",
	" <td>カビールセガール</td>\n",
	" <td>早川書房</td>\n",
	" <td>2016.0</td>\n",
	" <td>本</td>\n",
	" <td>400.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>1</td>\n",
	" <td>4326504005</td>\n",
	" <td>9.784327e+12</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>読みたい</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2016-05-14 01:41:48</td>\n",
	" <td>NaN</td>\n",
	" <td>父が息子に語るマクロ経済学</td>\n",
	" <td>齊藤誠</td>\n",
	" <td>勁草書房</td>\n",
	" <td>2014.0</td>\n",
	" <td>本</td>\n",
	" <td>358.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>1</td>\n",
	" <td>4492314776</td>\n",
	" <td>9.784492e+12</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>読みたい</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2016-05-14 01:45:58</td>\n",
	" <td>NaN</td>\n",
	" <td>ヤバすぎる経済学</td>\n",
	" <td>スティーヴン・D・レヴィット</td>\n",
	" <td>東洋経済新報社</td>\n",
	" <td>2016.0</td>\n",
	" <td>本</td>\n",
	" <td>436.0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" サービスID アイテムID 13桁ISBN カテゴリ評価読書状況レビュータグ読書メモ(非公開) \\\n",
	"0 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
	"1 1 4535558140 9.784536e+12 NaN NaN 読みたい NaN NaN NaN \n",
	"2 1 415209611X 9.784152e+12 NaN NaN 読みたい NaN NaN NaN \n",
	"3 1 4326504005 9.784327e+12 NaN NaN 読みたい NaN NaN NaN \n",
	"4 1 4492314776 9.784492e+12 NaN NaN 読みたい NaN NaN NaN \n",
	"\n",
	" 登録日時読了日タイトル作者名 \\\n",
	"0 2016-05-14 01:35:03 NaN コネクトーム:脳の配線はどのように「わたし」をつくり出すのかセバスチャン・スン \n",
	"1 2016-05-14 01:35:23 NaN マーケット進化論経済が解き明かす日本の歴史横山和輝 \n",
	"2 2016-05-14 01:35:58 NaN 貨幣の「新」世界史――ハンムラビ法典からビットコインまでカビールセガール \n",
	"3 2016-05-14 01:41:48 NaN 父が息子に語るマクロ経済学齊藤誠 \n",
	"4 2016-05-14 01:45:58 NaN ヤバすぎる経済学スティーヴン・D・レヴィット \n",
	"\n",
	" 出版社名発行年ジャンルページ数 \n",
	"0 草思社 2015.0 本 504.0 \n",
	"1 日本評論社 2016.0 本 263.0 \n",
	"2 早川書房 2016.0 本 400.0 \n",
	"3 勁草書房 2014.0 本 358.0 \n",
	"4 東洋経済新報社 2016.0 本 436.0 "
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 18
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "0-9KT0Y99NHr",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"target = 'タイトル'"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "AQytYR3z9NHz",
	"colab_type": "code",
	"colab": {}
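	},
	"source": [
	"# mrph_df returns two frames: df_nlp is the token table merged back onto the source rows,\n",
	"# and df_mrph is the token table on its own (sentence and word-level columns only)\n",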
	"df_nlp, df_mrph = mrph_df(df_raw, target=target)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "VyPbNoHQ9NH7",
	"colab_type": "code",
	"colab": {},
	"outputId": "232b105a-0e7e-4ed0-afcb-580da7b5f525"
	},
	"source": [
	"df_nlp.head()"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>サービスID</th>\n",
	" <th>アイテムID</th>\n",
	" <th>13桁ISBN</th>\n",
	" <th>カテゴリ</th>\n",
	" <th>評価</th>\n",
	" <th>読書状況</th>\n",
	" <th>レビュー</th>\n",
	" <th>タグ</th>\n",
	" <th>読書メモ(非公開)</th>\n",
	" <th>登録日時</th>\n",
	" <th>...</th>\n",
	" <th>word_num</th>\n",
	" <th>word_len</th>\n",
	" <th>word</th>\n",
	" <th>orig</th>\n",
	" <th>ent_type</th>\n",
	" <th>pos1</th>\n",
	" <th>pos2</th>\n",
	" <th>pos3</th>\n",
	" <th>係り受け</th>\n",
	" <th>木の深さ</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>4794221657</td>\n",
	" <td>9.784794e+12</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>読みたい</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2016-05-14 01:35:03</td>\n",
	" <td>...</td>\n",
	" <td>16</td>\n",
	" <td>6</td>\n",
	" <td>コネクトーム</td>\n",
	" <td>コネクトーム</td>\n",
	" <td></td>\n",
	" <td>名詞</td>\n",
	" <td>普通名詞</td>\n",
	" <td>一般</td>\n",
	" <td>[]</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>4794221657</td>\n",
	" <td>9.784794e+12</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>読みたい</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2016-05-14 01:35:03</td>\n",
	" <td>...</td>\n",
	" <td>16</td>\n",
	" <td>1</td>\n",
	" <td>:</td>\n",
	" <td>：</td>\n",
	" <td></td>\n",
	" <td>補助記号</td>\n",
	" <td>一般</td>\n",
	" <td>None</td>\n",
	" <td>[]</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>1</td>\n",
	" <td>4794221657</td>\n",
	" <td>9.784794e+12</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>読みたい</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2016-05-14 01:35:03</td>\n",
	" <td>...</td>\n",
	" <td>16</td>\n",
	" <td>1</td>\n",
	" <td>脳</td>\n",
	" <td>脳</td>\n",
	" <td></td>\n",
	" <td>名詞</td>\n",
	" <td>普通名詞</td>\n",
	" <td>一般</td>\n",
	" <td>[]</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>1</td>\n",
	" <td>4794221657</td>\n",
	" <td>9.784794e+12</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>読みたい</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2016-05-14 01:35:03</td>\n",
	" <td>...</td>\n",
	" <td>16</td>\n",
	" <td>1</td>\n",
	" <td>の</td>\n",
	" <td>の</td>\n",
	" <td></td>\n",
	" <td>助詞</td>\n",
	" <td>格助詞</td>\n",
	" <td>None</td>\n",
	" <td>[]</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>1</td>\n",
	" <td>4794221657</td>\n",
	" <td>9.784794e+12</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>読みたい</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>NaN</td>\n",
	" <td>2016-05-14 01:35:03</td>\n",
	" <td>...</td>\n",
	" <td>16</td>\n",
	" <td>2</td>\n",
	" <td>配線</td>\n",
	" <td>配線</td>\n",
	" <td></td>\n",
	" <td>名詞</td>\n",
	" <td>普通名詞</td>\n",
	" <td>サ変可能</td>\n",
	" <td>[]</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>5 rows × 28 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" サービスID アイテムID 13桁ISBN カテゴリ評価読書状況レビュータグ読書メモ(非公開) \\\n",
	"0 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
	"1 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
	"2 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
	"3 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
	"4 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
	"\n",
	" 登録日時 ... word_num word_len word orig ent_type pos1 \\\n",
	"0 2016-05-14 01:35:03 ... 16 6 コネクトームコネクトーム名詞 \n",
	"1 2016-05-14 01:35:03 ... 16 1 : ：補助記号 \n",
	"2 2016-05-14 01:35:03 ... 16 1 脳脳名詞 \n",
	"3 2016-05-14 01:35:03 ... 16 1 のの助詞 \n",
	"4 2016-05-14 01:35:03 ... 16 2 配線配線名詞 \n",
	"\n",
	" pos2 pos3 係り受け木の深さ \n",
	"0 普通名詞一般 [] 1 \n",
	"1 一般 None [] 1 \n",
	"2 普通名詞一般 [] 1 \n",
	"3 格助詞 None [] 1 \n",
	"4 普通名詞サ変可能 [] 1 \n",
	"\n",
	"[5 rows x 28 columns]"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 26
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "Oimfehhf9NID",
	"colab_type": "code",
	"colab": {},
	"outputId": "23b8164c-2368-4726-994c-90c97081ae2a"
	},
	"source": [
	"df_mrph.shape"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"(2760, 11)"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 27
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "1KuTDBhu9NIJ",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"df_mrph.to_csv('mrph_booklog.csv', index=False)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "F40Zhhes9NIS",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	""
	],
	"execution_count": null,
	"outputs": []
	}
	]
	}