Created
July 25, 2020 07:54
-
-
Save uni-3/8f743303ffb71ecf824859ab9a7258bb to your computer and use it in GitHub Desktop.
ginza_mrph.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
}, | |
"colab": { | |
"name": "ginza_mrph.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"include_colab_link": true | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/uni-3/8f743303ffb71ecf824859ab9a7258bb/ginza_mrph.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_7Te7T2t9NGL", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import spacy\n", | |
"import pandas as pd" | |
], | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "gXZpJ_O49NGf", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"#### init ginza" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"collapsed": true, | |
"jupyter": { | |
"outputs_hidden": true | |
}, | |
"id": "XMa-PMst9NGh", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"!pip install -U ginza" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "bO0709P89NGx", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 61 | |
}, | |
"outputId": "c14ad25e-31af-4f74-d4ef-7d2455d1843a" | |
}, | |
"source": [ | |
"#https://megagonlabs.github.io/ginza/\n", | |
"!ginza -i" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"system.dic is already extracted: /usr/local/lib/python3.6/dist-packages/ja_ginza_dict/sudachidict/system.dic.tar.xz\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "cr0n-Phh9NG_", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"class ginza_nlp():\n", | |
"    \"\"\"Wrap the GiNZA spaCy pipeline and tabulate per-token features.\"\"\"\n", | |
"\n", | |
"    def __init__(self):\n", | |
"        # Requires the 'ja_ginza' model package to be installed.\n", | |
"        self.nlp = spacy.load('ja_ginza')\n", | |
"\n", | |
"    def mrph(self, text: str) -> pd.DataFrame:\n", | |
"        \"\"\"\n", | |
"        Run morphological analysis on one text and return one row per token.\n", | |
"\n", | |
"        Parameters\n", | |
"        ----------\n", | |
"        text : str\n", | |
"            Text to analyze.\n", | |
"\n", | |
"        Returns\n", | |
"        -------\n", | |
"        rows : pd.DataFrame\n", | |
"            One row per token. Columns are positional (unnamed), in this order:\n", | |
"            analyzed text, token count, token length, surface form, lemma,\n", | |
"            entity type, POS level 1, POS level 2, POS level 3,\n", | |
"            list of 'token->child' dependency strings, dependency tree depth.\n", | |
"            An empty frame is returned when the text yields no tokens.\n", | |
"        \"\"\"\n", | |
"        doc = self.nlp(text)\n", | |
"\n", | |
"        # The whole document is handled as a single span (no sentence splitting).\n", | |
"        sent = doc[:]\n", | |
"        n_tokens = len(sent)\n", | |
"        if n_tokens == 0:\n", | |
"            return pd.DataFrame([])\n", | |
"\n", | |
"        # These values are the same for every token, so compute them once.\n", | |
"        sent_text = sent.text\n", | |
"        depth = self.tree_depth(sent.root)\n", | |
"\n", | |
"        rows = []\n", | |
"        for token in sent:\n", | |
"            pos1, pos2, pos3 = self.ex_pos(token.tag_.split('-'))\n", | |
"            deps = [token.text + '->' + child.text for child in token.children]\n", | |
"            rows.append([\n", | |
"                sent_text, n_tokens, len(token),\n", | |
"                token.orth_, token.lemma_,\n", | |
"                token.ent_type_, pos1, pos2, pos3,\n", | |
"                deps,\n", | |
"                depth,\n", | |
"            ])\n", | |
"\n", | |
"        return pd.DataFrame(rows)\n", | |
"\n", | |
"    def ex_pos(self, pos: list) -> tuple:\n", | |
"        \"\"\"\n", | |
"        Split a POS tag list into exactly three levels.\n", | |
"\n", | |
"        Parameters\n", | |
"        ----------\n", | |
"        pos : list\n", | |
"            POS components, e.g. the tag string split on '-'.\n", | |
"\n", | |
"        Returns\n", | |
"        -------\n", | |
"        tuple\n", | |
"            (pos1, pos2, pos3); levels that are missing from ``pos`` are None.\n", | |
"        \"\"\"\n", | |
"        try:\n", | |
"            levels = list(pos[:3])\n", | |
"        except TypeError:\n", | |
"            # Not sliceable (e.g. None): treat as having no POS information.\n", | |
"            levels = []\n", | |
"        levels += [None] * (3 - len(levels))\n", | |
"        return tuple(levels)\n", | |
"\n", | |
"    def tree_depth(self, token) -> int:\n", | |
"        \"\"\"\n", | |
"        Return the depth of the dependency subtree rooted at ``token``.\n", | |
"\n", | |
"        A token without children has depth 1; otherwise the depth is one more\n", | |
"        than the deepest child subtree.\n", | |
"\n", | |
"        Parameters\n", | |
"        ----------\n", | |
"        token\n", | |
"            Any object exposing an iterable ``children`` attribute\n", | |
"            (a spaCy token in normal use).\n", | |
"        \"\"\"\n", | |
"        children = list(token.children)\n", | |
"        if not children:\n", | |
"            return 1\n", | |
"        # The recursion must go through self; a bare tree_depth() is undefined here.\n", | |
"        return 1 + max(self.tree_depth(child) for child in children)\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "O0UYtN5M9NHH", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import concurrent.futures\n", | |
"\n", | |
"def mrph_df(df: pd.DataFrame, target=None, mrph_cols=('sent', 'word_num', 'word_len', 'word', 'orig',\n", | |
"                                                      'ent_type', 'pos1', 'pos2', 'pos3', '係り受け', '木の深さ')) -> tuple:\n", | |
"    \"\"\"\n", | |
"    Analyze the ``target`` text column of ``df`` and join the tokens back to ``df``.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    df : pd.DataFrame\n", | |
"        Source data.\n", | |
"    target : str\n", | |
"        Name of the text column to analyze. Required despite the None default.\n", | |
"    mrph_cols : sequence of str\n", | |
"        Column names for the token frame, in the order produced by ``ginza_nlp.mrph``.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    (df_nlp, df_m) : tuple of pd.DataFrame\n", | |
"        ``df_nlp`` repeats each source row once per token of its own text;\n", | |
"        ``df_m`` holds the token rows only.\n", | |
"    \"\"\"\n", | |
"    gn = ginza_nlp()\n", | |
"    # NOTE(review): gn.mrph is a bound method, so gn is pickled to reach the worker\n", | |
"    # processes; confirm this is acceptable for the loaded model.\n", | |
"    with concurrent.futures.ProcessPoolExecutor() as executor:\n", | |
"        mrph_list = list(executor.map(gn.mrph, df[target]))\n", | |
"\n", | |
"    if mrph_list:\n", | |
"        df_m = pd.concat(mrph_list, ignore_index=True)\n", | |
"        df_m.columns = list(mrph_cols)\n", | |
"    else:\n", | |
"        # pd.concat raises on an empty list, so build the empty frame directly.\n", | |
"        df_m = pd.DataFrame(columns=list(mrph_cols))\n", | |
"\n", | |
"    # Join on the source row position instead of the text itself: joining on text\n", | |
"    # multiplies rows when two source rows share the same text and drops rows whose\n", | |
"    # analyzed text differs from the original string.\n", | |
"    row_key = '__mrph_row__'\n", | |
"    row_ids = [pos for pos, frame in enumerate(mrph_list) for _ in range(len(frame))]\n", | |
"    tokens = df_m.assign(**{row_key: pd.Series(row_ids, index=df_m.index, dtype='int64')})\n", | |
"    keyed = df.assign(**{row_key: range(len(df))})\n", | |
"\n", | |
"    df_nlp = keyed.merge(tokens, on=row_key).drop(columns=row_key)\n", | |
"    return df_nlp, df_m" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "dXmOhumJ9NHR", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "gQRz01Sz9NHa", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"#### load data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "xfSz33XD9NHc", | |
"colab_type": "code", | |
"colab": {}, | |
"outputId": "90f64402-a76d-4dc3-c856-d45f55c94308" | |
}, | |
"source": [ | |
"# Column names for booklog.csv, supplied explicitly through names= below.\n", | |
"cols = [\"サービスID\", \"アイテムID\", \"13桁ISBN\", \"カテゴリ\",\n", | |
"        \"評価\", \"読書状況\", \"レビュー\", \"タグ\", \"読書メモ(非公開)\",\n", | |
"        \"登録日時\", \"読了日\", \"タイトル\", \"作者名\", \"出版社名\", \"発行年\", \"ジャンル\", \"ページ数\"]\n", | |
"# Read the ISBN as text: parsed as a number it becomes a float (shown as 9.784794e+12).\n", | |
"df_raw = pd.read_csv('./booklog.csv', encoding=\"shift-jis\", names=cols,\n", | |
"                     dtype={\"13桁ISBN\": str})\n", | |
"df_raw.shape" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(252, 17)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 17 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7m5ow5iU9NHj", | |
"colab_type": "code", | |
"colab": {}, | |
"outputId": "02822b22-043a-4b3d-c1c2-39cd84b33408" | |
}, | |
"source": [ | |
"df_raw.head()" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>サービスID</th>\n", | |
" <th>アイテムID</th>\n", | |
" <th>13桁ISBN</th>\n", | |
" <th>カテゴリ</th>\n", | |
" <th>評価</th>\n", | |
" <th>読書状況</th>\n", | |
" <th>レビュー</th>\n", | |
" <th>タグ</th>\n", | |
" <th>読書メモ(非公開)</th>\n", | |
" <th>登録日時</th>\n", | |
" <th>読了日</th>\n", | |
" <th>タイトル</th>\n", | |
" <th>作者名</th>\n", | |
" <th>出版社名</th>\n", | |
" <th>発行年</th>\n", | |
" <th>ジャンル</th>\n", | |
" <th>ページ数</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>4794221657</td>\n", | |
" <td>9.784794e+12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>読みたい</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-05-14 01:35:03</td>\n", | |
" <td>NaN</td>\n", | |
" <td>コネクトーム:脳の配線はどのように「わたし」をつくり出すのか</td>\n", | |
" <td>セバスチャン・スン</td>\n", | |
" <td>草思社</td>\n", | |
" <td>2015.0</td>\n", | |
" <td>本</td>\n", | |
" <td>504.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>4535558140</td>\n", | |
" <td>9.784536e+12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>読みたい</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-05-14 01:35:23</td>\n", | |
" <td>NaN</td>\n", | |
" <td>マーケット進化論 経済が解き明かす日本の歴史</td>\n", | |
" <td>横山 和輝</td>\n", | |
" <td>日本評論社</td>\n", | |
" <td>2016.0</td>\n", | |
" <td>本</td>\n", | |
" <td>263.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>415209611X</td>\n", | |
" <td>9.784152e+12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>読みたい</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-05-14 01:35:58</td>\n", | |
" <td>NaN</td>\n", | |
" <td>貨幣の「新」世界史――ハンムラビ法典からビットコインまで</td>\n", | |
" <td>カビール セガール</td>\n", | |
" <td>早川書房</td>\n", | |
" <td>2016.0</td>\n", | |
" <td>本</td>\n", | |
" <td>400.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>4326504005</td>\n", | |
" <td>9.784327e+12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>読みたい</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-05-14 01:41:48</td>\n", | |
" <td>NaN</td>\n", | |
" <td>父が息子に語るマクロ経済学</td>\n", | |
" <td>齊藤 誠</td>\n", | |
" <td>勁草書房</td>\n", | |
" <td>2014.0</td>\n", | |
" <td>本</td>\n", | |
" <td>358.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>4492314776</td>\n", | |
" <td>9.784492e+12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>読みたい</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-05-14 01:45:58</td>\n", | |
" <td>NaN</td>\n", | |
" <td>ヤバすぎる経済学</td>\n", | |
" <td>スティーヴン・D・レヴィット</td>\n", | |
" <td>東洋経済新報社</td>\n", | |
" <td>2016.0</td>\n", | |
" <td>本</td>\n", | |
" <td>436.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" サービスID アイテムID 13桁ISBN カテゴリ 評価 読書状況 レビュー タグ 読書メモ(非公開) \\\n", | |
"0 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n", | |
"1 1 4535558140 9.784536e+12 NaN NaN 読みたい NaN NaN NaN \n", | |
"2 1 415209611X 9.784152e+12 NaN NaN 読みたい NaN NaN NaN \n", | |
"3 1 4326504005 9.784327e+12 NaN NaN 読みたい NaN NaN NaN \n", | |
"4 1 4492314776 9.784492e+12 NaN NaN 読みたい NaN NaN NaN \n", | |
"\n", | |
" 登録日時 読了日 タイトル 作者名 \\\n", | |
"0 2016-05-14 01:35:03 NaN コネクトーム:脳の配線はどのように「わたし」をつくり出すのか セバスチャン・スン \n", | |
"1 2016-05-14 01:35:23 NaN マーケット進化論 経済が解き明かす日本の歴史 横山 和輝 \n", | |
"2 2016-05-14 01:35:58 NaN 貨幣の「新」世界史――ハンムラビ法典からビットコインまで カビール セガール \n", | |
"3 2016-05-14 01:41:48 NaN 父が息子に語るマクロ経済学 齊藤 誠 \n", | |
"4 2016-05-14 01:45:58 NaN ヤバすぎる経済学 スティーヴン・D・レヴィット \n", | |
"\n", | |
" 出版社名 発行年 ジャンル ページ数 \n", | |
"0 草思社 2015.0 本 504.0 \n", | |
"1 日本評論社 2016.0 本 263.0 \n", | |
"2 早川書房 2016.0 本 400.0 \n", | |
"3 勁草書房 2014.0 本 358.0 \n", | |
"4 東洋経済新報社 2016.0 本 436.0 " | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 18 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0-9KT0Y99NHr", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"target = 'タイトル'" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "AQytYR3z9NHz", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# df_nlp: token rows merged with the original data; df_mrph: sentence and token information only\n", | |
"df_nlp, df_mrph = mrph_df(df_raw, target=target)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "VyPbNoHQ9NH7", | |
"colab_type": "code", | |
"colab": {}, | |
"outputId": "232b105a-0e7e-4ed0-afcb-580da7b5f525" | |
}, | |
"source": [ | |
"df_nlp.head()" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>サービスID</th>\n", | |
" <th>アイテムID</th>\n", | |
" <th>13桁ISBN</th>\n", | |
" <th>カテゴリ</th>\n", | |
" <th>評価</th>\n", | |
" <th>読書状況</th>\n", | |
" <th>レビュー</th>\n", | |
" <th>タグ</th>\n", | |
" <th>読書メモ(非公開)</th>\n", | |
" <th>登録日時</th>\n", | |
" <th>...</th>\n", | |
" <th>word_num</th>\n", | |
" <th>word_len</th>\n", | |
" <th>word</th>\n", | |
" <th>orig</th>\n", | |
" <th>ent_type</th>\n", | |
" <th>pos1</th>\n", | |
" <th>pos2</th>\n", | |
" <th>pos3</th>\n", | |
" <th>係り受け</th>\n", | |
" <th>木の深さ</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>4794221657</td>\n", | |
" <td>9.784794e+12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>読みたい</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-05-14 01:35:03</td>\n", | |
" <td>...</td>\n", | |
" <td>16</td>\n", | |
" <td>6</td>\n", | |
" <td>コネクトーム</td>\n", | |
" <td>コネクトーム</td>\n", | |
" <td></td>\n", | |
" <td>名詞</td>\n", | |
" <td>普通名詞</td>\n", | |
" <td>一般</td>\n", | |
" <td>[]</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>4794221657</td>\n", | |
" <td>9.784794e+12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>読みたい</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-05-14 01:35:03</td>\n", | |
" <td>...</td>\n", | |
" <td>16</td>\n", | |
" <td>1</td>\n", | |
" <td>:</td>\n", | |
" <td>:</td>\n", | |
" <td></td>\n", | |
" <td>補助記号</td>\n", | |
" <td>一般</td>\n", | |
" <td>None</td>\n", | |
" <td>[]</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>4794221657</td>\n", | |
" <td>9.784794e+12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>読みたい</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-05-14 01:35:03</td>\n", | |
" <td>...</td>\n", | |
" <td>16</td>\n", | |
" <td>1</td>\n", | |
" <td>脳</td>\n", | |
" <td>脳</td>\n", | |
" <td></td>\n", | |
" <td>名詞</td>\n", | |
" <td>普通名詞</td>\n", | |
" <td>一般</td>\n", | |
" <td>[]</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>4794221657</td>\n", | |
" <td>9.784794e+12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>読みたい</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-05-14 01:35:03</td>\n", | |
" <td>...</td>\n", | |
" <td>16</td>\n", | |
" <td>1</td>\n", | |
" <td>の</td>\n", | |
" <td>の</td>\n", | |
" <td></td>\n", | |
" <td>助詞</td>\n", | |
" <td>格助詞</td>\n", | |
" <td>None</td>\n", | |
" <td>[]</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>4794221657</td>\n", | |
" <td>9.784794e+12</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>読みたい</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>2016-05-14 01:35:03</td>\n", | |
" <td>...</td>\n", | |
" <td>16</td>\n", | |
" <td>2</td>\n", | |
" <td>配線</td>\n", | |
" <td>配線</td>\n", | |
" <td></td>\n", | |
" <td>名詞</td>\n", | |
" <td>普通名詞</td>\n", | |
" <td>サ変可能</td>\n", | |
" <td>[]</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 28 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" サービスID アイテムID 13桁ISBN カテゴリ 評価 読書状況 レビュー タグ 読書メモ(非公開) \\\n", | |
"0 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n", | |
"1 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n", | |
"2 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n", | |
"3 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n", | |
"4 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n", | |
"\n", | |
" 登録日時 ... word_num word_len word orig ent_type pos1 \\\n", | |
"0 2016-05-14 01:35:03 ... 16 6 コネクトーム コネクトーム 名詞 \n", | |
"1 2016-05-14 01:35:03 ... 16 1 : : 補助記号 \n", | |
"2 2016-05-14 01:35:03 ... 16 1 脳 脳 名詞 \n", | |
"3 2016-05-14 01:35:03 ... 16 1 の の 助詞 \n", | |
"4 2016-05-14 01:35:03 ... 16 2 配線 配線 名詞 \n", | |
"\n", | |
" pos2 pos3 係り受け 木の深さ \n", | |
"0 普通名詞 一般 [] 1 \n", | |
"1 一般 None [] 1 \n", | |
"2 普通名詞 一般 [] 1 \n", | |
"3 格助詞 None [] 1 \n", | |
"4 普通名詞 サ変可能 [] 1 \n", | |
"\n", | |
"[5 rows x 28 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 26 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Oimfehhf9NID", | |
"colab_type": "code", | |
"colab": {}, | |
"outputId": "23b8164c-2368-4726-994c-90c97081ae2a" | |
}, | |
"source": [ | |
"df_mrph.shape" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(2760, 11)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 27 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1KuTDBhu9NIJ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"df_mrph.to_csv('mrph_booklog.csv', index=False)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "F40Zhhes9NIS", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment