Skip to content

Instantly share code, notes, and snippets.

@uni-3
Created July 25, 2020 07:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save uni-3/8f743303ffb71ecf824859ab9a7258bb to your computer and use it in GitHub Desktop.
Save uni-3/8f743303ffb71ecf824859ab9a7258bb to your computer and use it in GitHub Desktop.
ginza_mrph.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
},
"colab": {
"name": "ginza_mrph.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/uni-3/8f743303ffb71ecf824859ab9a7258bb/ginza_mrph.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "_7Te7T2t9NGL",
"colab_type": "code",
"colab": {}
},
"source": [
"import spacy\n",
"import pandas as pd"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "gXZpJ_O49NGf",
"colab_type": "text"
},
"source": [
"#### init ginza"
]
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"id": "XMa-PMst9NGh",
"colab_type": "code",
"colab": {}
},
"source": [
"# %pip (unlike !pip) installs into the environment of the running kernel.\n",
"# NOTE(review): the version is not pinned; pin it to the GiNZA release this notebook targets.\n",
"%pip install -U ginza"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bO0709P89NGx",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 61
},
"outputId": "c14ad25e-31af-4f74-d4ef-7d2455d1843a"
},
"source": [
"# GiNZA documentation: https://megagonlabs.github.io/ginza/\n",
"# The recorded output of this step mentions the SudachiDict system.dic archive.\n",
"# NOTE(review): presumably `-i` only initialises the dictionary files - confirm against the installed GiNZA version.\n",
"!ginza -i"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"system.dic is already extracted: /usr/local/lib/python3.6/dist-packages/ja_ginza_dict/sudachidict/system.dic.tar.xz\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "cr0n-Phh9NG_",
"colab_type": "code",
"colab": {}
},
"source": [
"class ginza_nlp():\n",
"    \"\"\"Wrapper around the GiNZA (spaCy) Japanese pipeline that tabulates per-token features.\"\"\"\n",
"\n",
"    def __init__(self):\n",
"        # Load the model once; every mrph() call reuses it.\n",
"        self.nlp = spacy.load('ja_ginza')\n",
"\n",
"    def mrph(self, text: str) -> pd.DataFrame:\n",
"        \"\"\"\n",
"        Run morphological analysis on one text and return one row per token.\n",
"\n",
"        The whole document is handled as a single span (doc[:]); no sentence\n",
"        splitting is applied.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        text : str\n",
"            Text to analyse.\n",
"\n",
"        Returns\n",
"        -------\n",
"        rows : pd.DataFrame\n",
"            One row per token. Columns are positional (unnamed) and correspond, in order, to\n",
"            ['sent', 'word_num', 'word_len', 'word', 'orig',\n",
"             'ent_type', 'pos1', 'pos2', 'pos3', '係り受け', '木の深さ']:\n",
"            span text, number of tokens, len(token), surface form, lemma, entity type,\n",
"            up to three POS levels, a list of 'token->child' strings, and the depth of\n",
"            the dependency tree under the span root.\n",
"            An empty document yields an empty DataFrame.\n",
"        \"\"\"\n",
"        doc = self.nlp(text)\n",
"        sent = doc[:]\n",
"\n",
"        # These values are the same for every token, so compute them once.\n",
"        sent_text = sent.text\n",
"        word_num = len(sent)\n",
"        # Only touch sent.root when there is at least one token.\n",
"        depth = self.tree_depth(sent.root) if word_num else 0\n",
"\n",
"        rows = []\n",
"        for token in sent:\n",
"            pos1, pos2, pos3 = self.ex_pos(token.tag_.split('-'))\n",
"            deps = [token.text + '->' + child.text for child in token.children]\n",
"            rows.append([sent_text, word_num, len(token),\n",
"                         token.orth_, token.lemma_,\n",
"                         token.ent_type_, pos1, pos2, pos3,\n",
"                         deps, depth])\n",
"\n",
"        return pd.DataFrame(rows)\n",
"\n",
"    def ex_pos(self, pos: list) -> (str, str, str):\n",
"        \"\"\"\n",
"        Split a part-of-speech tag list into exactly three levels.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        pos : list\n",
"            POS levels, e.g. the result of token.tag_.split('-'). May hold fewer\n",
"            than three items, or be None.\n",
"\n",
"        Returns\n",
"        -------\n",
"        (pos1, pos2, pos3)\n",
"            The first three items of pos; missing levels are returned as None.\n",
"        \"\"\"\n",
"        levels = list(pos) if pos is not None else []\n",
"        # Pad so that indexing the first three slots is always valid.\n",
"        levels += [None] * 3\n",
"        return levels[0], levels[1], levels[2]\n",
"\n",
"    def tree_depth(self, token: \"spacy.tokens.Token\") -> int:\n",
"        \"\"\"\n",
"        Depth of the dependency subtree rooted at token.\n",
"\n",
"        A token without children has depth 1; otherwise the depth is\n",
"        1 + the depth of its deepest child.\n",
"        \"\"\"\n",
"        # Recurse through self: a bare tree_depth(...) is not resolvable from inside a method.\n",
"        return 1 + max((self.tree_depth(child) for child in token.children), default=0)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "O0UYtN5M9NHH",
"colab_type": "code",
"colab": {}
},
"source": [
"import concurrent.futures\n",
"\n",
"def mrph_df(df: pd.DataFrame, target=None,\n",
"            mrph_cols=['sent', 'word_num', 'word_len', 'word', 'orig',\n",
"                       'ent_type', 'pos1', 'pos2', 'pos3', '係り受け', '木の深さ']\n",
"            ) -> \"tuple[pd.DataFrame, pd.DataFrame]\":\n",
"    \"\"\"\n",
"    Analyse the text column of df with ginza_nlp and join the token rows back onto df.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pd.DataFrame\n",
"        Source data.\n",
"    target : str\n",
"        Name of the column in df that holds the text to analyse. Required.\n",
"    mrph_cols : list of str\n",
"        Labels applied to the columns produced by ginza_nlp.mrph. The first label\n",
"        must stay 'sent' because the merge below joins on that name.\n",
"\n",
"    Returns\n",
"    -------\n",
"    (df_nlp, df_m)\n",
"        df_nlp : df inner-joined with the token rows on the text value.\n",
"        df_m   : the token rows alone, one block of rows per distinct text.\n",
"\n",
"    Raises\n",
"    ------\n",
"    KeyError\n",
"        If target is None or not a column of df.\n",
"    ValueError\n",
"        If the target column holds no non-null text.\n",
"\n",
"    Notes\n",
"    -----\n",
"    The join key is the text itself, so each distinct text is analysed only once.\n",
"    Analysing every row would repeat the token rows of duplicated texts and the\n",
"    merge would then multiply them.\n",
"    \"\"\"\n",
"    import os\n",
"\n",
"    # Fail fast, before the model is loaded, when the column is missing or target was left as None.\n",
"    if target not in df.columns:\n",
"        raise KeyError(\"target column {!r} not found in df\".format(target))\n",
"\n",
"    # Null cells cannot be analysed; they simply get no match in the inner join.\n",
"    texts = df[target].dropna().drop_duplicates().tolist()\n",
"    if not texts:\n",
"        raise ValueError(\"column {!r} contains no text to analyse\".format(target))\n",
"\n",
"    gn = ginza_nlp()\n",
"    # gn.mrph is pickled with each submitted chunk, so larger chunks mean the\n",
"    # model it carries is serialised far fewer times than once per text.\n",
"    workers = os.cpu_count() or 1\n",
"    chunksize = max(1, len(texts) // (workers * 4))\n",
"    with concurrent.futures.ProcessPoolExecutor() as executor:\n",
"        mrph_list = list(executor.map(gn.mrph, texts, chunksize=chunksize))\n",
"\n",
"    df_m = pd.concat(mrph_list, ignore_index=True)\n",
"    df_m.columns = list(mrph_cols)\n",
"\n",
"    df_nlp = df.merge(df_m, left_on=target, right_on=\"sent\")\n",
"    return df_nlp, df_m"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dXmOhumJ9NHR",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "gQRz01Sz9NHa",
"colab_type": "text"
},
"source": [
"#### load data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "xfSz33XD9NHc",
"colab_type": "code",
"colab": {},
"outputId": "90f64402-a76d-4dc3-c856-d45f55c94308"
},
"source": [
"# Column labels for booklog.csv; they are passed via names=, so pandas uses them\n",
"# instead of taking labels from the file.\n",
"cols = [\"サービスID\", \"アイテムID\", \"13桁ISBN\", \"カテゴリ\",\n",
" \"評価\", \"読書状況\", \"レビュー\", \"タグ\", \"読書メモ(非公開)\",\n",
" \"登録日時\", \"読了日\", \"タイトル\", \"作者名\", \"出版社名\", \"発行年\", \"ジャンル\", \"ページ数\"]\n",
"# The file is read from the working directory and decoded as Shift-JIS.\n",
"df_raw = pd.read_csv('./booklog.csv', encoding=\"shift-jis\", names=cols)\n",
"# Show (rows, columns) as the cell output.\n",
"df_raw.shape"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(252, 17)"
]
},
"metadata": {
"tags": []
},
"execution_count": 17
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "7m5ow5iU9NHj",
"colab_type": "code",
"colab": {},
"outputId": "02822b22-043a-4b3d-c1c2-39cd84b33408"
},
"source": [
"# Preview the first rows of the raw booklog.csv data.\n",
"df_raw.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>サービスID</th>\n",
" <th>アイテムID</th>\n",
" <th>13桁ISBN</th>\n",
" <th>カテゴリ</th>\n",
" <th>評価</th>\n",
" <th>読書状況</th>\n",
" <th>レビュー</th>\n",
" <th>タグ</th>\n",
" <th>読書メモ(非公開)</th>\n",
" <th>登録日時</th>\n",
" <th>読了日</th>\n",
" <th>タイトル</th>\n",
" <th>作者名</th>\n",
" <th>出版社名</th>\n",
" <th>発行年</th>\n",
" <th>ジャンル</th>\n",
" <th>ページ数</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>4794221657</td>\n",
" <td>9.784794e+12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>読みたい</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2016-05-14 01:35:03</td>\n",
" <td>NaN</td>\n",
" <td>コネクトーム:脳の配線はどのように「わたし」をつくり出すのか</td>\n",
" <td>セバスチャン・スン</td>\n",
" <td>草思社</td>\n",
" <td>2015.0</td>\n",
" <td>本</td>\n",
" <td>504.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>4535558140</td>\n",
" <td>9.784536e+12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>読みたい</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2016-05-14 01:35:23</td>\n",
" <td>NaN</td>\n",
" <td>マーケット進化論 経済が解き明かす日本の歴史</td>\n",
" <td>横山 和輝</td>\n",
" <td>日本評論社</td>\n",
" <td>2016.0</td>\n",
" <td>本</td>\n",
" <td>263.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>415209611X</td>\n",
" <td>9.784152e+12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>読みたい</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2016-05-14 01:35:58</td>\n",
" <td>NaN</td>\n",
" <td>貨幣の「新」世界史――ハンムラビ法典からビットコインまで</td>\n",
" <td>カビール セガール</td>\n",
" <td>早川書房</td>\n",
" <td>2016.0</td>\n",
" <td>本</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>4326504005</td>\n",
" <td>9.784327e+12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>読みたい</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2016-05-14 01:41:48</td>\n",
" <td>NaN</td>\n",
" <td>父が息子に語るマクロ経済学</td>\n",
" <td>齊藤 誠</td>\n",
" <td>勁草書房</td>\n",
" <td>2014.0</td>\n",
" <td>本</td>\n",
" <td>358.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>4492314776</td>\n",
" <td>9.784492e+12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>読みたい</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2016-05-14 01:45:58</td>\n",
" <td>NaN</td>\n",
" <td>ヤバすぎる経済学</td>\n",
" <td>スティーヴン・D・レヴィット</td>\n",
" <td>東洋経済新報社</td>\n",
" <td>2016.0</td>\n",
" <td>本</td>\n",
" <td>436.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" サービスID アイテムID 13桁ISBN カテゴリ 評価 読書状況 レビュー タグ 読書メモ(非公開) \\\n",
"0 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
"1 1 4535558140 9.784536e+12 NaN NaN 読みたい NaN NaN NaN \n",
"2 1 415209611X 9.784152e+12 NaN NaN 読みたい NaN NaN NaN \n",
"3 1 4326504005 9.784327e+12 NaN NaN 読みたい NaN NaN NaN \n",
"4 1 4492314776 9.784492e+12 NaN NaN 読みたい NaN NaN NaN \n",
"\n",
" 登録日時 読了日 タイトル 作者名 \\\n",
"0 2016-05-14 01:35:03 NaN コネクトーム:脳の配線はどのように「わたし」をつくり出すのか セバスチャン・スン \n",
"1 2016-05-14 01:35:23 NaN マーケット進化論 経済が解き明かす日本の歴史 横山 和輝 \n",
"2 2016-05-14 01:35:58 NaN 貨幣の「新」世界史――ハンムラビ法典からビットコインまで カビール セガール \n",
"3 2016-05-14 01:41:48 NaN 父が息子に語るマクロ経済学 齊藤 誠 \n",
"4 2016-05-14 01:45:58 NaN ヤバすぎる経済学 スティーヴン・D・レヴィット \n",
"\n",
" 出版社名 発行年 ジャンル ページ数 \n",
"0 草思社 2015.0 本 504.0 \n",
"1 日本評論社 2016.0 本 263.0 \n",
"2 早川書房 2016.0 本 400.0 \n",
"3 勁草書房 2014.0 本 358.0 \n",
"4 東洋経済新報社 2016.0 本 436.0 "
]
},
"metadata": {
"tags": []
},
"execution_count": 18
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "0-9KT0Y99NHr",
"colab_type": "code",
"colab": {}
},
"source": [
"# Column of df_raw whose text is analysed (the book title column).\n",
"target = 'タイトル'"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "AQytYR3z9NHz",
"colab_type": "code",
"colab": {}
},
"source": [
"# Two frames come back: the token rows joined onto the original data,\n",
"# and a frame holding only the text and per-token information.\n",
"df_nlp, df_mrph = mrph_df(df_raw, target=target)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "VyPbNoHQ9NH7",
"colab_type": "code",
"colab": {},
"outputId": "232b105a-0e7e-4ed0-afcb-580da7b5f525"
},
"source": [
"# Preview the joined frame: original columns plus one row per token.\n",
"df_nlp.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>サービスID</th>\n",
" <th>アイテムID</th>\n",
" <th>13桁ISBN</th>\n",
" <th>カテゴリ</th>\n",
" <th>評価</th>\n",
" <th>読書状況</th>\n",
" <th>レビュー</th>\n",
" <th>タグ</th>\n",
" <th>読書メモ(非公開)</th>\n",
" <th>登録日時</th>\n",
" <th>...</th>\n",
" <th>word_num</th>\n",
" <th>word_len</th>\n",
" <th>word</th>\n",
" <th>orig</th>\n",
" <th>ent_type</th>\n",
" <th>pos1</th>\n",
" <th>pos2</th>\n",
" <th>pos3</th>\n",
" <th>係り受け</th>\n",
" <th>木の深さ</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>4794221657</td>\n",
" <td>9.784794e+12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>読みたい</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2016-05-14 01:35:03</td>\n",
" <td>...</td>\n",
" <td>16</td>\n",
" <td>6</td>\n",
" <td>コネクトーム</td>\n",
" <td>コネクトーム</td>\n",
" <td></td>\n",
" <td>名詞</td>\n",
" <td>普通名詞</td>\n",
" <td>一般</td>\n",
" <td>[]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>4794221657</td>\n",
" <td>9.784794e+12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>読みたい</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2016-05-14 01:35:03</td>\n",
" <td>...</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>:</td>\n",
" <td>:</td>\n",
" <td></td>\n",
" <td>補助記号</td>\n",
" <td>一般</td>\n",
" <td>None</td>\n",
" <td>[]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>4794221657</td>\n",
" <td>9.784794e+12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>読みたい</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2016-05-14 01:35:03</td>\n",
" <td>...</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>脳</td>\n",
" <td>脳</td>\n",
" <td></td>\n",
" <td>名詞</td>\n",
" <td>普通名詞</td>\n",
" <td>一般</td>\n",
" <td>[]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>4794221657</td>\n",
" <td>9.784794e+12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>読みたい</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2016-05-14 01:35:03</td>\n",
" <td>...</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>の</td>\n",
" <td>の</td>\n",
" <td></td>\n",
" <td>助詞</td>\n",
" <td>格助詞</td>\n",
" <td>None</td>\n",
" <td>[]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>4794221657</td>\n",
" <td>9.784794e+12</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>読みたい</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2016-05-14 01:35:03</td>\n",
" <td>...</td>\n",
" <td>16</td>\n",
" <td>2</td>\n",
" <td>配線</td>\n",
" <td>配線</td>\n",
" <td></td>\n",
" <td>名詞</td>\n",
" <td>普通名詞</td>\n",
" <td>サ変可能</td>\n",
" <td>[]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" サービスID アイテムID 13桁ISBN カテゴリ 評価 読書状況 レビュー タグ 読書メモ(非公開) \\\n",
"0 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
"1 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
"2 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
"3 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
"4 1 4794221657 9.784794e+12 NaN NaN 読みたい NaN NaN NaN \n",
"\n",
" 登録日時 ... word_num word_len word orig ent_type pos1 \\\n",
"0 2016-05-14 01:35:03 ... 16 6 コネクトーム コネクトーム 名詞 \n",
"1 2016-05-14 01:35:03 ... 16 1 : : 補助記号 \n",
"2 2016-05-14 01:35:03 ... 16 1 脳 脳 名詞 \n",
"3 2016-05-14 01:35:03 ... 16 1 の の 助詞 \n",
"4 2016-05-14 01:35:03 ... 16 2 配線 配線 名詞 \n",
"\n",
" pos2 pos3 係り受け 木の深さ \n",
"0 普通名詞 一般 [] 1 \n",
"1 一般 None [] 1 \n",
"2 普通名詞 一般 [] 1 \n",
"3 格助詞 None [] 1 \n",
"4 普通名詞 サ変可能 [] 1 \n",
"\n",
"[5 rows x 28 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 26
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Oimfehhf9NID",
"colab_type": "code",
"colab": {},
"outputId": "23b8164c-2368-4726-994c-90c97081ae2a"
},
"source": [
"# (token rows, feature columns) of the token-only frame.\n",
"df_mrph.shape"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(2760, 11)"
]
},
"metadata": {
"tags": []
},
"execution_count": 27
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "1KuTDBhu9NIJ",
"colab_type": "code",
"colab": {}
},
"source": [
"# Write the token-level frame to the working directory without the index column.\n",
"# The dependency column holds Python lists, which CSV stores as their string form.\n",
"df_mrph.to_csv('mrph_booklog.csv', index=False)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "F40Zhhes9NIS",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment