Created
September 28, 2019 06:23
-
-
Save RottenFruits/acc0708a58086b263f243242dc16c051 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import MeCab\n", | |
"import numpy as np\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class TextAnalyzer: \n", | |
" def raw_morphological_analize(self, text):\n", | |
" m = MeCab.Tagger('-Ochasen')\n", | |
" return(m.parse(text))\n", | |
" \n", | |
" def morphological_analize(self, text, target_pos):\n", | |
" #テキストを形態素解析し指定した品詞のみ取り出して返す\n", | |
" #hyouso = np.array([])\n", | |
" genkei = np.array([])\n", | |
" hinshi = np.array([])\n", | |
" target_pos_pattern = self.pos_pattern_create(target_pos)\n", | |
" \n", | |
" words_tmp = self.raw_morphological_analize(text)\n", | |
" words = words_tmp.split(\"\\n\")\n", | |
" for word in words:\n", | |
" if word.split(\"\\t\")[0] == \"EOS\":\n", | |
" break\n", | |
" #print(word.split(\"\\t\"))\n", | |
" tags = word.split(\"\\t\")\n", | |
" if target_pos_pattern.search(tags[3]) != None: #指定した品詞のみ取り出す\n", | |
" #hyouso.append(tags[0]) #表層形\n", | |
" #tags[1] #ヨミ\n", | |
" genkei = np.append(genkei, tags[2]) #原形\n", | |
" hinshi = np.append(hinshi, tags[3].split(\"-\")[0]) #品詞\n", | |
" #tags[4] #活用型\n", | |
" #tags[5] #活用形\n", | |
" #print(len(w.split(\"\\t\"))) \n", | |
" return(np.array([genkei, hinshi]))\n", | |
" \n", | |
" def pos_pattern_create(self, pos):\n", | |
" #品詞絞る用の正規表現パターン生成\n", | |
" import re\n", | |
" i = 0\n", | |
" for p in pos:\n", | |
" pos[i] = \"^\" + p\n", | |
" i += 1\n", | |
" pos = \"|\".join(pos)\n", | |
" pos_pattern = re.compile(pos)\n", | |
" return(pos_pattern)\n", | |
" \n", | |
" def base_bi_gram_create(self, arr):\n", | |
" #単純にバイグラムを作る\n", | |
" i = 0\n", | |
" bi_gram = np.empty((0, 2), int)\n", | |
" while i < (arr.size - 1):\n", | |
" bi_gram = np.append(bi_gram, np.array([[arr[i], arr[i+1]]]), axis = 0)\n", | |
" i += 1\n", | |
" return(bi_gram)\n", | |
" \n", | |
" def bi_gram_pos_order_filter(self, genkei, hinshi, target_pos_order):\n", | |
" #指定した品詞順のバイグラムパターンを取り出して返す \n", | |
" if genkei.size == 0:\n", | |
" print(\"array is null\")\n", | |
" return\n", | |
" bi_gram_genkei = np.empty((0, 2), int)\n", | |
" i = 0\n", | |
" while i < (genkei.size - 1):\n", | |
" if hinshi[i] == target_pos_order[0] and hinshi[i+1] == target_pos_order[1]:\n", | |
" bi_gram_genkei = np.append(bi_gram_genkei, np.array([[genkei[i], genkei[i+1]]]), axis = 0)\n", | |
" g = np.array([])\n", | |
" i += 1\n", | |
" return(bi_gram_genkei)\n", | |
" \n", | |
" def bi_gram_pos_order_filter2(self, genkei, hinshi, target_pos_order):\n", | |
" #指定した品詞順のバイグラムパターンを取り出して返す \n", | |
" #同じ品詞が連続している場合結合する\n", | |
" if genkei.size == 0:\n", | |
" print(\"array is null\")\n", | |
" return\n", | |
" bi_gram_genkei = np.empty((0, 2), int)\n", | |
" g = np.array([])\n", | |
" i = 0\n", | |
" while i < (genkei.size - 1):\n", | |
" if hinshi[i] == target_pos_order[0] and hinshi[i+1] == target_pos_order[0]:\n", | |
" if g.size == 0:\n", | |
" g = genkei[i]\n", | |
" g = np.append(g, genkei[i+1])\n", | |
" if hinshi[i] == target_pos_order[0] and hinshi[i+1] == target_pos_order[1]:\n", | |
" if g.size == 0:\n", | |
" g = genkei[i]\n", | |
" if g.size >= 2:\n", | |
" g = \"-\".join(g)\n", | |
" bi_gram_genkei = np.append(bi_gram_genkei, np.array([[g, genkei[i+1]]]), axis = 0)\n", | |
" g = np.array([])\n", | |
" i += 1\n", | |
" return(bi_gram_genkei)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[['どこ' '越す']\n", | |
" ['越す' '住む']\n", | |
" ['住む' '悟る']\n", | |
" ['悟る' '時']\n", | |
" ['時' '詩']\n", | |
" ['詩' '生れる']\n", | |
" ['生れる' '画']\n", | |
" ['画' '出来る']]\n", | |
"[['名詞' '動詞']\n", | |
" ['動詞' '動詞']\n", | |
" ['動詞' '動詞']\n", | |
" ['動詞' '名詞']\n", | |
" ['名詞' '名詞']\n", | |
" ['名詞' '動詞']\n", | |
" ['動詞' '名詞']\n", | |
" ['名詞' '動詞']]\n", | |
"[['どこ' '越す']\n", | |
" ['詩' '生れる']\n", | |
" ['画' '出来る']]\n", | |
" 0 1\n", | |
"0 どこ 越す\n", | |
"1 詩 生れる\n", | |
"2 画 出来る\n", | |
" 0 1\n", | |
"0 どこ 越す\n", | |
"1 時-詩 生れる\n", | |
"2 画 出来る\n" | |
] | |
} | |
], | |
"source": [ | |
"if __name__ == '__main__':\n", | |
" text = \"どこへ越しても住みにくいと悟った時、詩が生れて、画えが出来る。\"\n", | |
"\n", | |
" ta = TextAnalyzer()\n", | |
" genkei, hinshi = ta.morphological_analize(text, [\"名詞\", \"動詞\"])\n", | |
" \n", | |
" print(ta.base_bi_gram_create(genkei))\n", | |
" print(ta.base_bi_gram_create(hinshi))\n", | |
" \n", | |
" print(ta.bi_gram_pos_order_filter(genkei, hinshi, [\"名詞\", \"動詞\"]))\n", | |
" \n", | |
" print(pd.DataFrame(ta.bi_gram_pos_order_filter(genkei, hinshi, [\"名詞\", \"動詞\"])))\n", | |
" \n", | |
" print(pd.DataFrame(ta.bi_gram_pos_order_filter2(genkei, hinshi, [\"名詞\", \"動詞\"])))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(['どこ', '越す', '住む', '悟る', '時', '詩', '生れる', '画', '出来る'], dtype='<U32')" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"genkei" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>どこ</td>\n", | |
" <td>越す</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>時-詩</td>\n", | |
" <td>生れる</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>画</td>\n", | |
" <td>出来る</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 0 1\n", | |
"0 どこ 越す\n", | |
"1 時-詩 生れる\n", | |
"2 画 出来る" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd.DataFrame(ta.bi_gram_pos_order_filter2(genkei, hinshi, [\"名詞\", \"動詞\"]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([['どこ', '越す'],\n", | |
" ['時-詩', '生れる'],\n", | |
" ['画', '出来る']], dtype='<U21')" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ta.bi_gram_pos_order_filter2(genkei, hinshi, [\"名詞\", \"動詞\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment