Skip to content

Instantly share code, notes, and snippets.

@RottenFruits
Created September 28, 2019 06:23
Show Gist options
  • Save RottenFruits/acc0708a58086b263f243242dc16c051 to your computer and use it in GitHub Desktop.
Save RottenFruits/acc0708a58086b263f243242dc16c051 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import MeCab\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"class TextAnalyzer: \n",
" def raw_morphological_analize(self, text):\n",
" m = MeCab.Tagger('-Ochasen')\n",
" return(m.parse(text))\n",
" \n",
" def morphological_analize(self, text, target_pos):\n",
" #テキストを形態素解析し指定した品詞のみ取り出して返す\n",
" #hyouso = np.array([])\n",
" genkei = np.array([])\n",
" hinshi = np.array([])\n",
" target_pos_pattern = self.pos_pattern_create(target_pos)\n",
" \n",
" words_tmp = self.raw_morphological_analize(text)\n",
" words = words_tmp.split(\"\\n\")\n",
" for word in words:\n",
" if word.split(\"\\t\")[0] == \"EOS\":\n",
" break\n",
" #print(word.split(\"\\t\"))\n",
" tags = word.split(\"\\t\")\n",
" if target_pos_pattern.search(tags[3]) != None: #指定した品詞のみ取り出す\n",
" #hyouso.append(tags[0]) #表層形\n",
" #tags[1] #ヨミ\n",
" genkei = np.append(genkei, tags[2]) #原型\n",
" hinshi = np.append(hinshi, tags[3].split(\"-\")[0]) #品詞\n",
" #tags[4] #活用形\n",
" #tags[5] #活用型\n",
" #print(len(w.split(\"\\t\"))) \n",
" return(np.array([genkei, hinshi]))\n",
" \n",
" def pos_pattern_create(self, pos):\n",
" #品詞絞る用の正規表現パターン生成\n",
" import re\n",
" i = 0\n",
" for p in pos:\n",
" pos[i] = \"^\" + p\n",
" i += 1\n",
" pos = \"|\".join(pos)\n",
" pos_pattern = re.compile(pos)\n",
" return(pos_pattern)\n",
" \n",
" def base_bi_gram_create(self, arr):\n",
" #単純にバイグラムを作る\n",
" i = 0\n",
" bi_gram = np.empty((0, 2), int)\n",
" while i < (arr.size - 1):\n",
" bi_gram = np.append(bi_gram, np.array([[arr[i], arr[i+1]]]), axis = 0)\n",
" i += 1\n",
" return(bi_gram)\n",
" \n",
" def bi_gram_pos_order_filter(self, genkei, hinshi, target_pos_order):\n",
" #指定した品詞順のバイグラムパターンを取り出して返す \n",
" if genkei.size == 0:\n",
" print(\"array is null\")\n",
" return\n",
" bi_gram_genkei = np.empty((0, 2), int)\n",
" i = 0\n",
" while i < (genkei.size - 1):\n",
" if hinshi[i] == target_pos_order[0] and hinshi[i+1] == target_pos_order[1]:\n",
" bi_gram_genkei = np.append(bi_gram_genkei, np.array([[genkei[i], genkei[i+1]]]), axis = 0)\n",
" g = np.array([])\n",
" i += 1\n",
" return(bi_gram_genkei)\n",
" \n",
" def bi_gram_pos_order_filter2(self, genkei, hinshi, target_pos_order):\n",
" #指定した品詞順のバイグラムパターンを取り出して返す \n",
" #同じ品詞が連続している場合結合する\n",
" if genkei.size == 0:\n",
" print(\"array is null\")\n",
" return\n",
" bi_gram_genkei = np.empty((0, 2), int)\n",
" g = np.array([])\n",
" i = 0\n",
" while i < (genkei.size - 1):\n",
" if hinshi[i] == target_pos_order[0] and hinshi[i+1] == target_pos_order[0]:\n",
" if g.size == 0:\n",
" g = genkei[i]\n",
" g = np.append(g, genkei[i+1])\n",
" if hinshi[i] == target_pos_order[0] and hinshi[i+1] == target_pos_order[1]:\n",
" if g.size == 0:\n",
" g = genkei[i]\n",
" if g.size >= 2:\n",
" g = \"-\".join(g)\n",
" bi_gram_genkei = np.append(bi_gram_genkei, np.array([[g, genkei[i+1]]]), axis = 0)\n",
" g = np.array([])\n",
" i += 1\n",
" return(bi_gram_genkei)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['どこ' '越す']\n",
" ['越す' '住む']\n",
" ['住む' '悟る']\n",
" ['悟る' '時']\n",
" ['時' '詩']\n",
" ['詩' '生れる']\n",
" ['生れる' '画']\n",
" ['画' '出来る']]\n",
"[['名詞' '動詞']\n",
" ['動詞' '動詞']\n",
" ['動詞' '動詞']\n",
" ['動詞' '名詞']\n",
" ['名詞' '名詞']\n",
" ['名詞' '動詞']\n",
" ['動詞' '名詞']\n",
" ['名詞' '動詞']]\n",
"[['どこ' '越す']\n",
" ['詩' '生れる']\n",
" ['画' '出来る']]\n",
" 0 1\n",
"0 どこ 越す\n",
"1 詩 生れる\n",
"2 画 出来る\n",
" 0 1\n",
"0 どこ 越す\n",
"1 時-詩 生れる\n",
"2 画 出来る\n"
]
}
],
"source": [
"if __name__ == '__main__':\n",
" text = \"どこへ越しても住みにくいと悟った時、詩が生れて、画えが出来る。\"\n",
"\n",
" ta = TextAnalyzer()\n",
" genkei, hinshi = ta.morphological_analize(text, [\"名詞\", \"動詞\"])\n",
" \n",
" print(ta.base_bi_gram_create(genkei))\n",
" print(ta.base_bi_gram_create(hinshi))\n",
" \n",
" print(ta.bi_gram_pos_order_filter(genkei, hinshi, [\"名詞\", \"動詞\"]))\n",
" \n",
" print(pd.DataFrame(ta.bi_gram_pos_order_filter(genkei, hinshi, [\"名詞\", \"動詞\"])))\n",
" \n",
" print(pd.DataFrame(ta.bi_gram_pos_order_filter2(genkei, hinshi, [\"名詞\", \"動詞\"])))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['どこ', '越す', '住む', '悟る', '時', '詩', '生れる', '画', '出来る'], dtype='<U32')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"genkei"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>どこ</td>\n",
" <td>越す</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>時-詩</td>\n",
" <td>生れる</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>画</td>\n",
" <td>出来る</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1\n",
"0 どこ 越す\n",
"1 時-詩 生れる\n",
"2 画 出来る"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(ta.bi_gram_pos_order_filter2(genkei, hinshi, [\"名詞\", \"動詞\"]))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([['どこ', '越す'],\n",
" ['時-詩', '生れる'],\n",
" ['画', '出来る']], dtype='<U21')"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ta.bi_gram_pos_order_filter2(genkei, hinshi, [\"名詞\", \"動詞\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment