RottenFruits/TextAnalyzer.ipynb

## TextAnalyzer.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import MeCab\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TextAnalyzer:\n",
    "    \"\"\"MeCab-based helper that extracts base forms and builds part-of-speech bigrams.\"\"\"\n",
    "\n",
    "    def raw_morphological_analize(self, text):\n",
    "        \"\"\"Return the raw output of MeCab run with the '-Ochasen' option on ``text``.\"\"\"\n",
    "        m = MeCab.Tagger('-Ochasen')\n",
    "        return m.parse(text)\n",
    "\n",
    "    def morphological_analize(self, text, target_pos):\n",
    "        \"\"\"Analyse ``text`` and keep only tokens whose part of speech is in ``target_pos``.\n",
    "\n",
    "        Args:\n",
    "            text: Text to analyse.\n",
    "            target_pos: List of part-of-speech prefixes to keep, e.g. ['名詞', '動詞'].\n",
    "                The list itself is left unmodified.\n",
    "\n",
    "        Returns:\n",
    "            A 2-row array: row 0 holds the base forms and row 1 the top-level part of\n",
    "            speech (the text before the first '-') of every kept token.\n",
    "        \"\"\"\n",
    "        target_pos_pattern = self.pos_pattern_create(target_pos)\n",
    "        genkei = []\n",
    "        hinshi = []\n",
    "        for line in self.raw_morphological_analize(text).split(\"\\n\"):\n",
    "            # Tab-separated columns: [0] surface form, [1] reading, [2] base form, [3] part of speech.\n",
    "            tags = line.split(\"\\t\")\n",
    "            if tags[0] == \"EOS\":\n",
    "                break\n",
    "            if len(tags) < 4:\n",
    "                # Blank or short line: there is no part-of-speech column to test.\n",
    "                continue\n",
    "            if target_pos_pattern.search(tags[3]) is not None:\n",
    "                genkei.append(tags[2])\n",
    "                hinshi.append(tags[3].split(\"-\")[0])\n",
    "        # One np.append onto an empty array per row gives the same dtype as appending\n",
    "        # token by token, without copying the array on every iteration.\n",
    "        return np.array([np.append(np.array([]), genkei),\n",
    "                         np.append(np.array([]), hinshi)])\n",
    "\n",
    "    def pos_pattern_create(self, pos):\n",
    "        \"\"\"Compile a regex matching strings that start with any entry of ``pos``.\n",
    "\n",
    "        Each entry is used as a regular-expression fragment prefixed with '^';\n",
    "        the fragments are combined with '|'.\n",
    "        \"\"\"\n",
    "        import re\n",
    "        # Build new strings instead of assigning back into ``pos`` so the caller's\n",
    "        # list stays untouched and can be reused for later calls.\n",
    "        return re.compile(\"|\".join(\"^\" + p for p in pos))\n",
    "\n",
    "    def _stack_pairs(self, pairs):\n",
    "        \"\"\"Convert a list of 2-item lists to an (n, 2) array; an empty (0, 2) int array if none.\"\"\"\n",
    "        stacked = np.empty((0, 2), int)\n",
    "        if pairs:\n",
    "            stacked = np.append(stacked, np.array(pairs), axis=0)\n",
    "        return stacked\n",
    "\n",
    "    def base_bi_gram_create(self, arr):\n",
    "        \"\"\"Return every adjacent pair of the 1-D array ``arr`` as an (n-1, 2) array.\"\"\"\n",
    "        return self._stack_pairs([[arr[i], arr[i + 1]] for i in range(arr.size - 1)])\n",
    "\n",
    "    def bi_gram_pos_order_filter(self, genkei, hinshi, target_pos_order):\n",
    "        \"\"\"Return the base-form bigrams whose parts of speech follow ``target_pos_order``.\n",
    "\n",
    "        Args:\n",
    "            genkei: Array of base forms.\n",
    "            hinshi: Array of parts of speech, index-aligned with ``genkei``.\n",
    "            target_pos_order: Two-item sequence (POS of first token, POS of second token).\n",
    "\n",
    "        Returns:\n",
    "            An (n, 2) array of matching pairs, or None (after printing 'array is null')\n",
    "            when ``genkei`` is empty.\n",
    "        \"\"\"\n",
    "        if genkei.size == 0:\n",
    "            print(\"array is null\")\n",
    "            return None\n",
    "        first_pos, second_pos = target_pos_order[0], target_pos_order[1]\n",
    "        pairs = [\n",
    "            [genkei[i], genkei[i + 1]]\n",
    "            for i in range(genkei.size - 1)\n",
    "            if hinshi[i] == first_pos and hinshi[i + 1] == second_pos\n",
    "        ]\n",
    "        return self._stack_pairs(pairs)\n",
    "\n",
    "    def bi_gram_pos_order_filter2(self, genkei, hinshi, target_pos_order):\n",
    "        \"\"\"Like bi_gram_pos_order_filter, but merge a run of first-POS tokens with '-'.\n",
    "\n",
    "        Consecutive tokens tagged ``target_pos_order[0]`` that directly precede a\n",
    "        ``target_pos_order[1]`` token are joined with '-' and used as the first element\n",
    "        of the pair.\n",
    "\n",
    "        Returns:\n",
    "            An (n, 2) array of pairs, or None (after printing 'array is null') when\n",
    "            ``genkei`` is empty.\n",
    "        \"\"\"\n",
    "        if genkei.size == 0:\n",
    "            print(\"array is null\")\n",
    "            return None\n",
    "        first_pos, second_pos = target_pos_order[0], target_pos_order[1]\n",
    "        pairs = []\n",
    "        run = []\n",
    "        for i in range(genkei.size - 1):\n",
    "            if hinshi[i] != first_pos:\n",
    "                # Any other part of speech ends the current run; dropping it here keeps\n",
    "                # stale tokens from being attached to a later, unrelated pair.\n",
    "                run = []\n",
    "                continue\n",
    "            run.append(genkei[i])\n",
    "            if hinshi[i + 1] == second_pos:\n",
    "                head = run[0] if len(run) == 1 else \"-\".join(run)\n",
    "                pairs.append([head, genkei[i + 1]])\n",
    "                run = []\n",
    "        return self._stack_pairs(pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['どこ' '越す']\n",
      " ['越す' '住む']\n",
      " ['住む' '悟る']\n",
      " ['悟る' '時']\n",
      " ['時' '詩']\n",
      " ['詩' '生れる']\n",
      " ['生れる' '画']\n",
      " ['画' '出来る']]\n",
      "[['名詞' '動詞']\n",
      " ['動詞' '動詞']\n",
      " ['動詞' '動詞']\n",
      " ['動詞' '名詞']\n",
      " ['名詞' '名詞']\n",
      " ['名詞' '動詞']\n",
      " ['動詞' '名詞']\n",
      " ['名詞' '動詞']]\n",
      "[['どこ' '越す']\n",
      " ['詩' '生れる']\n",
      " ['画' '出来る']]\n",
      "    0    1\n",
      "0  どこ   越す\n",
      "1   詩  生れる\n",
      "2   画  出来る\n",
      "     0    1\n",
      "0   どこ   越す\n",
      "1  時-詩  生れる\n",
      "2    画  出来る\n"
     ]
    }
   ],
   "source": [
    "if __name__ == '__main__':\n",
    "    # Demo on one sample sentence: keep nouns and verbs, then print each bigram variant.\n",
    "    text = \"どこへ越しても住みにくいと悟った時、詩が生れて、画えが出来る。\"\n",
    "    ta = TextAnalyzer()\n",
    "    # The list handed to the analyzer is kept separate from the order list used below.\n",
    "    genkei, hinshi = ta.morphological_analize(text, [\"名詞\", \"動詞\"])\n",
    "\n",
    "    # Adjacent pairs of base forms first, then of their parts of speech.\n",
    "    for column in (genkei, hinshi):\n",
    "        print(ta.base_bi_gram_create(column))\n",
    "\n",
    "    # Noun followed by verb: raw array, then the same result as a DataFrame.\n",
    "    noun_then_verb = [\"名詞\", \"動詞\"]\n",
    "    print(ta.bi_gram_pos_order_filter(genkei, hinshi, noun_then_verb))\n",
    "    noun_verb_frame = pd.DataFrame(ta.bi_gram_pos_order_filter(genkei, hinshi, noun_then_verb))\n",
    "    print(noun_verb_frame)\n",
    "\n",
    "    # Variant that joins consecutive nouns with '-'.\n",
    "    merged_frame = pd.DataFrame(ta.bi_gram_pos_order_filter2(genkei, hinshi, noun_then_verb))\n",
    "    print(merged_frame)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['どこ', '越す', '住む', '悟る', '時', '詩', '生れる', '画', '出来る'], dtype='<U32')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the base-form array produced by the demo cell above.\n",
    "genkei"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>どこ</td>\n",
       "      <td>越す</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>時-詩</td>\n",
       "      <td>生れる</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>画</td>\n",
       "      <td>出来る</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1\n",
       "0   どこ   越す\n",
       "1  時-詩  生れる\n",
       "2    画  出来る"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Noun -> verb bigrams from bi_gram_pos_order_filter2, rendered as a DataFrame.\n",
    "pd.DataFrame(ta.bi_gram_pos_order_filter2(genkei, hinshi, [\"名詞\", \"動詞\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([['どこ', '越す'],\n",
       "       ['時-詩', '生れる'],\n",
       "       ['画', '出来る']], dtype='<U21')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same call as the previous cell, shown as the raw NumPy array it returns.\n",
    "ta.bi_gram_pos_order_filter2(genkei, hinshi, [\"名詞\", \"動詞\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import MeCab\n",
	"import numpy as np\n",
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"class TextAnalyzer:\n",
	"    \"\"\"MeCab-based helper that extracts base forms and builds part-of-speech bigrams.\"\"\"\n",
	"\n",
	"    def raw_morphological_analize(self, text):\n",
	"        \"\"\"Return the raw output of MeCab run with the '-Ochasen' option on ``text``.\"\"\"\n",
	"        m = MeCab.Tagger('-Ochasen')\n",
	"        return m.parse(text)\n",
	"\n",
	"    def morphological_analize(self, text, target_pos):\n",
	"        \"\"\"Analyse ``text`` and keep only tokens whose part of speech is in ``target_pos``.\n",
	"\n",
	"        Args:\n",
	"            text: Text to analyse.\n",
	"            target_pos: List of part-of-speech prefixes to keep, e.g. ['名詞', '動詞'].\n",
	"                The list itself is left unmodified.\n",
	"\n",
	"        Returns:\n",
	"            A 2-row array: row 0 holds the base forms and row 1 the top-level part of\n",
	"            speech (the text before the first '-') of every kept token.\n",
	"        \"\"\"\n",
	"        target_pos_pattern = self.pos_pattern_create(target_pos)\n",
	"        genkei = []\n",
	"        hinshi = []\n",
	"        for line in self.raw_morphological_analize(text).split(\"\\n\"):\n",
	"            # Tab-separated columns: [0] surface form, [1] reading, [2] base form, [3] part of speech.\n",
	"            tags = line.split(\"\\t\")\n",
	"            if tags[0] == \"EOS\":\n",
	"                break\n",
	"            if len(tags) < 4:\n",
	"                # Blank or short line: there is no part-of-speech column to test.\n",
	"                continue\n",
	"            if target_pos_pattern.search(tags[3]) is not None:\n",
	"                genkei.append(tags[2])\n",
	"                hinshi.append(tags[3].split(\"-\")[0])\n",
	"        # One np.append onto an empty array per row gives the same dtype as appending\n",
	"        # token by token, without copying the array on every iteration.\n",
	"        return np.array([np.append(np.array([]), genkei),\n",
	"                         np.append(np.array([]), hinshi)])\n",
	"\n",
	"    def pos_pattern_create(self, pos):\n",
	"        \"\"\"Compile a regex matching strings that start with any entry of ``pos``.\n",
	"\n",
	"        Each entry is used as a regular-expression fragment prefixed with '^';\n",
	"        the fragments are combined with '|'.\n",
	"        \"\"\"\n",
	"        import re\n",
	"        # Build new strings instead of assigning back into ``pos`` so the caller's\n",
	"        # list stays untouched and can be reused for later calls.\n",
	"        return re.compile(\"|\".join(\"^\" + p for p in pos))\n",
	"\n",
	"    def _stack_pairs(self, pairs):\n",
	"        \"\"\"Convert a list of 2-item lists to an (n, 2) array; an empty (0, 2) int array if none.\"\"\"\n",
	"        stacked = np.empty((0, 2), int)\n",
	"        if pairs:\n",
	"            stacked = np.append(stacked, np.array(pairs), axis=0)\n",
	"        return stacked\n",
	"\n",
	"    def base_bi_gram_create(self, arr):\n",
	"        \"\"\"Return every adjacent pair of the 1-D array ``arr`` as an (n-1, 2) array.\"\"\"\n",
	"        return self._stack_pairs([[arr[i], arr[i + 1]] for i in range(arr.size - 1)])\n",
	"\n",
	"    def bi_gram_pos_order_filter(self, genkei, hinshi, target_pos_order):\n",
	"        \"\"\"Return the base-form bigrams whose parts of speech follow ``target_pos_order``.\n",
	"\n",
	"        Args:\n",
	"            genkei: Array of base forms.\n",
	"            hinshi: Array of parts of speech, index-aligned with ``genkei``.\n",
	"            target_pos_order: Two-item sequence (POS of first token, POS of second token).\n",
	"\n",
	"        Returns:\n",
	"            An (n, 2) array of matching pairs, or None (after printing 'array is null')\n",
	"            when ``genkei`` is empty.\n",
	"        \"\"\"\n",
	"        if genkei.size == 0:\n",
	"            print(\"array is null\")\n",
	"            return None\n",
	"        first_pos, second_pos = target_pos_order[0], target_pos_order[1]\n",
	"        pairs = [\n",
	"            [genkei[i], genkei[i + 1]]\n",
	"            for i in range(genkei.size - 1)\n",
	"            if hinshi[i] == first_pos and hinshi[i + 1] == second_pos\n",
	"        ]\n",
	"        return self._stack_pairs(pairs)\n",
	"\n",
	"    def bi_gram_pos_order_filter2(self, genkei, hinshi, target_pos_order):\n",
	"        \"\"\"Like bi_gram_pos_order_filter, but merge a run of first-POS tokens with '-'.\n",
	"\n",
	"        Consecutive tokens tagged ``target_pos_order[0]`` that directly precede a\n",
	"        ``target_pos_order[1]`` token are joined with '-' and used as the first element\n",
	"        of the pair.\n",
	"\n",
	"        Returns:\n",
	"            An (n, 2) array of pairs, or None (after printing 'array is null') when\n",
	"            ``genkei`` is empty.\n",
	"        \"\"\"\n",
	"        if genkei.size == 0:\n",
	"            print(\"array is null\")\n",
	"            return None\n",
	"        first_pos, second_pos = target_pos_order[0], target_pos_order[1]\n",
	"        pairs = []\n",
	"        run = []\n",
	"        for i in range(genkei.size - 1):\n",
	"            if hinshi[i] != first_pos:\n",
	"                # Any other part of speech ends the current run; dropping it here keeps\n",
	"                # stale tokens from being attached to a later, unrelated pair.\n",
	"                run = []\n",
	"                continue\n",
	"            run.append(genkei[i])\n",
	"            if hinshi[i + 1] == second_pos:\n",
	"                head = run[0] if len(run) == 1 else \"-\".join(run)\n",
	"                pairs.append([head, genkei[i + 1]])\n",
	"                run = []\n",
	"        return self._stack_pairs(pairs)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[['どこ' '越す']\n",
	" ['越す' '住む']\n",
	" ['住む' '悟る']\n",
	" ['悟る' '時']\n",
	" ['時' '詩']\n",
	" ['詩' '生れる']\n",
	" ['生れる' '画']\n",
	" ['画' '出来る']]\n",
	"[['名詞' '動詞']\n",
	" ['動詞' '動詞']\n",
	" ['動詞' '動詞']\n",
	" ['動詞' '名詞']\n",
	" ['名詞' '名詞']\n",
	" ['名詞' '動詞']\n",
	" ['動詞' '名詞']\n",
	" ['名詞' '動詞']]\n",
	"[['どこ' '越す']\n",
	" ['詩' '生れる']\n",
	" ['画' '出来る']]\n",
	"    0    1\n",
	"0  どこ   越す\n",
	"1   詩  生れる\n",
	"2   画  出来る\n",
	"     0    1\n",
	"0   どこ   越す\n",
	"1  時-詩  生れる\n",
	"2    画  出来る\n"
	]
	}
	],
	"source": [
	"if __name__ == '__main__':\n",
	"    # Demo on one sample sentence: keep nouns and verbs, then print each bigram variant.\n",
	"    text = \"どこへ越しても住みにくいと悟った時、詩が生れて、画えが出来る。\"\n",
	"    ta = TextAnalyzer()\n",
	"    # The list handed to the analyzer is kept separate from the order list used below.\n",
	"    genkei, hinshi = ta.morphological_analize(text, [\"名詞\", \"動詞\"])\n",
	"\n",
	"    # Adjacent pairs of base forms first, then of their parts of speech.\n",
	"    for column in (genkei, hinshi):\n",
	"        print(ta.base_bi_gram_create(column))\n",
	"\n",
	"    # Noun followed by verb: raw array, then the same result as a DataFrame.\n",
	"    noun_then_verb = [\"名詞\", \"動詞\"]\n",
	"    print(ta.bi_gram_pos_order_filter(genkei, hinshi, noun_then_verb))\n",
	"    noun_verb_frame = pd.DataFrame(ta.bi_gram_pos_order_filter(genkei, hinshi, noun_then_verb))\n",
	"    print(noun_verb_frame)\n",
	"\n",
	"    # Variant that joins consecutive nouns with '-'.\n",
	"    merged_frame = pd.DataFrame(ta.bi_gram_pos_order_filter2(genkei, hinshi, noun_then_verb))\n",
	"    print(merged_frame)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array(['どこ', '越す', '住む', '悟る', '時', '詩', '生れる', '画', '出来る'], dtype='<U32')"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Display the base-form array produced by the demo cell above.\n",
	"genkei"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>どこ</td>\n",
	" <td>越す</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>時-詩</td>\n",
	" <td>生れる</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>画</td>\n",
	" <td>出来る</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	"     0    1\n",
	"0   どこ   越す\n",
	"1  時-詩  生れる\n",
	"2    画  出来る"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Noun -> verb bigrams from bi_gram_pos_order_filter2, rendered as a DataFrame.\n",
	"pd.DataFrame(ta.bi_gram_pos_order_filter2(genkei, hinshi, [\"名詞\", \"動詞\"]))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([['どこ', '越す'],\n",
	"       ['時-詩', '生れる'],\n",
	"       ['画', '出来る']], dtype='<U21')"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Same call as the previous cell, shown as the raw NumPy array it returns.\n",
	"ta.bi_gram_pos_order_filter2(genkei, hinshi, [\"名詞\", \"動詞\"])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}