Created
September 19, 2022 14:50
-
-
Save yssymmt/12c351b99edb2cce27fafe5f3a48cbb7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "0e7e167e", | |
"metadata": {}, | |
"source": [ | |
"#04: sudachipy" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "346b5eee", | |
"metadata": {}, | |
"source": [ | |
"####パッケージの読み込み" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "dceb6e43", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"from sqlalchemy import create_engine\n", | |
"import teradatasql\n", | |
"import teradatasqlalchemy\n", | |
"from sudachipy import tokenizer\n", | |
"from sudachipy import dictionary" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "5c9a266e", | |
"metadata": {}, | |
"source": [ | |
"####Teradataへの接続、sqlalchemy エンジンを作成" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "9193c7d9", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"host = \"192.168.999.999\"\n", | |
"user = \"jumbo\"\n", | |
"password = \"mambo\"\n", | |
"connstr = \"teradatasql://{user}:{password}@{host}\".format(host=host, user=user, password=password)\n", | |
"engine = create_engine(connstr)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "2b066a34", | |
"metadata": {}, | |
"source": [ | |
"####データを取得 " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "878b7db0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>cat</th>\n", | |
" <th>docdesc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>19</td>\n", | |
" <td>春日</td>\n", | |
" <td>ぼる塾の人と「まあねぇ」と「トゥース!」の掛け合いは面白かった</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>17</td>\n", | |
" <td>若林</td>\n", | |
" <td>山里亮太にはツッコミでは敵わないと思っている</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>7</td>\n", | |
" <td>若林</td>\n", | |
" <td>藤井青銅「ピンクのベストじゃない方がしゃべれるんだよ」</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>15</td>\n", | |
" <td>春日</td>\n", | |
" <td>普段は靴下を履かないので、足の裏が象のようになっている</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>若林</td>\n", | |
" <td>プライベートのバスケットで足を怪我した</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid cat docdesc\n", | |
"0 19 春日 ぼる塾の人と「まあねぇ」と「トゥース!」の掛け合いは面白かった\n", | |
"1 17 若林 山里亮太にはツッコミでは敵わないと思っている\n", | |
"2 7 若林 藤井青銅「ピンクのベストじゃない方がしゃべれるんだよ」\n", | |
"3 15 春日 普段は靴下を履かないので、足の裏が象のようになっている\n", | |
"4 5 若林 プライベートのバスケットで足を怪我した" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" df = pd.read_sql(\"\"\"\n", | |
" select *\n", | |
" from jumbo.aud03_neologdn\n", | |
" \"\"\", conn)\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "32c0a5ae", | |
"metadata": {}, | |
"source": [ | |
"####分解用文字の確認" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "a8229105", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>cat</th>\n", | |
" <th>docdesc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"Empty DataFrame\n", | |
"Columns: [docid, cat, docdesc]\n", | |
"Index: []" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" sf = pd.read_sql(\"\"\"\n", | |
" select *\n", | |
" from jumbo.aud03_neologdn where docdesc like '%★%' \n", | |
" \"\"\", conn)\n", | |
"sf.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ffb76350", | |
"metadata": {}, | |
"source": [ | |
"####sudachi、モードなど指定" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "7175084d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"config_path = \"C:/Users/youruserdir/Anaconda3/Lib/site-packages/sudachipy/resources/sudachi.json\"\n", | |
"tokenizer_obj = dictionary.Dictionary(config_path=config_path, dict=\"full\").create() \n", | |
"mode = tokenizer.Tokenizer.SplitMode.C" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "09e05807", | |
"metadata": {}, | |
"source": [ | |
"####最終的な結果を出力するためのData Frameを作成する" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "bab30604", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df1 = pd.DataFrame( columns=['docid','docdesc'] )" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f4c19b4b", | |
"metadata": {}, | |
"source": [ | |
"####形態素解析" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "d495fbcd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>docdesc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>19</td>\n", | |
" <td>★ぼる塾,,('名詞', '固有名詞', '一般', '*', '*', '*')ぼる塾,ぼ...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>17</td>\n", | |
" <td>★山里亮太,,('名詞', '固有名詞', '一般', '*', '*', '*')山里亮太...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>7</td>\n", | |
" <td>★藤井青銅,,('名詞', '固有名詞', '一般', '*', '*', '*')藤井青銅...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>15</td>\n", | |
" <td>★普段,,('名詞', '普通名詞', '副詞可能', '*', '*', '*')普段,普...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>★プライベート,,('名詞', '普通名詞', '形状詞可能', '*', '*', '*'...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid docdesc\n", | |
"0 19 ★ぼる塾,,('名詞', '固有名詞', '一般', '*', '*', '*')ぼる塾,ぼ...\n", | |
"1 17 ★山里亮太,,('名詞', '固有名詞', '一般', '*', '*', '*')山里亮太...\n", | |
"2 7 ★藤井青銅,,('名詞', '固有名詞', '一般', '*', '*', '*')藤井青銅...\n", | |
"3 15 ★普段,,('名詞', '普通名詞', '副詞可能', '*', '*', '*')普段,普...\n", | |
"4 5 ★プライベート,,('名詞', '普通名詞', '形状詞可能', '*', '*', '*'..." | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"for row, item in df.iterrows():\n", | |
" #形態素解析結果をセットする変数。一行ずつ処理するため、処理前に一度クリアする\n", | |
" tokens = ''\n", | |
" #データフレーム内の列bodyを文字列にして、sudachiの形態素解析を実施\n", | |
" tokens = tokenizer_obj.tokenize(item.docdesc, mode)\n", | |
" #空の箱を用意(リスト)\n", | |
" tt = ['']\n", | |
" for t in tokens:\n", | |
" #キーワードごとに結果を横につなげて縦に一旦追記し(tt)、それを横につなげる\n", | |
" tt.append(str(t.surface()) + ',' + ',' + str(t.part_of_speech()) + str(t.normalized_form()) + ',' + str(t.dictionary_form()))\n", | |
" docdesc = \"★\".join(map(str, tt))\n", | |
" #結果をデータフレームに変換\n", | |
" df2 = pd.DataFrame({'docid':[item.docid], 'docdesc':[docdesc]})\n", | |
" #追記\n", | |
" df1 = pd.concat([df1,df2],ignore_index=True)\n", | |
"df1.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "87234739", | |
"metadata": {}, | |
"source": [ | |
"####最大文字数を確認" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "77e53bed", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1219" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"max(map(len, df1['docdesc']))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a5f43cd3", | |
"metadata": {}, | |
"source": [ | |
"####格納用テーブルを用意" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "506ab370", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" x1 = pd.read_sql(\"\"\"\n", | |
" create multiset table jumbo.aud05_sudachi (\n", | |
" docid integer, \n", | |
" docdesc varchar(1500) character set unicode \n", | |
" ) primary index (docid) \n", | |
" \"\"\", conn)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "5a3df4d3", | |
"metadata": {}, | |
"source": [ | |
"####形態素解析後データの格納" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "862792f9", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df1.to_sql('aud05_sudachi',engine,if_exists='append',index=False)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment