Created
February 18, 2023 04:36
-
-
Save yssymmt/749f619afbee5049f7a720fb504b65f7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "5568d80c", | |
"metadata": {}, | |
"source": [ | |
"#11_mecab_loop" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "52a9a8e8", | |
"metadata": {}, | |
"source": [ | |
"####パッケージの読み込み" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "5c6b4a70", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import re\n", | |
"from sqlalchemy import create_engine\n", | |
"#import teradatasql\n", | |
"import teradatasqlalchemy\n", | |
"import MeCab" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6efb7326", | |
"metadata": {}, | |
"source": [ | |
"####Teradataへの接続、sqlalchemy エンジンを作成" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "07b13ed8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"host = \"192.168.999.999\"\n", | |
"user = \"jumbo\"\n", | |
"password = \"mambo\"\n", | |
"connstr = \"teradatasql://{user}:{password}@{host}\".format(host=host, user=user, password=password)\n", | |
"engine = create_engine(connstr)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "150a9309", | |
"metadata": {}, | |
"source": [ | |
"####gridの付与とそのリスト取得" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "5ff3fab0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>grid</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" grid\n", | |
"0 0\n", | |
"1 1" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" df0 = pd.read_sql(\"\"\"\n", | |
" select distinct grid from ( \n", | |
" select \n", | |
" docid, \n", | |
" cast((cast(docid as float) -1) / 10 as integer) as grid, \n", | |
" docdesc \n", | |
" from jumbo.aud03_neologdn \n", | |
" ) a1 \n", | |
" \"\"\", conn)\n", | |
"df0" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6763d6b4", | |
"metadata": {}, | |
"source": [ | |
"####試しにgridを順に表示してみる" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "a00c7527", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0\n", | |
"1\n" | |
] | |
} | |
], | |
"source": [ | |
"for item in df0['grid']:\n", | |
" print(item)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6f60bd6d", | |
"metadata": {}, | |
"source": [ | |
"####格納用の空テーブルを用意する" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "623ccd37", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" x1 = pd.read_sql(\"\"\"\n", | |
" create multiset table jumbo.aud17 (\n", | |
" docid integer, \n", | |
" grid integer, \n", | |
" seqno integer, \n", | |
" word varchar(100) character set unicode, \n", | |
" poss varchar(1000) character set unicode \n", | |
" ) primary index (docid) \n", | |
" \"\"\", conn)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "69b3e7b1", | |
"metadata": {}, | |
"source": [ | |
"####テーブルではなくデータフレームに格納する場合は以下のような感じで" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "834ad1c3", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#dfx = pd.DataFrame( columns=['docid','grid','seqno','word','poss'] )" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "34f43606", | |
"metadata": {}, | |
"source": [ | |
"####分割単位ごとにデータをダウンロード、形態素解析、結果を分割、テーブルに格納" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "ada28e93", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for item in df0['grid']: \n", | |
" sql = \"select * from ( select docid, cast((cast(docid as float) -1) / 10 as integer) as grid, docdesc from jumbo.aud03_neologdn ) a1 where grid=\" + str(item)\n", | |
" with engine.connect() as conn:\n", | |
" df1 = pd.read_sql(sql, conn) \n", | |
" tagger = MeCab.Tagger()\n", | |
" df1['docdesc'] = df1['docdesc'].apply(tagger.parse) #形態素解析処理\n", | |
" df1[\"docdesc\"] = df1['docdesc'].str.split(pat=\"\\n\") #改行コードでレコードを配列化\n", | |
" df1 = df1.explode('docdesc', ignore_index=True) #explodeで配列を行に分割\n", | |
" df1['original_index'] = df1.index #一行ずつにindexを設定\n", | |
" df1[\"seqno\"] = df1.groupby([\"docid\"])[\"original_index\"].rank(ascending=True) #グループ内で上の行からrank付けをしていく\n", | |
" df1['seqno'] = df1['seqno'].astype(np.int64) #float型で入っているのでintに直す\n", | |
" df1 = df1.drop('original_index', axis=1) #不要列を削除\n", | |
" df2 = pd.concat([df1, df1['docdesc'].str.split('\\t', expand=True)], axis=1).drop('docdesc', axis=1) #タブ区切りで分解し元文章を削除\n", | |
" df2.rename(columns={0: 'word', 1: 'poss'}, inplace=True) #列名の変更\n", | |
" df2.to_sql('aud17',engine,if_exists='append',index=False)\n", | |
" #dfx = pd.concat([dfx,df2],ignore_index=True) #追記用データフレームに追記する場合" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "17175448", | |
"metadata": {}, | |
"source": [ | |
"####処理結果の確認" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "0f17d751", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>grid</th>\n", | |
" <th>seqno</th>\n", | |
" <th>word</th>\n", | |
" <th>poss</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>ナナメ</td>\n", | |
" <td>名詞,固有名詞,一般,*,*,*,ナナメ,ナナメ,ナナメ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>の</td>\n", | |
" <td>助詞,連体化,*,*,*,*,の,ノ,ノ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>夕暮れ</td>\n", | |
" <td>名詞,副詞可能,*,*,*,*,夕暮れ,ユウグレ,ユーグレ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>4</td>\n", | |
" <td>他</td>\n", | |
" <td>名詞,非自立,副詞可能,*,*,*,他,ホカ,ホカ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>5</td>\n", | |
" <td>、</td>\n", | |
" <td>記号,読点,*,*,*,*,、,、,、</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>6</td>\n", | |
" <td>本</td>\n", | |
" <td>名詞,一般,*,*,*,*,本,ホン,ホン</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>7</td>\n", | |
" <td>を</td>\n", | |
" <td>助詞,格助詞,一般,*,*,*,を,ヲ,ヲ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>8</td>\n", | |
" <td>出し</td>\n", | |
" <td>動詞,自立,*,*,五段・サ行,連用形,出す,ダシ,ダシ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>9</td>\n", | |
" <td>て</td>\n", | |
" <td>助詞,接続助詞,*,*,*,*,て,テ,テ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>10</td>\n", | |
" <td>いる</td>\n", | |
" <td>動詞,非自立,*,*,一段,基本形,いる,イル,イル</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>11</td>\n", | |
" <td>EOS</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>12</td>\n", | |
" <td></td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid grid seqno word poss\n", | |
"0 3 0 1 ナナメ 名詞,固有名詞,一般,*,*,*,ナナメ,ナナメ,ナナメ\n", | |
"1 3 0 2 の 助詞,連体化,*,*,*,*,の,ノ,ノ\n", | |
"2 3 0 3 夕暮れ 名詞,副詞可能,*,*,*,*,夕暮れ,ユウグレ,ユーグレ\n", | |
"3 3 0 4 他 名詞,非自立,副詞可能,*,*,*,他,ホカ,ホカ\n", | |
"4 3 0 5 、 記号,読点,*,*,*,*,、,、,、\n", | |
"5 3 0 6 本 名詞,一般,*,*,*,*,本,ホン,ホン\n", | |
"6 3 0 7 を 助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n", | |
"7 3 0 8 出し 動詞,自立,*,*,五段・サ行,連用形,出す,ダシ,ダシ\n", | |
"8 3 0 9 て 助詞,接続助詞,*,*,*,*,て,テ,テ\n", | |
"9 3 0 10 いる 動詞,非自立,*,*,一段,基本形,いる,イル,イル\n", | |
"10 3 0 11 EOS None\n", | |
"11 3 0 12 None" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" df10 = pd.read_sql(\"\"\"\n", | |
" select * from jumbo.aud17 where docid =3 order by seqno \n", | |
" \"\"\", conn)\n", | |
"df10" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "083fe4df", | |
"metadata": {}, | |
"source": [ | |
"####データフレームにおけるデータを参照し、結果をcsvに吐き出す場合" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "cd300b5a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#dfx\n", | |
"#dfx.to_csv('replaceherewithwhateverfilename.csv')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment