Created
September 19, 2022 14:47
-
-
Save yssymmt/fe9dcd939651384b61693bd5b5969588 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "0f686c8c", | |
"metadata": {}, | |
"source": [ | |
"#01: initload" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "2ab558f6", | |
"metadata": {}, | |
"source": [ | |
"####パッケージの読み込み" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "88bb72b8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"from sqlalchemy import create_engine\n", | |
"import teradatasql\n", | |
"import teradatasqlalchemy\n", | |
"import mojimoji" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f9e60a57", | |
"metadata": {}, | |
"source": [ | |
"####Excelファイル読み込み" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "af3eb723", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>文書</th>\n", | |
" <th>カテゴリー</th>\n", | |
" <th>出典</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>若槻千夏「幾つかのテレビの番組で司会を務めるが、本番以外では人見知りで話さない」</td>\n", | |
" <td>若林</td>\n", | |
" <td>あちこちオードリー</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>漫才ではツッコミを担当するが、「たりないふたり」ではボケを担当していた</td>\n", | |
" <td>若林</td>\n", | |
" <td>たりないふたり</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>ナナメの夕暮れ他、本を出している</td>\n", | |
" <td>若林</td>\n", | |
" <td>文藝春秋</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>深夜に一人でバスケットボールのスリーポイントを練習している</td>\n", | |
" <td>若林</td>\n", | |
" <td>オールナイトニッポン</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>プライベートのバスケットで足を怪我した</td>\n", | |
" <td>若林</td>\n", | |
" <td>しくじり先生</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 文書 カテゴリー 出典\n", | |
"0 若槻千夏「幾つかのテレビの番組で司会を務めるが、本番以外では人見知りで話さない」 若林 あちこちオードリー\n", | |
"1 漫才ではツッコミを担当するが、「たりないふたり」ではボケを担当していた 若林 たりないふたり\n", | |
"2 ナナメの夕暮れ他、本を出している 若林 文藝春秋\n", | |
"3 深夜に一人でバスケットボールのスリーポイントを練習している 若林 オールナイトニッポン\n", | |
"4 プライベートのバスケットで足を怪我した 若林 しくじり先生" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.read_excel('aud.xlsx')\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "697e2a44", | |
"metadata": {}, | |
"source": [ | |
"####出典列が不要" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "1a8edcce", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>文書</th>\n", | |
" <th>カテゴリー</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>若槻千夏「幾つかのテレビの番組で司会を務めるが、本番以外では人見知りで話さない」</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>漫才ではツッコミを担当するが、「たりないふたり」ではボケを担当していた</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>ナナメの夕暮れ他、本を出している</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>深夜に一人でバスケットボールのスリーポイントを練習している</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>プライベートのバスケットで足を怪我した</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 文書 カテゴリー\n", | |
"0 若槻千夏「幾つかのテレビの番組で司会を務めるが、本番以外では人見知りで話さない」 若林\n", | |
"1 漫才ではツッコミを担当するが、「たりないふたり」ではボケを担当していた 若林\n", | |
"2 ナナメの夕暮れ他、本を出している 若林\n", | |
"3 深夜に一人でバスケットボールのスリーポイントを練習している 若林\n", | |
"4 プライベートのバスケットで足を怪我した 若林" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = df[['文書','カテゴリー']]\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "7931c825", | |
"metadata": {}, | |
"source": [ | |
"####idを付与、列順入替、列名変更" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "f3743b7f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>cat</th>\n", | |
" <th>docdesc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>若林</td>\n", | |
" <td>若槻千夏「幾つかのテレビの番組で司会を務めるが、本番以外では人見知りで話さない」</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>若林</td>\n", | |
" <td>漫才ではツッコミを担当するが、「たりないふたり」ではボケを担当していた</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>若林</td>\n", | |
" <td>ナナメの夕暮れ他、本を出している</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>若林</td>\n", | |
" <td>深夜に一人でバスケットボールのスリーポイントを練習している</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>若林</td>\n", | |
" <td>プライベートのバスケットで足を怪我した</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid cat docdesc\n", | |
"0 1 若林 若槻千夏「幾つかのテレビの番組で司会を務めるが、本番以外では人見知りで話さない」\n", | |
"1 2 若林 漫才ではツッコミを担当するが、「たりないふたり」ではボケを担当していた\n", | |
"2 3 若林 ナナメの夕暮れ他、本を出している\n", | |
"3 4 若林 深夜に一人でバスケットボールのスリーポイントを練習している\n", | |
"4 5 若林 プライベートのバスケットで足を怪我した" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"serial_num = pd.RangeIndex(start=1, stop=len(df.index) + 1, step=1)\n", | |
"df['docid'] = serial_num\n", | |
"df = df.loc[:,['docid','カテゴリー','文書']]\n", | |
"df.columns = ['docid','cat','docdesc']\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "0977eac1", | |
"metadata": {}, | |
"source": [ | |
"####件数確認" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "2beae481", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"RangeIndex: 20 entries, 0 to 19\n", | |
"Data columns (total 3 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 docid 20 non-null int64 \n", | |
" 1 cat 20 non-null object\n", | |
" 2 docdesc 20 non-null object\n", | |
"dtypes: int64(1), object(2)\n", | |
"memory usage: 608.0+ bytes\n" | |
] | |
} | |
], | |
"source": [ | |
"df.info()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "fb438d8b", | |
"metadata": {}, | |
"source": [ | |
"####最大文字数を確認" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "e6499141", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"44" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"max(map(len, df['docdesc']))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "099eae10", | |
"metadata": {}, | |
"source": [ | |
"####Teradataへの接続、sqlalchemy エンジンを作成" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "f03c988e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"host = \"192.168.999.999\"\n", | |
"user = \"jumbo\"\n", | |
"password = \"mambo\"\n", | |
"connstr = \"teradatasql://{user}:{password}@{host}\".format(host=host, user=user, password=password)\n", | |
"engine = create_engine(connstr)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "78fd34ee", | |
"metadata": {}, | |
"source": [ | |
"####空テーブル作成、元データ用 " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "aa2a92e2", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" x1 = pd.read_sql(\"\"\"\n", | |
" create multiset table jumbo.aud01_org (\n", | |
" docid integer, \n", | |
" cat varchar(10) character set unicode, \n", | |
" docdesc varchar(100) character set unicode \n", | |
" ) primary index (docid) \n", | |
" \"\"\", conn)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "1152b69f", | |
"metadata": {}, | |
"source": [ | |
"####元データの格納" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "fe47c09e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.to_sql('aud01_org',engine,if_exists='append',index=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b56ede38", | |
"metadata": {}, | |
"source": [ | |
"####格納を確認 " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "ad3a18cd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>cat</th>\n", | |
" <th>docdesc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>若林</td>\n", | |
" <td>若槻千夏「幾つかのテレビの番組で司会を務めるが、本番以外では人見知りで話さない」</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>若林</td>\n", | |
" <td>漫才ではツッコミを担当するが、「たりないふたり」ではボケを担当していた</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>若林</td>\n", | |
" <td>ナナメの夕暮れ他、本を出している</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>若林</td>\n", | |
" <td>深夜に一人でバスケットボールのスリーポイントを練習している</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>若林</td>\n", | |
" <td>プライベートのバスケットで足を怪我した</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>6</td>\n", | |
" <td>若林</td>\n", | |
" <td>星野源「日本、テレビ界の希望だと思う」</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>7</td>\n", | |
" <td>若林</td>\n", | |
" <td>藤井青銅「ピンクのベストじゃない方がしゃべれるんだよ」</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>8</td>\n", | |
" <td>若林</td>\n", | |
" <td>MC.Wakaとして、日本武道館、横浜アリーナなどで人の歌にラップで茶々を入れている</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>9</td>\n", | |
" <td>春日</td>\n", | |
" <td>茶々という名前のチワワ犬を飼っている</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>10</td>\n", | |
" <td>春日</td>\n", | |
" <td>結婚直前に浮気がばれた</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>11</td>\n", | |
" <td>春日</td>\n", | |
" <td>六本木の社長からモンクレールのダウンをもらっていた</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>12</td>\n", | |
" <td>春日</td>\n", | |
" <td>ピンクベストを着て胸を張っていて、トゥースと大声で叫ぶ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>13</td>\n", | |
" <td>春日</td>\n", | |
" <td>ピンクのセーターを着た後輩の芸人から、すいません、ピンク着させてもらってますと挨拶された</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>14</td>\n", | |
" <td>春日</td>\n", | |
" <td>漫才ではボケを担当するが、ラジオやテレビでは全然ボケない</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>15</td>\n", | |
" <td>春日</td>\n", | |
" <td>普段は靴下を履かないので、足の裏が象のようになっている</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>16</td>\n", | |
" <td>春日</td>\n", | |
" <td>バカリズム「存在が面白い。ウケるスベるとかじゃない」</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>17</td>\n", | |
" <td>若林</td>\n", | |
" <td>山里亮太にはツッコミでは敵わないと思っている</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>18</td>\n", | |
" <td>若林</td>\n", | |
" <td>入船出身なのに築地出身ですと嘘をついたら、地元の人にお前入船だろとツッコミされた</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>19</td>\n", | |
" <td>春日</td>\n", | |
" <td>ぼる塾の人と「まあねぇ」と「トゥーーース!」の掛け合いは面白かった</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>20</td>\n", | |
" <td>春日</td>\n", | |
" <td>スベる芸風なのに、スベるのを怖いと思っている</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid cat docdesc\n", | |
"0 1 若林 若槻千夏「幾つかのテレビの番組で司会を務めるが、本番以外では人見知りで話さない」\n", | |
"1 2 若林 漫才ではツッコミを担当するが、「たりないふたり」ではボケを担当していた\n", | |
"2 3 若林 ナナメの夕暮れ他、本を出している\n", | |
"3 4 若林 深夜に一人でバスケットボールのスリーポイントを練習している\n", | |
"4 5 若林 プライベートのバスケットで足を怪我した\n", | |
"5 6 若林 星野源「日本、テレビ界の希望だと思う」\n", | |
"6 7 若林 藤井青銅「ピンクのベストじゃない方がしゃべれるんだよ」\n", | |
"7 8 若林 MC.Wakaとして、日本武道館、横浜アリーナなどで人の歌にラップで茶々を入れている\n", | |
"8 9 春日 茶々という名前のチワワ犬を飼っている\n", | |
"9 10 春日 結婚直前に浮気がばれた\n", | |
"10 11 春日 六本木の社長からモンクレールのダウンをもらっていた\n", | |
"11 12 春日 ピンクベストを着て胸を張っていて、トゥースと大声で叫ぶ\n", | |
"12 13 春日 ピンクのセーターを着た後輩の芸人から、すいません、ピンク着させてもらってますと挨拶された\n", | |
"13 14 春日 漫才ではボケを担当するが、ラジオやテレビでは全然ボケない\n", | |
"14 15 春日 普段は靴下を履かないので、足の裏が象のようになっている\n", | |
"15 16 春日 バカリズム「存在が面白い。ウケるスベるとかじゃない」\n", | |
"16 17 若林 山里亮太にはツッコミでは敵わないと思っている\n", | |
"17 18 若林 入船出身なのに築地出身ですと嘘をついたら、地元の人にお前入船だろとツッコミされた\n", | |
"18 19 春日 ぼる塾の人と「まあねぇ」と「トゥーーース!」の掛け合いは面白かった\n", | |
"19 20 春日 スベる芸風なのに、スベるのを怖いと思っている" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" x1 = pd.read_sql(\"\"\"\n", | |
" select * from jumbo.aud01_org order by 1 \n", | |
" \"\"\", conn)\n", | |
"x1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "19409801", | |
"metadata": {}, | |
"source": [ | |
"####全角半角の統一(英数字は半角に、カタカナは全角に)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "74055c20", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>cat</th>\n", | |
" <th>docdesc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>若林</td>\n", | |
" <td>若槻千夏「幾つかのテレビの番組で司会を務めるが、本番以外では人見知りで話さない」</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>若林</td>\n", | |
" <td>漫才ではツッコミを担当するが、「たりないふたり」ではボケを担当していた</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>若林</td>\n", | |
" <td>ナナメの夕暮れ他、本を出している</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>若林</td>\n", | |
" <td>深夜に一人でバスケットボールのスリーポイントを練習している</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>若林</td>\n", | |
" <td>プライベートのバスケットで足を怪我した</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>6</td>\n", | |
" <td>若林</td>\n", | |
" <td>星野源「日本、テレビ界の希望だと思う」</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>7</td>\n", | |
" <td>若林</td>\n", | |
" <td>藤井青銅「ピンクのベストじゃない方がしゃべれるんだよ」</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>8</td>\n", | |
" <td>若林</td>\n", | |
" <td>MC.Wakaとして、日本武道館、横浜アリーナなどで人の歌にラップで茶々を入れている</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>9</td>\n", | |
" <td>春日</td>\n", | |
" <td>茶々という名前のチワワ犬を飼っている</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>10</td>\n", | |
" <td>春日</td>\n", | |
" <td>結婚直前に浮気がばれた</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>11</td>\n", | |
" <td>春日</td>\n", | |
" <td>六本木の社長からモンクレールのダウンをもらっていた</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>12</td>\n", | |
" <td>春日</td>\n", | |
" <td>ピンクベストを着て胸を張っていて、トゥースと大声で叫ぶ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>13</td>\n", | |
" <td>春日</td>\n", | |
" <td>ピンクのセーターを着た後輩の芸人から、すいません、ピンク着させてもらってますと挨拶された</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>14</td>\n", | |
" <td>春日</td>\n", | |
" <td>漫才ではボケを担当するが、ラジオやテレビでは全然ボケない</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>15</td>\n", | |
" <td>春日</td>\n", | |
" <td>普段は靴下を履かないので、足の裏が象のようになっている</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>16</td>\n", | |
" <td>春日</td>\n", | |
" <td>バカリズム「存在が面白い。ウケるスベるとかじゃない」</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>17</td>\n", | |
" <td>若林</td>\n", | |
" <td>山里亮太にはツッコミでは敵わないと思っている</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>18</td>\n", | |
" <td>若林</td>\n", | |
" <td>入船出身なのに築地出身ですと嘘をついたら、地元の人にお前入船だろとツッコミされた</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>19</td>\n", | |
" <td>春日</td>\n", | |
" <td>ぼる塾の人と「まあねぇ」と「トゥーーース!」の掛け合いは面白かった</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>20</td>\n", | |
" <td>春日</td>\n", | |
" <td>スベる芸風なのに、スベるのを怖いと思っている</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid cat docdesc\n", | |
"0 1 若林 若槻千夏「幾つかのテレビの番組で司会を務めるが、本番以外では人見知りで話さない」\n", | |
"1 2 若林 漫才ではツッコミを担当するが、「たりないふたり」ではボケを担当していた\n", | |
"2 3 若林 ナナメの夕暮れ他、本を出している\n", | |
"3 4 若林 深夜に一人でバスケットボールのスリーポイントを練習している\n", | |
"4 5 若林 プライベートのバスケットで足を怪我した\n", | |
"5 6 若林 星野源「日本、テレビ界の希望だと思う」\n", | |
"6 7 若林 藤井青銅「ピンクのベストじゃない方がしゃべれるんだよ」\n", | |
"7 8 若林 MC.Wakaとして、日本武道館、横浜アリーナなどで人の歌にラップで茶々を入れている\n", | |
"8 9 春日 茶々という名前のチワワ犬を飼っている\n", | |
"9 10 春日 結婚直前に浮気がばれた\n", | |
"10 11 春日 六本木の社長からモンクレールのダウンをもらっていた\n", | |
"11 12 春日 ピンクベストを着て胸を張っていて、トゥースと大声で叫ぶ\n", | |
"12 13 春日 ピンクのセーターを着た後輩の芸人から、すいません、ピンク着させてもらってますと挨拶された\n", | |
"13 14 春日 漫才ではボケを担当するが、ラジオやテレビでは全然ボケない\n", | |
"14 15 春日 普段は靴下を履かないので、足の裏が象のようになっている\n", | |
"15 16 春日 バカリズム「存在が面白い。ウケるスベるとかじゃない」\n", | |
"16 17 若林 山里亮太にはツッコミでは敵わないと思っている\n", | |
"17 18 若林 入船出身なのに築地出身ですと嘘をついたら、地元の人にお前入船だろとツッコミされた\n", | |
"18 19 春日 ぼる塾の人と「まあねぇ」と「トゥーーース!」の掛け合いは面白かった\n", | |
"19 20 春日 スベる芸風なのに、スベるのを怖いと思っている" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"####全角半角の統一(英数字は半角に、カタカナは全角に)\n", | |
"df['docdesc'] = df['docdesc'].apply(mojimoji.zen_to_han, kana=False)\n", | |
"df['docdesc'] = df['docdesc'].apply(mojimoji.han_to_zen, digit=False, ascii=False)\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6aaf0a3b", | |
"metadata": {}, | |
"source": [ | |
"####空テーブル作成、全角半角修正結果 " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "4f90ae24", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" x1 = pd.read_sql(\"\"\"\n", | |
" create multiset table jumbo.aud02_mjmj (\n", | |
" docid integer, \n", | |
" cat varchar(10) character set unicode, \n", | |
" docdesc varchar(100) character set unicode \n", | |
" ) primary index (docid) \n", | |
" \"\"\", conn)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "2bdb73e7", | |
"metadata": {}, | |
"source": [ | |
"####全角半角矯正結果データの格納" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "cc8a31db", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.to_sql('aud02_mjmj',engine,if_exists='append',index=False)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment