Created
September 19, 2022 14:51
-
-
Save yssymmt/155dd2871ba6aebba5d8765cb3e7892d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "d27fd661", | |
"metadata": {}, | |
"source": [ | |
"#05: denorm" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f049f83e", | |
"metadata": {}, | |
"source": [ | |
"####パッケージの読み込み" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "5ea9bd8d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"from sqlalchemy import create_engine\n", | |
"import teradatasql\n", | |
"import teradatasqlalchemy" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "287d06ba", | |
"metadata": {}, | |
"source": [ | |
"####Teradataへの接続、sqlalchemy エンジンを作成" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "ed02e9b0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"host = \"192.168.999.999\"\n", | |
"user = \"jumbo\"\n", | |
"password = \"mambo\"\n", | |
"connstr = \"teradatasql://{user}:{password}@{host}\".format(host=host, user=user, password=password)\n", | |
"engine = create_engine(connstr)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "1c59a433", | |
"metadata": {}, | |
"source": [ | |
"####データを取得" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "6456629f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>seqno</th>\n", | |
" <th>cat</th>\n", | |
" <th>word</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>若林</td>\n", | |
" <td>若槻千夏</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>若林</td>\n", | |
" <td>幾つ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>6</td>\n", | |
" <td>若林</td>\n", | |
" <td>テレビ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>8</td>\n", | |
" <td>若林</td>\n", | |
" <td>番組</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>10</td>\n", | |
" <td>若林</td>\n", | |
" <td>司会</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid seqno cat word\n", | |
"0 1 1 若林 若槻千夏\n", | |
"1 1 3 若林 幾つ\n", | |
"2 1 6 若林 テレビ\n", | |
"3 1 8 若林 番組\n", | |
"4 1 10 若林 司会" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" df = pd.read_sql(\"\"\"\n", | |
" select *\n", | |
" from jumbo.aud07_wordseq \n", | |
" order by docid, seqno \n", | |
" \"\"\", conn)\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "1c2f5157", | |
"metadata": {}, | |
"source": [ | |
"####リストに変換する" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "a3096b97", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"docid\n", | |
"1 [若槻千夏, 幾つ, テレビ, 番組, 司会, 務める, 本番, 以外, 人見知り, 話す,...\n", | |
"2 [漫才, ツッコミ, 担当, たりないふたり, ボケ, 担当]\n", | |
"3 [ナナメ, 夕暮れ, 他, 本, 出す]\n", | |
"4 [深夜, 一人, バスケットボール, スリーポイント, 練習]\n", | |
"5 [プライベート, バスケットボール, 足, 怪我]\n", | |
"6 [星野源, 日本, テレビ, 界, 希望, 思う]\n", | |
"7 [藤井青銅, ピンク, ベスト, じゃない方, しゃべれる]\n", | |
"8 [mc, waka, 日本武道館, 横浜アリーナ, 人, 歌, ラップ, 茶々, 入れる]\n", | |
"9 [茶々, 名前, チワワ, 犬, 飼う]\n", | |
"10 [結婚, 直前, 浮気, ばれる]\n", | |
"11 [六本木, 社長, モンクレール, ダウン, もらう]\n", | |
"12 [ピンク, ベスト, 着る, 胸, 張る, トゥース, 大声, 叫ぶ]\n", | |
"13 [ピンク, セーター, 着る, 後輩, 芸人, すいません, ピンク, 着, もらう, 挨拶]\n", | |
"14 [漫才, ボケ, 担当, ラジオ, テレビ, ボケ, ない]\n", | |
"15 [普段, 靴下, 履く, ない, 足, 裏, 象, よう]\n", | |
"16 [バカリズム, 存在, 面白い, ウケる, スベる, ない]\n", | |
"17 [山里亮太, ツッコミ, 敵わ, ない, 思う]\n", | |
"18 [入船, 出身, 築地, 出身, 嘘, 地元, 人, お前, 入船, ツッコミ]\n", | |
"19 [ぼる塾, 人, トゥース, 掛け合い, 面白い]\n", | |
"20 [スベる, 芸風, スベる, 怖い, 思う]\n", | |
"Name: word, dtype: object" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df1 = df.groupby(\"docid\")[\"word\"].apply(list)\n", | |
"df1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "20658b0b", | |
"metadata": {}, | |
"source": [ | |
"####カラムや中身の調整" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "a9ee8a19", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>word</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>若槻千夏 幾つ テレビ 番組 司会 務める 本番 以外 人見知り 話す ない</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>漫才 ツッコミ 担当 たりないふたり ボケ 担当</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>ナナメ 夕暮れ 他 本 出す</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>深夜 一人 バスケットボール スリーポイント 練習</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>プライベート バスケットボール 足 怪我</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>6</td>\n", | |
" <td>星野源 日本 テレビ 界 希望 思う</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>7</td>\n", | |
" <td>藤井青銅 ピンク ベスト じゃない方 しゃべれる</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>8</td>\n", | |
" <td>mc waka 日本武道館 横浜アリーナ 人 歌 ラップ 茶々 入れる</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>9</td>\n", | |
" <td>茶々 名前 チワワ 犬 飼う</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>10</td>\n", | |
" <td>結婚 直前 浮気 ばれる</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>11</td>\n", | |
" <td>六本木 社長 モンクレール ダウン もらう</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>12</td>\n", | |
" <td>ピンク ベスト 着る 胸 張る トゥース 大声 叫ぶ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>13</td>\n", | |
" <td>ピンク セーター 着る 後輩 芸人 すいません ピンク 着 もらう 挨拶</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>14</td>\n", | |
" <td>漫才 ボケ 担当 ラジオ テレビ ボケ ない</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>15</td>\n", | |
" <td>普段 靴下 履く ない 足 裏 象 よう</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>16</td>\n", | |
" <td>バカリズム 存在 面白い ウケる スベる ない</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>17</td>\n", | |
" <td>山里亮太 ツッコミ 敵わ ない 思う</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>18</td>\n", | |
" <td>入船 出身 築地 出身 嘘 地元 人 お前 入船 ツッコミ</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>19</td>\n", | |
" <td>ぼる塾 人 トゥース 掛け合い 面白い</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>20</td>\n", | |
" <td>スベる 芸風 スベる 怖い 思う</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid word\n", | |
"0 1 若槻千夏 幾つ テレビ 番組 司会 務める 本番 以外 人見知り 話す ない\n", | |
"1 2 漫才 ツッコミ 担当 たりないふたり ボケ 担当\n", | |
"2 3 ナナメ 夕暮れ 他 本 出す\n", | |
"3 4 深夜 一人 バスケットボール スリーポイント 練習\n", | |
"4 5 プライベート バスケットボール 足 怪我\n", | |
"5 6 星野源 日本 テレビ 界 希望 思う\n", | |
"6 7 藤井青銅 ピンク ベスト じゃない方 しゃべれる\n", | |
"7 8 mc waka 日本武道館 横浜アリーナ 人 歌 ラップ 茶々 入れる\n", | |
"8 9 茶々 名前 チワワ 犬 飼う\n", | |
"9 10 結婚 直前 浮気 ばれる\n", | |
"10 11 六本木 社長 モンクレール ダウン もらう\n", | |
"11 12 ピンク ベスト 着る 胸 張る トゥース 大声 叫ぶ\n", | |
"12 13 ピンク セーター 着る 後輩 芸人 すいません ピンク 着 もらう 挨拶\n", | |
"13 14 漫才 ボケ 担当 ラジオ テレビ ボケ ない\n", | |
"14 15 普段 靴下 履く ない 足 裏 象 よう\n", | |
"15 16 バカリズム 存在 面白い ウケる スベる ない\n", | |
"16 17 山里亮太 ツッコミ 敵わ ない 思う\n", | |
"17 18 入船 出身 築地 出身 嘘 地元 人 お前 入船 ツッコミ\n", | |
"18 19 ぼる塾 人 トゥース 掛け合い 面白い\n", | |
"19 20 スベる 芸風 スベる 怖い 思う" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df1 = pd.DataFrame(df.groupby(\"docid\")[\"word\"].apply(list))\n", | |
"df1.reset_index(inplace=True)\n", | |
"df1['word'] = df1['word'].astype(str)\n", | |
"df1['word'] = df1['word'].str.replace('[','', regex=True)\n", | |
"df1['word'] = df1['word'].str.replace(']','', regex=True)\n", | |
"df1['word'] = df1['word'].str.replace(',','', regex=True)\n", | |
"df1['word'] = df1['word'].str.replace(\"'\",'', regex=True)\n", | |
"df1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "5a2511ad", | |
"metadata": {}, | |
"source": [ | |
"####不要列を削除してgroup by " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "90b93bf2", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>cat</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>6</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>7</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>8</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>9</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>10</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>11</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>12</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>13</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>14</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>15</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>16</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>17</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>18</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>19</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>20</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid cat\n", | |
"0 1 若林\n", | |
"1 2 若林\n", | |
"2 3 若林\n", | |
"3 4 若林\n", | |
"4 5 若林\n", | |
"5 6 若林\n", | |
"6 7 若林\n", | |
"7 8 若林\n", | |
"8 9 春日\n", | |
"9 10 春日\n", | |
"10 11 春日\n", | |
"11 12 春日\n", | |
"12 13 春日\n", | |
"13 14 春日\n", | |
"14 15 春日\n", | |
"15 16 春日\n", | |
"16 17 若林\n", | |
"17 18 若林\n", | |
"18 19 春日\n", | |
"19 20 春日" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df2 = df[['docid','cat']]\n", | |
"df2 = pd.DataFrame(df2.groupby(['docid','cat']).size())\n", | |
"df2.reset_index(inplace=True)\n", | |
"df2 = df2[['docid','cat']]\n", | |
"df2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a0e693e5", | |
"metadata": {}, | |
"source": [ | |
"####結合する" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "385199a4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>word</th>\n", | |
" <th>cat</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>若槻千夏 幾つ テレビ 番組 司会 務める 本番 以外 人見知り 話す ない</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>漫才 ツッコミ 担当 たりないふたり ボケ 担当</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>ナナメ 夕暮れ 他 本 出す</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>深夜 一人 バスケットボール スリーポイント 練習</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>プライベート バスケットボール 足 怪我</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>6</td>\n", | |
" <td>星野源 日本 テレビ 界 希望 思う</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>7</td>\n", | |
" <td>藤井青銅 ピンク ベスト じゃない方 しゃべれる</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>8</td>\n", | |
" <td>mc waka 日本武道館 横浜アリーナ 人 歌 ラップ 茶々 入れる</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>9</td>\n", | |
" <td>茶々 名前 チワワ 犬 飼う</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>10</td>\n", | |
" <td>結婚 直前 浮気 ばれる</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>11</td>\n", | |
" <td>六本木 社長 モンクレール ダウン もらう</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>12</td>\n", | |
" <td>ピンク ベスト 着る 胸 張る トゥース 大声 叫ぶ</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>13</td>\n", | |
" <td>ピンク セーター 着る 後輩 芸人 すいません ピンク 着 もらう 挨拶</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>14</td>\n", | |
" <td>漫才 ボケ 担当 ラジオ テレビ ボケ ない</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>15</td>\n", | |
" <td>普段 靴下 履く ない 足 裏 象 よう</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>16</td>\n", | |
" <td>バカリズム 存在 面白い ウケる スベる ない</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>17</td>\n", | |
" <td>山里亮太 ツッコミ 敵わ ない 思う</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>18</td>\n", | |
" <td>入船 出身 築地 出身 嘘 地元 人 お前 入船 ツッコミ</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>19</td>\n", | |
" <td>ぼる塾 人 トゥース 掛け合い 面白い</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>20</td>\n", | |
" <td>スベる 芸風 スベる 怖い 思う</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid word cat\n", | |
"0 1 若槻千夏 幾つ テレビ 番組 司会 務める 本番 以外 人見知り 話す ない 若林\n", | |
"1 2 漫才 ツッコミ 担当 たりないふたり ボケ 担当 若林\n", | |
"2 3 ナナメ 夕暮れ 他 本 出す 若林\n", | |
"3 4 深夜 一人 バスケットボール スリーポイント 練習 若林\n", | |
"4 5 プライベート バスケットボール 足 怪我 若林\n", | |
"5 6 星野源 日本 テレビ 界 希望 思う 若林\n", | |
"6 7 藤井青銅 ピンク ベスト じゃない方 しゃべれる 若林\n", | |
"7 8 mc waka 日本武道館 横浜アリーナ 人 歌 ラップ 茶々 入れる 若林\n", | |
"8 9 茶々 名前 チワワ 犬 飼う 春日\n", | |
"9 10 結婚 直前 浮気 ばれる 春日\n", | |
"10 11 六本木 社長 モンクレール ダウン もらう 春日\n", | |
"11 12 ピンク ベスト 着る 胸 張る トゥース 大声 叫ぶ 春日\n", | |
"12 13 ピンク セーター 着る 後輩 芸人 すいません ピンク 着 もらう 挨拶 春日\n", | |
"13 14 漫才 ボケ 担当 ラジオ テレビ ボケ ない 春日\n", | |
"14 15 普段 靴下 履く ない 足 裏 象 よう 春日\n", | |
"15 16 バカリズム 存在 面白い ウケる スベる ない 春日\n", | |
"16 17 山里亮太 ツッコミ 敵わ ない 思う 若林\n", | |
"17 18 入船 出身 築地 出身 嘘 地元 人 お前 入船 ツッコミ 若林\n", | |
"18 19 ぼる塾 人 トゥース 掛け合い 面白い 春日\n", | |
"19 20 スベる 芸風 スベる 怖い 思う 春日" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df3 = pd.merge(df1, df2, on='docid', how='inner') \n", | |
"df3 " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "900219e3", | |
"metadata": {}, | |
"source": [ | |
"####最大文字数を確認" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "0d4e6e8e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"38" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"max(map(len, df3['word']))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "fe17a6a0", | |
"metadata": {}, | |
"source": [ | |
"####空テーブル作成" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "3a26f23d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" x1 = pd.read_sql(\"\"\"\n", | |
" create multiset table jumbo.aud11_denorm (\n", | |
" docid integer, \n", | |
" word varchar(100) character set unicode, \n", | |
" cat varchar(10) character set unicode \n", | |
" ) primary index (docid) \n", | |
" \"\"\", conn)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c391efa8", | |
"metadata": {}, | |
"source": [ | |
"####データの格納" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "851303c6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df3.to_sql('aud11_denorm',engine,if_exists='append',index=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "fcc8dabb", | |
"metadata": {}, | |
"source": [ | |
"####格納を確認" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "2783eacd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>docid</th>\n", | |
" <th>word</th>\n", | |
" <th>cat</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>若槻千夏 幾つ テレビ 番組 司会 務める 本番 以外 人見知り 話す ない</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>漫才 ツッコミ 担当 たりないふたり ボケ 担当</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>ナナメ 夕暮れ 他 本 出す</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>深夜 一人 バスケットボール スリーポイント 練習</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>プライベート バスケットボール 足 怪我</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>6</td>\n", | |
" <td>星野源 日本 テレビ 界 希望 思う</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>7</td>\n", | |
" <td>藤井青銅 ピンク ベスト じゃない方 しゃべれる</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>8</td>\n", | |
" <td>mc waka 日本武道館 横浜アリーナ 人 歌 ラップ 茶々 入れる</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>9</td>\n", | |
" <td>茶々 名前 チワワ 犬 飼う</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>10</td>\n", | |
" <td>結婚 直前 浮気 ばれる</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>11</td>\n", | |
" <td>六本木 社長 モンクレール ダウン もらう</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>12</td>\n", | |
" <td>ピンク ベスト 着る 胸 張る トゥース 大声 叫ぶ</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>13</td>\n", | |
" <td>ピンク セーター 着る 後輩 芸人 すいません ピンク 着 もらう 挨拶</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>14</td>\n", | |
" <td>漫才 ボケ 担当 ラジオ テレビ ボケ ない</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>15</td>\n", | |
" <td>普段 靴下 履く ない 足 裏 象 よう</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>16</td>\n", | |
" <td>バカリズム 存在 面白い ウケる スベる ない</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>17</td>\n", | |
" <td>山里亮太 ツッコミ 敵わ ない 思う</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>18</td>\n", | |
" <td>入船 出身 築地 出身 嘘 地元 人 お前 入船 ツッコミ</td>\n", | |
" <td>若林</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>19</td>\n", | |
" <td>ぼる塾 人 トゥース 掛け合い 面白い</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>20</td>\n", | |
" <td>スベる 芸風 スベる 怖い 思う</td>\n", | |
" <td>春日</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" docid word cat\n", | |
"0 1 若槻千夏 幾つ テレビ 番組 司会 務める 本番 以外 人見知り 話す ない 若林\n", | |
"1 2 漫才 ツッコミ 担当 たりないふたり ボケ 担当 若林\n", | |
"2 3 ナナメ 夕暮れ 他 本 出す 若林\n", | |
"3 4 深夜 一人 バスケットボール スリーポイント 練習 若林\n", | |
"4 5 プライベート バスケットボール 足 怪我 若林\n", | |
"5 6 星野源 日本 テレビ 界 希望 思う 若林\n", | |
"6 7 藤井青銅 ピンク ベスト じゃない方 しゃべれる 若林\n", | |
"7 8 mc waka 日本武道館 横浜アリーナ 人 歌 ラップ 茶々 入れる 若林\n", | |
"8 9 茶々 名前 チワワ 犬 飼う 春日\n", | |
"9 10 結婚 直前 浮気 ばれる 春日\n", | |
"10 11 六本木 社長 モンクレール ダウン もらう 春日\n", | |
"11 12 ピンク ベスト 着る 胸 張る トゥース 大声 叫ぶ 春日\n", | |
"12 13 ピンク セーター 着る 後輩 芸人 すいません ピンク 着 もらう 挨拶 春日\n", | |
"13 14 漫才 ボケ 担当 ラジオ テレビ ボケ ない 春日\n", | |
"14 15 普段 靴下 履く ない 足 裏 象 よう 春日\n", | |
"15 16 バカリズム 存在 面白い ウケる スベる ない 春日\n", | |
"16 17 山里亮太 ツッコミ 敵わ ない 思う 若林\n", | |
"17 18 入船 出身 築地 出身 嘘 地元 人 お前 入船 ツッコミ 若林\n", | |
"18 19 ぼる塾 人 トゥース 掛け合い 面白い 春日\n", | |
"19 20 スベる 芸風 スベる 怖い 思う 春日" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"with engine.connect() as conn:\n", | |
" x1 = pd.read_sql(\"\"\"\n", | |
" select * from jumbo.aud11_denorm order by 1 \n", | |
" \"\"\", conn)\n", | |
"x1" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment