Created
July 9, 2020 11:15
-
-
Save jshirius/08bf2916b133fa9fb124a33c79ac8f31 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# 共起語作成サンプルコード" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#別途JanomeDataSetクラスを以下のURLからダウンロードして設定してください。\n", | |
"#https://github.com/jshirius/nlp_tools/blob/master/janome_data_set.py\n", | |
"from janome_data_set import JanomeDataSet\n", | |
"\n", | |
"\n", | |
"import pyfpgrowth\n", | |
"\n", | |
"#ページ単位で文章を設定する\n", | |
"page_datas = []\n", | |
"\n", | |
"\n", | |
"sentence = \"毎週月曜日は、一週間の中で最も憂鬱な曜日である\"\n", | |
"page_datas.append(sentence)\n", | |
"\n", | |
"sentence = \"毎週火曜日は、一週間の中でちょっと憂鬱な曜日である\"\n", | |
"page_datas.append(sentence)\n", | |
"\n", | |
"sentence = \"毎週水曜日は、一週間の中で中間くらいの気分である\"\n", | |
"page_datas.append(sentence)\n", | |
"\n", | |
"sentence = \"毎週木曜日は、一週間の中でいちばん疲れる一日である\"\n", | |
"page_datas.append(sentence)\n", | |
"\n", | |
"sentence = \"毎週金曜日は、一週間の中でいちばんテンションが高い日である\"\n", | |
"page_datas.append(sentence)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#共起語用に形態素分析する\n", | |
"documents=[]\n", | |
"\n", | |
"#形態素に分けるときは、neologdを使ったほうが良い\n", | |
"morpheme_janome = JanomeDataSet()\n", | |
"#morpheme_janome = JanomeDataSet('neologd')\n", | |
"for t in page_datas:\n", | |
" #形態素処理\n", | |
" data = morpheme_janome.text_morpheme(t,\"名詞\")\n", | |
" if(len(data) == 0):\n", | |
" continue\n", | |
" #documents.extend(data)\n", | |
" documents.append(data)\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#共起語を作成する\n", | |
"patterns = pyfpgrowth.find_frequent_patterns(documents, 2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{('憂鬱',): 2,\n", | |
" ('中', '憂鬱'): 2,\n", | |
" ('一週間', '憂鬱'): 2,\n", | |
" ('憂鬱', '毎週'): 2,\n", | |
" ('一週間', '中', '憂鬱'): 2,\n", | |
" ('中', '憂鬱', '毎週'): 2,\n", | |
" ('一週間', '憂鬱', '毎週'): 2,\n", | |
" ('一週間', '中', '憂鬱', '毎週'): 2,\n", | |
" ('曜日',): 2,\n", | |
" ('憂鬱', '曜日'): 2,\n", | |
" ('中', '曜日'): 2,\n", | |
" ('一週間', '曜日'): 2,\n", | |
" ('曜日', '毎週'): 2,\n", | |
" ('中', '憂鬱', '曜日'): 2,\n", | |
" ('一週間', '憂鬱', '曜日'): 2,\n", | |
" ('憂鬱', '曜日', '毎週'): 2,\n", | |
" ('一週間', '中', '曜日'): 2,\n", | |
" ('中', '曜日', '毎週'): 2,\n", | |
" ('一週間', '曜日', '毎週'): 2,\n", | |
" ('一週間', '中', '憂鬱', '曜日'): 2,\n", | |
" ('中', '憂鬱', '曜日', '毎週'): 2,\n", | |
" ('一週間', '憂鬱', '曜日', '毎週'): 2,\n", | |
" ('一週間', '中', '曜日', '毎週'): 2,\n", | |
" ('一週間', '中', '憂鬱', '曜日', '毎週'): 2,\n", | |
" ('いちばん',): 2,\n", | |
" ('いちばん', '中'): 2,\n", | |
" ('いちばん', '一週間'): 2,\n", | |
" ('いちばん', '毎週'): 2,\n", | |
" ('いちばん', '一週間', '中'): 2,\n", | |
" ('いちばん', '中', '毎週'): 2,\n", | |
" ('いちばん', '一週間', '毎週'): 2,\n", | |
" ('いちばん', '一週間', '中', '毎週'): 2,\n", | |
" ('日',): 2,\n", | |
" ('いちばん', '日'): 2,\n", | |
" ('中', '日'): 2,\n", | |
" ('一週間', '日'): 2,\n", | |
" ('日', '毎週'): 2,\n", | |
" ('いちばん', '中', '日'): 2,\n", | |
" ('いちばん', '一週間', '日'): 2,\n", | |
" ('いちばん', '日', '毎週'): 2,\n", | |
" ('一週間', '中', '日'): 2,\n", | |
" ('中', '日', '毎週'): 2,\n", | |
" ('一週間', '日', '毎週'): 2,\n", | |
" ('いちばん', '一週間', '中', '日'): 2,\n", | |
" ('いちばん', '中', '日', '毎週'): 2,\n", | |
" ('いちばん', '一週間', '日', '毎週'): 2,\n", | |
" ('一週間', '中', '日', '毎週'): 2,\n", | |
" ('いちばん', '一週間', '中', '日', '毎週'): 2,\n", | |
" ('毎週',): 5,\n", | |
" ('一週間',): 5,\n", | |
" ('一週間', '毎週'): 5,\n", | |
" ('中',): 5,\n", | |
" ('一週間', '中'): 5,\n", | |
" ('中', '毎週'): 5,\n", | |
" ('一週間', '中', '毎週'): 5}" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"patterns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[['毎週', '月曜日', '一週間', '中', '憂鬱', '曜日'],\n", | |
" ['毎週', '火曜日', '一週間', '中', '憂鬱', '曜日'],\n", | |
" ['毎週', '水曜日', '一週間', '中', '中間', '気分'],\n", | |
" ['毎週', '木曜日', '一週間', '中', 'いちばん', '一', '日'],\n", | |
" ['毎週', '金曜日', '一週間', '中', 'いちばん', 'テンション', '日']]" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#デバッグ用のログ\n", | |
"documents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment