Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jshirius/08bf2916b133fa9fb124a33c79ac8f31 to your computer and use it in GitHub Desktop.
Save jshirius/08bf2916b133fa9fb124a33c79ac8f31 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 共起語作成サンプルコード"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pyfpgrowth\n",
"\n",
"# JanomeDataSet must be downloaded separately from the URL below and placed\n",
"# where this notebook can import it.\n",
"# https://github.com/jshirius/nlp_tools/blob/master/janome_data_set.py\n",
"from janome_data_set import JanomeDataSet\n",
"\n",
"# Input text, one entry per page.\n",
"page_datas = [\n",
"    \"毎週月曜日は、一週間の中で最も憂鬱な曜日である\",\n",
"    \"毎週火曜日は、一週間の中でちょっと憂鬱な曜日である\",\n",
"    \"毎週水曜日は、一週間の中で中間くらいの気分である\",\n",
"    \"毎週木曜日は、一週間の中でいちばん疲れる一日である\",\n",
"    \"毎週金曜日は、一週間の中でいちばんテンションが高い日である\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Morphological analysis for co-occurrence mining: each page is turned into\n",
"# its own list of tokens.\n",
"# The default dictionary is used here; the original author recommends the\n",
"# neologd dictionary for splitting text into morphemes. To switch:\n",
"#   morpheme_janome = JanomeDataSet('neologd')\n",
"morpheme_janome = JanomeDataSet()\n",
"\n",
"documents = []\n",
"for page_text in page_datas:\n",
"    # \"名詞\" means noun; presumably a part-of-speech filter (confirm in janome_data_set.py).\n",
"    nouns = morpheme_janome.text_morpheme(page_text, \"名詞\")\n",
"    # Pages that yield no tokens are skipped.\n",
"    if not nouns:\n",
"        continue\n",
"    # append (not extend): every page must remain a separate token list.\n",
"    documents.append(nouns)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Build the co-occurring word sets with FP-growth.\n",
"# MIN_SUPPORT is the support threshold handed to pyfpgrowth: with a value of 2,\n",
"# the result keeps word sets that occur in at least 2 documents.\n",
"MIN_SUPPORT = 2\n",
"patterns = pyfpgrowth.find_frequent_patterns(documents, MIN_SUPPORT)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{('憂鬱',): 2,\n",
" ('中', '憂鬱'): 2,\n",
" ('一週間', '憂鬱'): 2,\n",
" ('憂鬱', '毎週'): 2,\n",
" ('一週間', '中', '憂鬱'): 2,\n",
" ('中', '憂鬱', '毎週'): 2,\n",
" ('一週間', '憂鬱', '毎週'): 2,\n",
" ('一週間', '中', '憂鬱', '毎週'): 2,\n",
" ('曜日',): 2,\n",
" ('憂鬱', '曜日'): 2,\n",
" ('中', '曜日'): 2,\n",
" ('一週間', '曜日'): 2,\n",
" ('曜日', '毎週'): 2,\n",
" ('中', '憂鬱', '曜日'): 2,\n",
" ('一週間', '憂鬱', '曜日'): 2,\n",
" ('憂鬱', '曜日', '毎週'): 2,\n",
" ('一週間', '中', '曜日'): 2,\n",
" ('中', '曜日', '毎週'): 2,\n",
" ('一週間', '曜日', '毎週'): 2,\n",
" ('一週間', '中', '憂鬱', '曜日'): 2,\n",
" ('中', '憂鬱', '曜日', '毎週'): 2,\n",
" ('一週間', '憂鬱', '曜日', '毎週'): 2,\n",
" ('一週間', '中', '曜日', '毎週'): 2,\n",
" ('一週間', '中', '憂鬱', '曜日', '毎週'): 2,\n",
" ('いちばん',): 2,\n",
" ('いちばん', '中'): 2,\n",
" ('いちばん', '一週間'): 2,\n",
" ('いちばん', '毎週'): 2,\n",
" ('いちばん', '一週間', '中'): 2,\n",
" ('いちばん', '中', '毎週'): 2,\n",
" ('いちばん', '一週間', '毎週'): 2,\n",
" ('いちばん', '一週間', '中', '毎週'): 2,\n",
" ('日',): 2,\n",
" ('いちばん', '日'): 2,\n",
" ('中', '日'): 2,\n",
" ('一週間', '日'): 2,\n",
" ('日', '毎週'): 2,\n",
" ('いちばん', '中', '日'): 2,\n",
" ('いちばん', '一週間', '日'): 2,\n",
" ('いちばん', '日', '毎週'): 2,\n",
" ('一週間', '中', '日'): 2,\n",
" ('中', '日', '毎週'): 2,\n",
" ('一週間', '日', '毎週'): 2,\n",
" ('いちばん', '一週間', '中', '日'): 2,\n",
" ('いちばん', '中', '日', '毎週'): 2,\n",
" ('いちばん', '一週間', '日', '毎週'): 2,\n",
" ('一週間', '中', '日', '毎週'): 2,\n",
" ('いちばん', '一週間', '中', '日', '毎週'): 2,\n",
" ('毎週',): 5,\n",
" ('一週間',): 5,\n",
" ('一週間', '毎週'): 5,\n",
" ('中',): 5,\n",
" ('一週間', '中'): 5,\n",
" ('中', '毎週'): 5,\n",
" ('一週間', '中', '毎週'): 5}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patterns"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['毎週', '月曜日', '一週間', '中', '憂鬱', '曜日'],\n",
" ['毎週', '火曜日', '一週間', '中', '憂鬱', '曜日'],\n",
" ['毎週', '水曜日', '一週間', '中', '中間', '気分'],\n",
" ['毎週', '木曜日', '一週間', '中', 'いちばん', '一', '日'],\n",
" ['毎週', '金曜日', '一週間', '中', 'いちばん', 'テンション', '日']]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#デバッグ用のログ\n",
"documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment