Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jshirius/b60abbb0e566a75ef6f5c51a2e269cd0 to your computer and use it in GitHub Desktop.
Save jshirius/b60abbb0e566a75ef6f5c51a2e269cd0 to your computer and use it in GitHub Desktop.
共起語ライブラリ(pyfpgrowth)・networkxを使って共起語のネットワークを作る
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 共起語ライブラリ(pyfpgrowth)・networkxを使って共起語のネットワークを作る"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Requires the JanomeDataSet helper class: download janome_data_set.py from\n",
"# https://github.com/jshirius/nlp_tools/blob/master/janome_data_set.py\n",
"# and place it where this notebook can import it.\n",
"import os\n",
"\n",
"from janome_data_set import JanomeDataSet\n",
"import twitter\n",
"\n",
"import pyfpgrowth\n",
"\n",
"\n",
"# The four credentials needed to access the Twitter API (issued by Twitter).\n",
"# They are read from environment variables so that secrets are not saved in\n",
"# the notebook; each one falls back to '' (the original placeholder) when\n",
"# the variable is not set.\n",
"CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '')\n",
"CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '')\n",
"ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', '')\n",
"ACCESS_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')\n",
"\n",
"# Create the Twitter API client.\n",
"api = twitter.Api(consumer_key=CONSUMER_KEY,\n",
"                  consumer_secret=CONSUMER_SECRET,\n",
"                  access_token_key=ACCESS_TOKEN,\n",
"                  access_token_secret=ACCESS_TOKEN_SECRET)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch search results from Twitter.\n",
"from urllib.parse import urlencode\n",
"query = urlencode({\n",
"    'q': 'プログラミングスクール',  # search keyword\n",
"    'result_type': 'recent',  # recent/popular/mixed\n",
"    'count': 100  # number of tweets to fetch (100 is the maximum)\n",
"    # 'max_id': can be used to fetch older tweets\n",
"})\n",
"\n",
"# Screen names of users whose tweets are ignored,\n",
"# for example to drop promotional accounts.\n",
"exclude_users = []\n",
"\n",
"result = api.GetSearch(raw_query=query)\n",
"text_list = []  # not used in this notebook; kept so the name stays defined\n",
"# Tweet texts kept for the analysis, one entry per accepted tweet.\n",
"page_datas = []\n",
"\n",
"for status in result:\n",
"    # Skip tweets posted by an excluded user (no effect while the list is empty).\n",
"    if status.user.screen_name in exclude_users:\n",
"        continue\n",
"\n",
"    # Skip tweets whose text contains a link.\n",
"    if 'https:' in status.text:\n",
"        continue\n",
"\n",
"    print('-' * 50)\n",
"    print(status.id)\n",
"    print(status.created_at)\n",
"\n",
"    print(status.user.screen_name)\n",
"    print(status.user.name)\n",
"    print(status.text)\n",
"\n",
"    page_datas.append(status.text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run morphological analysis on each text to prepare the co-occurrence input.\n",
"documents = []\n",
"\n",
"# The neologd dictionary is recommended for splitting text into morphemes.\n",
"morpheme_janome = JanomeDataSet('neologd')\n",
"for tweet_text in page_datas:\n",
"    # '名詞' (noun) is passed as the second argument; presumably only nouns\n",
"    # are returned - confirm in janome_data_set.py.\n",
"    tokens = morpheme_janome.text_morpheme(tweet_text, '名詞')\n",
"    # Keep one token list per text; texts that yield no tokens are dropped.\n",
"    if len(tokens) != 0:\n",
"        documents.append(tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mine the co-occurring word sets from the tokenized documents.\n",
"# Support threshold handed to pyfpgrowth as its second argument.\n",
"MIN_SUPPORT = 5\n",
"patterns = pyfpgrowth.find_frequent_patterns(documents, MIN_SUPPORT)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"patterns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#デバッグ用のログ\n",
"documents"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ここから共起語をnetworkxを使ってネットワーク図を作ってみる"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the mapping between keywords and integer node ids.\n",
"# Collect every keyword that appears in any pattern key, without duplicates.\n",
"uniqu_list = list({word for keys in patterns for word in keys})\n",
"\n",
"print('キーワードをユニークにする')\n",
"print(uniqu_list)\n",
"\n",
"# Keyword -> id (used when adding edges) and id -> keyword (used for labels).\n",
"dict_label = {word: idx for idx, word in enumerate(uniqu_list)}\n",
"int_dict_label = dict(enumerate(uniqu_list))\n",
"\n",
"# Mapping complete.\n",
"print(dict_label)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Draw the co-occurrence network with networkx.\n",
"import matplotlib.pyplot as plt\n",
"import networkx as nx\n",
"import matplotlib.font_manager  # used to load a Japanese-capable font\n",
"\n",
"# Font used for the node labels so that Japanese text renders.\n",
"# Change this path to a Japanese-capable font file on your machine.\n",
"font_path = r'/Users/develop/python/kaggle/Osaka.ttf'\n",
"font_prop = matplotlib.font_manager.FontProperties(fname=font_path)\n",
"\n",
"# Start from an empty undirected graph.\n",
"G = nx.Graph()\n",
"\n",
"# Link consecutive keywords of every pattern key.\n",
"# A key holding a single keyword yields no pair and therefore adds nothing.\n",
"for keys in patterns:\n",
"    for left, right in zip(keys, keys[1:]):\n",
"        G.add_edge(dict_label[left], dict_label[right])\n",
"\n",
"pos = nx.spring_layout(G)\n",
"nx.draw(G, pos, node_color='#A0CBE2', width=1, edge_cmap=plt.cm.Blues, with_labels=False)\n",
"\n",
"# Label only the nodes that are in the graph. int_dict_label also contains\n",
"# keywords that never received an edge; they have no entry in pos, and\n",
"# draw_networkx_labels raises KeyError for a label key missing from pos.\n",
"node_labels = {node: int_dict_label[node] for node in G.nodes()}\n",
"datas = nx.draw_networkx_labels(G, pos, node_labels, font_size=16)\n",
"\n",
"# Apply the Japanese-capable font to every label text object.\n",
"for t in datas.values():\n",
"    t.set_fontproperties(font_prop)\n",
"\n",
"plt.show()\n",
"# Original author's note: continue as far as the 'center' step\n",
"# (possibly centrality analysis) - intent unclear."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment