Created
January 30, 2021 04:34
-
-
Save jshirius/3769b9752c40038cd659d364b4261390 to your computer and use it in GitHub Desktop.
2021年版 pkeによるキーフレーズ処理
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# 2021年版 pkeによる日本語のキーフレーズ処理\n", | |
"\n", | |
"- 必要なライブラリ\n", | |
" - spacy\n", | |
" - ginza\n", | |
" - pke\n", | |
" - nltk\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import ginza # pip install ginza\n", | |
"import nltk # pip install nltk\n", | |
"import spacy\n", | |
"import pke #pip install git+https://github.com/boudinfl/pke.git\n", | |
"\n", | |
"# pke does not account for Japanese by default, so configure the stop words as follows.\n", | |
"# This line is required as of January 2021.\n", | |
"pke.base.lang_stopwords['ja_ginza'] = 'japanese'\n", | |
"\n", | |
"# Many sites show the assignment below instead, but as of January 2021 a pke change means ISO_to_language no longer exists.\n", | |
"#pke.base.ISO_to_language['ja_ginza'] = 'japanese'\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<spacy.lang.ja.Japanese object at 0x10cde2e10>\n" | |
] | |
} | |
], | |
"source": [ | |
"\n", | |
"def get_key_phrase(spacy_model, text, n = 5, pos = None, threshold = 0.74, method = 'average', alpha = 1.1):\n", | |
"    \"\"\"Extract key phrases from a text with pke's MultipartiteRank.\n", | |
"\n", | |
"    Args:\n", | |
"        spacy_model: Loaded spaCy pipeline, passed to pke's load_document.\n", | |
"        text (str): Text to extract key phrases from.\n", | |
"        n (int, optional): Number of key phrases requested from get_n_best. Defaults to 5.\n", | |
"        pos (set, optional): POS tags passed to candidate_selection.\n", | |
"            Defaults to {'NOUN', 'PROPN', 'ADJ', 'NUM'}.\n", | |
"        threshold (float, optional): Passed to candidate_weighting. Defaults to 0.74.\n", | |
"        method (str, optional): Passed to candidate_weighting. Defaults to 'average'.\n", | |
"        alpha (float, optional): Passed to candidate_weighting. Defaults to 1.1.\n", | |
"\n", | |
"    Returns:\n", | |
"        list: (phrase, score) tuples as returned by get_n_best; an empty list for blank text.\n", | |
"    \"\"\"\n", | |
"    # Blank input has no candidates to rank, so skip pke entirely.\n", | |
"    if not text or not text.strip():\n", | |
"        return []\n", | |
"    # Build the default here so callers never share one mutable set.\n", | |
"    if pos is None:\n", | |
"        pos = {'NOUN', 'PROPN', 'ADJ', 'NUM'}\n", | |
"\n", | |
"    # For background on key phrase methods such as MultipartiteRank, see:\n", | |
"    # https://www.ogis-ri.co.jp/otc/hiroba/technical/similar-document-search/part5.html\n", | |
"    extractor = pke.unsupervised.MultipartiteRank()\n", | |
"    extractor.load_document(input=text, language='ja_ginza', normalization=None, spacy_model=spacy_model)\n", | |
"    extractor.candidate_selection(pos=pos)\n", | |
"    extractor.candidate_weighting(threshold=threshold, method=method, alpha=alpha)\n", | |
"    return extractor.get_n_best(n)\n", | |
" \n", | |
" \n", | |
"# Use the stop words bundled with spaCy for Japanese.\n", | |
"from spacy.lang.ja import stop_words\n", | |
"\n", | |
"spacy_model = spacy.load(\"ja_ginza\") # load the GiNZA model\n", | |
"print(spacy_model)\n", | |
"stopwords = list(stop_words.STOP_WORDS)\n", | |
"# Save the original nltk function only once. If this cell is re-run, words_org must not be\n", | |
"# overwritten with the lambda below, or non-Japanese lookups would recurse forever.\n", | |
"if not hasattr(nltk.corpus.stopwords, 'words_org'):\n", | |
"    nltk.corpus.stopwords.words_org = nltk.corpus.stopwords.words\n", | |
"# Serve the spaCy list for 'japanese' and defer to the original nltk function otherwise.\n", | |
"nltk.corpus.stopwords.words = lambda lang : stopwords if lang == 'japanese' else nltk.corpus.stopwords.words_org(lang)\n", | |
" \n", | |
" \n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('ドラゴン ボール', 0.5), ('ジャンプ 作品', 0.5)]" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"get_key_phrase(spacy_model,\"ドラゴンボールは鳥山明の作品です。だから、今でも人気があるジャンプ作品です\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('スクエニ', 0.3962987032710885),\n", | |
" ('ゲームソフト', 0.3370938981808533),\n", | |
" ('ドラゴン クエスト', 0.2666073985480575)]" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"get_key_phrase(spacy_model,\"ドラゴンクエストは、誰の作品ですか?スクエニから発売されたゲームソフトです。\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('まとめ買い', 1.0)]" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"get_key_phrase(spacy_model,\"並んで、まとめ買いの人もいる\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('ストーリ', 0.39361741945785217),\n", | |
" ('ドラゴン クエスト', 0.30319129027107367),\n", | |
" ('当たり前', 0.30319129027107367)]" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"get_key_phrase(spacy_model,\"違法改造されたドラゴンクエストで遊んでいたら、途中からストーリが全く進まなくなりました(当たり前)\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"ドラゴンボール、ドラゴンクエスト、ゲームソフトは、ちゃんと意味的な繋がりを表現できているようだ。\n", | |
"また、ジャンプについても「ジャンプ」と「作品」でうまく繋がりを表現できているようだ。\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment