Last active
March 29, 2018 08:43
-
-
Save nakagami/3b977dec929e961d678eb193e4017690 to your computer and use it in GitHub Desktop.
青空文庫の「吾輩は猫である」を形態素解析して4文字以上の名詞の出現頻度の高い順にベスト30を表示
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections
import io
import re
import zipfile

import requests
from janome.tokenizer import Tokenizer

# Download the zipped text from Aozora Bunko and fail early with a clear HTTP
# error instead of letting a non-zip error page reach ZipFile.
r = requests.get('http://www.aozora.gr.jp/cards/000148/files/789_ruby_5639.zip')
r.raise_for_status()

# The archive member is decoded as cp932; the context managers make sure both
# the archive and the member file handle are closed after reading.
with zipfile.ZipFile(io.BytesIO(r.content)) as archive:
    with archive.open('wagahaiwa_nekodearu.txt') as f:
        text = f.read().decode('cp932')

# Strip markup before tokenizing. The markers are FULL-WIDTH characters; their
# ASCII look-alikes are regex metacharacters ('|' is an empty alternation and
# '[.+?]' is a character class) and would not remove the markup at all.
text = re.sub('《[^》]+》', '', text)   # ruby readings: 《...》
text = text.replace('｜', '')           # marker placed before a ruby base
text = re.sub('［＃.+?］', '', text)    # bracketed editor notes: ［＃...］
# Drop everything between the first and the last run of dashes. The match is
# intentionally greedy so that an entire dashed separator line is consumed;
# this presumably removes the notation legend at the top of the file.
text = re.sub(r'-----[\s\S]*-----', '', text)
# Keep only the part before the '底本：' marker (full-width colon), which
# presumably starts the colophon at the end of the file.
text = text.split('底本：')[0]

# Count base forms of tokens whose top-level part of speech is noun ('名詞')
# and whose base form is longer than three characters.
t = Tokenizer()
wordcount = collections.Counter(
    token.base_form
    for token in t.tokenize(text)
    if token.part_of_speech.split(',')[0] == '名詞' and len(token.base_form) > 3
)

# Print the 30 most frequent entries as (word, count) tuples.
for e in wordcount.most_common(30):
    print(e)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment