Last active
July 17, 2019 11:36
-
-
Save yFGq-Ziw/3f1c9170885a2422bcc830b209c7d5a7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter

from janome.tokenizer import Tokenizer
# Read the whole tweet dump as UTF-8. The context manager closes the file
# even if reading raises.
with open("tweets.txt", "rt", encoding='utf-8') as book:
    text = book.read()

# Create the morphological analyzer once and reuse it for every line.
tok = Tokenizer()

# Process the text one line at a time, counting noun surface forms.
# splitlines() is used because text-mode reading already translates "\r\n"
# to "\n", so splitting on "\r\n" would never split anything.
word_dic = Counter()
lines = text.splitlines()
for line in lines:
    for w in tok.tokenize(line):
        # part_of_speech is a comma-separated string whose first field is the
        # major category. Checking the prefix keeps nouns only and avoids
        # matching other categories whose sub-fields merely contain "名詞"
        # (e.g. "接頭詞,名詞接続").
        if w.part_of_speech.startswith('名詞'):
            word_dic[w.surface] += 1

# Rank words by frequency, most frequent first; ties keep first-seen order.
keys = word_dic.most_common()

# Write the top 100 entries as "word<TAB>count", one per line.
with open(r"result.txt", "w", encoding='utf-8') as fp:
    for word, cnt in keys[:100]:
        fp.write("{0}\t{1}\n".format(word, cnt))

print("完了デス!∠(・`_´・ )")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment