Skip to content

Instantly share code, notes, and snippets.

@yFGq-Ziw
Last active July 17, 2019 11:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yFGq-Ziw/3f1c9170885a2422bcc830b209c7d5a7 to your computer and use it in GitHub Desktop.
Save yFGq-Ziw/3f1c9170885a2422bcc830b209c7d5a7 to your computer and use it in GitHub Desktop.
from janome.tokenizer import Tokenizer

# Read the whole tweet dump as UTF-8 text.
# `with` guarantees the file handle is closed even if read() raises.
with open("tweets.txt", "rt", encoding='utf-8') as book:
    text = book.read()

# Morphological-analysis tokenizer (janome) for Japanese text.
tok = Tokenizer()

# Count occurrences of each noun, processing the text line by line.
# NOTE(review): split on "\r\n" assumes Windows line endings in tweets.txt —
# confirm; lines joined by bare "\n" would be tokenized as one chunk.
word_dic = {}
for line in text.split("\r\n"):
    for w in tok.tokenize(line):
        word = w.surface
        ps = w.part_of_speech  # part-of-speech string, e.g. "名詞,一般,..."
        if ps.find('名詞') < 0:
            continue  # count nouns ("名詞") only
        # get() folds the "first occurrence" init and the increment into one step.
        word_dic[word] = word_dic.get(word, 0) + 1

# Write the 100 most frequent words, one "word<TAB>count" per line, descending.
keys = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)
with open(r"result.txt", "w", encoding='utf-8') as fp:
    for word, cnt in keys[:100]:
        fp.write("{0}\t{1}\n".format(word, cnt))
print("完了デス!∠(・`_´・ )")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment