|
#-*- encoding: utf-8 -*- |
|
|
|
import glob |
|
import re |
|
import sys |
|
import MeCab |
|
|
|
# Options handed to MeCab: wakati output format, with the dictionary read
# from the mecab-ipadic-neologd directory given after -d.
MECAB_OPTIONS = "-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd"

# Shared tagger used to tokenize every lyrics file.
mecab = MeCab.Tagger(MECAB_OPTIONS)

# word -> number of files in which the word occurs at least once
# (document frequency).
df_dict = {}

# file name -> set of the distinct words extracted from that file.
song_dict = {}
|
|
|
# http://d.hatena.ne.jp/mohayonao/20101213/1292237816
def make_function_hiragana():
    """Build and return a function that converts katakana to hiragana.

    The returned function takes a string and returns a copy in which every
    katakana letter from U+30A1 (small a) through U+30F6 (small ke) is
    replaced by the hiragana letter 0x60 code points below it. Every other
    character (kanji, ASCII, the prolonged sound mark, the middle dot, ...)
    is left untouched.
    """
    # The hiragana and katakana blocks are laid out in parallel, exactly
    # 0x60 code points apart. The range runs past U+30F3 ("n") on purpose so
    # that "vu", small "ka" and small "ke" are converted too instead of
    # remaining katakana inside otherwise-hiragana output.
    table = {code: code - 0x60 for code in range(0x30A1, 0x30F7)}

    def hiragana(s):
        """Return *s* with its katakana letters replaced by hiragana."""
        return s.translate(table)

    return hiragana


hiragana = make_function_hiragana()
|
def make_function_katakana():
    """Build and return a function that converts hiragana to katakana.

    The returned function takes a string and returns a copy in which every
    hiragana letter from U+3041 (small a) through U+3096 (small ke) is
    replaced by the katakana letter 0x60 code points above it. Every other
    character is left untouched.
    """
    # Mirror image of the katakana-to-hiragana conversion: the two blocks
    # are 0x60 code points apart. The range runs past U+3093 ("n") on
    # purpose so that "vu", small "ka" and small "ke" are converted as well.
    table = {code: code + 0x60 for code in range(0x3041, 0x3097)}

    def katakana(s):
        """Return *s* with its hiragana letters replaced by katakana."""
        return s.translate(table)

    return katakana


katakana = make_function_katakana()
|
|
|
# Line breaks become spaces; parentheses, exclamation marks, question marks
# (ASCII and full-width forms) and the katakana middle dot are deleted.
# The non-ASCII characters are written as escapes so they cannot be confused
# with, or silently normalized into, their ASCII look-alikes.
_LYRICS_TABLE = str.maketrans(
    "\n\r", "  ", "()!?\uff08\uff09\uff01\uff1f\u30fb"
)
_WHITESPACE_RE = re.compile(r'\s+')

# Token made only of ASCII letters, digits and spaces.
_ASCII_WORD_RE = re.compile(r'^[A-Za-z0-9 ]+$')
# Any ASCII punctuation / symbol character.
_ASCII_SYMBOL_RE = re.compile(r'[!-/:-@\[-`{-~]')
# Token made only of kana.
_KANA_ONLY_RE = re.compile(r'^[ぁ-んァ-ン]+$')


def normalize_lyrics(text):
    """Return *text* with symbols and line breaks removed.

    Line breaks are turned into spaces, the punctuation listed in
    ``_LYRICS_TABLE`` is deleted, runs of whitespace are collapsed to one
    space and leading/trailing whitespace is stripped.
    """
    return _WHITESPACE_RE.sub(" ", text.translate(_LYRICS_TABLE)).strip()


def extract_words(node):
    """Walk a linked list of MeCab nodes and return the words to count.

    *node* is the first node of the chain (each node exposes ``surface``,
    ``feature`` and ``next``); the walk ends when ``next`` is falsy.
    Returns a list of words in token order, duplicates included.
    """
    words = []
    while node:
        surface = node.surface
        if _ASCII_WORD_RE.search(surface) is not None:
            # ASCII-only token: break it into lower-cased words. split()
            # without an argument never yields empty strings, so stray
            # spaces cannot add an empty "word" to the counts.
            words.extend(part.lower() for part in surface.split())
        elif _ASCII_SYMBOL_RE.search(surface) is None:
            # Tokens containing any ASCII symbol were skipped above;
            # everything else is converted to its reading when possible.
            # NOTE(review): field 7 of the feature string is presumably the
            # katakana reading (IPADIC layout) - confirm for the dictionary.
            meta = node.feature.split(",")
            if len(meta) >= 8 and meta[7] != "*":
                word = hiragana(meta[7])
            else:
                word = surface

            # Keep words of two or more characters, or a single character
            # that is not kana (e.g. a lone kanji).
            if len(word) >= 2 or (word and _KANA_ONLY_RE.search(word) is None):
                words.append(word)
        node = node.next
    return words


for filename in glob.glob('./*.txt'):
    # NOTE(review): the file is decoded with the platform default encoding,
    # exactly as before; pass encoding="utf-8" here if the lyrics files are
    # known to be UTF-8.
    with open(filename) as lyrics_file:
        data = normalize_lyrics(lyrics_file.read())

    node = mecab.parseToNode(data)
    unique_words = set(extract_words(node))

    # Document frequency: each word counts once per file.
    for word in unique_words:
        df_dict[word] = df_dict.get(word, 0) + 1

    song_dict[filename] = unique_words
|
|
|
# Document frequency ranking: sort the (word, count) pairs by ascending
# count, flip the result so the most widespread words come first, and keep
# at most the first 150 of them.
items = sorted(df_dict.items(), key=lambda entry: entry[1])
items = items[::-1][:150]

freq_words = {word for word, _ in items}

# For every song keep only the words that are in the frequent set, then
# order the songs from the most shared frequent words to the fewest
# (ascending sort followed by a reversal, as above).
freq_songs = [
    (name, song_words.intersection(freq_words))
    for name, song_words in song_dict.items()
]
freq_songs.sort(key=lambda pair: len(pair[1]))
freq_songs = reversed(freq_songs)

# Report each file name with its number of frequent words, followed by the
# set of those words.
for k, v in freq_songs:
    print("'%s', %d" % (k, len(v)))
    print(v)