tondol/README.md

## README.md

      
    Raw
  

              README.md
            
          
    README

tfdf.py

各単語のTerm Frequency, Document Frequencyを計算する。
MeCab, mecab-ipadic-neologdによる分かち書きを行う。
動詞は原形に変換してから集計する。
アルファベットはスペースで分割し、小文字に変換する。
記号のみの単語や、平仮名・片仮名のみからなる1文字の単語は削除する。

tfdf_kana.py

tfdf.py の原形変換部分を、元の表現のまま平仮名に変換する処理に置換してから集計したもの。

df_song.py

tfdf.py により求めたdf上位150件の単語を多く含む楽曲を集計する。

df_song_kana.py

df_song.py のdf集計時の処理を tfdf_kana.py と同じロジックにしたもの。

スクリプトの説明

実行準備

歌詞テキストの整形（テキストの文字コードがUTF-8かつスペースを含まないファイル名なら不要）
$ brew install nkf rename
$ find . -name "*.txt" -print0 | xargs -0 nkf -w --overwrite
$ rename 's/ /_/g' *
MeCabのインストール
$ brew install mecab
$ pip3.4 install mecab-python3
MeCab用辞書のインストール
$ brew install git curl xz
$ git clone --depth 1 git@github.com:neologd/mecab-ipadic-neologd.git
$ cd mecab-ipadic-neologd
$ ./bin/install-mecab-ipadic-neologd -n
実行

$ ls
aqours_heroes.txt		mattete.txt			tfdf.py
...
$ python3.4 tfdf.py tf > tf.md
$ python3.4 tfdf.py df > df.md

  
## df_song.py
#-*- encoding: utf-8 -*-

import glob
import re
import sys
import MeCab

# Tagger configured with the mecab-ipadic-neologd dictionary directory.
mecab = MeCab.Tagger("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

# word -> number of lyric files in which the word appears (document frequency)
df_dict = {}
# lyric file name -> set of distinct words extracted from that file
song_dict = {}

for filename in glob.glob('./*.txt'):
    # Close the file deterministically and decode it as UTF-8 (the README has
    # the lyric files converted to UTF-8) rather than the locale default.
    with open(filename, encoding="utf-8") as lyric_file:
        data = lyric_file.read()
    # Remove line breaks, brackets, exclamation/question marks and middle dots,
    # then collapse whitespace runs into single spaces.
    data = data.replace("\n", " ").replace("\r", " ")
    data = data.replace("(", "").replace(")", "").replace("（", "").replace("）", "")
    data = data.replace("!", "").replace("！", "").replace("?", "").replace("？", "")
    data = data.replace("・", "")
    data = re.sub(r'\s+', " ", data).strip()

    node = mecab.parseToNode(data)

    words = []

    while node:
        meta = node.feature.split(",")

        if re.search(r'^[A-Za-z0-9 ]+$', node.surface) is not None:
            # ASCII letters/digits/spaces only: split on spaces and lower-case.
            words += [s.lower() for s in node.surface.split(" ")]
        elif re.search(r'[!-/:-@\[-`{-~]', node.surface) is not None:
            # Skip symbol tokens.
            # NOTE(review): the pattern is unanchored, so any token that
            # contains at least one ASCII punctuation character is skipped,
            # not only tokens made up solely of symbols.
            pass
        else:
            # Prefer the base form (feature field index 6) when available.
            if len(meta) >= 7 and meta[6] != "*":
                word = meta[6]
            else:
                word = node.surface

            # Keep words of two or more characters; keep a single character
            # only when it is not hiragana/katakana (e.g. a lone kanji).
            if len(word) >= 2:
                words.append(word)
            elif len(word) >= 1 and re.search(r'^[ぁ-んァ-ン]+$', word) is None:
                words.append(word)

        node = node.next

    unique_words = set(words)

    # Each file contributes at most 1 to a word's document frequency.
    for word in unique_words:
        df_dict[word] = df_dict.get(word, 0) + 1

    song_dict[filename] = unique_words

# df: take the 150 words with the highest document frequency.  The ascending
# sort followed by a reversal is kept on purpose: it decides which of several
# equally frequent words make the cut.
items = sorted(df_dict.items(), key=lambda p: p[1])
items.reverse()
items = items[:150]

freq_words = {p[0] for p in items}

# For every song, the frequent words it contains; songs with the most
# frequent words are printed first.
freq_songs = [(name, song_words.intersection(freq_words))
              for name, song_words in song_dict.items()]
freq_songs.sort(key=lambda pair: len(pair[1]))
freq_songs.reverse()

for k, v in freq_songs:
    print("'%s', %d" % (k, len(v)))
    print(v)

## df_song_kana.py
#-*- encoding: utf-8 -*-

import glob
import re
import sys
import MeCab

# Accumulators filled by the per-file loop further down:
#   df_dict   word -> number of files whose word set contains it
#   song_dict file name -> set of words extracted from that file
df_dict = dict()
song_dict = dict()

# The option string is handed to MeCab as-is; -d points at the
# mecab-ipadic-neologd dictionary directory.
mecab = MeCab.Tagger("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

# http://d.hatena.ne.jp/mohayonao/20101213/1292237816
def make_function_hiragana():
    """Build and return a katakana-to-hiragana converter.

    The returned function takes a string and returns a copy in which every
    katakana character in the range ァ..ヶ (U+30A1..U+30F6) is replaced by
    the hiragana character 0x60 code points below it.  All other characters
    (including the long-vowel mark ー) are left unchanged.
    """
    # The range ends at ヶ rather than ン so that ヴ, ヵ and ヶ are converted
    # too (to ゔ, ゕ and ゖ); the 0x60 offset holds for the whole range.
    re_katakana = re.compile(r'[ァ-ヶ]')
    def hiragana(s):
        return re_katakana.sub(lambda x: chr(ord(x.group(0)) - 0x60), s)
    return hiragana
hiragana = make_function_hiragana()
def make_function_katakana():
    """Build and return a hiragana-to-katakana converter.

    The returned function takes a string and returns a copy in which every
    hiragana character in the range ぁ..ゖ (U+3041..U+3096) is replaced by
    the katakana character 0x60 code points above it.  All other characters
    are left unchanged.
    """
    # The range ends at ゖ rather than ん so that ゔ, ゕ and ゖ are converted
    # too (to ヴ, ヵ and ヶ); the 0x60 offset holds for the whole range.
    re_hiragana = re.compile(r'[ぁ-ゖ]')
    def katakana(s):
        return re_hiragana.sub(lambda x: chr(ord(x.group(0)) + 0x60), s)
    return katakana
katakana = make_function_katakana()

for filename in glob.glob('./*.txt'):
    # Close the file deterministically and decode it as UTF-8 (the README has
    # the lyric files converted to UTF-8) rather than the locale default.
    with open(filename, encoding="utf-8") as lyric_file:
        data = lyric_file.read()
    # Remove line breaks, brackets, exclamation/question marks and middle dots,
    # then collapse whitespace runs into single spaces.
    data = data.replace("\n", " ").replace("\r", " ")
    data = data.replace("(", "").replace(")", "").replace("（", "").replace("）", "")
    data = data.replace("!", "").replace("！", "").replace("?", "").replace("？", "")
    data = data.replace("・", "")
    data = re.sub(r'\s+', " ", data).strip()

    node = mecab.parseToNode(data)

    words = []

    while node:
        meta = node.feature.split(",")

        if re.search(r'^[A-Za-z0-9 ]+$', node.surface) is not None:
            # ASCII letters/digits/spaces only: split on spaces and lower-case.
            words += [s.lower() for s in node.surface.split(" ")]
        elif re.search(r'[!-/:-@\[-`{-~]', node.surface) is not None:
            # Skip symbol tokens.
            # NOTE(review): the pattern is unanchored, so any token that
            # contains at least one ASCII punctuation character is skipped,
            # not only tokens made up solely of symbols.
            pass
        else:
            # Prefer the reading (feature field index 7) when available and
            # convert it to hiragana.  The fallback is converted as well so
            # that katakana surfaces are counted under the same key, matching
            # what tfdf_kana.py does.
            if len(meta) >= 8 and meta[7] != "*":
                word = hiragana(meta[7])
            else:
                word = hiragana(node.surface)

            # Keep words of two or more characters; keep a single character
            # only when it is not hiragana/katakana (e.g. a lone kanji).
            if len(word) >= 2:
                words.append(word)
            elif len(word) >= 1 and re.search(r'^[ぁ-んァ-ン]+$', word) is None:
                words.append(word)

        node = node.next

    unique_words = set(words)

    # Each file contributes at most 1 to a word's document frequency.
    for word in unique_words:
        df_dict[word] = df_dict.get(word, 0) + 1

    song_dict[filename] = unique_words

# df: take the 150 words with the highest document frequency.  The ascending
# sort followed by a reversal is kept on purpose: it decides which of several
# equally frequent words make the cut.
items = sorted(df_dict.items(), key=lambda p: p[1])
items.reverse()
items = items[:150]

freq_words = {p[0] for p in items}

# For every song, the frequent words it contains; songs with the most
# frequent words are printed first.
freq_songs = [(name, song_words.intersection(freq_words))
              for name, song_words in song_dict.items()]
freq_songs.sort(key=lambda pair: len(pair[1]))
freq_songs.reverse()

for k, v in freq_songs:
    print("'%s', %d" % (k, len(v)))
    print(v)

## tfdf.py
#-*- encoding: utf-8 -*-

import glob
import re
import sys
import MeCab

# Tagger configured with the mecab-ipadic-neologd dictionary directory.
mecab = MeCab.Tagger("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

# word -> total number of occurrences over all lyric files (term frequency)
tf_dict = {}
# word -> number of lyric files in which the word appears (document frequency)
df_dict = {}

for filename in glob.glob('./*.txt'):
    # Close the file deterministically and decode it as UTF-8 (the README has
    # the lyric files converted to UTF-8) rather than the locale default.
    with open(filename, encoding="utf-8") as lyric_file:
        data = lyric_file.read()
    # Remove line breaks, brackets, exclamation/question marks and middle dots,
    # then collapse whitespace runs into single spaces.
    data = data.replace("\n", " ").replace("\r", " ")
    data = data.replace("(", "").replace(")", "").replace("（", "").replace("）", "")
    data = data.replace("!", "").replace("！", "").replace("?", "").replace("？", "")
    data = data.replace("・", "")
    data = re.sub(r'\s+', " ", data).strip()

    node = mecab.parseToNode(data)

    words = []

    while node:
        meta = node.feature.split(",")

        if re.search(r'^[A-Za-z0-9 ]+$', node.surface) is not None:
            # ASCII letters/digits/spaces only: split on spaces and lower-case.
            words += [s.lower() for s in node.surface.split(" ")]
        elif re.search(r'[!-/:-@\[-`{-~]', node.surface) is not None:
            # Skip symbol tokens.
            # NOTE(review): the pattern is unanchored, so any token that
            # contains at least one ASCII punctuation character is skipped,
            # not only tokens made up solely of symbols.
            pass
        else:
            # Prefer the base form (feature field index 6) when available.
            if len(meta) >= 7 and meta[6] != "*":
                word = meta[6]
            else:
                word = node.surface

            # Keep words of two or more characters; keep a single character
            # only when it is not hiragana/katakana (e.g. a lone kanji).
            if len(word) >= 2:
                words.append(word)
            elif len(word) >= 1 and re.search(r'^[ぁ-んァ-ン]+$', word) is None:
                words.append(word)

        node = node.next

    # Term frequency counts every occurrence.
    for word in words:
        tf_dict[word] = tf_dict.get(word, 0) + 1

    # Document frequency counts each word at most once per file.
    for word in set(words):
        df_dict[word] = df_dict.get(word, 0) + 1

# "tf" as the first command-line argument selects term frequency; anything
# else (or no argument) selects document frequency.
if len(sys.argv) >= 2 and sys.argv[1] == "tf":
    counts = tf_dict
else:
    counts = df_dict

# Ascending sort followed by a reversal prints the highest counts first while
# keeping the original ordering among equal counts.
items = sorted(counts.items(), key=lambda p: p[1])
items.reverse()

for k, v in items:
    print("'%s' %d" % (k, v))

## tfdf_kana.py
#-*- encoding: utf-8 -*-

import glob
import re
import sys
import MeCab

# Accumulators filled by the per-file loop further down:
#   tf_dict word -> total number of occurrences over all files
#   df_dict word -> number of files whose word set contains it
tf_dict = dict()
df_dict = dict()

# The option string is handed to MeCab as-is; -d points at the
# mecab-ipadic-neologd dictionary directory.
mecab = MeCab.Tagger("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

# http://d.hatena.ne.jp/mohayonao/20101213/1292237816
def make_function_hiragana():
    """Build and return a katakana-to-hiragana converter.

    The returned function takes a string and returns a copy in which every
    katakana character in the range ァ..ヶ (U+30A1..U+30F6) is replaced by
    the hiragana character 0x60 code points below it.  All other characters
    (including the long-vowel mark ー) are left unchanged.
    """
    # The range ends at ヶ rather than ン so that ヴ, ヵ and ヶ are converted
    # too (to ゔ, ゕ and ゖ); the 0x60 offset holds for the whole range.
    re_katakana = re.compile(r'[ァ-ヶ]')
    def hiragana(s):
        return re_katakana.sub(lambda x: chr(ord(x.group(0)) - 0x60), s)
    return hiragana
hiragana = make_function_hiragana()
def make_function_katakana():
    """Build and return a hiragana-to-katakana converter.

    The returned function takes a string and returns a copy in which every
    hiragana character in the range ぁ..ゖ (U+3041..U+3096) is replaced by
    the katakana character 0x60 code points above it.  All other characters
    are left unchanged.
    """
    # The range ends at ゖ rather than ん so that ゔ, ゕ and ゖ are converted
    # too (to ヴ, ヵ and ヶ); the 0x60 offset holds for the whole range.
    re_hiragana = re.compile(r'[ぁ-ゖ]')
    def katakana(s):
        return re_hiragana.sub(lambda x: chr(ord(x.group(0)) + 0x60), s)
    return katakana
katakana = make_function_katakana()

for filename in glob.glob('./*.txt'):
    # Close the file deterministically and decode it as UTF-8 (the README has
    # the lyric files converted to UTF-8) rather than the locale default.
    with open(filename, encoding="utf-8") as lyric_file:
        data = lyric_file.read()
    # Remove line breaks, brackets, exclamation/question marks and middle dots,
    # then collapse whitespace runs into single spaces.
    data = data.replace("\n", " ").replace("\r", " ")
    data = data.replace("(", "").replace(")", "").replace("（", "").replace("）", "")
    data = data.replace("!", "").replace("！", "").replace("?", "").replace("？", "")
    data = data.replace("・", "")
    data = re.sub(r'\s+', " ", data).strip()

    node = mecab.parseToNode(data)

    words = []

    while node:
        meta = node.feature.split(",")

        if re.search(r'^[A-Za-z0-9 ]+$', node.surface) is not None:
            # ASCII letters/digits/spaces only: split on spaces and lower-case.
            words += [s.lower() for s in node.surface.split(" ")]
        elif re.search(r'[!-/:-@\[-`{-~]', node.surface) is not None:
            # Skip symbol tokens.
            # NOTE(review): the pattern is unanchored, so any token that
            # contains at least one ASCII punctuation character is skipped,
            # not only tokens made up solely of symbols.
            pass
        else:
            # Prefer the reading (feature field index 7) when available;
            # otherwise fall back to the surface.  Either way the result is
            # converted to hiragana.
            if len(meta) >= 8 and meta[7] != "*":
                word = hiragana(meta[7])
            else:
                word = hiragana(node.surface)

            # Keep words of two or more characters; keep a single character
            # only when it is not hiragana/katakana (e.g. a lone kanji).
            if len(word) >= 2:
                words.append(word)
            elif len(word) >= 1 and re.search(r'^[ぁ-んァ-ン]+$', word) is None:
                words.append(word)

        node = node.next

    # Term frequency counts every occurrence.
    for word in words:
        tf_dict[word] = tf_dict.get(word, 0) + 1

    # Document frequency counts each word at most once per file.
    for word in set(words):
        df_dict[word] = df_dict.get(word, 0) + 1

# "tf" as the first command-line argument selects term frequency; anything
# else (or no argument) selects document frequency.
if len(sys.argv) >= 2 and sys.argv[1] == "tf":
    counts = tf_dict
else:
    counts = df_dict

# Ascending sort followed by a reversal prints the highest counts first while
# keeping the original ordering among equal counts.
items = sorted(counts.items(), key=lambda p: p[1])
items.reverse()

for k, v in items:
    print("'%s' %d" % (k, v))
	#-*- encoding: utf-8 -*-

	import glob
	import re
	import sys
	import MeCab

	# Tagger configured with the mecab-ipadic-neologd dictionary directory.
	mecab = MeCab.Tagger("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

	# word -> number of lyric files in which the word appears (document frequency)
	df_dict = {}
	# lyric file name -> set of distinct words extracted from that file
	song_dict = {}

	for filename in glob.glob('./*.txt'):
	    # Close the file deterministically and decode it as UTF-8 (the README
	    # has the lyric files converted to UTF-8) rather than the locale default.
	    with open(filename, encoding="utf-8") as lyric_file:
	        data = lyric_file.read()
	    # Remove line breaks, brackets, exclamation/question marks and middle
	    # dots, then collapse whitespace runs into single spaces.
	    data = data.replace("\n", " ").replace("\r", " ")
	    data = data.replace("(", "").replace(")", "").replace("（", "").replace("）", "")
	    data = data.replace("!", "").replace("！", "").replace("?", "").replace("？", "")
	    data = data.replace("・", "")
	    data = re.sub(r'\s+', " ", data).strip()

	    node = mecab.parseToNode(data)

	    words = []

	    while node:
	        meta = node.feature.split(",")

	        if re.search(r'^[A-Za-z0-9 ]+$', node.surface) is not None:
	            # ASCII letters/digits/spaces only: split on spaces and lower-case.
	            words += [s.lower() for s in node.surface.split(" ")]
	        elif re.search(r'[!-/:-@\[-`{-~]', node.surface) is not None:
	            # Skip symbol tokens.
	            # NOTE(review): the pattern is unanchored, so any token that
	            # contains at least one ASCII punctuation character is skipped,
	            # not only tokens made up solely of symbols.
	            pass
	        else:
	            # Prefer the base form (feature field index 6) when available.
	            if len(meta) >= 7 and meta[6] != "*":
	                word = meta[6]
	            else:
	                word = node.surface

	            # Keep words of two or more characters; keep a single character
	            # only when it is not hiragana/katakana (e.g. a lone kanji).
	            if len(word) >= 2:
	                words.append(word)
	            elif len(word) >= 1 and re.search(r'^[ぁ-んァ-ン]+$', word) is None:
	                words.append(word)

	        node = node.next

	    unique_words = set(words)

	    # Each file contributes at most 1 to a word's document frequency.
	    for word in unique_words:
	        df_dict[word] = df_dict.get(word, 0) + 1

	    song_dict[filename] = unique_words

	# df: take the 150 words with the highest document frequency.  The
	# ascending sort followed by a reversal is kept on purpose: it decides
	# which of several equally frequent words make the cut.
	items = sorted(df_dict.items(), key=lambda p: p[1])
	items.reverse()
	items = items[:150]

	freq_words = {p[0] for p in items}

	# For every song, the frequent words it contains; songs with the most
	# frequent words are printed first.
	freq_songs = [(name, song_words.intersection(freq_words))
	              for name, song_words in song_dict.items()]
	freq_songs.sort(key=lambda pair: len(pair[1]))
	freq_songs.reverse()

	for k, v in freq_songs:
	    print("'%s', %d" % (k, len(v)))
	    print(v)