Skip to content

Instantly share code, notes, and snippets.

@youzaka
Created April 13, 2011 15:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save youzaka/917818 to your computer and use it in GitHub Desktop.
Save youzaka/917818 to your computer and use it in GitHub Desktop.
mecab-ipadicから捨て仮名だけで構成されている語彙を抽出
#!/usr/bin/env python2.6
# -*- coding: utf-8 -*-
import codecs
import csv
import os
import re
import sys
# Full-size katakana that have a small ("sutegana") form, paired position by
# position with those small forms.
# "ㇷ゚" counts as two characters, so it is left out for now.
motoneta = list(u'アイウエオカクケシスツトヌハヒフヘホムヤユヨラリルレロワ')
sutegana = list(u'ァィゥェォヵㇰヶㇱㇲッㇳㇴㇵㇶㇷㇸㇹㇺャュョㇻㇼㇽㇾㇿヮ')
table = dict(zip(motoneta, sutegana))
# Matches a string made up solely of characters listed in ``motoneta``.
regex = re.compile(u'^[%s]+$' % u''.join(motoneta))


def to_sutegana(x):
    """Return *x* with each character found in ``table`` replaced by its small form.

    Characters that are not keys of ``table`` are copied unchanged.
    """
    return u''.join([table.get(char, char) for char in x])


def _read_rows(path):
    """Yield every CSV row of the EUC-JP encoded file at *path* as text fields.

    The file is always closed once the rows are exhausted.
    """
    if sys.version_info[0] < 3:
        # Python 2's csv module works on byte strings, so parse the raw bytes
        # and decode each field afterwards instead of relying on the
        # interpreter's default encoding.
        with open(path, 'rb') as handle:
            for row in csv.reader(handle):
                yield [field.decode('euc_jp') for field in row]
    else:
        # newline='' lets the csv module do its own line-ending handling.
        with open(path, 'r', encoding='euc_jp', newline='') as handle:
            for row in csv.reader(handle):
                yield row


def collect(directory):
    """Scan all ``.csv`` files directly inside *directory*.

    Returns a ``(total, count)`` pair:

    * ``total`` -- set of the first column of every usable row;
    * ``count`` -- dict mapping the first column to column 11 for rows whose
      column 11 matches ``regex`` and whose column 9 is ``基本形`` or ``*``.

    Rows with fewer than 12 columns (for example blank lines) are skipped.
    """
    total = set()
    count = dict()
    # Paths are joined explicitly rather than changing the working directory,
    # so a relative *directory* is resolved correctly.  Sorting makes the
    # result independent of the order the OS lists files in.
    for name in sorted(os.listdir(directory)):
        if not name.endswith('.csv'):
            continue
        for item in _read_rows(os.path.join(directory, name)):
            if len(item) < 12:
                continue
            total.add(item[0])
            if regex.match(item[11]) and item[9] in (u'基本形', u'*'):
                count[item[0]] = item[11]
    return total, count


def main(argv):
    """Print the matching entries found in the directory named by ``argv[1]``.

    One line is printed per entry (first column, column 11, and column 11
    converted by ``to_sutegana``), ordered by the length of column 11, followed
    by a summary line with the match ratio.  Returns the process exit status:
    0 on success, 2 when the directory argument is missing.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s DICTIONARY_DIR\n" % argv[0])
        return 2
    total, count = collect(argv[1])
    for k, v in sorted(count.items(), key=lambda x: len(x[1])):
        print(u"%s %s %s" % (k, v, to_sutegana(v)))
    # Avoid dividing by zero when no entries were read at all.
    if total:
        percentage = len(count) * 100.0 / len(total)
    else:
        percentage = 0.0
    print("%d / %d = %f %%" % (len(count), len(total), percentage))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment