Created
January 7, 2012 16:50
-
-
Save satomacoto/1575291 to your computer and use it in GitHub Desktop.
Extracts ruby annotations (furigana readings) from Aozora Bunko texts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Keep only the rows of ruby.txt whose reading is written in hiragana.

ruby.txt is read as UTF-8 and is expected to hold one record per line with
three tab-separated fields: file, kanji, yomi.  Matching records are written
back to standard output, UTF-8 encoded, in the same three-column layout.
"""
import re
import sys

# Hiragana (ぁ-ゞ) plus the middle dot "・" and the long-vowel mark "ー" only.
regex = u'^[ぁ-ゞ・ー]+$'
p = re.compile(regex)


def filter_rows(lines):
    """Yield ``(file, kanji, yomi)`` for every line whose yomi matches ``p``.

    Args:
        lines: iterable of already-decoded text lines.

    Yields:
        A 3-tuple of strings for each accepted line.

    Lines that do not split into exactly three tab-separated fields after
    stripping surrounding whitespace (blank lines, truncated records) are
    skipped instead of aborting the whole run.
    """
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) != 3:
            continue
        if p.match(fields[2]):
            yield tuple(fields)


def main():
    """Filter ./ruby.txt and write the surviving rows to stdout as UTF-8."""
    # Write bytes so the output is UTF-8 regardless of the locale.
    out = sys.stdout.buffer
    with open('ruby.txt', encoding='utf-8') as f:
        for row in filter_rows(f):
            out.write(("%s\t%s\t%s\n" % row).encode('utf-8'))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert katakana to hiragana with an explicit lookup table."""
import sys

# One "katakana hiragana" pair per line, separated by whitespace.
# ヴ is the only entry that maps to two characters (う followed by ゛).
kana = u"""ヴ う゛
ア あ
イ い
ウ う
エ え
オ お
ァ ぁ
ィ ぃ
ゥ ぅ
ェ ぇ
ォ ぉ
カ か
キ き
ク く
ケ け
コ こ
ガ が
ギ ぎ
グ ぐ
ゲ げ
ゴ ご
サ さ
シ し
ス す
セ せ
ソ そ
ザ ざ
ジ じ
ズ ず
ゼ ぜ
ゾ ぞ
タ た
チ ち
ツ つ
テ て
ト と
ダ だ
ヂ ぢ
ヅ づ
デ で
ド ど
ッ っ
ナ な
ニ に
ヌ ぬ
ネ ね
ノ の
ハ は
ヒ ひ
フ ふ
ヘ へ
ホ ほ
バ ば
ビ び
ブ ぶ
ベ べ
ボ ぼ
パ ぱ
ピ ぴ
プ ぷ
ペ ぺ
ポ ぽ
マ ま
ミ み
ム む
メ め
モ も
ャ ゃ
ヤ や
ュ ゅ
ユ ゆ
ョ ょ
ヨ よ
ラ ら
リ り
ル る
レ れ
ロ ろ
ヮ ゎ
ワ わ
ヰ ゐ
ヱ ゑ
ヲ を
ン ん"""

# Map each katakana character to its hiragana replacement.  Splitting on any
# whitespace keeps the table readable whether it is tab- or space-separated.
katahira = dict(pair.split() for pair in kana.splitlines())


def convert(req):
    """Return ``req`` with every katakana found in ``katahira`` replaced.

    Args:
        req: text string to convert.

    Returns:
        A new string; characters without a table entry (kanji, ASCII,
        the long-vowel mark, ...) are copied through unchanged.
    """
    return u"".join(katahira.get(c, c) for c in req)


mariko = u'セイウンスカイ'

if __name__ == '__main__':
    # Emit UTF-8 bytes regardless of the locale.  -> せいうんすかい
    sys.stdout.buffer.write((convert(mariko) + u"\n").encode('utf-8'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract ruby annotations from zipped Aozora Bunko text files.

Usage: pass one glob pattern as the first argument.  For every ``.txt``
member of every matching zip archive, each ``base《reading》`` annotation is
written to stdout as ``zip-name<TAB>base<TAB>reading`` (UTF-8).
"""
import sys
import os
import re
import glob
import zipfile

# A ruby base is either
#   * the run of kanji directly in front of 《, or
#   * whatever follows the explicit start marker "|" (U+FF5C, fullwidth
#     vertical line) up to 《.
# The marker is written as an escape so it cannot be confused with the ASCII
# "|" used for regex alternation: with an ASCII bar the look-behind is empty,
# always succeeds, and the "base" becomes the whole line prefix.
regex = u"(([一-龠]+?)|(?<=\uff5c)([^\uff5c]+?))《([^》]+?)》"
p = re.compile(regex)


def extract_rubies(line):
    """Return the ``(base, reading)`` pairs found in one decoded text line.

    Args:
        line: text string.

    Returns:
        A list of 2-tuples in order of appearance; empty if the line holds
        no ruby annotation.
    """
    return [(m.group(1), m.group(4)) for m in p.finditer(line)]


def iter_rubies(zip_path):
    """Yield ``(base, reading)`` for every ruby in the archive's .txt members.

    Args:
        zip_path: path of the zip archive to scan.

    A file that is not a valid zip archive yields nothing.  Members are
    decoded line by line as Shift_JIS; lines that fail to decode are skipped.
    """
    try:
        archive = zipfile.ZipFile(zip_path, 'r')
    except zipfile.BadZipfile:
        return
    with archive:  # make sure the archive handle is released
        for member in archive.namelist():
            if not member.endswith('.txt'):
                continue
            with archive.open(member) as f:
                for raw in f:
                    try:
                        line = raw.decode('shift_jis').strip()
                    except UnicodeDecodeError:
                        continue
                    for pair in extract_rubies(line):
                        yield pair


def main(argv=None):
    """Scan every archive matching the glob pattern in ``argv[1]``.

    Returns the process exit status: 0 on success, 2 when the pattern
    argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write("usage: %s 'GLOB_PATTERN'\n"
                         % os.path.basename(argv[0]))
        return 2
    # Write bytes so the output is UTF-8 regardless of the locale.
    out = sys.stdout.buffer
    for zip_path in glob.glob(argv[1]):
        name = os.path.basename(zip_path)
        for base, reading in iter_rubies(zip_path):
            out.write((u"%s\t%s\t%s\n" % (name, base, reading)).encode('utf-8'))
    return 0


if __name__ == '__main__':
    sys.exit(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print one column (index 45 by default) of a CSV file, one value per line."""
import sys
import csv


def column_values(rows, index=45):
    """Yield ``row[index]`` for every row that has such a column.

    Args:
        rows: iterable of sequences, e.g. a ``csv.reader``.
        index: zero-based column to extract; defaults to 45.

    Rows with ``index`` or fewer fields are skipped rather than ending the
    whole run.
    """
    for row in rows:
        if len(row) > index:
            yield row[index]


def main(argv=None):
    """Print column 45 of the CSV file named by ``argv[1]``.

    Returns the process exit status: 0 on success, 2 when the file argument
    is missing, 1 when the file cannot be read or parsed (the reason is
    reported on stderr instead of being silently swallowed).
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write("usage: %s FILE.csv\n" % argv[0])
        return 2
    out = sys.stdout.buffer
    try:
        # The file's encoding is not known here.  surrogateescape on both the
        # read and the write side passes undecodable bytes through untouched.
        with open(argv[1], newline='', encoding='utf-8',
                  errors='surrogateescape') as f:
            for value in column_values(csv.reader(f)):
                out.write(value.encode('utf-8', 'surrogateescape') + b'\n')
    except (IOError, csv.Error) as exc:
        sys.stderr.write("%s: %s\n" % (argv[1], exc))
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment