Created
June 18, 2011 14:37
-
-
Save k4200/1033145 to your computer and use it in GitHub Desktop.
Script to create a CSV file for en-ja term dictionary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# This is my first Python script. Whoo hoo!
# I needed to create a CSV file out of Java language resource
# (.properties) files. Each row consists of the key, the English
# text, and the Japanese text.
import codecs
import csv
import glob
import re
import sys
def is_en_file(fn):
    """Return True unless *fn* is a Japanese (``*_ja.properties``) file.

    The suffix is compared literally. A regex search for
    ``_ja.properties`` would treat the ``.`` as a wildcard and match
    anywhere in the name, so a file such as
    ``app_jaxproperties.properties`` would be misclassified as Japanese.

    Args:
        fn: file name (or path) of a ``.properties`` resource file.

    Returns:
        bool: False for names ending in ``_ja.properties``, True otherwise.
    """
    return not fn.endswith('_ja.properties')
# Pattern for a "key=value" line: the key is everything before the first
# "=" (non-greedy group), the value is the rest of the line.
reline = re.compile(r'(.*?)=(.*)')

# Japanese resources are the files carrying the "_ja" suffix; every other
# .properties file in the current directory is treated as English.
ja_files = glob.glob('*_ja.properties')
en_files = [name for name in glob.glob('*.properties') if is_en_file(name)]
# See http://stackoverflow.com/questions/267436/how-do-i-treat-an-ascii-string-as-unicode-and-unescape-the-escaped-characters-in
def encode_val_ja(str):
    """Unescape ``\\uXXXX`` sequences in *str* and return CP932 bytes.

    The value is decoded with the ``unicode-escape`` codec (which also
    interprets other backslash escapes such as ``\\n``) and the resulting
    text is re-encoded as CP932.

    ``codecs.decode`` is used instead of the ``.decode`` method so the
    function accepts Python 2 byte strings as well as Python 3 ``str``
    and ``bytes`` values.

    Args:
        str: raw property value, possibly containing ``\\uXXXX`` escapes.
            (The name shadows the builtin; it is kept so existing callers
            are unaffected.)

    Returns:
        The unescaped value encoded as CP932 bytes.

    Raises:
        UnicodeEncodeError: if a character has no CP932 representation.
    """
    text = codecs.decode(str, 'unicode-escape')
    # NOTE: encode with 'utf-8' here instead if UTF-8 output is wanted.
    return text.encode('cp932')
# Accumulated entries: property key -> {language code: encoded value}.
dic = {}


def add_to_dic(files, lang):
    """Merge the key/value pairs found in *files* into ``dic`` under *lang*.

    Every line that matches ``reline`` contributes one entry: the value is
    passed through ``encode_val_ja`` and stored as ``dic[key][lang]``. If a
    key occurs more than once for the same language, the last occurrence
    wins.

    Args:
        files: iterable of paths to ``.properties`` files.
        lang: language code to store the values under (e.g. ``'ja'``).
    """
    for fn in files:
        # ``with`` guarantees the handle is closed, even if a value fails
        # to encode part-way through the file.
        with open(fn, 'r') as f:
            for line in f:
                # Drop the line ending so CRLF files do not leave a stray
                # "\r" at the end of the value.
                line = line.rstrip('\r\n')
                # '#' and '!' introduce comment lines in .properties
                # files; a comment containing "=" is not an entry.
                if line.lstrip().startswith(('#', '!')):
                    continue
                m = reline.search(line)
                if m is None:
                    continue
                valstr = encode_val_ja(m.group(2))
                dic.setdefault(m.group(1), {})[lang] = valstr
# Load both languages into ``dic``, then write one CSV row per key.
add_to_dic(ja_files, 'ja')
add_to_dic(en_files, 'en')

writer = csv.writer(sys.stdout, lineterminator="\n")
# Iterating the sorted keys gives a reproducible row order and avoids the
# Python 2-only ``iteritems``.
for k in sorted(dic):
    v = dic[k]
    # Columns: key, English, Japanese. A missing translation is None,
    # which the csv module writes as an empty field.
    writer.writerow([k, v.get('en'), v.get('ja')])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment