Last active
May 24, 2016 07:15
-
-
Save amake/2f5b98ad5e517269f636182f353bba30 to your computer and use it in GitHub Desktop.
Japanese–English Software Corpus to TMX
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Convert the parallel corpora available at
# http://www2.nict.go.jp/univ-com/multi_trans/member/mutiyama/manual/index.html
# to TMX format.
import os | |
import re | |
import tarfile | |
import urllib | |
from codecs import open | |
# Name of the archive, both on the server and as the local download target.
ARCHIVE_FILE = 'je.tgz'

# Base URL; each project's archive is at ARCHIVE_ROOT/<project>/ARCHIVE_FILE.
ARCHIVE_ROOT = 'http://www2.nict.go.jp/univ-com/multi_trans/member/mutiyama/manual'

# Projects whose corpora are converted, one TMX file per entry.
PROJECTS = [
    'FreeBSD',
    'Gentoo_Linux',
    'JM',
    'JF',
    'NetBeans',
    'PEAR',
    'PHP',
    'PostgreSQL',
    'Python',
    'XFree86',
]
def scrub(text):
    """Return *text* cleaned up for embedding in a TMX ``<seg>`` element.

    The following transformations are applied, in this order:

    1. Line breaks (``\\n``, ``\\r``), ESC characters (U+001B) and the
       CDATA terminator ``]]>`` are removed.
    2. Every character that is immediately followed by a backspace
       (U+0008) is removed together with that backspace.
    3. ``&`` is escaped as ``&amp;`` unless it already starts ``&amp;``,
       and ``<`` is escaped as ``&lt;``, so the result is safe XML
       character data.

    Args:
        text: A unicode string (one decoded corpus line or field).

    Returns:
        The scrubbed, XML-escaped string.
    """
    # Strip characters/sequences that must not appear in the output at all.
    for junk in ('\n', '\r', u'\u001b', ']]>'):
        text = text.replace(junk, '')
    # Drop "<char><backspace>" pairs.  This runs before escaping so that a
    # backspace following '<' or '&' removes the whole character rather than
    # the tail of its entity.
    text = re.sub(u'.\u0008', '', text)
    # Escape '&' before '<': otherwise the '&' of a freshly generated '&lt;'
    # would itself be re-escaped into '&amp;lt;'.
    text = re.sub(r'&(?!amp;)', '&amp;', text)
    return text.replace('<', '&lt;')
def main():
    """Download each project's corpus archive and convert it to a TMX file.

    For every name in ``PROJECTS`` the archive
    ``ARCHIVE_ROOT/<project>/ARCHIVE_FILE`` is downloaded into the current
    directory, its ``je/para.txt`` member is read line by line, and a file
    ``<project>-en-ja.tmx`` is written with one ``<tu>`` per usable line.

    Each line is decoded as EUC-JP, scrubbed with ``scrub`` and split on
    ``' ||| '`` into three fields; the first is ignored, the second is the
    Japanese segment and the third the English segment.  Lines that cannot
    be decoded, or that do not split into exactly three fields, are skipped.

    The downloaded archive is removed after each project, even when the
    conversion fails part-way through.
    """
    # ``urlretrieve`` lives in different modules on Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    for project in PROJECTS:
        urlretrieve('/'.join([ARCHIVE_ROOT, project, ARCHIVE_FILE]),
                    ARCHIVE_FILE)
        try:
            tmx = project + '-en-ja.tmx'
            # ``open`` is codecs.open (imported at file level), hence the
            # ``encoding`` argument.
            with open(tmx, 'w', encoding='utf-8') as out:
                out.write('''<?xml version="1.0" encoding="UTF-8"?>
<tmx version="1.4">
<header creationtool="%s" creationtoolversion="unknown" segtype="paragraph" o-tmf="unknown" adminlang="en" srclang="en-us" datatype="text" />
<body>
''' % os.path.basename(__file__))
                with tarfile.open(ARCHIVE_FILE, 'r:gz') as tar:
                    for line in tar.extractfile('je/para.txt'):
                        try:
                            decoded = line.decode('euc_jp')
                        except UnicodeDecodeError:
                            # Skip lines that are not valid EUC-JP.
                            continue
                        # Echo the decoded line to stdout (original behaviour).
                        print(decoded)
                        fields = scrub(decoded).split(' ||| ')
                        if len(fields) != 3:
                            # Malformed line: skip it rather than abort the
                            # whole conversion with an unpacking error.
                            continue
                        _, ja, en = fields
                        out.write('''<tu>
<tuv xml:lang="en">
<seg>%s</seg>
</tuv>
<tuv xml:lang="ja">
<seg>%s</seg>
</tuv>
</tu>
''' % (en, ja))
                out.write('''
</body>
</tmx>
''')
        finally:
            # The same local file name is reused for every project, so always
            # clean it up before moving on (or propagating an error).
            os.remove(ARCHIVE_FILE)
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment