Skip to content

Instantly share code, notes, and snippets.

@amake
Last active May 24, 2016 07:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amake/2f5b98ad5e517269f636182f353bba30 to your computer and use it in GitHub Desktop.
Save amake/2f5b98ad5e517269f636182f353bba30 to your computer and use it in GitHub Desktop.
Japanese–English Software Corpus to TMX
#!/usr/bin/env python
# Convert the parallel corpora available at
# http://www2.nict.go.jp/univ-com/multi_trans/member/mutiyama/manual/index.html
# to TMX format
import os
import re
import tarfile
import urllib
from codecs import open
# Name of the archive published for every project; it is also used as the
# local file name the download is saved under.
ARCHIVE_FILE = 'je.tgz'

# Base URL; each project's archive lives at ARCHIVE_ROOT/<project>/ARCHIVE_FILE.
ARCHIVE_ROOT = 'http://www2.nict.go.jp/univ-com/multi_trans/member/mutiyama/manual'

# Project directories under ARCHIVE_ROOT; one TMX file is produced per entry.
PROJECTS = [
    'FreeBSD',
    'Gentoo_Linux',
    'JM',
    'JF',
    'NetBeans',
    'PEAR',
    'PHP',
    'PostgreSQL',
    'Python',
    'XFree86',
]
# A character followed by a backspace (overstrike sequence): both are dropped,
# keeping the character that comes after the backspace.
_OVERSTRIKE = re.compile(r'.\x08')
# C0 control characters that XML 1.0 does not allow in a document
# (everything below U+0020 except tab, LF and CR).
_XML_ILLEGAL = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
# An ampersand that is not already the start of "&amp;".
_BARE_AMP = re.compile(r'&(?!amp;)')


def scrub(text):
    """Return *text* cleaned up and escaped for use inside a TMX <seg>.

    Steps, in order:

    1. Line breaks (LF and CR) are removed.
    2. Overstrike sequences (any character followed by a backspace) are
       removed, then any remaining control characters that are illegal in
       XML 1.0 (including ESC and stray backspaces) are dropped.
    3. ``&`` is escaped to ``&amp;`` unless it already begins ``&amp;``.
    4. ``<`` and ``>`` are escaped to ``&lt;`` and ``&gt;``.

    The ampersand pass must run before ``<``/``>`` are escaped, otherwise
    the freshly produced ``&lt;`` would itself be rewritten to ``&amp;lt;``.
    Likewise overstrikes are resolved before escaping so that the character
    preceding a backspace is never the tail of an entity.
    """
    text = text.replace('\n', '').replace('\r', '')
    text = _OVERSTRIKE.sub('', text)
    text = _XML_ILLEGAL.sub('', text)
    text = _BARE_AMP.sub('&amp;', text)
    # Escaping '>' keeps a literal "]]>" representable without deleting it.
    return text.replace('<', '&lt;').replace('>', '&gt;')
# Opening of the generated TMX document; %s receives the creation tool name.
_TMX_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
<tmx version="1.4">
<header creationtool="%s" creationtoolversion="unknown" segtype="paragraph" o-tmf="unknown" adminlang="en" srclang="en-us" datatype="text" />
<body>
'''

# One translation unit; filled with (English segment, Japanese segment).
_TMX_UNIT = '''<tu>
<tuv xml:lang="en">
<seg>%s</seg>
</tuv>
<tuv xml:lang="ja">
<seg>%s</seg>
</tuv>
</tu>
'''

# Closing of the generated TMX document.
_TMX_FOOTER = '''
</body>
</tmx>
'''


def _iter_pairs(archive_path):
    """Yield scrubbed (en, ja) segment pairs from je/para.txt in the archive."""
    with tarfile.open(archive_path, 'r:gz') as tar:
        for raw in tar.extractfile('je/para.txt'):
            try:
                decoded = raw.decode('euc_jp')
            except UnicodeDecodeError:
                # Lines that are not valid EUC-JP are skipped.
                continue
            fields = scrub(decoded).split(' ||| ')
            if len(fields) != 3:
                # Skip malformed records instead of aborting the whole
                # conversion, in line with how undecodable lines are handled.
                continue
            # The first field is not used; the other two are the Japanese
            # and English text, in that order.
            _, ja, en = fields
            yield en, ja


def _write_tmx(tmx_path, archive_path):
    """Write every segment pair found in *archive_path* to *tmx_path*."""
    with open(tmx_path, 'w', encoding='utf-8') as out:
        out.write(_TMX_HEADER % os.path.basename(__file__))
        for pair in _iter_pairs(archive_path):
            out.write(_TMX_UNIT % pair)
        out.write(_TMX_FOOTER)


def main():
    """Download each project's archive and convert it to <project>-en-ja.tmx.

    The archive is saved as ARCHIVE_FILE in the current directory and is
    removed again after each project, whether or not the conversion
    succeeded.
    """
    # urlretrieve moved to urllib.request in Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    for project in PROJECTS:
        url = '/'.join([ARCHIVE_ROOT, project, ARCHIVE_FILE])
        tmx = project + '-en-ja.tmx'
        try:
            urlretrieve(url, ARCHIVE_FILE)
            _write_tmx(tmx, ARCHIVE_FILE)
        finally:
            # Never leave the downloaded archive behind, even on failure.
            if os.path.exists(ARCHIVE_FILE):
                os.remove(ARCHIVE_FILE)
        print('Wrote ' + tmx)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment