Skip to content

Instantly share code, notes, and snippets.

@jaseg
Created May 17, 2017 20:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jaseg/cd8a79a0fe061cee6a41e83abd7bd485 to your computer and use it in GitHub Desktop.
Save jaseg/cd8a79a0fe061cee6a41e83abd7bd485 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
# Tone-marked variants of each pinyin vowel. The string for a vowel is indexed
# by tone number minus one, i.e. index 0 holds tone 1 and index 3 holds tone 4.
pinyinToneMarks = {
    'a': 'āáǎà',
    'e': 'ēéěè',
    'i': 'īíǐì',
    'o': 'ōóǒò',
    'u': 'ūúǔù',
    'ü': 'ǖǘǚǜ',
    'A': 'ĀÁǍÀ',
    'E': 'ĒÉĚÈ',
    'I': 'ĪÍǏÌ',
    'O': 'ŌÓǑÒ',
    'U': 'ŪÚǓÙ',
    'Ü': 'ǕǗǙǛ',
}
def convertPinyinCallback(m):
    """Return the tone-marked spelling for one numbered pinyin syllable match.

    ``m`` is a regex match whose group 1 is the vowel cluster, group 2 the
    trailing consonants and group 3 the tone digit. ``v``/``V`` in the vowel
    cluster is rewritten to ``ü``/``Ü``. The digit is reduced modulo 5, so
    both 0 and 5 leave the vowels without a tone mark.
    """
    vowels = m.group(1).replace(u'v', u'ü').replace(u'V', u'Ü')
    tail = m.group(2)
    tone = int(m.group(3)) % 5
    if tone == 0:
        # Neutral tone: only the digit is dropped.
        return vowels + tail
    # For multiple vowels, the mark goes on the first one if it is a/e/o,
    # otherwise on the second one.
    if len(vowels) <= 1 or vowels[0] in 'aeoAEO':
        idx = 0
    else:
        idx = 1
    marked = pinyinToneMarks[vowels[idx]][tone - 1]
    return vowels[:idx] + marked + vowels[idx + 1:] + tail
def convertPinyin(s):
    """Replace numbered pinyin syllables in ``s`` with tone-marked pinyin.

    Every run of one to three vowels, optionally followed by ``n``/``g``/``r``
    and then a tone digit 0-5, is rewritten by ``convertPinyinCallback``.
    Matching is case-insensitive. Text that does not match is returned
    unchanged.
    """
    # CC-CEDICT spells ü as "u:" (e.g. "nu:3", "lu:e4"). The colon sits
    # between the vowel and the tone digit, so such syllables would never
    # match the pattern below; normalise them to ü first.
    s = s.replace(u'u:', u'ü').replace(u'U:', u'Ü')
    return re.sub(u'([aeiouüvÜ]{1,3})(n?g?r?)([012345])', convertPinyinCallback, s, flags=re.IGNORECASE)
# Read the vocabulary list and print one tab-separated line per entry:
# hanzi, tone-marked pinyin, gloss.
# The file is UTF-8 (per its name), so decode it explicitly rather than
# relying on the platform's default encoding.
with open('pcr1-cedict-vocabulary.utf8.txt', encoding='utf-8') as f:
    lines = [raw.strip() for raw in f]
for line in lines:
    # Skip blank lines and '#' comment lines.
    if not line or line.startswith('#'):
        continue
    # Expected entry shape: "<hanzi> [<pinyin>] /<gloss>/".
    match = re.match(r'^(.*) \[(.*)] /(.*)/$', line)
    if not match:
        # Report the malformed line and move on; without this `continue`
        # the unpacking below would fail on `None`.
        print('ERR:', line)
        continue
    hanzi, pinyin, gloss = match.groups()
    print(hanzi, convertPinyin(pinyin), gloss, sep='\t')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment