Created
November 22, 2011 03:33
-
-
Save reusee/1384816 to your computer and use it in GitHub Desktop.
Collins COBUILD (6th edition) XML file formatter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf8 | |
import re | |
class Formatter(dict):
    """Map each headword of a dictionary XML file to its plain-text entry.

    The input file is processed one line at a time.  A ``<headword`` line
    starts a new entry, a ``</headword`` line stores the accumulated text
    under the current headword, and every other line starting with ``<`` is
    dispatched on the four characters following ``<`` to the matching
    ``format_XXXX`` method, whose return value is appended to the entry.

    After construction the instance is an ordinary ``dict`` of
    ``{headword: formatted entry text}``.

    Attributes set while parsing:
        current_word: headword of the entry being built (``None`` until a
            headword tag has been seen inside the entry).
        current_str: text accumulated so far for the entry being built.
        num: running sense number inside the current entry.
    """

    # Tags whose lines are dropped without producing any output.
    _SKIPPED_TAGS = frozenset([
        'FREQ',
        'BXTH',
        'BLOC',
        'BTSY',
        'XRCA',
        'BXWL',
        'BWWL',
        'BWEG',
        'BXWP',
        'BWCP',
        'BOXU',
        'BXUT',
        'PXWW',
        'PXPD',
        'CAT0',
        'BOXA',
        'BOXB',
        'XRC0',
    ])

    def __init__(self, input_file):
        """Parse *input_file* and fill the dict.

        Args:
            input_file: path of the XML file to read.

        Raises:
            IOError / OSError: if the file cannot be opened.
            AttributeError: if a tag is met that is neither skipped nor
                handled by a ``format_XXXX`` method.
        """
        dict.__init__(self)
        # Initialise the per-entry state up front so that a tag line seen
        # before the first <headword> does not fail on a missing attribute.
        self.current_word = None
        self.current_str = ''
        self.num = 0
        # The context manager closes the file even if a line fails to parse.
        with open(input_file, 'r') as xml_source:
            for line in xml_source:
                self._handle_line(line.strip())

    def _handle_line(self, line):
        """Update the parser state from one stripped input line."""
        if line.startswith(('<?xml', '<dictionary', '</dictionary')):
            return
        if line.startswith('<headword'):  # start of an entry
            self.current_word = None
            self.current_str = ''
            self.num = 0
            return
        if line.startswith('</headword'):  # end of an entry
            # TODO: further formatting of the finished entry.
            self.current_str += '\n'
            self[self.current_word] = self.current_str
            return
        if not line.startswith('<'):
            return

        cmd = line[1:5]
        if cmd in self._SKIPPED_TAGS:
            return
        if 'value=' in line:
            # NOTE(review): fixed offsets, kept from the original; they
            # presumably match lines shaped like `<TAGN0 value="..." />`
            # (14 leading and 4 trailing characters) -- confirm against the
            # real data files.
            value = line[14:-4]
        else:
            value = ''
        formatter = getattr(self, 'format_' + cmd)
        self.current_str += formatter(value)

    @staticmethod
    def nojunk(s):
        """Return *s* without inline ``<b>``, ``<x>``, ``<i>`` markup and
        with ``&amp;`` turned back into ``&``."""
        for tag in ('<b>', '</b>', '<x>', '</x>', '<i>', '</i>'):
            s = s.replace(tag, '')
        # The scraped source showed replace('&', '&'), a no-op; the entity
        # form is restored here so ampersands are actually unescaped.
        s = s.replace('&amp;', '&')
        return s

    @staticmethod
    def format_word(w):
        """Return *w* with the ``+`` and ``^`` marker characters removed."""
        return w.replace('+', '').replace('^', '')

    # Headword
    def format_HWME(self, v):
        """Record the cleaned headword as the current word and return it."""
        v = self.format_word(self.nojunk(v))
        self.current_word = v
        return v

    format_HWSE = format_HWME
    format_HWAE = format_HWME

    # also ...
    def format_HWAS(self, v):
        return '\nalso %s' % v

    # Pronunciation
    def format_PRON(self, v):
        return '\n\n' + self.nojunk(v) + ' '

    # Variant forms
    def format_HDIF(self, v):
        return '(' + v + ') '

    format_RNON = format_HDIF
    format_HDIA = format_HDIF

    # Sense number
    def format_DNUM(self, v):
        """Advance the sense counter and return it as a new paragraph."""
        self.num += 1
        return '\n\n' + str(self.num) + ' '

    # Part of speech
    def format_POSP(self, v):
        return '[%s] ' % v

    # Definition
    def format_DEFN(self, v):
        return '\n' + self.nojunk(v)

    format_DEFX = format_DEFN
    format_POSC = format_DEFN

    # Example sentence
    def format_EGPH(self, v):
        return '\n=> ' + self.nojunk(v)

    # or ...
    def format_HWAF(self, v):
        return '\nor ' + v

    # Usage note ("in addition to ...")
    def format_BOXX(self, v):
        return '\n-> ' + self.nojunk(v)

    # Usage note
    format_BOXR = format_BOXX

    # see ...
    def format_XRSE(self, v):
        return '\n-> see ' + v

    # BRIT or AM
    def format_LBRN(self, v):
        return ' (' + v + ')'

    # Attitude / sentiment
    format_BOXE = format_LBRN
    # Nature of the word
    format_LBRR = format_LBRN
    format_LBSF = format_LBRN
    # trademark
    format_LBTM = format_LBRN

    # Grammar pattern that follows the part of speech
    def format_GRAM(self, v):
        return ' [' + self.nojunk(v) + ']'

    format_HDGR = format_GRAM

    # see also ...
    def format_XRSA(self, v):
        return '\n-> see also ' + v

    format_XRPX = format_XRSA

    # Antonym
    def format_BTAN(self, v):
        return '\n<> ' + v

    # Phrase
    def format_PHVA(self, v):
        return '\n\n** ' + v + ' '

    format_PHVB = format_PHVA

    def format_XRLS(self, v):
        return '\n** ' + self.nojunk(v) + ' '

    # Hint
    def format_BOXP(self, v):
        return '\n-> ' + v

    format_BOXL = format_BOXP
    format_XRXR = format_BOXP
    # Disambiguation
    format_MNXP = format_BOXP
# Convert xmls/a.xml ... xmls/z.xml into a single file, collins.src, with one
# "<headword><TAB><entry>" record per line.  Real newlines inside an entry are
# written as the two characters backslash + n so each record stays on one line.
# The `with` block closes the output file even if one of the inputs fails.
with open('collins.src', 'w') as outfile:
    for c in range(ord('a'), ord('z') + 1):
        filename = 'xmls/%s.xml' % chr(c)
        # Progress indicator; the call form prints the same text on
        # Python 2 and Python 3.
        print(filename)
        f = Formatter(filename)
        for word in f:
            d = f[word].replace('\n', r'\n')
            outfile.write('%s\t%s\n' % (word, d))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment