Skip to content

Instantly share code, notes, and snippets.

@reusee
Created November 22, 2011 03:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save reusee/1384816 to your computer and use it in GitHub Desktop.
Save reusee/1384816 to your computer and use it in GitHub Desktop.
Collins COBUILD (6th edition) XML file formatter
# coding: utf8
import re
class Formatter(dict):
    """Parse one dictionary XML file into a ``{headword: text}`` mapping.

    The file is read line by line.  Every line that starts with ``<`` is
    treated as one tag: the four characters after ``<`` name the tag, and
    the tag is rendered by the method ``format_<TAG>``.  The rendered pieces
    between a ``<headword`` line and the matching ``</headword`` line are
    concatenated and stored in this dict under the headword that was set by
    the ``HWME``/``HWSE``/``HWAE`` tag of that entry.

    Instance attributes used while parsing:
        current_word -- headword of the entry being built (``None`` until a
                        headword tag has been seen in the entry)
        current_str  -- text accumulated so far for the entry
        num          -- counter behind the numbers emitted by ``format_DNUM``
    """

    # Tags that are dropped without producing any output.
    _SKIPPED_TAGS = frozenset([
        'FREQ',
        'BXTH',
        'BLOC',
        'BTSY',
        'XRCA',
        'BXWL',
        'BWWL',
        'BWEG',
        'BXWP',
        'BWCP',
        'BOXU',
        'BXUT',
        'PXWW',
        'PXPD',
        'CAT0',
        'BOXA',
        'BOXB',
        'XRC0',
    ])

    # Markup fragments removed by nojunk(), in the order they are applied.
    _JUNK = (
        '&lt;b&gt;',
        '&lt;/b&gt;',
        '&lt;x&gt;',
        '&lt;/x&gt;',
        '&lt;i&gt;',
        '&lt;/i&gt;',
    )

    def __init__(self, input_file):
        """Read *input_file* and fill the mapping.

        input_file -- path of the XML file to parse.

        Raises IOError/OSError if the file cannot be opened, and
        AttributeError if a tag is neither skipped nor has a matching
        ``format_<TAG>`` method.
        """
        dict.__init__(self)
        # Initialise the parse state up front so the attributes always exist,
        # even for a file that contains no <headword line at all.
        self.current_word = None
        self.current_str = ''
        self.num = 0
        # The with-block guarantees the file is closed even when a formatter
        # raises; iterating the file object replaces the obsolete xreadlines().
        with open(input_file, 'r') as xml_source:
            for line in xml_source:
                line = line.strip()
                if line.startswith(('<?xml', '<dictionary', '</dictionary')):
                    continue
                elif line.startswith('<headword'):  # start of an entry
                    self.current_word = None
                    self.current_str = ''
                    self.num = 0
                elif line.startswith('</headword'):  # end of an entry
                    # TODO: post-format the finished entry (not implemented).
                    self.current_str += '\n'
                    self[self.current_word] = self.current_str
                elif line.startswith('<'):
                    cmd = line[1:5]
                    if cmd in self._SKIPPED_TAGS:
                        continue
                    if 'value=' in line:
                        # Fixed-offset slice kept from the original code: it
                        # drops the first 14 and the last 4 characters.
                        # NOTE(review): this relies on the exact tag layout
                        # of the input files -- confirm against real data.
                        value = line[14:-4]
                    else:
                        value = ''
                    formatter = getattr(self, 'format_' + cmd)
                    self.current_str += formatter(value)

    @staticmethod
    def nojunk(s):
        """Return *s* without escaped b/x/i tags and with ``&amp;`` as ``&``."""
        for junk in Formatter._JUNK:
            s = s.replace(junk, '')
        # Unescape ampersands last, after the escaped tags have been removed.
        return s.replace('&amp;', '&')

    @staticmethod
    def format_word(w):
        """Return *w* with every ``+`` and ``^`` character removed."""
        return w.replace('+', '').replace('^', '')

    # Headword: the cleaned value also becomes the key of the current entry.
    def format_HWME(self, v):
        v = self.format_word(self.nojunk(v))
        self.current_word = v
        return v

    format_HWSE = format_HWME
    format_HWAE = format_HWME

    # also ...
    def format_HWAS(self, v):
        return '\nalso %s' % v

    # Pronunciation
    def format_PRON(self, v):
        return '\n\n' + self.nojunk(v) + ' '

    # Variant forms
    def format_HDIF(self, v):
        return '(' + v + ') '

    format_RNON = format_HDIF
    format_HDIA = format_HDIF

    # Sense number: counts up within the current entry.
    def format_DNUM(self, v):
        self.num += 1
        return '\n\n' + str(self.num) + ' '

    # Part of speech
    def format_POSP(self, v):
        return '[%s] ' % v

    # Definition
    def format_DEFN(self, v):
        return '\n' + self.nojunk(v)

    format_DEFX = format_DEFN
    format_POSC = format_DEFN

    # Example sentence
    def format_EGPH(self, v):
        return '\n=> ' + self.nojunk(v)

    # or ...
    def format_HWAF(self, v):
        return '\nor ' + v

    # Usage note, in addition to ...
    def format_BOXX(self, v):
        return '\n-> ' + self.nojunk(v)

    # see ...
    def format_XRSE(self, v):
        return '\n-> see ' + v

    # BRIN or AM
    def format_LBRN(self, v):
        return ' (' + v + ')'

    # Usage note
    def format_BOXR(self, v):
        return '\n-> ' + self.nojunk(v)

    # Grammar pattern that follows the part of speech
    def format_GRAM(self, v):
        return ' [' + self.nojunk(v) + ']'

    format_HDGR = format_GRAM

    # Sentiment
    def format_BOXE(self, v):
        return ' (' + v + ')'

    # see also ..
    def format_XRSA(self, v):
        return '\n-> see also ' + v

    format_XRPX = format_XRSA

    # Antonym
    def format_BTAN(self, v):
        return '\n<> ' + v

    # Nature / register label
    def format_LBRR(self, v):
        return ' (' + v + ')'

    format_LBSF = format_LBRR

    # Phrase
    def format_PHVA(self, v):
        return '\n\n** ' + v + ' '

    format_PHVB = format_PHVA

    def format_XRLS(self, v):
        return '\n** ' + self.nojunk(v) + ' '

    # Hint
    def format_BOXP(self, v):
        return '\n-> ' + v

    format_BOXL = format_BOXP
    format_XRXR = format_BOXP

    # Disambiguation
    def format_MNXP(self, v):
        return '\n-> ' + v

    # trademark
    def format_LBTM(self, v):
        return ' (' + v + ')'
# Convert xmls/a.xml ... xmls/z.xml into the single file collins.src.
# Each entry becomes one line "<word>\t<text>\n", with the newlines inside
# the text written as the two characters backslash + n.
# The with-block closes the output file even if parsing one input fails.
with open('collins.src', 'w') as outfile:
    for c in range(ord('a'), ord('z') + 1):
        filename = 'xmls/%s.xml' % chr(c)
        # Parenthesised form prints the same on Python 2 and is valid on 3.
        print(filename)
        f = Formatter(filename)
        for word in f:
            d = f[word].replace('\n', r'\n')
            outfile.write('%s\t%s\n' % (word, d))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment