Skip to content

Instantly share code, notes, and snippets.

@jannson
Last active August 29, 2015 13:56
Show Gist options
  • Save jannson/9243722 to your computer and use it in GitHub Desktop.
Save jannson/9243722 to your computer and use it in GitHub Desktop.
Translate msr_training.utf8 into CRF++ training data
#!/usr/bin/python
# -*- coding: utf-8 -*-
import codecs
import re
# Destination for the generated CRF++ rows, encoded as UTF-8.
fout = codecs.open('fout.txt', 'w', 'utf-8')

# Token classifiers. Plain u"" literals (not ur"") are used so the \u escapes
# are resolved by the string literal itself on both Python 2 and Python 3.
# Callers use .match(), so only the start of a token is tested.
accepted_chars = re.compile(u"[\u4E00-\u9FA5]+", re.UNICODE)  # CJK ideographs
en_chars = re.compile(u"[a-zA-Z0-9]+", re.UNICODE)            # ASCII letters/digits

# Position tags for the non-final characters of a multi-character word:
# first 'B', second 'B1', third 'B2', every later one 'M'.
B = ['B', 'B1', 'B2'] + ['M'] * 10

# Characters that are accumulated into a single numeric token.
NS = u'1234567890.%%%'

# Punctuation inventories; their union Q is the set of characters emitted as
# stand-alone FUNCS tokens. Backslashes are written as '\\' so the literals do
# not rely on invalid escape sequences.
# NOTE(review): the repeated '%' in NS and the ASCII punctuation duplicated
# across Q1/Q3 suggest full-width characters were width-normalized when this
# file was copied -- confirm against the corpus before relying on these sets.
Q1 = u'~·!@#¥……&×()-——=+{}【】\\|;:“‘~·《》、?'
Q2 = u'~`!@#$^&*()_+-={}[]|\\:;"\'<,>.?/'
Q3 = u'~!@#¥……&*()——-+={}【】|、:;”’《,》。?、'
Q = set(Q1 + Q2 + Q3)
print(Q)
def get_ws(w, delimiters=None):
    """Split a token into text runs and individual delimiter characters.

    Args:
        w: The token (a string) to scan character by character.
        delimiters: Container of characters treated as delimiters. When
            omitted, the module-level punctuation set ``Q`` is used.

    Yields:
        ``(text, True)`` for each maximal run of non-delimiter characters and
        ``(char, False)`` for each delimiter character, in input order.
    """
    if delimiters is None:
        delimiters = Q
    run = ''
    for ch in w:
        if ch not in delimiters:
            run += ch
            continue
        # A delimiter ends the pending run (if any) and is emitted by itself.
        if run != '':
            yield run, True
            run = ''
        yield ch, False
    # Emit whatever text follows the last delimiter.
    if run != '':
        yield run, True
# Convert the segmented corpus into CRF++ rows of the form
# "<token> <type> <tag>". Every input line is split on whitespace into words,
# and an empty row is written after each non-empty input line.
with codecs.open('icwb2-data/training/msr_training.utf8', 'r', 'utf-8') as f:
    for line in f:
        words = line.split()
        if not words:
            # Blank input line: nothing is written, not even a separator row.
            continue
        if words[0] == u'“':
            # Drop an opening quotation mark that starts the line.
            words = words[1:]
        for w in words:
            if accepted_chars.match(w):
                # Token starts with a Chinese character: tag each character.
                if len(w) == 1:
                    fout.write(w + ' CN S\n')
                elif len(w) <= 6:
                    # Non-final characters take B, B1, B2, M, ... in order;
                    # the final character is always tagged E.
                    for ch, tag in zip(w[:-1], B):
                        fout.write(ch + ' CN ' + tag + '\n')
                    fout.write(w[-1] + ' CN E\n')
                # Tokens longer than six characters produce no rows at all.
            elif en_chars.match(w):
                # Token starts with an ASCII letter or digit: one row.
                fout.write(w + ' ASCII S\n')
            elif len(w) == 1 and w in Q:
                # Single punctuation mark; also echoed to stdout.
                print(w)
                fout.write(w + ' FUNCS S\n')
            else:
                # Anything else is split around punctuation by get_ws().
                for piece, is_text in get_ws(w):
                    if not is_text:
                        fout.write(piece + ' FUNCS S\n')
                        continue
                    digits = ''
                    for ch in piece:
                        if ch in NS:
                            # Collect consecutive numeric characters.
                            digits += ch
                        elif digits != '':
                            # A number directly followed by another character:
                            # the number opens the unit (B), the character
                            # closes it (E).
                            fout.write(digits + ' ASCII B\n')
                            digits = ''
                            fout.write(ch + ' CN E\n')
                        else:
                            fout.write(ch + ' CN S\n')
                    if digits != '':
                        # Number left over at the end of the piece.
                        fout.write(digits + ' ASCII S\n')
        # Empty row marks the end of this input line's rows.
        fout.write('\n')

# The output file is opened at module level and was never closed; close it
# explicitly so buffered rows are flushed to disk.
fout.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment