Skip to content

Instantly share code, notes, and snippets.

@andelf
Created November 9, 2011 04:46
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save andelf/1350419 to your computer and use it in GitHub Desktop.
Save andelf/1350419 to your computer and use it in GitHub Desktop.
中文引号处理转换
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import operator
import itertools
import re
def debug(x):
    """Print a match object's attribute names and its matched text.

    Development aid; none of the other definitions shown in this file
    call it.

    Args:
        x: An object exposing ``group()``, e.g. an ``re`` match object.
    """
    # A single pre-formatted argument prints the same line whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print('debug %s %s' % (dir(x), x.group()))
def convert1(sent):
    """Replace paired ASCII double quotes with Chinese curly quotes.

    The text is split on ``"``; the 1st, 3rd, ... quote becomes U+201C
    (left curly quote) and the 2nd, 4th, ... becomes U+201D (right curly
    quote).  Single quotes and all other punctuation are left untouched.

    Args:
        sent: Unicode text (``unicode`` on Python 2, ``str`` on Python 3).

    Returns:
        The converted text, or the literal string ``"ERROR"`` when ``sent``
        contains an odd number of ``"`` characters.

    Raises:
        TypeError: If ``sent`` is not a unicode string.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3
    # Explicit check instead of ``assert`` so it still runs under ``-O``.
    if not isinstance(sent, text_type):
        raise TypeError("not support non-unicode yet")

    segments = sent.split(u'"')
    # n quotes produce n + 1 segments, so an even segment count means the
    # quotes are unbalanced.  The sentinel (rather than an exception) is
    # kept because the demo in this file prints the result for such input.
    if len(segments) % 2 != 1:
        return "ERROR"

    quotes = itertools.cycle(u"\u201c\u201d")
    pieces = [segments[0]]
    for segment in segments[1:]:
        pieces.append(next(quotes))
        pieces.append(segment)
    # One join instead of repeated ``+`` keeps this linear in len(sent).
    return u"".join(pieces)
# ASCII punctuation -> Chinese punctuation, keyed by code point as required
# by ``translate`` on unicode strings.  The targets are written as escapes
# so the table cannot be silently flattened back to ASCII by tools that
# normalise full-width forms.
# NOTE(review): the previous revision mapped every entry except '.' to its
# own ASCII character (a no-op); the full-width targets below are the
# presumed intent -- confirm against the expected output.
_PUNCTUATION_TABLE = {
    ord(u':'): u'\uff1a',  # FULLWIDTH COLON
    ord(u','): u'\uff0c',  # FULLWIDTH COMMA
    ord(u'.'): u'\u3002',  # IDEOGRAPHIC FULL STOP
    ord(u'?'): u'\uff1f',  # FULLWIDTH QUESTION MARK
    ord(u'!'): u'\uff01',  # FULLWIDTH EXCLAMATION MARK
    ord(u';'): u'\uff1b',  # FULLWIDTH SEMICOLON
}
_QUOTE_PATTERN = re.compile(u'["\']')


def convert2(sent):
    """Convert ASCII quotes and punctuation in *sent* to Chinese forms.

    Quotes are handled first, independently for ``"`` and ``'``:

    * even count: occurrences alternate between the opening and closing
      curly quote (U+201C/U+201D for double, U+2018/U+2019 for single);
    * odd count: pairing is impossible, so every occurrence becomes the
      full-width straight quote (U+FF02 for double, U+FF07 for single).

    Every ``'`` is counted, so an apostrophe inside a word takes part in
    the pairing like any other single quote.

    Afterwards ``: , . ? ! ;`` are replaced with their Chinese
    counterparts everywhere in the text.

    Args:
        sent: Unicode text to convert.

    Returns:
        The converted unicode text.
    """
    def replacements(ascii_quote, curly_pair, fullwidth_quote):
        # Endless supply of replacement characters for one quote kind.
        balanced = sent.count(ascii_quote) % 2 == 0
        return itertools.cycle(curly_pair if balanced else fullwidth_quote)

    double_quotes = replacements(u'"', u'\u201c\u201d', u'\uff02')
    single_quotes = replacements(u"'", u'\u2018\u2019', u'\uff07')

    def substitute(match):
        if match.group() == u'"':
            return next(double_quotes)
        return next(single_quotes)

    requoted = _QUOTE_PATTERN.sub(substitute, sent)
    return requoted.translate(_PUNCTUATION_TABLE)
def test(msg):
    """Print *msg* and the output of each converter applied to it.

    Args:
        msg: Unicode text passed unchanged to ``convert1`` and ``convert2``.
    """
    # Each line is formatted into one string first so the output is the
    # same whether ``print`` is the Python 2 statement or the Python 3
    # function.
    print(u"msg => %s" % msg)
    print(u"convert1(msg) => %s" % convert1(msg))
    print(u"convert2(msg) => %s" % convert2(msg))
def pk(repeat=1000):
    """Return the sample sentence repeated *repeat* times.

    Presumably meant as a large input for comparing the converters;
    nothing in the visible code calls it.

    Args:
        repeat: Number of copies to concatenate.  Defaults to 1000, the
            value that used to be hard-coded.

    Returns:
        A unicode string made of ``repeat`` copies of the sample sentence.
    """
    return u'基本原则"这"是一个"测试用例".' * repeat
if __name__ == '__main__':
    # Demo inputs, run through test() in this order.
    samples = (
        # Two balanced pairs of double quotes and a trailing period.
        u'基本原则"这"是一个"测试用例".',
        # Single quotes nested inside double quotes.
        u'''基本原则"这"是一个"'测试'用例"''',
        # A longer sentence with commas, one quoted name and a period.
        u'''昨日,玉溪市公安局红塔分局驻"高古楼网站"警长李正平表示,目前警方已介入调查此事.''',
        # Odd number of double quotes: the unbalanced case.
        u'测试, "这是"一"个"错误测试用例"',
        # Nothing but double quotes.
        u'""""""""""""',
    )
    for sample in samples:
        test(sample)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment