Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@tianchaijz
Created March 25, 2016 01:27
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save tianchaijz/1cc919d26c82005c139b to your computer and use it in GitHub Desktop.
Save tianchaijz/1cc919d26c82005c139b to your computer and use it in GitHub Desktop.
CJK Auto Formatting
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (unicode_literals, print_function)
import os
import re
import sys
import codecs
# Stream-reader factory used to decode UTF-8 bytes (see the stdin fallback in
# the script entry point).
utf8_reader = codecs.getreader('utf-8')

# Inclusive (first, last) code-point pairs treated as CJK ideographs.
# Each pair spans a whole Unicode block rather than only the code points that
# were assigned when the table was first written.  Ideographs added in later
# Unicode versions are therefore recognised too, and the table agrees with
# the \u4e00-\u9fff class that _punc_sub uses for "Chinese character".
cjk_range = [
    ('\u3400', '\u4DBF'),  # CJK Unified Ideographs Extension A
    ('\u4E00', '\u9FFF'),  # CJK Unified Ideographs
    ('\uF900', '\uFAFF'),  # CJK Compatibility Ideographs
    ('\U00020000', '\U0002A6DF'),  # CJK Unified Ideographs Extension B
    ('\U0002A700', '\U0002EE5F'),  # CJK Unified Ideographs Extensions C-F, I
    ('\U0002F800', '\U0002FA1F'),  # CJK Compatibility Ideographs Supplement
    ('\U00030000', '\U000323AF'),  # CJK Unified Ideographs Extensions G-H
]

# Inclusive (first, last) code-point pairs around which no space is inserted.
punc_range = [
    ('\u0000', '\u0020'),  # control characters and the ASCII space
    ('\u201c', '\u201d'),  # left/right double quotation marks
    ('\u3000', '\u303f'),  # CJK Symbols and Punctuation
    ('\uff00', '\uffef'),  # Halfwidth and Fullwidth Forms
]
def _chinese_auto_spacing(text):
    """Return *text* with a space inserted at CJK / non-CJK boundaries.

    Characters are classified with the module-level ``cjk_range`` and
    ``punc_range`` tables.  A single space is inserted between two adjacent
    characters when exactly one of them is CJK, unless either of the two
    falls in ``punc_range``; in that case nothing is inserted.

    :param text: the text to process, iterated one character at a time.
    :returns: the text with the extra spaces added.
    """
    def _in_ranges(char, ranges):
        # True when char lies inside any inclusive (start, end) pair.
        return any(start <= char <= end for start, end in ranges)

    # Collect the output pieces and join once at the end, instead of growing
    # a string with repeated concatenation.
    pieces = []
    prev = None  # (is_cjk, is_punc) of the previous character, if any
    for char in text:
        curr_is_cjk = _in_ranges(char, cjk_range)
        curr_is_punc = _in_ranges(char, punc_range)
        if prev is not None:
            prev_is_cjk, prev_is_punc = prev
            # Never add a space next to punctuation; otherwise add one when
            # the text switches between CJK and non-CJK.
            touches_punc = curr_is_punc or prev_is_punc
            if not touches_punc and prev_is_cjk != curr_is_cjk:
                pieces.append(' ')
        pieces.append(char)
        prev = (curr_is_cjk, curr_is_punc)
    return ''.join(pieces)
def _punc_sub(text):
punc_dict = {
',': ',',
':': ':',
';': ';',
'(': '(',
')': ')',
}
text = re.sub(r'[ \t]+', ' ', text)
text = re.sub(r'[ \t]+([.,:;()])', r'\1', text)
text = re.sub(r'"([\u4e00-\u9fff]+)"', r'“\1”', text)
text = re.sub(r"'([\u4e00-\u9fff]+)'", r'“\1”', text)
text = re.sub(r'([\u4e00-\u9fff])\.', r'\1。', text)
text = re.sub(r'[,:;()]',
lambda m: punc_dict[m.group(0)], text)
text = re.sub(r'[ \t]?([,:。;()“”])[ \t]?', r'\1', text)
return text
def process_text(text):
    """Format *text* and return the result encoded as UTF-8.

    Punctuation is normalised with ``_punc_sub`` first; the outcome is then
    passed through ``_chinese_auto_spacing`` before being encoded.
    """
    formatted = _chinese_auto_spacing(_punc_sub(text))
    return formatted.encode('utf-8')
if __name__ == '__main__':
    # Script entry point: read UTF-8 text from the file named by the single
    # command-line argument, or from stdin when no such file is given, and
    # write the formatted result to stdout.
    text = None
    if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]):
        # Context manager so the file handle is closed once it has been read.
        with codecs.open(sys.argv[1], 'rb', 'utf-8') as source:
            text = source.read()
    if text is None:
        # utf8_reader decodes bytes, so give it the underlying byte stream
        # where the interpreter exposes one as ``.buffer``; otherwise fall
        # back to the stream object itself.
        text = utf8_reader(getattr(sys.stdin, 'buffer', sys.stdin)).read()
    # process_text() returns UTF-8 encoded bytes, so they must likewise go to
    # the byte-oriented side of stdout when it is available.
    getattr(sys.stdout, 'buffer', sys.stdout).write(process_text(text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment