tianchaijz/cjk_auto_formating.py

## cjk_auto_formating.py
#!/usr/bin/env python
# encoding: utf-8

from __future__ import (unicode_literals, print_function)

import os
import re
import sys
import codecs

# Factory that wraps a byte stream in a UTF-8 decoding stream reader.
utf8_reader = codecs.getreader('utf-8')

# Inclusive (first, last) code point pairs treated as CJK ideographs.
# Each pair spans the whole Unicode block rather than only the code points
# assigned in an older Unicode version, so ideographs added to a block later
# are recognised too. The BMP coverage is a superset of the U+4E00-U+9FFF
# character class used by _punc_sub.
cjk_range = [
    ('\u3400', '\u4DBF'),  # CJK Unified Ideographs Extension A
    ('\u4E00', '\u9FFF'),  # CJK Unified Ideographs
    ('\uF900', '\uFAFF'),  # CJK Compatibility Ideographs
    ('\U00020000', '\U0002A6DF'),  # CJK Unified Ideographs Extension B
    ('\U0002F800', '\U0002FA1F'),  # CJK Compatibility Ideographs Supplement
]

# Inclusive (first, last) code point pairs next to which no automatic space
# is inserted.
punc_range = [
    ('\u0000', '\u0020'),  # C0 control characters and ASCII space
    ('\u201c', '\u201d'),  # “”
    ('\u3000', '\u303f'),  # CJK Symbols and Punctuation
    ('\uff00', '\uffef'),  # Halfwidth and Fullwidth Forms
]


def _chinese_auto_spacing(text):
    """Insert a space at each boundary between CJK and non-CJK characters.

    No space is inserted when either neighbouring character falls inside
    ``punc_range`` (which also covers ASCII whitespace and control
    characters), so existing spacing and punctuation are left as they are.

    Args:
        text: The text to process.

    Returns:
        The text with the separating spaces inserted.
    """
    def _in_ranges(char, ranges):
        # True when char lies inside any inclusive (start, end) pair.
        return any(start <= char <= end for start, end in ranges)

    # Collect the output in a list and join once at the end; repeated
    # string concatenation would copy the growing result on every step.
    pieces = []
    prev = None  # (is_cjk, is_punc) of the previous character, if any

    for char in text:
        curr_is_cjk = _in_ranges(char, cjk_range)
        curr_is_punc = _in_ranges(char, punc_range)

        if prev is not None:
            prev_is_cjk, prev_is_punc = prev
            # Never add a space around punctuation/whitespace.
            around_punc = curr_is_punc or prev_is_punc
            if not around_punc and prev_is_cjk != curr_is_cjk:
                pieces.append(' ')

        pieces.append(char)
        prev = (curr_is_cjk, curr_is_punc)

    return ''.join(pieces)


def _punc_sub(text):
    punc_dict = {
        ',': '，',
        ':': '：',
        ';': '；',
        '(': '（',
        ')': '）',
    }

    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'[ \t]+([.,:;()])', r'\1', text)
    text = re.sub(r'"([\u4e00-\u9fff]+)"', r'“\1”', text)
    text = re.sub(r"'([\u4e00-\u9fff]+)'", r'“\1”', text)
    text = re.sub(r'([\u4e00-\u9fff])\.', r'\1。', text)
    text = re.sub(r'[,:;()]',
                  lambda m: punc_dict[m.group(0)], text)
    text = re.sub(r'[ \t]?([，：。；（）“”])[ \t]?', r'\1', text)

    return text


def process_text(text):
    """Format *text* and return the result encoded as UTF-8.

    Punctuation is normalised first, then spaces are inserted between CJK
    and non-CJK characters.

    Args:
        text: The text to format.

    Returns:
        The formatted text, encoded with UTF-8.
    """
    return _chinese_auto_spacing(_punc_sub(text)).encode('utf-8')


if __name__ == '__main__':
    # Input comes from the file named by the sole command-line argument when
    # that file exists; otherwise it is read from standard input.
    text = None
    if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]):
        # The context manager closes the file handle once it has been read.
        with codecs.open(sys.argv[1], 'rb', 'utf-8') as source:
            text = source.read()

    if text is None:
        # Python 3 exposes the raw byte stream as sys.stdin.buffer; Python 2
        # has no such attribute and sys.stdin already yields bytes.
        text = utf8_reader(getattr(sys.stdin, 'buffer', sys.stdin)).read()

    # process_text returns UTF-8 encoded bytes, which Python 3's text-mode
    # sys.stdout rejects, so write to the underlying byte stream if present.
    getattr(sys.stdout, 'buffer', sys.stdout).write(process_text(text))
	#!/usr/bin/env python
	# encoding: utf-8

	from __future__ import (unicode_literals, print_function)

	import os
	import re
	import sys
	import codecs

	# Factory that wraps a byte stream in a UTF-8 decoding stream reader.
	utf8_reader = codecs.getreader('utf-8')

	# Inclusive (first, last) code point pairs treated as CJK ideographs.
	# Each pair spans the whole Unicode block rather than only the code points
	# assigned in an older Unicode version, so ideographs added to a block
	# later are recognised too. The BMP coverage is a superset of the
	# U+4E00-U+9FFF character class used by _punc_sub.
	cjk_range = [
		('\u3400', '\u4DBF'),  # CJK Unified Ideographs Extension A
		('\u4E00', '\u9FFF'),  # CJK Unified Ideographs
		('\uF900', '\uFAFF'),  # CJK Compatibility Ideographs
		('\U00020000', '\U0002A6DF'),  # CJK Unified Ideographs Extension B
		('\U0002F800', '\U0002FA1F'),  # CJK Compatibility Ideographs Supplement
	]

	# Inclusive (first, last) code point pairs next to which no automatic
	# space is inserted.
	punc_range = [
		('\u0000', '\u0020'),  # C0 control characters and ASCII space
		('\u201c', '\u201d'),  # “”
		('\u3000', '\u303f'),  # CJK Symbols and Punctuation
		('\uff00', '\uffef'),  # Halfwidth and Fullwidth Forms
	]


	def _chinese_auto_spacing(text):
		"""Insert a space at each boundary between CJK and non-CJK characters.

		No space is inserted when either neighbouring character falls inside
		``punc_range`` (which also covers ASCII whitespace and control
		characters), so existing spacing and punctuation are left as they are.

		Args:
			text: The text to process.

		Returns:
			The text with the separating spaces inserted.
		"""
		def _with_range(char, check_range):
			# True when char lies inside any inclusive (start, end) pair.
			for start, end in check_range:
				if char >= start and char <= end:
					return True
			return False

		def is_cjk(char):
			return _with_range(char, cjk_range)

		def is_punc(char):
			return _with_range(char, punc_range)

		# Collect the output in a list and join once at the end; repeated
		# string concatenation would copy the growing result on every step.
		parts = []
		prev = None  # (is_cjk, is_punc) of the previous character, if any

		for char in text:
			sp = ''
			curr_is_cjk = is_cjk(char)
			curr_is_punc = is_punc(char)

			if prev is not None:
				prev_is_cjk, prev_is_punc = prev

				if curr_is_punc or prev_is_punc:
					# do not add space around a punctuation
					sp = ''
				elif prev_is_cjk != curr_is_cjk:
					sp = ' '

			parts.append(sp)
			parts.append(char)
			prev = (curr_is_cjk, curr_is_punc)

		return ''.join(parts)


	def _punc_sub(text):
	punc_dict = {
	',': '，',
	':': '：',
	';': '；',
	'(': '（',
	')': '）',
	}

	text = re.sub(r'[ \t]+', ' ', text)
	text = re.sub(r'[ \t]+([.,:;()])', r'\1', text)
	text = re.sub(r'"([\u4e00-\u9fff]+)"', r'“\1”', text)
	text = re.sub(r"'([\u4e00-\u9fff]+)'", r'“\1”', text)
	text = re.sub(r'([\u4e00-\u9fff])\.', r'\1。', text)
	text = re.sub(r'[,:;()]',
	lambda m: punc_dict[m.group(0)], text)
	text = re.sub(r'[ \t]?([，：。；（）“”])[ \t]?', r'\1', text)

	return text


	def process_text(text):
		"""Format *text* and return the result encoded as UTF-8.

		Punctuation is normalised first, then spaces are inserted between CJK
		and non-CJK characters.

		Args:
			text: The text to format.

		Returns:
			The formatted text, encoded with UTF-8.
		"""
		text = _punc_sub(text)
		text = _chinese_auto_spacing(text)

		return text.encode('utf-8')


	if __name__ == '__main__':
		# Input comes from the file named by the sole command-line argument
		# when that file exists; otherwise it is read from standard input.
		text = None
		if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]):
			# The context manager closes the file handle once it has been read.
			with codecs.open(sys.argv[1], 'rb', 'utf-8') as source:
				text = source.read()

		if text is None:
			# Python 3 exposes the raw byte stream as sys.stdin.buffer;
			# Python 2 has no such attribute and sys.stdin already yields bytes.
			text = utf8_reader(getattr(sys.stdin, 'buffer', sys.stdin)).read()

		# process_text returns UTF-8 encoded bytes, which Python 3's text-mode
		# sys.stdout rejects, so write to the underlying byte stream if present.
		getattr(sys.stdout, 'buffer', sys.stdout).write(process_text(text))