zhiyue/cn.py

## cn.py
import re
from types import StringType
# Tokenizer pattern.  The first alternative matches runs of ASCII word
# characters and Greek letters (U+0392-U+03C9); the second matches runs of
# CJK characters: Han ideographs (U+4E00-U+9FFF, U+3400-U+4DBF,
# U+F900-U+FAFF), Hiragana (U+3040-U+309F) and Hangul syllables
# (U+AC00-U+D7AF).  Compiled once at import time rather than on every call.
_CJK_TOKEN_RX = re.compile(
    u"[a-zA-Z0-9_\u0392-\u03c9]+|[\u4E00-\u9FFF\u3400-\u4dbf\uf900-\ufaff\u3040-\u309f\uac00-\ud7af]+",
    re.UNICODE)

# Lowest code point accepted by the CJK alternative (U+3040).  Every character
# accepted by the ASCII/Greek alternative lies below it, so comparing a
# token's first character against this value tells the two kinds apart.
_CJK_FIRST_CODEPOINT = 0x3040


def statistics_cn_words(s, encoding='utf-8'):
    """Return the number of CJK characters contained in *s*.

    The text is split into runs of ASCII/Greek word characters and runs of
    CJK characters; only the lengths of the CJK runs are summed.  Anything
    matched by neither alternative (whitespace, punctuation, ...) is skipped.

    Args:
        s: Text to analyse.  A byte string is decoded with *encoding* first
            (undecodable bytes are dropped); a text string is used as is.
        encoding: Codec used to decode *s* when it is a byte string.

    Returns:
        The total count of CJK characters as an int; 0 for empty input.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this check covers the
    # old ``StringType`` case there and real ``bytes`` objects on Python 3.
    if isinstance(s, bytes):
        s = s.decode(encoding, 'ignore')

    return sum(len(token) for token in _CJK_TOKEN_RX.findall(s)
               if ord(token[0]) >= _CJK_FIRST_CODEPOINT)
	from types import StringType
	def statistics_cn_words(s, encoding='utf-8'):
		"""Count the CJK characters found in *s*.

		Args:
			s: Text to analyse.  Byte strings are decoded with *encoding*
				(undecodable bytes are ignored); text strings are used as is.
			encoding: Codec applied when *s* is a byte string.

		Returns:
			Total number of characters that belong to CJK runs; 0 when
			there are none.
		"""
		# Two alternatives: runs of ASCII word characters / Greek letters, or
		# runs of CJK characters (Han, Hiragana, Hangul syllables).  The
		# separator must be a bare ``|``; an escaped ``\|`` would be a literal
		# pipe and the pattern would stop matching plain CJK runs.
		rx = re.compile(u"[a-zA-Z0-9_\u0392-\u03c9]+|[\u4E00-\u9FFF\u3400-\u4dbf\uf900-\ufaff\u3040-\u309f\uac00-\ud7af]+",
				re.UNICODE)
		# ``bytes`` is ``str`` on Python 2 and the byte-string type on Python 3.
		if isinstance(s, bytes):  # not unicode
			s = s.decode(encoding, 'ignore')

		cjk_len = 0
		for w in rx.findall(s):
			# A run whose first character is at or above U+3040 came from the
			# CJK alternative; ASCII/Greek runs always start below it.
			if ord(w[0]) >= 0x3040:
				cjk_len += len(w)
		return cjk_len