Skip to content

Instantly share code, notes, and snippets.

@sanpingz
Created May 25, 2013 09:41
Show Gist options
  • Save sanpingz/5648527 to your computer and use it in GitHub Desktop.
Save sanpingz/5648527 to your computer and use it in GitHub Desktop.
unicode字符计数
#!/usr/bin/env python
#-*- coding:utf8 -*-
import re
import urllib
def counter(url, start, end):
    """Count Chinese characters and English words in part of a web page.

    The page at *url* is downloaded and decoded as UTF-8.  The first region
    that begins with the literal marker *start* and ends with the literal
    marker *end* (both markers included) is scanned.  HTML tags are skipped,
    every character in the range U+4E00..U+9FA5 counts as one Chinese
    character, and every run of ``[a-zA-Z0-9_-]`` counts as one English
    word.  The matched tokens are echoed to standard output, separated by
    single spaces and without a trailing newline.

    Args:
        url: Address of the page to fetch.
        start: Literal text marking the beginning of the region.
        end: Literal text marking the end of the region.

    Returns:
        A ``(num_zh, num_en)`` tuple of ints: Chinese character count and
        English word count.

    Raises:
        IndexError: If no region delimited by *start* and *end* is found.
    """
    # Local imports keep this function usable on both Python 2 and 3
    # without touching the file-level import block.
    try:  # Python 3
        from urllib.request import urlopen
    except ImportError:  # Python 2
        from urllib import urlopen
    import sys
    from contextlib import closing

    # Close the connection deterministically instead of leaking it.
    with closing(urlopen(url)) as response:
        page = response.read().decode('utf8')

    # Escape the markers so characters such as "(", "." or "?" are matched
    # literally instead of being interpreted as regex syntax.
    region = re.search(
        r'(?s)(?=%s).*?(?<=%s)' % (re.escape(start), re.escape(end)), page)
    if region is None:
        # Same exception type the former ``findall(...)[0]`` produced, but
        # with a message that says what went wrong.
        raise IndexError('no text found between %r and %r' % (start, end))
    article = region.group(0)

    # Alternatives: an HTML tag (ignored), a run of Chinese characters,
    # or an English/numeric word.
    token_re = re.compile(u'(?:<.*?>)|([\u4e00-\u9fa5]+)|([a-zA-Z0-9_-]+)')
    num_zh = num_en = 0
    tokens = []
    for match in token_re.finditer(article):
        zh, en = match.groups()
        if zh:
            # Each character in the run is one Chinese character.
            num_zh += len(zh)
            tokens.append(zh)
        if en:
            num_en += 1
            tokens.append(en)

    # Echo the tokens on one line; the caller terminates the line.
    echoed = u' '.join(tokens)
    if sys.version_info[0] < 3:
        # Python 2 byte streams need explicit UTF-8 for non-ASCII text.
        echoed = echoed.encode('utf8')
    sys.stdout.write(echoed)
    return num_zh, num_en
if __name__ == '__main__':
    # Page to analyse and the literal markers that delimit the text region.
    url = r'http://qd.58.com/zufang/14336862785801x.shtml'
    start, end = '<article class="description_con " >', '</article>'
    num_zh, num_en = counter(url, start, end)
    # counter() echoes the tokens without a newline; finish that line first.
    # Single-argument parenthesised print works on both Python 2 and 3.
    print('')
    # Chinese character count.
    print('中文字数: %d' % num_zh)
    # English word count.
    print('英文字数: %d' % num_en)
    # Combined total.
    print('总字数: %d' % (num_zh + num_en))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment