Created
February 8, 2015 02:46
-
-
Save gumblex/0d65cad2ba607fd14de7 to your computer and use it in GitHub Desktop.
Numbers to Chinese representations converter in Python. 中文数字转换
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed under WTFPL or the Unlicense or CC0.
# This uses Python 3, but it's easy to port to Python 2 by changing
# string literals to u'xx'.
import itertools | |
def num2chinese(num, big=False, simp=True, o=False, twoalt=False):
    """
    Convert a number to its Chinese numeral representation.

    `num`   : an int, a float, or a numeric string; it may carry a
              leading sign and a decimal point.
    `big`   : use financial characters.
    `simp`  : use simplified characters instead of traditional characters.
    `o`     : use 〇 for zero.
    `twoalt`: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns the Chinese representation as a string.

    Raises ValueError when `num` is not numeric, when its magnitude is
    1e48 or more, or when its text form uses scientific notation.
    """
    # Validate first: float() rejects non-numeric text with ValueError.
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError('number out of range')
    elif 'e' in nd.lower():
        # Covers both '1e+20' (from str(float)) and a user-supplied '1E5'.
        raise ValueError('scientific notation is not supported')

    # Character tables.
    c_symbol = '正负点' if simp else '正負點'  # plus, minus, decimal point
    if o:  # formal
        twoalt = False
    if big:
        c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        c_unit1 = '拾佰仟'
        c_twoalt = '贰' if simp else '貳'
    else:
        c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        c_unit1 = '十百千'
        if twoalt:
            c_twoalt = '两' if simp else '兩'
        else:
            c_twoalt = '二'
    # One unit per 4-digit group above the lowest: 10^4, 10^8, ... 10^44.
    c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
    zero = c_basic[0]

    result = []
    if nd[0] == '+':
        result.append(c_symbol[0])
    elif nd[0] == '-':
        result.append(c_symbol[1])

    # `remainder` is '' when there is no fractional part.
    integer, _, remainder = nd.lstrip('+-').partition('.')

    # An empty integer part (e.g. '.5') is treated as zero.
    if integer and int(integer):
        # Split into 4-digit groups, least significant group first.
        groups = [integer[max(i - 4, 0):i]
                  for i in range(len(integer), 0, -4)]
        pieces = []
        for nu, unit in enumerate(groups):
            value = int(unit)
            if value == 0:  # 0000: contributes only a placeholder zero
                pieces.append(zero)
                continue
            # A higher group that is exactly 2 reads with the alternate
            # "two" (e.g. 两万).  It still goes through the digit loop so
            # that its leading placeholder zero is kept (一亿零两万).
            ones_use_twoalt = nu > 0 and value == 2
            unit = unit.zfill(4)
            ulist = []  # digit readings, least significant first
            for nc, ch in enumerate(reversed(unit)):
                if ch == '0':
                    if ulist:  # ???0: trailing zeros in a group are silent
                        ulist.append(zero)
                elif nc == 0:
                    ulist.append(
                        c_twoalt if ones_use_twoalt else c_basic[int(ch)])
                elif nc == 1 and ch == '1' and unit[1] == '0':
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == '2':
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = ''.join(reversed(ulist))
            pieces.append(ustr if nu == 0 else ustr + c_unit2[nu - 1])

        text = ''.join(reversed(pieces))
        # Collapse every run of zeros to a single zero at character level,
        # so zeros from adjacent groups merge too (一亿零一, not 一亿零零一).
        double_zero = zero * 2
        while double_zero in text:
            text = text.replace(double_zero, zero)
        # Leading and trailing placeholder zeros are never read aloud.
        result.append(text.strip(zero))
    else:
        result.append(zero)

    if remainder:
        # Fractional digits are read one by one after the decimal point.
        result.append(c_symbol[2])
        result.append(''.join(c_basic[int(ch)] for ch in remainder))
    return ''.join(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I wrote an Arabic-to-Chinese number converter just now for fun. Upon finishing, I was curious to see how others are doing this, and that led me to this gist. Thought I'd share my attempt here as well:
The core implementation (`_zhnum` and `zhnum`, sans comments) is ~20 lines. It doesn't handle decimal numbers, though.
EDIT: fixed the regex for incorrect 两's in, for example, 二十两万. As an alternative to using a regex to handle 二 and 两, we can do this in `_zhnum` when concatenating `left_part` and the place unit. Specifically: the idea is to use 两 only when the whole (four-digit) chunk resolves to 2, instead of just some random number ending in 2.