Created
February 8, 2015 02:46
-
-
Save gumblex/0d65cad2ba607fd14de7 to your computer and use it in GitHub Desktop.
Numbers to Chinese representations converter in Python. 中文数字转换
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed under WTFPL or the Unlicense or CC0.
# This uses Python 3, but it's easy to port to Python 2 by changing
# strings to u'xx'.
import itertools
def num2chinese(num, big=False, simp=True, o=False, twoalt=False):
    """
    Convert a number to its Chinese representation.

    `num`   : an int, a float, or a string holding a plain decimal number
              (an optional leading sign and at most one decimal point).
    `big`   : use financial characters.
    `simp`  : use simplified characters instead of traditional characters.
    `o`     : use 〇 for zero.
    `twoalt`: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns the representation as a string. An explicit leading '+' is
    rendered as 正 and a leading '-' as 负/負.

    Raises ValueError when the magnitude is 1e48 or more, when `num` is
    written in scientific notation, or when it cannot be parsed as a number.
    """
    # Validate first: float() rejects anything that is not a number at all.
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError('number out of range')
    if 'e' in nd.lower():
        raise ValueError('scientific notation is not supported')

    # Character tables; every choice depends only on the keyword flags.
    c_symbol = '正负点' if simp else '正負點'
    if o:  # formal
        twoalt = False
    if big:
        c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        c_unit1 = '拾佰仟'
        c_twoalt = '贰' if simp else '貳'
    else:
        c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        c_unit1 = '十百千'
        if twoalt:
            c_twoalt = '两' if simp else '兩'
        else:
            c_twoalt = '二'
    c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
    zero = c_basic[0]

    result = []
    if nd[0] == '+':
        result.append(c_symbol[0])
    elif nd[0] == '-':
        result.append(c_symbol[1])

    # float() above already rejected inputs with more than one '.'.
    integer, _, remainder = nd.lstrip('+-').partition('.')
    # An input such as '.5' has an empty integer part; read it as zero.
    integer = integer or '0'

    if int(integer):
        # Four-digit groups, least significant group first.
        groups = [integer[max(i - 4, 0):i]
                  for i in range(len(integer), 0, -4)]
        parts = []
        for nu, unit in enumerate(groups):
            value = int(unit)
            if value == 0:  # 0000
                parts.append(zero)
                continue
            # Looked up only for non-zero groups, so surplus leading zero
            # groups in a string input never index past the table.
            suffix = c_unit2[nu - 1] if nu else ''
            if nu > 0 and value == 2:  # 0002
                # Keep the zero marker that the generic path below would
                # produce for the three leading zeros; a redundant marker
                # is squeezed or stripped once the groups are joined.
                parts.append(zero + c_twoalt + suffix)
                continue
            digits = unit.zfill(4)
            pieces = []  # least significant digit first
            for nc, ch in enumerate(reversed(digits)):
                if ch == '0':
                    # Trailing zeros of a group are silent; inner and
                    # leading zeros leave a marker.
                    if pieces:  # ???0
                        pieces.append(zero)
                elif nc == 0:
                    pieces.append(c_basic[int(ch)])
                elif nc == 1 and ch == '1' and digits[1] == '0':
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    pieces.append(c_unit1[0])
                elif nc > 1 and ch == '2':
                    pieces.append(c_twoalt + c_unit1[nc - 1])
                else:
                    pieces.append(c_basic[int(ch)] + c_unit1[nc - 1])
            parts.append(''.join(reversed(pieces)) + suffix)

        text = ''.join(reversed(parts))
        # Collapse every run of zero markers, including runs that straddle
        # a group boundary, then drop leading and trailing markers.
        while zero * 2 in text:
            text = text.replace(zero * 2, zero)
        result.append(text.strip(zero))
    else:
        result.append(zero)

    if remainder:
        # Fractional digits are read out one by one after the point.
        result.append(c_symbol[2])
        result.append(''.join(c_basic[int(ch)] for ch in remainder))
    return ''.join(result)
good one
try
-1297000000000101.1
it outputs:
负一千二百九十七兆零零一百零一点一
I wrote an Arabic-to-Chinese number converter just now for fun. Upon finishing, I was curious to see how others are doing this, and that led me to this gist. Thought I'd share my attempt here as well:
import bisect
import re
import unittest
# Chinese digit characters, indexed by the digit's numeric value (0-9).
zhdigits = '零一二三四五六七八九'

# Place names keyed by the number of digits that trail the place:
# 1 -> 十 (tens), 2 -> 百, 3 -> 千, 4 -> 万, 8 -> 亿. Key 0 is the nameless
# ones place.
zhplaces = {0: '', 1: '十', 2: '百', 3: '千', 4: '万', 8: '亿'}

# The place lengths in ascending order, ready for bisect lookups.
zhplace_keys = sorted(zhplaces)
def numdigits(n):
    """Return the length of the decimal text of `n` with its sign dropped."""
    unsigned_text = str(abs(n))
    return len(unsigned_text)
def _zhnum(n):
    """
    Spell `n` in Chinese without any grammar touch-ups.

    `zhnum` calls this with `abs(n)`, i.e. a non-negative value. The result
    is the raw recursive spelling: 10 comes out as '一十' and every two is
    written '二'.
    """
    # A single digit maps straight to its character.
    if n < 10:
        return zhdigits[n]

    # Pick the largest named place whose trailing-digit count still fits
    # inside `n`, e.g. 100 -> 2 (百) and 10_0000 -> 4 (万).
    slot = bisect.bisect_right(zhplace_keys, numdigits(n) - 1) - 1
    place_len = zhplace_keys[slot]

    # Split `n` at that place and spell each side recursively, e.g.
    # 208_0300 -> 208 | 0300 -> '二百零八' + '万' + '零' + '三百'.
    high, low = divmod(n, 10 ** place_len)
    head = _zhnum(high) + zhplaces[place_len]
    if not low:
        # Nothing follows the place name (e.g. 300 -> '三百').
        return head

    # When the low side has fewer digits than the place holds, it had
    # leading zeros, which are spoken as a single complementing '零'.
    filler = '' if numdigits(low) == place_len else zhdigits[0]
    return head + filler + _zhnum(low)
def zhnum(n):
    """
    Return the Chinese spelling of the integer `n`.

    The magnitude is spelled by `_zhnum` and then adjusted:
      * a leading '一十' is shortened to '十' (10 -> '十', 14 -> '十四');
      * '二' directly before 千, 万 or 亿 becomes '两', unless it follows
        '零' or '十'.
    Negative numbers get a '负' prefix.
    """
    spelled = _zhnum(abs(n))
    # Both rewrites run on the unsigned text. The '^一十' pattern is anchored
    # at the start of the string, so attaching the sign first would stop it
    # from matching and leave -10 spelled as '负一十'.
    spelled = re.sub(r'^一十', '十', spelled)
    spelled = re.sub(r'(?<![零十])二(?=[千万亿])', r'两', spelled)
    sign = '负' if n < 0 else ''
    return sign + spelled
class TestZhnum(unittest.TestCase):
    '''Test Chinese number formatter.'''

    def test(self):
        # (input, expected spelling) pairs, checked in order; the first
        # mismatch fails the test.
        expectations = [
            (-1, '负一'),
            (0, '零'),
            (-0, '零'),
            (6, '六'),
            (10, '十'),
            (14, '十四'),
            (28, '二十八'),
            (59, '五十九'),
            (100, '一百'),
            (101, '一百零一'),
            (110, '一百一十'),
            (132, '一百三十二'),
            (1000, '一千'),
            (2001, '两千零一'),
            (3010, '三千零一十'),
            (4012, '四千零一十二'),
            (5230, '五千二百三十'),
            (6234, '六千二百三十四'),
            (9999, '九千九百九十九'),
            (1_0000, '一万'),
            (20_0000, '二十万'),
            (123_4567, '一百二十三万四千五百六十七'),
            (500_1024, '五百万一千零二十四'),
            (360_5000, '三百六十万五千'),
            (3_0000_0000, '三亿'),
            (2_7600_2010, '两亿七千六百万两千零一十'),
            (2_0000_0000_0000_0000, '两亿亿'),
            (2_2002_2222, '两亿两千零二万两千二百二十二'),
            (22_2222_2222, '二十二亿两千二百二十二万两千二百二十二'),
        ]
        for number, spelling in expectations:
            self.assertEqual(zhnum(number), spelling)
# Run the unit tests only when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
The core implementation (_zhnum and zhnum, sans comments) is ~20 lines. It doesn't handle decimal numbers, though.
EDIT: fixed the regex that produced incorrect 两's in, for example, 二十两万. As an alternative to using a regex to handle 二 and 两, we can do this in _zhnum when concatenating left_part and the place unit. Specifically:
unit = zhplaces[named_place_len]
f'两{unit}' if left_part == 2 and unit in set('千万亿') else _zhnum(left_part) + unit
the idea is to use 两 only when the whole (four-digit) chunk resolves to 2, instead of just some random number ending in 2.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
very nice!