-
-
Save gumblex/0d65cad2ba607fd14de7 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# Licensed under WTFPL or the Unlicense or CC0. | |
# This uses Python 3, but it's easy to port to Python 2 by changing | |
# strings to u'xx'. | |
import itertools | |
def num2chinese(num, big=False, simp=True, o=False, twoalt=False):
    """
    Converts numbers to Chinese representations.

    `num`   : an int, a float or a numeric string; the text of `str(num)`
              is what gets converted.  A sign is only spelled out when that
              text starts with '+' or '-'.
    `big`   : use financial characters.
    `simp`  : use simplified characters instead of traditional characters.
    `o`     : use 〇 for zero.
    `twoalt`: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns the Chinese numeral as a string.
    Raises ValueError when the magnitude is >= 1e48, when `str(num)` is in
    scientific notation, or when it cannot be parsed as a number.
    """
    # check num first
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError('number out of range')
    elif 'e' in nd.lower():
        raise ValueError('scientific notation is not supported')
    c_symbol = '正负点' if simp else '正負點'
    if o:  # formal
        twoalt = False
    if big:
        c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        c_unit1 = '拾佰仟'
        c_twoalt = '贰' if simp else '貳'
    else:
        c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        c_unit1 = '十百千'
        if twoalt:
            c_twoalt = '两' if simp else '兩'
        else:
            c_twoalt = '二'
    c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
    zero = c_basic[0]

    def revuniq(parts):
        # Parts are collected least-significant first; emit them
        # most-significant first and merge runs of identical parts
        # (in practice, runs of the zero character).
        return ''.join(k for k, _ in itertools.groupby(reversed(parts)))

    result = []
    if nd[0] == '+':
        result.append(c_symbol[0])
    elif nd[0] == '-':
        result.append(c_symbol[1])
    if '.' in nd:
        integer, remainder = nd.lstrip('+-').split('.')
    else:
        integer, remainder = nd.lstrip('+-'), None
    if int(integer):
        # Four-digit groups, least-significant group first.
        splitted = [integer[max(i - 4, 0):i]
                    for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(zero)
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                # Keep the leading zero so a higher group is still followed
                # by 零 (一亿零二万); it is stripped below when this is the
                # most significant group.
                intresult.append(zero + c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == '0':
                    if ulist:  # ???0
                        ulist.append(zero)
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == '1' and unit[1] == '0':
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == '2':
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        joined = ''.join(reversed(intresult))
        # A group may begin with a zero and also follow an all-zero group;
        # collapse every run of zeros to a single character.
        while zero * 2 in joined:
            joined = joined.replace(zero * 2, zero)
        result.append(joined.strip(zero))
    else:
        result.append(zero)
    if remainder:
        # Fractional digits are read out one by one, zeros included.
        result.append(c_symbol[2])
        result.append(''.join(c_basic[int(ch)] for ch in remainder))
    return ''.join(result)
Works. Thanks.
works. thanks
Thanks, it works. I also wrote a simple reverse function:
from unicodedata import numeric
def chinese2num(s):
    """
    Convert a Chinese numeral string to a number.

    Each character is looked up with `unicodedata.numeric`.  Values below
    ten are held as the pending digit; values of ten or more are units that
    are folded into the running total.

    A trailing digit directly after a unit is read colloquially as a
    fraction of that unit (三千五 -> 3500, 十二 -> 12).

    Returns 0 for an empty string; otherwise the result is built from the
    floats that `numeric` returns.
    """
    amount = 0
    # Start with no pending digit so a leading unit (or empty input)
    # does not hit an unbound local.
    digit = 0
    for ch in s:
        number = numeric(ch)
        if number < 10:
            digit = number
        else:
            if number == 10 and not digit:
                # 十 with no digit in front of it means 一十 (十二, 三千零十四).
                digit = 1
            # A unit larger than the running total scales all of it
            # (一百二十三 + 万); a smaller one only scales the pending digit.
            amount = (amount + digit) * number if number > amount else amount + digit * number
            digit = 0
    if len(s) > 1 and numeric(s[-2]) != 0:
        # Trailing digit takes one tenth of the preceding character's value.
        return amount + digit * numeric(s[-2]) / 10
    return amount + digit
Thanks, it works. I also wrote a simple reverse function:
from unicodedata import numeric


def chinese2num(s):
    """
    Convert a Chinese numeral string to a number.

    Each character is looked up with `unicodedata.numeric`.  Values below
    ten are held as the pending digit; values of ten or more are units that
    are folded into the running total.  A trailing digit directly after a
    unit is read as a fraction of that unit (三千五 -> 3500, 十二 -> 12).

    Returns 0 for an empty string.
    """
    amount = 0
    # No pending digit yet; avoids an unbound local on a leading unit.
    digit = 0
    for ch in s:
        number = numeric(ch)
        if number < 10:
            digit = number
        else:
            if number == 10 and not digit:
                # 十 with no digit in front of it means 一十.
                digit = 1
            amount = (amount + digit) * number if number > amount else amount + digit * number
            digit = 0
    if len(s) > 1 and numeric(s[-2]) != 0:
        # Trailing digit takes one tenth of the preceding character's value.
        return amount + digit * numeric(s[-2]) / 10
    return amount + digit
Hi, you may need to handle "十二" (a number that starts with 十), and I am not sure the check "len(s) > 1 and numeric(s[-2]) != 0" is needed, so my suggestion is:
def chinese2num(s):
    """
    Convert a Chinese numeral string to an int.

    Each character is looked up with `numeric` (from `unicodedata`).
    Values below ten are held as the pending digit; values of ten or more
    are units folded into the running total.  Returns 0 for an empty
    string.
    """
    amount = 0
    # No pending digit yet; avoids an unbound local on a leading unit
    # and an IndexError on empty input.
    digit = 0
    for ch in s:
        number = numeric(ch)
        if number < 10:
            digit = number
        else:
            if number == 10 and not digit:
                # 十 with no digit in front of it means 一十 (十二, 三千零十四).
                digit = 1
            amount = (amount + digit) * number if number > amount else amount + digit * number
            digit = 0
    return int(amount + digit)
Thanks, it works. I also wrote a simple reverse function:
from unicodedata import numeric


def chinese2num(s):
    """
    Convert a Chinese numeral string to a number.

    Each character is looked up with `unicodedata.numeric`.  Values below
    ten are held as the pending digit; values of ten or more are units that
    are folded into the running total.  A trailing digit directly after a
    unit is read as a fraction of that unit (三千五 -> 3500, 十二 -> 12).

    Returns 0 for an empty string.
    """
    amount = 0
    # No pending digit yet; avoids an unbound local on a leading unit.
    digit = 0
    for ch in s:
        number = numeric(ch)
        if number < 10:
            digit = number
        else:
            if number == 10 and not digit:
                # 十 with no digit in front of it means 一十.
                digit = 1
            amount = (amount + digit) * number if number > amount else amount + digit * number
            digit = 0
    if len(s) > 1 and numeric(s[-2]) != 0:
        # Trailing digit takes one tenth of the preceding character's value.
        return amount + digit * numeric(s[-2]) / 10
    return amount + digit
Hi, you may need to handle "十二" (a number that starts with 十), and I am not sure the check "len(s) > 1 and numeric(s[-2]) != 0" is needed, so my suggestion is:
def chinese2num(s):
    """
    Convert a Chinese numeral string to an int.

    Each character is looked up with `numeric` (from `unicodedata`).
    Values below ten are held as the pending digit; values of ten or more
    are units folded into the running total.  Returns 0 for an empty
    string.
    """
    amount = 0
    # No pending digit yet; avoids an unbound local on a leading unit
    # and an IndexError on empty input.
    digit = 0
    for ch in s:
        number = numeric(ch)
        if number < 10:
            digit = number
        else:
            if number == 10 and not digit:
                # 十 with no digit in front of it means 一十 (十二, 三千零十四).
                digit = 1
            amount = (amount + digit) * number if number > amount else amount + digit * number
            digit = 0
    return int(amount + digit)
very nice!
good one
try
-1297000000000101.1
it outputs:
负一千二百九十七兆零零一百零一点一
i wrote an arabic-to-chinese number converter just now for fun. upon finishing I was curious to see how others are doing this. and that led me to this gist. thought i'd share my attempt here as well:
import bisect
import re
import unittest
# Digit glyphs, indexed by their integer value (0-9).
zhdigits = '零一二三四五六七八九'

# Named places, keyed by the number of digits that trail the place.
zhplaces = {
    0: '',
    1: '十',
    2: '百',
    3: '千',
    4: '万',
    8: '亿',
}

# Place lengths in ascending order, ready for bisect lookups.
zhplace_keys = sorted(zhplaces)
def numdigits(n):
    """Return how many characters `str` uses for the absolute value of `n`."""
    magnitude = abs(n)
    return len(str(magnitude))
def _zhnum(n):
    """
    Spell `n` in Chinese numerals without grammar fix-ups.

    The number is split at the largest named place that fits it, e.g.

        n        | place length | name
        100      | 2            | 百
        10_0000  | 4            | 万

    and the result is spelling(left) + place name + spelling(right).
    When the right part is non-zero but has fewer digits than the place
    length, a complementing 零 is put in front of it:

        208|0300  ->  二百零八 + 万 + 零 + 三百

    Exceptions such as 十 instead of 一十 are not handled here.
    """
    if n < 10:
        return zhdigits[n]

    # Largest named place whose length does not exceed the trailing digits.
    trailing = numdigits(n) - 1
    slot = bisect.bisect_right(zhplace_keys, trailing) - 1
    place = zhplace_keys[slot]

    high, low = divmod(n, 10 ** place)
    pieces = [_zhnum(high), zhplaces[place]]
    if low:
        if numdigits(low) != place:
            # The right part lost leading zeros; mark the gap.
            pieces.append(zhdigits[0])
        pieces.append(_zhnum(low))
    return ''.join(pieces)
def zhnum(n):
    """
    Return the Chinese numeral for the integer `n`.

    Negative numbers are prefixed with 负.  Two fix-ups are applied to the
    raw spelling produced by `_zhnum`: a leading 一十 becomes 十, and 二
    directly before 千/万/亿 becomes 两 unless it follows 零 or 十.
    """
    magnitude = _zhnum(abs(n))
    # Fix up the unsigned spelling first: once 负 is prepended, the '^'
    # anchor no longer sees a leading 一十 (-10 must read 负十).
    magnitude = re.sub(r'^一十', '十', magnitude)
    magnitude = re.sub(r'(?<![零十])二(?=[千万亿])', r'两', magnitude)
    return ('负' if n < 0 else '') + magnitude
class TestZhnum(unittest.TestCase):
    """Check zhnum against a table of known spellings."""

    def test(self):
        # (input, expected spelling) pairs, checked in order; the first
        # mismatch fails the test.
        expectations = (
            (-1, '负一'),
            (0, '零'),
            (-0, '零'),
            (6, '六'),
            (10, '十'),
            (14, '十四'),
            (28, '二十八'),
            (59, '五十九'),
            (100, '一百'),
            (101, '一百零一'),
            (110, '一百一十'),
            (132, '一百三十二'),
            (1000, '一千'),
            (2001, '两千零一'),
            (3010, '三千零一十'),
            (4012, '四千零一十二'),
            (5230, '五千二百三十'),
            (6234, '六千二百三十四'),
            (9999, '九千九百九十九'),
            (1_0000, '一万'),
            (20_0000, '二十万'),
            (123_4567, '一百二十三万四千五百六十七'),
            (500_1024, '五百万一千零二十四'),
            (360_5000, '三百六十万五千'),
            (3_0000_0000, '三亿'),
            (2_7600_2010, '两亿七千六百万两千零一十'),
            (2_0000_0000_0000_0000, '两亿亿'),
            (2_2002_2222, '两亿两千零二万两千二百二十二'),
            (22_2222_2222, '二十二亿两千二百二十二万两千二百二十二'),
        )
        for number, spelling in expectations:
            self.assertEqual(zhnum(number), spelling)
if __name__ == "__main__":
    # Run the test suite when the file is executed as a script.
    unittest.main()
The core implementation (`_zhnum` and `zhnum`, sans comments) is ~20 lines. It doesn't handle decimal numbers, though.
EDIT: fixed the regex that produced incorrect 两's in, for example, 二十两万. As an alternative to using a regex to choose between 二 and 两, we can do this in `_zhnum` when concatenating `left_part` and the place unit. Specifically:
# Name of the place the number is being split at.
unit = zhplaces[named_place_len]
# Use 两 only when the whole left part is exactly 2 and the unit takes 两;
# otherwise spell the left part normally and append the unit.
f'两{unit}' if left_part == 2 and unit in set('千万亿') else _zhnum(left_part) + unit
the idea is to use 两 only when the whole (four-digit) chunk resolves to 2, instead of just some random number ending in 2.
thkx