Skip to content

Instantly share code, notes, and snippets.

Created February 8, 2015 02:46
Show Gist options
  • Save gumblex/0d65cad2ba607fd14de7 to your computer and use it in GitHub Desktop.
Save gumblex/0d65cad2ba607fd14de7 to your computer and use it in GitHub Desktop.
Numbers to Chinese representations converter in Python. 中文数字转换
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed under WTFPL or the Unlicense or CC0.
# This uses Python 3, but it's easy to port to Python 2 by changing
# strings to u'xx'.
import itertools
def num2chinese(num, big=False, simp=True, o=False, twoalt=False):
    """
    Converts numbers to Chinese representations.

    `num`   : an int, a float, or a numeric string (optionally signed,
              optionally with a decimal point).
    `big`   : use financial characters.
    `simp`  : use simplified characters instead of traditional characters.
    `o`     : use 〇 for zero.
    `twoalt`: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns the Chinese representation as a string.

    Raises ValueError if abs(num) >= 1e48 or if the textual form of `num`
    uses scientific notation.
    """
    # check num first
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError('number out of range')
    elif 'e' in nd.lower():
        raise ValueError('scientific notation is not supported')
    # positive sign, negative sign, decimal point
    c_symbol = '正负点' if simp else '正負點'
    if o:  # formal
        twoalt = False
    if big:
        c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        c_unit1 = '拾佰仟'
        c_twoalt = '贰' if simp else '貳'
    else:
        c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        c_unit1 = '十百千'
        if twoalt:
            c_twoalt = '两' if simp else '兩'
        else:
            c_twoalt = '二'
    # one unit per 4-digit group above the lowest: 10^4, 10^8, ..., 10^44
    c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
    zero = c_basic[0]

    def revuniq(parts):
        # Pieces are collected least-significant first; reverse them and
        # merge runs of identical adjacent pieces (i.e. repeated zeros).
        return ''.join(k for k, _ in itertools.groupby(reversed(parts)))

    result = []
    if nd[0] == '+':
        result.append(c_symbol[0])
    elif nd[0] == '-':
        result.append(c_symbol[1])
    if '.' in nd:
        integer, remainder = nd.lstrip('+-').split('.')
    else:
        integer, remainder = nd.lstrip('+-'), None
    if int(integer):
        # 4-digit groups, least significant group first
        splitted = [integer[max(i - 4, 0):i]
                    for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(zero)
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                intresult.append(c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == '0':
                    if ulist:  # ???0
                        # only emit a zero once a lower digit exists, so
                        # trailing zeros of the group are never written
                        ulist.append(zero)
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == '1' and unit[1] == '0':
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == '2':
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        joined = revuniq(intresult)
        # A zero group followed by a group that itself starts with a zero
        # (e.g. 1|0000|0001) would otherwise yield two zeros in a row.
        while zero * 2 in joined:
            joined = joined.replace(zero * 2, zero)
        result.append(joined.strip(zero))
    else:
        result.append(zero)
    if remainder:
        result.append(c_symbol[2])
        # fractional digits are read out one by one
        result.append(''.join(c_basic[int(ch)] for ch in remainder))
    return ''.join(result)
Copy link

Thanks, it works. I also wrote a simple reverse function.

from unicodedata import numeric
def chinese2num(s):
    """Convert a Chinese numeral string into a number.

    Each character is looked up with ``unicodedata.numeric``.  Characters
    worth less than 10 are treated as digits; anything else is a unit
    (十, 百, 千, 万, ...) that scales the pending digit or, when the unit is
    larger than everything accumulated so far, the whole running amount.

    A trailing digit that directly follows a unit is read colloquially as
    one tenth of that unit, e.g. "一万五" -> 15000.

    Returns the value as produced by the arithmetic on ``numeric`` results
    (a float for any non-empty input); returns 0 for an empty string.
    """
    amount = 0
    digit = 0  # pending digit waiting for its unit; 0 when none is pending
    for ch in s:
        number = numeric(ch)
        if number < 10:
            digit = number
        else:
            if number == 10 and digit == 0:
                # "十二", "一万零十": a bare 十 carries an implicit 一
                digit = 1
            amount = (amount + digit) * number if number > amount else amount + digit * number
            digit = 0
    if len(s) > 1 and numeric(s[-2]) != 0:
        # colloquial shorthand: the last digit counts as tenths of the
        # preceding unit ("一百二" -> 120); after 十 this is the digit itself
        return amount + digit * numeric(s[-2]) / 10
    return amount + digit

Hi, maybe you need to consider "十二", and I'm not sure we need this check: "len(s) > 1 and numeric(s[-2]) != 0", so my suggestion is:

def chinese2num(s):
    """Convert a Chinese numeral string into an int.

    Each character is looked up with ``unicodedata.numeric``.  Characters
    worth less than 10 are digits; anything else is a unit that scales the
    pending digit or, when the unit is larger than everything accumulated so
    far, the whole running amount.  A leading "十" is read as "一十".

    Returns 0 for an empty string.
    """
    amount = 0
    digit = 0  # pending digit waiting for its unit
    if s.startswith("十"):
        # "十二" is shorthand for "一十二"
        s = "一" + s
    for ch in s:
        number = numeric(ch)
        if number < 10:
            digit = number
        else:
            amount = (amount + digit) * number if number > amount else amount + digit * number
            digit = 0
    return int(amount + digit)

very nice!

Copy link

good one

Copy link

it outputs:

Copy link

yjyao commented Feb 13, 2022

i wrote an arabic-to-chinese number converter just now for fun. upon finishing I was curious to see how others are doing this. and that led me to this gist. thought i'd share my attempt here as well:

import bisect
import re
import unittest

# Digit characters indexed by their value 0-9.
zhdigits = '零一二三四五六七八九'
# Name of each place, keyed by the number of trailing digits after it.
zhplaces = {
  0: ''  ,
  1: '十',
  2: '百',
  3: '千',
  4: '万',
  8: '亿',
}
# Place lengths in ascending order, suitable for bisect lookups.
zhplace_keys = sorted(zhplaces)

def numdigits(n):
  """Return how many decimal digits the integer `n` has, ignoring its sign."""
  magnitude = abs(n)
  return len(f'{magnitude}')

def _zhnum(n):
  """Return the literal Chinese reading of `n`, without grammar exceptions.

  `n` is used directly as an index into `zhdigits` when it is below 10, so
  it is expected to be a non-negative integer. Larger values are split at
  the largest named place and each side is converted recursively.
  """
  if n < 10:
    return zhdigits[n]
  # Largest place length (number of trailing digits) with a name
  # that fits `n`.
  # Examples:
  #     n       | LPL | name
  #     ---     | --- | ---
  #     100     | 2   | 百
  #     10_0000 | 4   | 万
  named_place_len = zhplace_keys[bisect.bisect_right(zhplace_keys,
                                                     numdigits(n)-1) - 1]
  # Break `n` on the `named_place_len`. The final answer (without handling
  # grammar exceptions like '十' instead of '一十零') will generally be
  #     zhnum(left_part) + unit name at `named_place_len` + zhnum(right_part)
  # additionally, if the `right_part` has leading zeros, a complementary '零'
  # should lead it.
  # Examples:
  #                208|0300
  #             /           \
  #           /              \
  #        2|08      万  零 3|00
  #      /      \          /   \
  #    二 百 零 八       三 百 empty
  left_part, right_part = n // 10**named_place_len, n % 10**named_place_len
  return (_zhnum(left_part) +
          zhplaces[named_place_len] +
          ((zhdigits[0] if numdigits(right_part) != named_place_len else '') +
           _zhnum(right_part)
           if right_part else ''))

def zhnum(n):
  """Return the Chinese reading of the integer `n`.

  The magnitude is converted by `_zhnum`, then two spelling adjustments are
  applied: a leading '一十' becomes '十', and '二' directly before 千/万/亿
  becomes '两' unless it follows '零' or '十'. Negative numbers are
  prefixed with '负'.
  """
  magnitude = _zhnum(abs(n))
  # Rewrite before the sign is attached; with '负' in front the '^' anchor
  # would no longer match and -10 would come out as '负一十'.
  magnitude = re.sub(r'^一十', '十', magnitude)
  magnitude = re.sub(r'(?<![零十])二(?=[千万亿])', r'两', magnitude)
  return ('负' if n < 0 else '') + magnitude

class TestZhnum(unittest.TestCase):

  '''Test Chinese number formatter.'''

  def test(self):
    # (input, expected reading) pairs, checked in order; the first mismatch
    # fails the test.
    expectations = [
      (-1,                    '负一'),
      (0,                     '零'),
      (-0,                    '零'),
      (6,                     '六'),
      (10,                    '十'),
      (14,                    '十四'),
      (28,                    '二十八'),
      (59,                    '五十九'),
      (100,                   '一百'),
      (101,                   '一百零一'),
      (110,                   '一百一十'),
      (132,                   '一百三十二'),
      (1000,                  '一千'),
      (2001,                  '两千零一'),
      (3010,                  '三千零一十'),
      (4012,                  '四千零一十二'),
      (5230,                  '五千二百三十'),
      (6234,                  '六千二百三十四'),
      (9999,                  '九千九百九十九'),
      (1_0000,                '一万'),
      (20_0000,               '二十万'),
      (123_4567,              '一百二十三万四千五百六十七'),
      (500_1024,              '五百万一千零二十四'),
      (360_5000,              '三百六十万五千'),
      (3_0000_0000,           '三亿'),
      (2_7600_2010,           '两亿七千六百万两千零一十'),
      (2_0000_0000_0000_0000, '两亿亿'),
      (2_2002_2222,           '两亿两千零二万两千二百二十二'),
      (22_2222_2222,          '二十二亿两千二百二十二万两千二百二十二'),
    ]
    for number, reading in expectations:
      self.assertEqual(zhnum(number), reading)

if __name__ == '__main__':
  # Run the unittest cases when the file is executed as a script.
  unittest.main()

core implementation (_zhnum and zhnum sans comments) ~20 lines. doesn't handle decimal numbers though.

EDIT: fixed regex for incorrect 两's in for example 二十两万. alternative to using a regex to handle 二 and 两, we can do this in _zhnum when concatenating left_part and the place unit. specifically:

unit = zhplaces[named_place_len]
f'两{unit}' if left_part == 2 and unit in set('千万亿') else (_zhnum(left_part) + unit)

the idea is to use 两 only when the whole (four-digit) chunk resolves to 2, instead of just some random number ending in 2.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment