Skip to content

Instantly share code, notes, and snippets.

@acdha
Created June 12, 2015 22:32
Show Gist options
  • Save acdha/49a610089c2798db6fe2 to your computer and use it in GitHub Desktop.
Save acdha/49a610089c2798db6fe2 to your computer and use it in GitHub Desktop.
Ways to get the name of a Unicode block for a character in Python
#!/usr/bin/env PYTHONIOENCODING=utf-8 python
# encoding: utf-8
from __future__ import absolute_import, print_function, unicode_literals
import os
import re
import requests
def get_block_for_codepoint(cp):
    """Look up the name of the Unicode block that contains a codepoint.

    The module-level ``UNICODE_BLOCKS`` sequence of ``(start, end, name)``
    tuples is scanned in order, and the name of the first entry whose
    inclusive range contains ``cp`` is returned.

    Args:
        cp: Numeric codepoint to look up.

    Returns:
        The matching block name, or ``'No_Block'`` when no range contains
        ``cp``.
    """
    matching_names = (
        name for first, last, name in UNICODE_BLOCKS if first <= cp <= last
    )
    # next() stops at the first hit, mirroring an early return from a loop.
    return next(matching_names, 'No_Block')
def load_unicode_blocks_from_file(f):
    """Parse the contents of a Unicode ``Blocks.txt`` file object.

    Args:
        f: A readable file-like object. ``f.read()`` may return either
            UTF-8 encoded bytes (file opened in binary mode) or text that
            has already been decoded (file opened in text mode).

    Returns:
        A list of ``(start, end, block_name)`` tuples in file order, where
        ``start`` and ``end`` are the integer values of the hexadecimal
        bounds of each ``START..END; Block Name`` entry.
    """
    file_contents = f.read()
    # Binary-mode files yield bytes that must be decoded; text-mode files
    # yield text, which has no .decode() on Python 3.
    if isinstance(file_contents, bytes):
        file_contents = file_contents.decode('utf-8')

    entries = re.findall(r'([0-9A-F]+)\.\.([0-9A-F]+);\ (\S.*\S)', file_contents)

    # Entries named No_Block are skipped: in Blocks.txt that name appears on
    # the "@missing" default line rather than on a real block definition.
    return [
        (int(start, 16), int(end, 16), block_name)
        for start, end, block_name in entries
        if block_name != 'No_Block'
    ]
def load_unicode_blocks(block_filename):
    """Load Unicode block ranges from a local copy of ``Blocks.txt``.

    If ``block_filename`` does not exist, the file is first downloaded from
    unicode.org and saved at that path, so later calls reuse the local copy.

    Args:
        block_filename: Path of the local ``Blocks.txt`` copy.

    Returns:
        Whatever ``load_unicode_blocks_from_file`` returns for that file.

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    if not os.path.exists(block_filename):
        print('Unicode block file %s does not exist. Downloading…' % block_filename)
        # A timeout keeps an unresponsive server from hanging the import.
        r = requests.get('https://www.unicode.org/Public/UNIDATA/Blocks.txt', timeout=30)
        r.raise_for_status()
        # The request is not streamed, so the body is already in memory;
        # write it in one call instead of iterating iter_content(), whose
        # default chunk size is a single byte.
        with open(block_filename, 'wb') as f:
            f.write(r.content)

    with open(block_filename, 'rb') as f:
        return load_unicode_blocks_from_file(f)
# Block table consulted by get_block_for_codepoint. It is built when the
# module is loaded; load_unicode_blocks downloads the data file to this
# relative path first if it does not exist yet.
UNICODE_BLOCKS = load_unicode_blocks('UNIDATA-Blocks.txt')
import icu
# Astoundingly, PyICU has no documented way to get a Unicode block name.
# There are two ways to get the offset into the UBlockCode enum
# – icu.Char.ublock_getCode and icu.Char.getIntPropertyValue(…, icu.UProperty.BLOCK) –
# so we'll build a lookup table to turn that into a human-readable string.
# The table maps the value of every upper-case attribute of icu.UBlockCode
# back to that attribute's name:
UNICODE_BLOCKS = {getattr(icu.UBlockCode, i): i for i in icu.UBlockCode.__dict__ if i.isupper()}
@andjc
Copy link

andjc commented Jun 28, 2024

If you are using PyICU to get a dictionary of Unicode blocks, it may be easier to just use PyICU to identify the name of the block for a given character:

def get_enum_property(char, property):
    """Return the long name of a property value for a single character.

    Args:
        char: A string of length 1.
        property: The ``icu.UProperty`` constant to query, e.g.
            ``icu.UProperty.BLOCK`` or ``icu.UProperty.SCRIPT``.

    Returns:
        The name reported by ``icu.Char.getPropertyValueName`` with
        ``LONG_PROPERTY_NAME`` for the character's value of ``property``,
        or ``None`` (after printing a message) when ``char`` is not exactly
        one character long.
    """
    if len(char) == 1:
        # Resolve the numeric property value first, then translate it into
        # its long human-readable name.
        numeric_value = icu.Char.getIntPropertyValue(char, property)
        name_choice = icu.UPropertyNameChoice.LONG_PROPERTY_NAME
        return icu.Char.getPropertyValueName(property, numeric_value, name_choice)

    print("Please specify a single character.")
    return None

# Example calls; the comment under each call shows the reported result.
get_enum_property('𞤀', icu.UProperty.BLOCK)
# 'Adlam'

get_enum_property("Ɛ", icu.UProperty.BLOCK)
# 'Latin_Extended_B'
get_enum_property("Ɛ", icu.UProperty.SCRIPT)
# 'Latin'

@andjc
Copy link

andjc commented Jun 28, 2024

Alternatively, use unicodedataplus, a drop-in replacement for unicodedata:

pip install -U unicodedataplus
import unicodedataplus as ud
# ud.block() takes a single character; the comment under each call shows
# the reported block name.
ud.block('𞤀')
# 'Adlam'

ud.block("Ɛ")
# 'Latin Extended-B'

@acdha
Copy link
Author

acdha commented Jul 11, 2024

If you are using PyICU to get a dictionary of Unicode blocks, it may be easier to just use PyICU to identify the name of the block for a given character:

Thanks for the update – amusingly, it looks like LONG_PROPERTY_NAME was added 9 years ago, right around when I wrote that little utility script, so if I'd written it only a few months later there would have been a simple answer!

https://gitlab.pyicu.org/main/pyicu/-/commit/47f2f2858aba6d6e5de21d41a809d2e46e50e0f4

My goal for this initially was to have something for a demo which didn't use anything more than stdlib Python, but I would definitely recommend one of the dedicated libraries for serious use.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment