acdha/get-unicode-blocks.py

## get-unicode-blocks.py
#!/usr/bin/env PYTHONIOENCODING=utf-8  python
# encoding: utf-8

from __future__ import absolute_import, print_function, unicode_literals

import os
import re

import requests


def get_block_for_codepoint(cp):
    """Look up the name of the Unicode block that contains a codepoint.

    ``cp`` is the codepoint as a number. The ``(first, last, name)`` entries
    of the module-level ``UNICODE_BLOCKS`` sequence are checked in order and
    the name of the first entry whose inclusive range contains ``cp`` is
    returned. ``'No_Block'`` is returned when no entry matches.
    """

    matching_names = (
        name for first, last, name in UNICODE_BLOCKS if first <= cp <= last
    )
    return next(matching_names, 'No_Block')


def load_unicode_blocks_from_file(f):
    """Parse block ranges out of an open Unicode ``Blocks.txt`` file object.

    Args:
        f: A file-like object. ``f.read()`` may return UTF-8 encoded bytes
            (binary mode) or already-decoded text (text mode).

    Returns:
        A list of ``(start, end, block_name)`` tuples in file order.
        ``start`` and ``end`` are the hexadecimal range bounds converted to
        integers and ``block_name`` is the text following ``"; "``.
        Entries named ``No_Block`` are left out.
    """
    file_contents = f.read()
    # Only binary-mode reads need decoding; a text-mode file object already
    # returns text, which has no ``decode`` method on Python 3.
    if isinstance(file_contents, bytes):
        file_contents = file_contents.decode('utf-8')

    # The pattern is not anchored to the start of a line, so a range that
    # appears inside a comment line is matched as well; the 'No_Block'
    # placeholder name is filtered out explicitly.
    return [
        (int(start, 16), int(end, 16), block_name)
        for start, end, block_name in re.findall(r'([0-9A-F]+)\.\.([0-9A-F]+);\ (\S.*\S)', file_contents)
        if block_name != 'No_Block'
    ]


def load_unicode_blocks(block_filename):
    """Load Unicode block ranges from a local copy of ``Blocks.txt``.

    When ``block_filename`` does not exist yet, the file is downloaded from
    unicode.org and saved at that path first, so later calls reuse the
    local copy instead of hitting the network again.

    Args:
        block_filename: Path of the local ``Blocks.txt`` copy to read, and
            to create when it is missing.

    Returns:
        The list produced by ``load_unicode_blocks_from_file`` for the file.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if not os.path.exists(block_filename):
        print('Unicode block file %s does not exist. Downloading…' % block_filename)
        # Fetch over HTTPS, and set a timeout so an unresponsive server
        # cannot block the caller indefinitely.
        r = requests.get('https://www.unicode.org/Public/UNIDATA/Blocks.txt', timeout=30)
        r.raise_for_status()

        # The request is not streamed, so the whole body is already in
        # memory: write it in one call instead of looping over
        # iter_content(), whose default chunk size is a single byte.
        with open(block_filename, 'wb') as f:
            f.write(r.content)

    # Binary mode: decoding is left to load_unicode_blocks_from_file.
    with open(block_filename, 'rb') as f:
        return load_unicode_blocks_from_file(f)

# Block table consulted by get_block_for_codepoint(), built once when this
# module is imported. Importing therefore reads 'UNIDATA-Blocks.txt' relative
# to the current working directory, downloading it first if it is missing.
UNICODE_BLOCKS = load_unicode_blocks('UNIDATA-Blocks.txt')

## pyicu-unicode-block-names.py
import icu

# Astoundingly, PyICU has no documented way to get a Unicode block name.
# There are two ways to get the offset into the UBlockCode enum
# – icu.Char.ublock_getCode and icu.Char.getIntPropertyValue(…, icu.UProperty.BLOCK) –
# so we'll build a lookup table to turn that into a human-readable string.
# The table maps each enum value to its attribute name, keeping only the
# all-uppercase attributes of icu.UBlockCode (which skips dunder names such
# as __doc__):

UNICODE_BLOCKS = {getattr(icu.UBlockCode, i): i for i in icu.UBlockCode.__dict__ if i.isupper()}
	#!/usr/bin/env PYTHONIOENCODING=utf-8 python
	# encoding: utf-8

	from __future__ import absolute_import, print_function, unicode_literals

	import os
	import re

	import requests


	def get_block_for_codepoint(cp):
	    """Return the Unicode block name for the provided numeric codepoint.

	    The ``(start, end, block_name)`` entries of the module-level
	    ``UNICODE_BLOCKS`` sequence are checked in order; the name of the
	    first entry whose inclusive range contains ``cp`` is returned, or
	    ``'No_Block'`` when no entry matches.
	    """

	    for start, end, block_name in UNICODE_BLOCKS:
	        if start <= cp <= end:
	            return block_name

	    return 'No_Block'


	def load_unicode_blocks_from_file(f):
	    """Parse block ranges out of an open Unicode ``Blocks.txt`` file object.

	    Args:
	        f: A file-like object. ``f.read()`` may return UTF-8 encoded
	            bytes (binary mode) or already-decoded text (text mode).

	    Returns:
	        A list of ``(start, end, block_name)`` tuples in file order.
	        ``start`` and ``end`` are the hexadecimal range bounds converted
	        to integers. Entries named ``No_Block`` are left out.
	    """
	    file_contents = f.read()
	    # Only binary-mode reads need decoding; a text-mode file object
	    # already returns text, which has no ``decode`` method on Python 3.
	    if isinstance(file_contents, bytes):
	        file_contents = file_contents.decode('utf-8')

	    blocks = []
	    for start, end, block_name in re.findall(r'([0-9A-F]+)\.\.([0-9A-F]+);\ (\S.*\S)', file_contents):
	        # The pattern is not anchored to the start of a line, so the
	        # 'No_Block' placeholder name is filtered out explicitly.
	        if block_name == 'No_Block':
	            continue

	        blocks.append((int(start, 16), int(end, 16), block_name))

	    return blocks


	def load_unicode_blocks(block_filename):
	    """Load Unicode block ranges from a local copy of ``Blocks.txt``.

	    When ``block_filename`` does not exist yet, the file is downloaded
	    from unicode.org and saved at that path first, so later calls reuse
	    the local copy.

	    Args:
	        block_filename: Path of the local ``Blocks.txt`` copy to read,
	            and to create when it is missing.

	    Returns:
	        The list produced by ``load_unicode_blocks_from_file``.

	    Raises:
	        requests.RequestException: If the download fails, times out, or
	            the server answers with an HTTP error status.
	    """
	    if not os.path.exists(block_filename):
	        print('Unicode block file %s does not exist. Downloading…' % block_filename)
	        # Fetch over HTTPS, and set a timeout so an unresponsive server
	        # cannot block the caller indefinitely.
	        r = requests.get('https://www.unicode.org/Public/UNIDATA/Blocks.txt', timeout=30)
	        r.raise_for_status()

	        # The request is not streamed, so the body is already in memory:
	        # write it in one call instead of looping over iter_content(),
	        # whose default chunk size is a single byte.
	        with open(block_filename, 'wb') as f:
	            f.write(r.content)

	    # Binary mode: decoding is left to load_unicode_blocks_from_file.
	    with open(block_filename, 'rb') as f:
	        blocks = load_unicode_blocks_from_file(f)

	    return blocks

	# Block table consulted by get_block_for_codepoint(), built once at import
	# time from 'UNIDATA-Blocks.txt' in the current working directory (the
	# file is downloaded first if it is missing).
	UNICODE_BLOCKS = load_unicode_blocks('UNIDATA-Blocks.txt')
	import icu

	# Astoundingly, PyICU has no documented way to get a Unicode block name.
	# There are two ways to get the offset into the UBlockCode enum
	# – icu.Char.ublock_getCode and icu.Char.getIntPropertyValue(…, icu.UProperty.BLOCK) –
	# so we'll build a lookup table to turn that into a human-readable string.
	# The table maps each enum value to its attribute name, keeping only the
	# all-uppercase attributes of icu.UBlockCode:

	UNICODE_BLOCKS = {getattr(icu.UBlockCode, i): i for i in icu.UBlockCode.__dict__ if i.isupper()}