ramalho/charindex.py

## charindex.py
"""
``char_index`` builds an inverted index mapping words to sets of Unicode
characters which contain that word in their names. For example::

    >>> index = char_index(32, 65)
    >>> sorted(index['SIGN'])
    ['#', '$', '%', '+', '<', '=', '>']
    >>> sorted(index['DIGIT'])
    ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    >>> index['DIGIT'] & index['EIGHT']
    {'8'}
"""


import sys
import re
import unicodedata
from typing import Dict, Set, Iterator, cast

# Pattern used by ``tokenize`` to pick out words.  Written as a raw string:
# ``\w`` is not a valid escape in a plain string literal and recent Python
# versions warn about it.
RE_WORD = re.compile(r'\w+')

def tokenize(text: str) -> Iterator[str]:
    """Yield, in order of appearance, each ``RE_WORD`` match in ``text``
    converted to uppercase.
    """
    yield from (found.group(0).upper() for found in RE_WORD.finditer(text))

def char_index(start: int = 32, end: int = 0) -> Dict[str, Set[str]]:
    """Build an inverted index from name words to characters.

    Every code point in ``range(start, end)`` is looked up with
    ``unicodedata.name``; code points for which no name is found are
    skipped.  The name is split with ``tokenize`` and the character is
    added to the set stored under each resulting word.

    ``end == 0`` is a sentinel meaning "up to and including
    ``sys.maxunicode``".

    Returns a dict mapping each word to the set of characters whose name
    contains that word.
    """
    stop = sys.maxunicode + 1 if end == 0 else end
    index: Dict[str, Set[str]] = {}
    for code_point in range(start, stop):
        char = chr(code_point)
        name = unicodedata.name(char, '')
        if not name:
            # No name available for this code point: nothing to index.
            continue
        for word in tokenize(name):
            if word not in index:
                index[word] = set()
            index[word].add(char)
    return index
	"""
	``char_index`` builds an inverted index mapping words to sets of Unicode
	characters which contain that word in their names. For example::

	    >>> index = char_index(32, 65)
	    >>> sorted(index['SIGN'])
	    ['#', '$', '%', '+', '<', '=', '>']
	    >>> sorted(index['DIGIT'])
	    ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
	    >>> index['DIGIT'] & index['EIGHT']
	    {'8'}
	"""


	import sys
	import re
	import unicodedata
	from typing import Dict, Set, Iterator, cast

	# Pattern used by ``tokenize`` to pick out words.  Written as a raw string:
	# ``\w`` is not a valid escape in a plain string literal and recent Python
	# versions warn about it.
	RE_WORD = re.compile(r'\w+')

	def tokenize(text: str) -> Iterator[str]:
	    """Yield, in order of appearance, each ``RE_WORD`` match in ``text``
	    converted to uppercase.
	    """
	    for match in RE_WORD.finditer(text):
	        yield match.group().upper()

	def char_index(start: int = 32, end: int = 0) -> Dict[str, Set[str]]:
	    """Build an inverted index from name words to characters.

	    Every code point in ``range(start, end)`` is looked up with
	    ``unicodedata.name``; code points for which no name is found are
	    skipped.  The name is split with ``tokenize`` and the character is
	    added to the set stored under each resulting word.

	    ``end == 0`` is a sentinel meaning "up to and including
	    ``sys.maxunicode``".

	    Returns a dict mapping each word to the set of characters whose name
	    contains that word.
	    """
	    if end == 0:
	        end = sys.maxunicode + 1
	    index: Dict[str, Set[str]] = {}
	    for char in (chr(i) for i in range(start, end)):
	        # An empty name means no name is available; skip the character.
	        if name := unicodedata.name(char, ''):
	            for word in tokenize(name):
	                index.setdefault(word, cast(Set[str], set())).add(char)
	    return index