# hipertracker/tokenize.py

## tokenize.py
import glob
import re
from typing import Iterator, List, Set, Tuple

from yaml import BaseLoader, load


def extract_words(text: str, min_length: int = 2) -> Set:
    """Return the distinct, lowercased words found in *text*.

    The text is split on runs of non-word characters (``\\W``). A fragment
    is kept only when it is non-empty and its length is strictly greater
    than ``min_length``; with the default of 2, words of one or two
    characters are dropped.
    """
    splitter = re.compile(r"[\W]+", re.U)
    words = set()
    for token in splitter.split(text):
        # Splitting can produce empty fragments at the edges of the text.
        if not token or len(token) <= min_length:
            continue
        words.add(token.lower())
    return words


def tokenize(
    src_path: str, languages: List[str] = None, bibles: List[str] = None
) -> Iterator[Tuple[str, Set]]:
    """Yield ``(key, words)`` for each verse in the texts under *src_path*.

    Every ``*.yml`` file found recursively below ``src_path`` is read as
    metadata and must provide the keys ``lang`` and ``code``. The verses are
    read from the file with the same name but a ``.txt`` extension, one
    verse per line in the form ``<book> <chapter>:<verse> <text>``.

    Args:
        src_path: Root directory searched recursively for ``*.yml`` files.
        languages: If given, only files whose ``lang`` is in this
            collection are processed.
        bibles: If given, only files whose ``code`` is in this collection
            are processed.

    Yields:
        Tuples of a lowercased key
        ``"{lang}-{bible}-{book}-{chapter}-{verse}"`` and the set of words
        returned by ``extract_words`` for the verse text.

    Lines that do not match the expected format are reported on stdout
    with an ``INVALID LINE`` message and skipped.
    """
    # The pattern is identical for every file, so compile it once.
    regex_verse = re.compile(r"^(^\S+) ([^:]+):(\S+)\s?(.*)$")
    suffix = ".yml"
    for path in glob.glob(f"{src_path}/**/*.yml", recursive=True):
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(path, encoding="utf-8") as meta_file:
            meta = load(meta_file, Loader=BaseLoader)
        lang, bible = meta["lang"], meta["code"]
        if languages is not None and lang not in languages:
            continue
        if bibles is not None and bible not in bibles:
            continue
        # Swap only the trailing extension; str.replace would also rewrite
        # any ".yml" that happens to occur earlier in the path.
        text_path = path[: -len(suffix)] + ".txt"
        with open(text_path, encoding="utf-8") as verses:
            for line in verses:
                match = regex_verse.match(line.strip())
                if match is None:
                    print(f"INVALID LINE: {line}")
                    # Skip the line: there are no fields to build a key
                    # from, and reusing the previous verse's would be wrong.
                    continue
                book, chapter, verse, text = match.groups()
                key = f"{lang}-{bible}-{book}-{chapter}-{verse}".lower()
                yield key, extract_words(text)
	import glob
	import re
	from typing import Iterator, List, Set, Tuple

	from yaml import BaseLoader, load


	def extract_words(text: str, min_length: int = 2) -> Set:
	    """Return the set of lowercased words contained in *text*.

	    Words are the non-empty pieces obtained by splitting the text on runs
	    of non-word characters (``\\W``). Only pieces whose length is strictly
	    greater than ``min_length`` are kept.
	    """
	    regex_words = re.compile(r"[\W]+", re.U)
	    return {
	        word.lower()
	        for word in regex_words.split(text)
	        # Splitting leaves empty strings at the edges; drop them as well.
	        if word and len(word) > min_length
	    }


	def tokenize(
	    src_path: str, languages: List[str] = None, bibles: List[str] = None
	) -> Iterator[Tuple[str, Set]]:
	    """Yield ``(key, words)`` for each verse in the texts under *src_path*.

	    Every ``*.yml`` file found recursively below ``src_path`` is read as
	    metadata and must provide the keys ``lang`` and ``code``. The verses
	    are read from the file with the same name but a ``.txt`` extension,
	    one verse per line in the form ``<book> <chapter>:<verse> <text>``.

	    Args:
	        src_path: Root directory searched recursively for ``*.yml`` files.
	        languages: If given, only files whose ``lang`` is in this
	            collection are processed.
	        bibles: If given, only files whose ``code`` is in this collection
	            are processed.

	    Yields:
	        Tuples of a lowercased key
	        ``"{lang}-{bible}-{book}-{chapter}-{verse}"`` and the set of
	        words returned by ``extract_words`` for the verse text.

	    Lines that do not match the expected format are reported on stdout
	    with an ``INVALID LINE`` message and skipped.
	    """
	    # The pattern is identical for every file, so compile it once.
	    regex_verse = re.compile(r"^(^\S+) ([^:]+):(\S+)\s?(.*)$")
	    suffix = ".yml"
	    # "**" together with recursive=True matches nested directories.
	    for path in glob.glob(f"{src_path}/**/*.yml", recursive=True):
	        # Explicit UTF-8 so decoding does not depend on the platform locale.
	        with open(path, encoding="utf-8") as meta_file:
	            meta = load(meta_file, Loader=BaseLoader)
	        lang, bible = meta["lang"], meta["code"]
	        if languages is not None and lang not in languages:
	            continue
	        if bibles is not None and bible not in bibles:
	            continue
	        # Swap only the trailing extension; str.replace would also rewrite
	        # any ".yml" that happens to occur earlier in the path.
	        text_path = path[: -len(suffix)] + ".txt"
	        with open(text_path, encoding="utf-8") as verses:
	            for line in verses:
	                match = regex_verse.match(line.strip())
	                if match is None:
	                    print(f"INVALID LINE: {line}")
	                    # Skip the line: there are no fields to build a key
	                    # from, and reusing the previous verse's would be wrong.
	                    continue
	                book, chapter, verse, text = match.groups()
	                key = f"{lang}-{bible}-{book}-{chapter}-{verse}".lower()
	                yield key, extract_words(text)