Last active
February 1, 2020 23:22
-
-
Save hipertracker/7d3d9ce838a337ed8ba14d779dd39e8f to your computer and use it in GitHub Desktop.
Words tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import re
from typing import Iterator, List, Optional, Set, Tuple

from yaml import BaseLoader, load
def extract_words(text: str, min_length: int = 2) -> Set:
    """Return the distinct lower-cased words found in *text*.

    The text is split on runs of non-word characters. A piece is kept only
    when it is non-empty and strictly longer than *min_length* characters,
    so with the default of 2 the shortest word returned has 3 characters.
    """
    splitter = re.compile(r"[\W]+", re.U)
    words = set()
    for token in splitter.split(text):
        # Splitting produces empty strings at the edges of the text; those
        # are always discarded, whatever min_length is.
        if not token or len(token) <= min_length:
            continue
        words.add(token.lower())
    return words
def tokenize(
    src_path: str,
    languages: Optional[List[str]] = None,
    bibles: Optional[List[str]] = None,
) -> Iterator[Tuple[str, Set]]:
    """Yield ``(key, words)`` for each verse line found under *src_path*.

    Every ``*.yml`` file found recursively under *src_path* is loaded as
    metadata and must provide the keys ``lang`` and ``code``. The verse lines
    are read from the file with the same path but a ``.txt`` suffix.

    Args:
        src_path: Directory searched recursively for ``*.yml`` files.
        languages: When given, only files whose ``lang`` is in this list are
            processed.
        bibles: When given, only files whose ``code`` is in this list are
            processed.

    Yields:
        A tuple of the lower-cased key ``lang-code-book-chapter-verse`` and
        the set returned by ``extract_words`` for the verse text.

    Lines that do not match the verse pattern are reported on stdout and
    skipped.
    """
    # Compiled once: the pattern is the same for every file and line.
    regex_verse = re.compile(r"^(^\S+) ([^:]+):(\S+)\s?(.*)$")
    for path in glob.glob(f"{src_path}/**/*.yml", recursive=True):
        with open(path) as meta_file:
            meta = load(meta_file, Loader=BaseLoader)
        lang, bible = meta["lang"], meta["code"]
        if languages is not None and lang not in languages:
            continue
        if bibles is not None and bible not in bibles:
            continue
        # Swap only the trailing suffix; str.replace would also rewrite any
        # ".yml" occurring earlier in the path (e.g. in a directory name).
        text_path = path[: -len(".yml")] + ".txt"
        with open(text_path) as verses:
            for line in verses:
                match = regex_verse.match(line.strip())
                if match is None:
                    print(f"INVALID LINE: {line}")
                    # Skip the line: there are no groups to unpack, and
                    # falling through would reuse the previous verse's values.
                    continue
                book, chapter, verse, text = match.groups()
                key = f"{lang}-{bible}-{book}-{chapter}-{verse}".lower()
                yield key, extract_words(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment