Skip to content

Instantly share code, notes, and snippets.

@hipertracker
Last active February 1, 2020 23:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hipertracker/7d3d9ce838a337ed8ba14d779dd39e8f to your computer and use it in GitHub Desktop.
Save hipertracker/7d3d9ce838a337ed8ba14d779dd39e8f to your computer and use it in GitHub Desktop.
Words tokenizer
import glob
import re
from typing import Iterator, List, Set, Tuple
from yaml import BaseLoader, load
def extract_words(text: str, min_length: int = 2) -> Set:
    """Return the distinct, lower-cased words found in *text*.

    The text is split on runs of non-word characters (``\\W``), so
    underscores and digits stay part of a word.

    Args:
        text: The string to split into words.
        min_length: Exclusive lower bound on word length; only words with
            MORE than ``min_length`` characters are kept.

    Returns:
        A set of lower-cased words; empty when nothing qualifies.
    """
    separator = re.compile(r"[\W]+", re.U)
    found = set()
    for token in separator.split(text):
        # Splitting leaves empty strings at the edges when the text starts
        # or ends with a separator; drop those along with short tokens.
        if not token or len(token) <= min_length:
            continue
        found.add(token.lower())
    return found
def tokenize(
    src_path: str, languages: List[str] = None, bibles: List[str] = None
) -> Iterator[Tuple[str, Set]]:
    """Yield ``(key, words)`` for every verse line found under *src_path*.

    Every ``*.yml`` file below ``src_path`` (searched recursively) is read
    as metadata providing ``lang`` and ``code`` entries. The verses are read
    from the file with the same name but a ``.txt`` extension, one verse per
    line in the form ``<book> <chapter>:<verse> <text>``.

    Args:
        src_path: Root directory to search for ``*.yml`` metadata files.
        languages: If given, only metadata whose ``lang`` is in this
            collection is processed; ``None`` disables the filter.
        bibles: If given, only metadata whose ``code`` is in this
            collection is processed; ``None`` disables the filter.

    Yields:
        Tuples of a lower-cased ``lang-code-book-chapter-verse`` key and the
        set of words ``extract_words`` returns for the verse text.

    Raises:
        KeyError: If a metadata file lacks ``lang`` or ``code``.
        OSError: If a metadata file or its ``.txt`` companion cannot be opened.
    """
    # Loop-invariant, so compile once instead of once per metadata file.
    regex_verse = re.compile(r"^(^\S+) ([^:]+):(\S+)\s?(.*)$")
    for path in glob.glob(f"{src_path}/**/*.yml", recursive=True):
        # Context manager so the handle is closed instead of leaked.
        with open(path) as meta_file:
            meta = load(meta_file, Loader=BaseLoader)
        lang, bible = meta["lang"], meta["code"]
        if languages is not None and lang not in languages:
            continue
        if bibles is not None and bible not in bibles:
            continue
        # Swap only the trailing extension: str.replace would also rewrite
        # any ".yml" that appears earlier in the path (e.g. a directory name).
        text_path = path[: -len(".yml")] + ".txt"
        with open(text_path) as verses:
            for line in verses:
                match = regex_verse.match(line.strip())
                if match is None:
                    print(f"INVALID LINE: {line}")
                    # Skip the line: falling through would reuse the previous
                    # verse's fields, or fail outright on a first bad line.
                    continue
                book, chapter, verse, text = match.groups()
                key = f"{lang}-{bible}-{book}-{chapter}-{verse}".lower()
                yield key, extract_words(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment