This is a literate doctest. Run
python -m doctest -v examples.rst
to test.

>>> from vocab_utils import group_tokens_by
>>> from gnt_data import ChunkType, TokenType
from collections import defaultdict | |
import sys | |
import csv | |
filename = 'macula-greek-SBLGNT.tsv' | |
lemmas_domain = defaultdict(set) | |
lemmas_senses = defaultdict(set) |
import xml.etree.ElementTree as ET | |
from difflib import get_close_matches | |
from pathlib import Path | |
import sys | |
DIR = Path('macula-greek/SBLGNT/lowfat/') | |
FILES = { | |
'MAT': '01-matthew.xml', | |
'MRK': '02-mark.xml', |
from pathlib import Path | |
from greek_normalisation.utils import nfc | |
from greek_accentuation.characters import add_breathing, Breathing,Accent, add_diacritic | |
import re | |
import sys | |
FIND = [x for x in "α ε η υ ο ω i".split(' ') if x] | |
DIPTH = [x for x in "ει αι οι ου αυ ευ ηυ".split(' ') if x] |
from gnt_data import ChunkType, TokenType, get_tokens, get_tokens_by_chunk | |
from collections import Counter | |
def get_stats(token_type): | |
ALL_GNT = get_tokens(token_type) | |
BOOKS = get_tokens_by_chunk(token_type, ChunkType.book) |
from gnt_data import get_tokens, TokenType | |
from collections import defaultdict, namedtuple | |
from tabulate import tabulate | |
from statistics import mean, median | |
gnt_lemmas = get_tokens(TokenType.lemma) | |
data = defaultdict(list) |
from greek_normalisation.utils import nfc,grave_to_acute | |
def layout_paradigm(forms, row_identifiers, column_idenfiers): | |
#""" forms: [(form, parse)] | |
#... idenfiers = [lambda x: -> Boolean] | |
#""" | |
output = [] | |
for rfn in row_identifiers: | |
row = [] | |
for cfn in column_idenfiers: |
# filename-section OR filename-chapter.section (for longer works, i.e. shepherd) | |
013-shepherd.txt-22.0 | |
013-shepherd.txt-19.0 | |
013-shepherd.txt-20.0 | |
013-shepherd.txt-21.0 | |
013-shepherd.txt-24.0 | |
013-shepherd.txt-12.0 | |
013-shepherd.txt-4.0 | |
013-shepherd.txt-9.0 | |
013-shepherd.txt-3.0 |
from greek_normalisation.utils import nfc, strip_last_accent_if_two, grave_to_acute | |
import re | |
from ordering import next_best | |
import sys | |
import glob | |
import json | |
from pysblgnt import morphgnt_rows | |
from collections import defaultdict | |
import sys | |
import re | |
import mistletoe | |
def read_to_dict(fpath, encoding='UTF-8'): | |
lines = {} | |
with open(fpath, 'r', encoding=encoding) as file: | |
for line in file: | |
num, content = line.split(' ', maxsplit=1) |