Skip to content

Instantly share code, notes, and snippets.

@fhardison
fhardison / get_lemma_senses.py
Created October 12, 2023 18:14
A cli tool to get sense data for a lemma and then find verses in which it occurs using the macula-greek repo
from collections import defaultdict
import sys
import csv
# Source TSV from the macula-greek repo (SBLGNT morphological/semantic analysis).
filename = 'macula-greek-SBLGNT.tsv'
# lemma -> set of semantic-domain codes; presumably filled while reading the TSV — confirm against full script.
lemmas_domain = defaultdict(set)
# lemma -> set of sense labels for that lemma; presumably filled alongside lemmas_domain — confirm against full script.
lemmas_senses = defaultdict(set)
@fhardison
fhardison / gr_tree.py
Created October 12, 2023 13:13
Simple script to display lowfat syntax trees from macula-greek on the command line
import xml.etree.ElementTree as ET
from difflib import get_close_matches
from pathlib import Path
import sys
# Root directory of the lowfat syntax-tree XML files in a local macula-greek checkout.
DIR = Path('macula-greek/SBLGNT/lowfat/')
# Book code -> lowfat XML filename (mapping continues beyond this excerpt).
FILES = {
'MAT': '01-matthew.xml',
'MRK': '02-mark.xml',
from pathlib import Path
from greek_normalisation.utils import nfc
from greek_accentuation.characters import add_breathing, Breathing,Accent, add_diacritic
import re
import sys
# Single vowels to search for.
# NOTE(review): the final "i" is a Latin letter among Greek vowels — possibly intended Greek ι; confirm.
FIND = "α ε η υ ο ω i".split()
# Greek diphthongs to search for.
DIPTH = "ει αι οι ου αυ ευ ηυ".split()
@fhardison
fhardison / john_stats.py
Last active August 23, 2022 13:18
Calculates some vocabulary statistics for John's Gospel vs the rest of the GNT
from gnt_data import ChunkType, TokenType, get_tokens, get_tokens_by_chunk
from collections import Counter
def get_stats(token_type):
ALL_GNT = get_tokens(token_type)
BOOKS = get_tokens_by_chunk(token_type, ChunkType.book)
@fhardison
fhardison / vocab_distance_gnt.py
Created September 21, 2021 14:35
Calculates the mean and median distance between lemma occurrences in the Greek New Testament
from gnt_data import get_tokens, TokenType
from collections import defaultdict, namedtuple
from tabulate import tabulate
from statistics import mean, median
# All lemma tokens of the Greek New Testament, in text order (vocabulary-tools gnt_data).
gnt_lemmas = get_tokens(TokenType.lemma)
# lemma -> list of occurrence data (presumably token positions, filled later) — confirm against full script.
data = defaultdict(list)
@fhardison
fhardison / paradigm_tools.py
Created September 20, 2021 14:04
Simple tool for generating a paradigm from the tokens file format used in vocabulary-tools gnt_data (by James Tauber: https://github.com/jtauber/vocabulary-tools)
from greek_normalisation.utils import nfc,grave_to_acute
def layout_paradigm(forms, row_identifiers, column_idenfiers):
#""" forms: [(form, parse)]
#... idenfiers = [lambda x: -> Boolean]
#"""
output = []
for rfn in row_identifiers:
row = []
for cfn in column_idenfiers:

This is a literate doctest. Run python -m doctest -v examples.rst to test.

>>> from vocab_utils import group_tokens_by
>>> from gnt_data import ChunkType, TokenType

Get sentences grouped by pericopes and show data for first 5.

@fhardison
fhardison / af_reading_order_forms.txt
Created April 22, 2020 02:45
A reading order for the Apostolic Fathers based on the next_best algorithm from James Tauber's vocabulary tools repo. This order is based on the forms of the words, not lemmas.
# filename-section OR filename-chapter.section (for longer works, i.e. shepherd)
013-shepherd.txt-22.0
013-shepherd.txt-19.0
013-shepherd.txt-20.0
013-shepherd.txt-21.0
013-shepherd.txt-24.0
013-shepherd.txt-12.0
013-shepherd.txt-4.0
013-shepherd.txt-9.0
013-shepherd.txt-3.0
@fhardison
fhardison / build_reading_path_gnt_lxx.py
Created March 19, 2020 11:43
Python file that uses JTauber's next_best algorithm from his vocabulary-tools repo and py-sblgnt and the lemmatisation from https://github.com/openscriptures/GreekResources to create a reading order for the Greek Bible. The order is below. Note that the lines in the order that start with numbers are NT books and the number is their place in the…
from greek_normalisation.utils import nfc, strip_last_accent_if_two, grave_to_acute
import re
from ordering import next_best
import sys
import glob
import json
from pysblgnt import morphgnt_rows
from collections import defaultdict
@fhardison
fhardison / build_static.py
Last active March 17, 2020 02:46
An extensible, functional method of generating HTML from the format used by the Greek Learners Text Project
import sys
import re
import mistletoe
def read_to_dict(fpath, encoding='UTF-8'):
lines = {}
with open(fpath, 'r', encoding=encoding) as file:
for line in file:
num, content = line.split(' ', maxsplit=1)