Skip to content
{{ message }}

Instantly share code, notes, and snippets.

# fhardison/vocab_distance_gnt.py

Created Sep 21, 2021
Calculates the mean and median distance between lemma occurrences in the Greek New Testament
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 # Report the mean and median distance between successive occurrences of each lemma.
from collections import defaultdict, namedtuple
from statistics import mean, median

from tabulate import tabulate

from gnt_data import get_tokens, TokenType

# Lemmas occurring fewer times than this are left out of the report.
MIN_OCCURRENCES = 25

# Slice of the filtered, sorted table that gets printed.
START = 25
END = 50

# One table row per lemma.
Stats = namedtuple("Stats", "lemma mean median total")

gnt_lemmas = get_tokens(TokenType.lemma)

# Map every lemma to the list of positions at which it occurs. The lemmas are
# walked in the order get_tokens yields them, so the enumerate index serves as
# the word's position in the data and each position list is built in that order.
data = defaultdict(list)
for i, lemma in enumerate(gnt_lemmas):
    data[lemma].append(i)

# For each lemma, measure the gap between each occurrence and the next one,
# then summarise those gaps with their mean (rounded to one decimal) and median.
# Note the mean is taken over the gaps, i.e. over (occurrences - 1) values.
stats = []
for lemma, occurrences in data.items():
    distances = [
        following - current
        for current, following in zip(occurrences, occurrences[1:])
    ]
    # A lemma that occurs only once has no gaps and therefore no statistics.
    if distances:
        stats.append(
            Stats(
                lemma,
                round(mean(distances), 1),
                median(distances),
                len(occurrences),
            )
        )

# Sort ascending by mean distance: lemmas that recur most densely come first.
sorted_stats = sorted(stats, key=lambda row: row.mean)

# Filter out lemmas occurring fewer than MIN_OCCURRENCES times.
filtered_stats = [row for row in sorted_stats if row.total >= MIN_OCCURRENCES]

# Present results; the headers are the Stats field names, so they cannot drift
# out of sync with the row layout.
print(tabulate(filtered_stats[START:END], headers=Stats._fields))
to join this conversation on GitHub. Already have an account? Sign in to comment