Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Calculates the mean and median distance between lemma occurrences in the Greek New Testament
from gnt_data import get_tokens, TokenType
from collections import defaultdict, namedtuple
from tabulate import tabulate
from statistics import mean, median
# Fetch the lemma tokens; they are walked below in the order get_tokens
# yields them.
gnt_lemmas = get_tokens(TokenType.lemma)

# Index: lemma -> list of positions at which that lemma occurs.
# enumerate() provides the running position, so every list ends up in
# increasing order.
data = defaultdict(list)
for position, token in enumerate(gnt_lemmas):
    data[token].append(position)
# For each lemma, measure the gap between every pair of consecutive
# occurrences, then record the mean gap (rounded to one decimal place),
# the median gap, and the lemma's total number of occurrences.
Stats = namedtuple("Stats", "lemma mean median total")
stats = []
for lemma, positions in data.items():
    # Pair each position with the one that follows it; a lemma that
    # occurs only once produces no pairs and therefore no gaps.
    gaps = [later - earlier
            for earlier, later in zip(positions, positions[1:])]
    if not gaps:
        # Nothing to average for single-occurrence lemmas; skip them.
        continue
    stats.append(
        Stats(
            lemma=lemma,
            mean=round(mean(gaps), 1),
            median=median(gaps),
            total=len(positions),
        )
    )
# Rank the lemmas by mean distance between occurrences, smallest first.
sorted_stats = sorted(stats, key=lambda entry: entry.mean)

# Drop lemmas that occur fewer than 25 times.
filtered_stats = [entry for entry in sorted_stats if entry.total >= 25]

# Present results: print the slice of the ranking from index START
# (inclusive) up to index END (exclusive) as a table.
START = 25
END = 50
print(
    tabulate(
        filtered_stats[START:END],
        headers=('lemma', 'mean', 'median', 'total'),
    )
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment