Created Sep 21, 2021
Calculates the mean and median distance between lemma occurrences in the Greek New Testament
 from gnt_data import get_tokens, TokenType from collections import defaultdict, namedtuple from tabulate import tabulate from statistics import mean, median gnt_lemmas = get_tokens(TokenType.lemma) data = defaultdict(list) # loop in order, thus _i_ is the possition of # the word in the data for i, lemma in enumerate(gnt_lemmas): data[lemma].append(i) # now for each word calculate the distance # between occurances and divide by the # total number of occurances stats = [] Stats = namedtuple("Stats", "lemma mean median total") for lemma, occurances in data.items(): total_occurances = len(occurances) last_occurance = occurances[0] distances = [] for i in occurances[1:]: distances.append(i - last_occurance) last_occurance = i if distances: stats.append(Stats(lemma, round(mean(distances),1), median(distances), total_occurances)) # sort by the total number of occurances sorted_stats = sorted(stats, key=lambda x: x.mean, reverse=False) # filter out words occuring less than 25 times filtered_stats = [x for x in sorted_stats if x.total > 24] # Present results START = 25 END = 50 print(tabulate(filtered_stats[START:END], headers=('lemma', 'mean', 'median', 'total' )))
