Skip to content

Instantly share code, notes, and snippets.

@fhardison
Last active August 23, 2022 13:18
Show Gist options
  • Save fhardison/2e36ad765e1b4ab6dd5a06ae061cb44c to your computer and use it in GitHub Desktop.
Save fhardison/2e36ad765e1b4ab6dd5a06ae061cb44c to your computer and use it in GitHub Desktop.
Calculates some vocabulary statistics for John's Gospel compared with the Greek New Testament (GNT) as a whole
from gnt_data import ChunkType, TokenType, get_tokens, get_tokens_by_chunk
from collections import Counter
def get_stats(token_type):
    """Print vocabulary coverage statistics for John's Gospel against the GNT.

    Loads every token of the requested type with ``get_tokens`` and the
    per-book token collections with ``get_tokens_by_chunk``, then prints:

    * the number of words and distinct tokens in John and in the GNT,
    * the percentage of the GNT's distinct tokens that also occur in John,
    * the percentage of all GNT token occurrences that are tokens found
      in John.

    Args:
        token_type: Passed through to ``get_tokens`` and
            ``get_tokens_by_chunk``. When it equals ``TokenType.form`` the
            output is labelled "forms"; any other value is labelled
            "lemmas".

    Returns:
        None. All results are written to standard output.
    """
    all_gnt = get_tokens(token_type)
    books = get_tokens_by_chunk(token_type, ChunkType.book)
    # Retained from the original script: reports how many book chunks loaded.
    print(len(books))

    # NOTE(review): '64' is the key used to look up John's tokens; presumably
    # a book number defined by gnt_data -- confirm against that module.
    john_key = '64'
    john_words = books[john_key]

    unique_gnt = set(all_gnt)
    unique_john = set(john_words)
    john_total = len(unique_john)
    gnt_total = len(unique_gnt)

    # Number of token occurrences across the whole GNT whose token also
    # appears somewhere in John (set membership keeps each test O(1)).
    john_occurrences_in_gnt = sum(1 for tok in all_gnt if tok in unique_john)

    token = 'forms' if token_type == TokenType.form else 'lemmas'

    print("# John vs. GNT")
    print(f"JOHN: {len(john_words)} words, {john_total} unique {token}")
    print(f"GNT: {len(all_gnt)} words, {gnt_total} unique {token}")
    print(f"By learning to read John, you learn to read {(john_total / gnt_total) * 100}% of GNT's {token}")
    # Use the computed label here too; the original always said "lemmas",
    # which mislabelled the report when counting forms.
    print(f"These John {token} account for {(john_occurrences_in_gnt / len(all_gnt)) * 100}% of {token} encountered reading the GNT text.")
# Report lemma statistics first, then form statistics, with a blank line
# printed between the two reports.
for position, kind in enumerate((TokenType.lemma, TokenType.form)):
    if position:
        print('')
    get_stats(kind)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment