Last active
August 23, 2022 13:18
-
-
Save fhardison/2e36ad765e1b4ab6dd5a06ae061cb44c to your computer and use it in GitHub Desktop.
Calculates some vocabulary statistics for John's Gospel vs. the rest of the Greek New Testament (GNT).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gnt_data import ChunkType, TokenType, get_tokens, get_tokens_by_chunk | |
from collections import Counter | |
def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole`` (0.0 when ``whole`` is 0)."""
    if not whole:
        return 0.0
    return (part / whole) * 100


def get_stats(token_type):
    """Print vocabulary statistics for John's Gospel against the whole GNT.

    Fetches every token of the requested type with ``get_tokens`` and the
    same tokens grouped by book with ``get_tokens_by_chunk``, then prints:

    * the number of book chunks returned,
    * total and unique token counts for John and for the GNT,
    * the share of the GNT's unique vocabulary that also occurs in John,
    * the share of all GNT token occurrences covered by John's vocabulary.

    Args:
        token_type: A ``TokenType`` member. ``TokenType.form`` labels the
            output as "forms"; any other value is labeled "lemmas".

    Returns:
        None. All results are written to standard output.
    """
    all_gnt = get_tokens(token_type)
    books = get_tokens_by_chunk(token_type, ChunkType.book)
    print(len(books))

    # NOTE(review): '64' is presumably the key gnt_data uses for John's
    # Gospel -- confirm against the gnt_data book numbering.
    john_key = '64'
    john_words = books[john_key]

    unique_gnt = set(all_gnt)
    unique_john = set(john_words)
    john_total = len(unique_john)
    gnt_total = len(unique_gnt)

    # Number of token occurrences across the whole GNT whose token also
    # appears somewhere in John.
    covered_occurrences = sum(1 for tok in all_gnt if tok in unique_john)

    token = 'forms' if token_type == TokenType.form else 'lemmas'

    print("# John vs. GNT")
    print(f"JOHN: {len(john_words)} words, {john_total} unique {token}")
    print(f"GNT: {len(all_gnt)} words, {gnt_total} unique {token}")
    print(f"By learning to read John, you learn to read {_percentage(john_total, gnt_total)}% of GNT's {token}")
    # The label follows the token type; it used to always say "lemmas",
    # which mislabeled the forms report.
    print(f"These John {token} account for {_percentage(covered_occurrences, len(all_gnt))}% of {token} encountered reading the GNT text.")
# Report lemma statistics first, then surface-form statistics, with an
# empty line printed between the two reports.
for position, kind in enumerate((TokenType.lemma, TokenType.form)):
    if position:
        print('')
    get_stats(kind)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment