Created
October 27, 2015 23:20
-
-
Save jtauber/8e9156b34f452ea4cd89 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Print a mean-log-frequency score for each target in the MorphGNT rows.

Every row yielded by ``morphgnt_rows`` contributes its ``lemma`` to a target
keyed by the first four characters of its ``bcv`` field.  For each target the
script prints three space-separated values::

    <score> <target> <number of lemma tokens>

where ``score = int(-1000 * mean(log(relative corpus frequency of lemma)))``.

The log of a relative frequency is never positive, so the score is never
negative.  Frequent lemmas have a log frequency close to zero and pull the
score DOWN; rare lemmas pull it UP.  A higher score therefore means the
target uses rarer vocabulary on average.
"""

from collections import defaultdict
from math import log

from pysblgnt import morphgnt_rows

items_by_target = defaultdict(list)  # target key -> lemmas, in reading order
count_by_item = defaultdict(int)     # lemma -> occurrences in the whole corpus
total_item_count = 0                 # total number of lemma tokens seen

# First pass: group lemmas by target and collect corpus-wide counts.
# NOTE(review): range(1, 28) presumably covers the 27 books exposed by
# pysblgnt -- confirm against that package.
for book_num in range(1, 28):
    for row in morphgnt_rows(book_num):
        # NOTE(review): presumably "bcv" is a book/chapter/verse string whose
        # first four characters identify book + chapter -- confirm.
        target = row["bcv"][:4]
        item = row["lemma"]
        items_by_target[target].append(item)
        count_by_item[item] += 1
        total_item_count += 1

# The log relative frequency depends only on the lemma, so compute it once per
# lemma instead of once per token occurrence.
log_frequency_by_item = {
    item: log(count / total_item_count)
    for item, count in count_by_item.items()
}

# Second pass: average the log frequencies of each target's lemmas.
for target in sorted(items_by_target):
    items = items_by_target[target]
    num_items = len(items)
    mean_log_frequency = 0
    for item in items:
        mean_log_frequency += log_frequency_by_item[item] / num_items
    # Negate and scale so the output is a non-negative integer.
    print(int(-1000 * mean_log_frequency), target, num_items)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thanks for sharing this!
QQ: Wouldn't line 25
mean_log_frequency += log(count_by_item[item] / total_item_count) / num_items
mean that higher-frequency items would have a lower mean_log_frequency score? Meaning that a LOWER MLF means HARDER to read, rather than a HIGHER MLF being HARDER to read?