Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#!/usr/bin/env python3
import codecs
from collections import defaultdict
from math import log
import os
def collect_items(directory='out/'):
    """Read every file in *directory* and tally the items found per target.

    Files are visited in sorted name order and decoded as UTF-8.  Each
    non-blank line is split on whitespace; the first five characters of
    column 0 form the "target" key and column 3 is the "item".

    Returns a tuple ``(items_by_target, count_by_item, total_item_count)``:

    * ``items_by_target`` maps each target to the list of its items, in
      the order they were read (duplicates kept).
    * ``count_by_item`` maps each item to its number of occurrences
      across all files.
    * ``total_item_count`` is the number of items read overall.

    Raises ``IndexError`` if a non-blank line has fewer than four columns.
    """
    items_by_target = defaultdict(list)
    count_by_item = defaultdict(int)
    total_item_count = 0
    for name in sorted(os.listdir(directory)):
        # The context manager closes the file even if reading/decoding fails.
        with codecs.open(os.path.join(directory, name), encoding="utf-8") as f:
            lines = f.readlines()
        for line in lines:
            cols = line.split()
            if not cols:
                # Blank / whitespace-only line: nothing to index.
                continue
            target = cols[0][:5]
            # NOTE(review): the original also built an unused "lemma" string
            # (column 3 plus column 4 when present) but keyed on column 3
            # alone; that keying is preserved here. Confirm it is intended.
            item = cols[3]
            items_by_target[target].append(item)
            count_by_item[item] += 1
            total_item_count += 1
    return items_by_target, count_by_item, total_item_count


def score_targets(items_by_target, count_by_item, total_item_count):
    """Yield ``(score, target, num_items)`` for each target in sorted order.

    ``score`` is ``int(-1000 * m)`` where ``m`` is the mean, over the
    target's items, of ``log(count_by_item[item] / total_item_count)``
    (natural log).  ``int`` truncates toward zero.
    """
    for target in sorted(items_by_target):
        items = items_by_target[target]
        num_items = len(items)
        mean_log_frequency = 0
        # Accumulate term by term (rather than sum-then-divide) so the
        # floating-point result, and hence the truncated score, matches
        # the original computation exactly.
        for item in items:
            mean_log_frequency += (
                log(count_by_item[item] / total_item_count) / num_items
            )
        yield int(-1000 * mean_log_frequency), target, num_items


def main():
    """Print one ``score target num_items`` line per target found in out/."""
    for score, target, num_items in score_targets(*collect_items('out/')):
        print(score, target, num_items)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.