Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
#!/usr/bin/env python3
import codecs
from collections import defaultdict
from math import log
import os
def _collect_items(directory):
    """Tally the items found in every file of *directory*.

    Each non-blank line is split on whitespace.  The first five characters
    of the first column are used as the "target" key and the fourth column
    is the "item".  Files are visited in sorted name order.

    Returns a ``(items_by_target, count_by_item)`` pair: a mapping from
    target to the list of its items (in reading order, duplicates kept) and
    a mapping from item to its number of occurrences across all files.

    Raises ``IndexError`` if a non-blank line has fewer than four columns.
    """
    items_by_target = defaultdict(list)
    count_by_item = defaultdict(int)
    for name in sorted(os.listdir(directory)):
        # The context manager closes the file even if decoding fails.
        with codecs.open(os.path.join(directory, name), encoding="utf-8") as f:
            lines = f.readlines()
        for line in lines:
            cols = line.split()
            if not cols:
                # Blank line: nothing to count.
                continue
            target = cols[0][:5]
            # NOTE(review): an earlier revision also assembled a "lemma" from
            # columns 4 and 5 but never used it; items have always been keyed
            # on column 4 alone -- confirm that this is the intended key.
            item = cols[3]
            items_by_target[target].append(item)
            count_by_item[item] += 1
    return items_by_target, count_by_item


def _score(items, count_by_item, total_item_count):
    """Return ``int(-1000 * mean natural-log relative frequency)`` of *items*.

    The relative frequency of an item is its count divided by
    *total_item_count*.  *items* must be non-empty.
    """
    num_items = len(items)
    mean_log_frequency = 0
    # Accumulate term by term (instead of sum()/num_items) so the floating
    # point rounding, and therefore the truncated score, stays stable.
    for item in items:
        mean_log_frequency += log(count_by_item[item] / total_item_count) / num_items
    return int(-1000 * mean_log_frequency)


def main(directory="out/"):
    """Print one ``score target num_items`` line per target, sorted by target.

    *directory* is the folder whose files are read; it defaults to ``out/``
    relative to the current working directory.
    """
    items_by_target, count_by_item = _collect_items(directory)
    # Every item occurrence was counted exactly once, so the sum of the
    # per-item counts is the total number of items read.
    total_item_count = sum(count_by_item.values())
    for target in sorted(items_by_target):
        items = items_by_target[target]
        print(_score(items, count_by_item, total_item_count), target, len(items))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment