Skip to content

Instantly share code, notes, and snippets.

@nathans
Forked from jtauber/mean_log_frequency.py
Last active November 8, 2015 04:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nathans/734adbbd3e1d30077d0f to your computer and use it in GitHub Desktop.
Save nathans/734adbbd3e1d30077d0f to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import codecs
from collections import defaultdict
from math import log
import os
def iter_lines(directory):
    """Yield every line of every file in *directory*.

    Files are visited in sorted name order. Each file is decoded as UTF-8
    and is closed as soon as it has been consumed (or if reading fails).
    """
    for name in sorted(os.listdir(directory)):
        # `with` guarantees the handle is released even when decoding raises.
        with open(os.path.join(directory, name), encoding="utf-8") as f:
            yield from f


def group_items(lines):
    """Group whitespace-separated rows by target and count item occurrences.

    For each non-blank line, the first column is the reference, whose first
    five characters form the target key, and the fourth column is the item.

    Returns a tuple ``(items_by_target, count_by_item, total_item_count)``:
    a mapping of target -> list of items (in input order), a mapping of
    item -> number of occurrences over all lines, and the total number of
    items seen.

    Raises:
        ValueError: if a non-blank line has fewer than four columns.
    """
    items_by_target = defaultdict(list)
    count_by_item = defaultdict(int)
    total_item_count = 0

    for line in lines:
        cols = line.split()
        if not cols:
            # Blank lines (e.g. a trailing newline) carry no data.
            continue
        if len(cols) < 4:
            raise ValueError(
                "expected at least 4 columns, got %d: %r" % (len(cols), line)
            )

        target = cols[0][:5]
        # NOTE(review): the previous version also built a `lemma` string from
        # columns 3 and 4 but never used it; items are keyed on column 3
        # alone. Confirm whether the combined form was the intended key.
        item = cols[3]

        items_by_target[target].append(item)
        count_by_item[item] += 1
        total_item_count += 1

    return items_by_target, count_by_item, total_item_count


def mean_log_frequencies(items_by_target, count_by_item, total_item_count):
    """Yield ``(score, target, num_items)`` for each target in sorted order.

    ``score`` is ``int(-1000 * m)`` where ``m`` is the mean, over the
    target's items, of ``log(count_by_item[item] / total_item_count)``.
    """
    for target in sorted(items_by_target):
        items = items_by_target[target]
        num_items = len(items)
        # Accumulate term by term (rather than with sum()) so the float
        # rounding, and therefore the truncated integer score, is stable.
        mean_log_frequency = 0
        for item in items:
            mean_log_frequency += (
                log(count_by_item[item] / total_item_count) / num_items
            )
        yield int(-1000 * mean_log_frequency), target, num_items


def main(directory='out/'):
    """Print one ``score target num_items`` line per target in *directory*."""
    grouped = group_items(iter_lines(directory))
    for score, target, num_items in mean_log_frequencies(*grouped):
        print(score, target, num_items)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment