Skip to content

Instantly share code, notes, and snippets.

@jkff
Created December 22, 2017 07:43
Show Gist options
  • Save jkff/dbbf3bb139c14407aaf7736b7e81797d to your computer and use it in GitHub Desktop.
Save jkff/dbbf3bb139c14407aaf7736b7e81797d to your computer and use it in GitHub Desktop.
import pickle
import random
from scipy.optimize import curve_fit
def shuffled(x):
    """Return a new list with the items of ``x`` in random order.

    The argument itself is never modified: its items are copied into a
    fresh list first and only that copy is shuffled in place.
    """
    items = [item for item in x]
    random.shuffle(items)
    return items
def fit_heaps_law(phrases):
    """Fit Heaps' law, ``V(n) = k * n**b``, to vocabulary growth in ``phrases``.

    The phrases are visited in random order (see ``shuffled``) while the
    number of distinct words seen so far is recorded twice: once after every
    word and once after every phrase.  Each of the two series is then fitted
    independently with ``k * n**b``, where ``n`` is the position in the
    series (words seen, respectively phrases seen).

    Args:
        phrases: iterable of phrases, each an iterable of hashable words
            (presumably lists of tokens -- verify against the data source).

    Returns:
        A pair ``(word_fit, phrase_fit)``.  Each element is a tuple
        ``(k, b, predict)`` where ``predict(n)`` evaluates ``k * n**b`` with
        the fitted parameters.
    """
    num_unique_after_phrase = [0]
    num_unique_after_word = [0]
    unique_words = set()
    for phrase in shuffled(phrases):
        for word in phrase:
            unique_words.add(word)
            num_unique_after_word.append(len(unique_words))
        num_unique_after_phrase.append(len(unique_words))

    def f(n, k, b):
        return k * n ** b

    def fit(data):
        # Subsample long series before fitting: roughly 1000-2000 points
        # ought to be enough.  Floor division keeps the stride an integer on
        # Python 3 as well; a float stride would make ``i % stride == 0``
        # match almost no indices and starve the fit of data.
        stride = max(1, len(data) // 1000)

        def condense(series):
            # Same points as "every index divisible by stride"; list() so
            # that curve_fit always receives a plain list.
            return list(series[::stride])

        ((k, b), _) = curve_fit(
            f,
            xdata=condense(range(0, len(data))),
            ydata=condense(data),
            # k in [0, len(data)], exponent b in [0, 1].
            bounds=((0, 0), (len(data), 1)))
        return k, b, (lambda n: f(n, k, b))

    return fit(num_unique_after_word), fit(num_unique_after_phrase)
# Load the corpus and fit Heaps' law to it.
# NOTE: unpickling can execute arbitrary code -- only load a trusted file.
# Pickle data must be read in binary mode, and ``with`` closes the handle
# as soon as loading is done instead of leaking it.
with open('movie_lines_cleaned.p', 'rb') as phrases_file:
    phrases = pickle.load(phrases_file)
word_fit, phrase_fit = fit_heaps_law(phrases)
# Single-argument print(...) produces the same output on Python 2 and 3.
print(word_fit)
print(phrase_fit)
def fit_phrase_length(phrases):
    """Fit the distribution of phrase lengths with ``a * w**b * c**w``.

    A histogram of ``len(phrase)`` is built for lengths ``0..max_len`` and
    the three-parameter curve is fitted to the raw counts.

    Args:
        phrases: iterable of phrases; only ``len(phrase)`` is used.

    Returns:
        A tuple ``(a, b, c, pdf, max_len)``.  ``a`` has been divided by the
        number of phrases, so ``pdf(w) = a * w**b * c**w`` models the
        fraction (not the count) of phrases of length ``w``.  ``max_len`` is
        the largest observed phrase length.

    Raises:
        ValueError: if ``phrases`` is empty.
    """
    lens = [len(phrase) for phrase in phrases]
    if not lens:
        raise ValueError('fit_phrase_length() needs at least one phrase')
    max_len = max(lens)

    # len_to_counts[n] == number of phrases containing exactly n words.
    len_to_counts = [0] * (max_len + 1)
    for n in lens:
        len_to_counts[n] += 1

    def pdf(w, a, b, c):
        return a * (w**b) * (c**w)

    # Pass a real list as xdata: curve_fit hands anything that is not a
    # list/tuple/ndarray to the model unconverted, and a Python 3 ``range``
    # does not support ``w**b``.
    ((a, b, c), _) = curve_fit(
        pdf,
        xdata=list(range(len(len_to_counts))),
        ydata=len_to_counts)

    # We performed the fit on counts rather than frequencies: for some reason,
    # it gives a much better fit, but we need to scale 'a' back.
    # The counts sum to the number of phrases.
    a = a / len(lens)
    return a, b, c, lambda w: pdf(w, a, b, c), max_len
def p_phrase_has_new_word(omega, phrase_length_fit):
    """Return the probability that a phrase contains at least one new word.

    ``omega`` is treated as the chance that any single word is new, so a
    phrase of ``i`` words contains a new one with probability
    ``1 - (1 - omega)**i``.  These per-length probabilities are weighted by
    the fitted length distribution and added up for lengths ``1..max_len``.

    Args:
        omega: per-word probability of being new.
        phrase_length_fit: 5-tuple ``(a, b, c, pdf, max_len)`` in the layout
            returned by ``fit_phrase_length``; only ``pdf`` and ``max_len``
            are used.
    """
    _, _, _, length_pdf, longest = phrase_length_fit
    total = 0
    for length in range(1, longest + 1):
        p_no_new_word = (1 - omega) ** length
        total += length_pdf(length) * (1 - p_no_new_word)
    return total
# Estimate, for a random sample of the corpus, the chance that the next
# phrase brings in a previously unseen word.
phrase_length_fit = fit_phrase_length(phrases)
k, beta, _ = word_fit
# Take 1000 phrases; the slice used to start at index 1, which silently
# dropped one phrase and left only 999 in the sample.
sample = shuffled(phrases)[:1000]
# Total number of words in the sample.
w = sum(len(phrase) for phrase in sample)
# Derivative of Heaps' law k * w**beta at w: the expected number of new
# distinct words per additional word, used as the per-word probability.
omega = k * beta * w ** (beta - 1)
# %-formatting prints the same space-separated line on Python 2 and 3.
print('%s %s %s' % (w, omega, p_phrase_has_new_word(omega, phrase_length_fit)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment