Skip to content

Instantly share code, notes, and snippets.

@jkff
Created December 23, 2017 21:05
Show Gist options
  • Save jkff/9299649989779142740f116a4eabd7b2 to your computer and use it in GitHub Desktop.
Save jkff/9299649989779142740f116a4eabd7b2 to your computer and use it in GitHub Desktop.
def expected_fraction_of_corpus_understood(phrases):
    """Build an estimator of the fraction of the corpus that is understood.

    The (potentially expensive) fits are done once, here; the returned
    closure only does arithmetic plus one ``p_phrase_has_new_word`` call.

    Args:
        phrases: Sized iterable of phrases, each supporting ``len()``.
            Presumably each phrase is a sequence of words, so that
            ``len(phrase)`` is its word count -- confirm against the caller.

    Returns:
        A function ``(n, s) -> float`` evaluating

            (s + s / n * (N - n) * (1 - p_new)) / N

        where ``N = len(phrases)`` and ``p_new`` is
        ``p_phrase_has_new_word(omega, phrase_length_fit)``. ``n`` must be
        non-zero because the formula divides by it. Presumably ``n`` is the
        number of sampled phrases and ``s`` the number of those that were
        understood -- TODO confirm.
    """
    corpus_size = len(phrases)
    total_words = sum(map(len, phrases))
    length_fit = fit_phrase_length(phrases)
    # Only k and beta of the first returned element are used.
    # NOTE(review): presumably these are the Heaps' law parameters of
    # V(w) = k * w ** beta -- confirm against fit_heaps_law.
    heaps_fit, _ = fit_heaps_law(phrases)
    k, beta, _ = heaps_fit

    def fraction_understood(n, s):
        # Scale the sample size to a word count using the corpus-wide ratio
        # of words to phrases.
        words_in_sample = 1.0 * n / corpus_size * total_words
        # Derivative of k * w ** beta with respect to w, at the sample size.
        omega = k * beta * words_in_sample ** (beta - 1)
        # Extrapolate the sample's rate s / n to the other N - n phrases,
        # discounted by the probability that a phrase has a new word.
        outside_sample = (
            1.0 * s / n * (corpus_size - n)
            * (1 - p_phrase_has_new_word(omega, length_fit))
        )
        return 1.0 / corpus_size * (s + outside_sample)

    return fraction_understood
def plot(phrases, max_sample_size, sample_fractions):
    """Plot the expected understood fraction of the corpus vs. sample size.

    Draws one curve per entry of ``sample_fractions`` and shows the figure
    with ``plt.show()``.

    Args:
        phrases: Corpus passed to ``expected_fraction_of_corpus_understood``.
        max_sample_size: Largest sample size on the x axis.
        sample_fractions: Iterable of ratios ``s / n``; each curve is
            evaluated at ``s = frac * n`` and labelled with ``str(frac)``.
    """
    f = expected_fraction_of_corpus_understood(phrases)
    # Wrap once instead of once per curve; f works on scalars only.
    estimate = np.vectorize(f)
    # linspace starts at 0; drop that point because the estimator divides by n.
    n = np.linspace(0, max_sample_size, 100)[1:]
    _fig, ax = plt.subplots()
    for frac in sample_fractions:
        ax.plot(n, estimate(n, frac * n), label=str(frac))
    ax.legend(loc='lower right')
    ax.grid(which='both')
    # Linear axes use a MaxNLocator-based locator, whose tick-count parameter
    # is `nbins`; `numticks` belongs to LogLocator and is rejected here by
    # current matplotlib releases.
    ax.locator_params(nbins=10)
    plt.show()
# Script entry: draw the curves for sample sizes up to 20000 phrases.
# NOTE(review): `phrases` is not defined in this snippet; it must already
# exist at module level before this line runs.
plot(
    phrases,
    max_sample_size=20000,
    sample_fractions=[0.1, 0.3, 0.5, 0.7, 0.9, 0.95],
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment