# jkff/sample_target_fraction.py

## sample_target_fraction.py
def expected_fraction_of_corpus_understood(phrases):
  """Build an estimator of the understood fraction of the corpus `phrases`.

  The corpus size, the summed `len()` of all phrases, the phrase-length fit
  and the first two Heaps' law parameters are computed once here and captured
  by the returned closure.

  Args:
    phrases: the corpus; must support `len()` and iteration, and each item
      must support `len()` (presumably a phrase is a sequence of words --
      TODO confirm).

  Returns:
    A function `(n, s) -> float`. Judging by how `plot` calls it
    (`f(n, frac * n)`), `n` is a sample size in phrases and `s` the number of
    sampled phrases that are understood -- TODO confirm. `n` must be non-zero
    because the estimate divides by it.
  """
  corpus_size = len(phrases)
  total_words = sum(map(len, phrases))
  length_fit = fit_phrase_length(phrases)
  heaps_params, _ = fit_heaps_law(phrases)
  k, beta, _ = heaps_params

  def estimate_fraction(n, s):
    """Return the estimated understood fraction for sample size n and count s."""
    # Word count of a sample of n phrases, scaled from the corpus totals.
    sample_words = 1.0 * n / corpus_size * total_words
    # Derivative of k * w ** beta with respect to w (presumably the rate at
    # which new distinct words appear under Heaps' law -- TODO confirm).
    new_word_rate = k * beta * sample_words ** (beta - 1)
    per_phrase_share = 1.0 / corpus_size
    # The sample's ratio s / n applied to the corpus_size - n phrases outside it.
    outside_sample = 1.0 * s / n * (corpus_size - n)
    p_no_new_word = 1 - p_phrase_has_new_word(new_word_rate, length_fit)
    return per_phrase_share * (s + outside_sample * p_no_new_word)

  return estimate_fraction

def plot(phrases, max_sample_size, sample_fractions):
  """Plot the estimated understood fraction of the corpus against sample size.

  One curve is drawn per entry of `sample_fractions`. Each curve evaluates the
  estimator returned by `expected_fraction_of_corpus_understood(phrases)` at
  `(n, frac * n)` and is labelled `str(frac)`. The figure is displayed with
  `plt.show()`; nothing is returned.

  Args:
    phrases: corpus handed to `expected_fraction_of_corpus_understood`.
    max_sample_size: upper end of the sample-size axis.
    sample_fractions: iterable of multipliers `frac` applied to the sample size.
  """
  estimator = np.vectorize(expected_fraction_of_corpus_understood(phrases))
  # 100 evenly spaced sizes on [0, max_sample_size]; the leading 0 is dropped
  # because the estimator divides by the sample size.
  sample_sizes = np.linspace(0, max_sample_size, 100)[1:]
  _, axes = plt.subplots()
  for frac in sample_fractions:
    understood = estimator(sample_sizes, frac * sample_sizes)
    axes.plot(sample_sizes, understood, label=str(frac))
  axes.legend(loc='lower right')
  axes.grid(which='both')
  # NOTE(review): `numticks` is the tick-count keyword of matplotlib's
  # LogLocator (MaxNLocator uses `nbins`); confirm the locator in use on these
  # axes accepts it.
  plt.locator_params(numticks=10)
  plt.show()

# Draw the curves for sample sizes up to 20000 and six sample fractions.
# NOTE(review): `phrases` is not defined in the visible part of this file; it
# must be bound before this line runs.
plot(
  phrases,
  max_sample_size=20000,
  sample_fractions=[0.1, 0.3, 0.5, 0.7, 0.9, 0.95],
)
	def expected_fraction_of_corpus_understood(phrases):
		"""Return a closure estimating the understood fraction of `phrases`.

		The corpus size, the summed `len()` of all phrases, the phrase-length
		fit and the first two Heaps' law parameters are computed once and
		captured by the returned function.

		Args:
			phrases: the corpus; must support `len()` and iteration, and each
				item must support `len()` (presumably a phrase is a sequence
				of words -- TODO confirm).

		Returns:
			A function `result(n, s)` returning a float. `n` must be non-zero
			because the estimate divides by it. Presumably `n` is a sample
			size in phrases and `s` the number of sampled phrases that are
			understood -- TODO confirm against callers.
		"""
		N = len(phrases)
		num_total_words = sum(len(phrase) for phrase in phrases)
		phrase_length_fit = fit_phrase_length(phrases)
		((k, beta, _), _) = fit_heaps_law(phrases)

		def result(n, s):
			"""Return the estimated understood fraction for sample size n and count s."""
			# Word count of a sample of n phrases, scaled from the corpus totals.
			w = 1.0 * n / N * num_total_words
			# Derivative of k * w ** beta with respect to w (presumably the
			# new-distinct-word rate under Heaps' law -- TODO confirm).
			omega = k * beta * w ** (beta - 1)
			# s phrases from the sample plus the ratio s / n applied to the
			# N - n remaining phrases, weighted by the probability that a
			# phrase has no new word; divided by N.
			return 1.0 / N * (s + 1.0 * s / n * (N - n) * (1 - p_phrase_has_new_word(omega, phrase_length_fit)))

		return result

	def plot(phrases, max_sample_size, sample_fractions):
		"""Plot the estimated understood fraction of the corpus against sample size.

		Draws one curve per entry of `sample_fractions`, evaluating the
		estimator from `expected_fraction_of_corpus_understood(phrases)` at
		`(n, frac * n)` and labelling it `str(frac)`. Shows the figure with
		`plt.show()` and returns nothing.

		Args:
			phrases: corpus handed to `expected_fraction_of_corpus_understood`.
			max_sample_size: upper end of the sample-size axis.
			sample_fractions: iterable of multipliers `frac` applied to the
				sample size.
		"""
		f = expected_fraction_of_corpus_understood(phrases)
		# 100 evenly spaced sizes on [0, max_sample_size]; the leading 0 is
		# dropped so the estimator is never evaluated at a sample size of 0.
		n = np.linspace(0, max_sample_size, 100)[1:]
		fig, ax = plt.subplots()
		for frac in sample_fractions:
			ax.plot(n, np.vectorize(f)(n, frac*n), label=str(frac))
		ax.legend(loc='lower right')
		ax.grid(which='both')
		# NOTE(review): `numticks` is the tick-count keyword of matplotlib's
		# LogLocator (MaxNLocator uses `nbins`); confirm the locator in use
		# on these axes accepts it.
		plt.locator_params(numticks=10)
		plt.show()

	# Draw the curves for sample sizes up to 20000 and six sample fractions.
	# NOTE(review): `phrases` is not defined in the visible part of this file;
	# it must be bound before this line runs.
	plot(
		phrases,
		max_sample_size=20000,
		sample_fractions=[0.1, 0.3, 0.5, 0.7, 0.9, 0.95],
	)