# jkff/power_laws.py

## power_laws.py
import pickle
import random
from scipy.optimize import curve_fit

def shuffled(x):
  """Return the items of x as a new list in random order.

  The input is copied before shuffling, so x itself is never modified.
  """
  permuted = [item for item in x]
  random.shuffle(permuted)
  return permuted

def fit_heaps_law(phrases):
  """Fit the power law k * n**b to vocabulary growth over a corpus.

  The phrases are visited in random order and the running number of distinct
  words is recorded twice: after every word and after every phrase.  Each of
  the two series is then fitted with k * n**b, where n is the index into the
  series.

  Args:
    phrases: iterable of phrases, each an iterable of hashable words.

  Returns:
    A pair (word_fit, phrase_fit).  Each element is a tuple (k, b, g) holding
    the fitted parameters and a function g(n) that evaluates k * n**b.
  """
  num_unique_after_phrase = [0]
  num_unique_after_word = [0]
  unique_words = set()
  for phrase in shuffled(phrases):
    for word in phrase:
      unique_words.add(word)
      num_unique_after_word.append(len(unique_words))
    num_unique_after_phrase.append(len(unique_words))

  def f(n, k, b):
    return k * n ** b

  def fit(data):
    def condense(series):
      # 1000 points ought to be enough.  Floor division keeps the stride an
      # integer on Python 3 as well, where '/' would produce a float.
      factor = max(1, len(series) // 1000)
      return list(series[::factor])

    # k is constrained to [0, len(data)] and b to [0, 1].
    ((k, b), _) = curve_fit(
        f,
        xdata=condense(range(0, len(data))),
        ydata=condense(data),
        bounds=((0, 0), (len(data), 1)))
    return k, b, (lambda n: f(n, k, b))

  return fit(num_unique_after_word), fit(num_unique_after_phrase)

# Load the corpus that the rest of the script analyses.
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only
# load this file if it comes from a trusted source.
# Binary mode is what pickle expects, and the with-block closes the file.
with open('movie_lines_cleaned.p', 'rb') as corpus_file:
  phrases = pickle.load(corpus_file)

word_fit, phrase_fit = fit_heaps_law(phrases)
# The parenthesised form prints the same text under Python 2 and Python 3.
print(word_fit)
print(phrase_fit)

def fit_phrase_length(phrases):
  """Fit the phrase-length histogram with the model a * w**b * c**w.

  Args:
    phrases: iterable of sized phrases; len(phrase) is used as the length.

  Returns:
    A tuple (a, b, c, pdf, max_len): the fitted parameters (with a divided by
    the number of phrases, so the model describes frequencies rather than
    counts), a function pdf(w) evaluating the fitted model, and the largest
    phrase length seen.

  Raises:
    ValueError: if phrases is empty.
  """
  lens = [len(phrase) for phrase in phrases]
  if not lens:
    raise ValueError('cannot fit phrase lengths of an empty corpus')
  max_len = max(lens)
  # len_to_counts[n] is the number of phrases of length n.
  len_to_counts = [0] * (1 + max_len)
  for n in lens:
    len_to_counts[n] += 1

  def pdf(w, a, b, c):
    return a * (w**b) * (c**w)

  # curve_fit only converts list/tuple/ndarray xdata into a float array, so
  # hand it a real list: a Python 3 range object would reach pdf() unchanged
  # and fail on w**b.
  ((a, b, c), _) = curve_fit(
      pdf,
      xdata=list(range(len(len_to_counts))),
      ydata=len_to_counts)
  # We performed the fit on counts rather than frequencies: for some reason,
  # it gives a much better fit, but we need to scale 'a' back.
  a = a / sum(len_to_counts)
  return a, b, c, lambda w: pdf(w, a, b, c), max_len

def p_phrase_has_new_word(omega, phrase_length_fit):
  """Sum pdf(i) * (1 - (1 - omega)**i) over phrase lengths i = 1..max_len.

  Args:
    omega: number used as the per-word chance of being a new word.
    phrase_length_fit: tuple (a, b, c, pdf, max_len); only the length
      function pdf and the upper length max_len are used.

  Returns:
    The weighted sum described above (0 when max_len is below 1).
  """
  _, _, _, length_pdf, longest = phrase_length_fit

  def weighted_term(length):
    # (1 - omega)**length is the chance that none of the words is new.
    return length_pdf(length) * (1 - (1 - omega)**length)

  terms = [weighted_term(length) for length in range(1, longest + 1)]
  return sum(terms)

phrase_length_fit = fit_phrase_length(phrases)
k, beta, _ = word_fit

# Take the first 1000 phrases of a random permutation.  The slice starts at 0
# so that the sample really holds 1000 phrases (a start of 1 yields only 999).
sample = shuffled(phrases)[:1000]
# Total number of words in the sample.
w = sum(len(phrase) for phrase in sample)
# Derivative of the fitted curve k * n**beta, evaluated at n = w.
omega = k * beta * w ** (beta - 1)

# Space-separated output, formatted so that it is identical under Python 2
# and Python 3.
print('%s %s %s' % (w, omega, p_phrase_has_new_word(omega, phrase_length_fit)))
	import pickle
	import random
	from scipy.optimize import curve_fit

	def shuffled(x):
	  """Return the items of x as a new list in random order.

	  The input is copied before shuffling, so x itself is never modified.
	  """
	  res = list(x)
	  random.shuffle(res)
	  return res

	def fit_heaps_law(phrases):
	  """Fit the power law k * n**b to vocabulary growth over a corpus.

	  The phrases are visited in random order and the running number of
	  distinct words is recorded twice: after every word and after every
	  phrase.  Each of the two series is then fitted with k * n**b, where n is
	  the index into the series.

	  Args:
	    phrases: iterable of phrases, each an iterable of hashable words.

	  Returns:
	    A pair (word_fit, phrase_fit).  Each element is a tuple (k, b, g)
	    holding the fitted parameters and a function g(n) that evaluates
	    k * n**b.
	  """
	  num_unique_after_phrase = [0]
	  num_unique_after_word = [0]
	  unique_words = set()
	  for phrase in shuffled(phrases):
	    for word in phrase:
	      unique_words.add(word)
	      num_unique_after_word.append(len(unique_words))
	    num_unique_after_phrase.append(len(unique_words))

	  def f(n, k, b):
	    return k * n ** b

	  def fit(data):
	    def condense(series):
	      # 1000 points ought to be enough.  Floor division keeps the stride
	      # an integer on Python 3 as well, where '/' would produce a float.
	      factor = max(1, len(series) // 1000)
	      return list(series[::factor])

	    # k is constrained to [0, len(data)] and b to [0, 1].
	    ((k, b), _) = curve_fit(
	        f,
	        xdata=condense(range(0, len(data))),
	        ydata=condense(data),
	        bounds=((0, 0), (len(data), 1)))
	    return k, b, (lambda n: f(n, k, b))

	  return fit(num_unique_after_word), fit(num_unique_after_phrase)

	# Load the corpus that the rest of the script analyses.
	# NOTE(review): pickle.load can execute arbitrary code while unpickling --
	# only load this file if it comes from a trusted source.
	# Binary mode is what pickle expects, and the with-block closes the file.
	with open('movie_lines_cleaned.p', 'rb') as corpus_file:
	  phrases = pickle.load(corpus_file)

	word_fit, phrase_fit = fit_heaps_law(phrases)
	# The parenthesised form prints the same text under Python 2 and Python 3.
	print(word_fit)
	print(phrase_fit)

	def fit_phrase_length(phrases):
	  """Fit the phrase-length histogram with the model a * w**b * c**w.

	  Args:
	    phrases: iterable of sized phrases; len(phrase) is used as the length.

	  Returns:
	    A tuple (a, b, c, pdf, max_len): the fitted parameters (with a divided
	    by the number of phrases, so the model describes frequencies rather
	    than counts), a function pdf(w) evaluating the fitted model, and the
	    largest phrase length seen.

	  Raises:
	    ValueError: if phrases is empty.
	  """
	  lens = [len(phrase) for phrase in phrases]
	  if not lens:
	    raise ValueError('cannot fit phrase lengths of an empty corpus')
	  max_len = max(lens)
	  # len_to_counts[n] is the number of phrases of length n.
	  len_to_counts = [0] * (1 + max_len)
	  for n in lens:
	    len_to_counts[n] += 1

	  def pdf(w, a, b, c):
	    # Power term times exponential term; both need '**', and the two
	    # factors must be multiplied rather than written side by side.
	    return a * (w**b) * (c**w)

	  # curve_fit only converts list/tuple/ndarray xdata into a float array, so
	  # hand it a real list: a Python 3 range object would reach pdf()
	  # unchanged and fail on w**b.
	  ((a, b, c), _) = curve_fit(
	      pdf,
	      xdata=list(range(len(len_to_counts))),
	      ydata=len_to_counts)
	  # We performed the fit on counts rather than frequencies: for some reason,
	  # it gives a much better fit, but we need to scale 'a' back.
	  a = a / sum(len_to_counts)
	  return a, b, c, lambda w: pdf(w, a, b, c), max_len

	def p_phrase_has_new_word(omega, phrase_length_fit):
	  """Sum pdf(i) * (1 - (1 - omega)**i) over phrase lengths i = 1..max_len.

	  Args:
	    omega: number used as the per-word chance of being a new word.
	    phrase_length_fit: tuple (a, b, c, pdf, max_len); only the length
	      function pdf and the upper length max_len are used.

	  Returns:
	    The weighted sum described above (0 when max_len is below 1).
	  """
	  a, b, c, pdf, max_len = phrase_length_fit
	  # (1 - omega)**i is the chance that none of the i words is new.
	  return sum(
	      pdf(i) * (1 - (1 - omega)**i)
	      for i in range(1, 1 + max_len))

	phrase_length_fit = fit_phrase_length(phrases)
	k, beta, _ = word_fit

	# Take the first 1000 phrases of a random permutation.  The slice starts at
	# 0 so that the sample really holds 1000 phrases (a start of 1 yields 999).
	sample = shuffled(phrases)[:1000]
	# Total number of words in the sample.
	w = sum(len(phrase) for phrase in sample)
	# Derivative of the fitted curve k * n**beta, evaluated at n = w.
	omega = k * beta * w ** (beta - 1)

	# Space-separated output, formatted so that it is identical under Python 2
	# and Python 3.
	print('%s %s %s' % (w, omega, p_phrase_has_new_word(omega, phrase_length_fit)))