hamishmorgan/gist:3342260

## gistfile1.py
# A quick demo of how to produce a loglog histogram plot of very large
# amounts of data, by using log-histogram bins

import numpy as np
import matplotlib.pyplot as plt
import itertools as it

# We shall draw millions of samples from a Zipf distribution.  Using linear
# bins this is too much data for a fast and attractive plot.

# Lazy stream of Zipf-distributed samples (exponent 1.5), drawn one at a time.
# itertools.repeat supplies the 5,000,000 iterations without building a list
# and, unlike xrange, is available on both Python 2 and Python 3.
X = (np.random.zipf(1.5) for _ in it.repeat(None, 5000000))

# Expected range of values
MIN_VALUE = 1
MAX_VALUE = 2.0 ** 64

# Log bases used throughout: XBASE spaces the bin edges (and is the base of
# the x axis in the log-log plot), YBASE is the base of its y axis
XBASE = 2
YBASE = 10

# Smallest and largest whole powers of XBASE that enclose the value range
start_power = np.floor(np.log(MIN_VALUE) / np.log(XBASE))
end_power = np.ceil(np.log(MAX_VALUE) / np.log(XBASE))
# ...and the number of whole integer powers in that range.  This is the number
# of bin *edges* (one more than the number of bins).  np.floor/np.ceil return
# floats, while np.logspace needs an integer sample count, so convert here.
num_bins = int(end_power - start_power) + 1

# Generate the bin edges: one at every integer power of XBASE in the range
bins = np.logspace(start_power, end_power, num_bins, base=XBASE)

# Build the histogram incrementally, 1000 samples at a time, so that only one
# chunk of the sample stream is materialised at any moment
hist = np.zeros(len(bins) - 1)
# iter(callable, sentinel) keeps fetching chunks until islice returns nothing
for chunk in iter(lambda: list(it.islice(X, 1000)), []):
	counts = np.histogram(chunk, bins=bins)[0]
	hist += counts

# Slice all the empty bins off the end: keep everything up to the last
# non-empty bin, plus the one extra edge that closes that bin
last_idx = int(max(np.flatnonzero(hist)))
hist = hist[:last_idx + 1]
bins = bins[:last_idx + 2]

#  Plot for great justice!
fig = plt.figure()

# Top panel: bin counts against each bin's lower edge, on log-log axes.
# The per-axis `basex`/`basey` keywords were deprecated in Matplotlib 3.3 and
# later removed, so each axis gets its log base via set_xscale/set_yscale.
ax = fig.add_subplot(211)
ax.plot(bins[:-1], hist, 'x-')
ax.set_xscale('log', base=XBASE)
ax.set_yscale('log', base=YBASE)

# Bottom panel: a linear plot of (roughly) 95% of the mass
ax = fig.add_subplot(212)
# pp is the last bin index whose cumulative share of the samples is still
# below 95%; fall back to 0 when the first bin alone already reaches 95%
below = np.flatnonzero(np.cumsum(hist) / hist.sum() < 0.95)
pp = int(below[-1]) if below.size else 0
# Bars are shifted so the first bin starts at x=0.  align='edge' anchors each
# bar at its bin's lower edge; Matplotlib >= 2.0 centres bars on x by default.
ax.bar(bins[:-1] - bins[0], hist, width=bins[1:] - bins[:-1], align='edge')
ax.set_xlim(0, XBASE ** pp)

plt.show()

# All done. Have a biscuit.
	# A quick demo of how to produce a loglog histogram plot of very large
	# amounts of data, by using log-histogram bins

	import numpy as np
	import matplotlib.pyplot as plt
	import itertools as it

	# We shall draw millions of samples from a Zipf distribution. Using linear
	# bins this is too much data for a fast and attractive plot.

	# Lazy stream of Zipf-distributed samples (exponent 1.5), drawn one at a time.
	# itertools.repeat supplies the 5,000,000 iterations without building a list
	# and, unlike xrange, is available on both Python 2 and Python 3.
	X = (np.random.zipf(1.5) for _ in it.repeat(None, 5000000))

	# Expected range of values
	MIN_VALUE = 1
	MAX_VALUE = 2.0 ** 64

	# Log bases used throughout: XBASE spaces the bin edges (and is the base of
	# the x axis in the log-log plot), YBASE is the base of its y axis
	XBASE = 2
	YBASE = 10

	# Smallest and largest whole powers of XBASE that enclose the value range
	start_power = np.floor(np.log(MIN_VALUE) / np.log(XBASE))
	end_power = np.ceil(np.log(MAX_VALUE) / np.log(XBASE))
	# ...and the number of whole integer powers in that range.  This is the number
	# of bin *edges* (one more than the number of bins).  np.floor/np.ceil return
	# floats, while np.logspace needs an integer sample count, so convert here.
	num_bins = int(end_power - start_power) + 1

	# Generate the bin edges: one at every integer power of XBASE in the range
	bins = np.logspace(start_power, end_power, num_bins, base=XBASE)

	# Iteratively generate the histogram in 1k chunks, so that only one chunk of
	# the sample stream is materialised at any moment
	hist = np.zeros(len(bins) - 1)
	while True:
		chunk = list(it.islice(X, 1000))
		# An empty chunk means the sample stream is exhausted
		if not chunk:
			break
		counts, _ = np.histogram(chunk, bins=bins)
		hist += counts

	# Slice all the empty bins off the end: keep everything up to the last
	# non-empty bin, plus the one extra edge that closes that bin
	last_idx = int(max(np.flatnonzero(hist)))
	hist = hist[:last_idx + 1]
	bins = bins[:last_idx + 2]

	# Plot for great justice!
	fig = plt.figure()

	# Top panel: bin counts against each bin's lower edge, on log-log axes.
	# The per-axis `basex`/`basey` keywords were deprecated in Matplotlib 3.3 and
	# later removed, so each axis gets its log base via set_xscale/set_yscale.
	ax = fig.add_subplot(211)
	ax.plot(bins[:-1], hist, 'x-')
	ax.set_xscale('log', base=XBASE)
	ax.set_yscale('log', base=YBASE)

	# Bottom panel: a linear plot of (roughly) 95% of the mass
	ax = fig.add_subplot(212)
	# pp is the last bin index whose cumulative share of the samples is still
	# below 95%; fall back to 0 when the first bin alone already reaches 95%
	below = np.flatnonzero(np.cumsum(hist) / hist.sum() < 0.95)
	pp = int(below[-1]) if below.size else 0
	# Bars are shifted so the first bin starts at x=0.  align='edge' anchors each
	# bar at its bin's lower edge; Matplotlib >= 2.0 centres bars on x by default.
	ax.bar(bins[:-1] - bins[0], hist, width=bins[1:] - bins[:-1], align='edge')
	ax.set_xlim(0, XBASE ** pp)

	plt.show()

	# All done. Have a biscuit.