Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abrazhe/df8d2d788a507aa0d0e07ebd2ac2bcd8 to your computer and use it in GitHub Desktop.
Save abrazhe/df8d2d788a507aa0d0e07ebd2ac2bcd8 to your computer and use it in GitHub Desktop.
A quick demo of how to produce a loglog histogram plot of very large amounts of data, by using log-histogram bins.
# A quick demo of how to produce a loglog histogram plot of very large
# amounts of data, by using log-histogram bins
import numpy as np
import matplotlib.pyplot as plt
import itertools as it
# We shall draw millions of samples from a Zipf distribution. Using linear
# bins this is too much data for a fast and attractive plot.
NUM_SAMPLES = 5000000
# Lazy generator of Zipfian samples (exponent 1.5): one draw per iteration,
# so the full data set never has to be held in memory at once.
# NOTE: `range` replaces the Python-2-only `xrange`; a generator expression
# evaluates its outermost iterable eagerly, so `xrange` raised NameError on
# Python 3 right here rather than on first use.
X = (np.random.zipf(1.5) for _ in range(NUM_SAMPLES))
# Expected range of values
MIN_VALUE = 1
MAX_VALUE = 2.0 ** 64
# Log base we shall be using throughout
XBASE = 2
YBASE = 10
# Calculate the min and max integer powers of XBASE that bracket the range.
# The log ratio is rounded before floor/ceil so that float round-off on an
# exact power (log(1000)/log(10) can come out as 2.9999999999999996) does not
# push the result to the wrong integer.
start_power = int(np.floor(np.round(np.log(MIN_VALUE) / np.log(XBASE), 12)))
end_power = int(np.ceil(np.round(np.log(MAX_VALUE) / np.log(XBASE), 12)))
# ...and number of whole integer powers in that range. This must be a true
# integer: np.logspace rejects a float sample count.
num_bins = (end_power - start_power) + 1
# Generate a range of bin delimiters, one per integer power of XBASE
bins = np.logspace(start_power, end_power, num_bins, base=XBASE)
# Iteratively generate the histogram in 1k chunks, so that only one chunk of
# samples is materialised at a time
CHUNK_SIZE = 1000
hist = np.zeros(len(bins) - 1)
while True:
    chunk = list(it.islice(X, CHUNK_SIZE))
    if not chunk:
        # The generator is exhausted
        break
    tmp, _ = np.histogram(chunk, bins=bins)
    hist += tmp
# Slice all the empty bins off the end
occupied = np.flatnonzero(hist)
if occupied.size == 0:
    # Without this guard the trimming step fails with an opaque error
    raise ValueError("no samples fell inside the histogram bins")
last_idx = int(occupied[-1])
# Keep one more edge than counts: bin i spans bins[i]..bins[i+1]
hist, bins = hist[:last_idx + 1], bins[:last_idx + 2]
# Plot for great justice!
fig = plt.figure()
# A loglog plot: counts per bin against the bin's left edge.
# The log bases are set per axis via set_xscale/set_yscale; the old
# `basex`/`basey` keywords of loglog() were removed from Matplotlib.
ax = fig.add_subplot(211)
ax.plot(bins[:-1], hist, 'x-')
ax.set_xscale('log', base=XBASE)
ax.set_yscale('log', base=YBASE)
# A linear plot of 95% of the mass
ax = fig.add_subplot(212)
cum_frac = np.cumsum(hist) / hist.sum()
# Index of the first bin at which the cumulative fraction reaches 95%
# (clamped so it always refers to an existing bin)
pp = min(int(np.searchsorted(cum_frac, 0.95)), len(hist) - 1)
# Bars are positioned by their left edge, shifted so the first bin starts at
# 0; align='edge' is required because bar() centres bars on x by default.
ax.bar(bins[:-1] - bins[0], hist, width=bins[1:] - bins[:-1], align='edge')
# Show everything up to the (shifted) right edge of that bin
ax.set_xlim(0, bins[pp + 1] - bins[0])
plt.show()
# All done. Have a biscuit.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment