Skip to content

Instantly share code, notes, and snippets.

@alanbernstein
Created January 24, 2021 20:19
Show Gist options
  • Save alanbernstein/39cc97fc582cbb2cd30df59c40853129 to your computer and use it in GitHub Desktop.
Save alanbernstein/39cc97fc582cbb2cd30df59c40853129 to your computer and use it in GitHub Desktop.
charts for benford blog post
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
(44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
(148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
(227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
(188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
#tableau10 = tableau20[::2]
tableau10 = [[c[0]/255, c[1]/255, c[2]/255] for c in tableau20[::2]]
def first_digit(x):
return np.floor(x/(10 ** np.floor(np.log10(x)))).astype(int)
def num_digits(x):
return np.floor(np.log10(x)) + 1
# demonstrate why benford's law happens
N = 10000
low, hi = 1, 1000
Nbins = 20
r = np.exp(np.random.uniform(np.log(low), np.log(hi), (N,)))
dr = first_digit(r)
plt.figure()
# histogram of random data, log-uniform, linear x-scale
plt.subplot(221)
plt.hist(r, Nbins, edgecolor='k')
plt.yscale('log')
plt.title('1a: Linear histogram of magnitude')
# histogram of random data, log-uniform, logarithmic x-scale
plt.subplot(222)
plt.hist(np.log10(r), Nbins, edgecolor='k')
plt.plot([np.log10(low), np.log10(hi)], [N/Nbins, N/Nbins], 'k-')
plt.title('1b: Logarithmic histogram of magnitude')
x_ticks = np.hstack((np.arange(1, 10), np.arange(10, 100, 10), np.arange(100, 1001, 100)))
#x_labels = [1, 2, None, None, 5, None, None, None, None,
# 10, 20, None, None, 50, None, None, None, None,
# 100, 200, None, None, 500, None, None, None, None, 1000]
x_labels = [1, None, 3, None, None, None, None, None, None,
10, None, 30, None, None, None, None, None, None,
100, None, 300, None, None, None, None, None, None, 1000]
ax = plt.gca()
ax.set_xticks(np.log10(x_ticks))
ax.set_xticklabels(x_labels)
# histogram of first digit of random data
plt.subplot(223)
bins = range(1, 10)
drc = [np.sum(dr == n) for n in bins]
plt.bar(bins, drc, edgecolor='k')
z = np.arange(1, 9.1, .1)
z2 = np.arange(1, 10)
plt.plot(z, N * np.log10(1+1/z), 'k-')
plt.plot(z2, N * np.log10(1+1/z2), 'k.')
plt.title('1c: Histogram of first_digit')
ax = plt.gca()
ax.set_xticks(np.arange(1, 10))
ax.set_xticklabels(np.arange(1, 10))
# illustration of first_digit function on deterministic input
ax = plt.subplot(224)
x = np.arange(1, N+1)
dx = first_digit(x)
colors = tableau10[2:5]
fdigs = [1, 2, 5]
for mag in [1, 10, 100, 1000]:
for kk, fdig in enumerate(fdigs):
ax.add_patch(Rectangle((fdig*mag, 0), mag, 9, fill=True, color=colors[kk]))
plt.plot(x, dx, '.-k')
plt.xscale('log')
plt.title('1d: first_digit(x)')
plt.suptitle('Log-uniform distributed random data')
plt.tight_layout()
plt.figure()
# is there a "dual" or counterpart to benfords law that would be equally surprising to an alien who tends to think in log space instead of linear space?
color2 = tableau10[1]
N = 100000
log_low, log_hi = 0, 6
low, hi = 10 ** log_low, 10 ** log_hi
r = np.random.uniform(low, hi, (N,))
plt.subplot(221)
plt.plot([low, hi], [N/Nbins, N/Nbins], 'k-')
plt.hist(r, Nbins, edgecolor='k', color=color2)
plt.title('2a: Linear histogram of magnitude')
plt.subplot(222)
plt.hist(np.log10(r), Nbins, edgecolor='k', color=color2)
plt.title('2b: Logarithmic histogram of magnitude')
x_ticks_log = np.arange(0, 7)
x_labels = ['$10^%d$' % n for n in x_ticks_log]
ax = plt.gca()
plt.yscale('log')
ax.set_xticks(x_ticks_log)
ax.set_xticklabels(x_labels)
plt.subplot(223)
cr = num_digits(r)
bins = range(1, log_hi+1)
crc = [np.sum(cr == n) for n in bins]
plt.bar(bins, crc, edgecolor='k', color=color2)
#plt.plot([1, 9], [N/len(bins), N/len(bins)], 'k-')
plt.yscale('log')
plt.title('2c: Histogram of num_digits')
#dr = first_digit(r)
#bins = range(1, 10)
#drc = [np.sum(dr == n) for n in bins]
#plt.bar(bins, drc)
#plt.plot([1, 9], [N/len(bins), N/len(bins)], 'k-')
#plt.title('linear-uniform distribution: first digit counts')
ax = plt.subplot(224)
x = np.arange(1, N)
dx = num_digits(x)
plt.plot(x, dx, '.-k')
# plt.xscale('log')
plt.title('2d: num_digits(x)')
colors = tableau10[6:9]
ax.add_patch(Rectangle((100, 1), 900, 4, fill=True, color=tableau10[8]))
ax.add_patch(Rectangle((1000, 1), 9000, 4, fill=True, color=tableau10[6]))
ax.add_patch(Rectangle((10000, 1), 90000, 4, fill=True, color=tableau10[7]))
plt.suptitle('Linear-uniform distributed random data')
plt.tight_layout()
# plt.figure()
# TODO:
# - what is the original data distribution that would result in uniform digit counts?
# - piecewise?
# - analytical?
plt.show()
import ipdb; ipdb.set_trace()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment