Created
January 24, 2021 20:19
-
-
Save alanbernstein/39cc97fc582cbb2cd30df59c40853129 to your computer and use it in GitHub Desktop.
charts for benford blog post
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
from matplotlib.patches import Rectangle | |
import numpy as np | |
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120), | |
(44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150), | |
(148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148), | |
(227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199), | |
(188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)] | |
#tableau10 = tableau20[::2] | |
tableau10 = [[c[0]/255, c[1]/255, c[2]/255] for c in tableau20[::2]] | |
def first_digit(x): | |
return np.floor(x/(10 ** np.floor(np.log10(x)))).astype(int) | |
def num_digits(x): | |
return np.floor(np.log10(x)) + 1 | |
# demonstrate why benford's law happens | |
N = 10000 | |
low, hi = 1, 1000 | |
Nbins = 20 | |
r = np.exp(np.random.uniform(np.log(low), np.log(hi), (N,))) | |
dr = first_digit(r) | |
plt.figure() | |
# histogram of random data, log-uniform, linear x-scale | |
plt.subplot(221) | |
plt.hist(r, Nbins, edgecolor='k') | |
plt.yscale('log') | |
plt.title('1a: Linear histogram of magnitude') | |
# histogram of random data, log-uniform, logarithmic x-scale | |
plt.subplot(222) | |
plt.hist(np.log10(r), Nbins, edgecolor='k') | |
plt.plot([np.log10(low), np.log10(hi)], [N/Nbins, N/Nbins], 'k-') | |
plt.title('1b: Logarithmic histogram of magnitude') | |
x_ticks = np.hstack((np.arange(1, 10), np.arange(10, 100, 10), np.arange(100, 1001, 100))) | |
#x_labels = [1, 2, None, None, 5, None, None, None, None, | |
# 10, 20, None, None, 50, None, None, None, None, | |
# 100, 200, None, None, 500, None, None, None, None, 1000] | |
x_labels = [1, None, 3, None, None, None, None, None, None, | |
10, None, 30, None, None, None, None, None, None, | |
100, None, 300, None, None, None, None, None, None, 1000] | |
ax = plt.gca() | |
ax.set_xticks(np.log10(x_ticks)) | |
ax.set_xticklabels(x_labels) | |
# histogram of first digit of random data | |
plt.subplot(223) | |
bins = range(1, 10) | |
drc = [np.sum(dr == n) for n in bins] | |
plt.bar(bins, drc, edgecolor='k') | |
z = np.arange(1, 9.1, .1) | |
z2 = np.arange(1, 10) | |
plt.plot(z, N * np.log10(1+1/z), 'k-') | |
plt.plot(z2, N * np.log10(1+1/z2), 'k.') | |
plt.title('1c: Histogram of first_digit') | |
ax = plt.gca() | |
ax.set_xticks(np.arange(1, 10)) | |
ax.set_xticklabels(np.arange(1, 10)) | |
# illustration of first_digit function on deterministic input | |
ax = plt.subplot(224) | |
x = np.arange(1, N+1) | |
dx = first_digit(x) | |
colors = tableau10[2:5] | |
fdigs = [1, 2, 5] | |
for mag in [1, 10, 100, 1000]: | |
for kk, fdig in enumerate(fdigs): | |
ax.add_patch(Rectangle((fdig*mag, 0), mag, 9, fill=True, color=colors[kk])) | |
plt.plot(x, dx, '.-k') | |
plt.xscale('log') | |
plt.title('1d: first_digit(x)') | |
plt.suptitle('Log-uniform distributed random data') | |
plt.tight_layout() | |
plt.figure() | |
# is there a "dual" or counterpart to benfords law that would be equally surprising to an alien who tends to think in log space instead of linear space? | |
color2 = tableau10[1] | |
N = 100000 | |
log_low, log_hi = 0, 6 | |
low, hi = 10 ** log_low, 10 ** log_hi | |
r = np.random.uniform(low, hi, (N,)) | |
plt.subplot(221) | |
plt.plot([low, hi], [N/Nbins, N/Nbins], 'k-') | |
plt.hist(r, Nbins, edgecolor='k', color=color2) | |
plt.title('2a: Linear histogram of magnitude') | |
plt.subplot(222) | |
plt.hist(np.log10(r), Nbins, edgecolor='k', color=color2) | |
plt.title('2b: Logarithmic histogram of magnitude') | |
x_ticks_log = np.arange(0, 7) | |
x_labels = ['$10^%d$' % n for n in x_ticks_log] | |
ax = plt.gca() | |
plt.yscale('log') | |
ax.set_xticks(x_ticks_log) | |
ax.set_xticklabels(x_labels) | |
plt.subplot(223) | |
cr = num_digits(r) | |
bins = range(1, log_hi+1) | |
crc = [np.sum(cr == n) for n in bins] | |
plt.bar(bins, crc, edgecolor='k', color=color2) | |
#plt.plot([1, 9], [N/len(bins), N/len(bins)], 'k-') | |
plt.yscale('log') | |
plt.title('2c: Histogram of num_digits') | |
#dr = first_digit(r) | |
#bins = range(1, 10) | |
#drc = [np.sum(dr == n) for n in bins] | |
#plt.bar(bins, drc) | |
#plt.plot([1, 9], [N/len(bins), N/len(bins)], 'k-') | |
#plt.title('linear-uniform distribution: first digit counts') | |
ax = plt.subplot(224) | |
x = np.arange(1, N) | |
dx = num_digits(x) | |
plt.plot(x, dx, '.-k') | |
# plt.xscale('log') | |
plt.title('2d: num_digits(x)') | |
colors = tableau10[6:9] | |
ax.add_patch(Rectangle((100, 1), 900, 4, fill=True, color=tableau10[8])) | |
ax.add_patch(Rectangle((1000, 1), 9000, 4, fill=True, color=tableau10[6])) | |
ax.add_patch(Rectangle((10000, 1), 90000, 4, fill=True, color=tableau10[7])) | |
plt.suptitle('Linear-uniform distributed random data') | |
plt.tight_layout() | |
# plt.figure() | |
# TODO: | |
# - what is the original data distribution that would result in uniform digit counts? | |
# - piecewise? | |
# - analytical? | |
plt.show() | |
import ipdb; ipdb.set_trace() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment