alanbernstein/benford.py

## benford.py
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np


tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]

#tableau10 = tableau20[::2]
tableau10 = [[c[0]/255, c[1]/255, c[2]/255] for c in tableau20[::2]]


def first_digit(x):
    return np.floor(x/(10 ** np.floor(np.log10(x)))).astype(int)

def num_digits(x):
    return np.floor(np.log10(x)) + 1

# demonstrate why benford's law happens
N = 10000
low, hi = 1, 1000
Nbins = 20


r = np.exp(np.random.uniform(np.log(low), np.log(hi), (N,)))
dr = first_digit(r)


plt.figure()

# histogram of random data, log-uniform, linear x-scale
plt.subplot(221)
plt.hist(r, Nbins, edgecolor='k')
plt.yscale('log')
plt.title('1a: Linear histogram of magnitude')


# histogram of random data, log-uniform, logarithmic x-scale
plt.subplot(222)
plt.hist(np.log10(r), Nbins, edgecolor='k')
plt.plot([np.log10(low), np.log10(hi)], [N/Nbins, N/Nbins], 'k-')
plt.title('1b: Logarithmic histogram of magnitude')

x_ticks = np.hstack((np.arange(1, 10), np.arange(10, 100, 10), np.arange(100, 1001, 100)))
#x_labels = [1, 2, None, None, 5, None, None, None, None,
#            10, 20, None, None, 50, None, None, None, None,
#            100, 200, None, None, 500, None, None, None, None, 1000]
x_labels = [1, None, 3, None, None, None, None, None, None,
            10, None, 30, None, None, None, None, None, None,
            100, None, 300, None, None, None, None, None, None, 1000]

ax = plt.gca()
ax.set_xticks(np.log10(x_ticks))
ax.set_xticklabels(x_labels)


# histogram of first digit of random data
plt.subplot(223)
bins = range(1, 10)
drc = [np.sum(dr == n) for n in bins]
plt.bar(bins, drc, edgecolor='k')

z = np.arange(1, 9.1, .1)
z2 = np.arange(1, 10)
plt.plot(z, N * np.log10(1+1/z), 'k-')
plt.plot(z2, N * np.log10(1+1/z2), 'k.')
plt.title('1c: Histogram of first_digit')
ax = plt.gca()
ax.set_xticks(np.arange(1, 10))
ax.set_xticklabels(np.arange(1, 10))


# illustration of first_digit function on deterministic input
ax = plt.subplot(224)
x = np.arange(1, N+1)
dx = first_digit(x)

colors = tableau10[2:5]
fdigs = [1, 2, 5]
for mag in [1, 10, 100, 1000]:
    for kk, fdig in enumerate(fdigs):
        ax.add_patch(Rectangle((fdig*mag, 0), mag, 9, fill=True, color=colors[kk]))


plt.plot(x, dx, '.-k')
plt.xscale('log')
plt.title('1d: first_digit(x)')
plt.suptitle('Log-uniform distributed random data')
plt.tight_layout()

plt.figure()
# is there a "dual" or counterpart to benfords law that would be equally surprising to an alien who tends to think in log space instead of linear space?

color2 = tableau10[1]

N = 100000
log_low, log_hi = 0, 6
low, hi = 10 ** log_low, 10 ** log_hi

r = np.random.uniform(low, hi, (N,))
plt.subplot(221)
plt.plot([low, hi], [N/Nbins, N/Nbins], 'k-')
plt.hist(r, Nbins, edgecolor='k', color=color2)
plt.title('2a: Linear histogram of magnitude')


plt.subplot(222)
plt.hist(np.log10(r), Nbins, edgecolor='k', color=color2)
plt.title('2b: Logarithmic histogram of magnitude')

x_ticks_log = np.arange(0, 7)
x_labels = ['$10^%d$' % n for n in x_ticks_log]

ax = plt.gca()
plt.yscale('log')
ax.set_xticks(x_ticks_log)
ax.set_xticklabels(x_labels)


plt.subplot(223)
cr = num_digits(r)
bins = range(1, log_hi+1)
crc = [np.sum(cr == n) for n in bins]
plt.bar(bins, crc, edgecolor='k', color=color2)
#plt.plot([1, 9], [N/len(bins), N/len(bins)], 'k-')
plt.yscale('log')
plt.title('2c: Histogram of num_digits')

#dr = first_digit(r)
#bins = range(1, 10)
#drc = [np.sum(dr == n) for n in bins]
#plt.bar(bins, drc)
#plt.plot([1, 9], [N/len(bins), N/len(bins)], 'k-')
#plt.title('linear-uniform distribution: first digit counts')

ax = plt.subplot(224)
x = np.arange(1, N)
dx = num_digits(x)
plt.plot(x, dx, '.-k')
# plt.xscale('log')
plt.title('2d: num_digits(x)')

colors = tableau10[6:9]
ax.add_patch(Rectangle((100, 1), 900, 4, fill=True, color=tableau10[8]))
ax.add_patch(Rectangle((1000, 1), 9000, 4, fill=True, color=tableau10[6]))
ax.add_patch(Rectangle((10000, 1), 90000, 4, fill=True, color=tableau10[7]))


plt.suptitle('Linear-uniform distributed random data')
plt.tight_layout()


# plt.figure()
# TODO:
# - what is the original data distribution that would result in uniform digit counts?
#   - piecewise?
#   - analytical?


plt.show()

import ipdb; ipdb.set_trace()
	import matplotlib.pyplot as plt
	from matplotlib.patches import Rectangle
	import numpy as np


	tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
	(44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
	(148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
	(227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
	(188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]

	#tableau10 = tableau20[::2]
	tableau10 = [[c[0]/255, c[1]/255, c[2]/255] for c in tableau20[::2]]


	def first_digit(x):
	return np.floor(x/(10 ** np.floor(np.log10(x)))).astype(int)

	def num_digits(x):
	return np.floor(np.log10(x)) + 1

	# demonstrate why benford's law happens
	N = 10000
	low, hi = 1, 1000
	Nbins = 20



	r = np.exp(np.random.uniform(np.log(low), np.log(hi), (N,)))
	dr = first_digit(r)


	plt.figure()

	# histogram of random data, log-uniform, linear x-scale
	plt.subplot(221)
	plt.hist(r, Nbins, edgecolor='k')
	plt.yscale('log')
	plt.title('1a: Linear histogram of magnitude')


	# histogram of random data, log-uniform, logarithmic x-scale
	plt.subplot(222)
	plt.hist(np.log10(r), Nbins, edgecolor='k')
	plt.plot([np.log10(low), np.log10(hi)], [N/Nbins, N/Nbins], 'k-')
	plt.title('1b: Logarithmic histogram of magnitude')

	x_ticks = np.hstack((np.arange(1, 10), np.arange(10, 100, 10), np.arange(100, 1001, 100)))
	#x_labels = [1, 2, None, None, 5, None, None, None, None,
	# 10, 20, None, None, 50, None, None, None, None,
	# 100, 200, None, None, 500, None, None, None, None, 1000]
	x_labels = [1, None, 3, None, None, None, None, None, None,
	10, None, 30, None, None, None, None, None, None,
	100, None, 300, None, None, None, None, None, None, 1000]

	ax = plt.gca()
	ax.set_xticks(np.log10(x_ticks))
	ax.set_xticklabels(x_labels)


	# histogram of first digit of random data
	plt.subplot(223)
	bins = range(1, 10)
	drc = [np.sum(dr == n) for n in bins]
	plt.bar(bins, drc, edgecolor='k')

	z = np.arange(1, 9.1, .1)
	z2 = np.arange(1, 10)
	plt.plot(z, N * np.log10(1+1/z), 'k-')
	plt.plot(z2, N * np.log10(1+1/z2), 'k.')
	plt.title('1c: Histogram of first_digit')
	ax = plt.gca()
	ax.set_xticks(np.arange(1, 10))
	ax.set_xticklabels(np.arange(1, 10))


	# illustration of first_digit function on deterministic input
	ax = plt.subplot(224)
	x = np.arange(1, N+1)
	dx = first_digit(x)

	colors = tableau10[2:5]
	fdigs = [1, 2, 5]
	for mag in [1, 10, 100, 1000]:
	for kk, fdig in enumerate(fdigs):
	ax.add_patch(Rectangle((fdig*mag, 0), mag, 9, fill=True, color=colors[kk]))


	plt.plot(x, dx, '.-k')
	plt.xscale('log')
	plt.title('1d: first_digit(x)')
	plt.suptitle('Log-uniform distributed random data')
	plt.tight_layout()

	plt.figure()
	# is there a "dual" or counterpart to benfords law that would be equally surprising to an alien who tends to think in log space instead of linear space?

	color2 = tableau10[1]

	N = 100000
	log_low, log_hi = 0, 6
	low, hi = 10 log_low, 10 log_hi

	r = np.random.uniform(low, hi, (N,))
	plt.subplot(221)
	plt.plot([low, hi], [N/Nbins, N/Nbins], 'k-')
	plt.hist(r, Nbins, edgecolor='k', color=color2)
	plt.title('2a: Linear histogram of magnitude')


	plt.subplot(222)
	plt.hist(np.log10(r), Nbins, edgecolor='k', color=color2)
	plt.title('2b: Logarithmic histogram of magnitude')

	x_ticks_log = np.arange(0, 7)
	x_labels = ['$10^%d$' % n for n in x_ticks_log]

	ax = plt.gca()
	plt.yscale('log')
	ax.set_xticks(x_ticks_log)
	ax.set_xticklabels(x_labels)


	plt.subplot(223)
	cr = num_digits(r)
	bins = range(1, log_hi+1)
	crc = [np.sum(cr == n) for n in bins]
	plt.bar(bins, crc, edgecolor='k', color=color2)
	#plt.plot([1, 9], [N/len(bins), N/len(bins)], 'k-')
	plt.yscale('log')
	plt.title('2c: Histogram of num_digits')

	#dr = first_digit(r)
	#bins = range(1, 10)
	#drc = [np.sum(dr == n) for n in bins]
	#plt.bar(bins, drc)
	#plt.plot([1, 9], [N/len(bins), N/len(bins)], 'k-')
	#plt.title('linear-uniform distribution: first digit counts')

	ax = plt.subplot(224)
	x = np.arange(1, N)
	dx = num_digits(x)
	plt.plot(x, dx, '.-k')
	# plt.xscale('log')
	plt.title('2d: num_digits(x)')

	colors = tableau10[6:9]
	ax.add_patch(Rectangle((100, 1), 900, 4, fill=True, color=tableau10[8]))
	ax.add_patch(Rectangle((1000, 1), 9000, 4, fill=True, color=tableau10[6]))
	ax.add_patch(Rectangle((10000, 1), 90000, 4, fill=True, color=tableau10[7]))


	plt.suptitle('Linear-uniform distributed random data')
	plt.tight_layout()


	# plt.figure()
	# TODO:
	# - what is the original data distribution that would result in uniform digit counts?
	# - piecewise?
	# - analytical?


	plt.show()

	import ipdb; ipdb.set_trace()