Skip to content

Instantly share code, notes, and snippets.

@matthewdowney
Created January 9, 2018 22:53
Show Gist options
  • Save matthewdowney/1ca9c691706ef2f5d17f63ee346a4ccf to your computer and use it in GitHub Desktop.
Save matthewdowney/1ca9c691706ef2f5d17f63ee346a4ccf to your computer and use it in GitHub Desktop.
Pipe some data to this script to plot a data set vs. Benford's law.
# Pipe some data to stdin. Each line should contain a number.
import csv
from collections import defaultdict
from math import log10
import matplotlib
import matplotlib.pyplot as plt
import functools
import fileinput
# Leading digits can only be 1-9: zero is never a first significant digit.
digits = set(map(str, range(1, 10)))


def log_frequency(frequencies, line):
    """Count the first significant digit of ``line`` in ``frequencies``.

    The line is scanned left to right and the first character that is one
    of '1'..'9' is taken as the leading digit, so signs, leading zeros,
    decimal points and whitespace are skipped ("0.0042" counts as 4).

    Args:
        frequencies: mapping of {digit (int): count}; updated in place.
        line: one line of input text, expected to contain a number.

    Returns:
        The same ``frequencies`` mapping, so the function can be used as
        the reducer in ``functools.reduce``. Lines without any digit in
        1-9 leave the mapping unchanged.
    """
    # Stop at the first match instead of collecting every digit on the line.
    first = next((ch for ch in line if ch in digits), None)
    if first is not None:
        digit = int(first)
        frequencies[digit] = frequencies.get(digit, 0) + 1
    return frequencies
# Tally the first significant digit of every input line (stdin, or the files
# named on the command line -- that is how fileinput.input() resolves input).
counts = functools.reduce(log_frequency, fileinput.input(), {})
total_numbers = sum(counts.values())

# Scaling: turn counts into relative frequencies, ordered by digit 1..9 so
# each value lines up with the matching Benford probability and x tick.
# Digits that never occurred get 0.0 rather than being dropped, and an empty
# input yields all zeros instead of dividing by zero.
data = [
    (counts.get(d, 0) / float(total_numbers) if total_numbers else 0.0, d)
    for d in range(1, 10)
]

# Comparison: Benford's law, P(d) = log10(1 + 1/d) for d in 1..9.
benford = [(log10(1 + 1.0 / i), str(i)) for i in range(1, 10)]

# Plot first digit distribution vs. Benford on shared x positions.
positions = list(range(len(benford)))
plt.plot(positions, [freq for freq, _ in data], label='Data Set')
plt.plot(positions, [prob for prob, _ in benford], label="Benford's Law", linewidth=10, alpha=0.23)
plt.ylabel("Distribution probability", fontsize=14)
plt.xlabel("First digit for %s numbers" % total_numbers, fontsize=14)
plt.title("Check it out yo\n", fontsize=12)
plt.xticks(positions, [int(label) for _, label in benford])
plt.legend()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment