Last active
January 1, 2016 13:29
-
-
Save hmml/8151335 to your computer and use it in GitHub Desktop.
Benford's law in action.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, sys, time | |
def print_stats(buckets):
    """Print the observed leading-digit distribution next to Benford's law.

    buckets -- sequence of counts where buckets[i] is the number of samples
               whose leading digit is i + 1 (at most nine entries, for the
               digits 1 through 9).

    Writes a tab-separated table to stdout with one row per digit: the
    observed share, the share predicted by Benford's law and the signed
    difference (predicted minus observed).

    When the shares cannot be computed (no samples at all, non-numeric
    counts, or more buckets than there are digits) only the line
    'Error calculating stats...' is printed, so a partial table is never
    followed by an error message.
    """
    # source: http://mathworld.wolfram.com/BenfordsLaw.html
    benford = (0.30103, 0.176091, 0.124939, 0.09691, 0.0791812,
               0.0669468, 0.0579919, 0.0511525, 0.0457575)

    try:
        if len(buckets) > len(benford):
            raise ValueError('more buckets than leading digits')
        # Sum once instead of once per bucket; float() forces true division
        # on Python 2 as well.
        total = float(sum(buckets))
        shares = [count / total for count in buckets]
    except (TypeError, ValueError, ZeroDivisionError):
        print('Error calculating stats...')
        return

    print('Digit\tResult\tBenford\tDifference')
    for digit, (observed, expected) in enumerate(zip(shares, benford), 1):
        print('%d\t%.4f\t%.4f\t%+.4f'
              % (digit, observed, expected, expected - observed))
def do_stats(start_dir):
    """Tally the leading digits of file sizes under start_dir and report them.

    start_dir -- directory whose tree is walked with os.walk().

    Every file with a non-zero size contributes one sample: the first
    decimal digit of its size in bytes.  Files that cannot be stat()ed are
    skipped, so the scan is best-effort.  After the walk the sample count,
    the start directory and the elapsed time are printed, followed by the
    comparison table produced by print_stats().
    """
    count = 0
    buckets = [0] * 9
    start_time = time.time()

    for root, _dirs, files in os.walk(start_dir):
        for fname in files:
            try:
                size = os.stat(os.path.join(root, fname)).st_size
            except OSError:
                # Broken symlinks, permission errors or files removed during
                # the walk: skip the entry.  Only OSError is caught so that
                # Ctrl-C and programming errors are no longer swallowed.
                continue
            if size == 0:
                # An empty file has no leading digit in 1-9; without this
                # guard it would index buckets[-1] and count as a 9.
                continue
            count += 1
            buckets[int(str(size)[0]) - 1] += 1
            if count % 512 == 0:
                # Write without a trailing newline so that '\r' really
                # rewrites the progress line in place.
                sys.stdout.write('\rExamined files: %d' % count)
                sys.stdout.flush()

    if count >= 512:
        # Terminate the in-place progress line before the summary.
        sys.stdout.write('\n')
    print('Total samples: %d' % count)
    print('Start directory: %s' % start_dir)
    print('Total time: %0.2f seconds' % (time.time() - start_time))
    print_stats(buckets)
if __name__ == '__main__':
    # Script entry point: expects the directory to scan as the first
    # command-line argument.  Without one, or with -h, print the usage text
    # and exit with status 1.
    cli_args = sys.argv[1:]
    if len(cli_args) == 0 or cli_args[0] == '-h':
        script_name = os.path.basename(sys.argv[0])
        print('Examine file sizes from given directory to check Benford\'s Law.')
        print('For more information see: http://mathworld.wolfram.com/BenfordsLaw.html')
        print('Usage: %s <path>' % script_name)
        print('Example: %s c:/' % script_name)
        sys.exit(1)

    do_stats(cli_args[0])
    sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment