# dpiponi/benford.py

## benford.py
import csv
import math
import matplotlib.pyplot as plt
import locale

def frac(x):
    """Return the fractional part of ``x``, i.e. ``x - floor(x)``.

    Because the floor is subtracted (rather than truncating toward zero),
    the result is never negative: ``frac(-0.25)`` is ``0.75``.
    """
    whole_part = math.floor(x)
    return x - whole_part

def first_digit(x):
    """Return the leading (most significant) decimal digit of ``x``.

    The digit is read from the decimal text of the number rather than
    computed through ``log10``/``pow`` plus a rounding fudge factor; the
    fudge factor pushed values just below a digit boundary over it (for
    example 999999 came out as 10 and 1.99999 as 2).

    Args:
        x: A positive, finite number.

    Returns:
        An ``int`` in ``1..9``.

    Raises:
        ValueError: If ``x`` is zero, negative, NaN or infinite.
    """
    # ``not x > 0`` (instead of ``x <= 0``) also rejects NaN.
    if not x > 0:
        raise ValueError("first_digit() requires a positive number, got %r" % (x,))
    if isinstance(x, int):
        # Integers are rendered exactly, however large they are.
        text = repr(int(x))
    else:
        value = float(x)
        if math.isinf(value):
            raise ValueError("first_digit() requires a finite number, got %r" % (x,))
        # repr() gives the shortest decimal string that round-trips, so a
        # value parsed from "0.3" is scanned as "0.3", not 0.29999...
        text = repr(value)
    # The first non-zero digit of the text is the leading digit; this also
    # covers forms such as "0.00345" and "6.02e+23".
    for char in text:
        if char in "123456789":
            return int(char)
    raise ValueError("no significant digit found in %r" % (x,))

# https://exoplanetarchive.ipac.caltech.edu
def orbital_periods(filename='PS_2020.11.27_15.14.11.csv'):
    """Yield the ``pl_orbper`` value of every row of a CSV file as a float.

    The first row of the file is taken as the heading row and is used to
    locate the ``pl_orbper`` column.  Rows whose cell in that column is
    missing, empty or not a number are skipped, so a single gap in the
    data no longer ends the whole sequence early.

    Args:
        filename: Path of the CSV file to read.  Defaults to the file name
            that was previously hard-coded.

    Yields:
        float: One value per usable row, in file order.

    Raises:
        KeyError: If the heading row has no ``pl_orbper`` column.
        OSError: If the file cannot be opened.
    """
    # newline='' is what the csv module asks for when given a file object.
    with open(filename, newline='') as file:
        rows = csv.reader(file)
        headings = next(rows, None)
        if headings is None:  # completely empty file: nothing to yield
            return
        # Map heading -> position without an arbitrary cap on column count.
        column = {name: pos for pos, name in enumerate(headings)}['pl_orbper']
        for row in rows:
            if len(row) <= column or not row[column]:
                continue
            try:
                period = float(row[column])
            except ValueError:
                continue
            yield period

# https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-cities-and-towns.html#tables
def town_pops(filename='SUB-IP-EST2019-ANNRES.csv'):
    """Yield the integer found in the second column of each CSV row.

    The first row is discarded as a heading row.  Reading stops at the
    first row whose second column is missing or empty.  Cells that are not
    integers (after removing ``,`` thousands separators) are skipped.

    Numbers are parsed by stripping commas directly instead of going
    through ``locale.atoi``; the result is the same for ``en_US``-style
    numbers, but the function no longer requires that locale to be
    installed and no longer changes the process-wide locale.

    Args:
        filename: Path of the CSV file to read.  Defaults to the file name
            that was previously hard-coded.

    Yields:
        int: One value per parsable row, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    column = 1
    with open(filename, newline='') as file:
        rows = csv.reader(file)
        next(rows, None)  # discard the heading row (if any)
        for row in rows:
            if len(row) <= column or not row[column]:
                return
            try:
                population = int(row[column].replace(',', ''))
            except ValueError:
                # Non-numeric cell (e.g. a label row): ignore it.
                continue
            # Yield outside the try block so that closing the generator
            # is never swallowed by the error handling above.
            yield population

# http://www.genomesize.com
def genome_sizes(filename='genome_size_data_271120_17_16_11.csv'):
    """Yield the ``C-value`` column of a CSV file as floats.

    The first row is taken as the heading row and is used to locate the
    ``C-value`` column.  Rows that are too short or whose cell is not a
    number are skipped.  The per-row debug ``print`` was removed.

    Args:
        filename: Path of the CSV file to read.  Defaults to the file name
            that was previously hard-coded.

    Yields:
        float: One value per parsable row, in file order.

    Raises:
        KeyError: If the heading row has no ``C-value`` column.
        OSError: If the file cannot be opened.
    """
    with open(filename, newline='') as file:
        rows = csv.reader(file)
        headings = next(rows, None)
        if headings is None:  # completely empty file: nothing to yield
            return
        # Map heading -> position without an arbitrary cap on column count.
        column = {name: pos for pos, name in enumerate(headings)}['C-value']
        for row in rows:
            try:
                c_value = float(row[column])
            except (IndexError, ValueError):
                continue
            # Yield outside the try block so that closing the generator
            # is never swallowed by the error handling above.
            yield c_value

# https://www.astronexus.com/hyg
def stellar_distances(filename='hygdata_v3.csv'):
    """Yield the usable values of the ``dist`` column of a CSV file.

    The first row is taken as the heading row and is used to locate the
    ``dist`` column.  A value is yielded only when it

    * parses as a float,
    * is greater than zero (this eliminates Sol), and
    * is not written with a trailing ``0000`` (such "exact" numbers are
      not trusted).

    The per-row debug ``print`` was removed.

    Args:
        filename: Path of the CSV file to read.  Defaults to the file name
            that was previously hard-coded.

    Yields:
        float: One value per accepted row, in file order.

    Raises:
        KeyError: If the heading row has no ``dist`` column.
        OSError: If the file cannot be opened.
    """
    with open(filename, newline='') as file:
        rows = csv.reader(file)
        headings = next(rows, None)
        if headings is None:  # completely empty file: nothing to yield
            return
        # Map heading -> position without an arbitrary cap on column count.
        column = {name: pos for pos, name in enumerate(headings)}['dist']
        for row in rows:
            try:
                text = row[column]
                dist = float(text)
            except (IndexError, ValueError):
                continue
            # Yield outside the try block so that closing the generator
            # is never swallowed by the error handling above.
            if dist > 0 and not text.endswith('0000'):
                yield dist

# periods = town_pops()
# # periods = orbital_periods()
# first_digits = list(map(first_digit, periods))
# print(first_digits)
# n = len(first_digits)

def pdf(x):
    """Return ``log10(x + 1) - log10(x)``.

    The plotting code below multiplies this by the sample size to get the
    expected count for leading digit ``x`` (Benford's law).
    """
    upper = math.log10(x + 1)
    lower = math.log10(x)
    return upper - lower

# (subplot title, generator function) pairs; one subplot is drawn per entry.
datasets = [
    ('Exoplanet Orbital Periods (in days)', orbital_periods),
    ('US Town Populations', town_pops),
    ('Organism genome size (in picograms)', genome_sizes),
    ('Stellar distances from Hipparcos (in parsecs)', stellar_distances),
]

# Draw a 2x2 grid: for each dataset, a histogram of observed leading digits
# with the expected counts n * pdf(d) overlaid as red bars.
plt.figure(figsize=(20, 12))
for i, (name, gen) in enumerate(datasets, start=1):
    plt.subplot(2, 2, i)
    first_digits = list(map(first_digit, gen()))
    n = len(first_digits)
    plt.hist(first_digits, bins=list(range(0, 11)), width=0.5)
    plt.bar(list(range(1, 10)), [n * pdf(d) for d in range(1, 10)], color='r', width=0.5)
    plt.title(name + " (N=" + str(n) + ")")
plt.show()
	import csv
	import math
	import matplotlib.pyplot as plt
	import locale

	def frac(x):
	    """Return the fractional part of ``x``, i.e. ``x - floor(x)``.

	    Because the floor is subtracted (rather than truncating toward
	    zero), the result is never negative: ``frac(-0.25)`` is ``0.75``.
	    """
	    return x - math.floor(x)

	def first_digit(x):
	    """Return the leading (most significant) decimal digit of ``x``.

	    The digit is read from the decimal text of the number rather than
	    computed through ``log10``/``pow`` plus a rounding fudge factor; the
	    fudge factor pushed values just below a digit boundary over it (for
	    example 999999 came out as 10 and 1.99999 as 2).

	    Args:
	        x: A positive, finite number.

	    Returns:
	        An ``int`` in ``1..9``.

	    Raises:
	        ValueError: If ``x`` is zero, negative, NaN or infinite.
	    """
	    # ``not x > 0`` (instead of ``x <= 0``) also rejects NaN.
	    if not x > 0:
	        raise ValueError("first_digit() requires a positive number, got %r" % (x,))
	    if isinstance(x, int):
	        # Integers are rendered exactly, however large they are.
	        text = repr(int(x))
	    else:
	        value = float(x)
	        if math.isinf(value):
	            raise ValueError("first_digit() requires a finite number, got %r" % (x,))
	        # repr() gives the shortest decimal string that round-trips, so a
	        # value parsed from "0.3" is scanned as "0.3", not 0.29999...
	        text = repr(value)
	    # The first non-zero digit of the text is the leading digit; this
	    # also covers forms such as "0.00345" and "6.02e+23".
	    for char in text:
	        if char in "123456789":
	            return int(char)
	    raise ValueError("no significant digit found in %r" % (x,))

	# https://exoplanetarchive.ipac.caltech.edu
	def orbital_periods(filename='PS_2020.11.27_15.14.11.csv'):
	    """Yield the ``pl_orbper`` value of every row of a CSV file as a float.

	    The first row of the file is taken as the heading row and is used to
	    locate the ``pl_orbper`` column.  Rows whose cell in that column is
	    missing, empty or not a number are skipped, so a single gap in the
	    data no longer ends the whole sequence early.

	    Args:
	        filename: Path of the CSV file to read.  Defaults to the file
	            name that was previously hard-coded.

	    Yields:
	        float: One value per usable row, in file order.

	    Raises:
	        KeyError: If the heading row has no ``pl_orbper`` column.
	        OSError: If the file cannot be opened.
	    """
	    # newline='' is what the csv module asks for when given a file object.
	    with open(filename, newline='') as file:
	        rows = csv.reader(file)
	        headings = next(rows, None)
	        if headings is None:  # completely empty file: nothing to yield
	            return
	        # Map heading -> position without a cap on the column count.
	        column = {name: pos for pos, name in enumerate(headings)}['pl_orbper']
	        for row in rows:
	            if len(row) <= column or not row[column]:
	                continue
	            try:
	                period = float(row[column])
	            except ValueError:
	                continue
	            yield period

	# https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-cities-and-towns.html#tables
	def town_pops(filename='SUB-IP-EST2019-ANNRES.csv'):
	    """Yield the integer found in the second column of each CSV row.

	    The first row is discarded as a heading row.  Reading stops at the
	    first row whose second column is missing or empty.  Cells that are
	    not integers (after removing ``,`` thousands separators) are skipped.

	    Numbers are parsed by stripping commas directly instead of going
	    through ``locale.atoi``; the result is the same for ``en_US``-style
	    numbers, but the function no longer requires that locale to be
	    installed and no longer changes the process-wide locale.

	    Args:
	        filename: Path of the CSV file to read.  Defaults to the file
	            name that was previously hard-coded.

	    Yields:
	        int: One value per parsable row, in file order.

	    Raises:
	        OSError: If the file cannot be opened.
	    """
	    column = 1
	    with open(filename, newline='') as file:
	        rows = csv.reader(file)
	        next(rows, None)  # discard the heading row (if any)
	        for row in rows:
	            if len(row) <= column or not row[column]:
	                return
	            try:
	                population = int(row[column].replace(',', ''))
	            except ValueError:
	                # Non-numeric cell (e.g. a label row): ignore it.
	                continue
	            # Yield outside the try block so that closing the generator
	            # is never swallowed by the error handling above.
	            yield population

	# http://www.genomesize.com
	def genome_sizes(filename='genome_size_data_271120_17_16_11.csv'):
	    """Yield the ``C-value`` column of a CSV file as floats.

	    The first row is taken as the heading row and is used to locate the
	    ``C-value`` column.  Rows that are too short or whose cell is not a
	    number are skipped.  The per-row debug ``print`` was removed.

	    Args:
	        filename: Path of the CSV file to read.  Defaults to the file
	            name that was previously hard-coded.

	    Yields:
	        float: One value per parsable row, in file order.

	    Raises:
	        KeyError: If the heading row has no ``C-value`` column.
	        OSError: If the file cannot be opened.
	    """
	    with open(filename, newline='') as file:
	        rows = csv.reader(file)
	        headings = next(rows, None)
	        if headings is None:  # completely empty file: nothing to yield
	            return
	        # Map heading -> position without a cap on the column count.
	        column = {name: pos for pos, name in enumerate(headings)}['C-value']
	        for row in rows:
	            try:
	                c_value = float(row[column])
	            except (IndexError, ValueError):
	                continue
	            # Yield outside the try block so that closing the generator
	            # is never swallowed by the error handling above.
	            yield c_value

	# https://www.astronexus.com/hyg
	def stellar_distances(filename='hygdata_v3.csv'):
	    """Yield the usable values of the ``dist`` column of a CSV file.

	    The first row is taken as the heading row and is used to locate the
	    ``dist`` column.  A value is yielded only when it

	    * parses as a float,
	    * is greater than zero (this eliminates Sol), and
	    * is not written with a trailing ``0000`` (such "exact" numbers are
	      not trusted).

	    The per-row debug ``print`` was removed.

	    Args:
	        filename: Path of the CSV file to read.  Defaults to the file
	            name that was previously hard-coded.

	    Yields:
	        float: One value per accepted row, in file order.

	    Raises:
	        KeyError: If the heading row has no ``dist`` column.
	        OSError: If the file cannot be opened.
	    """
	    with open(filename, newline='') as file:
	        rows = csv.reader(file)
	        headings = next(rows, None)
	        if headings is None:  # completely empty file: nothing to yield
	            return
	        # Map heading -> position without a cap on the column count.
	        column = {name: pos for pos, name in enumerate(headings)}['dist']
	        for row in rows:
	            try:
	                text = row[column]
	                dist = float(text)
	            except (IndexError, ValueError):
	                continue
	            # Yield outside the try block so that closing the generator
	            # is never swallowed by the error handling above.
	            if dist > 0 and not text.endswith('0000'):
	                yield dist

	# periods = town_pops()
	# # periods = orbital_periods()
	# first_digits = list(map(first_digit, periods))
	# print(first_digits)
	# n = len(first_digits)

	def pdf(x):
	    """Return ``log10(x + 1) - log10(x)``.

	    The plotting code below multiplies this by the sample size to get
	    the expected count for leading digit ``x`` (Benford's law).
	    """
	    return math.log10(x + 1) - math.log10(x)

	# (subplot title, generator function) pairs; one subplot is drawn per entry.
	datasets = [
	    ('Exoplanet Orbital Periods (in days)', orbital_periods),
	    ('US Town Populations', town_pops),
	    ('Organism genome size (in picograms)', genome_sizes),
	    ('Stellar distances from Hipparcos (in parsecs)', stellar_distances),
	]

	# Draw a 2x2 grid: for each dataset, a histogram of observed leading
	# digits with the expected counts n * pdf(d) overlaid as red bars.
	plt.figure(figsize=(20, 12))
	for i, (name, gen) in enumerate(datasets, start=1):
	    plt.subplot(2, 2, i)
	    first_digits = list(map(first_digit, gen()))
	    n = len(first_digits)
	    plt.hist(first_digits, bins=list(range(0, 11)), width=0.5)
	    plt.bar(list(range(1, 10)), [n * pdf(d) for d in range(1, 10)], color='r', width=0.5)
	    plt.title(name + " (N=" + str(n) + ")")
	plt.show()