bheni/calc_half_agi_percentages.py

## calc_half_agi_percentages.py
# calc_half_agi_percentages.py is a simple script that uses doltpy to retrieve data from the Dolt irs-soi dataset
# and calculates an approximate percentage of tax returns that make up half of the AGI for each state, for the tax
# years 2011 through 2017 and outputs a csv for each year.
#
# setup:
# Install Dolt. Dolt documentation and installation instructions are at https://github.com/liquidata-inc/dolt
# Clone the irs-soi dataset from dolt by running "dolt clone Liquidata/irs-soi" from the directory where you want the
#   data to be cloned to.
# Install doltpy.  doltpy documentation and installation instructions are at https://github.com/liquidata-inc/doltpy
#
# usage:
#   python calc_half_agi_percentages.py <irs-soi-repo-dir> <output-dir>
#     irs-soi-repo-dir is the directory that you cloned the data into
#     output-dir is the directory where you want the output csv files placed
#
# methodology:
#   for each tax year change to the appropriate dolt branch. Query the allnoagi data for zip "00000" which will give
#   totals for all zips in the state.  Then query the allagi data along with the average agi for each
#   agi category for each zip sorted by the average agi descending. For each state take the ordered query results and
#   calculate the approximate minimum number of returns that would make up half of the states total agi by using the
#   average agi for each return.  Compare with total return count to get a percentage.
import os.path
import sys
import doltpy.core as dpc

# constants
BRANCHES = [str(branch) for branch in range(2011, 2018)]

STATE_TOTAL_AGI_QUERY = '''
SELECT state, adjusted_gross_income, return_count
FROM allnoagi
WHERE zip = '00000';'''

PER_ZIP_AGI_QUERY = '''
SELECT  state,
        zip,
        agi_category,
        adjusted_gross_income,
        return_count,
        CAST(adjusted_gross_income AS DECIMAL(48,16))/CAST(return_count AS DECIMAL(48,16)) AS avg_agi
FROM allagi
WHERE zip != '00000' AND return_count != 0
ORDER BY state, avg_agi DESC;'''
STATE_COL_IDX = 0
ZIP_COL_IDX = 1
AGI_COL_IDX = 3
RET_CNT_COL_IDX = 4
AVG_AGI_COL_IDX = 5


# StateData is a helper class which is initialized with the state totals, and then has the per zip data added to it.
# With that data it then has the ability to calculate the minimum number of returns which represent half the states agi.
class StateData(object):
    def __init__(self, abbrev, total_agi, total_count):
        self.abbrev = abbrev
        self.total_agi = total_agi
        self.total_count = total_count
        self.zip_data = []
        self.min_returns_with_half_agi = 0

    def calc_min_returns_with_half_agi(self):
        remaining = self.total_agi/2.0

        for curr_zip_agi in self.zip_data:
            if remaining > curr_zip_agi[AGI_COL_IDX]:
                remaining -= curr_zip_agi[AGI_COL_IDX]
                self.min_returns_with_half_agi += curr_zip_agi[RET_CNT_COL_IDX]
            else:
                avg_agi = curr_zip_agi[AVG_AGI_COL_IDX]
                self.min_returns_with_half_agi += int((float(remaining) + avg_agi - 0.1) / avg_agi)
                break


# parse command line parameters
if len(sys.argv) < 3:
    print("usage: python calc_half_agi_percentages.py <irs-soi-repo-dir> <output-dir>")
    sys.exit(1)

repo_dir = sys.argv[1]

if not os.path.exists(repo_dir):
    print("%s is not a valid path" % repo_dir)
    sys.exit(1)

out_dir = sys.argv[2]

if not os.path.exists(out_dir):
    print("%s is not a valid path" % out_dir)
    sys.exit(1)

db = dpc.Dolt(repo_dir)
db.start_server()

# loop through all the branches
min_percentage, max_percentage = 100, 0
for branch in BRANCHES:
    if db.get_current_branch() != branch:
        db.checkout(branch)

    print("branch is " + branch)

    # query the totals
    state_data = {}
    cursor = db.query_server(STATE_TOTAL_AGI_QUERY)
    rows = cursor.fetchall(
    cursor.close()

    # initialize a StateData object for each state based on totals
    for row in rows:
        state, agi, count = row[0], row[1], row[2]
        state_data[state] = StateData(state, agi, count)

    # query each zip and agi category
    cursor = db.query_server(PER_ZIP_AGI_QUERY)
    rows = cursor.fetchall()
    cursor.close()

    # add the zip data to each StateData object
    for row in rows:
        state = row[0]
        state_data[state].zip_data.append(row)

    # open a file to write the output csv
    with open(os.path.join(out_dir, branch + ".csv"), 'w+') as f:
        f.write("state,percentage\n")

        # for each state calculate the approximate minimum number of returns needed to make up half of the states
        # total agi, convert that to a percentage of returns, and output to the csv.
        for state, data in state_data.items():
            data.calc_min_returns_with_half_agi()
            half_agi_percentage = (data.min_returns_with_half_agi / float(data.total_count)) * 100.0

            if half_agi_percentage < min_percentage:
                min_percentage = half_agi_percentage

            if half_agi_percentage > max_percentage:
                max_percentage = half_agi_percentage

            f.write("%s,%.02f\n" % (state, half_agi_percentage))

# output the all time minimum percentage and maximum percentage in order to be able to create a scale that can be used
# across all states and tax years
print("min_percentage: %.02f, max_percentage: %.02f" % (min_percentage, max_percentage))
	# calc_half_agi_percentages.py is a simple script that uses doltpy to retrieve data from the Dolt irs-soi dataset
	# and calculates an approximate percentage of tax returns that make up half of the AGI for each state, for the tax
	# years 2011 through 2017 and outputs a csv for each year.
	#
	# setup:
	# Install Dolt. Dolt documentation and installation instructions are at https://github.com/liquidata-inc/dolt
	# Clone the irs-soi dataset from dolt by running "dolt clone Liquidata/irs-soi" from the directory where you want the
	# data to be cloned to.
	# Install doltpy. doltpy documentation and installation instructions are at https://github.com/liquidata-inc/doltpy
	#
	# usage:
	# python calc_half_agi_percentages.py <irs-soi-repo-dir> <output-dir>
	# irs-soi-repo-dir is the directory that you cloned the data into
	# output-dir is the directory where you want the output csv files placed
	#
	# methodology:
	# for each tax year change to the appropriate dolt branch. Query the allnoagi data for zip "00000" which will give
	# totals for all zips in the state. Then query the allagi data along with the average agi for each
	# agi category for each zip sorted by the average agi descending. For each state take the ordered query results and
	# calculate the approximate minimum number of returns that would make up half of the states total agi by using the
	# average agi for each return. Compare with total return count to get a percentage.
	import os.path
	import sys
	import doltpy.core as dpc

	# constants
	BRANCHES = [str(branch) for branch in range(2011, 2018)]

	STATE_TOTAL_AGI_QUERY = '''
	SELECT state, adjusted_gross_income, return_count
	FROM allnoagi
	WHERE zip = '00000';'''

	PER_ZIP_AGI_QUERY = '''
	SELECT state,
	zip,
	agi_category,
	adjusted_gross_income,
	return_count,
	CAST(adjusted_gross_income AS DECIMAL(48,16))/CAST(return_count AS DECIMAL(48,16)) AS avg_agi
	FROM allagi
	WHERE zip != '00000' AND return_count != 0
	ORDER BY state, avg_agi DESC;'''
	STATE_COL_IDX = 0
	ZIP_COL_IDX = 1
	AGI_COL_IDX = 3
	RET_CNT_COL_IDX = 4
	AVG_AGI_COL_IDX = 5


	# StateData is a helper class which is initialized with the state totals, and then has the per zip data added to it.
	# With that data it then has the ability to calculate the minimum number of returns which represent half the states agi.
	class StateData(object):
	def __init__(self, abbrev, total_agi, total_count):
	self.abbrev = abbrev
	self.total_agi = total_agi
	self.total_count = total_count
	self.zip_data = []
	self.min_returns_with_half_agi = 0

	def calc_min_returns_with_half_agi(self):
	remaining = self.total_agi/2.0

	for curr_zip_agi in self.zip_data:
	if remaining > curr_zip_agi[AGI_COL_IDX]:
	remaining -= curr_zip_agi[AGI_COL_IDX]
	self.min_returns_with_half_agi += curr_zip_agi[RET_CNT_COL_IDX]
	else:
	avg_agi = curr_zip_agi[AVG_AGI_COL_IDX]
	self.min_returns_with_half_agi += int((float(remaining) + avg_agi - 0.1) / avg_agi)
	break


	# parse command line parameters
	if len(sys.argv) < 3:
	print("usage: python calc_half_agi_percentages.py <irs-soi-repo-dir> <output-dir>")
	sys.exit(1)

	repo_dir = sys.argv[1]

	if not os.path.exists(repo_dir):
	print("%s is not a valid path" % repo_dir)
	sys.exit(1)

	out_dir = sys.argv[2]

	if not os.path.exists(out_dir):
	print("%s is not a valid path" % out_dir)
	sys.exit(1)

	db = dpc.Dolt(repo_dir)
	db.start_server()

	# loop through all the branches
	min_percentage, max_percentage = 100, 0
	for branch in BRANCHES:
	if db.get_current_branch() != branch:
	db.checkout(branch)

	print("branch is " + branch)

	# query the totals
	state_data = {}
	cursor = db.query_server(STATE_TOTAL_AGI_QUERY)
	rows = cursor.fetchall(
	cursor.close()

	# initialize a StateData object for each state based on totals
	for row in rows:
	state, agi, count = row[0], row[1], row[2]
	state_data[state] = StateData(state, agi, count)

	# query each zip and agi category
	cursor = db.query_server(PER_ZIP_AGI_QUERY)
	rows = cursor.fetchall()
	cursor.close()

	# add the zip data to each StateData object
	for row in rows:
	state = row[0]
	state_data[state].zip_data.append(row)

	# open a file to write the output csv
	with open(os.path.join(out_dir, branch + ".csv"), 'w+') as f:
	f.write("state,percentage\n")

	# for each state calculate the approximate minimum number of returns needed to make up half of the states
	# total agi, convert that to a percentage of returns, and output to the csv.
	for state, data in state_data.items():
	data.calc_min_returns_with_half_agi()
	half_agi_percentage = (data.min_returns_with_half_agi / float(data.total_count)) * 100.0

	if half_agi_percentage < min_percentage:
	min_percentage = half_agi_percentage

	if half_agi_percentage > max_percentage:
	max_percentage = half_agi_percentage

	f.write("%s,%.02f\n" % (state, half_agi_percentage))

	# output the all time minimum percentage and maximum percentage in order to be able to create a scale that can be used
	# across all states and tax years
	print("min_percentage: %.02f, max_percentage: %.02f" % (min_percentage, max_percentage))