Skip to content

Instantly share code, notes, and snippets.

@bheni
Last active February 2, 2020 20:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bheni/6666ea6ad08e42968bf5f672d7104738 to your computer and use it in GitHub Desktop.
Save bheni/6666ea6ad08e42968bf5f672d7104738 to your computer and use it in GitHub Desktop.
# calc_half_agi_percentages.py is a simple script that uses doltpy to retrieve data from the Dolt irs-soi dataset
# and calculates an approximate percentage of tax returns that make up half of the AGI for each state, for the tax
# years 2011 through 2017 and outputs a csv for each year.
#
# setup:
# Install Dolt. Dolt documentation and installation instructions are at https://github.com/liquidata-inc/dolt
# Clone the irs-soi dataset from dolt by running "dolt clone Liquidata/irs-soi" from the directory where you want the
# data to be cloned to.
# Install doltpy. doltpy documentation and installation instructions are at https://github.com/liquidata-inc/doltpy
#
# usage:
# python calc_half_agi_percentages.py <irs-soi-repo-dir> <output-dir>
# irs-soi-repo-dir is the directory that you cloned the data into
# output-dir is the directory where you want the output csv files placed
#
# methodology:
# for each tax year change to the appropriate dolt branch. Query the allnoagi data for zip "00000" which will give
# totals for all zips in the state. Then query the allagi data along with the average agi for each
# agi category for each zip sorted by the average agi descending. For each state take the ordered query results and
# calculate the approximate minimum number of returns that would make up half of the states total agi by using the
# average agi for each return. Compare with total return count to get a percentage.
import os.path
import sys
import doltpy.core as dpc
# constants
BRANCHES = [str(branch) for branch in range(2011, 2018)]
STATE_TOTAL_AGI_QUERY = '''
SELECT state, adjusted_gross_income, return_count
FROM allnoagi
WHERE zip = '00000';'''
PER_ZIP_AGI_QUERY = '''
SELECT state,
zip,
agi_category,
adjusted_gross_income,
return_count,
CAST(adjusted_gross_income AS DECIMAL(48,16))/CAST(return_count AS DECIMAL(48,16)) AS avg_agi
FROM allagi
WHERE zip != '00000' AND return_count != 0
ORDER BY state, avg_agi DESC;'''
STATE_COL_IDX = 0
ZIP_COL_IDX = 1
AGI_COL_IDX = 3
RET_CNT_COL_IDX = 4
AVG_AGI_COL_IDX = 5
# StateData is a helper class which is initialized with the state totals, and then has the per zip data added to it.
# With that data it then has the ability to calculate the minimum number of returns which represent half the states agi.
class StateData(object):
def __init__(self, abbrev, total_agi, total_count):
self.abbrev = abbrev
self.total_agi = total_agi
self.total_count = total_count
self.zip_data = []
self.min_returns_with_half_agi = 0
def calc_min_returns_with_half_agi(self):
remaining = self.total_agi/2.0
for curr_zip_agi in self.zip_data:
if remaining > curr_zip_agi[AGI_COL_IDX]:
remaining -= curr_zip_agi[AGI_COL_IDX]
self.min_returns_with_half_agi += curr_zip_agi[RET_CNT_COL_IDX]
else:
avg_agi = curr_zip_agi[AVG_AGI_COL_IDX]
self.min_returns_with_half_agi += int((float(remaining) + avg_agi - 0.1) / avg_agi)
break
# parse command line parameters
if len(sys.argv) < 3:
print("usage: python calc_half_agi_percentages.py <irs-soi-repo-dir> <output-dir>")
sys.exit(1)
repo_dir = sys.argv[1]
if not os.path.exists(repo_dir):
print("%s is not a valid path" % repo_dir)
sys.exit(1)
out_dir = sys.argv[2]
if not os.path.exists(out_dir):
print("%s is not a valid path" % out_dir)
sys.exit(1)
db = dpc.Dolt(repo_dir)
db.start_server()
# loop through all the branches
min_percentage, max_percentage = 100, 0
for branch in BRANCHES:
if db.get_current_branch() != branch:
db.checkout(branch)
print("branch is " + branch)
# query the totals
state_data = {}
cursor = db.query_server(STATE_TOTAL_AGI_QUERY)
rows = cursor.fetchall(
cursor.close()
# initialize a StateData object for each state based on totals
for row in rows:
state, agi, count = row[0], row[1], row[2]
state_data[state] = StateData(state, agi, count)
# query each zip and agi category
cursor = db.query_server(PER_ZIP_AGI_QUERY)
rows = cursor.fetchall()
cursor.close()
# add the zip data to each StateData object
for row in rows:
state = row[0]
state_data[state].zip_data.append(row)
# open a file to write the output csv
with open(os.path.join(out_dir, branch + ".csv"), 'w+') as f:
f.write("state,percentage\n")
# for each state calculate the approximate minimum number of returns needed to make up half of the states
# total agi, convert that to a percentage of returns, and output to the csv.
for state, data in state_data.items():
data.calc_min_returns_with_half_agi()
half_agi_percentage = (data.min_returns_with_half_agi / float(data.total_count)) * 100.0
if half_agi_percentage < min_percentage:
min_percentage = half_agi_percentage
if half_agi_percentage > max_percentage:
max_percentage = half_agi_percentage
f.write("%s,%.02f\n" % (state, half_agi_percentage))
# output the all time minimum percentage and maximum percentage in order to be able to create a scale that can be used
# across all states and tax years
print("min_percentage: %.02f, max_percentage: %.02f" % (min_percentage, max_percentage))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment