Last active
February 2, 2020 20:11
-
-
Save bheni/6666ea6ad08e42968bf5f672d7104738 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# calc_half_agi_percentages.py is a simple script that uses doltpy to retrieve data from the Dolt irs-soi dataset | |
# and calculates an approximate percentage of tax returns that make up half of the AGI for each state, for the tax | |
# years 2011 through 2017 and outputs a csv for each year. | |
# | |
# setup: | |
# Install Dolt. Dolt documentation and installation instructions are at https://github.com/liquidata-inc/dolt | |
# Clone the irs-soi dataset from dolt by running "dolt clone Liquidata/irs-soi" from the directory where you want the | |
# data to be cloned to. | |
# Install doltpy. doltpy documentation and installation instructions are at https://github.com/liquidata-inc/doltpy | |
# | |
# usage: | |
# python calc_half_agi_percentages.py <irs-soi-repo-dir> <output-dir> | |
# irs-soi-repo-dir is the directory that you cloned the data into | |
# output-dir is the directory where you want the output csv files placed | |
# | |
# methodology: | |
# for each tax year change to the appropriate dolt branch. Query the allnoagi data for zip "00000" which will give | |
# totals for all zips in the state. Then query the allagi data along with the average agi for each | |
# agi category for each zip sorted by the average agi descending. For each state take the ordered query results and | |
# calculate the approximate minimum number of returns that would make up half of the states total agi by using the | |
# average agi for each return. Compare with total return count to get a percentage. | |
import os.path | |
import sys | |
import doltpy.core as dpc | |
# constants | |
BRANCHES = [str(branch) for branch in range(2011, 2018)] | |
STATE_TOTAL_AGI_QUERY = ''' | |
SELECT state, adjusted_gross_income, return_count | |
FROM allnoagi | |
WHERE zip = '00000';''' | |
PER_ZIP_AGI_QUERY = ''' | |
SELECT state, | |
zip, | |
agi_category, | |
adjusted_gross_income, | |
return_count, | |
CAST(adjusted_gross_income AS DECIMAL(48,16))/CAST(return_count AS DECIMAL(48,16)) AS avg_agi | |
FROM allagi | |
WHERE zip != '00000' AND return_count != 0 | |
ORDER BY state, avg_agi DESC;''' | |
STATE_COL_IDX = 0 | |
ZIP_COL_IDX = 1 | |
AGI_COL_IDX = 3 | |
RET_CNT_COL_IDX = 4 | |
AVG_AGI_COL_IDX = 5 | |
# StateData is a helper class which is initialized with the state totals, and then has the per zip data added to it. | |
# With that data it then has the ability to calculate the minimum number of returns which represent half the states agi. | |
class StateData(object): | |
def __init__(self, abbrev, total_agi, total_count): | |
self.abbrev = abbrev | |
self.total_agi = total_agi | |
self.total_count = total_count | |
self.zip_data = [] | |
self.min_returns_with_half_agi = 0 | |
def calc_min_returns_with_half_agi(self): | |
remaining = self.total_agi/2.0 | |
for curr_zip_agi in self.zip_data: | |
if remaining > curr_zip_agi[AGI_COL_IDX]: | |
remaining -= curr_zip_agi[AGI_COL_IDX] | |
self.min_returns_with_half_agi += curr_zip_agi[RET_CNT_COL_IDX] | |
else: | |
avg_agi = curr_zip_agi[AVG_AGI_COL_IDX] | |
self.min_returns_with_half_agi += int((float(remaining) + avg_agi - 0.1) / avg_agi) | |
break | |
# parse command line parameters | |
if len(sys.argv) < 3: | |
print("usage: python calc_half_agi_percentages.py <irs-soi-repo-dir> <output-dir>") | |
sys.exit(1) | |
repo_dir = sys.argv[1] | |
if not os.path.exists(repo_dir): | |
print("%s is not a valid path" % repo_dir) | |
sys.exit(1) | |
out_dir = sys.argv[2] | |
if not os.path.exists(out_dir): | |
print("%s is not a valid path" % out_dir) | |
sys.exit(1) | |
db = dpc.Dolt(repo_dir) | |
db.start_server() | |
# loop through all the branches | |
min_percentage, max_percentage = 100, 0 | |
for branch in BRANCHES: | |
if db.get_current_branch() != branch: | |
db.checkout(branch) | |
print("branch is " + branch) | |
# query the totals | |
state_data = {} | |
cursor = db.query_server(STATE_TOTAL_AGI_QUERY) | |
rows = cursor.fetchall( | |
cursor.close() | |
# initialize a StateData object for each state based on totals | |
for row in rows: | |
state, agi, count = row[0], row[1], row[2] | |
state_data[state] = StateData(state, agi, count) | |
# query each zip and agi category | |
cursor = db.query_server(PER_ZIP_AGI_QUERY) | |
rows = cursor.fetchall() | |
cursor.close() | |
# add the zip data to each StateData object | |
for row in rows: | |
state = row[0] | |
state_data[state].zip_data.append(row) | |
# open a file to write the output csv | |
with open(os.path.join(out_dir, branch + ".csv"), 'w+') as f: | |
f.write("state,percentage\n") | |
# for each state calculate the approximate minimum number of returns needed to make up half of the states | |
# total agi, convert that to a percentage of returns, and output to the csv. | |
for state, data in state_data.items(): | |
data.calc_min_returns_with_half_agi() | |
half_agi_percentage = (data.min_returns_with_half_agi / float(data.total_count)) * 100.0 | |
if half_agi_percentage < min_percentage: | |
min_percentage = half_agi_percentage | |
if half_agi_percentage > max_percentage: | |
max_percentage = half_agi_percentage | |
f.write("%s,%.02f\n" % (state, half_agi_percentage)) | |
# output the all time minimum percentage and maximum percentage in order to be able to create a scale that can be used | |
# across all states and tax years | |
print("min_percentage: %.02f, max_percentage: %.02f" % (min_percentage, max_percentage)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment