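# environment.yml (assumed filename): conda environment for the script below.
# Typical usage, assuming conda is installed:
#   conda env create -f environment.yml
#   conda activate wingardium-leviosa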
name: wingardium-leviosa
dependencies:
- python=3.7.3
- pandas
- requests
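
# Python script from the same gist: downloads the CMS DE-SynPUF inpatient
# claims sample files (2008-2010), combines them into a single CSV, and
# prints a few summary stats.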
import os
import zipfile
from os.path import join, isfile

import pandas as pd
import requests
# Create a bunch of folders for us to put stuff in
print('Creating directories...')
os.makedirs('tmp/zip', exist_ok=True)
os.makedirs('tmp/extract', exist_ok=True)
os.makedirs('final', exist_ok=True)
# These years define the date range for the SynPUF files we want to download
year_from = '2008'
year_to = '2010'
# The URL template for downloading SynPUF inpatient (IP) claims. Note the
# placeholders for year and file index
base_url = 'https://www.cms.gov/Research-Statistics-Data-and-Systems/Downloadable-Public-Use-Files/SynPUFs/Downloads/DE1_0_{year_from}_to_{year_to}_Inpatient_Claims_Sample_{file_index}.zip'
base_url = base_url.replace('{year_from}', year_from)
base_url = base_url.replace('{year_to}', year_to)
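# (Design note: building each URL with an f-string inside the loop below
# would work just as well; templating with .replace() keeps the one long
# URL visible in a single place.)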
# Yeah, they broke the "large" data file into a number of "smaller" ones to make
# it "easier" for people to use
omg_how_many_files_are_there = 20
# We have to download and unzip each file separately.
print('Downloading data files...')
for file_index in range(1, omg_how_many_files_are_there + 1):
    file_index = str(file_index)
    # Download the data file - this will be a zip archive
    url = base_url.replace('{file_index}', file_index)
    response = requests.get(url)
    response.raise_for_status()  # fail fast on a bad HTTP status
    zip_file_path = f'tmp/zip/ip_claims_{file_index}.zip'
    with open(zip_file_path, 'wb') as zip_file:
        zip_file.write(response.content)
    # Extract the file(s) inside the zip
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        archive.extractall('tmp/extract')
# Get a list of the data file (CSV) paths
data_files = [join('tmp/extract', path) for path in os.listdir('tmp/extract')]
data_files = [path for path in data_files if isfile(path)]
# Load each CSV into a data frame
#
# If we wanted to be more precise we could tell pandas what data type to use
# for each column, e.g. integers vs. decimals vs. strings (see the commented
# sketch below).
print('Loading data...')
data_frames = [pd.read_csv(path) for path in data_files]
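# A minimal sketch of the dtype approach (commented out so the defaults
# above still apply). DESYNPUF_ID is assumed from the SynPUF layout;
# CLM_PMT_AMT is used below:
#
# dtypes = {'DESYNPUF_ID': str, 'CLM_PMT_AMT': float}
# data_frames = [pd.read_csv(path, dtype=dtypes) for path in data_files]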
# Combine into single data frame
df = pd.concat(data_frames, ignore_index=True)
# Here we could run validation and update routines on the final set (a
# commented sketch follows). For now, we'll just grab some high-level stats
# to make sure we actually got something reasonable.
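# A minimal commented-out sketch; the checks themselves are made up:
#
# assert not df.empty, 'No rows loaded'
# assert df.CLM_PMT_AMT.notna().all(), 'Missing payment amounts'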
num_rows = len(df)
num_cols = len(df.columns)
pmt_avg = df.CLM_PMT_AMT.mean()
pmt_med = df.CLM_PMT_AMT.median()
pmt_std = df.CLM_PMT_AMT.std()
print('')
print(' Stats:')
print('')
print(f' Total Rows: {num_rows}')
print(f' Total Columns: {num_cols}')
print('')
print(f' Average claims payment: {pmt_avg:,.2f}')
print(f' Standard deviation of claims payments: {pmt_std:,.2f}')
print(f' Median claims payment: {pmt_med:,.2f}')
print('')
# Save the final result
print('Saving final CSV...')
final_path = f'final/ip_claims_{year_from}_{year_to}.csv'
df.to_csv(final_path, index=False)
# Here we could also clean up our tmp files; we'll leave them in place for
# demo purposes.
# import shutil
# shutil.rmtree('tmp')
print('Done.')
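# To run (the gist doesn't name this file; "download_synpuf_ip.py" is a
# hypothetical name):
#   python download_synpuf_ip.py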