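# environment.yml (assumed filename): conda environment for the script below.
# Typical usage, assuming conda is installed:
#   conda env create -f environment.yml
#   conda activate wingardium-leviosa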
name: wingardium-leviosa
dependencies:
- python=3.7.3
- pandas
- requests
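
# Python script from the same gist: downloads the CMS DE-SynPUF inpatient
# claims sample files (2008-2010), combines them into a single CSV, and
# prints a few summary stats.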
import os
import zipfile
from os.path import join, isfile

import pandas as pd
import requests
# Create a bunch of folders for us to put stuff in
print('Creating directories...')
os.makedirs('tmp/zip', exist_ok=True)
os.makedirs('tmp/extract', exist_ok=True)
os.makedirs('final', exist_ok=True)
# These years define the date range for the SynPUF files we want to download
year_from = '2008'
year_to = '2010'
# The URL template for downloading SynPUF inpatient (IP) claims. Note the
# placeholders for year and file index
base_url = 'https://www.cms.gov/Research-Statistics-Data-and-Systems/Downloadable-Public-Use-Files/SynPUFs/Downloads/DE1_0_{year_from}_to_{year_to}_Inpatient_Claims_Sample_{file_index}.zip'
base_url = base_url.replace('{year_from}', year_from)
base_url = base_url.replace('{year_to}', year_to)
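# (Design note: building each URL with an f-string inside the loop below
# would work just as well; templating with .replace() keeps the one long
# URL visible in a single place.)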
# Yeah, they broke the "large" data file into a number of "smaller" ones to make
# it "easier" for people to use
omg_how_many_files_are_there = 20
# We have to download and unzip each file separately.
print('Downloading data files...')
for file_index in range(1, omg_how_many_files_are_there + 1):
    file_index = str(file_index)
    # Download the data file - this will be a zip archive
    url = base_url.replace('{file_index}', file_index)
    response = requests.get(url)
    response.raise_for_status()  # fail fast on a bad HTTP status
    zip_file_path = f'tmp/zip/ip_claims_{file_index}.zip'
    with open(zip_file_path, 'wb') as zip_file:
        zip_file.write(response.content)
    # Extract the file(s) inside the zip
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        archive.extractall('tmp/extract')
# Get a list of the data file (CSV) paths
data_files = [join('tmp/extract', path) for path in os.listdir('tmp/extract')]
data_files = [path for path in data_files if isfile(path)]
# Load each CSV into a data frame
#
# If we wanted to be more precise we could tell pandas what data type to use
# for each column, e.g. integers vs. decimals vs. strings (see the commented
# sketch below).
print('Loading data...')
data_frames = [pd.read_csv(path) for path in data_files]
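# A minimal sketch of the dtype approach (commented out so the defaults
# above still apply). DESYNPUF_ID is assumed from the SynPUF layout;
# CLM_PMT_AMT is used below:
#
# dtypes = {'DESYNPUF_ID': str, 'CLM_PMT_AMT': float}
# data_frames = [pd.read_csv(path, dtype=dtypes) for path in data_files]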
# Combine into single data frame
df = pd.concat(data_frames, ignore_index=True)
# Here we could run validation and update routines on the final set (a
# commented sketch follows). For now, we'll just grab some high-level stats
# to make sure we actually got something reasonable.
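# A minimal commented-out sketch; the checks themselves are made up:
#
# assert not df.empty, 'No rows loaded'
# assert df.CLM_PMT_AMT.notna().all(), 'Missing payment amounts'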
num_rows = len(df)
num_cols = len(df.columns)
pmt_avg = df.CLM_PMT_AMT.mean()
pmt_med = df.CLM_PMT_AMT.median()
pmt_std = df.CLM_PMT_AMT.std()
print('')
print(' Stats:')
print('')
print(f' Total Rows: {num_rows}')
print(f' Total Columns: {num_cols}')
print('')
print(f' Average claims payment: {pmt_avg:,.2f}')
print(f' Standard deviation of claims payments: {pmt_std:,.2f}')
print(f' Median claims payment: {pmt_med:,.2f}')
print('')
# Save the final result
print('Saving final CSV...')
final_path = f'final/ip_claims_{year_from}_{year_to}.csv'
df.to_csv(final_path, index=False)
# Here we could also clean up our tmp files; we'll leave them in place for
# demo purposes.
# import shutil
# shutil.rmtree('tmp')
print('Done.')
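# To run (the gist doesn't name this file; "download_synpuf_ip.py" is a
# hypothetical name):
#   python download_synpuf_ip.py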