Created
January 8, 2018 02:05
-
-
Save conorsch/6f8c8c77b7d3a1ce4d8902b611865b5e to your computer and use it in GitHub Desktop.
First pass at using pandas for cell data munging
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import pandas | |
import numpy | |
# Maps the numeric ethnicity codes used in the raw data's "Eth" column
# to human-readable labels (written back into the output rows below).
ethnicity_mapping = {
    1: "White",
    2: "Black",
    3: "Asian",
    4: "Native American",
    5: "Hispanic",
    6: "Unknown",
}
# Identifier for breast cancer (all we care about)
# Rows with any other CancerType code are discarded during munging.
cancer_index = 2
# Path to the raw, unprocessed source CSV consumed by the script.
raw_data_filepath = './Counts/Originals/v01_PaCo.csv'
def read_raw_data_file(csv_filepath):
    """
    Read a raw data CSV file and return it as a Pandas DataFrame.

    :param csv_filepath: path (or file-like object) of the CSV to read.
    :returns: pandas.DataFrame containing the file's contents.
    """
    # read_csv already returns a DataFrame; the original's extra
    # pandas.DataFrame(...) wrapper only made a pointless copy.
    return pandas.read_csv(csv_filepath)
def munge_cancer_dataframe(csv_filepath):
    """
    Read a raw data CSV file and return a Pandas DataFrame containing
    only breast cancer incidents (rows whose CancerType matches the
    module-level ``cancer_index``).

    :param csv_filepath: path (or file-like object) of the raw CSV.
    :returns: pandas.DataFrame with non-breast-cancer rows removed and
        the now-constant CancerType column dropped.
    """
    # read_csv already returns a DataFrame; no wrapper copy needed.
    df = pandas.read_csv(csv_filepath, index_col=False)
    # '@' references the enclosing variable directly instead of
    # splicing str(cancer_index) into the query text.
    df = df.query('CancerType == @cancer_index')
    # Every remaining row has the same CancerType, so drop the column.
    return df.drop(columns=['CancerType'])
# Build a gender-combined incidence table: one row per (year, ethnicity),
# with the ethnicity code replaced by its human-readable label.
df = munge_cancer_dataframe(raw_data_filepath)
years = set(df['year'])
new_data = pandas.DataFrame()
for year in years:
    for ethnicity_index, ethnicity in ethnicity_mapping.items():
        # Select this year/ethnicity slice. .copy() so the mutations
        # below act on an independent frame rather than a view of df
        # (avoids SettingWithCopyWarning and fragile chained writes).
        q = df.query('Eth == @ethnicity_index and year == @year').copy()
        # Combine all genders' incidences. Summing the single column
        # replaces the original groupby('year').agg(sum), which applied
        # the builtin sum across non-numeric columns; an empty slice
        # sums to 0, so no IndexError fallback is needed either.
        combined_incidence_count = q['count(*)'].sum()
        # Remove gender column, since we've already summed above.
        del q['Gen']
        # Overwrite count for given ethnicity with the combined count.
        # This will result in duplicate rows (since all incidences
        # will be the same). No problem: we'll clean up after.
        q.loc[:, 'count(*)'] = combined_incidence_count
        q.loc[:, 'Eth'] = ethnicity
        # Append new munged data set onto the accumulating dataframe.
        new_data = pandas.concat([new_data, q])
# Purge all duplicates, of which we have many, due to the
# inelegant munging above.
new_data = new_data.drop_duplicates()
# Sort by year so the rows descend in ascending chronological order.
new_data = new_data.sort_values(by=['year'])
# Set year as index LAST: doing it earlier removes 'year' from the
# columns, which the sort above still needs.
new_data = new_data.set_index('year')
# Finally, write out to local file.
new_data.to_csv('jawn.txt')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment