Skip to content

Instantly share code, notes, and snippets.

@conorsch
Created January 8, 2018 02:05
Show Gist options
  • Save conorsch/6f8c8c77b7d3a1ce4d8902b611865b5e to your computer and use it in GitHub Desktop.
Save conorsch/6f8c8c77b7d3a1ce4d8902b611865b5e to your computer and use it in GitHub Desktop.
First pass at using pandas for cell data munging
#!/usr/bin/env python
import pandas
import numpy
# Display names for the numeric ethnicity codes, numbered from 1 in
# the order listed here.
ethnicity_mapping = dict(enumerate(
    [
        "White",
        "Black",
        "Asian",
        "Native American",
        "Hispanic",
        "Unknown",
    ],
    start=1,
))

# Cancer type code for breast cancer, the only type this script keeps.
cancer_index = 2

# Location of the raw counts CSV, relative to the working directory.
raw_data_filepath = './Counts/Originals/v01_PaCo.csv'
def read_raw_data_file(csv_filepath):
    """
    Accepts raw data CSV file, returns Pandas dataframe.

    csv_filepath is handed straight to pandas.read_csv, so anything
    read_csv accepts (a path or an open file-like object) works.
    """
    # read_csv already returns a DataFrame; no extra wrapping needed.
    return pandas.read_csv(csv_filepath)
def munge_cancer_dataframe(csv_filepath, cancer_type=None):
    """
    Accepts raw data CSV file, returns Pandas dataframe,
    with only the incidents of a single cancer type.

    csv_filepath: handed to pandas.read_csv (path or file-like object).
    cancer_type: value of the 'CancerType' column to keep. When omitted,
        the module-level cancer_index (breast cancer) is used.

    The returned dataframe has the 'CancerType' column removed, since
    every remaining row carries the same value.

    Raises KeyError if the CSV has no 'CancerType' column.
    """
    if cancer_type is None:
        cancer_type = cancer_index
    # index_col=False: never treat the first column as the row index.
    df = pandas.read_csv(csv_filepath, index_col=False)
    # Keep only rows of the requested cancer type.
    df = df[df['CancerType'] == cancer_type]
    # drop() returns a new frame, so the caller gets an independent
    # object rather than a view-like slice of the parsed CSV.
    return df.drop(columns=['CancerType'])
def _combine_gender_counts(cancer_df, eth_names):
    """
    Collapse per-gender rows into one combined count per year/ethnicity.

    cancer_df: dataframe with 'year', 'Eth', 'Gen' and 'count(*)' columns.
    eth_names: dict mapping the numeric 'Eth' codes to display names.

    Returns a new dataframe in which
      * rows whose 'Eth' code is not a key of eth_names are left out,
      * the 'Gen' column is removed,
      * 'count(*)' holds the sum over all rows sharing the same
        year and ethnicity,
      * 'Eth' holds the display name instead of the numeric code,
      * rows that became identical are collapsed into one,
      * rows are ordered by year, then by numeric ethnicity code.
    """
    # Only ethnicities we have a name for are reported.
    known = cancer_df[cancer_df['Eth'].isin(list(eth_names))]
    # Sort while 'Eth' is still numeric so the order within a year
    # follows the ethnicity codes rather than the alphabet.
    known = known.sort_values(by=['year', 'Eth'])
    # transform('sum') yields one total per input row, aligned on the
    # row index, so it can replace the per-gender counts directly.
    totals = known.groupby(['year', 'Eth'])['count(*)'].transform('sum')
    combined = known.drop(columns=['Gen']).assign(**{
        'count(*)': totals,
        # Step 2: convert numeric ethnicities to words.
        'Eth': known['Eth'].map(eth_names),
    })
    # With gender gone and counts combined, the per-gender rows of a
    # year/ethnicity pair are now identical; keep one of each.
    return combined.drop_duplicates()


df = munge_cancer_dataframe(raw_data_filepath)
# Steps 1-3: combine genders, name the ethnicities, sort by year.
new_data = _combine_gender_counts(df, ethnicity_mapping)
# Now let's format the CSV structure prior to writing to file.
# Setting 'year' as the index removes it from the data columns, so it
# is done as the last operation prior to writing.
new_data = new_data.set_index('year')
# Finally, write out to local file.
new_data.to_csv('jawn.txt')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment