Created
January 8, 2018 02:05
-
-
Save conorsch/6f8c8c77b7d3a1ce4d8902b611865b5e to your computer and use it in GitHub Desktop.
First pass at using pandas for cell data munging
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import pandas | |
import numpy | |
# Maps the numeric ethnicity codes used in the raw data's "Eth" column
# to human-readable labels (written back into the output rows below).
ethnicity_mapping = {
    1: "White",
    2: "Black",
    3: "Asian",
    4: "Native American",
    5: "Hispanic",
    6: "Unknown",
}
# Identifier for breast cancer (all we care about)
# Rows with any other CancerType code are discarded during munging.
cancer_index = 2
# Path to the raw, unprocessed source CSV consumed by the script.
raw_data_filepath = './Counts/Originals/v01_PaCo.csv'
def read_raw_data_file(csv_filepath):
    """
    Read a raw data CSV file and return it as a Pandas DataFrame.

    :param csv_filepath: path (or file-like object) of the CSV to read.
    :returns: pandas.DataFrame containing the file's contents.
    """
    # read_csv already returns a DataFrame; the original's extra
    # pandas.DataFrame(...) wrapper only made a pointless copy.
    return pandas.read_csv(csv_filepath)
def munge_cancer_dataframe(csv_filepath):
    """
    Read a raw data CSV file and return a Pandas DataFrame containing
    only breast cancer incidents (rows whose CancerType matches the
    module-level ``cancer_index``).

    :param csv_filepath: path (or file-like object) of the raw CSV.
    :returns: pandas.DataFrame with non-breast-cancer rows removed and
        the now-constant CancerType column dropped.
    """
    # read_csv already returns a DataFrame; no wrapper copy needed.
    df = pandas.read_csv(csv_filepath, index_col=False)
    # '@' references the enclosing variable directly instead of
    # splicing str(cancer_index) into the query text.
    df = df.query('CancerType == @cancer_index')
    # Every remaining row has the same CancerType, so drop the column.
    return df.drop(columns=['CancerType'])
# Build a gender-combined incidence table: one row per (year, ethnicity),
# with the ethnicity code replaced by its human-readable label.
df = munge_cancer_dataframe(raw_data_filepath)
years = set(df['year'])
new_data = pandas.DataFrame()
for year in years:
    for ethnicity_index, ethnicity in ethnicity_mapping.items():
        # Select this year/ethnicity slice. .copy() so the mutations
        # below act on an independent frame rather than a view of df
        # (avoids SettingWithCopyWarning and fragile chained writes).
        q = df.query('Eth == @ethnicity_index and year == @year').copy()
        # Combine all genders' incidences. Summing the single column
        # replaces the original groupby('year').agg(sum), which applied
        # the builtin sum across non-numeric columns; an empty slice
        # sums to 0, so no IndexError fallback is needed either.
        combined_incidence_count = q['count(*)'].sum()
        # Remove gender column, since we've already summed above.
        del q['Gen']
        # Overwrite count for given ethnicity with the combined count.
        # This will result in duplicate rows (since all incidences
        # will be the same). No problem: we'll clean up after.
        q.loc[:, 'count(*)'] = combined_incidence_count
        q.loc[:, 'Eth'] = ethnicity
        # Append new munged data set onto the accumulating dataframe.
        new_data = pandas.concat([new_data, q])
# Purge all duplicates, of which we have many, due to the
# inelegant munging above.
new_data = new_data.drop_duplicates()
# Sort by year so the rows descend in ascending chronological order.
new_data = new_data.sort_values(by=['year'])
# Set year as index LAST: doing it earlier removes 'year' from the
# columns, which the sort above still needs.
new_data = new_data.set_index('year')
# Finally, write out to local file.
new_data.to_csv('jawn.txt')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment