Skip to content

Instantly share code, notes, and snippets.

@davidad
Created August 4, 2023 20:14
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save davidad/068fc4e2ccd5166c943156bbaca6a11b to your computer and use it in GitHub Desktop.
Save davidad/068fc4e2ccd5166c943156bbaca6a11b to your computer and use it in GitHub Desktop.
Lead poisoning data analysis (thanks GPT-4)
import matplotlib.pyplot as plt
import pandas as pd

# Load the data
df = pd.read_excel('pnas.2118631119.sd01.xlsx')

# Keep only rows with AGE in [22, 35] and YEAR in [1955, 2040] (inclusive).
# .copy() makes df_filtered an independent frame, so the column assignment
# below does not write to a slice of df (avoids SettingWithCopyWarning).
df_filtered = df[
    (df['AGE'] >= 22)
    & (df['AGE'] <= 35)
    & (df['YEAR'] >= 1955)
    & (df['YEAR'] <= 2040)
].copy()

# Create a new column 'condition_min' to hold the minimum value of each range:
# the label '30+ (ud/dL)' maps to 30, any other label to the number that
# precedes its first '-'. It exists only so columns can be ordered numerically
# rather than alphabetically by label.
df_filtered['condition_min'] = df_filtered['condition'].apply(
    lambda x: 30 if x == '30+ (ud/dL)' else float(x.split('-')[0])
)

# Pivot the data to get years as index, conditions as columns and the summed
# leadpop as values
df_pivot = df_filtered.pivot_table(
    index='YEAR',
    columns=['condition_min', 'condition'],
    values='leadpop',
    aggfunc='sum',
)

# Sort the columns by 'condition_min'
df_pivot = df_pivot.sort_index(axis=1, level='condition_min')

# Scale to millions
df_pivot = df_pivot / 1e6

# Drop the 'condition_min' level in the column index so only the original
# condition labels remain (they end up as the legend entries)
df_pivot.columns = df_pivot.columns.droplevel('condition_min')
# Create a function to format the x-axis labels
def format_year_labels(year: int) -> str:
    """Return the x-axis tick label for *year*.

    Years divisible by 5 are labelled with their string form; every other
    year gets an empty label.
    """
    return str(year) if year % 5 == 0 else ''
# Palette handed to the bar plot: a light green first, then six reds that
# darken step by step (presumably one entry per condition column - confirm
# against the data).
colors = [
    '#cbe3ce',
    '#e2afac',
    '#dc9896',
    '#d68281',
    '#ce6a6d',
    '#c65259',
    '#bc3746',
]

# Draw the stacked bar chart on a dedicated 14x8 figure
fig, ax = plt.subplots(figsize=(14, 8))
df_pivot.plot.bar(stacked=True, ax=ax, color=colors, grid=False)

# Show plain numbers on the y-axis instead of scientific notation
y_formatter = ax.yaxis.get_major_formatter()
y_formatter.set_scientific(False)

# No grid lines along the x-axis
ax.xaxis.grid(False)

# Label only the years that format_year_labels keeps; the rest become blank
ax.set_xticklabels(list(map(format_year_labels, df_pivot.index)))

# Title, axis labels, and a legend placed outside the plot on the right
ax.set_title('Lead Poisoning in Age Group 22-35 Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('U.S. Population by Estimated Childhood Lead Poisoning (millions)')
ax.legend(title='Condition', bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment