Skip to content

Instantly share code, notes, and snippets.

@acbass49
Created May 14, 2025 04:14
Show Gist options
  • Select an option

  • Save acbass49/84b026c1a31b6e7a2ea85e612e20f7fe to your computer and use it in GitHub Desktop.

Select an option

Save acbass49/84b026c1a31b6e7a2ea85e612e20f7fe to your computer and use it in GitHub Desktop.
8 Country Correlation Analysis
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from pyfonts import load_google_font
#load in the data
data = pd.read_csv('https://gist.githubusercontent.com/acbass49/a789338f3e72d3692cadc0d5459b968a/raw/863b4cbe7bb69648001bdf6dd489000c39b4b531/final_data_joined.csv')
#dropping rows have more than 30% missing values
missing = data.isnull().apply(sum, axis=1) / len(data.columns) > 0.3
missing = missing[missing].index.tolist()
# As you can see, most of the missing values are islands
data.iloc[missing,:]
data = data.drop(missing).reset_index(drop=True)
# after removing the small islands we have 123 countries
#impute data at the mean
data = data.apply(lambda col: col.fillna(col.mean()) if col.dtype in ['float64', 'int64'] else col)
# variable cleaning
# winsozing to remove a few outliers
data['Percent Change'] = np.where(data['Percent Change'] > 25, 25, data['Percent Change'])
data['Percent Change'] = np.where(data['Percent Change'] < -25, -25, data['Percent Change'])
#spot edit assigning aruba as a free country
data.loc[data.Status.isna(), 'Status'] = 'F'
#I should log population and GDP per capita bc of skewness
data['log_population'] = np.log(data['Population'])
data['log_gdp_per_capita'] = np.log(data['GDP per capita'])
# one hot encode categorical variables
data = pd.get_dummies(data, columns=['Status'], drop_first=True)
data.columns = data.columns.str.replace(' ', '_')
ivs = [
'Fertility_rate',
'arrival_departure_ratio',
'log_gdp_per_capita',
'log_population',
'Rural_population',
'Life_expectancy',
'Government_Expenditure_On_Education',
'Christians',
'Muslims',
'Unaffiliated',
'Educational_attainment',
'Status_NF',
'Status_PF',
'continent'
]
# Standardize the independent variables (subtract mean and divide by standard deviation)
data2 = data
for col in ivs:
if data[col].dtype in ['float64', 'int64'] and col not in ['Status_NF', 'Status_PF']: # Only standardize numeric columns
data[col] = (data[col] - data[col].mean()) / data[col].std()
dv = 'Percent_Change'
formula = 'Percent_Change ~ Fertility_rate + arrival_departure_ratio + log_gdp_per_capita + log_population + Rural_population + Life_expectancy + Government_Expenditure_On_Education + Christians + Muslims + Unaffiliated + Educational_attainment + Status_NF + Status_PF'
model3 = smf.mixedlm(formula, data, groups=data["continent"]).fit()
print(model3.summary())
# Extract random effects (group-level differences)
random_effects = model3.random_effects
predict_df = pd.DataFrame({
'Intercept' : model3.params['Intercept'],
'Fertility_rate': [data.Fertility_rate.mean()],
'arrival_departure_ratio': [data.arrival_departure_ratio.mean()],
'log_gdp_per_capita': [data.log_gdp_per_capita.mean()],
'log_population': [data.log_population.mean()],
'Rural_population': [data.Rural_population.mean()],
'Life_expectancy': [data.Life_expectancy.mean()],
'Government_Expenditure_On_Education': [data.Government_Expenditure_On_Education.mean()],
'Christians': [data.Christians.mean()],
'Muslims': [data.Muslims.mean()],
'Unaffiliated': [data.Unaffiliated.mean()],
'Educational_attainment': [data.Educational_attainment.mean()],
'Status_NF': [0], #modal value is F
'Status_PF': [0],
'continent': [-0.25585] #South America is closest to 0 South America
})
model3.predict(predict_df)
def bootstrap_prediction_intervals(model, data, predict_df, col_to_change, new_values, n_bootstrap=2000, ci=0.9):
"""
Bootstrap prediction intervals for a mixed-effects model.
Parameters:
- model: The fitted mixed-effects model.
- data: The original dataset used to fit the model.
- predict_df: The DataFrame containing the base values for prediction.
- col_to_change: The name of the column to change for predictions.
- new_values: A list of new values for the specified column.
- n_bootstrap: Number of bootstrap iterations (default is 1000).
- ci: Confidence interval level (default is 0.95).
Returns:
- A DataFrame with the new values, mean predictions, and prediction intervals.
"""
bootstrap_predictions = {value: [] for value in new_values}
for i in range(n_bootstrap):
# Resample the data with replacement
bootstrap_sample = data.sample(frac=1, replace=True)
# Refit the model on the bootstrap sample
bootstrap_model = smf.mixedlm(model.model.formula, bootstrap_sample, groups=bootstrap_sample["continent"]).fit()
# Generate predictions for each new value
for value in new_values:
temp_df = predict_df.copy()
temp_df[col_to_change] = value
# temp_df = pd.get_dummies(temp_df, drop_first=True) # Ensure proper encoding
temp_df = temp_df.reindex(columns=bootstrap_model.model.exog_names, fill_value=0)
# Predict and store the result
pred = bootstrap_model.predict(temp_df)[0]
bootstrap_predictions[value].append(pred)
# Calculate prediction intervals
results = []
lower_percentile = (1 - ci) / 2 * 100
upper_percentile = (1 + ci) / 2 * 100
for value, preds in bootstrap_predictions.items():
lower_bound = np.percentile(preds, lower_percentile)
upper_bound = np.percentile(preds, upper_percentile)
mean_pred = np.mean(preds)
results.append({
col_to_change: value,
'Mean Prediction': mean_pred,
f'Lower {int(ci * 100)}% PI': lower_bound,
f'Upper {int(ci * 100)}% PI': upper_bound
})
return pd.DataFrame(results)
new_values = [1, 2, 3, 4, 5]
bootstrap_results = bootstrap_prediction_intervals(model3, data, predict_df, 'Fertility_rate', new_values)
print(bootstrap_results)
new_values = [17,35,49] # 25, 50 75 percentiles for this variable
bootstrap_results = bootstrap_prediction_intervals(model3, data, predict_df, 'Rural_population', new_values)
print(bootstrap_results)
new_values = [58,81,91] # 25, 50 75 percentiles for this variable
bootstrap_results = bootstrap_prediction_intervals(model3, data, predict_df, 'Christians', new_values)
print(bootstrap_results)
new_values = [1,5,13] # 25, 50 75 percentiles for this variable
bootstrap_results = bootstrap_prediction_intervals(model3, data, predict_df, 'Unaffiliated', new_values)
print(bootstrap_results)
### Birthrate calculations
data = data2
# Predicted Birthrate Increase
data['proj_br_pc_growth'] = (((data['2024']/1000 * data['Birth_Rate']) / data['2024'])*100).round(2)
#plot 1
plt.figure(figsize=(10,10))
# Your scatterplot code
sns.scatterplot(data=data, x='proj_br_pc_growth', y='Percent_Change', hue='continent', palette='deep', s=100)
# Add a y = x line (limited to -5 to 25)
y_vals = np.linspace(0, 5, 100) # Limit x_vals to the range -5 to 25
x_vals = np.linspace(0, 5, 100)
plt.plot(y_vals, y_vals, color='red', linestyle='--', linewidth=2, label='Performing At Birthrate')
# Shade the region above the line (green)
plt.fill_between(x_vals, y_vals, 30, color='green', alpha=0.1, label='Overperforming Birthrate')
# Shade the region below the line (red)
plt.fill_between(x_vals, y_vals, -30, color='red', alpha=0.1, label='Underperforming Birthrate')
# Set axis limits
plt.xlim(0, 5)
plt.ylim(-30, 30)
# Add titles and labels with more prominent font style
plt.title('Actual Growth vs Projected Birth Rate Growth', fontsize=16, fontweight='bold')
plt.xlabel('Projected Birth Rate Growth', fontsize=12)
plt.ylabel('Actual Percent Membership Growth', fontsize=12)
correlation = data['proj_br_pc_growth'].corr(data['Percent_Change'])
plt.annotate(
f"Medium Positive Correlation\n(r = {correlation:.2f})",
xy=(2.5, -3), # Coordinates for the arrow tip
xytext=(3, -5), # Coordinates for the text
arrowprops=dict(facecolor='black', arrowstyle='->', lw=1.5),
fontsize=12,
bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="white")
)
# Update the legend style
plt.legend(title="Legend", title_fontsize='13', fontsize='11')
plt.figtext(1, 0, '@mormon_metrics\nData: churchofjesuschrist.org & worldbank',
wrap=True, horizontalalignment='right', fontsize=10)
# Customize grid lines for readability
plt.grid(True, which='both', axis='y', linestyle='--', alpha=0.5)
plt.rcParams['axes.facecolor'] = '#f0f0f0' # Light gray background for the plot
plt.rcParams['figure.facecolor'] = '#f0f0f0'
# Show plot
plt.savefig('../images/8_country_model_1.png', dpi=300, bbox_inches='tight')
plt.show()
# plot 2
plt.figure(figsize=(7,7))
plt.rcParams['axes.facecolor'] = '#f0f0f0' # Light gray background for the plot
plt.rcParams['figure.facecolor'] = '#f0f0f0'
sns.scatterplot(data=data, x='arrival_departure_ratio', y='Percent_Change', hue='continent')
correlation = data['arrival_departure_ratio'].corr(data['Percent_Change'])
plt.annotate(
f"Weak Negative Correlation\n(r = {correlation:.2f})",
xy=(2, -10), # Coordinates for the arrow tip
xytext=(3, -20), # Coordinates for the text
arrowprops=dict(facecolor='black', arrowstyle='->', lw=1.5),
fontsize=12,
bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="white")
)
plt.title('Countries With More Leavers More Likely To Grow', fontsize=16, fontweight='bold')
plt.xlabel('Arrivals/Departures\n(Above 1 More Coming than Leaving; Below 1 More Leaving than Coming)', fontsize=12)
plt.ylabel('Actual Percent Membership Growth', fontsize=12)
plt.figtext(1, -0.1, '@mormon_metrics\nData: churchofjesuschrist.org & facebook',
wrap=True, horizontalalignment='right', fontsize=10)
plt.savefig('../images/8_country_model_2.png', dpi=300, bbox_inches='tight')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment