Created
May 14, 2025 04:14
-
-
Save acbass49/84b026c1a31b6e7a2ea85e612e20f7fe to your computer and use it in GitHub Desktop.
8 Country Correlation Analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
| import statsmodels.formula.api as smf | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from pyfonts import load_google_font | |
| #load in the data | |
| data = pd.read_csv('https://gist.githubusercontent.com/acbass49/a789338f3e72d3692cadc0d5459b968a/raw/863b4cbe7bb69648001bdf6dd489000c39b4b531/final_data_joined.csv') | |
| #dropping rows have more than 30% missing values | |
| missing = data.isnull().apply(sum, axis=1) / len(data.columns) > 0.3 | |
| missing = missing[missing].index.tolist() | |
| # As you can see, most of the missing values are islands | |
| data.iloc[missing,:] | |
| data = data.drop(missing).reset_index(drop=True) | |
| # after removing the small islands we have 123 countries | |
| #impute data at the mean | |
| data = data.apply(lambda col: col.fillna(col.mean()) if col.dtype in ['float64', 'int64'] else col) | |
| # variable cleaning | |
| # winsozing to remove a few outliers | |
| data['Percent Change'] = np.where(data['Percent Change'] > 25, 25, data['Percent Change']) | |
| data['Percent Change'] = np.where(data['Percent Change'] < -25, -25, data['Percent Change']) | |
| #spot edit assigning aruba as a free country | |
| data.loc[data.Status.isna(), 'Status'] = 'F' | |
| #I should log population and GDP per capita bc of skewness | |
| data['log_population'] = np.log(data['Population']) | |
| data['log_gdp_per_capita'] = np.log(data['GDP per capita']) | |
| # one hot encode categorical variables | |
| data = pd.get_dummies(data, columns=['Status'], drop_first=True) | |
| data.columns = data.columns.str.replace(' ', '_') | |
| ivs = [ | |
| 'Fertility_rate', | |
| 'arrival_departure_ratio', | |
| 'log_gdp_per_capita', | |
| 'log_population', | |
| 'Rural_population', | |
| 'Life_expectancy', | |
| 'Government_Expenditure_On_Education', | |
| 'Christians', | |
| 'Muslims', | |
| 'Unaffiliated', | |
| 'Educational_attainment', | |
| 'Status_NF', | |
| 'Status_PF', | |
| 'continent' | |
| ] | |
| # Standardize the independent variables (subtract mean and divide by standard deviation) | |
| data2 = data | |
| for col in ivs: | |
| if data[col].dtype in ['float64', 'int64'] and col not in ['Status_NF', 'Status_PF']: # Only standardize numeric columns | |
| data[col] = (data[col] - data[col].mean()) / data[col].std() | |
| dv = 'Percent_Change' | |
| formula = 'Percent_Change ~ Fertility_rate + arrival_departure_ratio + log_gdp_per_capita + log_population + Rural_population + Life_expectancy + Government_Expenditure_On_Education + Christians + Muslims + Unaffiliated + Educational_attainment + Status_NF + Status_PF' | |
| model3 = smf.mixedlm(formula, data, groups=data["continent"]).fit() | |
| print(model3.summary()) | |
| # Extract random effects (group-level differences) | |
| random_effects = model3.random_effects | |
| predict_df = pd.DataFrame({ | |
| 'Intercept' : model3.params['Intercept'], | |
| 'Fertility_rate': [data.Fertility_rate.mean()], | |
| 'arrival_departure_ratio': [data.arrival_departure_ratio.mean()], | |
| 'log_gdp_per_capita': [data.log_gdp_per_capita.mean()], | |
| 'log_population': [data.log_population.mean()], | |
| 'Rural_population': [data.Rural_population.mean()], | |
| 'Life_expectancy': [data.Life_expectancy.mean()], | |
| 'Government_Expenditure_On_Education': [data.Government_Expenditure_On_Education.mean()], | |
| 'Christians': [data.Christians.mean()], | |
| 'Muslims': [data.Muslims.mean()], | |
| 'Unaffiliated': [data.Unaffiliated.mean()], | |
| 'Educational_attainment': [data.Educational_attainment.mean()], | |
| 'Status_NF': [0], #modal value is F | |
| 'Status_PF': [0], | |
| 'continent': [-0.25585] #South America is closest to 0 South America | |
| }) | |
| model3.predict(predict_df) | |
| def bootstrap_prediction_intervals(model, data, predict_df, col_to_change, new_values, n_bootstrap=2000, ci=0.9): | |
| """ | |
| Bootstrap prediction intervals for a mixed-effects model. | |
| Parameters: | |
| - model: The fitted mixed-effects model. | |
| - data: The original dataset used to fit the model. | |
| - predict_df: The DataFrame containing the base values for prediction. | |
| - col_to_change: The name of the column to change for predictions. | |
| - new_values: A list of new values for the specified column. | |
| - n_bootstrap: Number of bootstrap iterations (default is 1000). | |
| - ci: Confidence interval level (default is 0.95). | |
| Returns: | |
| - A DataFrame with the new values, mean predictions, and prediction intervals. | |
| """ | |
| bootstrap_predictions = {value: [] for value in new_values} | |
| for i in range(n_bootstrap): | |
| # Resample the data with replacement | |
| bootstrap_sample = data.sample(frac=1, replace=True) | |
| # Refit the model on the bootstrap sample | |
| bootstrap_model = smf.mixedlm(model.model.formula, bootstrap_sample, groups=bootstrap_sample["continent"]).fit() | |
| # Generate predictions for each new value | |
| for value in new_values: | |
| temp_df = predict_df.copy() | |
| temp_df[col_to_change] = value | |
| # temp_df = pd.get_dummies(temp_df, drop_first=True) # Ensure proper encoding | |
| temp_df = temp_df.reindex(columns=bootstrap_model.model.exog_names, fill_value=0) | |
| # Predict and store the result | |
| pred = bootstrap_model.predict(temp_df)[0] | |
| bootstrap_predictions[value].append(pred) | |
| # Calculate prediction intervals | |
| results = [] | |
| lower_percentile = (1 - ci) / 2 * 100 | |
| upper_percentile = (1 + ci) / 2 * 100 | |
| for value, preds in bootstrap_predictions.items(): | |
| lower_bound = np.percentile(preds, lower_percentile) | |
| upper_bound = np.percentile(preds, upper_percentile) | |
| mean_pred = np.mean(preds) | |
| results.append({ | |
| col_to_change: value, | |
| 'Mean Prediction': mean_pred, | |
| f'Lower {int(ci * 100)}% PI': lower_bound, | |
| f'Upper {int(ci * 100)}% PI': upper_bound | |
| }) | |
| return pd.DataFrame(results) | |
| new_values = [1, 2, 3, 4, 5] | |
| bootstrap_results = bootstrap_prediction_intervals(model3, data, predict_df, 'Fertility_rate', new_values) | |
| print(bootstrap_results) | |
| new_values = [17,35,49] # 25, 50 75 percentiles for this variable | |
| bootstrap_results = bootstrap_prediction_intervals(model3, data, predict_df, 'Rural_population', new_values) | |
| print(bootstrap_results) | |
| new_values = [58,81,91] # 25, 50 75 percentiles for this variable | |
| bootstrap_results = bootstrap_prediction_intervals(model3, data, predict_df, 'Christians', new_values) | |
| print(bootstrap_results) | |
| new_values = [1,5,13] # 25, 50 75 percentiles for this variable | |
| bootstrap_results = bootstrap_prediction_intervals(model3, data, predict_df, 'Unaffiliated', new_values) | |
| print(bootstrap_results) | |
| ### Birthrate calculations | |
| data = data2 | |
| # Predicted Birthrate Increase | |
| data['proj_br_pc_growth'] = (((data['2024']/1000 * data['Birth_Rate']) / data['2024'])*100).round(2) | |
| #plot 1 | |
| plt.figure(figsize=(10,10)) | |
| # Your scatterplot code | |
| sns.scatterplot(data=data, x='proj_br_pc_growth', y='Percent_Change', hue='continent', palette='deep', s=100) | |
| # Add a y = x line (limited to -5 to 25) | |
| y_vals = np.linspace(0, 5, 100) # Limit x_vals to the range -5 to 25 | |
| x_vals = np.linspace(0, 5, 100) | |
| plt.plot(y_vals, y_vals, color='red', linestyle='--', linewidth=2, label='Performing At Birthrate') | |
| # Shade the region above the line (green) | |
| plt.fill_between(x_vals, y_vals, 30, color='green', alpha=0.1, label='Overperforming Birthrate') | |
| # Shade the region below the line (red) | |
| plt.fill_between(x_vals, y_vals, -30, color='red', alpha=0.1, label='Underperforming Birthrate') | |
| # Set axis limits | |
| plt.xlim(0, 5) | |
| plt.ylim(-30, 30) | |
| # Add titles and labels with more prominent font style | |
| plt.title('Actual Growth vs Projected Birth Rate Growth', fontsize=16, fontweight='bold') | |
| plt.xlabel('Projected Birth Rate Growth', fontsize=12) | |
| plt.ylabel('Actual Percent Membership Growth', fontsize=12) | |
| correlation = data['proj_br_pc_growth'].corr(data['Percent_Change']) | |
| plt.annotate( | |
| f"Medium Positive Correlation\n(r = {correlation:.2f})", | |
| xy=(2.5, -3), # Coordinates for the arrow tip | |
| xytext=(3, -5), # Coordinates for the text | |
| arrowprops=dict(facecolor='black', arrowstyle='->', lw=1.5), | |
| fontsize=12, | |
| bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="white") | |
| ) | |
| # Update the legend style | |
| plt.legend(title="Legend", title_fontsize='13', fontsize='11') | |
| plt.figtext(1, 0, '@mormon_metrics\nData: churchofjesuschrist.org & worldbank', | |
| wrap=True, horizontalalignment='right', fontsize=10) | |
| # Customize grid lines for readability | |
| plt.grid(True, which='both', axis='y', linestyle='--', alpha=0.5) | |
| plt.rcParams['axes.facecolor'] = '#f0f0f0' # Light gray background for the plot | |
| plt.rcParams['figure.facecolor'] = '#f0f0f0' | |
| # Show plot | |
| plt.savefig('../images/8_country_model_1.png', dpi=300, bbox_inches='tight') | |
| plt.show() | |
| # plot 2 | |
| plt.figure(figsize=(7,7)) | |
| plt.rcParams['axes.facecolor'] = '#f0f0f0' # Light gray background for the plot | |
| plt.rcParams['figure.facecolor'] = '#f0f0f0' | |
| sns.scatterplot(data=data, x='arrival_departure_ratio', y='Percent_Change', hue='continent') | |
| correlation = data['arrival_departure_ratio'].corr(data['Percent_Change']) | |
| plt.annotate( | |
| f"Weak Negative Correlation\n(r = {correlation:.2f})", | |
| xy=(2, -10), # Coordinates for the arrow tip | |
| xytext=(3, -20), # Coordinates for the text | |
| arrowprops=dict(facecolor='black', arrowstyle='->', lw=1.5), | |
| fontsize=12, | |
| bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="white") | |
| ) | |
| plt.title('Countries With More Leavers More Likely To Grow', fontsize=16, fontweight='bold') | |
| plt.xlabel('Arrivals/Departures\n(Above 1 More Coming than Leaving; Below 1 More Leaving than Coming)', fontsize=12) | |
| plt.ylabel('Actual Percent Membership Growth', fontsize=12) | |
| plt.figtext(1, -0.1, '@mormon_metrics\nData: churchofjesuschrist.org & facebook', | |
| wrap=True, horizontalalignment='right', fontsize=10) | |
| plt.savefig('../images/8_country_model_2.png', dpi=300, bbox_inches='tight') | |
| plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment