Last active
October 5, 2023 20:36
-
-
Save omarkohl/433968aa54d75e34a5c8b5c50a259ed7 to your computer and use it in GitHub Desktop.
Script to generate data and plots for a blog post: https://www.codeandbugs.com/post/second-regression-line/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to generate data and plots for a blog post: | |
# https://www.codeandbugs.com/post/second-regression-line/ | |
import numpy as np | |
import matplotlib.pyplot as plt | |
try: | |
from google.colab import files | |
IN_COLAB = True | |
except ImportError: | |
IN_COLAB = False | |
# Will only work on Google Colab. Download the plots as SVG images. | |
DOWNLOAD_PLOTS = False | |
# Make it reproducible | |
np.random.seed(0) | |
def generate_data(): | |
""" | |
Generate some data that is correlated for nicer graphs. | |
""" | |
# Height is a normal distribution | |
height_data = np.random.normal(170, 15, 100) | |
# Weight is half the height plus-minus some noise | |
weight_data = 1/2 * height_data + np.random.normal(0, 18, 100) | |
return height_data, weight_data | |
def get_correlation(data1, data2): | |
""" | |
Get the correlation coefficient. | |
""" | |
correlation_matrix = np.corrcoef(data1, data2) | |
# The correlation coefficient is in the (0, 1) position of the correlation matrix | |
correlation_coefficient = correlation_matrix[0, 1] | |
return correlation_coefficient | |
height_data, weight_data = generate_data() | |
h_avg = np.mean(height_data) | |
h_std = np.std(height_data) | |
w_avg = np.mean(weight_data) | |
w_std = np.std(weight_data) | |
r = get_correlation(height_data, weight_data) | |
# Equation for lines: | |
# y = slope * x + intercept | |
# SD line | |
sd_line_slope = w_std / h_std | |
# Use the (avg, avg) point to calculate | |
sd_line_intercept = w_avg - sd_line_slope * h_avg | |
# Regression line Weight on Height (weight on height) i.e. to predict weight based on height | |
rwoh_line_slope = r * w_std / h_std | |
# Use the (avg, avg) point to calculate | |
rwoh_line_intercept = w_avg - rwoh_line_slope * h_avg | |
# WRONG Regresion Height on Weight | |
# We know that r never changes and we know that the slope of the "normal" regression line | |
# is r * w_std / h_std . Since we are doing the reverse line it's logical | |
# that the SDs would be inverted, but r must stay the same. | |
rhow_line_slope = r * h_std / w_std | |
# Use the (avg, avg) point to calculate | |
rhow_line_intercept = w_avg - rhow_line_slope * h_avg | |
print("r =", r) | |
print("h_avg =", h_avg) | |
print("h_std =", h_std) | |
print("w_avg =", w_avg) | |
print("w_std =", w_std) | |
print(f"SD line: y = {sd_line_slope:.3}*x + {sd_line_intercept:.3}") | |
print(f"Regression (Weight on Height) line: y = {rwoh_line_slope:.3}*x + {rwoh_line_intercept:.3}") | |
print(f"WRONG Regression (Height on Weight) line: y = {rhow_line_slope:.3}*x + {rhow_line_intercept:.3}") | |
# This is just used for plotting to cover the entire width of the plot | |
extended_height_data = np.concatenate((np.array([120]), height_data, np.array([220]))) | |
plt.figure(1, figsize=(8, 8)) | |
# Plot the SD line | |
plt.plot( | |
extended_height_data, | |
sd_line_slope * extended_height_data + sd_line_intercept, | |
c='purple', | |
label='SD line', | |
) | |
# Create the scatter plot | |
plt.xlim(120, 220) | |
plt.ylim(20, 160) | |
plt.scatter(height_data, weight_data, c='blue', alpha=0.7) | |
plt.xlabel('Height (cm)') | |
plt.ylabel('Weight (kg)') | |
plt.title('Weight vs. Height') | |
plt.grid(True) | |
plt.legend() | |
if IN_COLAB and DOWNLOAD_PLOTS: | |
plt.savefig('plot1.svg', format='svg') | |
files.download('plot1.svg') | |
plt.show() | |
plt.figure(2, figsize=(8, 8)) | |
# Plot the SD line | |
plt.plot( | |
extended_height_data, | |
sd_line_slope * extended_height_data + sd_line_intercept, | |
c='purple', | |
label='SD line', | |
) | |
# Plot the regression Weight on Height line | |
plt.plot( | |
extended_height_data, | |
rwoh_line_slope * extended_height_data + rwoh_line_intercept, | |
c='green', | |
label='Regression (Weight on Height)', | |
) | |
# Create the scatter plot | |
plt.xlim(120, 220) | |
plt.ylim(20, 160) | |
plt.scatter(height_data, weight_data, c='blue', alpha=0.7) | |
plt.xlabel('Height (cm)') | |
plt.ylabel('Weight (kg)') | |
plt.title('Weight vs. Height') | |
plt.grid(True) | |
plt.legend() | |
if IN_COLAB and DOWNLOAD_PLOTS: | |
plt.savefig('plot2.svg', format='svg') | |
files.download('plot2.svg') | |
plt.show() | |
plt.figure(3, figsize=(8, 8)) | |
# Plot the SD line | |
plt.plot( | |
extended_height_data, | |
sd_line_slope * extended_height_data + sd_line_intercept, | |
c='purple', | |
label='SD line', | |
) | |
# Plot the regression Weight on Height line | |
plt.plot( | |
extended_height_data, | |
rwoh_line_slope * extended_height_data + rwoh_line_intercept, | |
c='green', | |
label='Regression (Weight on Height)', | |
) | |
# Plot the regression Height on Weight line | |
plt.plot( | |
extended_height_data, | |
rhow_line_slope * extended_height_data + rhow_line_intercept, | |
c='orange', | |
label='WRONG Regression (Height on Weight)', | |
) | |
# Create the scatter plot | |
plt.xlim(120, 220) | |
plt.ylim(20, 160) | |
plt.scatter(height_data, weight_data, c='blue', alpha=0.7) | |
plt.xlabel('Height (cm)') | |
plt.ylabel('Weight (kg)') | |
plt.title('Weight vs. Height') | |
plt.grid(True) | |
plt.legend() | |
if IN_COLAB and DOWNLOAD_PLOTS: | |
plt.savefig('plot3.svg', format='svg') | |
files.download('plot3.svg') | |
plt.show() | |
print("Fix the second regression line...") | |
print(" Since we are using y to predict x the line equation is inverted.") | |
print(" x = slope * y + intercept") | |
print(" if we re-order the terms ...") | |
# Regression line Height on Weight (height on weight) i.e. to predict height based on weight | |
# IMPORTANT!!! Since we are using y to predict x we are using a different line equation (y and x are inverted) | |
# x = slope * y + intercept | |
# The slope is calculated via r * h_std / w_std | |
# Since we want the other line equation where y is dependent on x in order to | |
# plot it in the same graph we re-order the terms | |
# y = (x - intercept) / slope = 1/slope * x - intercept/slope | |
# i.e. the new slope is the inverted slope which means 1/r * w_std / h_std | |
rhow_line_slope = 1/r * w_std / h_std | |
# Use the (avg, avg) point to calculate | |
rhow_line_intercept = w_avg - rhow_line_slope * h_avg | |
print(f"Regression (Height on Weight) line: y = {rhow_line_slope:.3}*x + {rhow_line_intercept:.3}") | |
plt.figure(4, figsize=(8, 8)) | |
# Plot the SD line | |
plt.plot( | |
extended_height_data, | |
sd_line_slope * extended_height_data + sd_line_intercept, | |
c='purple', | |
label='SD line', | |
) | |
# Plot the regression Weight on Height line | |
plt.plot( | |
extended_height_data, | |
rwoh_line_slope * extended_height_data + rwoh_line_intercept, | |
c='green', | |
label='Regression (Weight on Height)', | |
) | |
# Plot the regression Height on Weight line | |
plt.plot( | |
extended_height_data, | |
rhow_line_slope * extended_height_data + rhow_line_intercept, | |
c='orange', | |
label='Regression (Height on Weight)', | |
) | |
# Create the scatter plot | |
plt.xlim(120, 220) | |
plt.ylim(20, 160) | |
plt.scatter(height_data, weight_data, c='blue', alpha=0.7) | |
plt.xlabel('Height (cm)') | |
plt.ylabel('Weight (kg)') | |
plt.title('Weight vs. Height') | |
plt.grid(True) | |
plt.legend() | |
if IN_COLAB and DOWNLOAD_PLOTS: | |
plt.savefig('plot4.svg', format='svg') | |
files.download('plot4.svg') | |
plt.show() | |
# Normalized plot | |
plt.figure(5, figsize=(8, 8)) | |
# Plot the SD line | |
plt.plot( | |
np.linspace(-5, 5, 10), | |
np.linspace(-5, 5, 10), | |
c='purple', | |
label='SD line', | |
) | |
# Plot the regression Weight on Height line | |
plt.plot( | |
np.linspace(-5, 5, 10), | |
r * np.linspace(-5, 5, 10), | |
c='green', | |
label='Regression (Weight on Height)', | |
) | |
# Plot the regression Height on Weight line | |
plt.plot( | |
np.linspace(-5, 5, 10), | |
1/r * np.linspace(-5, 5, 10), | |
c='orange', | |
label='Regression (Height on Weight)', | |
) | |
# Create the scatter plot | |
plt.xlim(-5, 5) | |
plt.ylim(-5, 5) | |
plt.scatter((height_data-h_avg)/h_std, (weight_data-w_avg)/w_std, c='blue', alpha=0.7) | |
plt.xlabel('Height (nº SDs)') | |
plt.ylabel('Weight (nº SDs)') | |
plt.title('Weight vs. Height') | |
plt.grid(True) | |
plt.legend() | |
if IN_COLAB and DOWNLOAD_PLOTS: | |
plt.savefig('plot5.svg', format='svg') | |
files.download('plot5.svg') | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment