Skip to content

Instantly share code, notes, and snippets.

@omarkohl
Last active October 5, 2023 20:36
Show Gist options
  • Save omarkohl/433968aa54d75e34a5c8b5c50a259ed7 to your computer and use it in GitHub Desktop.
Save omarkohl/433968aa54d75e34a5c8b5c50a259ed7 to your computer and use it in GitHub Desktop.
Script to generate data and plots for a blog post: https://www.codeandbugs.com/post/second-regression-line/
# Script to generate data and plots for a blog post:
# https://www.codeandbugs.com/post/second-regression-line/
import numpy as np
import matplotlib.pyplot as plt
try:
from google.colab import files
IN_COLAB = True
except ImportError:
IN_COLAB = False
# Will only work on Google Colab. Download the plots as SVG images.
DOWNLOAD_PLOTS = False
# Make it reproducible
np.random.seed(0)
def generate_data():
"""
Generate some data that is correlated for nicer graphs.
"""
# Height is a normal distribution
height_data = np.random.normal(170, 15, 100)
# Weight is half the height plus-minus some noise
weight_data = 1/2 * height_data + np.random.normal(0, 18, 100)
return height_data, weight_data
def get_correlation(data1, data2):
"""
Get the correlation coefficient.
"""
correlation_matrix = np.corrcoef(data1, data2)
# The correlation coefficient is in the (0, 1) position of the correlation matrix
correlation_coefficient = correlation_matrix[0, 1]
return correlation_coefficient
height_data, weight_data = generate_data()
h_avg = np.mean(height_data)
h_std = np.std(height_data)
w_avg = np.mean(weight_data)
w_std = np.std(weight_data)
r = get_correlation(height_data, weight_data)
# Equation for lines:
# y = slope * x + intercept
# SD line
sd_line_slope = w_std / h_std
# Use the (avg, avg) point to calculate
sd_line_intercept = w_avg - sd_line_slope * h_avg
# Regression line Weight on Height (weight on height) i.e. to predict weight based on height
rwoh_line_slope = r * w_std / h_std
# Use the (avg, avg) point to calculate
rwoh_line_intercept = w_avg - rwoh_line_slope * h_avg
# WRONG Regresion Height on Weight
# We know that r never changes and we know that the slope of the "normal" regression line
# is r * w_std / h_std . Since we are doing the reverse line it's logical
# that the SDs would be inverted, but r must stay the same.
rhow_line_slope = r * h_std / w_std
# Use the (avg, avg) point to calculate
rhow_line_intercept = w_avg - rhow_line_slope * h_avg
print("r =", r)
print("h_avg =", h_avg)
print("h_std =", h_std)
print("w_avg =", w_avg)
print("w_std =", w_std)
print(f"SD line: y = {sd_line_slope:.3}*x + {sd_line_intercept:.3}")
print(f"Regression (Weight on Height) line: y = {rwoh_line_slope:.3}*x + {rwoh_line_intercept:.3}")
print(f"WRONG Regression (Height on Weight) line: y = {rhow_line_slope:.3}*x + {rhow_line_intercept:.3}")
# This is just used for plotting to cover the entire width of the plot
extended_height_data = np.concatenate((np.array([120]), height_data, np.array([220])))
plt.figure(1, figsize=(8, 8))
# Plot the SD line
plt.plot(
extended_height_data,
sd_line_slope * extended_height_data + sd_line_intercept,
c='purple',
label='SD line',
)
# Create the scatter plot
plt.xlim(120, 220)
plt.ylim(20, 160)
plt.scatter(height_data, weight_data, c='blue', alpha=0.7)
plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.title('Weight vs. Height')
plt.grid(True)
plt.legend()
if IN_COLAB and DOWNLOAD_PLOTS:
plt.savefig('plot1.svg', format='svg')
files.download('plot1.svg')
plt.show()
plt.figure(2, figsize=(8, 8))
# Plot the SD line
plt.plot(
extended_height_data,
sd_line_slope * extended_height_data + sd_line_intercept,
c='purple',
label='SD line',
)
# Plot the regression Weight on Height line
plt.plot(
extended_height_data,
rwoh_line_slope * extended_height_data + rwoh_line_intercept,
c='green',
label='Regression (Weight on Height)',
)
# Create the scatter plot
plt.xlim(120, 220)
plt.ylim(20, 160)
plt.scatter(height_data, weight_data, c='blue', alpha=0.7)
plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.title('Weight vs. Height')
plt.grid(True)
plt.legend()
if IN_COLAB and DOWNLOAD_PLOTS:
plt.savefig('plot2.svg', format='svg')
files.download('plot2.svg')
plt.show()
plt.figure(3, figsize=(8, 8))
# Plot the SD line
plt.plot(
extended_height_data,
sd_line_slope * extended_height_data + sd_line_intercept,
c='purple',
label='SD line',
)
# Plot the regression Weight on Height line
plt.plot(
extended_height_data,
rwoh_line_slope * extended_height_data + rwoh_line_intercept,
c='green',
label='Regression (Weight on Height)',
)
# Plot the regression Height on Weight line
plt.plot(
extended_height_data,
rhow_line_slope * extended_height_data + rhow_line_intercept,
c='orange',
label='WRONG Regression (Height on Weight)',
)
# Create the scatter plot
plt.xlim(120, 220)
plt.ylim(20, 160)
plt.scatter(height_data, weight_data, c='blue', alpha=0.7)
plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.title('Weight vs. Height')
plt.grid(True)
plt.legend()
if IN_COLAB and DOWNLOAD_PLOTS:
plt.savefig('plot3.svg', format='svg')
files.download('plot3.svg')
plt.show()
print("Fix the second regression line...")
print(" Since we are using y to predict x the line equation is inverted.")
print(" x = slope * y + intercept")
print(" if we re-order the terms ...")
# Regression line Height on Weight (height on weight) i.e. to predict height based on weight
# IMPORTANT!!! Since we are using y to predict x we are using a different line equation (y and x are inverted)
# x = slope * y + intercept
# The slope is calculated via r * h_std / w_std
# Since we want the other line equation where y is dependent on x in order to
# plot it in the same graph we re-order the terms
# y = (x - intercept) / slope = 1/slope * x - intercept/slope
# i.e. the new slope is the inverted slope which means 1/r * w_std / h_std
rhow_line_slope = 1/r * w_std / h_std
# Use the (avg, avg) point to calculate
rhow_line_intercept = w_avg - rhow_line_slope * h_avg
print(f"Regression (Height on Weight) line: y = {rhow_line_slope:.3}*x + {rhow_line_intercept:.3}")
plt.figure(4, figsize=(8, 8))
# Plot the SD line
plt.plot(
extended_height_data,
sd_line_slope * extended_height_data + sd_line_intercept,
c='purple',
label='SD line',
)
# Plot the regression Weight on Height line
plt.plot(
extended_height_data,
rwoh_line_slope * extended_height_data + rwoh_line_intercept,
c='green',
label='Regression (Weight on Height)',
)
# Plot the regression Height on Weight line
plt.plot(
extended_height_data,
rhow_line_slope * extended_height_data + rhow_line_intercept,
c='orange',
label='Regression (Height on Weight)',
)
# Create the scatter plot
plt.xlim(120, 220)
plt.ylim(20, 160)
plt.scatter(height_data, weight_data, c='blue', alpha=0.7)
plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.title('Weight vs. Height')
plt.grid(True)
plt.legend()
if IN_COLAB and DOWNLOAD_PLOTS:
plt.savefig('plot4.svg', format='svg')
files.download('plot4.svg')
plt.show()
# Normalized plot
plt.figure(5, figsize=(8, 8))
# Plot the SD line
plt.plot(
np.linspace(-5, 5, 10),
np.linspace(-5, 5, 10),
c='purple',
label='SD line',
)
# Plot the regression Weight on Height line
plt.plot(
np.linspace(-5, 5, 10),
r * np.linspace(-5, 5, 10),
c='green',
label='Regression (Weight on Height)',
)
# Plot the regression Height on Weight line
plt.plot(
np.linspace(-5, 5, 10),
1/r * np.linspace(-5, 5, 10),
c='orange',
label='Regression (Height on Weight)',
)
# Create the scatter plot
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.scatter((height_data-h_avg)/h_std, (weight_data-w_avg)/w_std, c='blue', alpha=0.7)
plt.xlabel('Height (nº SDs)')
plt.ylabel('Weight (nº SDs)')
plt.title('Weight vs. Height')
plt.grid(True)
plt.legend()
if IN_COLAB and DOWNLOAD_PLOTS:
plt.savefig('plot5.svg', format='svg')
files.download('plot5.svg')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment