Skip to content

Instantly share code, notes, and snippets.

@wraseman
Last active October 24, 2022 06:51
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save wraseman/f4be0414b1c791bd341082e2ef53b295 to your computer and use it in GitHub Desktop.
Save wraseman/f4be0414b1c791bd341082e2ef53b295 to your computer and use it in GitHub Desktop.
Code for figures used in my blog post on multivariate distance calculations: https://waterprogramming.wordpress.com/2018/07/23/multivariate-distances-mahalanobis-vs-euclidean/
---
title: 'Multivariate Distances: Mahalanobis vs. Euclidean'
author: "Billy Raseman"
date: "July 12, 2018"
output: html_document
---
Below is the code used to create the plots for https://waterprogramming.wordpress.com/2018/07/23/multivariate-distances-mahalanobis-vs-euclidean/. My apologies that it is not entirely reproducible. I found it a bit easier to post-process a few of the plots (particularly for the PCA part)!
```{r}
## initial setup
# clear environment
rm(list=ls())
# load packages
library(tidyverse)
# save shortcut to remove axes ticks and text
axes_labels_only <- theme(axis.line=element_blank(), # remove axis
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
## cars data analysis and visualization
# consider only 4wd cars
cars <- filter(mpg, str_detect(model, '4wd')) %>%
select(displ, hwy) %>%
unique
# examine mpg: dataset about cars
p <- ggplot(data = cars, mapping = aes(x = displ, y = hwy)) +
xlab("Displacement") + ylab("Gas mileage (highway)") +
geom_point()
# let's visually compare my hypothetical Jeep and the rest of 'em
my.car <- filter(mpg, model=="grand cherokee 4wd", year==1999, cyl==8) %>%
select(displ, hwy)
# plot values
p + geom_point(data=my.car, color="red", size=2) +
geom_text(data=my.car, label="dream car!", color="red", vjust=-1)
# plot values without axes (visually standardize)
p + geom_point(data=my.car, color="red", size=2) +
geom_text(data=my.car, label="dream car!", color="red", vjust=-1) +
axes_labels_only
```
## Plot standardized values
```{r}
# calculate Euclidean distance between my dream car and the other cars
other.cars <- filter(mpg, model!="grand cherokee 4wd" | year!=1999 | cyl!=8) %>%
filter(str_detect(model, '4wd')) %>%
select(displ, hwy)
# compute Euclidean distance between cars
compare.cars <- rbind(my.car, other.cars) %>% select(hwy, displ) %>%
unique
hwy.mean <- mean(compare.cars$hwy)
hwy.sd <- sd(compare.cars$hwy)
displ.mean <- mean(compare.cars$displ)
displ.sd <- sd(compare.cars$displ)
compare.cars <- mutate(compare.cars,
hwy.standardized = (hwy-hwy.mean)/hwy.sd,
displ.standardized = (displ-displ.mean)/displ.sd)
compare.cars <- cbind(id=1:nrow(compare.cars), compare.cars)
# as you can see, all that has changed is the scale of the axes
ggplot(data = compare.cars, mapping = aes(x = displ.standardized, y = hwy.standardized)) +
geom_point() +
xlab("Standardized Displacement") + ylab("Standardized Gas mileage (highway)")
```
## Plot Euclidean Distance
```{r}
# calculate Euclidean distance
compare.cars$euc.dist <- (select(compare.cars,
hwy.standardized,
displ.standardized) %>%
dist %>% # calculate Euclidean distance between all cars
as.matrix)[,1] # just use first column (distance between my cars and rest)
compare.cars$euc.dist.order <- (rank(compare.cars$euc.dist) - 1) %>% as.integer
# Euclidean distance plot
ggplot(data=compare.cars, aes(x = displ.standardized, y = hwy.standardized, color = euc.dist)) +
geom_point() +
geom_text(aes(label = euc.dist.order), size = 3, hjust=0, vjust=1.5) +
xlab("Displacement") + ylab("Gas mileage (highway)") +
axes_labels_only +
labs(color = "Euclidean distance") +
NULL
```
## Plot Mahalanobis Distance
```{r}
# calculate Mahalanobis distance
x.mah <- select(compare.cars[-1,], hwy, displ) %>% as.matrix
center.mah <- select(compare.cars[1,], hwy, displ) %>% as.matrix
compare.cars$mah.dist <- c(0, mahalanobis(x = x.mah, center = center.mah, cov = cov(rbind(center.mah, x.mah))))
compare.cars$mah.dist.order <- (rank(compare.cars$mah.dist) - 1) %>% as.integer
# Mahalanobis distance plot
ggplot(data=compare.cars, aes(x = displ.standardized, y = hwy.standardized, color = mah.dist)) +
geom_point() +
geom_text(aes(label = mah.dist.order), size = 3, hjust=0, vjust=1.5) +
xlab("Displacement") + ylab("Gas mileage (highway)") +
axes_labels_only +
labs(color = "Mahalanobis distance") +
NULL
```
## Mahalanobis:Euclid #1
```{r}
# standardize metrics
compare.cars$mah.dist.standardized <- scale(compare.cars$mah.dist) %>% as.vector
compare.cars$euc.dist.standardized <- scale(compare.cars$euc.dist) %>% as.vector
# sapply(compare.cars, typeof)
# as.tibble(compare.cars)
# get distance from line
compare.cars <- as.tibble(compare.cars) %>%
mutate(mah.euc = (mah.dist.standardized-euc.dist.standardized)/sqrt(2)) # dot product to find distance from line
# compare distance metrics
ggplot(compare.cars, aes(x=euc.dist.standardized, mah.dist.standardized)) +
geom_point(aes(color = mah.euc)) +
xlab('Euclidean distance (standardized)') +
ylab('Mahalanobis distance (standardized)') +
scale_color_gradient2(name="Mahal. : Euclid.") +
geom_abline(intercept=0, slope=1) +
annotate("text", x=-0.5, y=1.25, label="Mahalanobis > Euclidean distance", color=scales::muted("blue")) +
annotate("text", x=1.5, y=-0.5, label="Euclidean > Mahalanobis distance", color=scales::muted("red")) +
NULL
```
## Mahalanobis:Euclid #2
```{r}
# compare distance metrics
ggplot(compare.cars, aes(x=displ, y=hwy)) +
geom_point(aes(color = mah.euc)) +
scale_color_gradient2() +
geom_point(data=my.car, color="red", size=2) +
geom_text(data=my.car, label="dream car!", color="red", vjust=-1) +
labs(color = "Mahal. : Euclid.") +
xlab("Displacement") + ylab("Gas mileage (highway)") +
NULL
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment