# tradingbills/covariance|pearson|spearman

## covariance|pearson|spearman
library(readxl)
library(tidyverse)
library(psych)
library(scales)
# NOTE(review): machine-specific absolute path. The read_excel() calls in this
# script pass bare file names, so they resolve against this directory.
setwd("C:/Users/tradingbills/Documents/_exer/_data/math/wk4/")

# 1 compute the covariance
# COV = Sum( (x_i - x_bar) * (y_i - y_bar) ) / (N - 1)
# COVARIANCE
#' Sample covariance of two vectors.
#'
#' Computes sum((x - mean(x)) * (y - mean(y))) / (n - 1), where n is
#' length(x). Missing values are not removed, so an NA in either input
#' propagates to the result.
#'
#' @param x Numeric vector.
#' @param y Numeric vector with the same length as `x`.
#' @return A single number: the sample covariance of `x` and `y`.
covariance <- function(x, y) {
  # Without this check a shorter vector is silently recycled and the function
  # returns a meaningless number instead of signalling the mistake.
  if (length(x) != length(y)) {
    stop("`x` and `y` must have the same length", call. = FALSE)
  }
  cross_products <- (x - mean(x)) * (y - mean(y))
  sum(cross_products) / (length(x) - 1)
}

# ---- 1. Cereal: sugar vs. calories ----
# Load the workbook; the bare file name resolves against the working directory.
cereal <- read_excel("Cereals.xlsx")
x <- cereal$Sugar
y <- cereal$Calories

# Sample covariance from the hand-written helper: computed once, then printed.
covar_of_cereal <- covariance(x, y)
covar_of_cereal

# Scatter plot of calories against sugar.
cereal %>%
  ggplot(aes(Sugar, Calories)) +
  theme_bw() +
  geom_point()

# Pearson r derived from the covariance: r = cov(x, y) / (sd(x) * sd(y)).
r_of_cereal <- covar_of_cereal / (sd(x) * sd(y))
cor(x, y)
# Compare the manual r with cor() using a numeric tolerance; rounding both
# sides and testing with == can disagree at a rounding boundary.
isTRUE(all.equal(r_of_cereal, cor(x, y))) # expected TRUE
cor(x, y, method = "spearman")
r_sqrd <- cor(x, y)^2 # [1] 0.8561823
# Simple linear regression of calories on sugar.
fit <- lm(y ~ x)
summary(fit)

# ---- 2. College football: net revenue vs. total pay ----
football <- read_excel("College Football.xlsx")
glimpse(football)

# Keep the two columns of interest; select(new = old) renames while selecting.
ds02 <- football %>%
  select(ttl_pay = `Total Pay`, revenue = `Football Net Revenue`)

# Scatter plot of total pay against net revenue.
ds02 %>%
  ggplot(aes(revenue, ttl_pay)) +
  theme_bw() +
  geom_point()

x <- ds02$revenue
y <- ds02$ttl_pay
# Covariance via the covariance() helper defined earlier in this script,
# instead of repeating its arithmetic inline.
covar_of_football <- covariance(x, y)
# Pearson r derived from the covariance.
r_of_football <- covar_of_football / (sd(x) * sd(y))
cor(x, y)
cor(x, y, method = "spearman")
# Tolerance-based comparison of the manual r with cor().
isTRUE(all.equal(r_of_football, cor(x, y))) # expected TRUE
# r^2 from the football vectors x and y assigned above. The previous
# cor(cereal$x, cereal$y) referred to columns this script never creates.
r_sqrd <- cor(x, y)^2

# ---- 3. HDL cholesterol: age vs. cholesterol ----
hdl <- read_excel("HDL_cholesterol.xlsx")

# Scatter plot of cholesterol against age.
hdl %>%
  ggplot(aes(x = Age, y = Cholesterol)) +
  theme_bw() +
  geom_point()

x <- hdl$Age
y <- hdl$Cholesterol
# Covariance via the covariance() helper defined earlier in this script,
# instead of repeating its arithmetic inline.
covar_of_hdl <- covariance(x, y)
# Pearson r derived from the covariance.
r_of_hdl <- covar_of_hdl / (sd(x) * sd(y))
cor(x, y)
cor(x, y, method = "spearman")
# Tolerance-based comparison of the manual r with cor(); rounding both sides
# and testing with == can disagree at a rounding boundary.
isTRUE(all.equal(r_of_hdl, cor(x, y))) # expected TRUE

# ---- 4. MRI count vs. IQ ----
# MRI count is treated as the explanatory variable, IQ as the response.
mri <- read_excel(path = "MRI_IQ.xlsx")
#' Powers of ten spanning the range of `x`.
#'
#' Returns 10^k for every integer k from floor(log10(min(x))) to
#' ceiling(log10(max(x))), so the first break is at or below the smallest
#' value and the last break is at or above the largest.
#'
#' @param x Numeric vector; assumed strictly positive because log10() is
#'   applied to its minimum and maximum.
#' @return Numeric vector of consecutive powers of ten.
breaks_log10 <- function(x) {
  # The lower bound previously used base 8, which could place the first break
  # above min(x); both bounds are now in base 10 like the 10^ step below.
  low <- floor(log10(min(x)))
  high <- ceiling(log10(max(x)))
  10^seq.int(low, high)
}

# Scatter plot of IQ against MRI count, with point shape mapped to GENDER.
mri %>%
  ggplot(aes(x = MRI_COUNT, y = IQ, shape = GENDER)) +
  scale_shape_manual(values = c(24, 16)) +
  geom_point()

# Pearson correlation of MRI count and IQ within each gender subset.
female <- filter(mri, GENDER == "F")
with(female, cor(MRI_COUNT, IQ))
male <- filter(mri, GENDER == "M")
with(male, cor(MRI_COUNT, IQ))

# ---- 5. Baseball: correlation matrices ----
library(lsr)
baseball <- read_excel(path = "Baseball P14.xlsx")
# correlate() on the full data set (expected to come from lsr, attached above).
correlate(baseball)
# Base cor() restricted to the numeric columns.
cor(Filter(is.numeric, baseball))

# ---- 8. Judging: correlation between the two judges' scores ----
judging <- read_excel(path = "Judging.xlsx")
# Rank-based (Spearman) correlation first, then linear (Pearson).
with(judging, cor(Judge_1_Score, Judge_2_Score, method = "spearman"))
with(judging, cor(Judge_1_Score, Judge_2_Score, method = "pearson"))
	library(readxl)
	library(tidyverse)
	library(psych)
	library(scales)
	# NOTE(review): machine-specific absolute path. The read_excel() calls in this
	# script pass bare file names, so they resolve against this directory.
	setwd("C:/Users/tradingbills/Documents/_exer/_data/math/wk4/")

	# 1 compute the covariance
	# COV = Sum( (x_i - x_bar) * (y_i - y_bar) ) / (N - 1)
	# COVARIANCE
	#' Sample covariance of two vectors.
	#'
	#' Computes sum((x - mean(x)) * (y - mean(y))) / (n - 1), where n is
	#' length(x). Missing values are not removed, so an NA in either input
	#' propagates to the result.
	#'
	#' @param x Numeric vector.
	#' @param y Numeric vector with the same length as `x`.
	#' @return A single number: the sample covariance of `x` and `y`.
	covariance <- function(x, y) {
	  # Without this check a shorter vector is silently recycled and the function
	  # returns a meaningless number instead of signalling the mistake.
	  if (length(x) != length(y)) {
	    stop("`x` and `y` must have the same length", call. = FALSE)
	  }
	  cross_products <- (x - mean(x)) * (y - mean(y))
	  sum(cross_products) / (length(x) - 1)
	}

	# ---- 1. Cereal: sugar vs. calories ----
	# Load the workbook; the bare file name resolves against the working directory.
	cereal <- read_excel("Cereals.xlsx")
	x <- cereal$Sugar
	y <- cereal$Calories

	# Sample covariance from the hand-written helper: computed once, then printed.
	covar_of_cereal <- covariance(x, y)
	covar_of_cereal

	# Scatter plot of calories against sugar.
	cereal %>%
	  ggplot(aes(Sugar, Calories)) +
	  theme_bw() +
	  geom_point()

	# Pearson r derived from the covariance: r = cov(x, y) / (sd(x) * sd(y)).
	r_of_cereal <- covar_of_cereal / (sd(x) * sd(y))
	cor(x, y)
	# Compare the manual r with cor() using a numeric tolerance; rounding both
	# sides and testing with == can disagree at a rounding boundary.
	isTRUE(all.equal(r_of_cereal, cor(x, y))) # expected TRUE
	cor(x, y, method = "spearman")
	r_sqrd <- cor(x, y)^2 # [1] 0.8561823
	# Simple linear regression of calories on sugar.
	fit <- lm(y ~ x)
	summary(fit)

	# ---- 2. College football: net revenue vs. total pay ----
	football <- read_excel("College Football.xlsx")
	glimpse(football)

	# Keep the two columns of interest; select(new = old) renames while selecting.
	ds02 <- football %>%
	  select(ttl_pay = `Total Pay`, revenue = `Football Net Revenue`)

	# Scatter plot of total pay against net revenue.
	ds02 %>%
	  ggplot(aes(revenue, ttl_pay)) +
	  theme_bw() +
	  geom_point()

	x <- ds02$revenue
	y <- ds02$ttl_pay
	# Covariance via the covariance() helper defined earlier in this script,
	# instead of repeating its arithmetic inline.
	covar_of_football <- covariance(x, y)
	# Pearson r derived from the covariance.
	r_of_football <- covar_of_football / (sd(x) * sd(y))
	cor(x, y)
	cor(x, y, method = "spearman")
	# Tolerance-based comparison of the manual r with cor().
	isTRUE(all.equal(r_of_football, cor(x, y))) # expected TRUE
	# r^2 from the football vectors x and y assigned above. The previous
	# cor(cereal$x, cereal$y) referred to columns this script never creates.
	r_sqrd <- cor(x, y)^2

	# ---- 3. HDL cholesterol: age vs. cholesterol ----
	hdl <- read_excel("HDL_cholesterol.xlsx")

	# Scatter plot of cholesterol against age.
	hdl %>%
	  ggplot(aes(x = Age, y = Cholesterol)) +
	  theme_bw() +
	  geom_point()

	x <- hdl$Age
	y <- hdl$Cholesterol
	# Covariance via the covariance() helper defined earlier in this script,
	# instead of repeating its arithmetic inline.
	covar_of_hdl <- covariance(x, y)
	# Pearson r derived from the covariance.
	r_of_hdl <- covar_of_hdl / (sd(x) * sd(y))
	cor(x, y)
	cor(x, y, method = "spearman")
	# Tolerance-based comparison of the manual r with cor(); rounding both sides
	# and testing with == can disagree at a rounding boundary.
	isTRUE(all.equal(r_of_hdl, cor(x, y))) # expected TRUE

	# ---- 4. MRI count vs. IQ ----
	# MRI count is treated as the explanatory variable, IQ as the response.
	mri <- read_excel(path = "MRI_IQ.xlsx")
	#' Powers of ten spanning the range of `x`.
	#'
	#' Returns 10^k for every integer k from floor(log10(min(x))) to
	#' ceiling(log10(max(x))), so the first break is at or below the smallest
	#' value and the last break is at or above the largest.
	#'
	#' @param x Numeric vector; assumed strictly positive because log10() is
	#'   applied to its minimum and maximum.
	#' @return Numeric vector of consecutive powers of ten.
	breaks_log10 <- function(x) {
	  # The lower bound previously used base 8, which could place the first break
	  # above min(x); both bounds are now in base 10 like the 10^ step below.
	  low <- floor(log10(min(x)))
	  high <- ceiling(log10(max(x)))
	  10^seq.int(low, high)
	}

	# Scatter plot of IQ against MRI count, with point shape mapped to GENDER.
	mri %>%
	  ggplot(aes(x = MRI_COUNT, y = IQ, shape = GENDER)) +
	  scale_shape_manual(values = c(24, 16)) +
	  geom_point()

	# Pearson correlation of MRI count and IQ within each gender subset.
	female <- filter(mri, GENDER == "F")
	with(female, cor(MRI_COUNT, IQ))
	male <- filter(mri, GENDER == "M")
	with(male, cor(MRI_COUNT, IQ))

	# ---- 5. Baseball: correlation matrices ----
	library(lsr)
	baseball <- read_excel(path = "Baseball P14.xlsx")
	# correlate() on the full data set (expected to come from lsr, attached above).
	correlate(baseball)
	# Base cor() restricted to the numeric columns.
	cor(Filter(is.numeric, baseball))

	# ---- 8. Judging: correlation between the two judges' scores ----
	judging <- read_excel(path = "Judging.xlsx")
	# Rank-based (Spearman) correlation first, then linear (Pearson).
	with(judging, cor(Judge_1_Score, Judge_2_Score, method = "spearman"))
	with(judging, cor(Judge_1_Score, Judge_2_Score, method = "pearson"))