Skip to content

Instantly share code, notes, and snippets.

@smc77
smc77 / cs229_univariate_regression
Created October 3, 2011 00:28
Univariate regression with housing data
# Univariate linear regression on the housing data.
# Dataset description: http://archive.ics.uci.edu/ml/datasets/Housing
# Only columns 6 and 14 of the raw table are kept and given readable names.
housing <- setNames(
  read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data")[, c(6, 14)],
  c("num.rooms", "median.values")
)
# Regress the median value on the number of rooms
housing.lm <- lm(formula = median.values ~ num.rooms, data = housing)
# Scatter plot of the two columns with the fitted line drawn on top
plot(housing)
abline(reg = housing.lm)
# Coefficient table and fit statistics
summary(housing.lm)
@smc77
smc77 / intuitive_regression
Created October 5, 2011 02:01
Fitting various random lines to the housing data to get an intuition about the loss function.
# Overlay two arbitrarily chosen candidate lines on the housing scatter plot
plot(housing)
abline(a = 0, b = 5, col = "red")      # intercept 0, slope 5
abline(a = -50, b = 10, col = "blue")  # intercept -50, slope 10
# Create the loss function
#
# Half the sum of squared residuals of the line
#   median.values = intercept + slope * num.rooms
#
# Arguments:
#   intercept  intercept of the candidate line
#   slope      slope of the candidate line
#   data       object with "num.rooms" and "median.values" columns; defaults
#              to the global `housing`, so existing calls loss(intercept, slope)
#              behave exactly as before
# Returns: a single number, sum(residuals^2) / 2
loss <- function(intercept, slope, data = housing) {
  predicted <- intercept + slope * data[, "num.rooms"]
  residuals <- predicted - data[, "median.values"]
  sum(residuals^2) / 2
}
# Evaluate the loss for the red example line (intercept 0, slope 5)
loss(intercept = 0, slope = 5)
@smc77
smc77 / linear algebra in R
Created October 20, 2011 01:32
Quick linear algebra demo
# Matrix addition (element-wise sum of two 3x2 matrices)
matrix(c(1, 2, 3, 0, 5, 1), nrow = 3) + matrix(c(4, 2, 0, 0.5, 5, 1), nrow = 3)
# Scalar multiplication (every entry of the 3x2 matrix is scaled by 3)
matrix(c(1, 2, 3, 0, 5, 1), nrow = 3) * 3
# Matrix-vector multiplication (3x2 matrix times a length-2 vector)
matrix(c(1, 4, 2, 3, 0, 1), nrow = 3) %*% c(1, 5)
# Matrix-Matrix Multiplication
@smc77
smc77 / multivariate
Created October 22, 2011 23:44
Multivariate Regression
# Multivariate view of the housing data.
# Dataset description: http://archive.ics.uci.edu/ml/datasets/Housing
housing <- setNames(
  read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"),
  c("CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV")
)
# Keep only the columns used by the model
housing <- housing[c("CRIM", "RM", "PTRATIO", "LSTAT", "MEDV")]
# Plot the retained columns against each other
plot(housing)
@smc77
smc77 / multivariate_grad_descent.R
Created October 23, 2011 21:32
Multivariate Gradient Descent
# Iteration budget (presumably consumed by the descent loop, which is not
# visible in this excerpt)
num.iterations <- 1000
# Load the housing data
data <- read.csv("http://www.statalgo.com/wp-content/uploads/2011/10/housing.csv")
# Input features: the "area" and "bedrooms" columns
x <- data[c("area", "bedrooms")]
# Response as a one-column matrix, divided by a thousand so that the numbers
# are in $1000's
y <- matrix(data$price / 1000, ncol = 1)
# Function to standardize input values
zscore <- function(x, mean.val=NA) {
@smc77
smc77 / normal_equation.R
Created October 24, 2011 00:11
Normal Equation
# Normal equation: closed-form least-squares estimate theta = (X'X)^-1 X'y
data <- read.csv("http://www.statalgo.com/wp-content/uploads/2011/10/housing.csv")
# Number of rows in the data; defined here so the snippet runs on its own
# instead of relying on an `m` left over from an earlier session
m <- nrow(data)
# Design matrix: a leading column of ones for the intercept, then the features
x <- as.matrix(cbind(intercept = rep(1, m), data[, c("area", "bedrooms")]))
# Response in $1000's, defined the same way as in the gradient descent snippet
y <- matrix(data$price, ncol = 1) / 1000
# Solve (X'X) theta = X'y directly rather than forming the explicit inverse;
# same estimate, fewer floating-point operations and better conditioning
theta <- solve(crossprod(x), crossprod(x, y))
@smc77
smc77 / logistic_regression.R
Created October 26, 2011 01:45
Logistic Regression
# Plot the sigmoid function 1 / (1 + exp(-z)) for z = -10, ..., 10
library(ggplot2)
# ggplot() + geom_line() replaces qplot(), which is deprecated in ggplot2
ggplot(data.frame(z = -10:10), aes(x = z, y = 1 / (1 + exp(-z)))) +
  geom_line() +
  labs(x = "z", y = "sigmoid function")
# Download South African heart disease data
# (comma-separated, header row present, first column used as row names)
sa.heart <- read.table(
  "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data",
  sep = ",", header = TRUE, row.names = 1
)
# Pretty plot: pairwise scatter plots of the first nine columns, with the
# point fill chosen by the level of chd (first level red, second level green)
pairs(sa.heart[1:9], pch = 21, bg = c("red", "green")[factor(sa.heart$chd)])
@smc77
smc77 / logistic_regression_multi.R
Created October 28, 2011 03:28
Multiclass Logistic Regression
# Plot the data: pairwise scatter plots of the four measurements, with the
# point fill chosen by species
pairs(iris[1:4],
      main = "Anderson's Iris Data -- 3 species",
      pch = 21,
      bg = c("red", "green3", "blue")[unclass(iris$Species)])
# Use linear discriminant analysis
# lda() is provided by the MASS package, which this snippet never attached;
# the call is qualified so it works without relying on session state
iris.lda <- MASS::lda(Species ~ ., data = iris)
summary(iris.lda)
# Use a multinomial logistic regression model
library(VGAM)
# The multinomial family belongs to VGAM and must be fitted with vglm(), not
# glm(); the formula needs `.` on the right-hand side to use all predictors
# (the original `Species ~ ,` did not parse)
iris.vglm <- vglm(Species ~ ., family = multinomial, data = iris)
@smc77
smc77 / overfitting.R
Created November 4, 2011 02:05
Overfitting
library(PolynomF)
# Number of sample points
n <- 10
# Underlying curve: sin(2 * pi * x), i.e. one full sine period over [0, 1]
f <- function(x) {
  sin(2 * pi * x)
}
# n evenly spaced inputs from 0 to 1
x <- seq(from = 0, to = 1, length.out = n)
# Observations: the curve plus Gaussian noise with standard deviation 0.2
y <- f(x) + rnorm(n, sd = 0.2)
# Scatter plot of the noisy sample
plot(data.frame(x = x, y = y))
@smc77
smc77 / polynomial_generalization.R
Created November 9, 2011 03:22
Generalization
#
# Compare how the different models generalize from one dataset to another
#
# Sample sizes (presumably for the training and test sets built further down,
# outside this excerpt)
n.training <- 10
n.test <- 100

# Half the sum of squared differences between predictions and targets.
#   y       observed values
#   y.pred  predicted values
error.function <- function(y, y.pred) {
  residuals <- y.pred - y
  sum(residuals^2) / 2
}

# Root-mean-square error, computed from error.function as sqrt(2 * E / N)
# with N = length(y).
#   y       observed values
#   y.pred  predicted values
e.rms <- function(y, y.pred) {
  half.sse <- error.function(y = y, y.pred = y.pred)
  sqrt(2 * half.sse / length(y))
}