Skip to content

Instantly share code, notes, and snippets.

@jpotts18
Created August 28, 2015 01:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jpotts18/e725cf67bc0e06c9c49e to your computer and use it in GitHub Desktop.
Save jpotts18/e725cf67bc0e06c9c49e to your computer and use it in GitHub Desktop.
###########################
# Identify the question
###########################
# Identify the question you are trying to solve
# Which car is the most undervalued?
# Are we trying to maximize or minimize a variable?
###########################
# Understand the data
###########################
# Understand what the variables mean
# Variable = Description
# Id = Record_ID
# Model = Model Description
# Price = Offer Price in EUROs
# Age_08_04 = Age in months as of August 2004
# Mfg_Month = Manufacturing month (1-12)
# Mfg_Year = Manufacturing Year
# KM = Accumulated Kilometers on odometer
# Fuel_Type = Fuel Type (Petrol, Diesel, CNG)
# HP = Horse Power
# Met_Color = Metallic Color? (Yes=1, No=0)
# Color = Color (Blue, Red, Grey, Silver, Black, etc.)
# Automatic = Automatic (Yes=1, No=0)
# CC = Cylinder Volume in cubic centimeters
# Doors = Number of doors
# Cylinders = Number of cylinders
# Gears = Number of gear positions
# Quarterly_Tax = Quarterly road tax in EUROs
# Weight = Weight in Kilograms
# Mfr_Guarantee = Within Manufacturer's Guarantee period (Yes=1, No=0)
# BOVAG_Guarantee = BOVAG (Dutch dealer network) Guarantee (Yes=1, No=0)
# Guarantee_Period = Guarantee period in months
# ABS = Anti-Lock Brake System (Yes=1, No=0)
# Airbag_1 = Driver_Airbag (Yes=1, No=0)
# Airbag_2 = Passenger Airbag (Yes=1, No=0)
# Airco = Airconditioning (Yes=1, No=0)
# Automatic_airco = Automatic Airconditioning (Yes=1, No=0)
# Boardcomputer = Boardcomputer (Yes=1, No=0)
# CD_Player = CD Player (Yes=1, No=0)
# Central_Lock = Central Lock (Yes=1, No=0)
# Powered_Windows = Powered Windows (Yes=1, No=0)
# Power_Steering = Power Steering (Yes=1, No=0)
# Radio = Radio (Yes=1, No=0)
# Mistlamps = Mistlamps (Yes=1, No=0)
# Sport_Model = Sport Model (Yes=1, No=0)
# Backseat_Divider = Backseat Divider (Yes=1, No=0)
# Metallic_Rim = Metallic Rim (Yes=1, No=0)
# Radio_cassette = Radio Cassette (Yes=1, No=0)
# Parking_Assistant = Parking assistance system (Yes=1, No=0)
# Tow_Bar = Tow Bar (Yes=1, No=0)
###########################
# Explore the data
###########################
# set working directory to be next to data
# Load the data set. The file is looked up relative to the current working
# directory, so start R next to 'toyota-corolla.csv'.
raw.data <- read.csv("toyota-corolla.csv")

# View() opens a spreadsheet-style viewer; only call it in an interactive
# session so the script can also be run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(raw.data)
}

# What types of variables are in this data set?
str(raw.data)
# We won't be able to use all of these variables. Why?

# Passing the data frame through `data =` keeps the axis labels as plain
# column names instead of `raw.data$...`.
plot(Price ~ Mfg_Year, data = raw.data)

# Keep columns 3 to 7 of the raw data; the plots below expect them to include
# Price, Age_08_04, Mfg_Month, Mfg_Year and KM.
# NOTE: this variable shares its name with base::subset(); the name is kept
# because the rest of the script refers to it.
subset <- raw.data[, 3:7]
plot(subset)

plot(KM ~ Age_08_04, data = subset)
# What can we learn from this correlation?

plot(Price ~ KM, data = subset)
# What can we learn from this?

plot(subset)
plot(Age_08_04 ~ Mfg_Year, data = subset)
plot(Age_08_04 ~ Mfg_Month, data = subset)
# Can we learn anything from this data?

plot(subset)

# Re-select the columns used for modelling (positions 3, 4, 7, 9 and 13 to 18
# of the raw data). The regressions that follow read Price, KM, Age_08_04, HP,
# CC, Doors, Cylinders, Gears, Quarterly_Tax and Weight from this data frame.
subset <- raw.data[, c(3:4, 7, 9, 13:18)]
plot(subset)
# Run one linear regression: Price explained by KM alone.
plot(Price ~ KM, data = subset)

# Column names in the formula plus `data =` (rather than `subset$Price ~
# subset$KM`) give clean coefficient names and allow the fitted model to be
# used with predict() on new data.
price_km_linear_regression <- lm(Price ~ KM, data = subset)

# Plot this line on the graph
abline(price_km_linear_regression, col = "red")
summary(price_km_linear_regression)
# What do all of these numbers mean?
# Intercept?
# KM Estimate
# Multiple R-squared
# 32% explained by KM (original author's note on the R-squared value)

# Standard lm diagnostic plots for the fitted model.
plot(price_km_linear_regression)
# Two-factor model: KM + Weight. Stored under its own name so that it is not
# overwritten by the KM + Age_08_04 model below and the two can be compared.
mlr.2factors.weight <- lm(Price ~ KM + Weight, data = subset)
summary(mlr.2factors.weight)

# Two-factor model: KM + Age_08_04.
mlr.2factors <- lm(Price ~ KM + Age_08_04, data = subset)
summary(mlr.2factors)

# Three-factor model: KM + Age_08_04 + Weight.
mlr.3factors <- lm(Price ~ KM + Age_08_04 + Weight, data = subset)
summary(mlr.3factors)
# Why weight?
# Run an MLR on every numeric predictor selected for modelling.
# Terms are written as column names and resolved through `data = subset`, so
# the coefficient table shows plain names and the model works with predict().
mlr.allfactors <- lm(
  Price ~
    KM +
    Age_08_04 +
    HP +
    CC +
    Doors +
    Cylinders +
    Gears +
    Quarterly_Tax +
    Weight,
  data = subset
)
summary(mlr.allfactors)
# How many factors do we need to get the most accurate MLR?
# How do we identify which ones give the most predictive power?
# Four-factor model: KM + Age_08_04 + HP + Weight.
# Terms are plain column names resolved through `data = subset`; with
# `subset$KM`-style terms predict() could not match the columns of `new_data`.
mlr.4factors <- lm(Price ~ KM + Age_08_04 + HP + Weight, data = subset)
summary(mlr.4factors)
# Are the extra factors worth it?
# .8614 - 4 factors
# .863 - All factors

# How do we calculate the predicted vs actual
coef(mlr.4factors)

# Distribution of the residuals (observed Price minus fitted Price).
plot(density(resid(mlr.4factors)))
resid(mlr.4factors)

# Store each car's residual next to its data.
subset["residual"] <- resid(mlr.4factors)

# A hypothetical car to price with the fitted model; the column names must
# match the predictor names used in the formula above.
new_data <- data.frame(KM = 1000, Age_08_04 = 15, HP = 100, Weight = 1200)
predict(mlr.4factors, newdata = new_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment