# R code library
##########################################
#### DATA MANIPULATION WITH DPLYR ########
#### from datacamp course ################
# All examples here: https://rstudio-pubs-static.s3.amazonaws.com/98285_aee86145ffc746e685b8792c7cf8a73d.html
##########################################
## SUMMARY: THE 5 VERBS ######
# select = subset columns
# mutate = new cols
# filter = subset rows
# arrange = reorder rows
# summarise
# use GROUP_BY with summarise or mutate to get group calculations
# pipe results with %>%
## Can use these with dataframes, datatables or databases ##
# Additional summary functions within dplyr
#first(x) - The first element of vector x.
#last(x) - The last element of vector x.
#nth(x, n) - The nth element of vector x.
#n() - The number of rows in the data.frame or group of observations that summarise() describes.
#n_distinct(x) - The number of unique values in vector x.
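# A quick end-to-end illustration of the verbs + group_by (my own sketch using the built-in
# mtcars data; requires dplyr, which is installed/loaded below):
mtcars %>%
  select(mpg, cyl, wt) %>%                  # subset columns
  mutate(wt_kg = wt * 453.6) %>%            # new column (wt is in 1000 lbs)
  filter(mpg > 20) %>%                      # subset rows
  arrange(desc(mpg)) %>%                    # reorder rows
  group_by(cyl) %>%                         # group...
  summarise(n = n(), avg_mpg = mean(mpg))   # ...and summarise per group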
#install.packages("BH", lib='c:\\notbackedup\\R\\') # dplyr requirement
library("BH", lib='c:\\notbackedup\\R\\')
install.packages("ggplot2", lib='c:\\notbackedup\\R\\')
library("ggplot2", lib='c:\\notbackedup\\R\\')
install.packages("ROCR", lib='c:\\notbackedup\\R\\')
library("ROCR", lib='c:\\notbackedup\\R\\')
install.packages("party", lib='c:\\notbackedup\\R\\')
library("party", lib='c:\\notbackedup\\R\\')
install.packages("dplyr", lib='c:\\notbackedup\\R\\')
library(dplyr, lib='c:\\notbackedup\\R\\')
install.packages("hflights", lib='\\notbackedup\\R\\')
library(hflights, lib='\\notbackedup\\R\\')
summary(hflights)
## TBL instead of DATA.FRAME = faster?
# tbl - more convenient for displaying data.frames
# convert with tbl_df()
hflights <- tbl_df(hflights)
hflights
#### cleanup dataset
## carrier name
lut <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental", "DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways", "WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier", "FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa")
# Use lut to translate the UniqueCarrier column of hflights
hflights$UniqueCarrier = lut[hflights$UniqueCarrier]
# Inspect the resulting raw values of your variables
glimpse(hflights)
## more codes
# Build the lookup table
lut <- c("A" = "carrier", "B" = "weather", "C" = "FFA" , "D" = "security", "E" = "not cancelled")
# Use the lookup table to create a vector of code labels. Assign the vector to the CancellationCode column of hflights
hflights$CancellationCode = lut[hflights$CancellationCode]
# Inspect the resulting raw values of your variables
glimpse(hflights)
#### THE 5 VERBS: SELECT & MUTATE ######
# select = remove columns
# mutate = new cols
# filter = removes rows
# arrange = reorder rows
# summarise
############### SELECT() ###############
## SELECT HELPER FUNCTIONS
#dplyr provides 6 helper functions, each of which only works when used inside select().
#starts_with("X"): every name that starts with "X",
#ends_with("X"): every name that ends with "X",
#contains("X"): every name that contains "X",
#matches("X"): every name that matches "X", which can be a regular expression,
#num_range("x", 1:5): the variables named x01, x02, x03, x04 and x05,
#one_of(x): every name that appears in x, which should be a character vector.
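# e.g. a few quick illustrations of the helpers (hflights loaded above):
select(hflights, starts_with("Taxi"))    # TaxiIn, TaxiOut
select(hflights, contains("Taxi"))       # same two columns
select(hflights, ends_with("Delay"))     # ArrDelay, DepDelay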
# select(df, Group, Sum)
hflights2 = select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
?select
## CHAPTER 2 Ex1
# hflights is pre-loaded as a tbl, together with the necessary libraries.
# Return a copy of hflights that contains the four columns related to delay.
hflights2 = select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
# print hflights, nothing has changed!
print(hflights)
# Return a copy of hflights containing the columns Origin up to Cancelled
hflights2 = select(hflights, Origin:Cancelled)
# Answer to last question: be concise!
#colnames(hflights)
hflights2 = select(hflights, Year:DayOfWeek,ArrDelay:Diverted )
## Chapter 2 Ex2
# As usual, hflights is pre-loaded as a tbl, together with the necessary libraries.
# Return a tbl containing just ArrDelay and DepDelay
hflights2 = select(hflights, ArrDelay, DepDelay)
# Return a tbl as described in the second exercise, using both helper functions and variable names
hflights2 = select(hflights, UniqueCarrier:TailNum, Cancelled:CancellationCode)
# Return a tbl as described in the third exercise, using only helper functions.
hflights2 = select(hflights, contains('Time'), contains('Delay'))
## Chap 2 ex3
# both hflights and dplyr are available
# Exercise 1
ex1r <- hflights[c("TaxiIn","TaxiOut","Distance")]
ex1d <- select(hflights,TaxiIn, TaxiOut,Distance)
# Exercise 2
ex2r <- hflights[c("Year","Month","DayOfWeek","DepTime","ArrTime")]
ex2d <- select(hflights, Year:ArrTime, -DayofMonth)
# Exercise 3
ex3r <- hflights[c("TailNum","TaxiIn","TaxiOut")]
ex3d <- select(hflights,TailNum, TaxiIn,TaxiOut)
colnames(hflights)
#### MUTATE()
# mutate(tbl, z = x - y, c = a - b, ...)
colnames(hflights)
# Chapter 2 ex 4
# hflights and dplyr are loaded and ready to serve you.
# Add the new variable ActualGroundTime to a copy of hflights and save the result as g1.
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)
# Add the new variable GroundTime to a copy of g1 and save the result as g2.
g2 <- mutate(g1, GroundTime = TaxiIn + TaxiOut)
# Add the new variable AverageSpeed to a copy of g2 and save the result as g3.
g3 <- mutate(g2, AverageSpeed = Distance / AirTime * 60)
# Chapter 2 ex 5
# hflights and dplyr are ready, are you?
# Add a second variable loss_percent to the dataset and save the result to m1.
m1 <- mutate(hflights, loss = ArrDelay - DepDelay, loss_percent = (ArrDelay - DepDelay) / DepDelay * 100)
# Remove the redundancy from your previous exercise and reuse loss to define the loss_percent variable.
# Assign the result to m2
m2 <- mutate(m1, loss = ArrDelay - DepDelay, loss_percent = loss / DepDelay * 100)
# Add the three variables as described in the third exercise and save the result to m3
m3 <- mutate(hflights, TotalTaxi = TaxiIn + TaxiOut, ActualGroundTime = ActualElapsedTime - AirTime, Diff = TotalTaxi - ActualGroundTime)
#### Section 5: FILTER rows #####
# form: filter(tablename, logical tests)
# use ?comparison to see operators
install.packages("dplyr")
library(dplyr)
# hflights is at your disposal as a tbl, with clean carrier names
install.packages("hflights")
library(hflights)
colnames(hflights)
# hflights is at your disposal as a tbl, with clean carrier names
colnames(hflights)
head(hflights)
# All flights that traveled 3000 miles or more
filter(hflights, Distance >= 3000)
# All flights flown by one of JetBlue, Southwest, or Delta
filter(hflights, UniqueCarrier %in% c('JetBlue', 'Southwest', 'Delta'))
# All flights where taxiing took longer than flying
filter(hflights, (TaxiIn+TaxiOut)>AirTime)
## Combine with booleans
# & (and), | (or), and ! (not)
# for NOT: filter(df, !is.na(x))
#filter(df, a > b & c > d)
#filter(df, a > b, c > d)
# hflights is at your service as a tbl!
# All flights that departed before 5am or arrived after 10pm
filter(hflights, DepTime< 500 | ArrTime>2200)
# All flights that departed late but arrived ahead of schedule
filter(hflights, DepDelay>0 & ArrDelay<0)
# All cancelled weekend flights
filter(hflights, Cancelled==1 & DayOfWeek %in% c(6,7))
# All flights that were cancelled after being delayed
filter(hflights, DepDelay>0 & Cancelled==1)
# hflights is already available in the workspace
# Select the flights that had JFK as their destination: c1
c1 = filter(hflights, Dest == 'JFK')
# Combine the Year, Month and DayofMonth variables to create a Date column: c2
c2 = mutate(c1, Date = paste(Year,Month,DayofMonth,sep="-"))
# Print out a selection of columns of c2
select(c2, Date, DepTime, ArrTime, TailNum)
filter(hflights, Distance>1000 & (TaxiIn+TaxiOut)<15 & DayOfWeek %in% c(6,7))
##### Section 6: ARRANGE = reorder dataframe rows
### arrange(tbl, columns) - by default in ascending
## arrange(tbl, col1, col2)
# for descending, wrap var in desc(): arrange(tbl, desc(col1))
# dplyr and the hflights tbl are available
# Definition of dtc
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay))
# Arrange dtc by departure delays
arrange(dtc, DepDelay)
# Arrange dtc so that cancellation reasons are grouped
arrange(dtc, CancellationCode)
# Arrange dtc according to carrier and departure delays
arrange(dtc, UniqueCarrier, DepDelay)
# dplyr and the hflights tbl are available
# Arrange according to carrier and decreasing departure delays
arrange(hflights, UniqueCarrier, desc(DepDelay))
# Arrange flights by total delay (normal order).
arrange(hflights, (ArrDelay+DepDelay))
# Keep flights to DFW that departed before 8am and arrange according to decreasing AirTime
arrange(filter(hflights, Dest == 'DFW' & DepTime < 800), desc(AirTime))
### SECTION 7: SUMMARISE
# summarise(tbl, newcol = expression(col), ...)
# sum, mean, var
# functions must take a vector and return a single number
# min(x) - minimum value of vector x.
# max(x) - maximum value of vector x.
# mean(x) - mean value of vector x.
# median(x) - median value of vector x.
# quantile(x, p) - pth quantile of vector x.
# sd(x) - standard deviation of vector x.
# var(x) - variance of vector x.
# IQR(x) - Inter Quartile Range (IQR) of vector x.
# diff(range(x)) - total range of vector x.
# hflights and dplyr are loaded in the workspace
# Print out a summary with variables min_dist and max_dist
summarise(hflights, min_dist = min(Distance), max_dist = max(Distance))
# Print out a summary with variable max_div
summarise(filter(hflights, Diverted==1), max_div = max(Distance))
# hflights is available
# Remove rows that have NA ArrDelay: temp1
temp1 = filter(hflights, !is.na(ArrDelay))
# Generate summary about ArrDelay column of temp1
summarise(temp1, earliest = min(ArrDelay), average = mean(ArrDelay), latest = max(ArrDelay), sd = sd(ArrDelay))
# Keep rows that have no NA TaxiOut and no NA TaxiIn: temp2
temp2 = filter(hflights, !is.na(TaxiOut) & !is.na(TaxiIn))
# Print the maximum taxiing difference of temp2 with summarise()
summarise(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
##### summary functions within dplyr
#first(x) - The first element of vector x.
#last(x) - The last element of vector x.
#nth(x, n) - The nth element of vector x.
#n() - The number of rows in the data.frame or group of observations that summarise() describes.
#n_distinct(x) - The number of unique values in vector x.
# hflights is available with full names for the carriers
#head(hflights)
# Generate summarizing statistics for hflights
summarise(hflights, n_obs = n(), n_carrier = n_distinct(UniqueCarrier), n_dest = n_distinct(Dest), dest100 = nth(Dest,100))
# Filter hflights to keep all American Airline flights: aa
aa = filter(hflights, UniqueCarrier=='American')
# Generate summarizing statistics for aa
summarise(aa, n_flights = n(),
n_canc = sum(Cancelled == 1),
p_canc = mean(Cancelled == 1) * 100,
avg_delay = mean(ArrDelay, na.rm = TRUE))
########## SECTION 8: CHAINING WITH PIPE OPERATOR
# magrittr inside dplyr
# %>% passes the object on the left to the function on the right as its FIRST ARGUMENT
# eg. c(1, 2, 3, NA) %>% mean(na.rm = TRUE)
# Write the 'piped' version of the English sentences.
hflights %>% mutate(diff = TaxiOut - TaxiIn) %>% filter(!is.na(diff)) %>% summarise(avg = mean(diff))
# hflights is pre-loaded
# Build data frame with 4 columns of hflights and 2 self-defined columns: d
d = hflights %>%
select(Dest, UniqueCarrier, Distance, ActualElapsedTime) %>%
mutate(RealTime = ActualElapsedTime + 100, mph = Distance/RealTime*60)
head(d)
# Filter and summarise d according to the instructions
filter(d, !is.na(mph) & mph<70) %>%
summarise(n_less = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance))
hflights %>%
select(Dest, Cancelled, Distance, ActualElapsedTime, Diverted) %>%
mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%
filter(mph < 105 | Cancelled == 1 | Diverted == 1) %>%
summarise( n_non = n(),
p_non = n_non / nrow(hflights) * 100,
n_dest = n_distinct(Dest),
min_dist = min (Distance),
max_dist = max(Distance))
# Count the number of overnight flights
hflights %>%
filter(!is.na(DepTime), !is.na(ArrTime), DepTime > ArrTime) %>%
summarise(n = n())
#### SECTION 9: GROUP BY in SUMMARISE
# group_by(tbl, group_var)
# Attaches group metadata to dataset which summarise can use
# Make an ordered per-carrier summary of hflights
hflights %>%
group_by(UniqueCarrier) %>%
summarise(n_flights = n(),
n_canc = sum(Cancelled == 1),
p_canc = mean(Cancelled == 1) * 100,
avg_delay = mean(ArrDelay, na.rm = TRUE)) %>%
arrange(avg_delay, p_canc)
# Make an ordered per-day summary of hflights
hflights %>%
group_by(DayOfWeek) %>%
summarise(avg_taxi = mean(TaxiIn + TaxiOut, na.rm=TRUE)) %>%
arrange(desc(avg_taxi))
### Using group_by with mutate
# dplyr is loaded, hflights is loaded with fancy carrier names
# Solution to first instruction
hflights %>%
group_by(UniqueCarrier) %>%
filter(!is.na(ArrDelay)) %>%
summarise(p_delay = mean(ArrDelay > 0)) %>%
mutate(rank = rank(p_delay)) %>%
arrange(rank)
# Solution to second instruction
hflights %>%
group_by(UniqueCarrier) %>%
filter(!is.na(ArrDelay), ArrDelay > 0) %>%
summarise(avg = mean(ArrDelay)) %>%
mutate(rank = rank(avg)) %>%
arrange(rank)
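# group_by() also feeds mutate() directly: the per-group value is attached to every row
# instead of collapsing the data. A minimal sketch with hflights:
hflights %>%
  group_by(UniqueCarrier) %>%
  mutate(carrier_avg_delay = mean(ArrDelay, na.rm = TRUE),
         rel_delay = ArrDelay - carrier_avg_delay) %>%
  select(UniqueCarrier, ArrDelay, carrier_avg_delay, rel_delay)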
### ADVANCED COMBINATIONS
# Which plane (by tail number) flew out of Houston the most times? How many times? adv1
adv1 <- hflights %>%
group_by(TailNum) %>%
summarise(n = n()) %>%
filter(n == max(n))
# How many airplanes only flew to one destination from Houston? adv2
adv2 <- hflights %>%
group_by(TailNum) %>%
summarise(ndest = n_distinct(Dest)) %>%
filter(ndest == 1) %>%
summarise(nplanes = n())
# Find the most visited destination for each carrier: adv3
adv3 <- hflights %>%
group_by(UniqueCarrier, Dest) %>%
summarise(n = n()) %>%
mutate(rank = rank(desc(n))) %>%
filter(rank == 1)
# Find the carrier that travels to each destination the most: adv4
adv4 <- hflights %>%
group_by(Dest, UniqueCarrier) %>%
summarise(n = n()) %>%
mutate(rank = rank(desc(n))) %>%
filter(rank == 1)
########### CONNECTING TO DATABASES
## SQL
# set up a src that connects to the mysql database (src_mysql is provided by dplyr)
my_db <- src_mysql(dbname = "dplyr",
host = "dplyr.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "dplyr",
password = "dplyr")
# and reference a table within that src: nycflights is now available as an R object that references the remote nycflights table
nycflights <- tbl(my_db, "dplyr")
# glimpse at nycflights
glimpse(nycflights)
# Calculate the grouped summaries detailed in the instructions
dbsumm <- nycflights %>%
group_by(carrier) %>%
summarise(n_flights = n(), avg_delay = mean(arr_delay)) %>%
arrange(avg_delay)
dbsumm
#####################################
# Swinburne: R programming #
#####################################
#### DISTRIBUTIONS AND MODELLING ###
#####################################
# Methods of FITTING / ESTIMATING
# [1] Classical frequentist
# [1.1] Least squares
# minimise the sum of squared residuals
# usually by finding the point of minimum error, where the gradient is zero,
# by solving a set of linear equations Ax = B
# [1.2] Maximum likelihood: ******
# given a function that calculates "likelihood", which means:
# holding the _data fixed_ and varying the parameters, what is the probability of observing that data
# usually multiply together f(y,theta) for all y
# PICTURE the probability parabola across the various thetas
# OR sum the log likelihoods
# LS vs MLE: they give the same estimates if the error is normally distributed
# LS cannot be used for other distributions, eg binomial
# ...
# ***
# The distribution of ERROR and distribution of Y are independent
# You can have NORMALLY DISTRIBUTED error and non-normal Y
# --> You can use least squares on NON-NORMAL DATA
# ... because you may be adding non-normal variables to get
# a non-normal outcome... and that's fine.
# ==> you'll have normal error
# ***
# ** Hence it is the distribution of ERROR we need to attend to
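# A quick simulated illustration (my own sketch): a skewed X plus NORMAL error gives a
# non-normal Y, yet the lm() residuals are normal - so least squares is still appropriate.
set.seed(1)
xsim <- rexp(1000, rate = 1)                 # skewed, non-normal predictor
ysim <- 2 + 3 * xsim + rnorm(1000, 0, 1)     # Y inherits the skew, but the ERROR is normal
simfit <- lm(ysim ~ xsim)
hist(ysim)                                   # skewed outcome
hist(resid(simfit))                          # residuals look normal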
## Distributions / random variables
# Can be univariate or multivariate
# Every model requires assumptions about probability distribution of residuals.
# We can do Monte Carlo simulations with our distributions
# if residuals are normal and relationship between variables is simple additive
# --> simple linear modelling
## Discrete RVs
# Have a set of values, each has a probability
X = c(30, 40, 50, 60, 70, 80)
P = c(0.3, 0.3, 0.05, 0.05, 0.1, 0.2)
# mean = sum(X*P) # sum product of each value * probability of that value
# or in matrix algebra: mean = t(X)%*%P
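# compute them for the X and P defined above:
sum(X * P)                    # mean
t(X) %*% P                    # same mean via matrix algebra (a 1x1 matrix)
sum((X - sum(X * P))^2 * P)   # variance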
## Continuous RVs
# Must integrate across ranges to get probabilities
# mean = integral of x * p(x) over x
# variance = integral of (x - mu)^2 * p(x) over x
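# e.g. check these numerically with integrate(), using the U(3,5) density from the next example:
integrate(function(x) x * dunif(x, 3, 5), lower = 3, upper = 5)            # mean = 4
integrate(function(x) (x - 4)^2 * dunif(x, 3, 5), lower = 3, upper = 5)    # variance = 1/3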
### Uniform distribution ###
# Every value has same probability
runif(10,3,5) # n, min, max
x = runif(100000,3,5) # n, min, max
x=data.matrix(x)
dim(x)
hist(x)
# compute theoretical mean
(1/2) * (3+5)
# compute empirical mean
mean(x)
# var theoretical
(1/12)*(5-3)^2
# var emprical
var(x)
### Normal ####
# Generate 10,000 N(0,1) numbers (i.e., standard normal).
# no need to specify the mean and variance.
X<-rnorm(10000)
# Generate 10,000 N(30, 10^2) numbers (mean 30, sd 10).
X<-rnorm(10000,30,10)
# checking the histogram.
hist(X)
# checking the mean and variance.
mean(X)
var(X)
### Chi-sq ###
# is the SUM of multiple SQUARED NORMAL R.V's
# = sum of squared normal RVs
# So we could simulate ChiSq with 5 degrees of freedom by generating 5 sets of RNORM, squaring and adding
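# e.g. a quick simulation of that idea (a sketch):
normmat <- matrix(rnorm(5 * 10000), ncol = 5)   # 10,000 rows of 5 standard normals
chisq5 <- rowSums(normmat^2)                    # sum of 5 squared normals per row
hist(chisq5, breaks = 50, freq = FALSE)
curve(dchisq(x, df = 5), add = TRUE, col = "red")   # lines up with chi-sq, 5 df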
#### F-distribution ###
# all ANOVAs are F-tests
# F is defined with 2 independent chi-square variables
# It is result of ratio between the two
x=rf(100,5,2) # number, df of chisq 1, df of chisq 2
hist(x)
#### t-distribution ###
# variance = v/(v-2), where v = df (defined for v > 2)
# as df --> infinity (because sample gets huge), then t becomes normal
# The square of a t is an F
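# e.g. (a sketch): heavier tails at small df, ~normal at large df; and t^2 is an F
xg <- seq(-4, 4, 0.01)
plot(xg, dnorm(xg), type = "l")            # standard normal
lines(xg, dt(xg, df = 3), col = "red")     # fatter tails with 3 df
lines(xg, dt(xg, df = 100), col = "blue")  # 100 df is nearly normal
qt(0.975, df = 10)^2                       # squared t critical value...
qf(0.95, df1 = 1, df2 = 10)                # ...equals the F(1, 10) critical value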
#### Distributions for GLM ####
## to account for residual distributions
## logistic distribution: similar to normal but with fatter tail
# generate 10000 logistic random numbers with location being 0 and scale being 1.
X<-rlogis(10000, 0, 1)
# generate 10000 standard normal random numbers.
Y<-rnorm(10000, 0,1)
# compare their histograms
hist(X, xlim=c(-20,20), col="red")
hist(Y, add=T, col="blue")
## Exponential
# models time BETWEEN events in a Poisson process
# remember not to confuse RATE with MEAN TIME
# generate 10000 random exponential numbers from the example
X<-rexp(10000, rate = 1/30)
hist(X)
## Poisson distribution
# Describes the probability of a number of events occurring in a fixed period of time or space. eg crime
# commonly used for COUNT data
# lambda = expected # of occurrences
# if you have TOO MANY zeroes a Poisson may not fit - consider an alternative (eg a zero-inflated model)
# also, sometimes large count data actually does look normal - in that case, use normal!
# generate 10000 random Poisson numbers
X<-rpois(10000, 1)
Y<-rpois(10000, 10)
Z<-rpois(10000, 20)
# stack their histograms together
hist(X, xlim=c(0,50), col="red")
hist(Y, add=T, col="blue")
hist(Z, add=T, col="green")
# check the mean and variance are indeed very close
mean(X)
var(X)
##### ANOVA #####
library(reshape2)
data(tips, package = "reshape2")
tips
tipAnova <- aov(tip~day+time+sex-1,tips)
summary(tipAnova)
tipAnova$coefficients
# Let’s look at the p-value of gender…
# In this F distribution:
# DF1 = 1, DF2 = 239
# R code for obtaining p-value from an F distribution.
pf(0.820,1,238,lower.tail=FALSE)
# Recall that when DF1 = 1 the F distribution is the same as the square of a t(DF2).
# R code for obtaining p-value from a t distribution.
pt(0.820^0.5,238,lower.tail=FALSE)*2
####### MATRIX ALGEBRA ############
# week 8
# X is a matrix, m x n
# m is rows, n is columns
# if n=1, column vector
# if m=1, row vector
# if m=n, square matrix
# Transpose = X'
# Column Vector
X = matrix(c(3,4,7,8,9,10,3,3),8,1)
X
# Row Vector
X = matrix(c(3,4,7,8,9,10,3,3),1,8)
X
# a (4x2) matrix
X = matrix(c(3,4,7,8,9,10,3,3),4,2)
X
# a (3x3) square matrix
X = matrix(c(3,4,7,8,9,10,3,3,1),3,3)
X
# transpose of X
t(X)
## Multiplying matrices
# must be CONFORMABLE
# (axb)*(bxc) --> col dim of 1st matrix must = row dim of 2nd matrix
X = matrix(c(3,4,7,8,9,10,3,3,1),3,3)
Y = matrix(c(6,8,2,1,2,1,3,2,1),3,3)
O = matrix(c(2,2,3),3,1)
# X and O are conformable (3x1)
X%*%O
# O and X are NOT conformable
O%*%X
# t(O) and X are conformable (1x3)
t(O)%*%X
# X and Y are conformable
X%*%Y
# first element of X%*%Y
# is sumproduct of row1 of X and column1 of Y
sum(X[1,]*Y[,1])
# last element of X%*%Y
# is sumproduct of row3 of X and column3 of Y
sum(X[3,]*Y[,3])
# YX is NOT equal to XY
Y%*%X==X%*%Y
## Matrix inverse
# X %*% X^-1 = I, where I is the identity matrix (X^-1 is the inverse, not the transpose)
# Identity matrix = 1's along diagonal
I = matrix(diag(3),3,3)
I
# solve() finds the inverse
?solve
X = matrix(c(3,4,7,8,9,10,3,3,1),3,3)
# in R, matrix inverse is done through solve().
XI = solve(X)
X%*%XI
# if you foolishly did this
XI = X^-1
XI
# you have inverted every single element in X.
# only square matrices have inverse
# So many X's aren't square: obs * variables, not square
# Hence we use X'X (i.e. t(X) %*% X) to get a square matrix
#################### REGRESSION MODELLING ###########################
# Y = X*Beta + epsilon
# Where X is matrix N * K, B is K x 1
# epsilon is (N x 1). The distribution of epsilon determines
# properties of Y. Here, we assume multivariate normal.
# Beta is what we want to estimate: determines relationship
# between X and Y
data(cars)
attach(cars)
plot(speed,dist,xlim=c(0,25))
regmodel=lm(dist~speed) # add -1 to remove intercept
summary(regmodel)
# generate a sequence of x’s…
x=seq(min(speed),max(speed),1)
ypredicted=regmodel$coefficients[1]+regmodel$coefficients[2]*x
lines(x,ypredicted,col="red")
## Least squares estimation is fundamental method
# Residual e_i = Y_i - Yhat_i
# = the distance between line and observation point
# Now we want to solve to get minimum residuals
# see slides: take the derivative of e_i with respect to B0 and B1
# = two simultaneous equations. Solve for both = 0
# --> solve this with matrix algebra
# beta = (X'X)^-1(X'Y)
data(cars)
attach(cars)
Y = matrix(dist)
dim(Y)
one = matrix(rep(1,50))
X = cbind(one,matrix(speed))
dim(X)
b = solve( t(X)%*%X ) %*% ( t(X)%*%Y )
b
# It is important to recognize that LS
# line of best fit is NOT the only option out there.
# If you choose a different penalty function, you will
# get a different line...
# eg sum of ABSOLUTE residuals instead of sum of SQUARED residuals
# LS estimators are BLUE:
# BLUE: Best Linear Unbiased Estimator.
# Under the following assumptions:
# The regressors are INDEPENDENT of the residuals.
# The regressors are NOT perfectly collinear with each other.
# The residuals are homoscedastic.
# The residuals are uncorrelated.
# **
# We assume *residuals* must be normally distributed
# that DOESN'T MEAN that Y must be normally distributed
# as long as ERROR is normal, we can use least squares
## Electricity example
# Y-variable - electricity usage - is normal with some outliers
# By looking at residuals, we see there is something happening which we're missing
plot(resid(regmodel))
hist(resid(regmodel))
# ... by adding polynomial (square) terms we get a good model
####### GENERALISED LINEAR MODELS ###########
# if residuals aren't normal, may need a link function
# the Beta * X equation stays the same, but the link function applies to Y: g(Y)
# precisely: g(mu), where mu is the *mean* of the response variable
## THE RHS MUST ALWAYS BE BETA * X --> LINK IS APPLIED TO Y ONLY!
?glm
# glm(formula, family = gaussian, data, ...)
# glm usually fit using maximum likelihood
## Family: binomial
# link functions logit & probit
# think of logit in terms of threshold from 0 to 1, with the latent variable being some scale of willingness that trips the threshold
# ln(p/(1-p)) = Beta * X
data <- read.csv("Homicides.csv")
attach(data)
regmodel_logit <- glm(HomicideHotSpot~SOUTH+UE90+DV90+MA90+PS90+RD90, family=binomial("logit"))
summary(regmodel_logit)
# Estimates are on the log-odds (logit) scale
## Family: Poisson
# the count comes from Poisson distribution, where MEAN of distribution
# is defined by linear combination of variables (Beta * X)
# be careful of overdispersion (variance>mean) [variance and mean should be same]
# be careful of zero inflation (too many 0's)
data <- read.csv("ShipWreck.csv")
attach(data)
data
hist(damage)
# examine
xtabs(~damage+type)
xtabs(~damage+construction)
xtabs(~damage+operation)
plot(damage~months)
# notice that we get different counts by cutting data different ways
# --> see which variables are predictive
# ALSO notice that there is a non-linear relationship between damage & month
months2 <- months^2
regmodel_Pois <- glm(damage~type+construction+operation+months+months2-1,family=poisson(link="log"))
summary(regmodel_Pois)
# Note that link function is log!
# BEWARE that we have lots of dummy variables --> We don't need INTERCEPT
# Coefficients: negative = less likely
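# A quick overdispersion check (a sketch, assuming regmodel_Pois fit above):
# residual deviance / residual df should be roughly 1 for a well-behaved Poisson model
deviance(regmodel_Pois) / df.residual(regmodel_Pois)   # well above 1 suggests overdispersion
# if so, consider family = quasipoisson(link = "log") or a negative binomial model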
######## OPTIMISATION #######
# recall that LS = sum of squared residuals, which is a function of betas
# so we vary betas and maximise LS
# Can do this in any problem that you can express as error = ...
# in R, we define a function to optimise, as well as initial values
# we use optim() --> MINIMIZES by default
data <- read.csv("NavigationExample.csv")
attach(data)
# First we must specify the objective function
fn = function(P) sum( ( D - ( (X-P[1])^2 + (Y-P[2])^2 )^0.5 )^2 )
# next we specify initial values/guesses to the problem
initial = c(0,0)
# call on optim to _minimize_ sum of squares
# the first argument is the initial value
# the second argument is the function to be optimized
# the third argument selects the numerical search method
# we will talk about Hessian later.
out = optim(initial, fn, method = "BFGS", hessian = TRUE)
# problem SOLVED…
out$par
# if you want to get a TRACE:
out = optim(initial, fn, method = "L-BFGS-B", hessian = TRUE,control=list(trace=6,REPORT=1))
###### Maximum likelihood ######
# ML is a type of OPTIMIZATION
# ML is ONE option. Sometimes you MUST use ML, other times you can use LS, ..etc
# Y' is a vector of observed values
# Theta' = vector of parameters (Betas)
# Joint density f(Y;Theta)
# Interpreted as EITHER:
# [1] Conditional on fixed 𝚯 it is the probability of sample outcomes 𝐘.
# [2] Conditional on sample outcomes 𝐘 it is a function of 𝚯.
# [2] is LIKELIHOOD function
# LIKELIHOOD
# probability of this data given assumptions = p(data|assumptions)
# = likelihood of each observation
# We're assuming all observations are INDEPENDENT: so we can multiply together
# Given that it is iid, then joint density function = probability of observing this set of values
# = product of marginal probability of each observation
# = f(y1, y2, y3... ; Theta) = f(y1;theta) * f(y2;theta) * ... * f(yn;theta)
# We're looking for values of theta - theta_hat - that maximise likelihood
# To make easier, sum of logs
# We take derivative of log-likelihood (called "score") and set to zero
# Key assumptions of ML estimator [see slides]
# Consistent: as the sample gets large, theta_hat converges to theta
# Asymptotically normal: so we can do hypothesis tests
# --> we verify this with Hessian matrix
# How to do MLE
# First, write down PMF or PDF for your problem: binomial, gaussian, etc
# Replace outcome variable in PMF (eg y) with linear combination of betas * X's
# Work out joint density = product of each observation's likelihood function
# Take log to get log-likelihood
# Now we have an equation tying linear model to probability of observation
# Optim() to find best values of betas
# Here is the data
# carbon disulphide concentration
x = c(1.6907, 1.7242, 1.7552, 1.7842, 1.8113, 1.8369, 1.8610, 1.8839)
# beetles death
y = c(6, 13, 18, 28, 52, 53, 61, 60)
# beetles experimented on
n = c(59, 60, 62, 56, 63, 59, 62, 60)
# Note that the default of “optim” is to minimize…
# so we simply reverse the sign of the log likelihood function derived earlier…
fn = function(B) sum( -y*(B[1]+B[2]*x) + n*log(1+exp(B[1]+B[2]*x)) - lchoose(n, y) )
out = optim(c(-50,20), fn, method = "BFGS", hessian = TRUE)
# Hessian matrix
# Recall that the inverse of the negative of the Hessian
# matrix approximates the variance-covariance matrix of the
# estimated coefficient vector (in a minimization problem you
# don’t have to take the negative of the Hessian).
# use solve() to get inverse
# Don't care about off-diagonals, because those are the COVARIANCES
# Covariances may be interesting if want to understand relationships between variables
sqrt(diag(solve(out$hessian)))
# [1] 5.180705 2.912137
# These values are STANDARD ERRORS of the ESTIMATES
# Therefore, the t-stat for the estimated beta_1 is:
# 34.27037 / 2.912137 ≈ 11.77
## ML with Gaussian
# We assume residuals are normally distributed
# Assume Gaussian pdf
# (x-mu)^2: means we expect x is normally distributed around mu
# for estimation, x is the DEPENDENT variable (y)
# mu is our linear combination of variables
# WHAT ABOUT SIGMA (VARIANCE)? WE MUST ESTIMATE IT TOO!
# include an unknown for sigma in likelihood function
# negative log-likelihood: p[1] = sigma^2, p[2:4] = the betas (assumes y, X (n x 3) and n are already defined)
lnL <- function(p) (n/2)*log(2*pi) + (n/2)*log(p[1]) +
  (1/(2*p[1])) * drop( t(y - X%*%p[2:4]) %*% (y - X%*%p[2:4]) )
out <- optim(c(rep(1,4)), lnL, hessian=TRUE, method = "BFGS")
# estimate / standard error = t-statistic
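# A self-contained sketch using the built-in cars data (my own example, not course data):
# p[1] = sigma^2, p[2:3] = intercept and slope; start near the least-squares answer,
# since for normal errors the MLE and LS estimates coincide.
data(cars)
y <- matrix(cars$dist)
X <- cbind(1, cars$speed)
n <- nrow(X)
negLL <- function(p) (n/2)*log(2*pi) + (n/2)*log(p[1]) +
  (1/(2*p[1])) * sum((y - X %*% p[2:3])^2)
ls_fit <- lm(dist ~ speed, data = cars)
start  <- c(mean(resid(ls_fit)^2), coef(ls_fit))
out <- optim(start, negLL, hessian = TRUE, method = "BFGS")
out$par                          # sigma^2, intercept, slope
coef(ls_fit)                     # betas should match the LS fit
sqrt(diag(solve(out$hessian)))   # standard errors of the estimates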
#####################################
############ CODE LIBRARY ##########
#####################################
### highlights to remember
# plyr functions: mutate (create new variables), slice, filter...
# tapply - to apply a function by group variable
## data for playing with
data(cars)
data(ChickWeight)
##### Working with data & structures [week 2]
## input/output
read.table(filename, header=T, sep=",")
# CSVs
read.csv(file.choose(), header=T)
read.table(file.choose(),header=T, sep=",")
read.delim(file.choose(), header=T,sep=",")
## data types & structures
head(tablename)
dim(tablename)
rownames()
colnames()
head() # from plyr/dplyr
## variable names
names() # to get contents of non-dataframe object - eg contents of linear model object
rename(tablename, newvariablename = oldvariablename) # plyr
## filtering & subsetting (with PLYR)
# filter rows
# from plyr or dplyr
filter(tablename, Time==0)
filter(tablename, Chick>1, Time==0) # multiple filters
# slice rows
slice(tablename, 1:20) # take first 20 rows
# order rows
arrange(tablename, time, desc(weight)) # sort by time THEN by weight (descending)
# select variables
select(tablename, variable1, variable2, ...)
select(tablename, -variable1) # drop variable
# also mutate...
## merging, binding, etc
# combine vectors
rbind() # row bind = stack vertically (add rows)
cbind() # column bind = combine horizontally (add columns)
######### DERIVING variables
### APPLYing functions
apply(x,1,sum) # sum across rows
apply(x,2,sum) # sum down columns
# apply by group variable
tapply(scores, sex, mean) # get mean score by sex
### MUTATING (plyr)
mutate(tablename, newvariable = oldvar1 + oldvar2)
####### Descriptive statistics [week 2] #########
## crosstab
table(gender)
xtabs(~gender+smoker) # or: table(gender,smoker)
table(x,y)
# SAME AS
xtabs(~x+y)
## Describing
library(psych)
describe(tablename)
describe.by(tablename, tablename$gender)
## summarising
###### PLOTTING [week 03]
## histogram
hist(cars$speed, main="Distribution of Car Speeds",
xlab="Speed (mph)")
# use breaks to change bar width
hist(cars$speed, breaks=10)
## boxplot
boxplot(ChickWeight$weight)
boxplot(ChickWeight$weight~ChickWeight$Diet, xlab="Diet", ylab="Weight (grams)")
## scatterplot
plot(cars$speed,cars$dist)
plot(cars$speed,cars$dist, pch=3, col="red")
## line charts
lines(cars$speed,cars$dist)
# change xlims
plot(cars$speed,cars$dist,xlim=c(0,30))
# add vertical or horizontal lines
plot(cars$speed,cars$dist,pch=x)
abline(h=40)
abline(v=10,lty=2)
# legend
legend(5,110,c("First Half", "Second Half"),pch=1:2)
### par() for parameters
## par() for multiple plots
# grid of 4 plots:
par(mfrow = c(2, 2))
plot(cars$speed,cars$dist,pch=x)
plot(cars$speed,cars$dist)
hist(cars$speed)
hist(cars$dist)
## par() for direction of labels
nr.prof <-c(prof.pilots=16,lawyers=11,farmers=10,salesmen=9,physicians=9,
mechanics=6,policemen=6,managers=6,engineers=5,teachers=4,
housewives=3,students=3,armed.forces=1)
par(las = 3)
barplot(rbind(nr.prof))
##### GGPLOT2
## qplot = quick plots
# qplot histogram
qplot(cars$dist)
# qplot scatter
qplot(cars$speed,cars$dist)
# qplot gradient
qplot(mtcars$wt, mtcars$mpg, colour=mtcars$cyl)
## ggplot layering
# geom_point()for scatter/dot plots
# geom_line()for line graphs
# geom_bar() for bar charts
# geom_boxplot() for boxplots
# ggplot histogram
ggplot(data=diamonds)+geom_histogram(aes(x=carat))
# ggplot density
ggplot(data=diamonds)+geom_density(aes(x=carat),fill="grey50")
## facet_grid to create grid of 4 plots
ggplot(CO2, aes(conc, uptake)) +
geom_point() +
facet_grid(Treatment ~ Type) +
stat_summary(fun.y = mean, colour = "red", geom = "line")
## ggthemes
# wall street journal theme
library(ggthemes)
g2<-ggplot(diamonds, aes(x=carat,y=price))+
geom_point(aes(color=color))
g2+theme_wsj()
## maps with ggplot
library(maps)
nz <- map_data("nz")
nzmap <- ggplot(nz, aes(x=long, y=lat, group=group)) +
geom_polygon(fill="white", colour="black")
###### INFERENCE AND DISTRIBUTIONS [week 4]
# distribution functions - eg. norm
# q - quantile function: value at a given probability (eg. qnorm(0.975) = 1.96)
# r - random number generation
# p - cumulative probability up to a value (eg. pnorm(1.96) = 0.975) = area under curve to that point (integral)
# d - density at a value
# dnorm(x, mean = 0, sd = 1, log = FALSE)
# pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
# qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
# rnorm(n, mean = 0, sd = 1)
# work with vectors too
qnorm(0.975)
# 1.96
pnorm(1.96)
# 0.975
rnorm(5,0,1)
# 5 values
dnorm(1.96)
# confidence intervals
phat=.4
n=120
ME=qnorm(.975)*sqrt(phat*(1-phat)/n)
CI=c(phat-ME,phat+ME)
# ttests
# remember to think of null hypothesis
# one sample t-test
# H0: mean of speed = 10
# H1: mean of speed <> 10
t.test(speed,mu=10)
# two-sample (independent samples) t-test
# H0: Male = female
# H1: male <> female
scores=rnorm(50,20,5)
sex=rep(c("Male", "Female"),c(25,25))
boxplot(scores~sex)
t.test(scores~sex)
# paired samples t-test
before.scores=rnorm(50,15,10)
after.scores=rnorm(50,25,10)
t.test(before.scores,after.scores,paired=TRUE)
########## ANOVA
# A good one-way ANOVA example can be found at: http://www.stat.wmich.edu/wang/664/egs/Rrust.html
# A very good summary of one and two-way ANOVAs with R can be a found at: http://www.statmethods.net/stats/anova.html
# one-way anova example
boys=rnorm(50,mean=128,sd=14)
girls=rnorm(50,mean=130,sd=14)
aliens=rnorm(50,mean=75,sd=14)
all.heights=c(boys,girls,aliens)
type=rep(c("Boy","Girl","Alien"),rep(50,3))
type=factor(type)
boxplot(all.heights~type)
myanova=aov(all.heights~type)
summary(myanova)
layout(matrix(c(1,2,3,4),2,2))
plot(myanova)
# this is the same as lm
mylmanova=lm(all.heights~type)
anova(mylmanova)
# use TUKEY to get ADJUSTED multiple comparisons
# adjusted = adjusted because we are doing many tests simultaneously
# http://en.wikipedia.org/wiki/Multiple_comparisons_problem
TukeyHSD(myanova)
# 2-way ANOVA example
data(ToothGrowth)
attach(ToothGrowth)
supp=factor(supp)
dose=factor(dose)
toothanova=aov(len~supp*dose)
summary(toothanova)
####### REGRESSION
# simple lm
data(cars)
plot(cars$speed,cars$dist)
regmodel=lm(cars$dist~cars$speed)
summary(regmodel)
lines(cars$speed,regmodel$fitted)
confint(regmodel) # gives confidence intervals for the intercept and slope
# transformations - use the I()
lm(y~x+I(x^2))
# GLM - can use various links:
glm(counts ~ outcome + treatment, family=poisson())
# binomial(link = "logit")
# gaussian(link = "identity")
# Gamma(link = "inverse")
# inverse.gaussian(link = "1/mu^2")
# poisson(link = "log")
# quasi(link = "identity", variance = "constant")
# quasibinomial(link = "logit")
# quasipoisson(link = "log")
###### FUNCTIONS [week 5]
# simple example
myfirstfunctionwithinput=function(input)
{
print(input)
}
myfirstfunctionwithinput("I am getting better at functions")
# [1] "I am getting better at functions"
myfirstfunctionwithinput=function(input1, input2)
{
print(input1)
print(input2)
}
myfirstfunctionwithinput(input1=1,input2=2)
# [1] 1
# [1] 2
# DEFAULT values
meanormedian=function(variable, stat="mean")
{
# a function that calculates either mean or median of a vector
if (stat=="mean")
{
return(mean(variable))
}
else
{
return(median(variable))
}
}
# IF statements
meanormedian=function(variable, stat)
{
# a function that calculates either mean or median of a vector
if (stat=="mean")
{
return(mean(variable))
}
else
{
return(median(variable))
}
}
# FOR loops
addsomenumbers1=function(n)
{
#returns the sum of the numbers from 1 to n
total=0
for (i in 1:n)
{
total=total + i
}
return(total)
}
addsomenumbers1(100)
# [1] 5050
# WHILE loops
addsomenumbers2=function(n)
{
#returns the sum of the numbers from 1 to n
i=1
total=0
while (i<=n)
{
total=total+i
i=i+1
}
return(total)
}
# RECURSION
fibo=function(n)
{
#function that returns the nth Fibonacci number
if (n<2)
{
return(n)
}
else
{
return(fibo(n-1)+fibo(n-2))
}
}
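# quick check of the recursive function above:
fibo(10)
# [1] 55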
############## SHINY ###################
# http://shiny.rstudio.com/gallery/
install.packages('shiny', lib="c:\\notbackedup\\")
library(shiny, lib="c:\\notbackedup\\")
runApp("C:\\NotBackedUp\\R\\shiny\\")
#
## DIAGRAMS
install.packages('diagram')
library(diagram)
names <- c(".Rnw", ".tex", ".pdf")
connect <- c(0, 0, 0,
"knitr", 0, 0,
0, "pdflatex", 0)
M <- matrix(nrow=3, ncol=3, byrow=TRUE, data=connect)
pp <- plotmat(M, pos=c(1, 2), name=names, box.col="orange")
###### MISC ##########
## Generating random datasets
# ------------------------------- #
# R TUTORIAL CODE #
# SESSION 1 #
# ------------------------------- #
## Meta
# use # for comments
# use CTRL-ENTER to run a single line or highlighted bit of code
# ------------------------------- #
# CHAPTER 1: working with basic data objects #
# ------------------------------- #
## [1.1] simple Variables
a = 1
a
a <- 1 # <- arrow is same as =
a
a == 1 # use double arrows for boolean equals (more later)
b = 2
a + b
a * b
c = a + b^2
c
d = 1.0
str(d) # structure of object: number, string, ...
help(str) # help on any function
??structure # search for help on a term --> Google is better
# unless you know what you're looking for
string # if you make an error, R will tell you in red
e = 'string'
str(e)
paste('hello','world') # join two strings
help(paste)
paste('hello','world', sep=", ")
## Meta
# R will interpret end of line as end of command
# *except* where a line is left incomplete,
# for example it ends in a + or an unclosed bracket
a =
2
(1 + 7 +
19 +
12)
# ....................................... #
# - EXERCISE - #
# calculate 57 * 43
# then calculate the square of the result
# ....................................... #
# ------------------------------- #
## [1.2] Lists
c('a','b','c') # Combine elements into a vector or list
c(1, 2, 3) # Combine elements into a vector or list
f = c(1,2,'word',4) # a vector with at least 1 string gets coerced to all strings
f
# ------------------------------- #
## [1.3] Vectors = a list of numbers OR a one dimensional matrix
# Also think of this like a column OR a row from a table - it's all the same to R
f = c(1,2,3,4)
f
0:4 # quick list of integers
10:0 # reverse
seq(1,50,5) # sequence of number from x to y in steps of size z
# VECTORISATION
# operations can be applied to vectors in one hit = 'vectorisation'
g = 1:10
4*g
g/2
h = g^2
plot(g,h)
# A vector can be a column from a table or a row from a table, R is indifferent.
# R has a huge library of functions which you typically apply to vectors.
# Starting with the basics:
sum(g)
mean(g)
median(g)
summary(g)
# through to nonparametric tests...
x = c(4,7,10,15)
y = c(1,2,3,5)
wilcox.test(y,x)
# And anything parametric you might want, including tests...
pbinom(q=2, size=10, p=0.5 ) # P(X <= 2): probability of at most 2 successes in 10 trials
# ...and density functions...
n_successes = 0:10
prob_successes = dbinom(n_successes, size=10, p=0.5) # distribution for 10 trials with p=0.5
plot(n_successes, prob_successes, type='l')
# ... and so on
## META
# Data in R lives in active memory (RAM), enabling 'vectorisation'
# Contrast this with SAS, which opens datasets and reads line-by-line,
# hence why PROC SORT exists
# - EXERCISE - #
# create a vector of numbers 5 through 10
# Multiply, elementwise, the numbers in the vector by 3.2
# Calculate the mean & median of the vector
# ....................................... #
# ------------------------------- #
# CHAPTER 2: Data frames #
# ------------------------------- #
# Flexible tables of data, like Excel or SAS tables
# Can still be vectorised and treated like matrices
## creating a dataframe by IMPORTING DATA from csv
# NOTE double slashes in paths
## Getting data out of text files, eg. CSV
# for different delimiters, use sep="XXX".
# eg. pipe-delimited: sep="|"
# eg. tab-delimited: sep="\t"
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\"
FileName = 'baseball_train.csv'
File = paste(FileLoc,FileName,sep="")
print(File)
TBL_BASEBALL <-read.delim(File,header=TRUE,sep=",")
# look at it in workspace
TBL_BASEBALL # view the table in the console
dim(TBL_BASEBALL) # dimensions
colnames(TBL_BASEBALL) # column names
summary(TBL_BASEBALL) # quick summary
str(TBL_BASEBALL) # summary of structure
# ------------------------------- #
# [2.1] Working with dataframes: basic slicing & dicing
# ------------------------------- #
# in R, we access 'cells' of data using [row, column] indexing
TBL_BASEBALL[1,1]
TBL_BASEBALL[2,3]
TBL_BASEBALL[1,] # access entire row --> a vector
TBL_BASEBALL[,1] # access entire column --> a vector
mean(TBL_BASEBALL[,1]) # works like the vectors we saw above
TBL_BASEBALL[,1:2] # select columns 1 & 2
TBL_BASEBALL[1:5,] # select rows 1 through 5.
# remember? 1:5 is the same as c(1,2,3,4,5)
# So you can pass R a list as an index:
TBL_BASEBALL[c(1,7,99),] # use a list as an index
# use the [rows,cols] syntax to do both at once:
TBL_BASEBALL[1:2,6:8] # rows 1 & 2 with columns 6,7,8
## selecting columns by name
colnames(TBL_BASEBALL)
TBL_BASEBALL[,"salary"] # name a single column
TBL_BASEBALL[,c("salary","n_runs","n_walks")] # use a LIST of columns
cols = c("salary","n_runs","n_walks")
TBL_BASEBALL[,cols]
# Lazy '$' syntax works if you only want a single column
TBL_BASEBALL$salary
# subset() syntax can be used if it less confusing for you:
subset(TBL_BASEBALL, select=c('salary','n_runs','n_hits'))
## combine with row slicing:
TBL_BASEBALL[27:32,c('n_runs','n_hits')]
# Once more, we can apply any R functions to sliced chunks of data
# Many functions can handle chunks just as well as vectors:
sum(TBL_BASEBALL[27:32,c('n_runs','n_hits')])
### Now that we can slice & dice, let's use it to get to know BASEBALL data
pairs(TBL_BASEBALL) # generic R matrix scatterplot
## Too many variables... Let's look at fewer
cols = c('salary','batting_avg','n_runs','n_hits','n_homeruns')
cols
pairs(TBL_BASEBALL[,cols])
# More useful scatterplot matrix:
install.packages('ggplot2') # ggplot2 plotting library [if your first install, you may be asked to choose a CRAN server]
install.packages('GGally') # Companion tools for ggplot2
library(ggplot2)
library(GGally)
ggpairs(data=TBL_BASEBALL[cols],
upper = list(continuous = "cor"),
lower = list(continuous = "smooth"),
diag = list(continuous="density"),
axisLabels='show')
# [2] Move to a higher level of abstraction: correlation heatmap
z <- cor(TBL_BASEBALL) # correlation matrix with one command :-)
z
install.packages('lattice')
library(lattice)
levelplot(z) # default is hideous
col.l <- colorRampPalette(c('blue','red','orange','yellow'))(30)
levelplot(z, col.regions=col.l, scales=list(x=list(rot=90, cex=0.8)))
# ....................................... #
# - EXERCISE - #
# select only columns 'n_runs' and 'n_errors' from baseball data
# do a summary() on baseball data that includes only columns 'n_runs' and 'n_errors'
# ....................................... #
## creating a dataframe by IMPORTING DATA from Excel
install.packages('RODBC')
library(RODBC)
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\"
FileName = 'branch profiles.xls'
File = paste(FileLoc,FileName,sep="")
print(File)
channel <- odbcConnectExcel(File)
TBL_BRANCHES <- sqlFetch(channel, "Sheet1",as.is=TRUE)
odbcClose(channel)
# look at it
TBL_BRANCHES
dim(TBL_BRANCHES)
colnames(TBL_BRANCHES)
summary(TBL_BRANCHES)
str(TBL_BRANCHES)
# ------------------------------- #
## [2.2] working with dataframes: boolean slicing & dicing
# ------------------------------- #
##### using logical conditions to select things
# EQUALS is ==
# OR is |
# NOT is !
# AND is &
# NOT EQUALS is !=
# logical conditions work by creating an index then applying that to dataset
# , eg.:
TBL_BRANCHES['n_custs']>5000 # returns list of TRUE vs FALSE for each row
# --> use that list as an index
TBL_BRANCHES[TBL_BRANCHES['n_custs']>30000,]
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5,]
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5,c('Branch name','Mean_monthly_net_income','mean_A_RAR')]
# can write your index lists into variables if it is clearer
bigbranches = TBL_BRANCHES['n_custs']>30000
TBL_BRANCHES[bigbranches,]
# boolean conditions create the same list of TRUE/FALSE for indexing:
TBL_BRANCHES['pct_migrant']>0.5 & TBL_BRANCHES['State']=='NSW'
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5 & TBL_BRANCHES['State']=='NSW',]
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5 | TBL_BRANCHES['pct_student']>0.5,] # OR
# negations - ! can go before an entire condition, or as part of a !=
TBL_BRANCHES[TBL_BRANCHES['State']!='NSW',] # all states NOT NSW
TBL_BRANCHES[!TBL_BRANCHES['pct_migrant']>0.5, ] # every record BUT where pct_migrant > 50%
# You can use %in% like SQL 'in' to match to a list of conditions
TBL_BRANCHES[,'State'] %in% c('SA','WA')
TBL_BRANCHES[TBL_BRANCHES[,'State'] %in% c('SA','WA'),]
## Alternative syntax using subset() may be easier for some
subset(TBL_BRANCHES, State == 'VIC')
subset(TBL_BRANCHES, State == 'VIC' & pct_migrant>0.5)
subset(TBL_BRANCHES, State == 'VIC' | pct_migrant>0.5)
subset(TBL_BRANCHES, Location %in% c('Regional','Remote'))
### now that we can do boolean slicing & dicing, let's use it
### to get to know BRANCHES data better
# Map it
install.packages('ggmap') # package for query google maps & plot with ggplot2
library(ggmap)
qmap('Australia', zoom=4, maptype='satellite') # query google maps
qmap('Australia', zoom=4, maptype='terrain')
# download map into 'mapAus' variable
mapAus <- qmap('Australia', zoom=4, maptype='terrain', color="bw")
# Overlay points onto mapAus using TBL_BRANCHES data
mapAus + geom_point(aes(x=Long, y=Lat), color='red', data=TBL_BRANCHES, alpha=0.5)
# Overlay points where size is proportional to N_CUSTS
mapAus + geom_point(aes(x=Long, y=Lat, size=n_custs), color='red', data=TBL_BRANCHES, alpha=0.5)
## let's use boolean slicing: do a map of high % Affluent 50+ around Melbourne
affluentbranches = TBL_BRANCHES['pct_affluent_50+']>0.1
# Get map centered on Melbourne
MyMap <- qmap(location = 'Melbourne, Australia', zoom=9, maptype='terrain',color="bw")
# plot migrant branches, scale colour to mean RAR
MyMap +
geom_point(data=TBL_BRANCHES[affluentbranches,], aes(x=Long, y=Lat, colour=as.numeric(mean_A_RAR)), alpha=0.7, size=5) +
scale_colour_gradient(low="blue", high="red",name='Mean RAR')
# ....................................... #
### EXERCISE
# Select all branches with >20% affluent 50+
# get average customer RAR for all branches in VIC
# ....................................... #
# ------------------------------- #
# CHAPTER 3: R LIBRARIES
# ------------------------------- #
## R Libraries make it easy to do stuff
# C-RAN: http://cran.r-project.org/
# packages must be downloaded once (this will also update them):
# then called in a given script you are using them for:
# you can see packages & versions in package tab of rstudio
## wordcloud
install.packages("wordcloud") # download package
library(wordcloud) # call package into memory
text = "There are five main ways water vapor can be added to the air.
Increased vapor content can result from wind convergence over water or
moist ground into areas of upward motion. Precipitation or virga falling
from above also enhances moisture content. Daytime heating causes water
to evaporate from the surface of oceans, water bodies or wet land.
Transpiration from plants is another typical source of water vapor.
Lastly, cool or dry air moving over warmer water will become more humid.
As with daytime heating, the addition of moisture to the air increases
its heat content and instability and helps set into motion those processes
that lead to the formation of cloud or fog"
wordcloud(text,min.freq=1,random.order=FALSE)
# make it look prettier (?) with colours
library(RColorBrewer)
pal <- brewer.pal(8,"Dark2")
wordcloud(text, scale=c(0.3,2), min.freq=1,
colors=pal, rot.per=0, fixed.asp=FALSE,
vfont=c("gothic english","plain"))
## randomForest
install.packages("randomForest")
library(randomForest)
rf <- randomForest(TBL_BASEBALL[,c('n_homeruns','batting_avg','n_errors')],
TBL_BASEBALL$salary, # target variable
ntree=500, # 500 trees
mtry=2) # try 2 variables (at random) for each split
rf
importance(rf)
TBL_BASEBALL$yhat = predict(rf, TBL_BASEBALL) # predict function
plot(TBL_BASEBALL$salary,TBL_BASEBALL$yhat)
## sqldf - use SQL to operate on dataframes
install.packages('sqldf')
library('sqldf')
sqldf(' select State
, count(*)
from TBL_BRANCHES
group by State')
## Other neat libraries
# Interactive charts with Shiny: http://www.rstudio.com/shiny/
# twittR
# RODBC - for ODBC connections
# XML - for webpage scraping
# ....................................... #
# - EXERCISE - #
# download and install wordcloud library using install.packages()
# then make it active with library()
# make a wordcloud from a piece of text
# if necessary, help(wordcloud)
# ....................................... #
# ------------------------------- #
# CHAPTER 4: PLOTTING
# ------------------------------- #
# generic R plotting is drab but quick
plot(TBL_BASEBALL$batting_avg, TBL_BASEBALL$salary)
# ggplot2 library is awesome: http://docs.ggplot2.org/current/
install.packages('ggplot2')
library(ggplot2)
# build plots logically by adding layers
p <- ggplot(TBL_BASEBALL, aes(x=batting_avg, y=salary)) # create a blank canvas in 'p'
p <- p + geom_point() # add simple points
p
p <- p + geom_point(aes(colour = salary), size=5, alpha=0.5 )
p # aes statement defines how data links to plot elements
# you can go on and on adding layers and tweaking settings...
p <- p + stat_density2d(colour='red')
p
p <- p + scale_colour_gradient(low = "green", high="yellow")
p
p <- p + theme_bw()
p
# ... can layer up endlessly. lines, points, contours; sized, coloured, shaped by variables..
# various add-on libaries for all sorts of charts, eg GGally has scatterplot matrix
###### histogram / density example
p <- ggplot(TBL_BRANCHES, aes(mean_A_RAR))
p + geom_histogram()
# split by location
p <- ggplot(TBL_BRANCHES, aes(mean_A_RAR, fill = Location))
p + geom_histogram() # not so good
p <- p + geom_density(alpha=0.2)
p
p <- p + theme_bw()
p
# use facet_grid to break plots up by a categorical variable
# syntax: rows ~ columns
p + facet_grid(Location ~ .)
p + facet_grid(Location ~ State)
### Maps + ggplot2 = lots of possibilities
library(ggmap)
MyMap <- qmap(location = 'Niddrie, Australia', zoom=10, color='bw')
MyMap +
geom_point(aes(x=Long, y=Lat, size=as.numeric(n_custs),
colour=mean_A_RAR), alpha=0.8, data=TBL_BRANCHES) +
scale_size_continuous(range = c(2,10), name='# Custs') +
scale_colour_gradient(low="blue", high="red",name='Mean $ RAR')
# using a different map source:
MyMap <- qmap(location = 'Niddrie, Australia', zoom=10, source = 'osm')
MyMap +
geom_point(aes(x=Long, y=Lat, size=as.numeric(n_custs),
colour=mean_A_RAR), alpha=0.8, data=TBL_BRANCHES) +
scale_size_continuous(range = c(2,10), name='# Custs') +
scale_colour_gradient(low="blue", high="red",name='Mean $ RAR')
## Map of branches coloured by cluster
TBL_BRANCHES_VIC = subset(TBL_BRANCHES, State == 'VIC')
MyMap <- qmap(location = 'Melbourne, Australia', zoom=10, source = 'osm', color='bw')
MyMap +
geom_point(aes(x=Long, y=Lat, colour=as.factor(Cluster)), size=5,
show_guide = FALSE, data=TBL_BRANCHES_VIC)
# add border to points, make colour gradient have more contrast:
cols = rainbow(100) # take 100 colours from rainbow
cols = sample(cols, 100) # use sample to get them in a random order
MyMap +
geom_point(aes(x=Long, y=Lat, colour=as.factor(Cluster)), size=5,
show_guide = FALSE, data=TBL_BRANCHES_VIC) +
geom_point(aes(x=Long, y=Lat), size=5, pch=21 , data=TBL_BRANCHES_VIC) +
scale_colour_manual(values=cols)
# See list of inbuilt spectrums here: http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/
# Kate also created this ANZ colourset:
ANZcolours <- c(
"#B2B2B2",
"#DDDDDD",
"#CCE5F5",
"#66B2E0",
"#007FCC",
"#002B3C",
"#D7E7DE",
"#E6E1B6",
"#F2F285",
"#B2A58D",
"#87E10C",
"#F2AF00",
"#F27300",
"#BF3500")
pie(rep(1,14), col=ANZcolours)
# ... ggplot is awesome
# virtually any element of the plot can be customised
# See galleries of examples at http://docs.ggplot2.org/current/
#### Final awesome example: Houston crime data density
str(crime) # the 'crime' dataset is contained in the ggmap library
# limit to violent crimes
violent_crimes <- subset(crime, offense != "auto theft" &
offense != "theft" & offense != "burglary")
HoustonMap <- qmap("houston", zoom = 14, color = "bw")
theme_set(theme_bw(16)) # changes default plot theme for ggplot2
HoustonMap +
stat_density2d(aes(x = lon, y = lat, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = violent_crimes, show_guide = FALSE) +
scale_fill_gradient(low = "black", high= "red", guide=FALSE) +
facet_wrap(~ day)
theme_set(theme_grey()) # revert to default ggplot2 theme
# ....................................... #
# - EXERCISE - #
# [1] Scatterplot of branch customer FUM vs RAR
# hint: 1st try plot(x_values, y_values)
# next, try ggplot:
# p <- ggplot(data=TABLENAME, aes(x=X_VALUES, y=Y_VALUES))
# p <- p + geom_point()
# p
# [2] Change the scatterplot so branches are coloured by Location
# --> insert colour=VARIABLE into the ggplot aes statement
# [3] Make it look prettier
# Change the size by adding size=4 into the geom_point() statement
# Change the transparency by adding alpha=0.4 into the geom_point() statement
# Change the colour to be a function of the MOSAIC_prosperity
# .. and so on. look at examples at http://docs.ggplot2.org/current/ and tinker
# ---------------------------------------------------------------- #
# ---------------------------------------------------------------- #
# ------------------------------- #
# R TUTORIAL CODE #
# SESSION 2 #
# ------------------------------- #
# refresher on what we did last week
# vectors, variables, functions
f = c(1,2,3,4)
sum(f)
# importing data from csv
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\"
FileName = 'baseball_train.csv'
File = paste(FileLoc,FileName,sep="")
print(File)
TBL_BASEBALL <-read.delim(File,header=TRUE,sep=",")
# importing data from Excel
install.packages('RODBC')
library(RODBC)
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\"
FileName = 'branch profiles.xls'
File = paste(FileLoc,FileName,sep="")
print(File)
channel <- odbcConnectExcel(File)
TBL_BRANCHES <- sqlFetch(channel, "Sheet1",as.is=TRUE)
odbcClose(channel)
# dataframes, slicing & dicing
colnames(TBL_BASEBALL)
summary(TBL_BASEBALL)
TBL_BASEBALL[1:5,] # select rows 1 through 5.
TBL_BASEBALL[,c("salary","n_runs","n_walks")] # use a LIST of columns
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5,] # boolean indexing
## libraries
install.packages("wordcloud") # download package
library(wordcloud) # call package into memory
text = "There are five main ways water vapor can be added to the air.
Increased vapor content can result from wind convergence over water or
moist ground into areas of upward motion. Precipitation or virga falling
from above also enhances moisture content. Daytime heating causes water
to evaporate from the surface of oceans, water bodies or wet land.
Transpiration from plants is another typical source of water vapor.
Lastly, cool or dry air moving over warmer water will become more humid.
As with daytime heating, the addition of moisture to the air increases
its heat content and instability and helps set into motion those processes
that lead to the formation of cloud or fog"
wordcloud(text,min.freq=1,random.order=FALSE)
## plotting & mapping with ggplot2
install.packages('ggplot2')
library(ggplot2)
p <- ggplot(TBL_BRANCHES, aes(mean_A_RAR, fill = Location))
p <- p + geom_density(alpha=0.2)
p <- p + theme_bw()
p + facet_grid(Location ~ State)
# on to new stuff...
# ------------------------------- #
# [2.3] aggregating data - like PROC SUMMARY
aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State), FUN=mean)
# multiple variables
aggcols = c('mean_FUM','mean_n_prods','mean_A_RAR')
aggregate(TBL_BRANCHES[aggcols], by=list(TBL_BRANCHES$State), FUN=mean)
# multiple functions
aggregate(TBL_BRANCHES[aggcols], by=list(TBL_BRANCHES$State),
FUN=function(x) c(mn = mean(x), md = median(x) ))
# custom aggregate functions - anything that would work on a vector
aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State),
FUN=function(x) interest=sum(x*0.18/12) )
# we can simplify syntax by loading formula into a variable
# ** virtually anything can be loaded into a variable in R **
f <- function(x) interest=sum(x*0.18/12)
aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State), FUN=f )
# You can write your aggregation into a dataframe for further manipulation, plots, etc
aggdf = aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State), FUN=f )
colnames(aggdf)
ggplot(data=aggdf, aes(x=Group.1, y=mean_FUM, fill=Group.1)) + geom_bar(stat="identity")
ggplot(data=aggdf, aes(x=Group.1, y=mean_FUM, fill=mean_FUM)) + geom_bar(stat="identity")
# ....................................... #
# - EXERCISE - #
# Aggregate branch data by Location:
# calculate mean branch customer FUM AND mean RAR
# ....................................... #
# ------------------------------- #
## [2.4] TRANSFORMING DATA
## creating a new field
TBL_BASEBALL['new'] = 1
TBL_BASEBALL['new']
TBL_BASEBALL['new'] = c(1,2) # error: vectors are different sizes
TBL_BASEBALL['new'] = log(TBL_BASEBALL['salary'])
TBL_BASEBALL['new']
TBL_BASEBALL['power'] = TBL_BASEBALL['n_homeruns'] + TBL_BASEBALL['n_stolenbases']
TBL_BASEBALL['power']
## linear transformations of scale
# say we have a skewed distribution...
hist(TBL_BASEBALL$salary)
hist(TBL_BASEBALL$salary, breaks=30)
summary(TBL_BASEBALL['salary'])
# yes, very skewed.
# we can transform it...
TBL_BASEBALL$log_salary = log(TBL_BASEBALL$salary)
hist(TBL_BASEBALL$log_salary, breaks=30)
# ...or we could band it:
## banding & categorising
# equal spaced cut points based on value field
cut(TBL_BASEBALL$n_runs, 5) # 5 groups
# [] brackets are inclusive
# Labels can be set- see help(cut)
# custom cut points
cuts = c(0,10,50,90)
cut(TBL_BASEBALL$n_runs, cuts)
# doing cuts based on # of records, eg deciles, quartiles
quantile(TBL_BASEBALL$batting_avg,0.5) # quantile returns the point at X%
quantile(TBL_BASEBALL$batting_avg,c(0.25,0.5,0.75)) # quantile returns the point at X%
cutpoints = quantile(TBL_BASEBALL$batting_avg,c(0,0.25,0.5,0.75,1.0)) # quantile returns the point at X%
cut(TBL_BASEBALL$batting_avg, cutpoints)
# deciles
(0:10) # remember this?
(0:10)/10 # 0 to 1 by 0.1, eg deciles
decpoints = quantile(TBL_BASEBALL$batting_avg,(0:10)/10)
decpoints
TBL_BASEBALL['batting_decile'] = cut(TBL_BASEBALL$batting_avg, decpoints, include.lowest=TRUE)
TBL_BASEBALL[,c('batting_decile','batting_avg')]
# now we can aggregate
aggcols = c('n_hits','n_runs','n_strikeouts','salary')
aggregate(TBL_BASEBALL[aggcols], by=list(TBL_BASEBALL$batting_decile), FUN=mean)
# ....................................... #
# - EXERCISE - #
## with TBL_BRANCHES
# create a new column which is sum of % migrants and % students
# aggregate the table to see average FUM in each FUM decile
# ....................................... #
## [2.5] Custom functions to transform data
# there's lots of ways to recode data...
TBL_BRANCHES$c_age <- ifelse(TBL_BRANCHES$mean_age > 45, c("old"), c("young"))
TBL_BRANCHES[,c('c_age','mean_age')]
# ...which I'll leave you to figure out
# let's look at one advanced method: custom functions
# define an R function and apply it:
fn <- function(x) x/1000
fn
fn(2000)
# use 'sapply' to apply across a vector
sapply(TBL_BRANCHES[,'mean_FUM'],fn)
TBL_BRANCHES[,'mean_FUM_k'] = sapply(TBL_BRANCHES[,'mean_FUM'],fn)
# functions can be complex, can contain if/then logic,
# can be used for categorising:
fn <- function(x) {
if(x<20) {'young'}
else if(x<45) {'mid age'}
else {'old'}
} # curly brackets allow the function to run over many lines
fn(40)
sapply(TBL_BRANCHES[,'mean_age'], fn)
# ....................................... #
# ------------------------------- #
# CHAPTER 6: EASY DATA MINING
# ------------------------------- #
## [6.1] regression: linear regression
plot(TBL_BASEBALL$n_homeruns, TBL_BASEBALL$salary)
lm.fit = lm(salary~n_homeruns, data=TBL_BASEBALL)
lm.fit
abline(lm.fit) # overlays regression line onto chart
summary(lm.fit)
plot(lm.fit) # various residuals and goodness-of-fit plots
# we can easily predict back onto another dataset using predict()
TBL_BASEBALL['salary_predict'] = predict(lm.fit, TBL_BASEBALL)
TBL_BASEBALL[,c('salary','salary_predict')]
TBL_BASEBALL[231,]
# if you want to use ALL variables, here's a shorthand:
lm(salary~., data=TBL_BASEBALL)
# lm can be extended into complex linear models, polynomials, etc:
lm.fit = lm(salary~n_homeruns + n_errors * n_strikeouts + I(n_hits^2),
data=TBL_BASEBALL)
summary(lm.fit)
# use glm() for generalised linear models, eg logistic
## [6.2] unsupervised learning: k-means clustering
# what if we automatically cluster branches together using
# MOSAIC factors? Do we get anything useful or meaningful?
ggplot(TBL_BRANCHES, aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity)) +
geom_point()
# this way makes it clearer where dense areas are:
ggplot(TBL_BRANCHES, aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity)) +
geom_point(alpha=0.4, size=5, colour='blue')
# use kMeans clustering to find 2 clusters
km.out = kmeans(TBL_BRANCHES[,c('MOSAIC_prosperity','MOSAIC_cultural_diversity')], 2)
km.out
TBL_BRANCHES$cluster = km.out$cluster # write clusters back to dataset
# plot
ggplot(TBL_BRANCHES,
aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity,
colour=as.factor(cluster))) +
geom_point(alpha=0.4, size=5)
# Clustering has done a pretty good job of automatically detecting
# metro vs. regional branches:
ggplot(TBL_BRANCHES,
aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity, colour=Location)) +
geom_point(alpha=0.4, size=5)
## [6.3] classification: kNN classification
# k-nearest neighbour
colnames(TBL_BRANCHES)
# make three income bands: low, medium, high:
cutpoints = quantile(TBL_BRANCHES$Mean_monthly_net_income, c(0,0.33,0.66,1))
TBL_BRANCHES$band_income = cut(TBL_BRANCHES$Mean_monthly_net_income, cutpoints,
labels=c(1,2,3), include.lowest=TRUE)
TBL_BRANCHES$band_income
# first 2 MOSAIC factors are decent discriminators:
ggplot(TBL_BRANCHES,
aes(x=MOSAIC_prosperity, y=MOSAIC_dependants, colour=band_income)) +
geom_point(size=4, alpha=0.5)
# use kNN to predict back onto our dataset:
install.packages('class')
library(class)
X = TBL_BRANCHES[,c('MOSAIC_prosperity','MOSAIC_dependants')]
y = TBL_BRANCHES[,'band_income']
TBL_BRANCHES$income_predicted = knn(train = X, # Dataset to learn from
test = X, # dataset to predict
cl = y, # target variable to learn
k=10) # number of neighbours to use
ggplot(TBL_BRANCHES,
aes(x=MOSAIC_prosperity, y=MOSAIC_dependants, colour=income_predicted)) +
geom_point(size=4, alpha=0.5)
# we can draw a decision boundary plot
# first we create a 100x100 grid that spans the range of two variables
x1lim = range(TBL_BRANCHES$MOSAIC_prosperity) # gets min and max of variable
x2lim = range(TBL_BRANCHES$MOSAIC_dependants)
x1range = seq(x1lim[1],x1lim[2],len=100) # creates a sequence of values
x2range = seq(x2lim[1],x2lim[2],len=100)
grid = data.frame(X1 = rep(x1range, 100), X2 = rep(x2range, each=100))
# use KNN to predict income band of each point in grid
grid$predict = knn(train = X, # Dataset to learn from
test = grid, # Dataset to predict
cl = y, # Target variable to learn
k=10) # number of neighbours to use
# plot decision grid, then overlay datapoints
g <- ggplot(data=grid, aes(X1, X2)) +
geom_point(aes(colour=predict), alpha=0.3) +
theme_bw()
g
g + geom_point(data=TBL_BRANCHES, aes(x=MOSAIC_prosperity,
y=MOSAIC_dependants,
colour=band_income), size=4)
# ....................................... #
# ------------------------------- #
# CHAPTER 5: CREATING HTML NOTEBOOKS OF YOUR ANALYSIS
# ------------------------------- #
# a small notebook with code & output you can email to anyone...
# Steps:
# run this
install.packages('knitr') # requires knitr package is installed
### COPY THIS INTO A NEW R SCRIPT IN RSTUDIO AND HIT THE
### 'COMPILE NOTEBOOK' BUTTON AT THE TOP RIGHT OF THE SCRIPT PANEL
#########################################################################################
#### Example of compiling HTML notebook
# For HTML notebook creation to work,
# code must run standalone from end to end
library(knitr)
library(RODBC)
library(ggmap)
library(ggplot2)
library(GGally)
# import data
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\"
FileName = 'branch profiles.xls'
File = paste(FileLoc,FileName,sep="")
print(File)
channel <- odbcConnectExcel(File)
TBL_BRANCHES <- sqlFetch(channel, "Sheet1",as.is=TRUE)
odbcClose(channel)
# summarise useful data by state
aggcols = c('mean_FUM','mean_n_prods','mean_A_RAR')
aggregate(TBL_BRANCHES[aggcols], by=list(TBL_BRANCHES$State), FUN=mean)
# plot average customer RAR histogram by branch location
ggplot(TBL_BRANCHES, aes(mean_A_RAR, fill = Location)) + geom_density(alpha=0.2) + theme_bw()
# plot matrix scatterplot of ...
ggpairs(data=TBL_BRANCHES[aggcols],
upper = list(continuous = "cor"),
lower = list(continuous = "smooth"),
diag = list(continuous="density"),
axisLabels='show')
#########################################################################################
# ....................................... #
# - EXERCISE - #
## DO [1] or [2]:
# as you go, remember to visualise your data
# [1] Data mining competition: predict baseball salary
# you might try:
# a linear model:
lm.fit = lm(salary~., data=TBL_BASEBALL)
predict(lm.fit, TBL_BASEBALL)
# a randomforest:
install.packages("randomForest")
library(randomForest)
predictors = c('n_homeruns','batting_avg','n_errors')
rf = randomForest(TBL_BASEBALL[predictors],
                  TBL_BASEBALL$salary,
                  ntree=500,
                  mtry=3)
predict(rf,TBL_BASEBALL)
# clustering to create derived variables
km.out = kmeans(TBL_BASEBALL[predictors], 2)
TBL_BASEBALL$cluster = km.out$cluster
# [2] predict branch customer profitability: mean_A_RAR
# you might try:
# a linear model:
colnames(TBL_BRANCHES)
lm.fit=lm(mean_A_RAR~mean_FUM + Mean_monthly_net_income +
mean_n_prods + pct_affluent_50, data=TBL_BRANCHES)
predict(lm.fit, TBL_BRANCHES)
# a randomforest:
install.packages("randomForest")
library(randomForest)
predictors = c('mean_FUM','Mean_monthly_net_income','mean_n_prods')
rf = randomForest(TBL_BRANCHES[predictors],
                  TBL_BRANCHES$mean_A_RAR,
                  ntree=500,
                  mtry=3)
predict(rf,TBL_BRANCHES)
# clustering to create derived variables
km.out = kmeans(TBL_BRANCHES[predictors], 2)
TBL_BRANCHES$cluster = km.out$cluster
# ....................................... #
# ------------------------------- #
# APPENDIX: useful things we skipped
# ------------------------------- #
## merging dataframes
# see rbind, cbind, merge
# here are a few examples: https://rhandbook.wordpress.com/tag/slicing-and-dicing-of-data-frame/
## Exporting data
# write.table(mydata,"c:/mydata2.txt", sep="\t")
## Generate random sequence of integers
# sample(20)
## Generate distributions
# ...
############# OPTIMAL BINNING BUCKETING ###############
# with decision tree
install.packages("party", lib="c:\\notbackedup")
library("party", lib='\\notbackedup')
?ctree
str(iris)
iris_ctree <- ctree(Species ~ Sepal.Length,
controls = ctree_control(minbucket=30, maxdepth=2), data=iris)
print(iris_ctree)
# tag node:
iris$predicted_value = predict(iris_ctree, newdata=iris, type = "node")
# template from a campaign dataset (F_RESPONSE, treenode and campaigndf are not defined in this script):
# prop.table(xtabs( ~ F_RESPONSE + treenode, campaigndf), 2)
# discussion of bucketing
# http://www.r-bloggers.com/package-party-conditional-inference-trees/
??ctree
## System & file management
objects() # workspace
getwd() # working directory
setwd("/mnt/resource/training3")
dataPath = "/mnt/resource/BigData/data"
bigDataPath = "/mnt/resource/BigData/BigData"
outdir = "output"
# Check & create directories
if(!file.exists(outdir)) dir.create(outdir)
# list files in directory
dir(dataPath)
dir(bigDataPath)
# example
##
setwd("/mnt/resource/training3")
dataPath = "/mnt/resource/BigData/data"
outdir = "output"
outPath = "output"
if(!file.exists(outdir)) dir.create(outdir)
bigDataPath = "/mnt/resource/BigData/BigData"
big.data.path = "/mnt/resource/BigData/BigData"
data.path = "/mnt/resource/BigData/data"
output.path = 'output'
sample.data.dir = rxGetOption("sampleDataDir")
## Read data
# from internet:
## Selecting & slicing
# which() = identify records meet a list of criteria
# which(mtcars$hp == max(mtcars$hp))
## also SQLDF
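# A quick sqldf illustration (assumes the sqldf package is installed);
# it lets you slice a data.frame with ordinary SQL:
library(sqldf)
sqldf("SELECT cyl, AVG(mpg) AS avg_mpg FROM mtcars GROUP BY cyl")
sqldf("SELECT * FROM mtcars WHERE hp > 200")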
# repeat something
replicate()
## sort and order
# sort() = sort a vector
# order() = return element order of a sorted vector
# for a dataframe rows use order: mtcars[order(mtcars$mpg, decreasing=TRUE), ]
## Merging
# use merge()
# inner join (only keep matches): merge(x,y,all=FALSE)
# outer join (keep all records): merge(x,y,all=TRUE)
# left or right join: merge(x,y,all.x=TRUE)
# BINDING = 1 to 1 rows = cbind()
# UNION ALL = stack, use rbind
## also SQLDF
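# A small worked example of the joins above (toy data; names are illustrative):
df_sales <- data.frame(branch = c("A", "B", "C"), sales = c(100, 200, 300))
df_state <- data.frame(branch = c("A", "B", "D"), state = c("VIC", "NSW", "QLD"))
merge(df_sales, df_state)                  # inner join on the common column 'branch'
merge(df_sales, df_state, all = TRUE)      # outer join: keeps A, B, C and D
merge(df_sales, df_state, all.x = TRUE)    # left join: keeps every row of df_sales
rbind(df_sales, data.frame(branch = "E", sales = 50))  # stack rows (UNION ALL)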
## regex and strings
# richest.nations$Amount <- gsub(",", "", richest.nations$Amount)
# richest.nations$Amount <- sapply(richest.nations$Amount, function(x) substr(x, 2, nchar(x)))
## connecting to databases, SQL
# note: SQLite = free, open-source SQL DB
library("RSQLite")
db = dbConnect(SQLite(), dbname = file.path(bigDataPath, "airlines.sqlite"))
dbListTables(db) # list all tables
dbListFields(db, "planes")
# run a query
query <- "SELECT count(*) FROM carriers"
dbGetQuery(db, query)
# more queries
query <- "SELECT * FROM airlines where Dest=\"DAL\""
query <- "SELECT * FROM carriers where Description LIKE \"American%\""
query <- 'SELECT * FROM carriers where Description LIKE "%American%"'
## Sequential reads: taking data one chunk at a time = fetch()
query <- "SELECT * FROM carriers"
res <- dbSendQuery(db, query)
x <- fetch(res, n = 1000) # then use rbind() to stack
str(x)
dbClearResult(res) # close connection
# Misc
# put brackets around an assignment to make it print
# eg (test = 2)
## Apply
# apply(df, MARGIN, FUN, ...)
# where Margin is a vector. 1 is rows, 2 is columns
# FUN = function
# eg. apply(mtcars, 2, mean) # mean for each column
## Pass parameters to the FUN in the apply syntax
# apply(mtcars, 2, quantile, c(0.25,.75))
# 25th and 75th percentiles
## Other applies
# lapply() = to each element of a list
## by()
# split dataframe into defined groups, then apply function
# eg. by(mtcars, mtcars$cyl, colMeans) # colMeans for each factor of cyl
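# Quick runnable illustrations of the apply family (all on the built-in mtcars):
apply(mtcars, 2, mean)                       # column means
apply(mtcars, 1, max)                        # row maxima
lapply(list(a = 1:5, b = 6:10), sum)         # sum each element of a list (returns a list)
sapply(list(a = 1:5, b = 6:10), sum)         # same, simplified to a vector
by(mtcars[, c("mpg", "hp")], mtcars$cyl, colMeans)  # group-wise column means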
## tables & xtabs
xtabs(~gear + cyl, mtcars)
aggregate()
# Note: write your OWN function to return multiple aggregates if you want multiple
# repeat
replicate()
#### RevoR & Teradata
# xdf seems to be distributed form of dataframe
### data munging
## library(dplyrXdf)
## filter, select, distinct, transmute, mutate, arrange, rename,
## Summarise
# bankSmry <- group_by(bankXdf, marital, housing) %>%
# summarise(mean_age=mean(age), mean_balance=mean(balance), .method=3)
## doXdf
# bankModels <- group_by(bankXdf, housing) %>%
# doXdf(model=rxLogit(y ~ job + marital + education, data=.))
outdir = "~"
cars.mod = cars
save(cars.mod, file=file.path(outdir, "cars_mod.RData"))
## paste() = concatenate strings; can include a sep argument
paste("row",1:5,sep="_")
## boolean
# is.na() for missing values
# x[is.na(x)] = 412
## Vectors
letters[-c(1:5)] # drop the first 5 letters
# giving elements names - can access with names
x = 1:5
names(x) = letters[1:5]
x["c"]
## Matrices
# Assign ROW and COLUMN names, then you can query on them
# mat[c('a','b'),c('f','g')]
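# quick runnable version of the idea above:
mat <- matrix(1:6, nrow = 2, dimnames = list(c("a", "b"), c("f", "g", "h")))
mat[c("a", "b"), c("f", "g")]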
## Dataframes
# str()
# summary()
# data.frame() = create
## Vectorising
# ifelse(condition, ifvalue, elsevalue)
# eg. mtcars$type = ifelse(mtcars$hp>mean(mtcars$hp),'highhp','lowhp')
# Scatterplot matrix: use pairs()
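# e.g. a scatterplot matrix of a few mtcars columns:
pairs(mtcars[, c("mpg", "hp", "wt")])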
## R Functions
test <- function(a, b) return(a + b)
test
function.name <- function(arguments) {
# everything the function does in here
}
## Modelling
# Where there is a big spike at 0 followed by a smaller skewed distribution
# --> use a Tweedie distribution with power 1.5 = 'compound Poisson-gamma' = a zero spike (Poisson part) plus a gamma
### ggvis
library(ggvis)
mtcars %>% ggvis(~wt) %>% layer_histograms()  # histogram of car weight
#### RevoR
# xdf = "external data frame"
## import
args(rxImport)
colClasses <- c("integer", rep("factor", 4), "integer", rep("factor",3), "integer", "factor", rep("integer", 4), rep("factor", 2))
names(colClasses) <- c("age", "job", "marital", "education", "default","balance", "housing", "loan", "contact", "day", "month", "duration","campaign", "pdays", "previous", "poutcome", "y")
BankXDF <- rxImport(inData = infile, outFile = outfile, colClasses = colClasses,rowsPerRead = 10000, overwrite = TRUE)
# from text: RxTextData()
## Info & statistics
rxGetInfo(inADS, getVarInfo = TRUE)
rxGetInfo(bankXDF, getVarInfo = TRUE, numRows = 6)
rxGetVarInfo() # names, max, min
rxGetVarNames() # variable names
rxSummary()
rxSummary(~default + balance, data = BankDS)
BankSummary2 <- rxSummary(balance ~ F(age), data = BankDS)
# quantiles
args(rxQuantile)
# crosstabs
args(rxCrossTabs)
rxCrossTabs(balance ~ F(age, low = 20, high = 30):F(campaign, low = 1, high = 5), data = BankDS)
#### manipulation & transformation
## use rxDataStep to do transforms, drop, keep, etc
args(rxDataStep)
BankSubDS <- rxDataStep(inData = origBankSubDS, outFile = outfile,
transforms = list(logBalance = log(newBalance)), overwrite = TRUE)
rxDataStep(inData = BankDS, outFile = BankDS, transforms = list(newBalance = balance + 8019 + 1), overwrite = TRUE)
## using custom functions
manualFactorRecoding <- function(dataList) {
dataList[["F_n.devices"]] <- factor(dataList[["n.devices"]], levels = seq(1,
10))
return(dataList)
}
ChurnDS <- rxDataStep(inData = origChurnDS, outFile = churnoutfile,
transformFunc = manualFactorRecoding, overwrite = TRUE)
## creating factors - use factorise() or F() inside functions
bankXdf2 <- factorise(bankXdf, age)
rxFactors(inData = ChurnDS, outFile = ChurnDS, varsToKeep = rxGetVarNames(origChurnDS),
factorInfo = list(F_n.devices2 = list(levels = seq(1:10), varName = "n.devices")),
overwrite = TRUE)
## subset cols
BankDS <- rxDataStep(inData = origBankDS, outFile = outfile, varsToKeep = c("balance",
"age", "y", "poutcome"), overwrite = TRUE)
## subset rows
BankDS.rs <- rxDataStep(inData = origBankDS, outFile = outfile.rs,
varsToKeep = c("balance", "age", "y", "poutcome"), rowSelection = age < 21, overwrite = TRUE)
## Sort: rxSort
## Dedupe
BankDS2 <- rxSort(inData = BankDS, outFile = outfile2, sortByVars = c("balance",
"age"), decreasing = c(FALSE, TRUE), removeDupKeys = TRUE, dupFreqVar = "Dup_Count",
overwrite = TRUE)
#### manipulation with dplyrXdf
### can use:
## filter, select, distinct, transmute, mutate, arrange, rename,
## group_by, summarise, do
## left_join, right_join, full_join, inner_join
## these functions supported by rx: sum, n, mean, sd, var, min, max
bankSmry <- group_by(bankXdf, marital, housing) %>%
summarise(mean_age=mean(age), mean_balance=mean(balance))
## Plots
rxHistogram(~logBalance, data = BankSubDS, xNumTicks = 15)
# scatterplot
rxLinePlot(newBalance ~ age, type = "p", data = BankDS)
## Linear models
inADS <- RxXdfData(file.path(sample.data.dir, "AirlineDemoSmall.xdf"))
rxGetInfo(inADS, getVarInfo = TRUE)
# create model
model <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = inADS)
model
model
coef(model)
names(model) # components of model object
# predict
'?'(rxPredict)
airline.xdf <- file.path(output.path, "airline.xdf")
rxPredict(modelObject = model, data = inADS, outData = airline.xdf,
writeModelVars = TRUE, computeResiduals = TRUE, overwrite = TRUE)
rxGetVarInfo(airline.xdf)
# view residuals
rxHistogram(formula = ~ArrDelay_Resid, data = airline.xdf)
# zoom in
rxHistogram(formula = ~ArrDelay_Resid, data = airline.xdf, numBreaks = 100, startVal = -100, endVal = 100)
## example
bankXDF <- file.path(data.path, "BankXDF.xdf")
rxGetInfo(bankXDF, getVarInfo = TRUE, numRows = 6)
model <- rxLinMod(balance ~ age + duration + marital + housing, data = bankXDF)
model
test = rxPredict(modelObject = model, data = bankXDF, computeResiduals = TRUE, writeModelVars = TRUE, overwrite=TRUE)
rxGetVarInfo(test)
## Optional: estimating standard errors
model <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = inADS, covCoef = TRUE)
model$covCoef
# append to output
rxPredict(modelObject = model, data = airline.xdf, outData = airline.xdf,
computeStdErrors = TRUE, overwrite = TRUE)
rxGetVarInfo(airline.xdf)
# calculate confidence intervals
rxPredict(modelObject = model, data = airline.xdf, outData = airline.xdf,
computeStdErrors = TRUE, interval = "confidence", overwrite = TRUE)
rxGetVarInfo(airline.xdf)
### model Validation: split test & train
# Construct a variable to identify training data
set.seed(100)
rxDataStep(inData = airline.xdf,
outFile = airline.xdf,
transforms = list(
TrainValidate = factor(
ifelse(rbinom(length(DayOfWeek), size=1, prob=0.8), "train", "validate")
)
),
overwrite = TRUE)
rxGetVarInfo(airline.xdf)
rxHistogram(formula = ~TrainValidate, data = airline.xdf)
# splitting into two datasets
# Build a training data set
splitDS <- rxSplit(inData = airline.xdf, outFilesBase = file.path(output.path,
"airline"), splitByFactor = "TrainValidate", overwrite = TRUE)
splitDS # two files are located in here
train <- splitDS[[1]]
validate <- splitDS[[2]]
# Build a model on the training data
model <- rxLinMod(formula = ArrDelay ~ DayOfWeek + F(CRSDepTime),
data = train)
summary(model)
# use predict function to retrieve residuals from model object
rxPredict(modelObject = model, data = validate, outData = validate,
computeResiduals = TRUE, predVarNames = "ArrDelay_PredOutOfSample",
overwrite = TRUE)
rxGetInfo(validate, getVarInfo = TRUE, numRows = 5)
# correlation matrix - correlate actual & predicted
rxCor(formula = ~ArrDelay_PredOutOfSample + ArrDelay, data = validate)
### kfold cross validation
k = 10;
rxDataStep(inData = airline.xdf,
outFile = airline.xdf,
transforms = list(
kSplits = factor(sample(LETTERS[1:k], size = .rxNumRows, replace = TRUE))
),
transformObjects = list(LETTERS = LETTERS, k = k),
append = "rows",
overwrite = TRUE)
rxHistogram(~kSplits, data = airline.xdf)
# split into k-files
kSplits <- rxSplit(inData = airline.xdf, outFilesBase = file.path(output.path,
"airline"), splitByFactor = "kSplits", overwrite = TRUE)
# write a function
myLinModWrapper <- function(holdoutlevel, splitFiles) {
## first, estimate the model on all data point but those including
## holdoutlevel
print(holdoutlevel)
myMod <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = airline.xdf,
rowSelection = kSplits != holdout, transformObjects = list(holdout = holdoutlevel))
## Then, generate predictions
curHoldOut <- grep(paste("kSplits", holdoutlevel, "xdf", sep = "."),
names(splitFiles), value = TRUE)
rxPredict(myMod, data = splitFiles[[curHoldOut]], overwrite = TRUE,
predVarNames = "ArrDelay_kFold_Pred")
## Generate cross-validation metric for this hold out
curCor <- rxCor(~ArrDelay + ArrDelay_kFold_Pred, data = splitFiles[[curHoldOut]])
return(curCor["ArrDelay", "ArrDelay_kFold_Pred"])
}
# estimate all holdout samples
allCors <- vapply(LETTERS[1:k], myLinModWrapper, splitFiles = kSplits,
FUN.VALUE = numeric(1))
# visualise with a dotplot (dotplot() is a lattice function)
library(lattice)
dotplot(allCors, cex = 2, scales = list(y = list(cex = 1), x = list(cex = 1.5)),
xlab = list("Out-of-Sample Correlation", cex = 2), ylab = list("Hold out Sample",
cex = 2), panel = function(x, y, ...) {
panel.dotplot(x, y, ...)
panel.abline(v = mean(x), lty = 2, lwd = 3)
})
#### Model selection, stepwise, etc
## rxStepControl()
inADS <- RxXdfData(file.path(sample.data.dir, "AirlineDemoSmall.xdf"))
rxGetInfo(inADS, getVarInfo = TRUE)
# fit simple model, main effects only
model.main <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = inADS)
length(coef(model.main))
summary(model.main)
# fit model with interactions
model.interaction <- rxLinMod(ArrDelay ~ DayOfWeek * F(CRSDepTime),
data = inADS, dropMain = FALSE)
length(coef(model.interaction))
summary(model.interaction)
## comparing the models
# AIC = 2k - 2ln(L)   [k = number of parameters, L = maximised likelihood]
model.main$aic
model.interaction$aic
# compare relative likelihood
exp((model.interaction$aic - model.main$aic)/2)
## or do f test
Df <- c(model.interaction$df[1] - model.main$df[1], model.interaction$df[2])
SS <- c((model.main$residual.squares - model.interaction$residual.squares),
model.interaction$residual.squares)
MS <- SS/Df
fVal <- c(MS[-length(MS)]/MS[length(MS)], NA)
pVal <- pf(fVal, df1 = Df[-length(Df)], df2 = Df[length(Df)], lower = FALSE)
anova.tbl <- data.frame(Df, SS, MS, fVal, pVal)
dimnames(anova.tbl) <- list(c("modelDiff", "Residuals"), c("Df", "Sum Sq",
"Mean Sq", "F value", "Pr(>F)"))
stats:::print.anova(anova.tbl)
## automatic model selection
args(rxLinMod)
args(rxStepControl)
#method: method for performing model selection
#scope: the range of models to explore (see below)
myStepControl <- rxStepControl(method = "stepwise", scope = list(lower = ~DayOfWeek +
F(CRSDepTime), upper = ~DayOfWeek * F(CRSDepTime)), test = "F")
# define starting model and then run
model.step1 <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), variableSelection = myStepControl,
data = inADS, dropMain = FALSE)
# model sequence
model.step1$anova
summary(model.step1) # final model
## Another example
bigger.ADS <- file.path(big.data.path, "AirOnTime7Pct.xdf")
rxGetInfo(bigger.ADS, getVarInfo = TRUE)
mySel <- rxStepControl(method = "stepwise", scope = ~DayOfWeek * F(CRSDepTime) +
DepDelay + TaxiOut + UniqueCarrier + Distance)
model.complex1 <- rxLinMod(ArrDelay ~ 1, data = bigger.ADS, variableSelection = mySel)
summary(model.complex1)
model.complex1$anova
### LOgistic regression
args(rxLogit)
mortgages <- file.path(sample.data.dir, "mortDefaultSmall.xdf")
rxGetVarInfo(mortgages)
myformula <- formula(default ~ F(year) + ccDebt + creditScore + houseAge +
yearsEmploy)
model = rxLogit(myformula, data=mortgages)
# predict for some new mortgages
new_mortgages <- data.frame(year = rep(c(2006, 2009), each = 4), ccDebt = rep(c(1000,
10000), 4), creditScore = rep(c(700, 800), 4), houseAge = rep(c(1,
5, 10, 20), 2), yearsEmploy = rep(7, 8))
str(new_mortgages)
rxPredict(modelObject=model, data=new_mortgages)
#### GLMs - have a family argument: logic, probit, log...
# rxGlm()
args(rxGlm)
census2000 <- file.path(big.data.path, "Census5PCT2000.xdf")
census_insurance <- file.path(output.path, "Census5PCT2000_insurance.xdf")
if (!file.exists(census_insurance)) {
rxDataStep(inData = census2000, outFile = census_insurance, rowSelection = (related ==
"Head/Householder") & (age > 20) & (age < 90), varsToKeep = c("propinsr",
"age", "sex", "region", "perwt"), blocksPerRead = 1, overwrite = TRUE)
}
# --> this is about 10 GB
rxGetVarInfo(census_insurance)
p <- rxSummary(~region, data = census_insurance)
print(p, head = FALSE)
rxHistogram(~propinsr, data = census_insurance, pweights = "perwt")
propinGlm <- rxGlm(propinsr ~ sex + F(age) + region, pweights = "perwt",
data = census_insurance, family = rxTweedie(var.power = 1.5),
dropFirst = TRUE)
p <- summary(propinGlm)
print(p, head = FALSE)
outData <- file.path(output.path, "census_insurance_pred.xdf")  # output file for predictions (file name illustrative)
rxPredict(propinGlm, data = census_insurance, outData = outData, overwrite = TRUE)
rxGetInfo(outData, getVarInfo = TRUE, numRows = 5)
rxHistogram(~propinsr_Pred, data = outData, pweights = "perwt")
### logistic with rxGLM
propinGlm <- rxGlm(default ~ ccDebt + F(year) + creditScore + houseAge + yearsEmploy,
data = mortgages, family = binomial())
#### Decision trees
# rxDTree(formula, data, ...)
# control: maxDepth, minSplit, minBucket, maxNumBins, cost,
# outFile = write out terminal node for each observation
# parms: loss - specify loss matrix
infile <- file.path(data.path, "BankXDF.xdf")
BankDS <- RxXdfData(file = infile)
myBankDS <- file.path(output.path, "mybank.xdf")
Tree <- rxDTree(housing ~ age + balance, data = BankDS, cp = 0.001)
# cp = 'complexity parameter' = the minimum improvement in fit a split must achieve to be kept
# visualise
library(RevoTreeView)
plot(createTreeView(Tree))
### Regression tree
RTree <- rxDTree(housing_num ~ age + balance, data = BankDS, outFile = myBankDS,
writeModelVars = TRUE, overwrite = TRUE, extraVarsToWrite = "housing_num",
transforms = list(housing_num = housing == "yes"), cp = 0.005)
plot(createTreeView(RTree))
# using loss matrix in parms
curlvls <- rxGetVarInfo(BankDS, varsToKeep = "housing")[["housing"]]$levels
(loss.mat <- matrix(c(0, 1.5, 1, 0), ncol = 2, dimnames = list(actual = curlvls,
predicted = curlvls)))
## predicted
## actual yes no
## yes 0.0 1
## no 1.5 0
TreeLossMat <- rxDTree(formula = housing ~ age + balance, data = BankDS,
parms = list(loss = loss.mat), cp = 0.001)
plot(createTreeView(TreeLossMat))
# predictions: rxPredict() - can have prob (probability for class), class (predicted terminal node), ...
args(RevoScaleR:::rxPredict.rxDTree)
rxPredict(Tree, data = BankDS, outData = myBankDS, overwrite = TRUE,
writeModelVars = TRUE)
rxGetVarInfo(myBankDS)
rxPredict(RTree, BankDS, myBankDS, overwrite = TRUE, writeModelVars = TRUE,
predVarNames = c("predicted_housingnum_dtree"))
## visualization - ROC/AUC
rxRocCurve(actualVarName = "housing_num", predVarNames = "predicted_housingnum_dtree",
data = myBankDS)
roc.df <- rxRoc(actualVarName = "housing_num", predVarNames = "predicted_housingnum_dtree",
data = myBankDS)
head(roc.df)
rxAuc(roc.df)
#### Randomforests
infile <- file.path(data.path, "BankXDF.xdf")
BankDS <- RxXdfData(file = infile)
Forest <- rxDForest(housing ~ balance + age, data = BankDS, seed = 10,
cp = 0.01, nTree = 50, mTry = 2, overwrite = TRUE)
Forest
# we can access each tree
t(all.imp <- vapply(Forest$forest, getElement, "variable.importance",
FUN.VALUE = numeric(2)))
# notes
# out-of-bag error: each tree is fit to a bootstrap sample (roughly 2/3 of the rows);
# the remaining ~1/3 of rows is run through that tree to estimate its error
## determining variable importance
# choose an impurity measure, like Gini Index
# calculate Gini Index before and after each split
# sum up Gini improvements for each variable
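# Toy illustration of the Gini improvement idea (hand-rolled, not the rx implementation):
gini <- function(y) { p <- prop.table(table(y)); 1 - sum(p^2) }
y <- factor(c("yes","yes","yes","no","no","yes","no","no"))
split_left  <- y[1:4]   # pretend a split sends the first 4 rows left...
split_right <- y[5:8]   # ...and the last 4 rows right
before <- gini(y)
after  <- (length(split_left)/length(y)) * gini(split_left) +
          (length(split_right)/length(y)) * gini(split_right)
before - after   # the Gini improvement credited to the splitting variable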
Forest <- rxDForest(housing ~ balance + age, data = BankDS, seed = 10,
cp = 0.01, nTree = 50, mTry = 1, maxDepth = 5, importance = TRUE,
overwrite = TRUE)
Forest$importance
rxVarImpPlot(Forest)
## prediction types: response, prob, vote
# category = majority prediction
# regression = average of predictions
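# Hedged sketch only: the 'type' argument name is an assumption about rxPredict
# for rxDForest objects (the type values themselves come from the notes above):
# rxPredict(Forest, data = BankDS, outData = myBankDS, type = "prob",
#           writeModelVars = TRUE, overwrite = TRUE)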
#### GOODNESS OF FIT MEASURES #####
rxRocCurve()
rxAuc(roc.df)
############ UNSUPERVISED LEARNING ##################
### Clustering
args(rxKmeans)
infile <- file.path(data.path, "BankXDF.xdf")
BankDS <- RxXdfData(file = infile)
mybankfile <- file.path(output.path, "mybankXDF.xdf")
bank_clusters <- rxKmeans(~duration + balance, data = BankDS, numClusters = 3,
outFile = mybankfile, writeModelVars = TRUE, extraVarsToWrite = "randSamp",
transforms = list(randSamp = sample(10, .rxNumRows, replace = TRUE)),
overwrite = TRUE)
bank_clusters
bank_clusters$centers
# plot 1
library(latticeExtra)
plt = rxLinePlot(balance ~ duration, data = BankDS, type = "p")
plt + as.layer(xyplot(balance ~ duration, data = as.data.frame(bank_clusters$centers),
col = "red"))
# plot 2
intern.km.df <- rxXdfToDataFrame(file = mybankfile, rowSelection = randSamp == 1)
plot(balance ~ duration, data = intern.km.df, col = intern.km.df$.rxCluster,pch = 19)
with(as.data.frame(bank_clusters$centers), points(duration, balance, col = "lightblue", pch = "x", cex = 2))
## Scaling variables
bank_summary <- rxSummary(~ duration + balance, data = BankDS)
bank_clusters <-
rxKmeans(~ duration_scl + balance_scl,
data = BankDS, writeModelVars = TRUE,
outFile = mybankfile, outColName = ".rxCluster_scl",
transforms = list(
duration_scl = (duration - duration_mean)/
duration_stdev,
balance_scl = (balance - balance_mean)/
balance_stdev),
transformObjects = list(
duration_mean=bank_summary$sDataFrame$Mean[1],
duration_stdev=bank_summary$sDataFrame$StdDev[1],
balance_mean=bank_summary$sDataFrame$Mean[2],
balance_stdev=bank_summary$sDataFrame$StdDev[2]),
numClusters = 3, overwrite = TRUE)
bank_clusters
bank_clusters$centers
# plot 1
unscaled_centers <- t(t(bank_clusters$centers) * bank_summary$sDataFrame$StdDev +
bank_summary$sDataFrame$Mean)
plt = rxLinePlot(balance ~ duration, data = BankDS, type = "p")
plt + as.layer(xyplot(balance_scl ~ duration_scl, data = as.data.frame(unscaled_centers),
col = "red", pch = 19))
# plot 2
intern.km.df2 <- rxXdfToDataFrame(file = mybankfile, rowSelection = randSamp ==1)
plot(balance ~ duration, data = intern.km.df2, col = intern.km.df2$.rxCluster_scl,pch = 19)
with(as.data.frame(unscaled_centers), points(duration_scl, balance_scl,col = "lightblue", pch = "x", cex = 2))
## How many clusters?
SS <- rep(NA, 20)
for (i in 2:20) SS[i] <- rxKmeans(~duration + balance, data = BankDS,
numClusters = i)$tot.withinss
plot(1:20, SS, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")
## k-means can be sensitive to starting centroid positions
## use numStarts to repeat the fit from multiple random starts and keep the best (quick base-R illustration below)
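# Quick base-R illustration of the same idea: nstart repeats the fit from
# several random starts and keeps the best solution (lowest within-cluster SS)
set.seed(1)
km_multi <- kmeans(scale(mtcars[, c("mpg", "hp")]), centers = 3, nstart = 20)
km_multi$tot.withinss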
########## PRINCIPAL COMPONENTS ##############
## no rx function: must be run locally
## Goal: Simple data reduction technique.
## Main idea: Sequentially create variables that are linearly
## independent (i.e. uncorrelated) that capture as much variance in the
## dataset as possible. Create the variables based on linear
## combinations of the variables already in your dataset.
# how to build our own in-db function
airDS <- RxXdfData(file.path(big.data.path, "AirOnTime7Pct.xdf"))
rxGetInfo(airDS, getVarInfo = TRUE)
# We will explore the relationship among Year, DepDelay, ArrDelay,
# TaxiOut, TaxiIn, CRSDepTime, and Distance.
vars2use <- c("Year", "DepDelay", "ArrDelay", "TaxiOut", "TaxiIn",
"CRSDepTime", "Distance")
rxGetVarInfo(airDS, varsToKeep = vars2use)
# Step 1. Compute the correlation matrix.
# rxCor() uses complete method – if any values are missing, then that entire row is dropped.
airformula <- as.formula(paste("~", paste(vars2use, collapse = "+")))
(cormat <- rxCor(airformula, data = airDS))
# Step 2. apply open-source princomp() function
# remember to use covmat parameter
args(stats:::princomp.default)
(myair.pca <- princomp(covmat = cormat))
summary(myair.pca)
plot(myair.pca)
# The loadings correspond to the eigen vectors (i.e. the linear
# combinations for each of the components).
myair.pca$loadings
# Now, apply: Create new factor scores associated with the first K principal components.
myAirXDF <- file.path(output.path, "myAirDSpca.xdf")
rxDataStep(airDS, outFile = myAirXDF, varsToKeep = vars2use, overwrite = TRUE,
removeMissings = TRUE)
rxGetInfo(myAirXDF, getVarInfo = TRUE)
# verify correlations
cormat2 <- rxCor(airformula, data = myAirXDF)
cormat - cormat2
# summarise dataset
summaryResults <- rxSummary(airformula, data = myAirXDF)$sDataFrame
curMeans <- setNames(summaryResults$Mean, summaryResults$Name)
curStd <- setNames(summaryResults$StdDev, summaryResults$Name)
# finally, create a transformfunc
# note: matrix multiplication
createPCAFactorVars <- function(mylist){
## scale the variables
scaledvars <- as.data.frame(
mapply(function(curvar, curmean, cursd){ scale(curvar, center = curmean, scale = cursd)},
mylist[curvars],
mymeans[curvars],
mysds[curvars]
)
)
colnames(scaledvars) <- paste0(curvars,"Z")
Components <- as.matrix(scaledvars) %*% mypca$loadings
return(c(mylist, as.list(scaledvars), as.list(as.data.frame(Components))))
}
# apply transformfunc
rxDataStep(myAirXDF,
outFile = myAirXDF,
varsToKeep = vars2use,
append = "cols",
overwrite = TRUE,
transformFunc = createPCAFactorVars,
transformObjects = list(curvars = vars2use, mymeans = curMeans,
mysds = curStd, mypca = myair.pca)
)
# let's look at it
(newSummary <- rxSummary(~., data = myAirXDF)$sDataFrame)
####### Elastic net / ridge regression / other variable selection... ############
# stepwise selection = too much chance: coefficients come out too large, results depend on the pre-specified model, and it overfits
# apply shrinkage or regularisation to reduce estimates
# - ridge
# - lasso (L1 constraint)
# basically: minimise[(lack of fit criterion) + (overfitting penalty)]
# Ridge: overfitting penalty = lambda * sum(Beta^2)
#        added to the lack of fit = sum((y - X %*% Beta)^2)
#        choose lambda with cross-validation, etc
#        disadvantage: coefficients shrink but always remain non-zero, so variables are never removed
# Lasso: as before, but the penalty = lambda * sum(|Beta|) (the L1 constraint)
#        <-- now coefficients can shrink to exactly zero and drop out, i.e. it does variable selection
#        disadvantages: can be volatile with correlated predictors (which one does it pick?)
#        and with wide data (n < p) it can keep at most n variables
# Elastic net = combines ridge and lasso
#        penalty = lambda * sum((1-alpha)/2 * Beta^2 + alpha * |Beta|)
# in R: use the glmnet & glmnetUtils packages (minimal sketch below)
# see slides in gmail
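# Minimal glmnet sketch of the above (assumes the glmnet package is installed;
# mtcars is used purely for illustration):
library(glmnet)
x <- model.matrix(mpg ~ . - 1, data = mtcars)   # predictor matrix (no intercept column)
y <- mtcars$mpg
fit_ridge <- glmnet(x, y, alpha = 0)     # alpha = 0 -> ridge penalty
fit_lasso <- glmnet(x, y, alpha = 1)     # alpha = 1 -> lasso penalty
fit_enet  <- glmnet(x, y, alpha = 0.5)   # 0 < alpha < 1 -> elastic net
# pick lambda by cross-validation, then look at which coefficients survive
cv_lasso <- cv.glmnet(x, y, alpha = 1)
coef(cv_lasso, s = "lambda.min")         # lasso sets some coefficients exactly to zero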
### Neural networks
# nnet(y ~ x1 + x2 + ..., data, size, linout, entropy, softmax, maxit, decay, skip, rang)
# size: number of hidden nodes
# linout: TRUE for a regression network (linear output unit), FALSE (logistic output) for classification
# entropy: TRUE for a classification network with a numeric 0/1 response, FALSE for regression
# softmax: TRUE for a classification network where y is a factor with several classes, FALSE otherwise
# maxit: maximum number of iterations
# decay: weight decay parameter
# skip: TRUE to use skip-layer connections (often a good idea)
# rang: initial range for weights (if random starting values used)
# (a small runnable example follows)
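# A small runnable nnet example (assumes the nnet package is installed):
library(nnet)
set.seed(1)
# classification: factor response, 3 hidden nodes (softmax is used automatically for >2 classes)
nn_class <- nnet(Species ~ ., data = iris, size = 3, decay = 0.01, maxit = 200)
table(predict(nn_class, iris, type = "class"), iris$Species)
# regression: linout = TRUE gives a linear output unit (in practice you would usually scale the inputs first)
nn_reg <- nnet(mpg ~ hp + wt, data = mtcars, size = 3, linout = TRUE, decay = 0.01, maxit = 500)
cor(predict(nn_reg, mtcars), mtcars$mpg)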
#############################################
##### CODE NOTES FROM SWINBURNE
#############################################
######## RANDOM VARIABLES #########
#### DISCRETE #####
# Your mark for this subject
X = c(30, 40, 50, 60, 70, 80)
# The probability of you getting these marks
P = c(0.3, 0.3, 0.05, 0.05, 0.1, 0.2)
# Checking the sum of probabilities equal to 1
sum(P)
### Calculating means and variance
# can be done vectorwise or matrix algebra
mean=sum(X*P)
mean
sum( (X-mean)^2*P )
# MATRIX
mean=t(X)%*%P
mean
####### UNIFORM ########
# Generate 10 uniform random numbers between 3 and 5.
runif(10, 3, 5)
# Generate 100000 uniform random numbers between 3 and 5.
X <- runif(100000,3,5)
# Look at its histogram.
hist(X)
# compute its mean using the formula.
(1/2)*(3+5)
# verify using the mean function.
mean(X)
# compute its variance using the formula.
(1/12)*(5-3)^2
# verify using the var function.
var(X)
########### NORMAL
# Generate 10,000 N(0,1) numbers (i.e., standard normal).
# no need to specify the mean and variance.
X<-rnorm(10000)
# Generate 10,000 N(30, 10^2) numbers (mean 30, sd 10).
X<-rnorm(10000,30,10)
# checking the histogram.
hist(X)
# checking the mean and variance.
mean(X)
var(X)
########## CHI
# Generate 10,000 Chi-squared numbers with 5 degrees of freedom.
X<-rchisq(10000,5)
# Look at its histogram.
hist(X)
# Verify that the mean and variance are indeed df and 2*df (here 5 and 10) respectively...
mean(X)
var(X)
##### F-DISTRIBUTION
# Generate 100 F numbers with df1 = 5, df2 = 2.
X<-rf(100,5,2)
# Look at its histogram.
hist(X)
############### BETA
## BETA
# mean = a/(a+b)
# median = qbeta(0.5,a,b)
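# quick numeric check with a = 2, b = 5:
a <- 2; b <- 5
a/(a + b)          # mean = 2/7, about 0.29
qbeta(0.5, a, b)   # median, about 0.26
curve(dbeta(x, a, b), from = 0, to = 1)  # the Beta(2,5) density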
############### BAYES EXAMPLES
### BETA BINOMIAL COIN TOSS - large sample (n = 100)
a <- 5
b <- 5
s <- 70 # which is y in our previous slides
f <- 30 # which is n – y in our previous slides
# The Prior
curve(dbeta(x,a,b),lty=3,lwd=4, col="red", ylim = range(0,10))
# The Likelihood
curve(dbeta(x,s+1,f+1),add=TRUE, lty=2,lwd=3, col="blue")
# The Posterior
curve(dbeta(x,a+s,b+f), from=0, to=1, add=TRUE, xlab="p",ylab="Density",lty=1,lwd=3, col="green")
legend(.1,8,c("Posterior","Likelihood","Prior"),lty=c(1,2,3),lwd=c(3,3,3),
col=c( "green", "blue", "red"))
### BETA BINOMIAL COIN TOSS - small sample
a <- 5
b <- 5
s <- 7 # which is y in our previous slides
f <- 3 # which is n - y in our previous slides
# The Prior
curve(dbeta(x,a,b),lty=3,lwd=4, col="red", ylim = range(0,5))
# The Likelihood
curve(dbeta(x,s+1,f+1),add=TRUE, lty=2,lwd=3, col="blue")
# The Posterior
curve(dbeta(x,a+s,b+f), from=0, to=1, add=TRUE, xlab="p",ylab="Density",lty=1,lwd=3, col="green")
legend(.1,4.5,c("Posterior","Likelihood","Prior"),lty=c(1,2,3),lwd=c(3,3,3),
col=c( "green", "blue", "red"))
# Posterior median (the 0.5 quantile of the Beta posterior)
qbeta(0.5, a + s, b + f)
# Bayesian 90% confidence interval
qbeta(c(0.05,0.95), a + s, b + f)
#### What if all ten tosses are heads?
a <- 5
b <- 5
s <- 10 # which is y in our previous slides
f <- 0 # which is n - y in our previous slides
# The Prior
curve(dbeta(x,a,b),lty=3,lwd=4, col="red", ylim = range(0,8))
# The Likelihood
curve(dbeta(x,s+1,f+1),add=TRUE, lty=2,lwd=3, col="blue")
# The Posterior
curve(dbeta(x,a+s,b+f), from=0, to=1, add=TRUE, xlab="p",ylab="Density",lty=1,lwd=3, col="green")
legend(.1,8,c("Posterior","Likelihood","Prior"),lty=c(1,2,3),lwd=c(3,3,3),
col=c( "green", "blue", "red"))
################ FITTING MAXIMUM LIKELIHOOD ###################
# Assume gaussian pdf for epsilon
# y = XB + epsilon, where epsilon ~ N(0, sigma^2)
# import US_Homicides_1990.csv
data <- read.csv(file.choose())
data<-data.matrix(data)
one <- rep(1,dim(data)[1]) # create constant
one <- data.matrix(one)
y <- data[,1]
y<-data.matrix(y)
X <- data[,(-1)]
X <- cbind(one,X)
X<- data.matrix(X)
# define log-likelihood function
# Note: SIGNS HAVE BEEN REVERSED as optim() only finds MINIMUM
lnL <- function(p)
(dim(data)[1]/2)*log(2*3.1415926)+(dim(data)[1]/2)*log(p[1])+(1/(2*p[1]))*(t(y-X%*%p[2:8])%*% (y-X%*%p[2:8]))
# Find maximum - NOTE: OPTIM finds MINIMUM, so we have to REVERSE all signs
out <- optim(c(rep(1,8)), lnL, hessian=TRUE, method = "BFGS")
beta <- matrix(out$par)
# Note: first value is variance --> ignore
# Second value is for intercept
# remainder are coefficients for parameters
stdev <- matrix(sqrt(diag(solve(out$hessian))))
t_beta <- beta/stdev # t-statistics
cbind(beta,t_beta) # bind coefficient estimate & t-statistics
# Interpret coefficients positive or negative
# Interpret t:
# Remember that you are controlling for all other variables
##### LME EXAMPLE FROM ASSIGNMENT 1 ############
# just estimating mean & SD
y=c(6.1,5.9,6.2,5.7,5.1,6.7,5.8,6.4,6.8,4.5,6.3,6.1,6.6,6.1,5.6,6.3,6.6,5.4,6.1,6.2,5.2,6.2,
6.3,6.7,6.5,5.8,5.4,4.8,6.8,6.3,6.5,6.2,6,5.8,5.2,6.3,6,6.1,4.8,6)
n = length(y)
## Define function
# note that we switch signs (minuses to pluses) to accommodate
# the optim() function, which _minimises_ by default
# p[1] will be the variance, p[2] will be the mean
lnL <- function(p)
(n/2)*log(2*pi*p[1]) + (1/(2*p[1]))*(t(y-p[2])%*%(y-p[2]))
out <- optim(c(rep(1,2)), lnL, hessian=TRUE, method = "BFGS")
# print variance
print(out$par[1])
# print mean
print(out$par[2])
############# LECTURE 4
#### Slides #2: "The bayesian computation problem" on
## Example: Waiting until a hit in baseball - using beta-binomial conjugates
# Note that observations are WAIT TIMES, but we are estimating p = probability of hit
# observations= wait time until hit baseball
# note that 0's = non-hits
baseball <- c(0,2,0,4,1,0,2,0,1,0,0,1,1,3,1,0,0,0,1,6,0,9,0,4,1,9,1,0,3,4,5,5,1,0,2,
4,0,4,0,3,2,1,0,1,3,7,0,3,1,2,14,4,0,1,6,1,10,1,2,0,1,0,4,5,0,7,3,1,2,1,2,1,2,2,
4,3,3,1,1,2,1,2,7,0,3,1,2,2,2,2,0,3,4,1,1,0,0,1,1,1,11,2,2,1,3,1,0,1,2,1,1,1,0,0,
2,0,10,1,2,2,1,1,3,1,1,0,0,1,0,1,0,1,1,0,1,0,0,0,2,1,4,5,5,0,0,0,0,2,0,8,5,2,11,
8,0,7,1,3,1)
n <- length(baseball) # [1] 159
s <- sum(baseball) # [1] 336
# prior = beta distribution
a = 44
b = 102
curve(dbeta(x,a,b), from=0.2, to=0.5,
xlab="p",ylab="Density",lty=1,lwd=3, main= "Prior")
# The likelihood function
# A function of p, defined as:
likelihood = function(p)
{
p^n*(1-p)^s
}
curve(likelihood, from=0.2, to=0.5, xlab="p", ylab="Density", main= "Likelihood Function")
abline(v=0.321, lty=2)
# POSTERIOR - using beta-binomial known conjugacy
a=44
b=102
n=159
s=336
A=n+a
B=s+b
curve(dbeta(x,A,B), from=0.2, to=0.5,
xlab="p",ylab="Density",lty=1,lwd=3, main= "Posterior Density Function (1)")
# --> note: dbeta here is a proper (normalised) density; the explicitly-defined posterior below is unnormalised but has the correct shape
## or alternatively we can define the posterior function explicitly
posterior = function(p)
{
p^(n+a-1)*(1-p)^(s+b-1)
}
#plot of the posterior density function
curve(posterior, from=0.2, to=0.5, xlab="p", ylab="Density",lty=1,lwd=3 , main= "Posterior Density Function (2)")
# Using the qbeta function, find the median of the posterior density of p.
a=44
b=102
n=159
s=336
A=n+a
B=s+b
posterior.median = qbeta(p = 0.5, A, B)
posterior.median
# Using the qbeta function, construct a 95% Bayesian interval estimate for p.
Bayesian_95Interval = qbeta(c(0.025,0.975), A, B)
Bayesian_95Interval
# Using the rbeta function, simulate 100 values from the posterior density of p.
a=44
b=102
n=159
s=336
A=n+a
B=s+b
sim.posterior=rbeta(100, A, B)
# Use the hist function on the simulated sample to display the posterior density.
hist(sim.posterior, col="gray" , main="Simulation for Posterior Density", xlab="p", freq=FALSE )
curve(dbeta(x,a,b), col="red", lty=2, add=TRUE)
curve(dbeta(x,A,B), col="blue", lty=2, add=TRUE)
## Use simulation to get mean & SD
# Using the simulated draws, approximate the mean and S.D. of the posterior density
mean(sim.posterior)
# very close to A/(A+B)
sd(sim.posterior)
# Using the simulated draws, construct a 95% Bayesian interval estimate.
sim_posterior_95Interval = quantile(sim.posterior, c(0.025,0.975))
sim_posterior_95Interval
#########################################################
######### Oct 1 - MARKOV CHAIN MONTE CARLO #############
#########################################################
#### EXAMPLE ####
# Demonstrate bayes
install.packages('LearnBayes', lib='c:\\notbackedup\\R\\')
library(LearnBayes, lib='c:\\notbackedup\\R\\')# a Bayesian package in R
## Define prior
# a sequence of mid-points
midpt <- seq(0.05, 0.95, by = 0.1) # a sequence of mid-points
# a sequence of "heights" for the prior density
# these are clearly not proper densities.
prior <- c(1, 5.2, 8, 7.2, 4.6, 2.1, 0.7, 0.1, 0, 0)
# Divide by their sum total, they now become proper densities.
prior <- prior/sum(prior)
# histprior is a function in LearnBayes package that plots histogram
curve(histprior(x,midpt,prior), from=0, to=1,ylab="Prior density",ylim=c(0,.3))
## now find posterior
s <- 4 # 4 heads (successes)
f <- 6 # 6 tails (failures)
# define "myfn" as the product of the likelihood and the prior
myfn <- function(x)
histprior(x,midpt,prior)*dbeta(x,s+1,f+1)
curve(myfn(x), from=0, to=1, ylab="Posterior density")
##--> SHAPE is BIZARRE, doesn't correspond to any nice, known distributions
# Not a proper density because doesn't sum to 1
## In this case, easy to solve
# we can just numerically evaluate area under curve to get normalising constant
pvals <- seq(0, 1, length=1000)
# evaluating "myfn" over the space of [0,1]
mean(myfn(pvals))
## --> this is our normalising constant --> 0.1725680
post <- function(x)
histprior(x,midpt,prior)*dbeta(x,s+1,f+1)/0.1725680
## simulate values to get mean, etc.
# 1000 posterior values evaluated at 0.001 intervals
pev <- post(pvals)
# the above "pev" is close but not entirely accurate due to rounding.
# standardize it
pev <- pev/sum(pev)
# sample WITH replacement from pvals using pev as probability input.
ps <- sample(pvals, replace = TRUE, prob = pev)
# The Mean of the simulated sample
mean(ps)
# The Median of the simulated sample
quantile(ps, 0.5)
# 90% credible interval (5th to 95th percentile)
quantile(ps, c(0.05,0.95))
### MARKOV CHAINS ####
# The sunny/rainy example
# for 1 day
P = matrix(c(0.90,0.50,0.10,0.50), 2,2)
# 2 days
P%*%P
# long term equilibrium - N days
N = 20
P = matrix(c(0.90,0.50,0.10,0.50),2,2)
P11 = matrix(0,N,1)
P12 = matrix(0,N,1)
P21 = matrix(0,N,1)
P22 = matrix(0,N,1)
P11[1] = P[1,1]
P12[1] = P[1,2]
P21[1] = P[2,1]
P22[1] = P[2,2]
Temp = P
for (i in 1:(N-1))
{
Temp = Temp%*%P
P11[i+1] = Temp[1,1]
P12[i+1] = Temp[1,2]
P21[i+1] = Temp[2,1]
P22[i+1] = Temp[2,2]
}
EP = Temp # this is the equilibrium probability
par(mfrow=c(2,2))
plot(P11)
plot(P12)
plot(P21)
plot(P22)
# notice the burn-in period? Must throw that away
## Eventually markov chain converges to EQUILIBRIUM DISTRIBUTION
## Equilibrium is what we're interested in,
# This is the distribution ignoring the conditionality of time steps
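# The equilibrium distribution can also be read off directly as the normalised
# left eigenvector of P for eigenvalue 1 (a quick check on the matrix-power result above):
P <- matrix(c(0.90, 0.50, 0.10, 0.50), 2, 2)
ev <- eigen(t(P))                               # left eigenvectors of P = eigenvectors of t(P)
pi_eq <- ev$vectors[, 1] / sum(ev$vectors[, 1])
pi_eq                                           # ~ (0.833, 0.167) = long-run P(sunny), P(rainy)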
#### MCMC #####
## Example of ages
dval <- c(200,180,170,165,190,195,184,183,150,157)
# log-prior = a function of mu. Note: log taken
lprior <- function(mu) dnorm(mu, 175, 100,log = TRUE)
# log-likelihood = sum of logs of normal probabilities across all observations
llike <- function(mu) sum(dnorm(dval, mean = mu, sd = 15, log = TRUE))
# log-posterior function - normalising constant not needed
lpost <- function(mu) lprior(mu) + llike(mu)
#now, do MCMC = simulate values and save if accepted
mu0 <- 180
s <- 10 # this controls the standard deviation of normal distribution from which "jumps" are drawn
n <- 10000 # number of steps
mus <- c(mu0, numeric(n))
for(i in 1:n)
{
# generating proposal value of mu = current value + random draw value
mup <- mus[i] + rnorm(1,0,s) # s is SD of the jump
p <- exp(lpost(mup) - lpost(mus[i])) # acceptance probability = ratio of posteriors
# either accept or reject the proposal
if(p < runif(1)) mus[i+1] <- mus[i] else mus[i+1] <- mup
}
# discard burn-in
musb <- mus[-(1:1001)] # discards burn-in values
# Examine our chain with the coda package
install.packages('coda', lib='c:\\notbackedup\\R\\')
library(coda, lib='c:\\notbackedup\\R\\')# a Bayesian package in R
musc <- as.mcmc(musb) # converts to MCMC object
rejectionRate(musc) # get rejection rate
# 0.53 = 53% of proposals rejected
## PLOT
# plot posterior
par(mfrow=c(2,1)) # 2 graphs stacked on top of each other
traceplot(musc)
densplot(musc)
summary(musc)
##################### END OCT 1 LEC ########################
##### OCT 8 LEC: MCMC ################
install.packages('ismev', lib='c:\\notbackedup\\R\\')
library(ismev, lib='c:\\notbackedup\\R\\')# a Bayesian package in R
install.packages('LearnBayes', lib='c:\\notbackedup\\R\\')
library(LearnBayes, lib='c:\\notbackedup\\R\\')# a Bayesian package in R
data(engine)
plot(engine$Corr,engine$Time)
# as corrosion increases, time to failure gets shorter
# We model as exponential distribution
# x = corrosion
# y = a * x ^b
# so, theta = (a,b)
# Exponential distributions are parameterised by a RATE parameter
# if we just use MLE:
# First specify the negative log-likelihood function (optim() minimises by default)
fn = function(p, engine)
{
n = nrow(engine) # number of observations
y = engine$Time # engine failure time
x = engine$Corrosion # corrosion as a predictor
-n*log(p[1]) - p[2]*sum(log(x)) + p[1]*sum(x^p[2] * y)
}
out = optim(c(1.16, 0), fn, method = "BFGS",
hessian = TRUE, engine = engine)
round(out$par,3)
round(sqrt(diag(solve(out$hessian))), 3)
plot(engine$Corr,engine$Time)
CorrLev = seq(0, 4.2, len=100)
lines(CorrLev, 1/(1.133* CorrLev^0.479)) # these are coefficients
# "The rate parameter of an exponential distribution is the reciprocal of the mean" - so we need to use reciprocal
# so if we predict rate, we take reciprocal to get mean
## The bayesian method
# define the log of the posterior
lpost <- function(p, engine)
{
y <- engine$Time
x <- engine$Corrosion
# the log of the joint prior is equal to the sum of log marginal
# Note: this is because we defined the priors as independent, so p(a,b) = p(a)*p(b)
lprior <- dnorm(p[1],0,100,log=TRUE) + dnorm(p[2],0,100,log=TRUE)
# the likelihood uses the exponential function
# note that the "a" coefficient is "exp(a)"
llike <- sum(dexp(y, rate = exp(p[1])*x^p[2], log = TRUE))
# the log posterior is the sum of log prior and log likelihood
lprior + llike
}
start <- c(log(1.16),0) # starting values. We've assumed p[1] is logged
# see next slide for details of gibbs()
ftmcmc <- gibbs(lpost, start, 100000, c(0.25,0.25), engine = engine)
# get the accept rate. remember, we want this to be ~50-60%
ftmcmc$accept
# get the sample chain
ftmcmc$par
hist(ftmcmc$par[,1]) # histogram of the first parameter (remember, we are sampling log(a))
hist(exp(ftmcmc$par[,1]))
# get means
mean(exp(ftmcmc$par[-(1:10000),1]))
mean(ftmcmc$par[-(1:10000),2])
########### ELICITATION
quantile1 <- list(p=.05,x=0.25) # 5%tile at x = 0.25
quantile2 <- list(p=.95,x=0.75) # 95%tile at x = 0.75
# select parameters for a Beta distribution with the above quantiles.
beta.select(quantile1,quantile2)
########## COIN SELECTION EXAMPLE
# drawing pennies from jar then tossing
# each penny could have different probability of landing on heads
# But we know it has a uniform distribution from 0-1
# we select a coin at random, flip it 10 times, get 3 heads
# Q. we toss it 12 more times, what is p(4 heads)?
# two parts
# p(heads) for chosen coin
# p(4 heads | 12 tosses) for that coin
# So what is joint density?
# {see slides}
random.coin.gibbs = function (p=0.5, m = 1000)
{
S = matrix(0,m,2)
dimnames(S)[[2]] = c("p","y")
for (j in 1:m)
{
y = rbinom(1, size=12, prob=p)
p = rbeta(1, y+4, 20-y)
S[j,] = c(p,y)
}
return(S)
}
sim.values = random.coin.gibbs()
plot(sim.values)
# p(y=4)
table(sim.values[,"y"])
# reading off the table: p(y=4) is approximately (number of 4's) / 1000, the total simulation size
# p:
mean(sim.values[,"p"])
################################################################
### PART 2 of course ###########################################
################################################################
#############################################
##### MARKOV CHAIN ######
#############################################
## Finding long-term steady-state
transmat<-matrix(c(0.9, 0.2, 0.1, 0.8), nrow=2, byrow=TRUE)
s0<-matrix(c(3000,4000), nrow=2)
s2<-transmat%*%transmat%*%s0
steady50<-transmat
for(i in 1:50)
steady50<-transmat%*%steady50
steady50
steady51<-transmat
for(i in 1:51)
steady51<-transmat%*%steady51
steady51
#### Generate sample for Gibbs
n<-50 #Data
x<-rnorm(n,5,2) # mean is 5 and sd=2
N<-1000 #Run MC=2 chains of length N=1000
simmat<-matrix(0, ncol=2, nrow=N) # store the results
for (j in 1:2) { #Loop over chains
tau<-rgamma(1,n/2,1/2) #Starting value for tau
for (i in (1:N)) { #Gibbs iterations
mu<-rnorm(1,mean(x),sqrt(1/(tau*n))) #Update mu
tau<-rgamma(1,n/2,sum((x-mu)^2)/2) #Update tau
simmat[i,1]<-mu #Save results
simmat[i,2]<-tau
}
}
par(mfrow=c(2,2))
hist(simmat[,1])
hist(simmat[,2])
plot(simmat[,1], type="l")
plot(simmat[,2], type="l")
##### Metropolis algorithm
## example with uniform distribution
metropolis1<-function(n, a, b)
{
sim<-numeric(n)
sim[1]<-x<-0
for (i in 2:n) {
prob=runif(1,a,b)
cand=x+prob
ratio=min(1,dnorm(cand)/dnorm(x))
u=runif(1)
if (u < ratio)
x=cand
sim[i]=x
}
sim
}
par(mfrow=c(1,1))
res<-metropolis1(1000,-1, 1 )
plot(res, type="l", col="dark red")
####### Metropolis-Hastings
## same idea as above, but working on the log scale (log-likelihoods) for numerical stability
metropolis2<-function(n, a, b) {
  sim<-numeric(n)
  sim[1]<-x<-0
  loglikelihood.old<-dnorm(x, log=TRUE)
  for (i in 2:n) {
    cand<-x+ runif(1,a,b)
    loglikelihood.new<-dnorm(cand, log=TRUE)
    ratio<-min(1, exp(loglikelihood.new - loglikelihood.old))
    u<-runif(1)
    if (u < ratio){
      x<-cand
      loglikelihood.old <- loglikelihood.new  # keep the accepted candidate's log-likelihood
    }
    sim[i]<-x
  }
  sim
}
res<-metropolis2(1000,-1, 1 )
plot(res, type="l", col="dark red")
#### Generalised Extreme Value distribution example
# log-likelihood function
# theta is 3-value vector
loglikelihood<-function(theta, mdata){
mu<-theta[1]
shape<-theta[2]
sigma<-theta[3]
n<-length(mdata)
zz<-1+shape*(mdata-mu)/sigma
# if any zz <= 0 the parameters are outside the GEV support,
# so return a very large negative log-likelihood (we cannot take the log of a non-positive value)
if(any(zz<=0)) {
return(-1e16)
}
llikelihood<- -log(sigma)-(1+1/shape)*log(zz)-(zz)^(-1/shape)
sum(llikelihood)
}
# mcmc function
# std is for accept/reject values
# if STD is TOO SMALL then we'll accept everything
# we want 40-60% accept rate
mcmcevd<- function(nn, init,std, mdata, burn=0) {
theta<-init
simmat<- matrix(NA,nn+1, ncol=length(theta))
colnames(simmat) <- c("loc","shape","scale")
simmat[1,]<-init
acc.counts<-numeric(3)
llikelihood.old<-loglikelihood(theta, mdata)
for(i in 1:nn){
logposterior.old<-llikelihood.old+dnorm(theta[1], sd=100, log=TRUE) #Prior and logLikelihood
# Proposal value for location
loc.old<-theta[1]
loc.new<-rnorm(1,loc.old,std[1]) #Simulating a value from a normal distribution
theta[1]<-loc.new
llikelihood.new<-loglikelihood(theta, mdata)
logposterior.new<-llikelihood.new+dnorm(theta[1], sd=100, log=TRUE) #Prior and logLikelihood
##DECISION WHETHER REJECT/ACCEPT
alpha<- min(exp(logposterior.new-logposterior.old),1) # remove log
u<-runif(1,0,1) # generate new value
if(alpha > u) # decision
{
simmat[(i+1),1]<-loc.new
acc.counts[1]<-acc.counts[1]+1
llikelihood.old<-llikelihood.new
}
else
simmat[(i+1),1]<-loc.old
theta[1]<-simmat[(i+1),1]
logposterior.old<-llikelihood.old+dnorm(theta[2], sd=100, log=TRUE) #Prior and logLikelihood
# Proposal value for shape
shape.old<-theta[2]
shape.new<-rnorm(1,shape.old,std[2]) #Simulating a value from a normal distribution
theta[2]<-shape.new
llikelihood.new<-loglikelihood(theta, mdata)
logposterior.new<-llikelihood.new+dnorm(theta[2], sd=100, log=TRUE) #Prior and logLikelihood
##DECISION WHETHER REJECT/ACCEPT
alpha<- min(exp(logposterior.new-logposterior.old),1)
u<-runif(1,0,1)
if(alpha > u)
{
simmat[(i+1),2]<-shape.new
acc.counts[2]<-acc.counts[2]+1
llikelihood.old<-llikelihood.new
}
else
simmat[(i+1),2]<-shape.old
theta[2]<-simmat[(i+1),2]
## Third parameter: scale
# the scale cannot be negative, so the proposal is drawn from a
# LOG-NORMAL distribution. This proposal is not symmetric, so the
# accept/reject ratio needs a Hastings (Jacobian) correction term.
# refresh the "old" posterior with the current likelihood, since the
# shape update above may have changed llikelihood.old
logposterior.old<-llikelihood.old+dnorm(theta[2], sd=100, log=TRUE)
sigma.old<-theta[3]
sigma.new<-exp(rnorm(1,log(sigma.old),std[3])) #Simulating a value from a lognormal distribution
theta[3]<-sigma.new
llikelihood.new<-loglikelihood(theta, mdata)
logposterior.new<-llikelihood.new+dnorm(theta[2], sd=100, log=TRUE) # same prior term as above (cancels in the ratio; flat prior on the scale)
##DECISION WHETHER REJECT/ACCEPT
nonsymm<-log(sigma.new/sigma.old) # Hastings correction for the lognormal proposal
alpha<- min(exp(logposterior.new-logposterior.old+nonsymm),1)
u<-runif(1,0,1)
if(alpha > u)
{
simmat[(i+1),3]<-sigma.new
acc.counts[3]<-acc.counts[3]+1
llikelihood.old<-llikelihood.new
}
else
simmat[(i+1),3]<-sigma.old
theta[3]<-simmat[(i+1),3]
} ##End of simulation loop
simmat<-simmat[(burn+1):(nn+1),] # drop the burn-in (simmat has nn+1 rows, including the initial value)
acc.rates<-c(acc.counts)/nn
list(simmat=simmat, acc.rates=acc.rates)
} #End of Function
mdata<-rnorm(25, 25, 2) # this is our observed dataset
theta<-c(2,0.5,5) # starting value
loglikelihood(theta, mdata)
init<-c(5,3,1)
std<-c(0.7, 0.6, 0.3)
xx<-mcmcevd(nn=10000, init,std, mdata=mdata, burn=1000)
temp<-xx$simmat
xx$acc.rates
par(mfrow=c(3,1))
plot(temp[,1], type="l")
plot(temp[,2], type="l")
plot(temp[,3], type="l")
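# Posterior summaries for the GEV parameters (a sketch using the post-burn-in draws)
apply(temp, 2, mean)
apply(temp, 2, quantile, probs=c(0.025, 0.5, 0.975))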
###### GLM with Bayes
library("MASS")
install.packages("mvtnorm")
install.packages("mfp")
library(mvtnorm) # multivariate normal distributions
library(mfp) #
# import dataset from folder
attach(bodyfat)
summary(bodyfat)
x1<-Abdomen
x2<-Age
x3<-Height
x4<-Neck
y<-Siri
#=======USING MLE============================
RegModel.1 <- lm(y~x1+x2+x3+x4)
summary(RegModel.1)
#==============USING BAYES===============
X<-cbind(1, x1, x2, x3, x4)
dim(X) # 252 rows, 5 columns
Vbeta<-solve(t(X)%*%X) # solve = find inverse
Vbeta # diagonals are variances, off-diagonals are covariances
betahat<-solve(t(X)%*%X)%*%t(X)%*%y
betahat
df<-nrow(X)-ncol(X) # degrees of freedom
df
ssquared<-(sum(t(y-X%*%betahat)%*%(y-X%*%betahat)))/df
#simulate from the posterior
n.sims<-10000
sigma<-sqrt(1/rgamma(n.sims, df/2, df*ssquared/2))
# sigma
bbeta<-matrix(betahat, n.sims, ncol(X), byrow=T)+sigma*rmvnorm(n.sims, rep(0, ncol(X)), Vbeta)
apply(bbeta, 2, quantile, probs=c(0.025, 0.5, 0.975))
# can now plot trace, density, etc
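# e.g. (a sketch) draws and density for the coefficient on x1 (column 2 of bbeta)
par(mfrow=c(1,2))
plot(bbeta[,2], type="l", main="draws: x1 coefficient")
plot(density(bbeta[,2]), main="posterior density: x1 coefficient")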
#==================USING LEARNBAYES PACKAGE====
install.packages("LearnBayes")
library(LearnBayes)
bmod <- blinreg(y,X, 100000) # noninformative prior by default; 100000 posterior draws
str(bmod) # contains two components: beta (matrix of draws) and sigma (vector of draws)
summary(bmod$beta) # mean & median estimates: very close to the results above
betas<-bmod$beta
par(mfrow=c(1,1))
plot(betas[,1], type="l")
plot(density(betas[,1]), type="l")
#==================USING MCMCpack PACKAGE====
# The model formula is the same, but MCMCregress adds options such as
# starting values, burn-in length and priors for incorporating more information
install.packages("MCMCpack")
library(MCMCpack)
model.mcmc <- MCMCregress(y~x1+x2+x3+x4,
data = bodyfat,
burnin = 2000,
mcmc = 10000,
thin = 1,
seed = NA,
beta.start = NA)
str(model.mcmc)
# mcmc = 10000 is the number of iterations kept after burn-in; the result is a coda 'mcmc' object
summary(model.mcmc)
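# model.mcmc is a coda 'mcmc' object, so the usual coda diagnostics apply (a sketch)
plot(model.mcmc) # trace and density plots for every parameter
library(coda)
effectiveSize(model.mcmc) # effective sample sizes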
###### logistic regression with Bayes
## Basic MLE approach
x<-c(0,1,2)
ca<-c(10,12,85)
cl<-c(10,65,165)
logitmod<-glm(cbind(ca,cl)~x,family="binomial")
str(logitmod)
thetahat<-logitmod$coeff[2]
exp(thetahat)
# exp(thetahat) is about 1.18: the odds of disease increase by roughly 18% with each extra unit of x
var<-vcov(logitmod)[2,2]
# 95% confidence:
exp(thetahat-1.96*sqrt(var))
exp(thetahat+1.96*sqrt(var))
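# equivalent Wald interval via confint.default() (a sketch)
exp(confint.default(logitmod)["x",])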
#### logistic via Bayesian - use INLA package
## INLA:
install.packages(c("fields","numDeriv","pixmap", "mvtnorm","R.utils"))
source("http://www.math.ntnu.no/inla/givemeINLA.R")
library(INLA)
inla.upgrade(testing=TRUE)
# First, with default (vague) priors
# 1. create a list of values, convert to a data frame
dat<-list(x=x,ca=ca,cl=cl)
dat<-as.data.frame(dat)
mod.inla<-inla(ca~x, family="binomial", data=dat, Ntrials=ca+cl)
str(mod.inla)
# so many parameters...
# mainly interested in marginal distributions for parameters
summary(mod.inla)
vals1 = mod.inla$marginals.fixed
intercept = vals1$'(Intercept)'
x=vals1$x
par(mfrow=c(2,1))
plot(intercept, type="l", main="Intercept")
plot(x, type="l", main='x')
# don't really care about intercept
# get summary statistics for the marginal of x (see the sketch below)
# posterior median approx 0.16 on the log-odds scale, i.e. exp(0.16) ~ 1.17:
# roughly a 17% increase in the odds per extra unit of x
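# INLA helper functions for marginals (a sketch)
inla.zmarginal(vals1$x) # summary statistics of the marginal for x
exp(inla.qmarginal(0.5, vals1$x)) # posterior median odds ratio per unit of x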
#=====Now with informative priors=====
# prior belief: the odds ratio for x is at most 1.5 with 97.5% probability,
# so on the log-odds scale 1.96*sd = log(1.5), i.e. prior variance W = (log(1.5)/1.96)^2
Upper975<-1.5
W<-(log(Upper975)/1.96)^2
mod.inla1<-inla(ca~x,family='binomial',data=dat,Ntrials=ca+cl, control.fixed=list(mean.intercept=c(0),prec.intercept=c(.1), mean=c(0),prec=c(1/W)))
summary(mod.inla1)
#=======MULTIPLE LINEAR REG======
Res.inla<-inla(y~x1+x2+x3+x4, data=bodyfat, family="gaussian")
summary ( Res.inla)
# estimated values = marginal distributions
vals<-Res.inla$marginals.fixed
ex1 = vals$x1
ex2 = vals$x2
ex3 = vals$x3
ex4 = vals$x4
# then plot them with plot, type="l"
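# e.g. (a sketch) all four slope marginals on one page
par(mfrow=c(2,2))
plot(ex1, type="l", main="x1 (Abdomen)")
plot(ex2, type="l", main="x2 (Age)")
plot(ex3, type="l", main="x3 (Height)")
plot(ex4, type="l", main="x4 (Neck)")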
################ ONE-WAY ANOVA
sigma.mu<-0.3
sigma.e<-0.2
sample<-10
trt<-5
beta0<-0.5
mu<-rnorm(sample, mean=0, sd=sigma.mu)
e<-rnorm(sample*trt, mean=0, sd=sigma.e)
yvec<-beta0 + rep (mu , each=trt) + e
simdata<-data.frame(y=yvec, trt=rep(1:sample, each=trt))
# basic (classical) ANOVA - treat trt as a factor, not a numeric covariate
fit1 <- aov(y ~ factor(trt), data=simdata)
summary(fit1)
#
# Bayesian
fit2<-inla(y~f(trt, model="iid"), data=simdata)
summary(fit2)
##########################################################
### PART 5: BAYES FACTOR & TESTING ####################
###########################################################
install.packages('LearnBayes')
# Using Bayes factor to compare models with different
# assumptions of sampling distribution
# See slide 32: "Models for Fire Calls"
fire.counts <- c(75, 88, 84, 99, 79, 68, 86, 109, 73, 85, 101, 85,
75, 81, 64, 77, 83, 83, 88, 83, 78, 83, 78, 80,
82, 90, 74, 72, 69, 72, 76, 76, 104, 86, 92, 88)
# Step 1: use histograms to compare different posterior sampling distributions
hist(fire.counts, probability=TRUE, ylim=c(0, .08))
x <- 60:110
lines(x, dpois(x, lambda=mean(fire.counts)), col="red")
lines(x, dnorm(x, mean=mean(fire.counts), sd=12), col="blue")
lines(x, dnorm(x, mean=mean(fire.counts), sd=6), col="green")
legend("topright", legend=c("M1: Poisson(theta)", "M2: N(theta, 12)", "M3: N(theta, 6)"), col=c("red", "blue", "green"), lty=1)
# which model is best?
# We need the log marginal (predictive) likelihood for each model;
# use the laplace() function from LearnBayes to approximate the integration
#==========Model M1=========
model.1 <- function(theta, y){
sum(log(dpois(y, theta))) + dgamma(theta, shape=280, rate=4, log=TRUE)
}
library(LearnBayes)
log.pred.1 <- laplace(model.1, 80, fire.counts)$int
log.pred.1
#==========Model M2=========
model.2 <- function(theta, y){
sum(log(dnorm(y, theta, 12))) + dgamma(theta, shape=280, rate=4, log=TRUE)
}
log.pred.2 <- laplace(model.2, 80, fire.counts)$int
#==========Model M3=========
model.3 <- function(theta, y){
sum(log(dnorm(y, theta, 6))) + dgamma(theta, shape=280, rate=4, log=TRUE)
}
log.pred.3 <- laplace(model.3, 80, fire.counts)$int
## Calculate Bayes factors: subtract on the log scale, then exponentiate
exp(log.pred.1-log.pred.2)
# BF(1 vs 2) > 1 favours the Poisson model over N(theta, 12)
exp(log.pred.2-log.pred.3)
# BF(2 vs 3) > 1 would favour N(theta, 12) over N(theta, 6)
############ Linear regression comparison ############
install.packages('BayesFactor')
library(BayesFactor)
data(attitude)
head(attitude)
lmObj = lm(rating ~ ., data = attitude)
summary(lmObj)
# only complaints is significant variable
# RegressionBF calculates bayes factors for many models at once
bf = regressionBF(rating ~ ., data = attitude)
length(bf) # 63 models = all possible predictor combinations: 2^6 - 1 = 63
head(bf, n=6) # displays the 6 models with the highest Bayes factors
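# Compare every model against the single best one (a sketch; the BayesFactor
# package supports this arithmetic on BFBayesFactor objects)
bfBest<-bf / max(bf)
head(bfBest)
plot(head(bf))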
################ HYPOTHESIS TESTING ##########################
### One sample t-test ###
# classical approach
# first, import dataset with load workspace
t.test(x=BodyTemperature$Temperature-98.4)
# Bayes method
onesam.test1 = ttestBF(x=BodyTemperature$Temperature-98.4)
onesam.test1
### Independent samples t-test
# is there a difference in IQ between males and females?
# import data
classical.test = t.test(iqf ~ sex, data = lead_environ,var.equal = TRUE)
# p = 0.5 --> fail to reject the null: no evidence of an IQ difference
# bayes factor
bf = ttestBF(formula = iqf ~ sex, data = lead_environ,var.equal = TRUE )
bf
# Bayes factor = 0.26 for the alternative against the null (mu1 - mu2 = 0)
# BF < 1, so the data favour the null: we do not accept the alternative hypothesis
### Paired sample t-test
# import data
PrisonStressSport = sp
attach(PrisonStressSport)
ccpaired.test = t.test(x=PSSbefore, y=PSSafter, paired=TRUE, data=PrisonStressSport)
# p=0.0178 = reject Null
ttestBF(x=PSSbefore, y=PSSafter, paired=TRUE)
# Bayes factor = 3.48: moderate evidence against the null
##########################################################
### PART 6: Matrix, conjugates, markov ####################
###########################################################
# Matrix example: build a design matrix and compute fitted values
dataset<-cbind(c(1991:2015), rnorm(25, 40, 2))
t<-dataset[,1]
#design matrix
designmat<-cbind(1, t)
#let's assume a0 = 0.2 (intercept) and a1 = 0.8 (slope)
a0<-0.2; a1<-0.8
coeff<-c(a0, a1)
pred<-designmat%*%coeff # fitted values = design matrix %*% coefficient vector
pred
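# Sketch: add noise to these fitted values and recover the coefficients with
# the normal equations (the same matrix algebra used in the Bayes GLM section above)
yobs<-pred + rnorm(length(pred), sd=2)
solve(t(designmat)%*%designmat)%*%t(designmat)%*%yobs # should be close to c(0.2, 0.8)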
################################################
### IMPUTING MISSING VALUES
################################################
install.packages("mice", dependencies=TRUE)
library(mice)
## Create a dataset with missing values
# 1. create a dataset
set.seed(100)
x1.r<-runif(200)
x2.r<-runif(200)
y.r<-runif(200)
# 2. we randomly introduce missing values
miss.x1<-rbinom(200, 1, prob=0.9)
miss.x2<-rbinom(200, 1, prob=0.9)
miss.y<-rbinom(200, 1, prob=0.9)
x1<-ifelse(miss.x1==1, x1.r, NA)
x2<-ifelse(miss.x2==1, x2.r, NA)
y<-ifelse(miss.y==1, y.r, NA)
datmat<-cbind(x1, x2, y)
# use mice to create 5 different datasets with filled missings
mice.dat<-mice(datmat, m=5, maxit=20)
#bwplot(mice.dat)
# fit regression line to 5 mice-filled datasets
fit<-with(mice.dat, lm(y~x1+x2))
# compare our model with what we'd get if we just used original dataset
summary(pool(fit))
summary(lm(y~x1+x2, data=as.data.frame(datmat)))
# fitting a model to the 5 imputed datasets and pooling the results
# gives us more information than fitting
# to the original dataset with missing values
#
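# Optional imputation diagnostics (a sketch; mice provides lattice-based methods)
densityplot(mice.dat) # compare distributions of imputed and observed values
stripplot(mice.dat) # imputed values for each variable and imputation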
##########################
# example 2: make 30% missing
set.seed(100)
x1.r<-runif(200)
x2.r<-runif(200)
y.r<-runif(200)
#we randomly generate missing values in the dataset
miss.x1<-rbinom(200, 1, prob=0.7)
miss.x2<-rbinom(200, 1, prob=0.7)
miss.y<-rbinom(200, 1, prob=0.7)
x1<-ifelse(miss.x1==1, x1.r, NA)
x2<-ifelse(miss.x2==1, x2.r, NA)
y<-ifelse(miss.y==1, y.r, NA)
datmat<-cbind(x1, x2, y)
mice.dat<-mice(datmat, m=5, maxit=20)
# Compare model with imputed data to that without
fit<-with(mice.dat, lm(y~x1+x2))
summary(pool(fit))
summary(lm(y~x1+x2, data=as.data.frame(datmat)))
###########################
# Example 3: introduce interactions, make 10% missing
x1.r<-runif(200)
x2.r<-runif(200)
y.r<-x1.r+x2.r+x1.r*x2.r+rnorm(200, sd=1)
#we randomly generate missing values in the dataset
miss.x1<-rbinom(200, 1, prob=0.9)
miss.x2<-rbinom(200, 1, prob=0.9)
miss.y<-rbinom(200, 1, prob=0.9)
x1<-ifelse(miss.x1==1, x1.r, NA)
x2<-ifelse(miss.x2==1, x2.r, NA)
y<-ifelse(miss.y==1, y.r, NA)
x1x2<-x1.r*x2.r
datmat<-cbind(x1, x2, x1x2, y)
# impute values to 5 datasets
mice.dat<-mice(datmat, m=5, maxit=20)
# fit model to imputed data
fit<-with(mice.dat, lm(y~x1+x2+I(x1*x2)))
summary(pool(fit))
# compare to model fit to dataset with missings
summary(lm(y~x1+x2+I(x1*x2), data=as.data.frame(datmat)))
# ... the complete-case fit has larger standard errors
#####################
# example 4: count data (a Poisson outcome)
# impute the missing values with mice, then fit a Poisson GLM to each imputed dataset
# Create a dataset for explanation
set.seed(100)
b0 <- 1
b1 <- .75
b2 <- -.25
b3 <- .5
N <- 200
x1 <- rnorm(N)
x2 <- rnorm(N)
x3 <- rnorm(N)
lam <- exp( b0+b1*x1+b2*x2+b3*x3)
y <- rpois( N, lam )
POIS <- data.frame( y, x1, x2, x3 )
# We introduce MAR missingness in y using the following function:
generatemissingy<- function( data, pos = 1, Z = 2, pmis = .5, strength = c( .5, .5 ) ) {
total <- round( pmis * nrow(data) )
sm <- which( data[,Z] < mean( data[,Z] ) )
gr <- which( data[,Z] > mean( data[,Z] ) )
sel.sm <- sample( sm, round( strength[1] * total ) )
sel.gr <- sample( gr, round( strength[2] * total ) )
sel <- c( sel.sm, sel.gr )
data[sel,pos] <- NA
return(data)
}
mdat <- generatemissingy( POIS, pmis = .2, strength = c( .2, .8) )
library(mice)
imp <- mice( mdat, seed = 100)
res <- with(imp, glm( y ~ x1 + x2 + x3, family = "poisson"))
print( pool.res <- summary( pool( res ) ) )
# compare with a fit to the data that still contain the missing y values
res<-glm( y ~ x1 + x2 + x3, family = "poisson", data = mdat)
summary(res)
# the standard errors are higher when we don't impute the missing values
## also see "mi" package - allows specifying prior, etc
##############################################################################
# RevoScaleR RxTeradata Getting Started Guide Examples
##############################################################################
##############################################################################
# General setup/cleanup: Run this code before proceeding
##############################################################################
# To follow the examples in this guide, you will need the following
# comma-delimited text data files loaded into your Teradata database:
# ccFraud.csv as RevoTestDB.ccFraud10
# ccFraudScore.csv as RevoTestDB.ccFraudScore10
# These data files are available from the Revolution FTP site.
# You can use the Teradata 'fastload' utility to load the data sets into your
# database.
# Modify and uncomment the following code for your connection to Teradata,
# and other information about your Teradata setup:
tdConnString <- "DRIVER=Teradata;DBCNAME=10.4.208.180;DATABASE=RevoTestDB;UID=SysDba;PWD=SysDba;"
tdShareDir = paste("c:\\users\\", Sys.getenv("USERNAME"), "\\tdShare", sep="")
tdRemoteShareDir = "/tmp/revoJobs"
tdRevoPath = "/usr/lib64/Revo-7.3/R-3.1.3/lib64/R"
# Make sure that the local share dir exists:
if (!file.exists(tdShareDir)) dir.create(tdShareDir, recursive = TRUE)
# If your DATABASE is not named RevoTestDB, you will need
# to modify the SQL queries in the following code to your
# DATABASE name
##########################################################
# Using a Teradata Data Source and Compute Context
##########################################################
# Creating an RxTeradata Data Source
tdQuery <- "SELECT * FROM ccFraud10"
teradataDS <- RxTeradata(connectionString = tdConnString,
sqlQuery = tdQuery, rowsPerRead = 50000)
# NOTE: this query is run every time it is called
# Extracting basic information about your data
rxGetVarInfo(data = teradataDS)
# Specifying column information in your data source
stateAbb <- c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC",
"DE", "FL", "GA", "HI","IA", "ID", "IL", "IN", "KS", "KY", "LA",
"MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NB", "NC", "ND",
"NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI","SC",
"SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY")
ccColInfo <- list(
gender = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Male", "Female")),
cardholder = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Principal", "Secondary")),
state = list(
type = "factor",
levels = as.character(1:51),
newLevels = stateAbb)
)
teradataDS <- RxTeradata(connectionString = tdConnString,
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 50000)
rxGetVarInfo(data = teradataDS)
tdWait <- TRUE
tdConsoleOutput <- FALSE
tdCompute <- RxInTeradata(
connectionString = tdConnString,
shareDir = tdShareDir,
remoteShareDir = tdRemoteShareDir,
revoPath = tdRevoPath,
wait = tdWait,
consoleOutput = tdConsoleOutput)
rxGetNodeInfo(tdCompute)
tdComputeTrace <- RxInTeradata(
connectionString = tdConnString,
shareDir = tdShareDir,
remoteShareDir = tdRemoteShareDir,
revoPath = tdRevoPath,
wait = tdWait,
consoleOutput = tdConsoleOutput,
traceEnabled = TRUE,
traceLevel = 7)
##########################################################
# High-Performance In-Database Analytics in Teradata
##########################################################
# Compute summary statistics in Teradata
# Set the compute context to compute in Teradata
rxSetComputeContext(tdCompute)
rxSummary(formula = ~gender + balance + numTrans + numIntlTrans +
creditLine, data = teradataDS)
# Set the compute context to compute locally
rxSetComputeContext ("local")
# Refining the RxTeradata data source
ccColInfo <- list(
gender = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Male", "Female")),
cardholder = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Principal", "Secondary")),
state = list(
type = "factor",
levels = as.character(1:51),
newLevels = stateAbb),
balance = list(low = 0, high = 41485),
numTrans = list(low = 0, high = 100),
numIntlTrans = list(low = 0, high = 60),
creditLine = list(low = 1, high = 75)
)
teradataDS <- RxTeradata(connectionString = tdConnString,
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 50000)
# Visualizing your data using rxHistogram and rxCube
rxSetComputeContext(tdCompute)
rxHistogram(~creditLine|gender, data = teradataDS,
histType = "Percent")
cube1 <- rxCube(fraudRisk~F(numTrans):F(numIntlTrans),
data = teradataDS)
cubePlot <- rxResultsDF(cube1)
library(lattice) # levelplot() is provided by the lattice package
levelplot(fraudRisk~numTrans*numIntlTrans, data = cubePlot)
# Analyzing your data with rxLinMod
linModObj <- rxLinMod(balance ~ gender + creditLine,
data = teradataDS)
summary(linModObj)
# Analyzing your data with rxLogit
logitObj <- rxLogit(fraudRisk ~ state + gender + cardholder + balance +
numTrans + numIntlTrans + creditLine, data = teradataDS,
dropFirst = TRUE)
summary(logitObj)
# Scoring a Data Set
tdQuery <- "SELECT * FROM ccFraudScore10"
teradataScoreDS <- RxTeradata(connectionString = tdConnString,
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 50000)
teradataOutDS <- RxTeradata(table = "ccScoreOutput",
connectionString = tdConnString, rowsPerRead = 50000 )
rxSetComputeContext(tdCompute)
if (rxTeradataTableExists("ccScoreOutput"))
rxTeradataDropTable("ccScoreOutput")
rxPredict(modelObject = logitObj,
data = teradataScoreDS,
outData = teradataOutDS,
predVarNames = "ccFraudLogitScore",
type = "link",
writeModelVars = TRUE,
overwrite = TRUE)
if (rxTeradataTableExists("ccScoreOutput"))
rxTeradataDropTable("ccScoreOutput")
rxPredict(modelObject = logitObj,
data = teradataScoreDS,
outData = teradataOutDS,
predVarNames = "ccFraudLogitScore",
type = "link",
writeModelVars = TRUE,
extraVarsToWrite = "custID",
overwrite = TRUE)
tdMinMax <- RxOdbcData(sqlQuery = paste(
"SELECT MIN(ccFraudLogitScore),",
"MAX(ccFraudLogitScore) FROM ccScoreOutput"),
connectionString = tdConnString)
minMaxVals <- rxImport(tdMinMax)
minMaxVals <- as.vector(unlist(minMaxVals))
teradataScoreDS <- RxTeradata(sqlQuery =
"Select ccFraudLogitScore FROM ccScoreOutput",
connectionString = tdConnString, rowsPerRead = 50000,
colInfo = list(ccFraudLogitScore = list(
low = floor(minMaxVals[1]),
high = ceiling(minMaxVals[2]))))
rxSetComputeContext(tdCompute)
rxHistogram(~ccFraudLogitScore, data = teradataScoreDS)
##########################################################
# Using rxDataStep and rxImport
##########################################################
teradataScoreDS <- RxTeradata(
sqlQuery = "Select * FROM ccScoreOutput",
connectionString = tdConnString, rowsPerRead = 50000 )
teradataOutDS2 <- RxTeradata(table = "ccScoreOutput2",
connectionString = tdConnString, rowsPerRead = 50000)
rxSetComputeContext(tdCompute)
if (rxTeradataTableExists("ccScoreOutput2"))
rxTeradataDropTable("ccScoreOutput2")
rxDataStep(inData = teradataScoreDS, outFile = teradataOutDS2,
transforms = list(ccFraudProb = inv.logit(ccFraudLogitScore)),
transformPackages = "boot", overwrite = TRUE)
rxGetVarInfo(teradataOutDS2)
# Using rxImport to Extract a Subsample
rxSetComputeContext("local")
teradataProbDS <- RxTeradata(
sqlQuery = paste(
"Select * FROM ccScoreOutput2",
"WHERE (ccFraudProb > .99)"),
connectionString = tdConnString)
highRisk <- rxImport(teradataProbDS)
orderedHighRisk <- highRisk[order(-highRisk$ccFraudProb),]
row.names(orderedHighRisk) <- NULL # Reset row numbers
head(orderedHighRisk)
# Using rxDataStep to Create a Teradata Table
rxSetComputeContext("local")
xdfAirDemo <- RxXdfData(file.path(rxGetOption("sampleDataDir"),
"AirlineDemoSmall.xdf"))
rxGetVarInfo(xdfAirDemo)
teradataAirDemo <- RxTeradata(table = "AirDemoSmallTest",
connectionString = tdConnString)
if (rxTeradataTableExists("AirDemoSmallTest",
connectionString = tdConnString))
rxTeradataDropTable("AirDemoSmallTest",
connectionString = tdConnString)
rxDataStep(inData = xdfAirDemo, outFile = teradataAirDemo,
transforms = list(
DayOfWeek = as.integer(DayOfWeek),
rowNum = .rxStartRow : (.rxStartRow + .rxNumRows - 1)
), overwrite = TRUE
)
rxSetComputeContext(tdCompute)
teradataAirDemo <- RxTeradata(sqlQuery =
"SELECT * FROM AirDemoSmallTest",
connectionString = tdConnString,
rowsPerRead = 50000,
colInfo = list(DayOfWeek = list(type = "factor",
levels = as.character(1:7))))
rxSummary(~., data = teradataAirDemo)
# Performing Your Own Chunking Analysis
ProcessChunk <- function( dataList)
{
# Convert the input list to a data frame and
# call the 'table' function to compute the
# contingency table
chunkTable <- table(as.data.frame(dataList))
# Convert table output to data frame with single row
varNames <- names(chunkTable)
varValues <- as.vector(chunkTable)
dim(varValues) <- c(1, length(varNames))
chunkDF <- as.data.frame(varValues)
names(chunkDF) <- varNames
# Return the data frame, which has a single row
return( chunkDF )
}
rxSetComputeContext( tdCompute )
tdQuery <-
"select DayOfWeek from AirDemoSmallTest"
inDataSource <- RxTeradata(sqlQuery = tdQuery,
rowsPerRead = 50000,
colInfo = list(DayOfWeek = list(type = "factor",
levels = as.character(1:7))))
iroDataSource = RxTeradata(table = "iroResults",
connectionString = tdConnString)
if (rxTeradataTableExists(table = "iroResults",
connectionString = tdConnString))
{
rxTeradataDropTable( table = "iroResults",
connectionString = tdConnString)
}
rxDataStep( inData = inDataSource, outFile = iroDataSource,
transformFunc = ProcessChunk,
overwrite = TRUE)
iroResults <- rxImport(iroDataSource)
finalResults <- colSums(iroResults)
finalResults
rxTeradataDropTable( table = "iroResults",
connectionString = tdConnString)
##########################################################
# Using rxDataStep for By-Group Analyses
##########################################################
# Creating a simulated data set
set.seed(10)
numObs <- 100000
testData <- data.frame(
SKU = as.integer(runif(n = numObs, min = 1, max = 11)),
INCOME = as.integer(runif(n = numObs, min = 10, max = 21)))
# SKU is also underlying coefficient
testData$SALES <- as.integer(testData$INCOME * testData$SKU +
25*rnorm(n = numObs))
# Uploading into a Teradata table
rxSetComputeContext("local")
if (rxTeradataTableExists("sku_sales_100k",
connectionString = tdConnString))
{
rxTeradataDropTable("sku_sales_100k",
connectionString = tdConnString)
}
teradataDS <- RxTeradata(table = "sku_sales_100k",
connectionString = tdConnString)
rxDataStep(inData = testData, outFile = teradataDS)
# The By-Group Transformation Function
EstimateModel <- function(dataList)
{
# Convert the input list to a data frame
chunkDF <- as.data.frame(dataList)
# Estimate a linear model on this chunk
# Don't include the model frame in the model object
model <- lm( SALES~INCOME, chunkDF, model=FALSE )
# Print statements can be useful for debugging
# print( model$coefficients )
# Remove unneeded parts of model object
model$residuals <- NULL
model$effects <- NULL
model$fitted.values <- NULL
model$assign <- NULL
model$qr[[1]] <- NULL
attr(model$terms, ".Environment") <- NULL
# Convert the model to a character string
modelString <-
rawToChar(serialize(model, connect=NULL, ascii=TRUE))
# Pad to a fixed length
modelString <- paste( modelString,
paste(rep("X", 2000 - nchar(modelString)), collapse = ""),
sep="")
# Create the entry for the results table
resultDF <- data.frame(
SKU = chunkDF[1,]$SKU, # Assumes SKU's all the same
COEF = model$coefficients[2], # Slope
MODEL = modelString )
return( resultDF )
}
# Specifying the input by-group data source
tdQuery <- "SELECT * FROM sku_sales_100k"
partitionKeyword <- "PARTITION-WITH-VIEW"
partitionClause <- paste( partitionKeyword, "BY sku_sales_100k.SKU" )
inDataSource <- RxTeradata(sqlQuery = tdQuery,
tableOpClause = partitionClause,
rowsPerRead = 100000)
# Setting up the results data source
resultsDataSource = RxTeradata(table = "models",
connectionString = tdConnString )
rxOptions(transformPackages = c("stats"))
# Make sure compute context is in-database
rxSetComputeContext(tdCompute)
# Run the by-group data step
rxDataStep( inData = inDataSource, outFile = resultsDataSource,
transformFunc = EstimateModel, reportProgress = 0,
overwrite = TRUE )
# Showing the estimated slopes
lmResults <- rxImport(resultsDataSource)
lmResults[order(lmResults$SKU),c("SKU", "COEF")]
# The Scoring transformation function
ScoreChunk <- function(dataList)
{
chunkDF <- as.data.frame(dataList)
# Extract the model string for the first observation
# and convert back to model object
# All observations in this chunk have the same SKU,
# so they all share the same model
modelString <- as.character(chunkDF[1,]$MODEL)
model <- unserialize( charToRaw( modelString ) )
resultDF <- data.frame(
SKU = chunkDF$SKU,
INCOME = chunkDF$INCOME,
# Use lm's predict method
PREDICTED_SALES = predict(model, chunkDF) )
return( resultDF )
}
# Setting up the input data source for predictions
predictQuery <- paste("SELECT sku_sales_100k.SKU,",
"sku_sales_100k.INCOME, models.MODEL",
"FROM sku_sales_100k JOIN models ON",
"sku_sales_100k.SKU = models.SKU")
inDataSource <- RxTeradata(sqlQuery = predictQuery,
connectionString = tdConnString,
tableOpClause = partitionClause, rowsPerRead = 10000)
# Setting up the output data set for predictions
scoresDataSource = RxTeradata(table = "scores",
connectionString = tdConnString)
# Perform scoring in-database
rxSetComputeContext(tdCompute)
rxDataStep( inData = inDataSource, outFile = scoresDataSource,
transformFunc = ScoreChunk, reportProgress = 0,
overwrite = TRUE )
# Extracting summary results
predSum <- rxImport( RxTeradata(
sqlQuery = paste("SELECT SKU, INCOME, MIN(PREDICTED_SALES),",
"MAX(PREDICTED_SALES), COUNT(PREDICTED_SALES)",
"FROM scores GROUP BY SKU, INCOME",
"ORDER BY SKU, INCOME"),
connectionString = tdConnString ) )
options(width = 120) # Set display width
predSum[1:15,]
# Clean-up
rxTeradataDropTable("models")
rxTeradataDropTable("scores")
rxSetComputeContext("local")
##########################################################
# Performing Simulations In-Database
##########################################################
playCraps <- function()
{
result <- NULL
point <- NULL
count <- 1
while (is.null(result))
{
roll <- sum(sample(6, 2, replace=TRUE))
if (is.null(point))
{
point <- roll
}
if (count == 1 && (roll == 7 || roll == 11))
{
result <- "Win"
}
else if (count == 1 &&
(roll == 2 || roll == 3 || roll == 12))
{
result <- "Loss"
}
else if (count > 1 && roll == 7 )
{
result <- "Loss"
}
else if (count > 1 && point == roll)
{
result <- "Win"
}
else
{
count <- count + 1
}
}
result
}
# Calling rxExec
teradataExec <- rxExec(playCraps, timesToRun=1000, RNGseed="auto")
length(teradataExec)
table(unlist(teradataExec))
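# Local sanity check (a sketch): the same game run with plain replicate(),
# no compute context involved; the win/loss proportions should be similar
table(replicate(1000, playCraps()))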
##########################################################
# Analyzing Your Data Alongside Teradata
##########################################################
# Set the compute context to compute locally
rxSetComputeContext ("local")
tdQuery <- "SELECT * FROM ccFraudScore10"
# Perform an rxSummary using a local compute context
teradataDS1 <- RxTeradata(connectionString = tdConnString,
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 500000)
rxSummary(formula = ~gender + balance + numTrans + numIntlTrans +
creditLine, data = teradataDS1)
statesToKeep <- sapply(c("CA", "OR", "WA"), grep, stateAbb)
statesToKeep
# Fast import of data from a Teradata database
importQuery <- paste("SELECT gender,cardholder,balance,state",
"FROM ccFraud10",
"WHERE (state = 5 OR state = 38 OR state = 48)")
importColInfo <- list(
gender = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Male", "Female")),
cardholder = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Principal", "Secondary")),
state = list(
type = "factor",
levels = as.character(statesToKeep),
newLevels = names(statesToKeep))
)
rxSetComputeContext("local")
teradataImportDS <- RxTeradata(connectionString = tdConnString,
sqlQuery = importQuery, colInfo = importColInfo)
localDS <- rxImport(inData = teradataImportDS,
outFile = "ccFraudSub.xdf",
overwrite = TRUE)
# Using the imported data
rxGetVarInfo(data = localDS)
rxSummary(~gender + cardholder + balance + state, data = localDS)