# R code library
##########################################
#### DATA MANIPULATION WITH DPLYR ########
#### from datacamp course ################
# All examples here: https://rstudio-pubs-static.s3.amazonaws.com/98285_aee86145ffc746e685b8792c7cf8a73d.html
##########################################
## SUMMARY: THE 5 VERBS ######
# select = subset columns
# mutate = new cols
# filter = subset rows
# arrange = reorder rows
# summarise
# use GROUP_BY with summarise or mutate to get group calculations
# pipe results with %>%
## Can use these with dataframes, datatables or databases ##
# Additional summary functions within dplyr
#first(x) - The first element of vector x.
#last(x) - The last element of vector x.
#nth(x, n) - The nth element of vector x.
#n() - The number of rows in the data.frame or group of observations that summarise() describes.
#n_distinct(x) - The number of unique values in vector x.
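# A quick end-to-end illustration of the verbs + group_by (my own sketch using the built-in
# mtcars data; requires dplyr, which is installed/loaded below):
mtcars %>%
  select(mpg, cyl, wt) %>%                  # subset columns
  mutate(wt_kg = wt * 453.6) %>%            # new column (wt is in 1000 lbs)
  filter(mpg > 20) %>%                      # subset rows
  arrange(desc(mpg)) %>%                    # reorder rows
  group_by(cyl) %>%                         # group...
  summarise(n = n(), avg_mpg = mean(mpg))   # ...and summarise per group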
#install.packages("BH", lib='c:\\notbackedup\\R\\') # dplyr requirement
library("BH", lib='c:\\notbackedup\\R\\')
install.packages("ggplot2", lib='c:\\notbackedup\\R\\')
library("ggplot2", lib='c:\\notbackedup\\R\\')
install.packages("ROCR", lib='c:\\notbackedup\\R\\')
library("ROCR", lib='c:\\notbackedup\\R\\')
install.packages("party", lib='c:\\notbackedup\\R\\')
library("party", lib='c:\\notbackedup\\R\\')
install.packages("dplyr", lib='c:\\notbackedup\\R\\')
library(dplyr, lib='c:\\notbackedup\\R\\')
install.packages("hflights", lib='\\notbackedup\\R\\')
library(hflights, lib='\\notbackedup\\R\\')
summary(hflights)
## TBL instead of DATA.FRAME = faster?
# tbl - more convenient for displaying data.frames
# convert with tbl_df()
hflights <- tbl_df(hflights)
hflights
#### cleanup dataset
## carrier name
lut <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental", "DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways", "WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier", "FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa")
# Use lut to translate the UniqueCarrier column of hflights
hflights$UniqueCarrier = lut[hflights$UniqueCarrier]
# Inspect the resulting raw values of your variables
glimpse(hflights)
## more codes
# Build the lookup table
lut <- c("A" = "carrier", "B" = "weather", "C" = "FFA" , "D" = "security", "E" = "not cancelled")
# Use the lookup table to create a vector of code labels. Assign the vector to the CancellationCode column of hflights
hflights$CancellationCode = lut[hflights$CancellationCode]
# Inspect the resulting raw values of your variables
glimpse(hflights)
#### THE 5 VERBS: SELECT & MUTATE ######
# select = remove columns
# mutate = new cols
# filter = removes rows
# arrange = reorder rows
# summarise
############### SELECT() ###############
## SELECT HELPER FUNCTIONS
#dplyr provides 6 helper functions, each of which only works when used inside select().
#starts_with("X"): every name that starts with "X",
#ends_with("X"): every name that ends with "X",
#contains("X"): every name that contains "X",
#matches("X"): every name that matches "X", which can be a regular expression,
#num_range("x", 1:5): the variables named x01, x02, x03, x04 and x05,
#one_of(x): every name that appears in x, which should be a character vector.
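# e.g. a few quick illustrations of the helpers (hflights loaded above):
select(hflights, starts_with("Taxi"))    # TaxiIn, TaxiOut
select(hflights, contains("Taxi"))       # same two columns
select(hflights, ends_with("Delay"))     # ArrDelay, DepDelay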
# select(df, Group, Sum)
hflights2 = select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
?select
## CHAPTER 2 Ex1
# hflights is pre-loaded as a tbl, together with the necessary libraries.
# Return a copy of hflights that contains the four columns related to delay.
hflights2 = select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
# print hflights, nothing has changed!
print(hflights)
# Return a copy of hflights containing the columns Origin up to Cancelled
hflights2 = select(hflights, Origin:Cancelled)
# Answer to last question: be concise!
#colnames(hflights)
hflights2 = select(hflights, Year:DayOfWeek,ArrDelay:Diverted )
## Chapter 2 Ex2
# As usual, hflights is pre-loaded as a tbl, together with the necessary libraries.
# Return a tbl containing just ArrDelay and DepDelay
hflights2 = select(hflights, ArrDelay, DepDelay)
# Return a tbl as described in the second exercise, using both helper functions and variable names
hflights2 = select(hflights, UniqueCarrier:TailNum, Cancelled:CancellationCode)
# Return a tbl as described in the third exercise, using only helper functions.
hflights2 = select(hflights, contains('Time'), contains('Delay'))
## Chap 2 ex3
# both hflights and dplyr are available
# Exercise 1
ex1r <- hflights[c("TaxiIn","TaxiOut","Distance")]
ex1d <- select(hflights,TaxiIn, TaxiOut,Distance)
# Exercise 2
ex2r <- hflights[c("Year","Month","DayOfWeek","DepTime","ArrTime")]
ex2d <- select(hflights, Year:ArrTime, -DayofMonth)
# Exercise 3
ex3r <- hflights[c("TailNum","TaxiIn","TaxiOut")]
ex3d <- select(hflights,TailNum, TaxiIn,TaxiOut)
colnames(hflights)
#### MUTATE()
# mutate(tbl, z = x - y, c = a - b, ...)
colnames(hflights)
# Chapter 2 ex 4
# hflights and dplyr are loaded and ready to serve you.
# Add the new variable ActualGroundTime to a copy of hflights and save the result as g1.
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)
# Add the new variable GroundTime to a copy of g1 and save the result as g2.
g2 <- mutate(g1, GroundTime = TaxiIn + TaxiOut)
# Add the new variable AverageSpeed to a copy of g2 and save the result as g3.
g3 <- mutate(g2, AverageSpeed = Distance / AirTime * 60)
# Chapter 2 ex 5
# hflights and dplyr are ready, are you?
# Add a second variable loss_percent to the dataset and save the result to m1.
m1 <- mutate(hflights, loss = ArrDelay - DepDelay, loss_percent = (ArrDelay - DepDelay) / DepDelay * 100)
# Remove the redundancy from your previous exercise and reuse loss to define the loss_percent variable.
# Assign the result to m2
m2 <- mutate(m1, loss = ArrDelay - DepDelay, loss_percent = loss / DepDelay * 100)
# Add the three variables as described in the third exercise and save the result to m3
m3 <- mutate(hflights, TotalTaxi = TaxiIn + TaxiOut, ActualGroundTime = ActualElapsedTime - AirTime, Diff = TotalTaxi - ActualGroundTime)
#### Section 5: FILTER rows #####
# form: filter(tablename, logical tests)
# use ?comparison to see operators
install.packages("dplyr")
library(dplyr)
# hflights is at your disposal as a tbl, with clean carrier names
install.packages("hflights")
library(hflights)
colnames(hflights)
# hflights is at your disposal as a tbl, with clean carrier names
colnames(hflights)
head(hflights)
# All flights that traveled 3000 miles or more
filter(hflights, Distance >= 3000)
# All flights flown by one of JetBlue, Southwest, or Delta
filter(hflights, UniqueCarrier %in% c('JetBlue', 'Southwest', 'Delta'))
# All flights where taxiing took longer than flying
filter(hflights, (TaxiIn+TaxiOut)>AirTime)
## Combine with booleans
# & (and), | (or), and ! (not)
# for NOT: filter(df, !is.na(x))
#filter(df, a > b & c > d)
#filter(df, a > b, c > d)
# hflights is at your service as a tbl!
# All flights that departed before 5am or arrived after 10pm
filter(hflights, DepTime< 500 | ArrTime>2200)
# All flights that departed late but arrived ahead of schedule
filter(hflights, DepDelay>0 & ArrDelay<0)
# All cancelled weekend flights
filter(hflights, Cancelled==1 & DayOfWeek %in% c(6,7))
# All flights that were cancelled after being delayed
filter(hflights, DepDelay>0 & Cancelled==1)
# hflights is already available in the workspace
# Select the flights that had JFK as their destination: c1
c1 = filter(hflights, Dest == 'JFK')
# Combine the Year, Month and DayofMonth variables to create a Date column: c2
c2 = mutate(c1, Date = paste(Year,Month,DayofMonth,sep="-"))
# Print out a selection of columns of c2
select(c2, Date, DepTime, ArrTime, TailNum)
filter(hflights, Distance>1000 & (TaxiIn+TaxiOut)<15 & DayOfWeek %in% c(6,7))
##### Section 6: ARRANGE = reorder dataframe rows
### arrange(tbl, columns) - by default in ascending
## arrange(tbl, col1, col2)
# for descending, wrap var in desc(): arrange(tbl, desc(col1))
# dplyr and the hflights tbl are available
# Definition of dtc
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay))
# Arrange dtc by departure delays
arrange(dtc, DepDelay)
# Arrange dtc so that cancellation reasons are grouped
arrange(dtc, CancellationCode)
# Arrange dtc according to carrier and departure delays
arrange(dtc, UniqueCarrier, DepDelay)
# dplyr and the hflights tbl are available
# Arrange according to carrier and decreasing departure delays
arrange(hflights, UniqueCarrier, desc(DepDelay))
# Arrange flights by total delay (normal order).
arrange(hflights, (ArrDelay+DepDelay))
# Keep flights to DFW that departed before 8am and arrange according to decreasing AirTime
arrange(filter(hflights, Dest == 'DFW' & DepTime < 800), desc(AirTime))
### SECTION 7: SUMMARISE
# summarise(tbl, newcol = expression(col), ...)
# sum, mean, var
# functions must take a vector and return a single number
# min(x) - minimum value of vector x.
# max(x) - maximum value of vector x.
# mean(x) - mean value of vector x.
# median(x) - median value of vector x.
# quantile(x, p) - pth quantile of vector x.
# sd(x) - standard deviation of vector x.
# var(x) - variance of vector x.
# IQR(x) - Inter Quartile Range (IQR) of vector x.
# diff(range(x)) - total range of vector x.
# hflights and dplyr are loaded in the workspace
# Print out a summary with variables min_dist and max_dist
summarise(hflights, min_dist = min(Distance), max_dist = max(Distance))
# Print out a summary with variable max_div
summarise(filter(hflights, Diverted==1), max_div = max(Distance))
# hflights is available
# Remove rows that have NA ArrDelay: temp1
temp1 = filter(hflights, !is.na(ArrDelay))
# Generate summary about ArrDelay column of temp1
summarise(temp1, earliest = min(ArrDelay), average = mean(ArrDelay), latest = max(ArrDelay), sd = sd(ArrDelay))
# Keep rows that have no NA TaxiOut and no NA TaxiIn: temp2
temp2 = filter(hflights, !is.na(TaxiOut) & !is.na(TaxiIn))
# Print the maximum taxiing difference of temp2 with summarise()
summarise(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
##### summary functions within dplyr
#first(x) - The first element of vector x.
#last(x) - The last element of vector x.
#nth(x, n) - The nth element of vector x.
#n() - The number of rows in the data.frame or group of observations that summarise() describes.
#n_distinct(x) - The number of unique values in vector x.
# hflights is available with full names for the carriers
#head(hflights)
# Generate summarizing statistics for hflights
summarise(hflights, n_obs = n(), n_carrier = n_distinct(UniqueCarrier), n_dest = n_distinct(Dest), dest100 = nth(Dest,100))
# Filter hflights to keep all American Airline flights: aa
aa = filter(hflights, UniqueCarrier=='American')
# Generate summarizing statistics for aa
summarise(aa, n_flights = n(),
n_canc = sum(Cancelled == 1),
p_canc = mean(Cancelled == 1) * 100,
avg_delay = mean(ArrDelay, na.rm = TRUE))
########## SECTION 8: CHAINING WITH PIPE OPERATOR
# magrittr inside dplyr
# %>% passes the object on the left to the function on the right as its FIRST ARGUMENT
# eg. c(1, 2, 3, NA) %>% mean(na.rm = TRUE)
# Write the 'piped' version of the English sentences.
hflights %>% mutate(diff = TaxiOut - TaxiIn) %>% filter(!is.na(diff)) %>% summarise(avg = mean(diff))
# hflights is pre-loaded
# Build data frame with 4 columns of hflights and 2 self-defined columns: d
d = hflights %>%
select(Dest, UniqueCarrier, Distance, ActualElapsedTime) %>%
mutate(RealTime = ActualElapsedTime + 100, mph = Distance/RealTime*60)
head(d)
# Filter and summarise d according to the instructions
filter(d, !is.na(mph) & mph<70) %>%
summarise(n_less = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance))
hflights %>%
select(Dest, Cancelled, Distance, ActualElapsedTime, Diverted) %>%
mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%
filter(mph < 105 | Cancelled == 1 | Diverted == 1) %>%
summarise( n_non = n(),
p_non = n_non / nrow(hflights) * 100,
n_dest = n_distinct(Dest),
min_dist = min (Distance),
max_dist = max(Distance))
# Count the number of overnight flights
hflights %>%
filter(!is.na(DepTime), !is.na(ArrTime), DepTime > ArrTime) %>%
summarise(n = n())
#### SECTION 9: GROUP BY in SUMMARISE
# group_by(tbl, group_var)
# Attaches group metadata to dataset which summarise can use
# Make an ordered per-carrier summary of hflights
hflights %>%
group_by(UniqueCarrier) %>%
summarise(n_flights = n(),
n_canc = sum(Cancelled == 1),
p_canc = mean(Cancelled == 1) * 100,
avg_delay = mean(ArrDelay, na.rm = TRUE)) %>%
arrange(avg_delay, p_canc)
# Make an ordered per-day summary of hflights
hflights %>%
group_by(DayOfWeek) %>%
summarise(avg_taxi = mean(TaxiIn + TaxiOut, na.rm=TRUE)) %>%
arrange(desc(avg_taxi))
### Using group_by with mutate
# dplyr is loaded, hflights is loaded with fancy carrier names
# Solution to first instruction
hflights %>%
group_by(UniqueCarrier) %>%
filter(!is.na(ArrDelay)) %>%
summarise(p_delay = mean(ArrDelay > 0)) %>%
mutate(rank = rank(p_delay)) %>%
arrange(rank)
# Solution to second instruction
hflights %>%
group_by(UniqueCarrier) %>%
filter(!is.na(ArrDelay), ArrDelay > 0) %>%
summarise(avg = mean(ArrDelay)) %>%
mutate(rank = rank(avg)) %>%
arrange(rank)
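# group_by() also feeds mutate() directly: the per-group value is attached to every row
# instead of collapsing the data. A minimal sketch with hflights:
hflights %>%
  group_by(UniqueCarrier) %>%
  mutate(carrier_avg_delay = mean(ArrDelay, na.rm = TRUE),
         rel_delay = ArrDelay - carrier_avg_delay) %>%
  select(UniqueCarrier, ArrDelay, carrier_avg_delay, rel_delay)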
### ADVANCED COMBINATIONS
# Which plane (by tail number) flew out of Houston the most times? How many times? adv1
adv1 <- hflights %>%
group_by(TailNum) %>%
summarise(n = n()) %>%
filter(n == max(n))
# How many airplanes only flew to one destination from Houston? adv2
adv2 <- hflights %>%
group_by(TailNum) %>%
summarise(ndest = n_distinct(Dest)) %>%
filter(ndest == 1) %>%
summarise(nplanes = n())
# Find the most visited destination for each carrier: adv3
adv3 <- hflights %>%
group_by(UniqueCarrier, Dest) %>%
summarise(n = n()) %>%
mutate(rank = rank(desc(n))) %>%
filter(rank == 1)
# Find the carrier that travels to each destination the most: adv4
adv4 <- hflights %>%
group_by(Dest, UniqueCarrier) %>%
summarise(n = n()) %>%
mutate(rank = rank(desc(n))) %>%
filter(rank == 1)
########### CONNECTING TO DATABASES
## SQL
# set up a src that connects to the mysql database (src_mysql is provided by dplyr)
my_db <- src_mysql(dbname = "dplyr",
host = "dplyr.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "dplyr",
password = "dplyr")
# and reference a table within that src: nycflights is now available as an R object that references the remote nycflights table
nycflights <- tbl(my_db, "dplyr")
# glimpse at nycflights
glimpse(nycflights)
# Calculate the grouped summaries detailed in the instructions
dbsumm <- nycflights %>%
group_by(carrier) %>%
summarise(n_flights = n(), avg_delay = mean(arr_delay)) %>%
arrange(avg_delay)
dbsumm
#####################################
# Swinburne: R programming #
#####################################
#### DISTRIBUTIONS AND MODELLING ###
#####################################
# Methods of FITTING / ESTIMATING
# [1] Classical frequentist
# [1.1] Least squares
# minimise the sum of squared residuals
# usually by finding the point of minimum error, where the gradient is zero,
# by solving a set of linear equations Ax = B
# [1.2] Maximum likelihood: ******
# given a function that calculates "likelihood", which means:
# holding the _data fixed_ and varying the parameters, what is the probability of observing that data
# usually multiply together f(y,theta) for all y
# PICTURE the probability parabola across the various thetas
# OR sum the log likelihoods
# LS vs MLE: they give the same estimates if the error is normally distributed
# LS cannot be used for other distributions, eg binomial
# ...
# ***
# The distribution of ERROR and distribution of Y are independent
# You can have NORMALLY DISTRIBUTED error and non-normal Y
# --> You can use least squares on NON-NORMAL DATA
# ... because you may be adding non-normal variables to get
# a non-normal outcome... and that's fine.
# ==> you'll have normal error
# ***
# ** Hence it is the distribution of ERROR we need to attend to
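# A quick simulated illustration (my own sketch): a skewed X plus NORMAL error gives a
# non-normal Y, yet the lm() residuals are normal - so least squares is still appropriate.
set.seed(1)
xsim <- rexp(1000, rate = 1)                 # skewed, non-normal predictor
ysim <- 2 + 3 * xsim + rnorm(1000, 0, 1)     # Y inherits the skew, but the ERROR is normal
simfit <- lm(ysim ~ xsim)
hist(ysim)                                   # skewed outcome
hist(resid(simfit))                          # residuals look normal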
## Distributions / random variables
# Can be univariate or multivariate
# Every model requires assumptions about probability distribution of residuals.
# We can do Monte Carlo simulations with our distributions
# if residuals are normal and relationship between variables is simple additive
# --> simple linear modelling
## Discrete RVs
# Have a set of values, each has a probability
X = c(30, 40, 50, 60, 70, 80)
P = c(0.3, 0.3, 0.05, 0.05, 0.1, 0.2)
# mean = sum(X*P) # sum product of each value * probability of that value
# or in matrix algebra: mean = t(X)%*%P
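# compute them for the X and P defined above:
sum(X * P)                    # mean
t(X) %*% P                    # same mean via matrix algebra (a 1x1 matrix)
sum((X - sum(X * P))^2 * P)   # variance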
## Continuous RVs
# Must integrate across ranges to get probabilities
# mean = integral of x * p(x) over x
# variance = integral of (x - mu)^2 * p(x) over x
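# e.g. check these numerically with integrate(), using the U(3,5) density from the next example:
integrate(function(x) x * dunif(x, 3, 5), lower = 3, upper = 5)            # mean = 4
integrate(function(x) (x - 4)^2 * dunif(x, 3, 5), lower = 3, upper = 5)    # variance = 1/3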
### Uniform distribution ###
# Every value has same probability
runif(10,3,5) # n, min, max
x = runif(100000,3,5) # n, min, max
x=data.matrix(x)
dim(x)
hist(x)
# compute theoretical mean
(1/2) * (3+5)
# compute empirical mean
mean(x)
# var theoretical
(1/12)*(5-3)^2
# var emprical
var(x)
### Normal ####
# Generate 10,000 N(0,1) numbers (i.e., standard normal).
# no need to specify the mean and variance.
X<-rnorm(10000)
# Generate 10,000 N(30, 10^2) numbers (mean 30, sd 10).
X<-rnorm(10000,30,10)
# checking the histogram.
hist(X)
# checking the mean and variance.
mean(X)
var(X)
### Chi-sq ###
# is the SUM of multiple SQUARED NORMAL R.V's
# = sum of squared normal RVs
# So we could simulate ChiSq with 5 degrees of freedom by generating 5 sets of RNORM, squaring and adding
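# e.g. a quick simulation of that idea (a sketch):
normmat <- matrix(rnorm(5 * 10000), ncol = 5)   # 10,000 rows of 5 standard normals
chisq5 <- rowSums(normmat^2)                    # sum of 5 squared normals per row
hist(chisq5, breaks = 50, freq = FALSE)
curve(dchisq(x, df = 5), add = TRUE, col = "red")   # lines up with chi-sq, 5 df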
#### F-distribution ###
# all ANOVAs are F-tests
# F is defined with 2 independent chi-square variables
# It is result of ratio between the two
x=rf(100,5,2) # number, df of chisq 1, df of chisq 2
hist(x)
#### t-distribution ###
# variance = v/(v-2), where v = df (defined for v > 2)
# as df --> infinity (because sample gets huge), then t becomes normal
# The square of a t is an F
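# e.g. (a sketch): heavier tails at small df, ~normal at large df; and t^2 is an F
xg <- seq(-4, 4, 0.01)
plot(xg, dnorm(xg), type = "l")            # standard normal
lines(xg, dt(xg, df = 3), col = "red")     # fatter tails with 3 df
lines(xg, dt(xg, df = 100), col = "blue")  # 100 df is nearly normal
qt(0.975, df = 10)^2                       # squared t critical value...
qf(0.95, df1 = 1, df2 = 10)                # ...equals the F(1, 10) critical value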
#### Distributions for GLM ####
## to account for residual distributions
## logistic distribution: similar to normal but with fatter tail
# generate 10000 logistic random numbers with location being 0 and scale being 1.
X<-rlogis(10000, 0, 1)
# generate 10000 standard normal random numbers.
Y<-rnorm(10000, 0,1)
# compare their histograms
hist(X, xlim=c(-20,20), col="red")
hist(Y, add=T, col="blue")
## Exponential
# models time BETWEEN events in a Poisson process
# remember not to confuse RATE with MEAN TIME
# generate 10000 random exponential numbers from the example
X<-rexp(10000, rate = 1/30)
hist(X)
## Poisson distribution
# Describes the probability of a number of events occurring in a fixed period of time or space. eg crime
# commonly used for COUNT data
# lambda = expected # of occurrences
# if you have TOO MANY zeroes a Poisson may not fit - consider an alternative (eg a zero-inflated model)
# also, sometimes large count data actually does look normal - in that case, use normal!
# generate 10000 random Poisson numbers
X<-rpois(10000, 1)
Y<-rpois(10000, 10)
Z<-rpois(10000, 20)
# stack their histograms together
hist(X, xlim=c(0,50), col="red")
hist(Y, add=T, col="blue")
hist(Z, add=T, col="green")
# check the mean and variance are indeed very close
mean(X)
var(X)
##### ANOVA #####
library(reshape2)
data(tips, package = "reshape2")
tips
tipAnova <- aov(tip~day+time+sex-1,tips)
summary(tipAnova)
tipAnova$coefficients
# Let’s look at the p-value of gender…
# In this F distribution:
# DF1 = 1, DF2 = 239
# R code for obtaining p-value from an F distribution.
pf(0.820,1,238,lower.tail=FALSE)
# Recall that when DF1 = 1 the F distribution is the same as the square of a t(DF2).
# R code for obtaining p-value from a t distribution.
pt(0.820^0.5,238,lower.tail=FALSE)*2
####### MATRIX ALGEBRA ############
# week 8
# X is a matrix, m x n
# m is rows, n is columns
# if n=1, column vector
# if m=1, row vector
# if m=n, square matrix
# Transpose = X'
# Column Vector
X = matrix(c(3,4,7,8,9,10,3,3),8,1)
X
# Row Vector
X = matrix(c(3,4,7,8,9,10,3,3),1,8)
X
# a (4x2) matrix
X = matrix(c(3,4,7,8,9,10,3,3),4,2)
X
# a (3x3) square matrix
X = matrix(c(3,4,7,8,9,10,3,3,1),3,3)
X
# transpose of X
t(X)
## Multiplying matrices
# must be CONFORMABLE
# (axb)*(bxc) --> col dim of 1st matrix must = row dim of 2nd matrix
X = matrix(c(3,4,7,8,9,10,3,3,1),3,3)
Y = matrix(c(6,8,2,1,2,1,3,2,1),3,3)
O = matrix(c(2,2,3),3,1)
# X and O are conformable (3x1)
X%*%O
# O and X are NOT conformable
O%*%X
# t(O) and X are conformable (1x3)
t(O)%*%X
# X and Y are conformable
X%*%Y
# first element of X%*%Y
# is sumproduct of row1 of X and column1 of Y
sum(X[1,]*Y[,1])
# last element of X%*%Y
# is sumproduct of row3 of X and column3 of Y
sum(X[3,]*Y[,3])
# YX is NOT equal to XY
Y%*%X==X%*%Y
## Matrix inverse
# X %*% X^-1 = I, where I is the identity matrix (X^-1 is the inverse, not the transpose)
# Identity matrix = 1's along diagonal
I = matrix(diag(3),3,3)
I
# solve() finds the inverse
?solve
X = matrix(c(3,4,7,8,9,10,3,3,1),3,3)
# in R, matrix inverse is done through solve().
XI = solve(X)
X%*%XI
# if you foolishly did this
XI = X^-1
XI
# you have inverted every single element in X.
# only square matrices have inverse
# So many X's aren't square: obs * variables, not square
# Hence we use X'X (i.e. t(X) %*% X) to get a square matrix
#################### REGRESSION MODELLING ###########################
# Y = X*Beta + epsilon
# Where X is matrix N * K, B is K x 1
# epsilon is (N x 1). The distribution of epsilon determines
# properties of Y. Here, we assume multivariate normal.
# Beta is what we want to estimate: determines relationship
# between X and Y
data(cars)
attach(cars)
plot(speed,dist,xlim=c(0,25))
regmodel=lm(dist~speed) # add -1 to remove intercept
summary(regmodel)
# generate a sequence of x’s…
x=seq(min(speed),max(speed),1)
ypredicted=regmodel$coefficients[1]+regmodel$coefficients[2]*x
lines(x,ypredicted,col="red")
## Least squares estimation is fundamental method
# Residual e_i = Y_i - Yhat_i
# = the distance between line and observation point
# Now we want to solve to get minimum residuals
# see slides: take the derivative of e_i with respect to B0 and B1
# = two simultaneous equations. Solve for both = 0
# --> solve this with matrix algebra
# beta = (X'X)^-1(X'Y)
data(cars)
attach(cars)
Y = matrix(dist)
dim(Y)
one = matrix(rep(1,50))
X = cbind(one,matrix(speed))
dim(X)
b = solve( t(X)%*%X ) %*% ( t(X)%*%Y )
b
# It is important to recognize that LS
# line of best fit is NOT the only option out there.
# If you choose a different penalty function, you will
# get a different line...
# eg sum of ABSOLUTE residuals instead of sum of SQUARED residuals
# LS estimators are BLUE:
# BLUE: Best Linear Unbiased Estimator.
# Under the following assumptions:
# The regressors are INDEPENDENT of the residuals.
# The regressors are NOT perfectly collinear with each other.
# The residuals are homoscedastic.
# The residuals are uncorrelated.
# **
# We assume *residuals* must be normally distributed
# that DOESN'T MEAN that Y must be normally distributed
# as long as ERROR is normal, we can use least squares
## Electricity example
# Y-variable - electricity usage - is normal with some outliers
# By looking at residuals, we see there is something happening which we're missing
plot(resid(regmodel))
hist(resid(regmodel))
# ... by adding polynomial (square) terms we get a good model
####### GENERALISED LINEAR MODELS ###########
# if residuals aren't normal, may need a link function
# the Beta * X equation stays the same, but the link function applies to Y: g(Y)
# precisely: g(mu), where mu is the *mean* of the response variable
## THE RHS MUST ALWAYS BE BETA * X --> LINK IS APPLIED TO Y ONLY!
?glm
# glm(formula, family = gaussian, data, ...)
# glm usually fit using maximum likelihood
## Family: binomial
# link functions logit & probit
# think of logit in terms of threshold from 0 to 1, with the latent variable being some scale of willingness that trips the threshold
# ln(p/(1-p)) = Beta * X
data <- read.csv("Homicides.csv")
attach(data)
regmodel_logit <- glm(HomicideHotSpot~SOUTH+UE90+DV90+MA90+PS90+RD90, family=binomial("logit"))
summary(regmodel_logit)
# Estimates are on the log-odds (logit) scale
## Family: Poisson
# the count comes from Poisson distribution, where MEAN of distribution
# is defined by linear combination of variables (Beta * X)
# be careful of overdispersion (variance>mean) [variance and mean should be same]
# be careful of zero inflation (too many 0's)
data <- read.csv("ShipWreck.csv")
attach(data)
data
hist(damage)
# examine
xtabs(~damage+type)
xtabs(~damage+construction)
xtabs(~damage+operation)
plot(damage~months)
# notice that we get different counts by cutting data different ways
# --> see which variables are predictive
# ALSO notice that there is a non-linear relationship between damage & month
months2 <- months^2
regmodel_Pois <- glm(damage~type+construction+operation+months+months2-1,family=poisson(link="log"))
summary(regmodel_Pois)
# Note that link function is log!
# BEWARE that we have lots of dummy variables --> We don't need INTERCEPT
# Coefficients: negative = less likely
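# A quick overdispersion check (a sketch, assuming regmodel_Pois fit above):
# residual deviance / residual df should be roughly 1 for a well-behaved Poisson model
deviance(regmodel_Pois) / df.residual(regmodel_Pois)   # well above 1 suggests overdispersion
# if so, consider family = quasipoisson(link = "log") or a negative binomial model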
######## OPTIMISATION #######
# recall that LS = sum of squared residuals, which is a function of betas
# so we vary betas and maximise LS
# Can do this in any problem that you can express as error = ...
# in R, we define a function to optimise, as well as initial values
# we use optim() --> MINIMIZES by default
data <- read.csv("NavigationExample.csv")
attach(data)
# First we must specify the objective function
fn = function(P) sum( ( D - ( (X-P[1])^2 + (Y-P[2])^2 )^0.5 )^2 )
# next we specify initial values/guesses to the problem
initial = c(0,0)
# call on optim to _minimize_ sum of squares
# the first argument is the initial value
# the second argument is the function to be optimized
# the third argument selects the numerical search method
# we will talk about Hessian later.
out = optim(initial, fn, method = "BFGS", hessian = TRUE)
# problem SOLVED…
out$par
# if you want to get a TRACE:
out = optim(initial, fn, method = "L-BFGS-B", hessian = TRUE,control=list(trace=6,REPORT=1))
###### Maximum likelihood ######
# ML is a type of OPTIMIZATION
# ML is ONE option. Sometimes you MUST use ML, other times you can use LS, ..etc
# Y' is a vector of observed values
# Theta' = vector of parameters (Betas)
# Joint density f(Y;Theta)
# Interpreted as EITHER:
# [1] Conditional on fixed 𝚯 it is the probability of sample outcomes 𝐘.
# [2] Conditional on sample outcomes 𝐘 it is a function of 𝚯.
# [2] is LIKELIHOOD function
# LIKELIHOOD
# probability of this data given assumptions = p(data|assumptions)
# = likelihood of each observation
# We're assuming all observations are INDEPENDENT: so we can multiply together
# Given that it is iid, then joint density function = probability of observing this set of values
# = product of marginal probability of each observation
# = f(y1, y2, y3... ; Theta) = f(y1;theta) * f(y2;theta) * ... * f(yn;theta)
# We're looking for values of theta - theta_hat - that maximise likelihood
# To make easier, sum of logs
# We take derivative of log-likelihood (called "score") and set to zero
# Key assumptions of ML estimator [see slides]
# Consistent: as the sample gets large, theta_hat converges to theta
# Asymptotically normal: so we can do hypothesis tests
# --> we verify this with Hessian matrix
# How to do MLE
# First, write down PMF or PDF for your problem: binomial, gaussian, etc
# Replace outcome variable in PMF (eg y) with linear combination of betas * X's
# Work out joint density = product of each observation's likelihood function
# Take log to get log-likelihood
# Now we have an equation tying linear model to probability of observation
# Optim() to find best values of betas
# Here is the data
# carbon disulphide concentration
x = c(1.6907, 1.7242, 1.7552, 1.7842, 1.8113, 1.8369, 1.8610, 1.8839)
# beetles death
y = c(6, 13, 18, 28, 52, 53, 61, 60)
# beetles experimented on
n = c(59, 60, 62, 56, 63, 59, 62, 60)
# Note that the default of “optim” is to minimize…
# so we simply reverse the sign of the log likelihood function derived earlier…
fn = function(B) sum( -y*(B[1]+B[2]*x) + n*log(1+exp(B[1]+B[2]*x)) - lchoose(n, y) )
out = optim(c(-50,20), fn, method = "BFGS", hessian = TRUE)
# Hessian matrix
# Recall that the inverse of the negative of the Hessian
# matrix approximates the variance-covariance matrix of the
# estimated coefficient vector (in a minimization problem you
# don’t have to take the negative of the Hessian).
# use solve() to get inverse
# Don't care about off-diagonals, because those are the COVARIANCES
# Covariances may be interesting if want to understand relationships between variables
sqrt(diag(solve(out$hessian)))
# [1] 5.180705 2.912137
# These values are STANDARD ERRORS of the ESTIMATES
# Therefore, the t-stat for the estimated beta_1 is:
# 34.27037 / 2.912137 ≈ 11.77
## ML with Gaussian
# We assume residuals are normally distributed
# Assume Gaussian pdf
# (x-mu)^2: means we expect x is normally distributed around mu
# for estimation, x is the DEPENDENT variable (y)
# mu is our linear combination of variables
# WHAT ABOUT SIGMA (VARIANCE)? WE MUST ESTIMATE IT TOO!
# include an unknown for sigma in likelihood function
# negative log-likelihood: p[1] = sigma^2, p[2:4] = the betas (assumes y, X (n x 3) and n are already defined)
lnL <- function(p) (n/2)*log(2*pi) + (n/2)*log(p[1]) +
  (1/(2*p[1])) * drop( t(y - X%*%p[2:4]) %*% (y - X%*%p[2:4]) )
out <- optim(c(rep(1,4)), lnL, hessian=TRUE, method = "BFGS")
# estimate / standard error = t-statistic
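# A self-contained sketch using the built-in cars data (my own example, not course data):
# p[1] = sigma^2, p[2:3] = intercept and slope; start near the least-squares answer,
# since for normal errors the MLE and LS estimates coincide.
data(cars)
y <- matrix(cars$dist)
X <- cbind(1, cars$speed)
n <- nrow(X)
negLL <- function(p) (n/2)*log(2*pi) + (n/2)*log(p[1]) +
  (1/(2*p[1])) * sum((y - X %*% p[2:3])^2)
ls_fit <- lm(dist ~ speed, data = cars)
start  <- c(mean(resid(ls_fit)^2), coef(ls_fit))
out <- optim(start, negLL, hessian = TRUE, method = "BFGS")
out$par                          # sigma^2, intercept, slope
coef(ls_fit)                     # betas should match the LS fit
sqrt(diag(solve(out$hessian)))   # standard errors of the estimates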
#####################################
############ CODE LIBRARY ##########
#####################################
### highlights to remember
# plyr functions: mutate (create new variables), slice, filter...
# tapply - to apply a function by group variable
## data for playing with
data(cars)
data(ChickWeight)
##### Working with data & structures [week 2]
## input/output
read.table(filename, header=T, sep=",")
# CSVs
read.csv(file.choose(), header=T)
read.table(file.choose(),header=T, sep=",")
read.delim(file.choose(), header=T,sep=",")
## data types & structures
head(tablename)
dim(tablename)
rownames()
colnames()
head() # from plyr/dplyr
## variable names
names() # to get contents of non-dataframe object - eg contents of linear model object
rename(tablename, newvariablename = oldvariablename) # plyr
## filtering & subsetting (with PLYR)
# filter rows
# from plyr or dplyr
filter(tablename, Time==0)
filter(tablename, Chick>1, Time==0) # multiple filters
# slice rows
slice(tablename, 1:20) # take first 20 rows
# order rows
arrange(tablename, time, desc(weight)) # sort by time THEN by weight (descending)
# select variables
select(tablename, variable1, variable2, ...)
select(tablename, -variable1) # drop variable
# also mutate...
## merging, binding, etc
# combine vectors
rbind() # row bind = stack vertically (add rows)
cbind() # column bind = combine horizontally (add columns)
######### DERIVING variables
### APPLYing functions
apply(x,1,sum) # sum across rows
apply(x,2,sum) # sum down columns
# apply by group variable
tapply(scores, sex, mean) # get mean score by sex
### MUTATING (plyr)
mutate(tablename, newvariable = oldvar1 + oldvar2)
####### Descriptive statistics [week 2] #########
## crosstab
table(gender)
xtabs(~gender+smoker) # or: table(gender,smoker)
table(x,y)
# SAME AS
xtabs(~x+y)
## Describing
library(psych)
describe(tablename)
describe.by(tablename, tablename$gender)
## summarising
###### PLOTTING [week 03]
## histogram
hist(cars$speed, main="Distribution of Car Speeds",
xlab="Speed (mph)")
# use breaks to change bar width
hist(cars$speed, breaks=10)
## boxplot
boxplot(ChickWeight$weight)
boxplot(ChickWeight$weight~ChickWeight$Diet, xlab="Diet", ylab="Weight (grams)")
## scatterplot
plot(cars$speed,cars$dist)
plot(cars$speed,cars$dist, pch=3, col="red")
## line charts
lines(cars$speed,cars$dist)
# change xlims
plot(cars$speed,cars$dist,xlim=c(0,30))
# add vertical or horizontal lines
plot(cars$speed,cars$dist,pch=x)
abline(h=40)
abline(v=10,lty=2)
# legend
legend(5,110,c("First Half", "Second Half"),pch=1:2)
### par() for parameters
## par() for multiple plots
# grid of 4 plots:
par(mfrow = c(2, 2))
plot(cars$speed,cars$dist,pch=x)
plot(cars$speed,cars$dist)
hist(cars$speed)
hist(cars$dist)
## par() for direction of labels
nr.prof <-c(prof.pilots=16,lawyers=11,farmers=10,salesmen=9,physicians=9,
mechanics=6,policemen=6,managers=6,engineers=5,teachers=4,
housewives=3,students=3,armed.forces=1)
par(las = 3)
barplot(rbind(nr.prof))
##### GGPLOT2
## qplot = quick plots
# qplot histogram
qplot(cars$dist)
# qplot scatter
qplot(cars$speed,cars$dist)
# qplot gradient
qplot(mtcars$wt, mtcars$mpg, colour=mtcars$cyl)
## ggplot layering
# geom_point()for scatter/dot plots
# geom_line()for line graphs
# geom_bar() for bar charts
# geom_boxplot() for boxplots
# ggplot histogram
ggplot(data=diamonds)+geom_histogram(aes(x=carat))
# ggplot density
ggplot(data=diamonds)+geom_density(aes(x=carat),fill="grey50")
## facet_grid to create grid of 4 plots
ggplot(CO2, aes(conc, uptake)) +
geom_point() +
facet_grid(Treatment ~ Type) +
stat_summary(fun.y = mean, colour = "red", geom = "line")
## ggthemes
# wall street journal theme
library(ggthemes)
g2<-ggplot(diamonds, aes(x=carat,y=price))+
geom_point(aes(color=color))
g2+theme_wsj()
## maps with ggplot
library(maps)
nz <- map_data("nz")
nzmap <- ggplot(nz, aes(x=long, y=lat, group=group)) +
geom_polygon(fill="white", colour="black")
###### INFERENCE AND DISTRIBUTIONS [week 4]
# distribution functions - eg. norm
# q - quantile function: value at a given probability (eg. qnorm(0.975) = 1.96)
# r - random number generation
# p - cumulative probability up to a value (eg. pnorm(1.96) = 0.975) = area under curve to that point (integral)
# d - density at a value
# dnorm(x, mean = 0, sd = 1, log = FALSE)
# pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
# qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
# rnorm(n, mean = 0, sd = 1)
# work with vectors too
qnorm(0.975)
# 1.96
pnorm(1.96)
# 0.975
rnorm(5,0,1)
# 5 values
dnorm(1.96)
# confidence intervals
phat=.4
n=120
ME=qnorm(.975)*sqrt(phat*(1-phat)/n)
CI=c(phat-ME,phat+ME)
# ttests
# remember to think of null hypothesis
# one sample t-test
# H0: mean of speed = 10
# H1: mean of speed <> 10
t.test(speed,mu=10)
# two-sample (independent samples) t-test
# H0: Male = female
# H1: male <> female
scores=rnorm(50,20,5)
sex=rep(c("Male", "Female"),c(25,25))
boxplot(scores~sex)
t.test(scores~sex)
# paired samples t-test
before.scores=rnorm(50,15,10)
after.scores=rnorm(50,25,10)
t.test(before.scores,after.scores,paired=TRUE)
########## ANOVA
# A good one-way ANOVA example can be found at: http://www.stat.wmich.edu/wang/664/egs/Rrust.html
# A very good summary of one and two-way ANOVAs with R can be a found at: http://www.statmethods.net/stats/anova.html
# one-way anova example
boys=rnorm(50,mean=128,sd=14)
girls=rnorm(50,mean=130,sd=14)
aliens=rnorm(50,mean=75,sd=14)
all.heights=c(boys,girls,aliens)
type=rep(c("Boy","Girl","Alien"),rep(50,3))
type=factor(type)
boxplot(all.heights~type)
myanova=aov(all.heights~type)
summary(myanova)
layout(matrix(c(1,2,3,4),2,2))
plot(myanova)
# this is the same as lm
mylmanova=lm(all.heights~type)
anova(mylmanova)
# use TUKEY to get ADJUSTED multiple comparisons
# adjusted = adjusted because we are doing many tests simultaneously
# http://en.wikipedia.org/wiki/Multiple_comparisons_problem
TukeyHSD(myanova)
# 2-way ANOVA example
data(ToothGrowth)
attach(ToothGrowth)
supp=factor(supp)
dose=factor(dose)
toothanova=aov(len~supp*dose)
summary(toothanova)
####### REGRESSION
# simple lm
data(cars)
plot(cars$speed,cars$dist)
regmodel=lm(cars$dist~cars$speed)
summary(regmodel)
lines(cars$speed,regmodel$fitted)
confint(regmodel) # gives confidence intervals for the intercept and slope
# transformations - use the I()
lm(y~x+I(x^2))
# GLM - can use various links:
glm(counts ~ outcome + treatment, family=poisson())
# binomial(link = "logit")
# gaussian(link = "identity")
# Gamma(link = "inverse")
# inverse.gaussian(link = "1/mu^2")
# poisson(link = "log")
# quasi(link = "identity", variance = "constant")
# quasibinomial(link = "logit")
# quasipoisson(link = "log")
###### FUNCTIONS [week 5]
# simple example
myfirstfunctionwithinput=function(input)
{
print(input)
}
myfirstfunctionwithinput("I am getting better at functions")
# [1] "I am getting better at functions"
myfirstfunctionwithinput=function(input1, input2)
{
print(input1)
print(input2)
}
myfirstfunctionwithinput(input1=1,input2=2)
# [1] 1
# [1] 2
# DEFAULT values
meanormedian=function(variable, stat="mean")
{
# a function that calculates either mean or median of a vector
if (stat=="mean")
{
return(mean(variable))
}
else
{
return(median(variable))
}
}
# IF statements
meanormedian=function(variable, stat)
{
# a function that calculates either mean or median of a vector
if (stat=="mean")
{
return(mean(variable))
}
else
{
return(median(variable))
}
}
# FOR loops
addsomenumbers1=function(n)
{
#returns the sum of the numbers from 1 to n
total=0
for (i in 1:n)
{
total=total + i
}
return(total)
}
addsomenumbers1(100)
# [1] 5050
# WHILE loops
addsomenumbers2=function(n)
{
#returns the sum of the numbers from 1 to n
i=1
total=0
while (i<=n)
{
total=total+i
i=i+1
}
return(total)
}
# RECURSION
fibo=function(n)
{
#function that returns the nth Fibonacci number
if (n<2)
{
return(n)
}
else
{
return(fibo(n-1)+fibo(n-2))
}
}
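# quick check of the recursive function above:
fibo(10)
# [1] 55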
############## SHINY ###################
# http://shiny.rstudio.com/gallery/
install.packages('shiny', lib="c:\\notbackedup\\")
library(shiny, lib="c:\\notbackedup\\")
runApp("C:\\NotBackedUp\\R\\shiny\\")
#
## DIAGRAMS
install.packages('diagram')
library(diagram)
names <- c(".Rnw", ".tex", ".pdf")
connect <- c(0, 0, 0,
"knitr", 0, 0,
0, "pdflatex", 0)
M <- matrix(nrow=3, ncol=3, byrow=TRUE, data=connect)
pp <- plotmat(M, pos=c(1, 2), name=names, box.col="orange")
###### MISC ##########
## Generating random datasets
# ------------------------------- #
# R TUTORIAL CODE #
# SESSION 1 #
# ------------------------------- #
## Meta
# use # for comments
# use CTRL-ENTER to run a single line or highlighted bit of code
# ------------------------------- #
# CHAPTER 1: working with basic data objects #
# ------------------------------- #
## [1.1] simple Variables
a = 1
a
a <- 1 # <- arrow is same as =
a
a == 1 # use double arrows for boolean equals (more later)
b = 2
a + b
a * b
c = a + b^2
c
d = 1.0
str(d) # structure of object: number, string, ...
help(str) # help on any function
??structure # search for help on a term --> Google is better
# unless you know what you're looking for
string # if you make an error, R will tell you in red
e = 'string'
str(e)
paste('hello','world') # join two strings
help(paste)
paste('hello','world', sep=", ")
## Meta
# R will interpret end of line as end of command
# *except* where a line is left incomplete,
# for example it ends in a + or an unclosed bracket
a =
2
(1 + 7 +
19 +
12)
# ....................................... #
# - EXERCISE - #
# calculate 57 * 43
# then calculate the square of the result
# ....................................... #
# ------------------------------- #
## [1.2] Lists
c('a','b','c') # Combine elements into a vector or list
c(1, 2, 3) # Combine elements into a vector or list
f = c(1,2,'word',4) # a vector with at least 1 string gets coerced to all strings
f
# ------------------------------- #
## [1.3] Vectors = a list of numbers OR a one dimensional matrix
# Also think of this like a column OR a row from a table - it's all the same to R
f = c(1,2,3,4)
f
0:4 # quick list of integers
10:0 # reverse
seq(1,50,5) # sequence of number from x to y in steps of size z
# VECTORISATION
# operations can be applied to vectors in one hit = 'vectorisation'
g = 1:10
4*g
g/2
h = g^2
plot(g,h)
# A vector can be a column from a table or a row from a table, R is indifferent.
# R has a huge library of functions which you typically apply to vectors.
# Starting with the basics:
sum(g)
mean(g)
median(g)
summary(g)
# through to nonparametric tests...
x = c(4,7,10,15)
y = c(1,2,3,5)
wilcox.test(y,x)
# And anything parametric you might want, including tests...
pbinom(q=2, size=10, p=0.5 ) # P(X <= 2): probability of at most 2 successes in 10 trials
# ...and density functions...
n_successes = 0:10
prob_successes = dbinom(n_successes, size=10, p=0.5) # distribution for 10 trials with p=0.5
plot(n_successes, prob_successes, type='l')
# ... and so on
## META
# Data in R lives in active memory (RAM), enabling 'vectorisation'
# Contrast this with SAS, which opens datasets and reads line-by-line,
# hence why PROC SORT exists
# - EXERCISE - #
# create a vector of numbers 5 through 10
# Multiply, elementwise, the numbers in the vector by 3.2
# Calculate the mean & median of the vector
# ....................................... #
# ------------------------------- #
# CHAPTER 2: Data frames #
# ------------------------------- #
# Flexible tables of data, like Excel or SAS tables
# Can still be vectorised and treated like matrices
## creating a dataframe by IMPORTING DATA from csv
# NOTE double slashes in paths
## Getting data out of text files, eg. CSV
# for different delimiters, use sep="XXX".
# eg. pipe-delimited: sep="|"
# eg. tab-delimited: sep="\t"
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\"
FileName = 'baseball_train.csv'
File = paste(FileLoc,FileName,sep="")
print(File)
TBL_BASEBALL <-read.delim(File,header=TRUE,sep=",")
# look at it in workspace
TBL_BASEBALL # view the table in the console
dim(TBL_BASEBALL) # dimensions
colnames(TBL_BASEBALL) # column names
summary(TBL_BASEBALL) # quick summary
str(TBL_BASEBALL) # summary of structure
# ------------------------------- #
# [2.1] Working with dataframes: basic slicing & dicing
# ------------------------------- #
# in R, we access 'cells' of data using [row, column] indexing
TBL_BASEBALL[1,1]
TBL_BASEBALL[2,3]
TBL_BASEBALL[1,] # access entire row --> a vector
TBL_BASEBALL[,1] # access entire column --> a vector
mean(TBL_BASEBALL[,1]) # works like the vectors we saw above
TBL_BASEBALL[,1:2] # select columns 1 & 2
TBL_BASEBALL[1:5,] # select rows 1 through 5.
# remember? 1:5 is the same as c(1,2,3,4,5)
# So you can pass R a list as an index:
TBL_BASEBALL[c(1,7,99),] # use a list as an index
# use the [rows,cols] syntax to do both at once:
TBL_BASEBALL[1:2,6:8] # rows 1 & 2 with columns 6,7,8
## selecting columns by name
colnames(TBL_BASEBALL)
TBL_BASEBALL[,"salary"] # name a single column
TBL_BASEBALL[,c("salary","n_runs","n_walks")] # use a LIST of columns
cols = c("salary","n_runs","n_walks")
TBL_BASEBALL[,cols]
# Lazy '$' syntax works if you only want a single column
TBL_BASEBALL$salary
# subset() syntax can be used if it less confusing for you:
subset(TBL_BASEBALL, select=c('salary','n_runs','n_hits'))
## combine with row slicing:
TBL_BASEBALL[27:32,c('n_runs','n_hits')]
# Once more, we can apply any R functions to sliced chunks of data
# Many functions can handle chunks just as well as vectors:
sum(TBL_BASEBALL[27:32,c('n_runs','n_hits')])
### Now that we can slice & dice, let's use it to get to know BASEBALL data
pairs(TBL_BASEBALL) # generic R matrix scatterplot
## Too many variables... Let's look at fewer
cols = c('salary','batting_avg','n_runs','n_hits','n_homeruns')
cols
pairs(TBL_BASEBALL[,cols])
# More useful scatterplot matrix:
install.packages('ggplot2') # ggplot2 plotting library [if your first install, you may be asked to choose a CRAN server]
install.packages('GGally') # Companion tools for ggplot2
library(ggplot2)
library(GGally)
ggpairs(data=TBL_BASEBALL[cols],
upper = list(continuous = "cor"),
lower = list(continuous = "smooth"),
diag = list(continuous="density"),
axisLabels='show')
# [2] Move to a higher level of abstraction: correlation heatmap
z <- cor(TBL_BASEBALL) # correlation matrix with one command :-)
z
install.packages('lattice')
library(lattice)
levelplot(z) # default is hideous
col.l <- colorRampPalette(c('blue','red','orange','yellow'))(30)
levelplot(z, col.regions=col.l, scales=list(x=list(rot=90, cex=0.8)))
# ....................................... #
# - EXERCISE - #
# select only columns 'n_runs' and 'n_errors' from baseball data
# do a summary() on baseball data that includes only columns 'n_runs' and 'n_errors'
# ....................................... #
## creating a dataframe by IMPORTING DATA from Excel
install.packages('RODBC')
library(RODBC)
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\"
FileName = 'branch profiles.xls'
File = paste(FileLoc,FileName,sep="")
print(File)
channel <- odbcConnectExcel(File)
TBL_BRANCHES <- sqlFetch(channel, "Sheet1",as.is=TRUE)
odbcClose(channel)
# look at it
TBL_BRANCHES
dim(TBL_BRANCHES)
colnames(TBL_BRANCHES)
summary(TBL_BRANCHES)
str(TBL_BRANCHES)
# ------------------------------- #
## [2.2] working with dataframes: boolean slicing & dicing
# ------------------------------- #
##### using logical conditions to select things
# EQUALS is ==
# OR is |
# NOT is !
# AND is &
# NOT EQUALS is !=
# logical conditions work by creating an index then applying that to dataset
# , eg.:
TBL_BRANCHES['n_custs']>5000 # returns list of TRUE vs FALSE for each row
# --> use that list as an index
TBL_BRANCHES[TBL_BRANCHES['n_custs']>30000,]
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5,]
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5,c('Branch name','Mean_monthly_net_income','mean_A_RAR')]
# can write your index lists into variables if it is clearer
bigbranches = TBL_BRANCHES['n_custs']>30000
TBL_BRANCHES[bigbranches,]
# boolean conditions create the same list of TRUE/FALSE for indexing:
TBL_BRANCHES['pct_migrant']>0.5 & TBL_BRANCHES['State']=='NSW'
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5 & TBL_BRANCHES['State']=='NSW',]
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5 | TBL_BRANCHES['pct_student']>0.5,] # OR
# negations - ! can go before an entire condition, or as part of a !=
TBL_BRANCHES[TBL_BRANCHES['State']!='NSW',] # all states NOT NSW
TBL_BRANCHES[!TBL_BRANCHES['pct_migrant']>0.5, ] # every record BUT where pct_migrant > 50%
# You can use %in% like SQL 'in' to match to a list of conditions
TBL_BRANCHES[,'State'] %in% c('SA','WA')
TBL_BRANCHES[TBL_BRANCHES[,'State'] %in% c('SA','WA'),]
## Alternative syntax using subset() may be easier for some
subset(TBL_BRANCHES, State == 'VIC')
subset(TBL_BRANCHES, State == 'VIC' & pct_migrant>0.5)
subset(TBL_BRANCHES, State == 'VIC' | pct_migrant>0.5)
subset(TBL_BRANCHES, Location %in% c('Regional','Remote'))
### now that we can do boolean slicing & dicing, let's use it
### to get to know BRANCHES data better
# Map it
install.packages('ggmap') # package for query google maps & plot with ggplot2
library(ggmap)
qmap('Australia', zoom=4, maptype='satellite') # query google maps
qmap('Australia', zoom=4, maptype='terrain')
# download map into 'mapAus' variable
mapAus <- qmap('Australia', zoom=4, maptype='terrain', color="bw")
# Overlay points onto mapAus using TBL_BRANCHES data
mapAus + geom_point(aes(x=Long, y=Lat), color='red', data=TBL_BRANCHES, alpha=0.5)
# Overlay points where size is proportional to N_CUSTS
mapAus + geom_point(aes(x=Long, y=Lat, size=n_custs), color='red', data=TBL_BRANCHES, alpha=0.5)
## let's use boolean slicing: do a map of high % Affluent 50+ around Melbourne
affluentbranches = TBL_BRANCHES['pct_affluent_50+']>0.1
# Get map centered on Melbourne
MyMap <- qmap(location = 'Melbourne, Australia', zoom=9, maptype='terrain',color="bw")
# plot migrant branches, scale colour to mean RAR
MyMap +
geom_point(data=TBL_BRANCHES[affluentbranches,], aes(x=Long, y=Lat, colour=as.numeric(mean_A_RAR)), alpha=0.7, size=5) +
scale_colour_gradient(low="blue", high="red",name='Mean RAR')
# ....................................... #
### EXERCISE
# Select all branches with >20% affluent 50+
# get average customer RAR for all branches in VIC
# ....................................... #
# ------------------------------- #
# CHAPTER 3: R LIBRARIES
# ------------------------------- #
## R Libraries make it easy to do stuff
# C-RAN: http://cran.r-project.org/
# packages must be downloaded once (this will also update them):
# then called in a given script you are using them for:
# you can see packages & versions in package tab of rstudio
## wordcloud
install.packages("wordcloud") # download package
library(wordcloud) # call package into memory
text = "There are five main ways water vapor can be added to the air.
Increased vapor content can result from wind convergence over water or
moist ground into areas of upward motion. Precipitation or virga falling
from above also enhances moisture content. Daytime heating causes water
to evaporate from the surface of oceans, water bodies or wet land.
Transpiration from plants is another typical source of water vapor.
Lastly, cool or dry air moving over warmer water will become more humid.
As with daytime heating, the addition of moisture to the air increases
its heat content and instability and helps set into motion those processes
that lead to the formation of cloud or fog"
wordcloud(text,min.freq=1,random.order=FALSE)
# make it look prettier (?) with colours
library(RColorBrewer)
pal <- brewer.pal(8,"Dark2")
wordcloud(text, scale=c(0.3,2), min.freq=1,
colors=pal, rot.per=0, fixed.asp=FALSE,
vfont=c("gothic english","plain"))
## randomForest
install.packages("randomForest")
library(randomForest)
rf <- randomForest(TBL_BASEBALL[,c('n_homeruns','batting_avg','n_errors')],
TBL_BASEBALL$salary, # target variable
ntree=500, # 500 trees
mtry=2) # try 2 variables (at random) for each split
rf
importance(rf)
TBL_BASEBALL$yhat = predict(rf, TBL_BASEBALL) # predict function
plot(TBL_BASEBALL$salary,TBL_BASEBALL$yhat)
## sqldf - use SQL to operate on dataframes
install.packages('sqldf')
library('sqldf')
sqldf(' select State
, count(*)
from TBL_BRANCHES
group by State')
## Other neat libraries
# Interactive charts with Shiny: http://www.rstudio.com/shiny/
# twittR
# RODBC - for ODBC connections
# XML - for webpage scraping
# ....................................... #
# - EXERCISE - #
# download and install wordcloud library using install.packages()
# then make it active with library()
# make a wordcloud from a piece of text
# if necessary, help(wordcloud)
# ....................................... #
# ------------------------------- #
# CHAPTER 4: PLOTTING
# ------------------------------- #
# generic R plotting is drab but quick
plot(TBL_BASEBALL$batting_avg, TBL_BASEBALL$salary)
# ggplot2 library is awesome: http://docs.ggplot2.org/current/
install.packages('ggplot2')
library(ggplot2)
# build plots logically by adding layers
p <- ggplot(TBL_BASEBALL, aes(x=batting_avg, y=salary)) # create a blank canvas in 'p'
p <- p + geom_point() # add simple points
p
p <- p + geom_point(aes(colour = salary), size=5, alpha=0.5 )
p # aes statement defines how data links to plot elements
# you can go on and on adding layers and tweaking settings...
p <- p + stat_density2d(colour='red')
p
p <- p + scale_colour_gradient(low = "green", high="yellow")
p
p <- p + theme_bw()
p
# ... can layer up endlessly. lines, points, contours; sized, coloured, shaped by variables..
# various add-on libaries for all sorts of charts, eg GGally has scatterplot matrix
###### histogram / density example
p <- ggplot(TBL_BRANCHES, aes(mean_A_RAR))
p + geom_histogram()
# split by location
p <- ggplot(TBL_BRANCHES, aes(mean_A_RAR, fill = Location))
p + geom_histogram() # not so good
p <- p + geom_density(alpha=0.2)
p
p <- p + theme_bw()
p
# use facet_grid to break plots up by a categorical variable
# syntax: rows ~ columns
p + facet_grid(Location ~ .)
p + facet_grid(Location ~ State)
### Maps + ggplot2 = lots of possibilities
library(ggmap)
MyMap <- qmap(location = 'Niddrie, Australia', zoom=10, color='bw')
MyMap +
geom_point(aes(x=Long, y=Lat, size=as.numeric(n_custs),
colour=mean_A_RAR), alpha=0.8, data=TBL_BRANCHES) +
scale_size_continuous(range = c(2,10), name='# Custs') +
scale_colour_gradient(low="blue", high="red",name='Mean $ RAR')
# using a different map source:
MyMap <- qmap(location = 'Niddrie, Australia', zoom=10, source = 'osm')
MyMap +
geom_point(aes(x=Long, y=Lat, size=as.numeric(n_custs),
colour=mean_A_RAR), alpha=0.8, data=TBL_BRANCHES) +
scale_size_continuous(range = c(2,10), name='# Custs') +
scale_colour_gradient(low="blue", high="red",name='Mean $ RAR')
## Map of branches coloured by cluster
TBL_BRANCHES_VIC = subset(TBL_BRANCHES, State == 'VIC')
MyMap <- qmap(location = 'Melbourne, Australia', zoom=10, source = 'osm', color='bw')
MyMap +
geom_point(aes(x=Long, y=Lat, colour=as.factor(Cluster)), size=5,
show_guide = FALSE, data=TBL_BRANCHES_VIC)
# add border to points, make colour gradient have more contrast:
cols = rainbow(100) # take 100 colours from rainbow
cols = sample(cols, 100) # use sample to get them in a random order
MyMap +
geom_point(aes(x=Long, y=Lat, colour=as.factor(Cluster)), size=5,
show_guide = FALSE, data=TBL_BRANCHES_VIC) +
geom_point(aes(x=Long, y=Lat), size=5, pch=21 , data=TBL_BRANCHES_VIC) +
scale_colour_manual(values=cols)
# See list of inbuilt spectrums here: http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/
# Kate also created this ANZ colourset:
ANZcolours <- c(
"#B2B2B2",
"#DDDDDD",
"#CCE5F5",
"#66B2E0",
"#007FCC",
"#002B3C",
"#D7E7DE",
"#E6E1B6",
"#F2F285",
"#B2A58D",
"#87E10C",
"#F2AF00",
"#F27300",
"#BF3500")
pie(rep(1,14), col=ANZcolours)
# ... ggplot is awesome
# virtually any element of the plot can be customised
# See galleries of examples at http://docs.ggplot2.org/current/
#### Final awesome example: Houston crime data density
str(crime) # the 'crime' dataset is contained in the ggmap library
# limit to violent crimes
violent_crimes <- subset(crime, offense != "auto theft" &
offense != "theft" & offense != "burglary")
HoustonMap <- qmap("houston", zoom = 14, color = "bw")
theme_set(theme_bw(16)) # changes default plot theme for ggplot2
HoustonMap +
stat_density2d(aes(x = lon, y = lat, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = violent_crimes, show_guide = FALSE) +
scale_fill_gradient(low = "black", high= "red", guide=FALSE) +
facet_wrap(~ day)
theme_set(theme_grey()) # revert to default ggplot2 theme
# ....................................... #
# - EXERCISE - #
# [1] Scatterplot of branch customer FUM vs RAR
# hint: 1st try plot(x_values, y_values)
# next, try ggplot:
# p <- ggplot(data=TABLENAME, aes(x=X_VALUES, y=Y_VALUES))
# p <- p + geom_point()
# p
# [2] Change the scatterplot so branches are coloured by Location
# --> insert colour=VARIABLE into the ggplot aes statement
# [3] Make it look prettier
# Change the size by adding size=4 into the geom_point() statement
# Change the transparency by adding alpha=0.4 into the geom_point() statement
# Change the colour to be a function of the MOSAIC_prosperity
# .. and so on. look at examples at http://docs.ggplot2.org/current/ and tinker
# ---------------------------------------------------------------- #
# ---------------------------------------------------------------- #
# ------------------------------- #
# R TUTORIAL CODE #
# SESSION 2 #
# ------------------------------- #
# refresher on what we did last week
# vectors, variables, functions
f = c(1,2,3,4)
sum(f)
# importing data from csv
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\"
FileName = 'baseball_train.csv'
File = paste(FileLoc,FileName,sep="")
print(File)
TBL_BASEBALL <-read.delim(File,header=TRUE,sep=",")
# importing data from Excel
install.packages('RODBC')
library(RODBC)
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\"
FileName = 'branch profiles.xls'
File = paste(FileLoc,FileName,sep="")
print(File)
channel <- odbcConnectExcel(File)
TBL_BRANCHES <- sqlFetch(channel, "Sheet1",as.is=TRUE)
odbcClose(channel)
# dataframes, slicing & dicing
colnames(TBL_BASEBALL)
summary(TBL_BASEBALL)
TBL_BASEBALL[1:5,] # select rows 1 through 5.
TBL_BASEBALL[,c("salary","n_runs","n_walks")] # use a LIST of columns
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5,] # boolean indexing
## libraries
install.packages("wordcloud") # download package
library(wordcloud) # call package into memory
text = "There are five main ways water vapor can be added to the air.
Increased vapor content can result from wind convergence over water or
moist ground into areas of upward motion. Precipitation or virga falling
from above also enhances moisture content. Daytime heating causes water
to evaporate from the surface of oceans, water bodies or wet land.
Transpiration from plants is another typical source of water vapor.
Lastly, cool or dry air moving over warmer water will become more humid.
As with daytime heating, the addition of moisture to the air increases
its heat content and instability and helps set into motion those processes
that lead to the formation of cloud or fog"
wordcloud(text,min.freq=1,random.order=FALSE)
## plotting & mapping with ggplot2
install.packages('ggplot2')
library(ggplot2)
p <- ggplot(TBL_BRANCHES, aes(mean_A_RAR, fill = Location))
p <- p + geom_density(alpha=0.2)
p <- p + theme_bw()
p + facet_grid(Location ~ State)
# on to new stuff...
# ------------------------------- #
# [2.3] aggregating data - like PROC SUMMARY
aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State), FUN=mean)
# multiple variables
aggcols = c('mean_FUM','mean_n_prods','mean_A_RAR')
aggregate(TBL_BRANCHES[aggcols], by=list(TBL_BRANCHES$State), FUN=mean)
# multiple functions
aggregate(TBL_BRANCHES[aggcols], by=list(TBL_BRANCHES$State),
FUN=function(x) c(mn = mean(x), md = median(x) ))
# custom aggregate functions - anything that would work on a vector
aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State),
FUN=function(x) interest=sum(x*0.18/12) )
# we can simplify syntax by loading formula into a variable
# ** virtually anything can be loaded into a variable in R **
f <- function(x) interest=sum(x*0.18/12)
aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State), FUN=f )
# You can write your aggregation into a dataframe for further manipulation, plots, etc
aggdf = aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State), FUN=f )
colnames(aggdf)
ggplot(data=aggdf, aes(x=Group.1, y=mean_FUM, fill=Group.1)) + geom_bar(stat="identity")
ggplot(data=aggdf, aes(x=Group.1, y=mean_FUM, fill=mean_FUM)) + geom_bar(stat="identity")
# ....................................... #
# - EXERCISE - #
# Aggregate branch data by Location:
# calculate mean branch customer FUM AND mean RAR
# ....................................... #
# ------------------------------- #
## [2.4] TRANSFORMING DATA
## creating a new field
TBL_BASEBALL['new'] = 1
TBL_BASEBALL['new']
TBL_BASEBALL['new'] = c(1,2) # error: vectors are different sizes
TBL_BASEBALL['new'] = log(TBL_BASEBALL['salary'])
TBL_BASEBALL['new']
TBL_BASEBALL['power'] = TBL_BASEBALL['n_homeruns'] + TBL_BASEBALL['n_stolenbases']
TBL_BASEBALL['power']
## linear transformations of scale
# say we have a skewed distribution...
hist(TBL_BASEBALL$salary)
hist(TBL_BASEBALL$salary, breaks=30)
summary(TBL_BASEBALL['salary'])
# yes, very skewed.
# we can transform it...
TBL_BASEBALL$log_salary = log(TBL_BASEBALL$salary)
hist(TBL_BASEBALL$log_salary, breaks=30)
# ...or we could band it:
## banding & categorising
# equal spaced cut points based on value field
cut(TBL_BASEBALL$n_runs, 5) # 5 groups
# [] brackets are inclusive
# Labels can be set- see help(cut)
# custom cut points
cuts = c(0,10,50,90)
cut(TBL_BASEBALL$n_runs, cuts)
# doing cuts based on # of records, eg deciles, quartiles
quantile(TBL_BASEBALL$batting_avg,0.5) # quantile returns the point at X%
quantile(TBL_BASEBALL$batting_avg,c(0.25,0.5,0.75)) # quantile returns the point at X%
cutpoints = quantile(TBL_BASEBALL$batting_avg,c(0,0.25,0.5,0.75,1.0)) # quantile returns the point at X%
cut(TBL_BASEBALL$batting_avg, cutpoints)
# deciles
(0:10) # remember this?
(0:10)/10 # 0 to 1 by 0.1, eg deciles
decpoints = quantile(TBL_BASEBALL$batting_avg,(0:10)/10)
decpoints
TBL_BASEBALL['batting_decile'] = cut(TBL_BASEBALL$batting_avg, decpoints, include.lowest=TRUE)
TBL_BASEBALL[,c('batting_decile','batting_avg')]
# now we can aggregate
aggcols = c('n_hits','n_runs','n_strikeouts','salary')
aggregate(TBL_BASEBALL[aggcols], by=list(TBL_BASEBALL$batting_decile), FUN=mean)
# ....................................... #
# - EXERCISE - #
## with TBL_BRANCHES
# create a new column which is sum of % migrants and % students
# aggregate the table to see average FUM in each FUM decile
# ....................................... #
## [2.5] Custom functions to transform data
# there's lots of ways to recode data...
TBL_BRANCHES$c_age <- ifelse(TBL_BRANCHES$mean_age > 45, c("old"), c("young"))
TBL_BRANCHES[,c('c_age','mean_age')]
# ...which I'll leave you to figure out
# let's look at one advanced method: custom functions
# define an R function and apply it:
fn <- function(x) x/1000
fn
fn(2000)
# use 'sapply' to apply across a vector
sapply(TBL_BRANCHES[,'mean_FUM'],fn)
TBL_BRANCHES[,'mean_FUM_k'] = sapply(TBL_BRANCHES[,'mean_FUM'],fn)
# functions can be complex, can contain if/then logic,
# can be used for categorising:
fn <- function(x) {
if(x<20) {'young'}
else if(x<45) {'mid age'}
else {'old'}
} # curly brackets allow the function to run over many lines
fn(40)
sapply(TBL_BRANCHES[,'mean_age'], fn)
# ....................................... #
# ------------------------------- #
# CHAPTER 6: EASY DATA MINING
# ------------------------------- #
## [6.1] regression: linear regression
plot(TBL_BASEBALL$n_homeruns, TBL_BASEBALL$salary)
lm.fit = lm(salary~n_homeruns, data=TBL_BASEBALL)
lm.fit
abline(lm.fit) # overlays regression line onto chart
summary(lm.fit)
plot(lm.fit) # various residuals and goodness-of-fit plots
# we can easily predict back onto another dataset using predict()
TBL_BASEBALL['salary_predict'] = predict(lm.fit, TBL_BASEBALL)
TBL_BASEBALL[,c('salary','salary_predict')]
TBL_BASEBALL[231,]
# if you want to use ALL variables, here's a shorthand:
lm(salary~., data=TBL_BASEBALL)
# lm can be extended into complex linear models, polynomials, etc:
lm.fit = lm(salary~n_homeruns + n_errors * n_strikeouts + I(n_hits^2),
data=TBL_BASEBALL)
summary(lm.fit)
# use glm() for generalised linear models, eg logistic
## [6.2] unsupervised learning: k-means clustering
# what if we automatically cluster branches together using
# MOSAIC factors? Do we get anything useful or meaningful?
ggplot(TBL_BRANCHES, aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity)) +
geom_point()
# this way makes it clearer where dense areas are:
ggplot(TBL_BRANCHES, aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity)) +
geom_point(alpha=0.4, size=5, colour='blue')
# use kMeans clustering to find 2 clusters
km.out = kmeans(TBL_BRANCHES[,c('MOSAIC_prosperity','MOSAIC_cultural_diversity')], 2)
km.out
TBL_BRANCHES$cluster = km.out$cluster # write clusters back to dataset
# plot
ggplot(TBL_BRANCHES,
aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity,
colour=as.factor(cluster))) +
geom_point(alpha=0.4, size=5)
# Clustering has done a pretty good job of automatically detecting
# metro vs. regional branches:
ggplot(TBL_BRANCHES,
aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity, colour=Location)) +
geom_point(alpha=0.4, size=5)
## [6.3] classification: kNN classification
# k-nearest neighbour
colnames(TBL_BRANCHES)
# make three income bands: low, medium, high:
cutpoints = quantile(TBL_BRANCHES$Mean_monthly_net_income, c(0,0.33,0.66,1))
TBL_BRANCHES$band_income = cut(TBL_BRANCHES$Mean_monthly_net_income, cutpoints,
labels=c(1,2,3), include.lowest=TRUE)
TBL_BRANCHES$band_income
# first 2 MOSAIC factors are decent discriminators:
ggplot(TBL_BRANCHES,
aes(x=MOSAIC_prosperity, y=MOSAIC_dependants, colour=band_income)) +
geom_point(size=4, alpha=0.5)
# use kNN to predict back onto our dataset:
install.packages('class')
library(class)
X = TBL_BRANCHES[,c('MOSAIC_prosperity','MOSAIC_dependants')]
y = TBL_BRANCHES[,'band_income']
TBL_BRANCHES$income_predicted = knn(train = X, # Dataset to learn from
test = X, # dataset to predict
cl = y, # target variable to learn
k=10) # number of neighbours to use
ggplot(TBL_BRANCHES,
aes(x=MOSAIC_prosperity, y=MOSAIC_dependants, colour=income_predicted)) +
geom_point(size=4, alpha=0.5)
# we can draw a decision boundary plot
# first we create a 100x100 grid that spans the range of two variables
x1lim = range(TBL_BRANCHES$MOSAIC_prosperity) # gets min and max of variable
x2lim = range(TBL_BRANCHES$MOSAIC_dependants)
x1range = seq(x1lim[1],x1lim[2],len=100) # creates a sequence of values
x2range = seq(x2lim[1],x2lim[2],len=100)
grid = data.frame(X1 = rep(x1range, 100), X2 = rep(x2range, each=100))
# use KNN to predict income band of each point in grid
grid$predict = knn(train = X, # Dataset to learn from
test = grid, # Dataset to predict
cl = y, # Target variable to learn
k=10) # number of neighbours to use
# plot decision grid, then overlay datapoints
g <- ggplot(data=grid, aes(X1, X2)) +
geom_point(aes(colour=predict), alpha=0.3) +
theme_bw()
g
g + geom_point(data=TBL_BRANCHES, aes(x=MOSAIC_prosperity,
y=MOSAIC_dependants,
colour=band_income), size=4)
# ....................................... #
# ------------------------------- #
# CHAPTER 5: CREATING HTML NOTEBOOKS OF YOUR ANALYSIS
# ------------------------------- #
# a small notebook with code & output you can email to anyone...
# Steps:
# run this
install.packages('knitr') # requires knitr package is installed
### COPY THIS INTO A NEW R SCRIPT IN RSTUDIO AND HIT THE
### 'COMPILE NOTEBOOK' BUTTON AT THE TOP RIGHT OF THE SCRIPT PANEL
#########################################################################################
#### Example of compiling HTML notebook
# For HTML notebook creation to work,
# code must run standalone from end to end
library(knitr)
library(RODBC)
library(ggmap)
library(ggplot2)
library(GGally)
# import data
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\"
FileName = 'branch profiles.xls'
File = paste(FileLoc,FileName,sep="")
print(File)
channel <- odbcConnectExcel(File)
TBL_BRANCHES <- sqlFetch(channel, "Sheet1",as.is=TRUE)
odbcClose(channel)
# summarise useful data by state
aggcols = c('mean_FUM','mean_n_prods','mean_A_RAR')
aggregate(TBL_BRANCHES[aggcols], by=list(TBL_BRANCHES$State), FUN=mean)
# plot average customer RAR histogram by branch location
ggplot(TBL_BRANCHES, aes(mean_A_RAR, fill = Location)) + geom_density(alpha=0.2) + theme_bw()
# plot matrix scatterplot of ...
ggpairs(data=TBL_BRANCHES[aggcols],
upper = list(continuous = "cor"),
lower = list(continuous = "smooth"),
diag = list(continuous="density"),
axisLabels='show')
#########################################################################################
# ....................................... #
# - EXERCISE - #
## DO [1] or [2]:
# as you go, remember to visualise your data
# [1] Data mining competition: predict baseball salary
# you might try:
# a linear model:
lm.fit = lm(salary~., data=TBL_BASEBALL)
predict(lm.fit, TBL_BASEBALL)
# a randomforest:
install.packages("randomForest")
library(randomForest)
predictors = c('n_homeruns','batting_avg','n_errors')
rf = randomForest(TBL_BASEBALL[predictors],
                  TBL_BASEBALL$salary,
                  ntree=500,
                  mtry=3)
predict(rf,TBL_BASEBALL)
# clustering to create derived variables
km.out = kmeans(TBL_BASEBALL[predictors], 2)
TBL_BASEBALL$cluster = km.out$cluster
# [2] predict branch customer profitability: mean_A_RAR
# you might try:
# a linear model:
colnames(TBL_BRANCHES)
lm.fit=lm(mean_A_RAR~mean_FUM + Mean_monthly_net_income +
mean_n_prods + pct_affluent_50, data=TBL_BRANCHES)
predict(lm.fit, TBL_BRANCHES)
# a randomforest:
install.packages("randomForest")
library(randomForest)
predictors = c('mean_FUM','Mean_monthly_net_income','mean_n_prods')
rf = randomForest(TBL_BRANCHES[predictors],
                  TBL_BRANCHES$mean_A_RAR,
                  ntree=500,
                  mtry=3)
predict(rf,TBL_BRANCHES)
# clustering to create derived variables
km.out = kmeans(TBL_BRANCHES[predictors], 2)
TBL_BRANCHES$cluster = km.out$cluster
# ....................................... #
# ------------------------------- #
# APPENDIX: useful things we skipped
# ------------------------------- #
## merging dataframes
# see rbind, cbind, merge
# here are a few examples: https://rhandbook.wordpress.com/tag/slicing-and-dicing-of-data-frame/
## Exporting data
# write.table(mydata,"c:/mydata2.txt", sep="\t")
## Generate random sequence of integers
# sample(20)
## Generate distributions
# ...
############# OPTIMAL BINNING BUCKETING ###############
# with decision tree
install.packages("party", lib="c:\\notbackedup")
library("party", lib='\\notbackedup')
?ctree
str(iris)
iris_ctree <- ctree(Species ~ Sepal.Length,
controls = ctree_control(minbucket=30, maxdepth=2), data=iris)
print(iris_ctree)
# tag node:
iris$predicted_value = predict(iris_ctree, newdata=iris, type = "node")
# template from a campaign dataset (F_RESPONSE, treenode and campaigndf are not defined in this script):
# prop.table(xtabs( ~ F_RESPONSE + treenode, campaigndf), 2)
# discussion of bucketing
# http://www.r-bloggers.com/package-party-conditional-inference-trees/
??ctree
## System & file management
objects() # workspace
getwd() # working directory
setwd("/mnt/resource/training3")
dataPath = "/mnt/resource/BigData/data"
bigDataPath = "/mnt/resource/BigData/BigData"
outdir = "output"
# Check & create directories
if(!file.exists(outdir)) dir.create(outdir)
# list files in directory
dir(dataPath)
dir(bigDataPath)
# example
##
setwd("/mnt/resource/training3")
dataPath = "/mnt/resource/BigData/data"
outdir = "output"
outPath = "output"
if(!file.exists(outdir)) dir.create(outdir)
bigDataPath = "/mnt/resource/BigData/BigData"
big.data.path = "/mnt/resource/BigData/BigData"
data.path = "/mnt/resource/BigData/data"
output.path = 'output'
sample.data.dir = rxGetOption("sampleDataDir")
## Read data
# from internet:
## Selecting & slicing
# which() = identify records meet a list of criteria
# which(mtcars$hp == max(mtcars$hp))
## also SQLDF
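# A quick sqldf illustration (assumes the sqldf package is installed);
# it lets you slice a data.frame with ordinary SQL:
library(sqldf)
sqldf("SELECT cyl, AVG(mpg) AS avg_mpg FROM mtcars GROUP BY cyl")
sqldf("SELECT * FROM mtcars WHERE hp > 200")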
# repeat something
replicate()
## sort and order
# sort() = sort a vector
# order() = return element order of a sorted vector
# for a dataframe rows use order: mtcars[order(mtcars$mpg, decreasing=TRUE), ]
## Merging
# use merge()
# inner join (only keep matches): merge(x,y,all=FALSE)
# outer join (keep all records): merge(x,y,all=TRUE)
# left or right join: merge(x,y,all.x=TRUE)
# BINDING = 1 to 1 rows = cbind()
# UNION ALL = stack, use rbind
## also SQLDF
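# A small worked example of the joins above (toy data; names are illustrative):
df_sales <- data.frame(branch = c("A", "B", "C"), sales = c(100, 200, 300))
df_state <- data.frame(branch = c("A", "B", "D"), state = c("VIC", "NSW", "QLD"))
merge(df_sales, df_state)                  # inner join on the common column 'branch'
merge(df_sales, df_state, all = TRUE)      # outer join: keeps A, B, C and D
merge(df_sales, df_state, all.x = TRUE)    # left join: keeps every row of df_sales
rbind(df_sales, data.frame(branch = "E", sales = 50))  # stack rows (UNION ALL)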
## regex and strings
# richest.nations$Amount <- gsub(",", "", richest.nations$Amount)
# richest.nations$Amount <- sapply(richest.nations$Amount, function(x) substr(x, 2, nchar(x)))
## connecting to databases, SQL
# note: SQLite = free, open-source SQL DB
library("RSQLite")
db = dbConnect(SQLite(), dbname = file.path(bigDataPath, "airlines.sqlite"))
dbListTables(db) # list all tables
dbListFields(db, "planes")
# run a query
query <- "SELECT count(*) FROM carriers"
dbGetQuery(db, query)
# more queries
query <- "SELECT * FROM airlines where Dest=\"DAL\""
query <- "SELECT * FROM carriers where Description LIKE \"American%\""
query <- 'SELECT * FROM carriers where Description LIKE "%American%"'
## Sequential reads: taking data one chunk at a time = fetch()
query <- "SELECT * FROM carriers"
res <- dbSendQuery(db, query)
x <- fetch(res, n = 1000) # then use rbind() to stack
str(x)
dbClearResult(res) # close connection
# Misc
# put brackets around an assignment to make it print
# eg (test = 2)
## Apply
# apply(df, MARGIN, FUN, ...)
# where Margin is a vector. 1 is rows, 2 is columns
# FUN = function
# eg. apply(mtcars, 2, mean) # mean for each column
## Pass parameters to the FUN in the apply syntax
# apply(mtcars, 2, quantile, c(0.25,.75))
# 25th and 75th percentiles
## Other applies
# lapply() = to each element of a list
## by()
# split dataframe into defined groups, then apply function
# eg. by(mtcars, mtcars$cyl, colMeans) # colMeans for each factor of cyl
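# Quick runnable illustrations of the apply family (all on the built-in mtcars):
apply(mtcars, 2, mean)                       # column means
apply(mtcars, 1, max)                        # row maxima
lapply(list(a = 1:5, b = 6:10), sum)         # sum each element of a list (returns a list)
sapply(list(a = 1:5, b = 6:10), sum)         # same, simplified to a vector
by(mtcars[, c("mpg", "hp")], mtcars$cyl, colMeans)  # group-wise column means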
## tables & xtabs
xtabs(~gear + cyl, mtcars)
aggregate()
# Note: write your OWN function to return multiple aggregates if you want multiple
# repeat
replicate()
#### RevoR & Teradata
# xdf seems to be distributed form of dataframe
### data munging
## library(dplyrXdf)
## filter, select, distinct, transmute, mutate, arrange, rename,
## Summarise
# bankSmry <- group_by(bankXdf, marital, housing) %>%
# summarise(mean_age=mean(age), mean_balance=mean(balance), .method=3)
## doXdf
# bankModels <- group_by(bankXdf, housing) %>%
# doXdf(model=rxLogit(y ~ job + marital + education, data=.))
outdir = "~"
cars.mod = cars
save(cars.mod, file=file.path(outdir, "cars_mod.RData"))
## paste() = concatenate strings; can include a sep argument
paste("row",1:5,sep="_")
## boolean
# is.na() for missing values
# x[is.na(x)] = 412
## Vectors
letters[-c(1:5)] # drop the first 5 letters
# giving elements names - can access with names
x = 1:5
names(x) = letters[1:5]
x["c"]
## Matrices
# Assign ROW and COLUMN names, then you can query on them
# mat[c('a','b'),c('f','g')]
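# quick runnable version of the idea above:
mat <- matrix(1:6, nrow = 2, dimnames = list(c("a", "b"), c("f", "g", "h")))
mat[c("a", "b"), c("f", "g")]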
## Dataframes
# str()
# summary()
# data.frame() = create
## Vectorising
# ifelse(condition, ifvalue, elsevalue)
# eg. mtcars$type = ifelse(mtcars$hp>mean(mtcars$hp),'highhp','lowhp')
# Scatterplot matrix: use pairs()
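# e.g. a scatterplot matrix of a few mtcars columns:
pairs(mtcars[, c("mpg", "hp", "wt")])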
## R Functions
test <- function(a, b) return(a + b)
test
function.name <- function(arguments) {
# everything the function does in here
}
## Modelling
# Where there is a big spike at 0 followed by a smaller skewed distribution
# --> use a Tweedie distribution with power 1.5 = 'compound Poisson-gamma' = a zero spike (Poisson part) plus a gamma
### ggvis
library(ggvis)
mtcars %>% ggvis(~wt) %>% layer_histograms()  # histogram of car weight
#### RevoR
# xdf = "external data frame"
## import
args(rxImport)
colClasses <- c("integer", rep("factor", 4), "integer", rep("factor",3), "integer", "factor", rep("integer", 4), rep("factor", 2))
names(colClasses) <- c("age", "job", "marital", "education", "default","balance", "housing", "loan", "contact", "day", "month", "duration","campaign", "pdays", "previous", "poutcome", "y")
BankXDF <- rxImport(inData = infile, outFile = outfile, colClasses = colClasses,rowsPerRead = 10000, overwrite = TRUE)
# from text: RxTextData()
## Info & statistics
rxGetInfo(inADS, getVarInfo = TRUE)
rxGetInfo(bankXDF, getVarInfo = TRUE, numRows = 6)
rxGetVarInfo() # names, max, min
rxGetVarNames() # variable names
rxSummary()
rxSummary(~default + balance, data = BankDS)
BankSummary2 <- rxSummary(balance ~ F(age), data = BankDS)
# quantiles
args(rxQuantile)
# crosstabs
args(rxCrossTabs)
rxCrossTabs(balance ~ F(age, low = 20, high = 30):F(campaign, low = 1, high = 5), data = BankDS)
#### manipulation & transformation
## use rxDataStep to do transforms, drop, keep, etc
args(rxDataStep)
BankSubDS <- rxDataStep(inData = origBankSubDS, outFile = outfile,
transforms = list(logBalance = log(newBalance)), overwrite = TRUE)
rxDataStep(inData = BankDS, outFile = BankDS, transforms = list(newBalance = balance + 8019 + 1), overwrite = TRUE)
## using custom functions
manualFactorRecoding <- function(dataList) {
dataList[["F_n.devices"]] <- factor(dataList[["n.devices"]], levels = seq(1,
10))
return(dataList)
}
ChurnDS <- rxDataStep(inData = origChurnDS, outFile = churnoutfile,
transformFunc = manualFactorRecoding, overwrite = TRUE)
## creating factors - use factorise() or F() inside functions
bankXdf2 <- factorise(bankXdf, age)
rxFactors(inData = ChurnDS, outFile = ChurnDS, varsToKeep = rxGetVarNames(origChurnDS),
factorInfo = list(F_n.devices2 = list(levels = seq(1:10), varName = "n.devices")),
overwrite = TRUE)
## subset cols
BankDS <- rxDataStep(inData = origBankDS, outFile = outfile, varsToKeep = c("balance",
"age", "y", "poutcome"), overwrite = TRUE)
## subset rows
BankDS.rs <- rxDataStep(inData = origBankDS, outFile = outfile.rs,
varsToKeep = c("balance", "age", "y", "poutcome"), rowSelection = age < 21, overwrite = TRUE)
## Sort: rxSort
## Dedupe
BankDS2 <- rxSort(inData = BankDS, outFile = outfile2, sortByVars = c("balance",
"age"), decreasing = c(FALSE, TRUE), removeDupKeys = TRUE, dupFreqVar = "Dup_Count",
overwrite = TRUE)
#### manipulation with dplyrXdf
### can use:
## filter, select, distinct, transmute, mutate, arrange, rename,
## group_by, summarise, do
## left_join, right_join, full_join, inner_join
## these functions supported by rx: sum, n, mean, sd, var, min, max
bankSmry <- group_by(bankXdf, marital, housing) %>%
summarise(mean_age=mean(age), mean_balance=mean(balance))
## Plots
rxHistogram(~logBalance, data = BankSubDS, xNumTicks = 15)
# scatterplot
rxLinePlot(newBalance ~ age, type = "p", data = BankDS)
## Linear models
inADS <- RxXdfData(file.path(sample.data.dir, "AirlineDemoSmall.xdf"))
rxGetInfo(inADS, getVarInfo = TRUE)
# create model
model <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = inADS)
model
model
coef(model)
names(model) # components of model object
# predict
'?'(rxPredict)
airline.xdf <- file.path(output.path, "airline.xdf")
rxPredict(modelObject = model, data = inADS, outData = airline.xdf,
writeModelVars = TRUE, computeResiduals = TRUE, overwrite = TRUE)
rxGetVarInfo(airline.xdf)
# view residuals
rxHistogram(formula = ~ArrDelay_Resid, data = airline.xdf)
# zoom in
rxHistogram(formula = ~ArrDelay_Resid, data = airline.xdf, numBreaks = 100, startVal = -100, endVal = 100)
## example
bankXDF <- file.path(data.path, "BankXDF.xdf")
rxGetInfo(bankXDF, getVarInfo = TRUE, numRows = 6)
model <- rxLinMod(balance ~ age + duration + marital + housing, data = bankXDF)
model
test = rxPredict(modelObject = model, data = bankXDF, computeResiduals = TRUE, writeModelVars = TRUE, overwrite=TRUE)
rxGetVarInfo(test)
## Optional: estimating standard errors
model <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = inADS, covCoef = TRUE)
model$covCoef
# append to output
rxPredict(modelObject = model, data = airline.xdf, outData = airline.xdf,
computeStdErrors = TRUE, overwrite = TRUE)
rxGetVarInfo(airline.xdf)
# calculate confidence intervals
rxPredict(modelObject = model, data = airline.xdf, outData = airline.xdf,
computeStdErrors = TRUE, interval = "confidence", overwrite = TRUE)
rxGetVarInfo(airline.xdf)
### model Validation: split test & train
# Construct a variable to identify training data
set.seed(100)
rxDataStep(inData = airline.xdf,
outFile = airline.xdf,
transforms = list(
TrainValidate = factor(
ifelse(rbinom(length(DayOfWeek), size=1, prob=0.8), "train", "validate")
)
),
overwrite = TRUE)
rxGetVarInfo(airline.xdf)
rxHistogram(formula = ~TrainValidate, data = airline.xdf)
# splitting into two datasets
# Build a training data set
splitDS <- rxSplit(inData = airline.xdf, outFilesBase = file.path(output.path,
"airline"), splitByFactor = "TrainValidate", overwrite = TRUE)
splitDS # two files are located in here
train <- splitDS[[1]]
validate <- splitDS[[2]]
# Build a model on the training data
model <- rxLinMod(formula = ArrDelay ~ DayOfWeek + F(CRSDepTime),
data = train)
summary(model)
# use predict function to retrieve residuals from model object
rxPredict(modelObject = model, data = validate, outData = validate,
computeResiduals = TRUE, predVarNames = "ArrDelay_PredOutOfSample",
overwrite = TRUE)
rxGetInfo(validate, getVarInfo = TRUE, numRows = 5)
# correlation matrix - correlate actual & predicted
rxCor(formula = ~ArrDelay_PredOutOfSample + ArrDelay, data = validate)
### kfold cross validation
k = 10;
rxDataStep(inData = airline.xdf,
outFile = airline.xdf,
transforms = list(
kSplits = factor(sample(LETTERS[1:k], size = .rxNumRows, replace = TRUE))
),
transformObjects = list(LETTERS = LETTERS, k = k),
append = "rows",
overwrite = TRUE)
rxHistogram(~kSplits, data = airline.xdf)
# split into k-files
kSplits <- rxSplit(inData = airline.xdf, outFilesBase = file.path(output.path,
"airline"), splitByFactor = "kSplits", overwrite = TRUE)
# write a function
myLinModWrapper <- function(holdoutlevel, splitFiles) {
## first, estimate the model on all data point but those including
## holdoutlevel
print(holdoutlevel)
myMod <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = airline.xdf,
rowSelection = kSplits != holdout, transformObjects = list(holdout = holdoutlevel))
## Then, generate predictions
curHoldOut <- grep(paste("kSplits", holdoutlevel, "xdf", sep = "."),
names(splitFiles), value = TRUE)
rxPredict(myMod, data = splitFiles[[curHoldOut]], overwrite = TRUE,
predVarNames = "ArrDelay_kFold_Pred")
## Generate cross-validation metric for this hold out
curCor <- rxCor(~ArrDelay + ArrDelay_kFold_Pred, data = splitFiles[[curHoldOut]])
return(curCor["ArrDelay", "ArrDelay_kFold_Pred"])
}
# estimate all holdout samples
allCors <- vapply(LETTERS[1:k], myLinModWrapper, splitFiles = kSplits,
FUN.VALUE = numeric(1))
# visualise with a dotplot (dotplot() is a lattice function)
library(lattice)
dotplot(allCors, cex = 2, scales = list(y = list(cex = 1), x = list(cex = 1.5)),
xlab = list("Out-of-Sample Correlation", cex = 2), ylab = list("Hold out Sample",
cex = 2), panel = function(x, y, ...) {
panel.dotplot(x, y, ...)
panel.abline(v = mean(x), lty = 2, lwd = 3)
})
#### Model selection, stepwise, etc
## rxStepControl()
inADS <- RxXdfData(file.path(sample.data.dir, "AirlineDemoSmall.xdf"))
rxGetInfo(inADS, getVarInfo = TRUE)
# fit simple model, main effects only
model.main <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = inADS)
length(coef(model.main))
summary(model.main)
# fit model with interactions
model.interaction <- rxLinMod(ArrDelay ~ DayOfWeek * F(CRSDepTime),
data = inADS, dropMain = FALSE)
length(coef(model.interaction))
summary(model.interaction)
## comparing the models
# AIC = 2k - 2ln(L)   [k = number of parameters, L = maximised likelihood]
model.main$aic
model.interaction$aic
# compare relative likelihood
exp((model.interaction$aic - model.main$aic)/2)
## or do f test
Df <- c(model.interaction$df[1] - model.main$df[1], model.interaction$df[2])
SS <- c((model.main$residual.squares - model.interaction$residual.squares),
model.interaction$residual.squares)
MS <- SS/Df
fVal <- c(MS[-length(MS)]/MS[length(MS)], NA)
pVal <- pf(fVal, df1 = Df[-length(Df)], df2 = Df[length(Df)], lower = FALSE)
anova.tbl <- data.frame(Df, SS, MS, fVal, pVal)
dimnames(anova.tbl) <- list(c("modelDiff", "Residuals"), c("Df", "Sum Sq",
"Mean Sq", "F value", "Pr(>F)"))
stats:::print.anova(anova.tbl)
## automatic model selection
args(rxLinMod)
args(rxStepControl)
#method: method for performing model selection
#scope: the range of models to explore (see below)
myStepControl <- rxStepControl(method = "stepwise", scope = list(lower = ~DayOfWeek +
F(CRSDepTime), upper = ~DayOfWeek * F(CRSDepTime)), test = "F")
# define starting model and then run
model.step1 <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), variableSelection = myStepControl,
data = inADS, dropMain = FALSE)
# model sequence
model.step1$anova
summary(model.step1) # final model
## Another example
bigger.ADS <- file.path(big.data.path, "AirOnTime7Pct.xdf")
rxGetInfo(bigger.ADS, getVarInfo = TRUE)
mySel <- rxStepControl(method = "stepwise", scope = ~DayOfWeek * F(CRSDepTime) +
DepDelay + TaxiOut + UniqueCarrier + Distance)
model.complex1 <- rxLinMod(ArrDelay ~ 1, data = bigger.ADS, variableSelection = mySel)
summary(model.complex1)
model.complex1$anova
### LOgistic regression
args(rxLogit)
mortgages <- file.path(sample.data.dir, "mortDefaultSmall.xdf")
rxGetVarInfo(mortgages)
myformula <- formula(default ~ F(year) + ccDebt + creditScore + houseAge +
yearsEmploy)
model = rxLogit(myformula, data=mortgages)
# predict for some new mortgages
new_mortgages <- data.frame(year = rep(c(2006, 2009), each = 4), ccDebt = rep(c(1000,
10000), 4), creditScore = rep(c(700, 800), 4), houseAge = rep(c(1,
5, 10, 20), 2), yearsEmploy = rep(7, 8))
str(new_mortgages)
rxPredict(modelObject=model, data=new_mortgages)
#### GLMs - have a family argument: logic, probit, log...
# rxGlm()
args(rxGlm)
census2000 <- file.path(big.data.path, "Census5PCT2000.xdf")
census_insurance <- file.path(output.path, "Census5PCT2000_insurance.xdf")
if (!file.exists(census_insurance)) {
rxDataStep(inData = census2000, outFile = census_insurance, rowSelection = (related ==
"Head/Householder") & (age > 20) & (age < 90), varsToKeep = c("propinsr",
"age", "sex", "region", "perwt"), blocksPerRead = 1, overwrite = TRUE)
}
# --> this is about 10 GB
rxGetVarInfo(census_insurance)
p <- rxSummary(~region, data = census_insurance)
print(p, head = FALSE)
rxHistogram(~propinsr, data = census_insurance, pweights = "perwt")
propinGlm <- rxGlm(propinsr ~ sex + F(age) + region, pweights = "perwt",
data = census_insurance, family = rxTweedie(var.power = 1.5),
dropFirst = TRUE)
p <- summary(propinGlm)
print(p, head = FALSE)
outData <- file.path(output.path, "census_insurance_pred.xdf")  # output file for predictions (file name illustrative)
rxPredict(propinGlm, data = census_insurance, outData = outData, overwrite = TRUE)
rxGetInfo(outData, getVarInfo = TRUE, numRows = 5)
rxHistogram(~propinsr_Pred, data = outData, pweights = "perwt")
### logistic with rxGLM
propinGlm <- rxGlm(default ~ ccDebt + F(year) + creditScore + houseAge + yearsEmploy,
data = mortgages, family = binomial())
#### Decision trees
# rxDTree(formula, data, ...)
# control: maxDepth, minSplit, minBucket, maxNumBins, cost,
# outFile = write out terminal node for each observation
# parms: loss - specify loss matrix
infile <- file.path(data.path, "BankXDF.xdf")
BankDS <- RxXdfData(file = infile)
myBankDS <- file.path(output.path, "mybank.xdf")
Tree <- rxDTree(housing ~ age + balance, data = BankDS, cp = 0.001)
# cp = 'complexity parameter' = the minimum improvement in fit a split must achieve to be kept
# visualise
library(RevoTreeView)
plot(createTreeView(Tree))
### Regression tree
RTree <- rxDTree(housing_num ~ age + balance, data = BankDS, outFile = myBankDS,
writeModelVars = TRUE, overwrite = TRUE, extraVarsToWrite = "housing_num",
transforms = list(housing_num = housing == "yes"), cp = 0.005)
plot(createTreeView(RTree))
# using loss matrix in parms
curlvls <- rxGetVarInfo(BankDS, varsToKeep = "housing")[["housing"]]$levels
(loss.mat <- matrix(c(0, 1.5, 1, 0), ncol = 2, dimnames = list(actual = curlvls,
predicted = curlvls)))
## predicted
## actual yes no
## yes 0.0 1
## no 1.5 0
TreeLossMat <- rxDTree(formula = housing ~ age + balance, data = BankDS,
parms = list(loss = loss.mat), cp = 0.001)
plot(createTreeView(TreeLossMat))
# predictions: rxPredict() - can have prob (probability for class), class (predicted terminal node), ...
args(RevoScaleR:::rxPredict.rxDTree)
rxPredict(Tree, data = BankDS, outData = myBankDS, overwrite = TRUE,
writeModelVars = TRUE)
rxGetVarInfo(myBankDS)
rxPredict(RTree, BankDS, myBankDS, overwrite = TRUE, writeModelVars = TRUE,
predVarNames = c("predicted_housingnum_dtree"))
## visualization - ROC/AUC
rxRocCurve(actualVarName = "housing_num", predVarNames = "predicted_housingnum_dtree",
data = myBankDS)
roc.df <- rxRoc(actualVarName = "housing_num", predVarNames = "predicted_housingnum_dtree",
data = myBankDS)
head(roc.df)
rxAuc(roc.df)
#### Randomforests
infile <- file.path(data.path, "BankXDF.xdf")
BankDS <- RxXdfData(file = infile)
Forest <- rxDForest(housing ~ balance + age, data = BankDS, seed = 10,
cp = 0.01, nTree = 50, mTry = 2, overwrite = TRUE)
Forest
# we can access each tree
t(all.imp <- vapply(Forest$forest, getElement, "variable.importance",
FUN.VALUE = numeric(2)))
# notes
# out-of-bag error: each tree is fit to a bootstrap sample (roughly 2/3 of the rows);
# the remaining ~1/3 of rows is run through that tree to estimate its error
## determining variable importance
# choose an impurity measure, like Gini Index
# calculate Gini Index before and after each split
# sum up Gini improvements for each variable
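# Toy illustration of the Gini improvement idea (hand-rolled, not the rx implementation):
gini <- function(y) { p <- prop.table(table(y)); 1 - sum(p^2) }
y <- factor(c("yes","yes","yes","no","no","yes","no","no"))
split_left  <- y[1:4]   # pretend a split sends the first 4 rows left...
split_right <- y[5:8]   # ...and the last 4 rows right
before <- gini(y)
after  <- (length(split_left)/length(y)) * gini(split_left) +
          (length(split_right)/length(y)) * gini(split_right)
before - after   # the Gini improvement credited to the splitting variable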
Forest <- rxDForest(housing ~ balance + age, data = BankDS, seed = 10,
cp = 0.01, nTree = 50, mTry = 1, maxDepth = 5, importance = TRUE,
overwrite = TRUE)
Forest$importance
rxVarImpPlot(Forest)
## prediction types: response, prob, vote
# category = majority prediction
# regression = average of predictions
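# Hedged sketch only: the 'type' argument name is an assumption about rxPredict
# for rxDForest objects (the type values themselves come from the notes above):
# rxPredict(Forest, data = BankDS, outData = myBankDS, type = "prob",
#           writeModelVars = TRUE, overwrite = TRUE)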
#### GOODNESS OF FIT MEASURES #####
rxRocCurve()
rxAuc(roc.df)
############ UNSUPERVISED LEARNING ##################
### Clustering
args(rxKmeans)
infile <- file.path(data.path, "BankXDF.xdf")
BankDS <- RxXdfData(file = infile)
mybankfile <- file.path(output.path, "mybankXDF.xdf")
bank_clusters <- rxKmeans(~duration + balance, data = BankDS, numClusters = 3,
outFile = mybankfile, writeModelVars = TRUE, extraVarsToWrite = "randSamp",
transforms = list(randSamp = sample(10, .rxNumRows, replace = TRUE)),
overwrite = TRUE)
bank_clusters
bank_clusters$centers
# plot 1
library(latticeExtra)
plt = rxLinePlot(balance ~ duration, data = BankDS, type = "p")
plt + as.layer(xyplot(balance ~ duration, data = as.data.frame(bank_clusters$centers),
col = "red"))
# plot 2
intern.km.df <- rxXdfToDataFrame(file = mybankfile, rowSelection = randSamp == 1)
plot(balance ~ duration, data = intern.km.df, col = intern.km.df$.rxCluster,pch = 19)
with(as.data.frame(bank_clusters$centers), points(duration, balance, col = "lightblue", pch = "x", cex = 2))
## Scaling variables
bank_summary <- rxSummary(~ duration + balance, data = BankDS)
bank_clusters <-
rxKmeans(~ duration_scl + balance_scl,
data = BankDS, writeModelVars = TRUE,
outFile = mybankfile, outColName = ".rxCluster_scl",
transforms = list(
duration_scl = (duration - duration_mean)/
duration_stdev,
balance_scl = (balance - balance_mean)/
balance_stdev),
transformObjects = list(
duration_mean=bank_summary$sDataFrame$Mean[1],
duration_stdev=bank_summary$sDataFrame$StdDev[1],
balance_mean=bank_summary$sDataFrame$Mean[2],
balance_stdev=bank_summary$sDataFrame$StdDev[2]),
numClusters = 3, overwrite = TRUE)
bank_clusters
bank_clusters$centers
# plot 1
unscaled_centers <- t(t(bank_clusters$centers) * bank_summary$sDataFrame$StdDev +
bank_summary$sDataFrame$Mean)
plt = rxLinePlot(balance ~ duration, data = BankDS, type = "p")
plt + as.layer(xyplot(balance_scl ~ duration_scl, data = as.data.frame(unscaled_centers),
col = "red", pch = 19))
# plot 2
intern.km.df2 <- rxXdfToDataFrame(file = mybankfile, rowSelection = randSamp ==1)
plot(balance ~ duration, data = intern.km.df2, col = intern.km.df2$.rxCluster_scl,pch = 19)
with(as.data.frame(unscaled_centers), points(duration_scl, balance_scl,col = "lightblue", pch = "x", cex = 2))
## How many clusters?
SS <- rep(NA, 20)
for (i in 2:20) SS[i] <- rxKmeans(~duration + balance, data = BankDS,
numClusters = i)$tot.withinss
plot(1:20, SS, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")
## k-means can be sensitive to starting centroid positions
## use numStarts to repeat the fit from multiple random starts and keep the best (quick base-R illustration below)
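# Quick base-R illustration of the same idea: nstart repeats the fit from
# several random starts and keeps the best solution (lowest within-cluster SS)
set.seed(1)
km_multi <- kmeans(scale(mtcars[, c("mpg", "hp")]), centers = 3, nstart = 20)
km_multi$tot.withinss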
########## PRINCIPAL COMPONENTS ##############
## no rx function: must be run locally
## Goal: Simple data reduction technique.
## Main idea: Sequentially create variables that are linearly
## independent (i.e. uncorrelated) that capture as much variance in the
## dataset as possible. Create the variables based on linear
## combinations of the variables already in your dataset.
# how to build our own in-db function
airDS <- RxXdfData(file.path(big.data.path, "AirOnTime7Pct.xdf"))
rxGetInfo(airDS, getVarInfo = TRUE)
# We will explore the relationship among Year, DepDelay, ArrDelay,
# TaxiOut, TaxiIn, CRSDepTime, and Distance.
vars2use <- c("Year", "DepDelay", "ArrDelay", "TaxiOut", "TaxiIn",
"CRSDepTime", "Distance")
rxGetVarInfo(airDS, varsToKeep = vars2use)
# Step 1. Compute the correlation matrix.
# rxCor() uses complete method – if any values are missing, then that entire row is dropped.
airformula <- as.formula(paste("~", paste(vars2use, collapse = "+")))
(cormat <- rxCor(airformula, data = airDS))
# Step 2. apply open-source princomp() function
# remember to use covmat parameter
args(stats:::princomp.default)
(myair.pca <- princomp(covmat = cormat))
summary(myair.pca)
plot(myair.pca)
# The loadings correspond to the eigen vectors (i.e. the linear
# combinations for each of the components).
myair.pca$loadings
# Now, apply: Create new factor scores associated with the first K principal components.
myAirXDF <- file.path(output.path, "myAirDSpca.xdf")
rxDataStep(airDS, outFile = myAirXDF, varsToKeep = vars2use, overwrite = TRUE,
removeMissings = TRUE)
rxGetInfo(myAirXDF, getVarInfo = TRUE)
# verify correlations
cormat2 <- rxCor(airformula, data = myAirXDF)
cormat - cormat2
# summarise dataset
summaryResults <- rxSummary(airformula, data = myAirXDF)$sDataFrame
curMeans <- setNames(summaryResults$Mean, summaryResults$Name)
curStd <- setNames(summaryResults$StdDev, summaryResults$Name)
# finally, create a transformfunc
# note: matrix multiplication
createPCAFactorVars <- function(mylist){
## scale the variables
scaledvars <- as.data.frame(
mapply(function(curvar, curmean, cursd){ scale(curvar, center = curmean, scale = cursd)},
mylist[curvars],
mymeans[curvars],
mysds[curvars]
)
)
colnames(scaledvars) <- paste0(curvars,"Z")
Components <- as.matrix(scaledvars) %*% mypca$loadings
return(c(mylist, as.list(scaledvars), as.list(as.data.frame(Components))))
}
# apply transformfunc
rxDataStep(myAirXDF,
outFile = myAirXDF,
varsToKeep = vars2use,
append = "cols",
overwrite = TRUE,
transformFunc = createPCAFactorVars,
transformObjects = list(curvars = vars2use, mymeans = curMeans,
mysds = curStd, mypca = myair.pca)
)
# let's look at it
(newSummary <- rxSummary(~., data = myAirXDF)$sDataFrame)
####### Elastic net / ridge regression / other variable selection... ############
# stepwise selection = too much chance: coefficients come out too large, results depend on the pre-specified model, and it overfits
# apply shrinkage or regularisation to reduce estimates
# - ridge
# - lasso (L1 constraint)
# basically: minimise[(lack of fit criterion) + (overfitting penalty)]
# Ridge: overfitting penalty = lambda * sum(Beta^2)
#        added to the lack of fit = sum((y - X %*% Beta)^2)
#        choose lambda with cross-validation, etc
#        disadvantage: coefficients shrink but always remain non-zero, so variables are never removed
# Lasso: as before, but the penalty = lambda * sum(|Beta|) (the L1 constraint)
#        <-- now coefficients can shrink to exactly zero and drop out, i.e. it does variable selection
#        disadvantages: can be volatile with correlated predictors (which one does it pick?)
#        and with wide data (n < p) it can keep at most n variables
# Elastic net = combines ridge and lasso
#        penalty = lambda * sum((1-alpha)/2 * Beta^2 + alpha * |Beta|)
# in R: use the glmnet & glmnetUtils packages (minimal sketch below)
# see slides in gmail
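# Minimal glmnet sketch of the above (assumes the glmnet package is installed;
# mtcars is used purely for illustration):
library(glmnet)
x <- model.matrix(mpg ~ . - 1, data = mtcars)   # predictor matrix (no intercept column)
y <- mtcars$mpg
fit_ridge <- glmnet(x, y, alpha = 0)     # alpha = 0 -> ridge penalty
fit_lasso <- glmnet(x, y, alpha = 1)     # alpha = 1 -> lasso penalty
fit_enet  <- glmnet(x, y, alpha = 0.5)   # 0 < alpha < 1 -> elastic net
# pick lambda by cross-validation, then look at which coefficients survive
cv_lasso <- cv.glmnet(x, y, alpha = 1)
coef(cv_lasso, s = "lambda.min")         # lasso sets some coefficients exactly to zero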
### Neural networks
# nnet(y ~ x1 + x2 + ..., data, size, linout, entropy, softmax, maxit, decay, skip, rang)
# size: number of hidden nodes
# linout: TRUE for a regression network (linear output unit), FALSE (logistic output) for classification
# entropy: TRUE for a classification network with a numeric 0/1 response, FALSE for regression
# softmax: TRUE for a classification network where y is a factor with several classes, FALSE otherwise
# maxit: maximum number of iterations
# decay: weight decay parameter
# skip: TRUE to use skip-layer connections (often a good idea)
# rang: initial range for weights (if random starting values used)
# (a small runnable example follows)
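# A small runnable nnet example (assumes the nnet package is installed):
library(nnet)
set.seed(1)
# classification: factor response, 3 hidden nodes (softmax is used automatically for >2 classes)
nn_class <- nnet(Species ~ ., data = iris, size = 3, decay = 0.01, maxit = 200)
table(predict(nn_class, iris, type = "class"), iris$Species)
# regression: linout = TRUE gives a linear output unit (in practice you would usually scale the inputs first)
nn_reg <- nnet(mpg ~ hp + wt, data = mtcars, size = 3, linout = TRUE, decay = 0.01, maxit = 500)
cor(predict(nn_reg, mtcars), mtcars$mpg)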
#############################################
##### CODE NOTES FROM SWINBURNE
#############################################
######## RANDOM VARIABLES #########
#### DISCRETE #####
# Your mark for this subject
X = c(30, 40, 50, 60, 70, 80)
# The probability of you getting these marks
P = c(0.3, 0.3, 0.05, 0.05, 0.1, 0.2)
# Checking the sum of probabilities equal to 1
sum(P)
### Calculating means and variance
# can be done vectorwise or matrix algebra
mean=sum(X*P)
mean
sum( (X-mean)^2*P )
# MATRIX
mean=t(X)%*%P
mean
####### UNIFORM ########
# Generate 10 uniform random numbers between 3 and 5.
runif(10, 3, 5)
# Generate 100000 uniform random numbers between 3 and 5.
X <- runif(100000,3,5)
# Look at its histogram.
hist(X)
# compute its mean using the formula.
(1/2)*(3+5)
# verify using the mean function.
mean(X)
# compute its variance using the formula.
(1/12)*(5-3)^2
# verify using the var function.
var(X)
########### NORMAL
# Generate 10,000 N(0,1) numbers (i.e., standard normal).
# no need to specify the mean and variance.
X<-rnorm(10000)
# Generate 10,000 N(30, 10^2) numbers (mean 30, sd 10).
X<-rnorm(10000,30,10)
# checking the histogram.
hist(X)
# checking the mean and variance.
mean(X)
var(X)
########## CHI
# Generate 10,000 Chi-squared numbers with 5 degrees of freedom.
X<-rchisq(10000,5)
# Look at its histogram.
hist(X)
# Verify that the mean and variance are indeed df and 2*df (here 5 and 10) respectively...
mean(X)
var(X)
##### F-DISTRIBUTION
# Generate 100 F numbers with df1 = 5, df2 = 2.
X<-rf(100,5,2)
# Look at its histogram.
hist(X)
############### BETA
## BETA
# mean = a/(a+b)
# median = qbeta(0.5,a,b)
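# quick numeric check with a = 2, b = 5:
a <- 2; b <- 5
a/(a + b)          # mean = 2/7, about 0.29
qbeta(0.5, a, b)   # median, about 0.26
curve(dbeta(x, a, b), from = 0, to = 1)  # the Beta(2,5) density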
############### BAYES EXAMPLES
### BETA BINOMIAL COIN TOSS - large sample (n = 100)
a <- 5
b <- 5
s <- 70 # which is y in our previous slides
f <- 30 # which is n – y in our previous slides
# The Prior
curve(dbeta(x,a,b),lty=3,lwd=4, col="red", ylim = range(0,10))
# The Likelihood
curve(dbeta(x,s+1,f+1),add=TRUE, lty=2,lwd=3, col="blue")
# The Posterior
curve(dbeta(x,a+s,b+f), from=0, to=1, add=TRUE, xlab="p",ylab="Density",lty=1,lwd=3, col="green")
legend(.1,8,c("Posterior","Likelihood","Prior"),lty=c(1,2,3),lwd=c(3,3,3),
col=c( "green", "blue", "red"))
### BETA BINOMIAL COIN TOSS - small sample
a <- 5
b <- 5
s <- 7 # which is y in our previous slides
f <- 3 # which is n - y in our previous slides
# The Prior
curve(dbeta(x,a,b),lty=3,lwd=4, col="red", ylim = range(0,5))
# The Likelihood
curve(dbeta(x,s+1,f+1),add=TRUE, lty=2,lwd=3, col="blue")
# The Posterior
curve(dbeta(x,a+s,b+f), from=0, to=1, add=TRUE, xlab="p",ylab="Density",lty=1,lwd=3, col="green")
legend(.1,4.5,c("Posterior","Likelihood","Prior"),lty=c(1,2,3),lwd=c(3,3,3),
col=c( "green", "blue", "red"))
# Posterior median (the 0.5 quantile of the Beta posterior)
qbeta(0.5, a + s, b + f)
# Bayesian 90% confidence interval
qbeta(c(0.05,0.95), a + s, b + f)
#### What if all ten tosses are heads?
a <- 5
b <- 5
s <- 10 # which is y in our previous slides
f <- 0 # which is n - y in our previous slides
# The Prior
curve(dbeta(x,a,b),lty=3,lwd=4, col="red", ylim = range(0,8))
# The Likelihood
curve(dbeta(x,s+1,f+1),add=TRUE, lty=2,lwd=3, col="blue")
# The Posterior
curve(dbeta(x,a+s,b+f), from=0, to=1, add=TRUE, xlab="p",ylab="Density",lty=1,lwd=3, col="green")
legend(.1,8,c("Posterior","Likelihood","Prior"),lty=c(1,2,3),lwd=c(3,3,3),
col=c( "green", "blue", "red"))
################ FITTING MAXIMUM LIKELIHOOD ###################
# Assume gaussian pdf for epsilon
# y = XB + epsilon, where epsilon ~ N(0, sigma^2)
# import US_Homicides_1990.csv
data <- read.csv(file.choose())
data<-data.matrix(data)
one <- rep(1,dim(data)[1]) # create constant
one <- data.matrix(one)
y <- data[,1]
y<-data.matrix(y)
X <- data[,(-1)]
X <- cbind(one,X)
X<- data.matrix(X)
# define log-likelihood function
# Note: SIGNS HAVE BEEN REVERSED as optim() only finds MINIMUM
lnL <- function(p)
(dim(data)[1]/2)*log(2*3.1415926)+(dim(data)[1]/2)*log(p[1])+(1/(2*p[1]))*(t(y-X%*%p[2:8])%*% (y-X%*%p[2:8]))
# Find maximum - NOTE: OPTIM finds MINIMUM, so we have to REVERSE all signs
out <- optim(c(rep(1,8)), lnL, hessian=TRUE, method = "BFGS")
beta <- matrix(out$par)
# Note: first value is variance --> ignore
# Second value is for intercept
# remainder are coefficients for parameters
stdev <- matrix(sqrt(diag(solve(out$hessian))))
t_beta <- beta/stdev # t-statistics
cbind(beta,t_beta) # bind coefficient estimate & t-statistics
# Interpret coefficients positive or negative
# Interpret t:
# Remember that you are controlling for all other variables
##### LME EXAMPLE FROM ASSIGNMENT 1 ############
# just estimating mean & SD
y=c(6.1,5.9,6.2,5.7,5.1,6.7,5.8,6.4,6.8,4.5,6.3,6.1,6.6,6.1,5.6,6.3,6.6,5.4,6.1,6.2,5.2,6.2,
6.3,6.7,6.5,5.8,5.4,4.8,6.8,6.3,6.5,6.2,6,5.8,5.2,6.3,6,6.1,4.8,6)
n = length(y)
## Define function
# note that we switch signs (minuses to pluses) to accommodate
# the optim() function, which _minimises_ by default
# p[1] will be the variance, p[2] will be the mean
lnL <- function(p)
(n/2)*log(2*pi*p[1]) + (1/(2*p[1]))*(t(y-p[2])%*%(y-p[2]))
out <- optim(c(rep(1,2)), lnL, hessian=TRUE, method = "BFGS")
# print variance
print(out$par[1])
# print mean
print(out$par[2])
############# LECTURE 4
#### Slides #2: "The bayesian computation problem" on
## Example: Waiting until a hit in baseball - using beta-binomial conjugates
# Note that observations are WAIT TIMES, but we are estimating p = probability of hit
# observations= wait time until hit baseball
# note that 0's = non-hits
baseball <- c(0,2,0,4,1,0,2,0,1,0,0,1,1,3,1,0,0,0,1,6,0,9,0,4,1,9,1,0,3,4,5,5,1,0,2,
4,0,4,0,3,2,1,0,1,3,7,0,3,1,2,14,4,0,1,6,1,10,1,2,0,1,0,4,5,0,7,3,1,2,1,2,1,2,2,
4,3,3,1,1,2,1,2,7,0,3,1,2,2,2,2,0,3,4,1,1,0,0,1,1,1,11,2,2,1,3,1,0,1,2,1,1,1,0,0,
2,0,10,1,2,2,1,1,3,1,1,0,0,1,0,1,0,1,1,0,1,0,0,0,2,1,4,5,5,0,0,0,0,2,0,8,5,2,11,
8,0,7,1,3,1)
n <- length(baseball) # [1] 159
s <- sum(baseball) # [1] 336
# prior = beta distribution
a = 44
b = 102
curve(dbeta(x,a,b), from=0.2, to=0.5,
xlab="p",ylab="Density",lty=1,lwd=3, main= "Prior")
# The likelihood function
# A function of p, defined as:
likelihood = function(p)
{
p^n*(1-p)^s
}
curve(likelihood, from=0.2, to=0.5, xlab="p", ylab="Density", main= "Likelihood Function")
abline(v=0.321, lty=2)
# POSTERIOR - using beta-binomial known conjugacy
a=44
b=102
n=159
s=336
A=n+a
B=s+b
curve(dbeta(x,A,B), from=0.2, to=0.5,
xlab="p",ylab="Density",lty=1,lwd=3, main= "Posterior Density Function (1)")
# --> note: dbeta here is a proper (normalised) density; the explicitly-defined posterior below is unnormalised but has the correct shape
## or alternatively we can define the posterior function explicitly
posterior = function(p)
{
p^(n+a-1)*(1-p)^(s+b-1)
}
#plot of the posterior density function
curve(posterior, from=0.2, to=0.5, xlab="p", ylab="Density",lty=1,lwd=3 , main= "Posterior Density Function (2)")
# Using the qbeta function, find the median of the posterior density of p.
a=44
b=102
n=159
s=336
A=n+a
B=s+b
posterior.median = qbeta(p = 0.5, A, B)
posterior.median
# Using the qbeta function, construct a 95% Bayesian interval estimate for p.
Bayesian_95Interval = qbeta(c(0.025,0.975), A, B)
Bayesian_95Interval
# Using the rbeta function, simulate 100 values from the posterior density of p.
a=44
b=102
n=159
s=336
A=n+a
B=s+b
sim.posterior=rbeta(100, A, B)
# Use the hist function on the simulated sample to display the posterior density.
hist(sim.posterior, col="gray" , main="Simulation for Posterior Density", xlab="p", freq=FALSE )
curve(dbeta(x,a,b), col="red", lty=2, add=TRUE)
curve(dbeta(x,A,B), col="blue", lty=2, add=TRUE)
## Use simulation to get mean & SD
# Using the simulated draws, approximate the mean and S.D. of the posterior density
mean(sim.posterior)
# very close to A/(A+B)
sd(sim.posterior)
# Using the simulated draws, construct a 95% Bayesian interval estimate.
sim_posterior_95Interval = quantile(sim.posterior, c(0.025,0.975))
sim_posterior_95Interval
#########################################################
######### Oct 1 - MARKOV CHAIN MONTE CARLO #############
#########################################################
#### EXAMPLE ####
# Demonstrate bayes
install.packages('LearnBayes', lib='c:\\notbackedup\\R\\')
library(LearnBayes, lib='c:\\notbackedup\\R\\')# a Bayesian package in R
## Define prior
# a sequence of mid-points
midpt <- seq(0.05, 0.95, by = 0.1) # a sequence of mid-points
# a sequence of "heights" for the prior density
# these are clearly not proper densities.
prior <- c(1, 5.2, 8, 7.2, 4.6, 2.1, 0.7, 0.1, 0, 0)
# Divide by their sum total, they now become proper densities.
prior <- prior/sum(prior)
# histprior is a function in LearnBayes package that plots histogram
curve(histprior(x,midpt,prior), from=0, to=1,ylab="Prior density",ylim=c(0,.3))
## now find posterior
s <- 4 # 4 heads (successes)
f <- 6 # 6 tails (failures)
# define "myfn" as the product of the likelihood and the prior
myfn <- function(x)
histprior(x,midpt,prior)*dbeta(x,s+1,f+1)
curve(myfn(x), from=0, to=1, ylab="Posterior density")
##--> SHAPE is BIZARRE, doesn't correspond to any nice, known distributions
# Not a proper density because doesn't sum to 1
## In this case, easy to solve
# we can just numerically evaluate area under curve to get normalising constant
pvals <- seq(0, 1, length=1000)
# evaluating "myfn" over the space of [0,1]
mean(myfn(pvals))
## --> this is our normalising constant --> 0.1725680
post <- function(x)
histprior(x,midpt,prior)*dbeta(x,s+1,f+1)/0.1725680
## simulate values to get mean, etc.
# 1000 posterior values evaluated at 0.001 intervals
pev <- post(pvals)
# the above "pev" is close but not entirely accurate due to rounding.
# standardize it
pev <- pev/sum(pev)
# sample WITH replacement from pvals using pev as probability input.
ps <- sample(pvals, replace = TRUE, prob = pev)
# The Mean of the simulated sample
mean(ps)
# The Median of the simulated sample
quantile(ps, 0.5)
# 90% credible interval (5th to 95th percentile)
quantile(ps, c(0.05,0.95))
### MARKOV CHAINS ####
# The sunny/rainy example
# for 1 day
P = matrix(c(0.90,0.50,0.10,0.50), 2,2)
# 2 days
P%*%P
# long term equilibrium - N days
N = 20
P = matrix(c(0.90,0.50,0.10,0.50),2,2)
P11 = matrix(0,N,1)
P12 = matrix(0,N,1)
P21 = matrix(0,N,1)
P22 = matrix(0,N,1)
P11[1] = P[1,1]
P12[1] = P[1,2]
P21[1] = P[2,1]
P22[1] = P[2,2]
Temp = P
for (i in 1:(N-1))
{
Temp = Temp%*%P
P11[i+1] = Temp[1,1]
P12[i+1] = Temp[1,2]
P21[i+1] = Temp[2,1]
P22[i+1] = Temp[2,2]
}
EP = Temp # this is the equilibrium probability
par(mfrow=c(2,2))
plot(P11)
plot(P12)
plot(P21)
plot(P22)
# notice the burn-in period? Must throw that away
## Eventually markov chain converges to EQUILIBRIUM DISTRIBUTION
## Equilibrium is what we're interested in,
# This is the distribution ignoring the conditionality of time steps
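# The equilibrium distribution can also be read off directly as the normalised
# left eigenvector of P for eigenvalue 1 (a quick check on the matrix-power result above):
P <- matrix(c(0.90, 0.50, 0.10, 0.50), 2, 2)
ev <- eigen(t(P))                               # left eigenvectors of P = eigenvectors of t(P)
pi_eq <- ev$vectors[, 1] / sum(ev$vectors[, 1])
pi_eq                                           # ~ (0.833, 0.167) = long-run P(sunny), P(rainy)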
#### MCMC #####
## Example of ages
dval <- c(200,180,170,165,190,195,184,183,150,157)
# log-prior = a function of mu. Note: log taken
lprior <- function(mu) dnorm(mu, 175, 100,log = TRUE)
# log-likelihood = sum of logs of normal probabilities across all observations
llike <- function(mu) sum(dnorm(dval, mean = mu, sd = 15, log = TRUE))
# log-posterior function - normalising constant not needed
lpost <- function(mu) lprior(mu) + llike(mu)
#now, do MCMC = simulate values and save if accepted
mu0 <- 180
s <- 10 # this controls the standard deviation of normal distribution from which "jumps" are drawn
n <- 10000 # number of steps
mus <- c(mu0, numeric(n))
for(i in 1:n)
{
# generating proposal value of mu = current value + random draw value
mup <- mus[i] + rnorm(1,0,s) # s is SD of the jump
p <- exp(lpost(mup) - lpost(mus[i])) # acceptance probability = ratio of posteriors
# either accept or reject the proposal
if(p < runif(1)) mus[i+1] <- mus[i] else mus[i+1] <- mup
}
# discard burn-in
musb <- mus[-(1:1001)] # discards burn-in values
# Examine our chain with the coda package
install.packages('coda', lib='c:\\notbackedup\\R\\')
library(coda, lib='c:\\notbackedup\\R\\')# a Bayesian package in R
musc <- as.mcmc(musb) # converts to MCMC object
rejectionRate(musc) # get rejection rate
# 0.53 = 53% of proposals rejected
## PLOT
# plot posterior
par(mfrow=c(2,1)) # 2 graphs stacked on top of each other
traceplot(musc)
densplot(musc)
summary(musc)
##################### END OCT 1 LEC ########################
##### OCT 8 LEC: MCMC ################
install.packages('ismev', lib='c:\\notbackedup\\R\\')
library(ismev, lib='c:\\notbackedup\\R\\')# a Bayesian package in R
install.packages('LearnBayes', lib='c:\\notbackedup\\R\\')
library(LearnBayes, lib='c:\\notbackedup\\R\\')# a Bayesian package in R
data(engine)
plot(engine$Corr,engine$Time)
# as corrosion increases, time to failure gets shorter
# We model as exponential distribution
# x = corrosion
# y = a * x ^b
# so, theta = (a,b)
# Exponential distributions are parameterised by a RATE parameter
# if we just use MLE:
# First specify the negative log-likelihood function (optim() minimises by default)
fn = function(p, engine)
{
n = nrow(engine) # number of observations
y = engine$Time # engine failure time
x = engine$Corrosion # corrosion as a predictor
-n*log(p[1]) - p[2]*sum(log(x)) + p[1]*sum(x^p[2] * y)
}
out = optim(c(1.16, 0), fn, method = "BFGS",
hessian = TRUE, engine = engine)
round(out$par,3)
round(sqrt(diag(solve(out$hessian))), 3)
plot(engine$Corr,engine$Time)
CorrLev = seq(0, 4.2, len=100)
lines(CorrLev, 1/(1.133* CorrLev^0.479)) # these are coefficients
# "The rate parameter of an exponential distribution is the reciprocal of the mean" - so we need to use reciprocal
# so if we predict rate, we take reciprocal to get mean
## The bayesian method
# define the log of the posterior
lpost <- function(p, engine)
{
y <- engine$Time
x <- engine$Corrosion
# the log of the joint prior is equal to the sum of log marginal
# Note: this is because we defined the priors as independent, so p(a,b) = p(a)*p(b)
lprior <- dnorm(p[1],0,100,log=TRUE) + dnorm(p[2],0,100,log=TRUE)
# the likelihood uses the exponential function
# note that the "a" coefficient is "exp(a)"
llike <- sum(dexp(y, rate = exp(p[1])*x^p[2], log = TRUE))
# the log posterior is the sum of log prior and log likelihood
lprior + llike
}
start <- c(log(1.16),0) # starting values. We've assumed p[1] is logged
# see next slide for details of gibbs()
ftmcmc <- gibbs(lpost, start, 100000, c(0.25,0.25), engine = engine)
# get the accept rate. remember, we want this to be ~50-60%
ftmcmc$accept
# get the sample chain
ftmcmc$par
hist(ftmcmc$par[,1]) # histogram of the first parameter (remember, we are sampling log(a))
hist(exp(ftmcmc$par[,1]))
# get means
mean(exp(ftmcmc$par[-(1:10000),1]))
mean(ftmcmc$par[-(1:10000),2])
########### ELICITATION
quantile1 <- list(p=.05,x=0.25) # 5%tile at x = 0.25
quantile2 <- list(p=.95,x=0.75) # 95%tile at x = 0.75
# select parameters for a Beta distribution with the above quantiles.
beta.select(quantile1,quantile2)
########## COIN SELECTION EXAMPLE
# drawing pennies from jar then tossing
# each penny could have different probability of landing on heads
# But we know it has a uniform distribution from 0-1
# we select a coin at random, flip it 10 times, get 3 heads
# Q. we toss it 12 more times, what is p(4 heads)?
# two parts
# p(heads) for chosen coin
# p(4 heads | 12 tosses) for that coin
# So what is joint density?
# {see slides}
random.coin.gibbs = function (p=0.5, m = 1000)
{
S = matrix(0,m,2)
dimnames(S)[[2]] = c("p","y")
for (j in 1:m)
{
y = rbinom(1, size=12, prob=p)
p = rbeta(1, y+4, 20-y)
S[j,] = c(p,y)
}
return(S)
}
sim.values = random.coin.gibbs()
plot(sim.values)
# p(y=4)
table(sim.values[,"y"])
# reading off the table: p(y=4) is approximately (number of 4's) / 1000, the total simulation size
# p:
mean(sim.values[,"p"])
################################################################
### PART 2 of course ###########################################
################################################################
#############################################
##### MARKOV CHAIN ######
#############################################
## Finding long-term steady-state
transmat<-matrix(c(0.9, 0.2, 0.1, 0.8), nrow=2, byrow=TRUE)
s0<-matrix(c(3000,4000), nrow=2)
s2<-transmat%*%transmat%*%s0
steady50<-transmat
for(i in 1:50)
steady50<-transmat%*%steady50
steady50
steady51<-transmat
for(i in 1:51)
steady51<-transmat%*%steady51
steady51
#### Generate sample for Gibbs
n<-50 #Data
x<-rnorm(n,5,2) # mean is 5 and sd=2
N<-1000 #Run MC=2 chains of length N=1000
simmat<-matrix(0, ncol=2, nrow=N) # store the results
for (j in 1:2) { #Loop over chains
tau<-rgamma(1,n/2,1/2) #Starting value for tau
for (i in (1:N)) { #Gibbs iterations
mu<-rnorm(1,mean(x),sqrt(1/(tau*n))) #Update mu
tau<-rgamma(1,n/2,sum((x-mu)^2)/2) #Update tau
simmat[i,1]<-mu #Save results
simmat[i,2]<-tau
}
}
par(mfrow=c(2,2))
hist(simmat[,1])
hist(simmat[,2])
plot(simmat[,1], type="l")
plot(simmat[,2], type="l")
##### Metropolis algorithm
## example with uniform distribution
metropolis1<-function(n, a, b)
{
sim<-numeric(n)
sim[1]<-x<-0
for (i in 2:n) {
prob=runif(1,a,b)
cand=x+prob
ratio=min(1,dnorm(cand)/dnorm(x))
u=runif(1)
if (u < ratio)
x=cand
sim[i]=x
}
sim
}
par(mfrow=c(1,1))
res<-metropolis1(1000,-1, 1 )
plot(res, type="l", col="dark red")
####### Metropolis-Hastings
## same idea as above, but working on the log scale (log-likelihoods) for numerical stability
metropolis2<-function(n, a, b) {
  sim<-numeric(n)
  sim[1]<-x<-0
  loglikelihood.old<-dnorm(x, log=TRUE)
  for (i in 2:n) {
    cand<-x+ runif(1,a,b)
    loglikelihood.new<-dnorm(cand, log=TRUE)
    ratio<-min(1, exp(loglikelihood.new - loglikelihood.old))
    u<-runif(1)
    if (u < ratio){
      x<-cand
      loglikelihood.old <- loglikelihood.new  # keep the accepted candidate's log-likelihood
    }
    sim[i]<-x
  }
  sim
}
res<-metropolis2(1000,-1, 1 )
plot(res, type="l", col="dark red")
#### Generalised Extreme Value distribution example
# log-likelihood function
# theta is 3-value vector
loglikelihood<-function(theta, mdata){
mu<-theta[1]
shape<-theta[2]
sigma<-theta[3]
n<-length(mdata)
zz<-1+shape*(mdata-mu)/sigma
# if any zz <= 0 the parameters are outside the GEV support,
# so return a very large negative log-likelihood (we cannot take the log of a non-positive value)
if(any(zz<=0)) {
return(-1e16)
}
llikelihood<- -log(sigma)-(1+1/shape)*log(zz)-(zz)^(-1/shape)
sum(llikelihood)
}
# mcmc function
# std is for accept/reject values
# if STD is TOO SMALL then we'll accept everything
# we want 40-60% accept rate
mcmcevd<- function(nn, init,std, mdata, burn=0) {
theta<-init
simmat<- matrix(NA,nn+1, ncol=length(theta))
colnames(simmat) <- c("loc","shape","scale")
simmat[1,]<-init
acc.counts<-numeric(3)
llikelihood.old<-loglikelihood(theta, mdata)
for(i in 1:nn){
logposterior.old<-llikelihood.old+dnorm(theta[1], sd=100, log=TRUE) #Prior and logLikelihood
# Proposal value for location
loc.old<-theta[1]
loc.new<-rnorm(1,loc.old,std[1]) #Simulating a value from a normal distribution
theta[1]<-loc.new
llikelihood.new<-loglikelihood(theta, mdata)
logposterior.new<-llikelihood.new+dnorm(theta[1], sd=100, log=TRUE) #Prior and logLikelihood
##DECISION WHETHER REJECT/ACCEPT
alpha<- min(exp(logposterior.new-logposterior.old),1) # remove log
u<-runif(1,0,1) # generate new value
if(alpha > u) # decision
{
simmat[(i+1),1]<-loc.new
acc.counts[1]<-acc.counts[1]+1
llikelihood.old<-llikelihood.new
}
else
simmat[(i+1),1]<-loc.old
theta[1]<-simmat[(i+1),1]
logposterior.old<-llikelihood.old+dnorm(theta[2], sd=100, log=TRUE) #Prior and logLikelihood
# Proposal value for shape
shape.old<-theta[2]
shape.new<-rnorm(1,shape.old,std[2]) #Simulating a value from a normal distribution
theta[2]<-shape.new
llikelihood.new<-loglikelihood(theta, mdata)
logposterior.new<-llikelihood.new+dnorm(theta[2], sd=100, log=TRUE) #Prior and logLikelihood
##DECISION WHETHER REJECT/ACCEPT
alpha<- min(exp(logposterior.new-logposterior.old),1)
u<-runif(1,0,1)
if(alpha > u)
{
simmat[(i+1),2]<-shape.new
acc.counts[2]<-acc.counts[2]+1
llikelihood.old<-llikelihood.new
}
else
simmat[(i+1),2]<-shape.old
theta[2]<-simmat[(i+1),2]
## Third parameter: scale
# the scale cannot be negative, so the proposal is drawn from a
# LOG-NORMAL distribution. This proposal is not symmetric, so the
# accept/reject ratio needs a Hastings (Jacobian) correction term.
# refresh the "old" posterior with the current likelihood, since the
# shape update above may have changed llikelihood.old
logposterior.old<-llikelihood.old+dnorm(theta[2], sd=100, log=TRUE)
sigma.old<-theta[3]
sigma.new<-exp(rnorm(1,log(sigma.old),std[3])) #Simulating a value from a lognormal distribution
theta[3]<-sigma.new
llikelihood.new<-loglikelihood(theta, mdata)
logposterior.new<-llikelihood.new+dnorm(theta[2], sd=100, log=TRUE) # same prior term as above (cancels in the ratio; flat prior on the scale)
##DECISION WHETHER REJECT/ACCEPT
nonsymm<-log(sigma.new/sigma.old) # Hastings correction for the lognormal proposal
alpha<- min(exp(logposterior.new-logposterior.old+nonsymm),1)
u<-runif(1,0,1)
if(alpha > u)
{
simmat[(i+1),3]<-sigma.new
acc.counts[3]<-acc.counts[3]+1
llikelihood.old<-llikelihood.new
}
else
simmat[(i+1),3]<-sigma.old
theta[3]<-simmat[(i+1),3]
} ##End of simulation loop
simmat<-simmat[(burn+1):(nn+1),] # drop the burn-in (simmat has nn+1 rows, including the initial value)
acc.rates<-c(acc.counts)/nn
list(simmat=simmat, acc.rates=acc.rates)
} #End of Function
mdata<-rnorm(25, 25, 2) # this is our observed dataset
theta<-c(2,0.5,5) # starting value
loglikelihood(theta, mdata)
init<-c(5,3,1)
std<-c(0.7, 0.6, 0.3)
xx<-mcmcevd(nn=10000, init,std, mdata=mdata, burn=1000)
temp<-xx$simmat
xx$acc.rates
par(mfrow=c(3,1))
plot(temp[,1], type="l")
plot(temp[,2], type="l")
plot(temp[,3], type="l")
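# Posterior summaries for the GEV parameters (a sketch using the post-burn-in draws)
apply(temp, 2, mean)
apply(temp, 2, quantile, probs=c(0.025, 0.5, 0.975))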
###### GLM with Bayes
library("MASS")
install.packages("mvtnorm")
install.packages("mfp")
library(mvtnorm) # multivariate normal distributions
library(mfp) #
# import dataset from folder
attach(bodyfat)
summary(bodyfat)
x1<-Abdomen
x2<-Age
x3<-Height
x4<-Neck
y<-Siri
#=======USING MLE============================
RegModel.1 <- lm(y~x1+x2+x3+x4)
summary(RegModel.1)
#==============USING BAYES===============
X<-cbind(1, x1, x2, x3, x4)
dim(X) # 252 rows, 5 columns
Vbeta<-solve(t(X)%*%X) # solve = find inverse
Vbeta # diagonals are variances, off-diagonals are covariances
betahat<-solve(t(X)%*%X)%*%t(X)%*%y
betahat
df<-nrow(X)-ncol(X) # degrees of freedom
df
ssquared<-(sum(t(y-X%*%betahat)%*%(y-X%*%betahat)))/df
#simulate from the posterior
n.sims<-10000
sigma<-sqrt(1/rgamma(n.sims, df/2, df*ssquared/2))
# sigma
bbeta<-matrix(betahat, n.sims, ncol(X), byrow=T)+sigma*rmvnorm(n.sims, rep(0, ncol(X)), Vbeta)
apply(bbeta, 2, quantile, probs=c(0.025, 0.5, 0.975))
# can now plot trace, density, etc
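# e.g. (a sketch) draws and density for the coefficient on x1 (column 2 of bbeta)
par(mfrow=c(1,2))
plot(bbeta[,2], type="l", main="draws: x1 coefficient")
plot(density(bbeta[,2]), main="posterior density: x1 coefficient")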
#==================USING LEARNBAYES PACKAGE====
install.packages("LearnBayes")
library(LearnBayes)
bmod <- blinreg(y,X, 100000) # noninformative prior by default; 100000 posterior draws
str(bmod) # contains two components: beta (matrix of draws) and sigma (vector of draws)
summary(bmod$beta) # mean & median estimates: very close to the results above
betas<-bmod$beta
par(mfrow=c(1,1))
plot(betas[,1], type="l")
plot(density(betas[,1]), type="l")
#==================USING MCMCpack PACKAGE====
# The model formula is the same, but MCMCregress adds options such as
# starting values, burn-in length and priors for incorporating more information
install.packages("MCMCpack")
library(MCMCpack)
model.mcmc <- MCMCregress(y~x1+x2+x3+x4,
data = bodyfat,
burnin = 2000,
mcmc = 10000,
thin = 1,
seed = NA,
beta.start = NA)
str(model.mcmc)
# mcmc = 10000 is the number of iterations kept after burn-in; the result is a coda 'mcmc' object
summary(model.mcmc)
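# model.mcmc is a coda 'mcmc' object, so the usual coda diagnostics apply (a sketch)
plot(model.mcmc) # trace and density plots for every parameter
library(coda)
effectiveSize(model.mcmc) # effective sample sizes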
###### logistic regression with Bayes
## Basic MLE approach
x<-c(0,1,2)
ca<-c(10,12,85)
cl<-c(10,65,165)
logitmod<-glm(cbind(ca,cl)~x,family="binomial")
str(logitmod)
thetahat<-logitmod$coeff[2]
exp(thetahat)
# exp(thetahat) is about 1.18: the odds of disease increase by roughly 18% with each extra unit of x
var<-vcov(logitmod)[2,2]
# 95% confidence:
exp(thetahat-1.96*sqrt(var))
exp(thetahat+1.96*sqrt(var))
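# equivalent Wald interval via confint.default() (a sketch)
exp(confint.default(logitmod)["x",])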
#### logistic via Bayesian - use INLA package
## INLA:
install.packages(c("fields","numDeriv","pixmap", "mvtnorm","R.utils"))
source("http://www.math.ntnu.no/inla/givemeINLA.R")
library(INLA)
inla.upgrade(testing=TRUE)
# First, with default (vague) priors
# 1. create a list of values, convert to a data frame
dat<-list(x=x,ca=ca,cl=cl)
dat<-as.data.frame(dat)
mod.inla<-inla(ca~x, family="binomial", data=dat, Ntrials=ca+cl)
str(mod.inla)
# so many parameters...
# mainly interested in marginal distributions for parameters
summary(mod.inla)
vals1 = mod.inla$marginals.fixed
intercept = vals1$'(Intercept)'
x=vals1$x
par(mfrow=c(2,1))
plot(intercept, type="l", main="Intercept")
plot(x, type="l", main='x')
# don't really care about intercept
# get summary statistics for the marginal of x (see the sketch below)
# posterior median approx 0.16 on the log-odds scale, i.e. exp(0.16) ~ 1.17:
# roughly a 17% increase in the odds per extra unit of x
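# INLA helper functions for marginals (a sketch)
inla.zmarginal(vals1$x) # summary statistics of the marginal for x
exp(inla.qmarginal(0.5, vals1$x)) # posterior median odds ratio per unit of x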
#=====Now with informative priors=====
# prior belief: the odds ratio for x is at most 1.5 with 97.5% probability,
# so on the log-odds scale 1.96*sd = log(1.5), i.e. prior variance W = (log(1.5)/1.96)^2
Upper975<-1.5
W<-(log(Upper975)/1.96)^2
mod.inla1<-inla(ca~x,family='binomial',data=dat,Ntrials=ca+cl, control.fixed=list(mean.intercept=c(0),prec.intercept=c(.1), mean=c(0),prec=c(1/W)))
summary(mod.inla1)
#=======MULTIPLE LINEAR REG======
Res.inla<-inla(y~x1+x2+x3+x4, data=bodyfat, family="gaussian")
summary ( Res.inla)
# estimated values = marginal distributions
vals<-Res.inla$marginals.fixed
ex1 = vals$x1
ex2 = vals$x2
ex3 = vals$x3
ex4 = vals$x4
# then plot them with plot, type="l"
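# e.g. (a sketch) all four slope marginals on one page
par(mfrow=c(2,2))
plot(ex1, type="l", main="x1 (Abdomen)")
plot(ex2, type="l", main="x2 (Age)")
plot(ex3, type="l", main="x3 (Height)")
plot(ex4, type="l", main="x4 (Neck)")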
################ ONE-WAY ANOVA
sigma.mu<-0.3
sigma.e<-0.2
sample<-10
trt<-5
beta0<-0.5
mu<-rnorm(sample, mean=0, sd=sigma.mu)
e<-rnorm(sample*trt, mean=0, sd=sigma.e)
yvec<-beta0 + rep (mu , each=trt) + e
simdata<-data.frame(y=yvec, trt=rep(1:sample, each=trt))
# basic (classical) ANOVA - treat trt as a factor, not a numeric covariate
fit1 <- aov(y ~ factor(trt), data=simdata)
summary(fit1)
#
# Bayesian
fit2<-inla(y~f(trt, model="iid"), data=simdata)
summary(fit2)
##########################################################
### PART 5: BAYES FACTOR & TESTING ####################
###########################################################
install.packages('LearnBayes')
# Using Bayes factor to compare models with different
# assumptions of sampling distribution
# See slide 32: "Models for Fire Calls"
fire.counts <- c(75, 88, 84, 99, 79, 68, 86, 109, 73, 85, 101, 85,
75, 81, 64, 77, 83, 83, 88, 83, 78, 83, 78, 80,
82, 90, 74, 72, 69, 72, 76, 76, 104, 86, 92, 88)
# Step 1: use histograms to compare different posterior sampling distributions
hist(fire.counts, probability=TRUE, ylim=c(0, .08))
x <- 60:110
lines(x, dpois(x, lambda=mean(fire.counts)), col="red")
lines(x, dnorm(x, mean=mean(fire.counts), sd=12), col="blue")
lines(x, dnorm(x, mean=mean(fire.counts), sd=6), col="green")
legend("topright", legend=c("M1: Poisson(theta)", "M2: N(theta, 12)", "M3: N(theta, 6)"), col=c("red", "blue", "green"), lty=1)
# which model is best?
# We need the log marginal (predictive) likelihood for each model;
# use the laplace() function from LearnBayes to approximate the integration
#==========Model M1=========
model.1 <- function(theta, y){
sum(log(dpois(y, theta))) + dgamma(theta, shape=280, rate=4, log=TRUE)
}
library(LearnBayes)
log.pred.1 <- laplace(model.1, 80, fire.counts)$int
log.pred.1
#==========Model M2=========
model.2 <- function(theta, y){
sum(log(dnorm(y, theta, 12))) + dgamma(theta, shape=280, rate=4, log=TRUE)
}
log.pred.2 <- laplace(model.2, 80, fire.counts)$int
#==========Model M3=========
model.3 <- function(theta, y){
sum(log(dnorm(y, theta, 6))) + dgamma(theta, shape=280, rate=4, log=TRUE)
}
log.pred.3 <- laplace(model.3, 80, fire.counts)$int
## Calculate Bayes factors: subtract on the log scale, then exponentiate
exp(log.pred.1-log.pred.2)
# BF(1 vs 2) > 1 favours the Poisson model over N(theta, 12)
exp(log.pred.2-log.pred.3)
# BF(2 vs 3) > 1 would favour N(theta, 12) over N(theta, 6)
############ Linear regression comparison ############
install.packages('BayesFactor')
library(BayesFactor)
data(attitude)
head(attitude)
lmObj = lm(rating ~ ., data = attitude)
summary(lmObj)
# only complaints is significant variable
# RegressionBF calculates bayes factors for many models at once
bf = regressionBF(rating ~ ., data = attitude)
length(bf) # 63 models = all possible predictor combinations: 2^6 - 1 = 63
head(bf, n=6) # displays the 6 models with the highest Bayes factors
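# Compare every model against the single best one (a sketch; the BayesFactor
# package supports this arithmetic on BFBayesFactor objects)
bfBest<-bf / max(bf)
head(bfBest)
plot(head(bf))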
################ HYPOTHESIS TESTING ##########################
### One sample t-test ###
# classical approach
# first, import dataset with load workspace
t.test(x=BodyTemperature$Temperature-98.4)
# Bayes method
onesam.test1 = ttestBF(x=BodyTemperature$Temperature-98.4)
onesam.test1
### Independent samples t-test
# is there a difference in IQ between males and females?
# import data
classical.test = t.test(iqf ~ sex, data = lead_environ,var.equal = TRUE)
# p = 0.5 --> fail to reject the null: no evidence of an IQ difference
# bayes factor
bf = ttestBF(formula = iqf ~ sex, data = lead_environ,var.equal = TRUE )
bf
# Bayes factor = 0.26 for the alternative against the null (mu1 - mu2 = 0)
# BF < 1, so the data favour the null: we do not accept the alternative hypothesis
### Paired sample t-test
# import data
PrisonStressSport = sp
attach(PrisonStressSport)
ccpaired.test = t.test(x=PSSbefore, y=PSSafter, paired=TRUE, data=PrisonStressSport)
# p=0.0178 = reject Null
ttestBF(x=PSSbefore, y=PSSafter, paired=TRUE)
# Bayes factor = 3.48: moderate evidence against the null
##########################################################
### PART 6: Matrix, conjugates, markov ####################
###########################################################
# Matrix example: build a design matrix and compute fitted values
dataset<-cbind(c(1991:2015), rnorm(25, 40, 2))
t<-dataset[,1]
#design matrix
designmat<-cbind(1, t)
#let's assume a0 = 0.2 (intercept) and a1 = 0.8 (slope)
a0<-0.2; a1<-0.8
coeff<-c(a0, a1)
pred<-designmat%*%coeff # fitted values = design matrix %*% coefficient vector
pred
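# Sketch: add noise to these fitted values and recover the coefficients with
# the normal equations (the same matrix algebra used in the Bayes GLM section above)
yobs<-pred + rnorm(length(pred), sd=2)
solve(t(designmat)%*%designmat)%*%t(designmat)%*%yobs # should be close to c(0.2, 0.8)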
################################################
### IMPUTING MISSING VALUES
################################################
install.packages("mice", dependencies=TRUE)
library(mice)
## Create a dataset with missing values
# 1. create a dataset
set.seed(100)
x1.r<-runif(200)
x2.r<-runif(200)
y.r<-runif(200)
# 2. we randomly introduce missing values
miss.x1<-rbinom(200, 1, prob=0.9)
miss.x2<-rbinom(200, 1, prob=0.9)
miss.y<-rbinom(200, 1, prob=0.9)
x1<-ifelse(miss.x1==1, x1.r, NA)
x2<-ifelse(miss.x2==1, x2.r, NA)
y<-ifelse(miss.y==1, y.r, NA)
datmat<-cbind(x1, x2, y)
# use mice to create 5 different datasets with filled missings
mice.dat<-mice(datmat, m=5, maxit=20)
#bwplot(mice.dat)
# fit regression line to 5 mice-filled datasets
fit<-with(mice.dat, lm(y~x1+x2))
# compare our model with what we'd get if we just used original dataset
summary(pool(fit))
summary(lm(y~x1+x2, data=as.data.frame(datmat)))
# fitting a model to the 5 imputed datasets and pooling the results
# gives us more information than fitting
# to the original dataset with missing values
#
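# Optional imputation diagnostics (a sketch; mice provides lattice-based methods)
densityplot(mice.dat) # compare distributions of imputed and observed values
stripplot(mice.dat) # imputed values for each variable and imputation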
##########################
# example 2: make 30% missing
set.seed(100)
x1.r<-runif(200)
x2.r<-runif(200)
y.r<-runif(200)
#we randomly generate missing values in the dataset
miss.x1<-rbinom(200, 1, prob=0.7)
miss.x2<-rbinom(200, 1, prob=0.7)
miss.y<-rbinom(200, 1, prob=0.7)
x1<-ifelse(miss.x1==1, x1.r, NA)
x2<-ifelse(miss.x2==1, x2.r, NA)
y<-ifelse(miss.y==1, y.r, NA)
datmat<-cbind(x1, x2, y)
mice.dat<-mice(datmat, m=5, maxit=20)
# Compare model with imputed data to that without
fit<-with(mice.dat, lm(y~x1+x2))
summary(pool(fit))
summary(lm(y~x1+x2, data=as.data.frame(datmat)))
###########################
# Example 3: introduce interactions, make 10% missing
x1.r<-runif(200)
x2.r<-runif(200)
y.r<-x1.r+x2.r+x1.r*x2.r+rnorm(200, sd=1)
#we randomly generate missing values in the dataset
miss.x1<-rbinom(200, 1, prob=0.9)
miss.x2<-rbinom(200, 1, prob=0.9)
miss.y<-rbinom(200, 1, prob=0.9)
x1<-ifelse(miss.x1==1, x1.r, NA)
x2<-ifelse(miss.x2==1, x2.r, NA)
y<-ifelse(miss.y==1, y.r, NA)
x1x2<-x1.r*x2.r
datmat<-cbind(x1, x2, x1x2, y)
# impute values to 5 datasets
mice.dat<-mice(datmat, m=5, maxit=20)
# fit model to imputed data
fit<-with(mice.dat, lm(y~x1+x2+I(x1*x2)))
summary(pool(fit))
# compare to model fit to dataset with missings
summary(lm(y~x1+x2+I(x1*x2), data=as.data.frame(datmat)))
# ... the complete-case fit has larger standard errors
#####################
# example 4: count data (a Poisson outcome)
# impute the missing values with mice, then fit a Poisson GLM to each imputed dataset
# Create a dataset for explanation
set.seed(100)
b0 <- 1
b1 <- .75
b2 <- -.25
b3 <- .5
N <- 200
x1 <- rnorm(N)
x2 <- rnorm(N)
x3 <- rnorm(N)
lam <- exp( b0+b1*x1+b2*x2+b3*x3)
y <- rpois( N, lam )
POIS <- data.frame( y, x1, x2, x3 )
# We introduce MAR missingness in y using the following function:
generatemissingy<- function( data, pos = 1, Z = 2, pmis = .5, strength = c( .5, .5 ) ) {
total <- round( pmis * nrow(data) )
sm <- which( data[,Z] < mean( data[,Z] ) )
gr <- which( data[,Z] > mean( data[,Z] ) )
sel.sm <- sample( sm, round( strength[1] * total ) )
sel.gr <- sample( gr, round( strength[2] * total ) )
sel <- c( sel.sm, sel.gr )
data[sel,pos] <- NA
return(data)
}
mdat <- generatemissingy( POIS, pmis = .2, strength = c( .2, .8) )
library(mice)
imp <- mice( mdat, seed = 100)
res <- with(imp, glm( y ~ x1 + x2 + x3, family = "poisson"))
print( pool.res <- summary( pool( res ) ) )
# compare with a fit to the data that still contain the missing y values
res<-glm( y ~ x1 + x2 + x3, family = "poisson", data = mdat)
summary(res)
# the standard errors are higher when we don't impute the missing values
## also see "mi" package - allows specifying prior, etc
##############################################################################
# RevoScaleR RxTeradata Getting Started Guide Examples
##############################################################################
##############################################################################
# General setup/cleanup: Run this code before proceeding
##############################################################################
# To follow the examples in this guide, you will need the following
# comma-delimited text data files loaded into your Teradata database:
# ccFraud.csv as RevoTestDB.ccFraud10
# ccFraudScore.csv as RevoTestDB.ccFraudScore10
# These data files are available from the Revolution FTP site.
# You can use the Teradata 'fastload' utility to load the data sets into your
# database.
# Modify and uncomment the following code for your connection to Teradata,
# and other information about your Teradata setup:
tdConnString <- "DRIVER=Teradata;DBCNAME=10.4.208.180;DATABASE=RevoTestDB;UID=SysDba;PWD=SysDba;"
tdShareDir = paste("c:\\users\\", Sys.getenv("USERNAME"), "\\tdShare", sep="")
tdRemoteShareDir = "/tmp/revoJobs"
tdRevoPath = "/usr/lib64/Revo-7.3/R-3.1.3/lib64/R"
# Make sure that the local share dir exists:
if (!file.exists(tdShareDir)) dir.create(tdShareDir, recursive = TRUE)
# If your DATABASE is not named RevoTestDB, you will need
# to modify the SQL queries in the following code to your
# DATABASE name
##########################################################
# Using a Teradata Data Source and Compute Context
##########################################################
# Creating an RxTeradata Data Source
tdQuery <- "SELECT * FROM ccFraud10"
teradataDS <- RxTeradata(connectionString = tdConnString,
sqlQuery = tdQuery, rowsPerRead = 50000)
# NOTE: this query is run every time it is called
# Extracting basic information about your data
rxGetVarInfo(data = teradataDS)
# Specifying column information in your data source
stateAbb <- c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC",
"DE", "FL", "GA", "HI","IA", "ID", "IL", "IN", "KS", "KY", "LA",
"MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NB", "NC", "ND",
"NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI","SC",
"SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY")
ccColInfo <- list(
gender = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Male", "Female")),
cardholder = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Principal", "Secondary")),
state = list(
type = "factor",
levels = as.character(1:51),
newLevels = stateAbb)
)
teradataDS <- RxTeradata(connectionString = tdConnString,
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 50000)
rxGetVarInfo(data = teradataDS)
tdWait <- TRUE
tdConsoleOutput <- FALSE
tdCompute <- RxInTeradata(
connectionString = tdConnString,
shareDir = tdShareDir,
remoteShareDir = tdRemoteShareDir,
revoPath = tdRevoPath,
wait = tdWait,
consoleOutput = tdConsoleOutput)
rxGetNodeInfo(tdCompute)
tdComputeTrace <- RxInTeradata(
connectionString = tdConnString,
shareDir = tdShareDir,
remoteShareDir = tdRemoteShareDir,
revoPath = tdRevoPath,
wait = tdWait,
consoleOutput = tdConsoleOutput,
traceEnabled = TRUE,
traceLevel = 7)
##########################################################
# High-Performance In-Database Analytics in Teradata
##########################################################
# Compute summary statistics in Teradata
# Set the compute context to compute in Teradata
rxSetComputeContext(tdCompute)
rxSummary(formula = ~gender + balance + numTrans + numIntlTrans +
creditLine, data = teradataDS)
# Set the compute context to compute locally
rxSetComputeContext ("local")
# Refining the RxTeradata data source
ccColInfo <- list(
gender = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Male", "Female")),
cardholder = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Principal", "Secondary")),
state = list(
type = "factor",
levels = as.character(1:51),
newLevels = stateAbb),
balance = list(low = 0, high = 41485),
numTrans = list(low = 0, high = 100),
numIntlTrans = list(low = 0, high = 60),
creditLine = list(low = 1, high = 75)
)
teradataDS <- RxTeradata(connectionString = tdConnString,
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 50000)
# Visualizing your data using rxHistogram and rxCube
rxSetComputeContext(tdCompute)
rxHistogram(~creditLine|gender, data = teradataDS,
histType = "Percent")
cube1 <- rxCube(fraudRisk~F(numTrans):F(numIntlTrans),
data = teradataDS)
cubePlot <- rxResultsDF(cube1)
library(lattice) # levelplot() is provided by the lattice package
levelplot(fraudRisk~numTrans*numIntlTrans, data = cubePlot)
# Analyzing your data with rxLinMod
linModObj <- rxLinMod(balance ~ gender + creditLine,
data = teradataDS)
summary(linModObj)
# Analyzing your data with rxLogit
logitObj <- rxLogit(fraudRisk ~ state + gender + cardholder + balance +
numTrans + numIntlTrans + creditLine, data = teradataDS,
dropFirst = TRUE)
summary(logitObj)
# Scoring a Data Set
tdQuery <- "SELECT * FROM ccFraudScore10"
teradataScoreDS <- RxTeradata(connectionString = tdConnString,
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 50000)
teradataOutDS <- RxTeradata(table = "ccScoreOutput",
connectionString = tdConnString, rowsPerRead = 50000 )
rxSetComputeContext(tdCompute)
if (rxTeradataTableExists("ccScoreOutput"))
rxTeradataDropTable("ccScoreOutput")
rxPredict(modelObject = logitObj,
data = teradataScoreDS,
outData = teradataOutDS,
predVarNames = "ccFraudLogitScore",
type = "link",
writeModelVars = TRUE,
overwrite = TRUE)
if (rxTeradataTableExists("ccScoreOutput"))
rxTeradataDropTable("ccScoreOutput")
rxPredict(modelObject = logitObj,
data = teradataScoreDS,
outData = teradataOutDS,
predVarNames = "ccFraudLogitScore",
type = "link",
writeModelVars = TRUE,
extraVarsToWrite = "custID",
overwrite = TRUE)
tdMinMax <- RxOdbcData(sqlQuery = paste(
"SELECT MIN(ccFraudLogitScore),",
"MAX(ccFraudLogitScore) FROM ccScoreOutput"),
connectionString = tdConnString)
minMaxVals <- rxImport(tdMinMax)
minMaxVals <- as.vector(unlist(minMaxVals))
teradataScoreDS <- RxTeradata(sqlQuery =
"Select ccFraudLogitScore FROM ccScoreOutput",
connectionString = tdConnString, rowsPerRead = 50000,
colInfo = list(ccFraudLogitScore = list(
low = floor(minMaxVals[1]),
high = ceiling(minMaxVals[2]))))
rxSetComputeContext(tdCompute)
rxHistogram(~ccFraudLogitScore, data = teradataScoreDS)
##########################################################
# Using rxDataStep and rxImport
##########################################################
teradataScoreDS <- RxTeradata(
sqlQuery = "Select * FROM ccScoreOutput",
connectionString = tdConnString, rowsPerRead = 50000 )
teradataOutDS2 <- RxTeradata(table = "ccScoreOutput2",
connectionString = tdConnString, rowsPerRead = 50000)
rxSetComputeContext(tdCompute)
if (rxTeradataTableExists("ccScoreOutput2"))
rxTeradataDropTable("ccScoreOutput2")
rxDataStep(inData = teradataScoreDS, outFile = teradataOutDS2,
transforms = list(ccFraudProb = inv.logit(ccFraudLogitScore)),
transformPackages = "boot", overwrite = TRUE)
rxGetVarInfo(teradataOutDS2)
# Using rxImport to Extract a Subsample
rxSetComputeContext("local")
teradataProbDS <- RxTeradata(
sqlQuery = paste(
"Select * FROM ccScoreOutput2",
"WHERE (ccFraudProb > .99)"),
connectionString = tdConnString)
highRisk <- rxImport(teradataProbDS)
orderedHighRisk <- highRisk[order(-highRisk$ccFraudProb),]
row.names(orderedHighRisk) <- NULL # Reset row numbers
head(orderedHighRisk)
# Using rxDataStep to Create a Teradata Table
rxSetComputeContext("local")
xdfAirDemo <- RxXdfData(file.path(rxGetOption("sampleDataDir"),
"AirlineDemoSmall.xdf"))
rxGetVarInfo(xdfAirDemo)
teradataAirDemo <- RxTeradata(table = "AirDemoSmallTest",
connectionString = tdConnString)
if (rxTeradataTableExists("AirDemoSmallTest",
connectionString = tdConnString))
rxTeradataDropTable("AirDemoSmallTest",
connectionString = tdConnString)
rxDataStep(inData = xdfAirDemo, outFile = teradataAirDemo,
transforms = list(
DayOfWeek = as.integer(DayOfWeek),
rowNum = .rxStartRow : (.rxStartRow + .rxNumRows - 1)
), overwrite = TRUE
)
rxSetComputeContext(tdCompute)
teradataAirDemo <- RxTeradata(sqlQuery =
"SELECT * FROM AirDemoSmallTest",
connectionString = tdConnString,
rowsPerRead = 50000,
colInfo = list(DayOfWeek = list(type = "factor",
levels = as.character(1:7))))
rxSummary(~., data = teradataAirDemo)
# Performing Your Own Chunking Analysis
ProcessChunk <- function( dataList)
{
# Convert the input list to a data frame and
# call the 'table' function to compute the
# contingency table
chunkTable <- table(as.data.frame(dataList))
# Convert table output to data frame with single row
varNames <- names(chunkTable)
varValues <- as.vector(chunkTable)
dim(varValues) <- c(1, length(varNames))
chunkDF <- as.data.frame(varValues)
names(chunkDF) <- varNames
# Return the data frame, which has a single row
return( chunkDF )
}
rxSetComputeContext( tdCompute )
tdQuery <-
"select DayOfWeek from AirDemoSmallTest"
inDataSource <- RxTeradata(sqlQuery = tdQuery,
rowsPerRead = 50000,
colInfo = list(DayOfWeek = list(type = "factor",
levels = as.character(1:7))))
iroDataSource = RxTeradata(table = "iroResults",
connectionString = tdConnString)
if (rxTeradataTableExists(table = "iroResults",
connectionString = tdConnString))
{
rxTeradataDropTable( table = "iroResults",
connectionString = tdConnString)
}
rxDataStep( inData = inDataSource, outFile = iroDataSource,
transformFunc = ProcessChunk,
overwrite = TRUE)
iroResults <- rxImport(iroDataSource)
finalResults <- colSums(iroResults)
finalResults
rxTeradataDropTable( table = "iroResults",
connectionString = tdConnString)
##########################################################
# Using rxDataStep for By-Group Analyses
##########################################################
# Creating a simulated data set
set.seed(10)
numObs <- 100000
testData <- data.frame(
SKU = as.integer(runif(n = numObs, min = 1, max = 11)),
INCOME = as.integer(runif(n = numObs, min = 10, max = 21)))
# SKU is also underlying coefficient
testData$SALES <- as.integer(testData$INCOME * testData$SKU +
25*rnorm(n = numObs))
# Uploading into a Teradata table
rxSetComputeContext("local")
if (rxTeradataTableExists("sku_sales_100k",
connectionString = tdConnString))
{
rxTeradataDropTable("sku_sales_100k",
connectionString = tdConnString)
}
teradataDS <- RxTeradata(table = "sku_sales_100k",
connectionString = tdConnString)
rxDataStep(inData = testData, outFile = teradataDS)
# The By-Group Transformation Function
EstimateModel <- function(dataList)
{
# Convert the input list to a data frame
chunkDF <- as.data.frame(dataList)
# Estimate a linear model on this chunk
# Don't include the model frame in the model object
model <- lm( SALES~INCOME, chunkDF, model=FALSE )
# Print statements can be useful for debugging
# print( model$coefficients )
# Remove unneeded parts of model object
model$residuals <- NULL
model$effects <- NULL
model$fitted.values <- NULL
model$assign <- NULL
model$qr[[1]] <- NULL
attr(model$terms, ".Environment") <- NULL
# Convert the model to a character string
modelString <-
rawToChar(serialize(model, connect=NULL, ascii=TRUE))
# Pad to a fixed length
modelString <- paste( modelString,
paste(rep("X", 2000 - nchar(modelString)), collapse = ""),
sep="")
# Create the entry for the results table
resultDF <- data.frame(
SKU = chunkDF[1,]$SKU, # Assumes SKU's all the same
COEF = model$coefficients[2], # Slope
MODEL = modelString )
return( resultDF )
}
# Specifying the input by-group data source
tdQuery <- "SELECT * FROM sku_sales_100k"
partitionKeyword <- "PARTITION-WITH-VIEW"
partitionClause <- paste( partitionKeyword, "BY sku_sales_100k.SKU" )
inDataSource <- RxTeradata(sqlQuery = tdQuery,
tableOpClause = partitionClause,
rowsPerRead = 100000)
# Setting up the results data source
resultsDataSource = RxTeradata(table = "models",
connectionString = tdConnString )
rxOptions(transformPackages = c("stats"))
# Make sure compute context is in-database
rxSetComputeContext(tdCompute)
# Run the by-group data step
rxDataStep( inData = inDataSource, outFile = resultsDataSource,
transformFunc = EstimateModel, reportProgress = 0,
overwrite = TRUE )
# Showing the estimated slopes
lmResults <- rxImport(resultsDataSource)
lmResults[order(lmResults$SKU),c("SKU", "COEF")]
# The Scoring transformation function
ScoreChunk <- function(dataList)
{
chunkDF <- as.data.frame(dataList)
# Extract the model string for the first observation
# and convert back to model object
# All observations in this chunk have the same SKU,
# so they all share the same model
modelString <- as.character(chunkDF[1,]$MODEL)
model <- unserialize( charToRaw( modelString ) )
resultDF <- data.frame(
SKU = chunkDF$SKU,
INCOME = chunkDF$INCOME,
# Use lm's predict method
PREDICTED_SALES = predict(model, chunkDF) )
return( resultDF )
}
# Setting up the input data source for predictions
predictQuery <- paste("SELECT sku_sales_100k.SKU,",
"sku_sales_100k.INCOME, models.MODEL",
"FROM sku_sales_100k JOIN models ON",
"sku_sales_100k.SKU = models.SKU")
inDataSource <- RxTeradata(sqlQuery = predictQuery,
connectionString = tdConnString,
tableOpClause = partitionClause, rowsPerRead = 10000)
# Setting up the output data set for predictions
scoresDataSource = RxTeradata(table = "scores",
connectionString = tdConnString)
# Perform scoring in-database
rxSetComputeContext(tdCompute)
rxDataStep( inData = inDataSource, outFile = scoresDataSource,
transformFunc = ScoreChunk, reportProgress = 0,
overwrite = TRUE )
# Extracting summary results
predSum <- rxImport( RxTeradata(
sqlQuery = paste("SELECT SKU, INCOME, MIN(PREDICTED_SALES),",
"MAX(PREDICTED_SALES), COUNT(PREDICTED_SALES)",
"FROM scores GROUP BY SKU, INCOME",
"ORDER BY SKU, INCOME"),
connectionString = tdConnString ) )
options(width = 120) # Set display width
predSum[1:15,]
# Clean-up
rxTeradataDropTable("models")
rxTeradataDropTable("scores")
rxSetComputeContext("local")
##########################################################
# Performing Simulations In-Database
##########################################################
playCraps <- function()
{
result <- NULL
point <- NULL
count <- 1
while (is.null(result))
{
roll <- sum(sample(6, 2, replace=TRUE))
if (is.null(point))
{
point <- roll
}
if (count == 1 && (roll == 7 || roll == 11))
{
result <- "Win"
}
else if (count == 1 &&
(roll == 2 || roll == 3 || roll == 12))
{
result <- "Loss"
}
else if (count > 1 && roll == 7 )
{
result <- "Loss"
}
else if (count > 1 && point == roll)
{
result <- "Win"
}
else
{
count <- count + 1
}
}
result
}
# Calling rxExec
teradataExec <- rxExec(playCraps, timesToRun=1000, RNGseed="auto")
length(teradataExec)
table(unlist(teradataExec))
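# Local sanity check (a sketch): the same game run with plain replicate(),
# no compute context involved; the win/loss proportions should be similar
table(replicate(1000, playCraps()))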
##########################################################
# Analyzing Your Data Alongside Teradata
##########################################################
# Set the compute context to compute locally
rxSetComputeContext ("local")
tdQuery <- "SELECT * FROM ccFraudScore10"
# Perform an rxSummary using a local compute context
teradataDS1 <- RxTeradata(connectionString = tdConnString,
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 500000)
rxSummary(formula = ~gender + balance + numTrans + numIntlTrans +
creditLine, data = teradataDS1)
statesToKeep <- sapply(c("CA", "OR", "WA"), grep, stateAbb)
statesToKeep
# Fast import of data from a Teradata database
importQuery <- paste("SELECT gender,cardholder,balance,state",
"FROM ccFraud10",
"WHERE (state = 5 OR state = 38 OR state = 48)")
importColInfo <- list(
gender = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Male", "Female")),
cardholder = list(
type = "factor",
levels = c("1", "2"),
newLevels = c("Principal", "Secondary")),
state = list(
type = "factor",
levels = as.character(statesToKeep),
newLevels = names(statesToKeep))
)
rxSetComputeContext("local")
teradataImportDS <- RxTeradata(connectionString = tdConnString,
sqlQuery = importQuery, colInfo = importColInfo)
localDS <- rxImport(inData = teradataImportDS,
outFile = "ccFraudSub.xdf",
overwrite = TRUE)
# Using the imported data
rxGetVarInfo(data = localDS)
rxSummary(~gender + cardholder + balance + state, data = localDS)