-
-
Save standarderror/f7c2ae19fdbbb01b59ff to your computer and use it in GitHub Desktop.
R code library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################## | |
#### DATA MANIPULATION WITH DPLYR ######## | |
#### from datacamp course ################ | |
# All examples here: https://rstudio-pubs-static.s3.amazonaws.com/98285_aee86145ffc746e685b8792c7cf8a73d.html | |
########################################## | |
## SUMMARY: THE 5 VERBS ###### | |
# select = subset columns | |
# mutate = new cols | |
# filter = subset rows | |
# arrange = reorder rows | |
# summarise | |
# use GROUP_BY with summarise or mutate to get group calculations | |
# pipe results with %>% | |
## Can use these with dataframes, datatables or databases ## | |
# Additional summary functions within dplyr | |
#first(x) - The first element of vector x. | |
#last(x) - The last element of vector x. | |
#nth(x, n) - The nth element of vector x. | |
#n() - The number of rows in the data.frame or group of observations that summarise() describes. | |
#n_distinct(x) - The number of unique values in vector x. | |
#install.packages("BH", lib='c:\\notbackedup\\R\\') # dplyr requirement | |
library("BH", lib='c:\\notbackedup\\R\\') | |
install.packages("ggplot2", lib='c:\\notbackedup\\R\\') | |
library("ggplot2", lib='c:\\notbackedup\\R\\') | |
install.packages("ROCR", lib='c:\\notbackedup\\R\\') | |
library("ROCR", lib='c:\\notbackedup\\R\\') | |
install.packages("party", lib='c:\\notbackedup\\R\\') | |
library("party", lib='c:\\notbackedup\\R\\') | |
install.packages("dplyr", lib='c:\\notbackedup\\R\\') | |
library(dplyr, lib='c:\\notbackedup\\R\\') | |
install.packages("hflights", lib='\\notbackedup\\R\\') | |
library(hflights, lib='\\notbackedup\\R\\') | |
summary(hflights) | |
## TBL instead of DATA.FRAME = faster? | |
# tbl - more convenient for displaying data.frames | |
# convert with tbl_df() | |
hflights <- tbl_df(hflights) | |
hflights | |
#### cleanup dataset | |
## carrier name | |
lut <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental", "DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways", "WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier", "FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa") | |
# Use lut to translate the UniqueCarrier column of hflights | |
hflights$UniqueCarrier = lut[hflights$UniqueCarrier] | |
# Inspect the resulting raw values of your variables | |
glimpse(hflights) | |
## more codes | |
# Build the lookup table | |
lut <- c("A" = "carrier", "B" = "weather", "C" = "FFA" , "D" = "security", "E" = "not cancelled") | |
# Use the lookup table to create a vector of code labels. Assign the vector to the CancellationCode column of hflights | |
hflights$CancellationCode = lut[hflights$CancellationCode] | |
# Inspect the resulting raw values of your variables | |
glimpse(hflights) | |
#### THE 5 VERBS: SELECT & MUTATE ###### | |
# select = remove columns | |
# mutate = new cols | |
# filter = removes rows | |
# arrange = reorder rows | |
# summarise | |
############### SELECT() ############### | |
## SELECT HELPER FUNCTIONS | |
#dplyr provides 6 helper functions, each of which only works when used inside select(). | |
#starts_with("X"): every name that starts with "X", | |
#ends_with("X"): every name that ends with "X", | |
#contains("X"): every name that contains "X", | |
#matches("X"): every name that matches "X", which can be a regular expression, | |
#num_range("x", 1:5): the variables named x01, x02, x03, x04 and x05, | |
#one_of(x): every name that appears in x, which should be a character vector. | |
# select(df, Group, Sum) | |
hflights2 = select(hflights, ActualElapsedTime, Airtime, ArrDelay, DepDelay) | |
?select | |
## CHAPTER 2 Ex1 | |
# hflights is pre-loaded as a tbl, together with the necessary libraries. | |
# Return a copy of hflights that contains the four columns related to delay. | |
hflights2 = select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay) | |
# print hflights, nothing has changed! | |
print(hflights) | |
# Return a copy of hflights containing the columns Origin up to Cancelled | |
hflights2 = select(hflights, Origin:Cancelled) | |
# Answer to last question: be concise! | |
#colnames(hflights) | |
hflights2 = select(hflights, Year:DayOfWeek,ArrDelay:Diverted ) | |
## Chapter 2 Ex2 | |
# As usual, hflights is pre-loaded as a tbl, together with the necessary libraries. | |
# Return a tbl containing just ArrDelay and DepDelay | |
hflights2 = select(hflights, ArrDelay, DepDelay) | |
# Return a tbl as described in the second exercise, using both helper functions and variable names | |
hflights2 = select(hflights, UniqueCarrier:TailNum, Cancelled:CancellationCode) | |
# Return a tbl as described in the third exercise, using only helper functions. | |
hflights2 = select(hflights, contains('Time'), contains('Delay')) | |
## Chap 2 ex3 | |
# both hflights and dplyr are available | |
# Exercise 1 | |
ex1r <- hflights[c("TaxiIn","TaxiOut","Distance")] | |
ex1d <- select(hflights,TaxiIn, TaxiOut,Distance) | |
# Exercise 2 | |
ex2r <- hflights[c("Year","Month","DayOfWeek","DepTime","ArrTime")] | |
ex2d <- select(hflights, Year:ArrTime, -DayofMonth) | |
# Exercise 3 | |
ex3r <- hflights[c("TailNum","TaxiIn","TaxiOut")] | |
ex3d <- select(hflights,TailNum, TaxiIn,TaxiOut) | |
colnames(hflights) | |
#### MUTATE() | |
# mutate(tbl, z = x - y, c = a - b, ...) | |
colnames(hflights) | |
# Chapter 2 ex 4 | |
# hflights and dplyr are loaded and ready to serve you. | |
# Add the new variable ActualGroundTime to a copy of hflights and save the result as g1. | |
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime) | |
# Add the new variable GroundTime to a copy of g1 and save the result as g2. | |
g2 <- mutate(g1, GroundTime = TaxiIn + TaxiOut) | |
# Add the new variable AverageSpeed to a copy of g2 and save the result as g3. | |
g3 <- mutate(g2, AverageSpeed = Distance / AirTime * 60) | |
# chatper 3 ex 5 | |
# hflights and dplyr are ready, are you? | |
# hflights and dplyr are ready, are you? | |
# Add a second variable loss_percent to the dataset and save the result to m1. | |
m1 <- mutate(hflights, loss = ArrDelay - DepDelay, loss_percent = (ArrDelay - DepDelay) / DepDelay * 100) | |
# Remove the redundancy from your previous exercise and reuse loss to define the loss_percent variable. | |
# Assign the result to m2 | |
m2 <- mutate(m1, loss = ArrDelay - DepDelay, loss_percent = loss / DepDelay * 100) | |
# Add the three variables as described in the third exercise and save the result to m3 | |
m3 <- mutate(hflights, TotalTaxi = TaxiIn + TaxiOut, ActualGroundTime = ActualElapsedTime - AirTime, Diff = TotalTaxi - ActualGroundTime) | |
#### Section 5: FILTER rows ##### | |
# form: filter(tablename, logical tests) | |
# use ?comparison to see operators | |
install.packages("dplyr") | |
library(dplyr) | |
# hflights is at your disposal as a tbl, with clean carrier names | |
install.packages("hflights") | |
library(hflights) | |
colnames(hflights) | |
# hflights is at your disposal as a tbl, with clean carrier names | |
colnames(hflights) | |
head(hflights) | |
# All flights that traveled 3000 miles or more | |
filter(hflights, Distance > 3000) | |
# All flights flown by one of JetBlue, Southwest, or Delta | |
filter(hflights, UniqueCarrier %in% c('JetBlue', 'Southwest', 'Delta')) | |
# All flights where taxiing took longer than flying | |
filter(hflights, (TaxiIn+TaxiOut)>AirTime) | |
## Combine with booleans | |
# & (and), | (or), and ! (not) | |
# for NOT: filter(df, !is.na(x)) | |
#filter(df, a > b & c > d) | |
#filter(df, a > b, c > d) | |
# hflights is at your service as a tbl! | |
# All flights that departed before 5am or arrived after 10pm | |
filter(hflights, DepTime< 500 | ArrTime>2200) | |
# All flights that departed late but arrived ahead of schedule | |
filter(hflights, DepDelay>0 & ArrDelay<0) | |
# All cancelled weekend flights | |
filter(hflights, Cancelled==1 & DayOfWeek %in% c(6,7)) | |
# All flights that were cancelled after being delayed | |
filter(hflights, DepDelay>0 & Cancelled==1) | |
# hflights is already available in the workspace | |
# Select the flights that had JFK as their destination: c1 | |
c1 = filter(hflights, Dest == 'JFK') | |
# Combine the Year, Month and DayofMonth variables to create a Date column: c2 | |
c2 = mutate(c1, Date = paste(Year,Month,DayofMonth,sep="-")) | |
# Print out a selection of columns of c2 | |
select(c2, Date, DepTime, ArrTime, TailNum) | |
filter(hflights, Distance>1000 & (TaxiIn+TaxiOut)<15 & DayOfWeek %in% (6,7)) | |
##### Section 6: ARRANGE = reorder dataframe rows | |
### arrange(tbl, columns) - by default in ascending | |
## arrange(tbl, col1, col2) | |
# for descending, wrap var in desc(): arrange(tbl, desc(col1)) | |
# dplyr and the hflights tbl are available | |
# Definition of dtc | |
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay)) | |
# Arrange dtc by departure delays | |
arrange(dtc, DepDelay) | |
# Arrange dtc so that cancellation reasons are grouped | |
arrange(dtc, CancellationCode) | |
# Arrange dtc according to carrier and departure delays | |
arrange(dtc, UniqueCarrier, DepDelay) | |
# dplyr and the hflights tbl are available | |
# Arrange according to carrier and decreasing departure delays | |
arrange(hflights, UniqueCarrier, desc(DepDelay)) | |
# Arrange flights by total delay (normal order). | |
arrange(hflights, (ArrDelay+DepDelay)) | |
# Keep flights leaving to DFW before 8am and arrange according to decreasing AirTime | |
arrange(filter(hflights, Dest == 'DFW' & DepTime < 800), desc(AirTime)) | |
### SECTION 7: SUMMARISE | |
# summarise(tbl, newcol = expression(col), ...) | |
# sum, mean, var | |
# functions must take a vector and return a single number | |
# min(x) - minimum value of vector x. | |
# max(x) - maximum value of vector x. | |
# mean(x) - mean value of vector x. | |
# median(x) - median value of vector x. | |
# quantile(x, p) - pth quantile of vector x. | |
# sd(x) - standard deviation of vector x. | |
# var(x) - variance of vector x. | |
# IQR(x) - Inter Quartile Range (IQR) of vector x. | |
# diff(range(x)) - total range of vector x. | |
# hflights and dplyr are loaded in the workspace | |
# Print out a summary with variables min_dist and max_dist | |
summarise(hflights, min_dist = min(Distance), max_dist = max(Distance)) | |
# Print out a summary with variable max_div | |
summarise(filter(hflights, Diverted==1), max_div = max(Distance)) | |
# hflights is available | |
# Remove rows that have NA ArrDelay: temp1 | |
temp1 = filter(hflights, !is.na(ArrDelay)) | |
# Generate summary about ArrDelay column of temp1 | |
summarise(temp1, earliest = min(ArrDelay), average = mean(ArrDelay), latest = max(ArrDelay), sd = sd(ArrDelay)) | |
# Keep rows that have no NA TaxiOut and no NA TaxiOut: temp2 | |
temp2 = filter(hflights, !is.na(TaxiOut) & !is.na(TaxiIn)) | |
# Print the maximum taxiing difference of temp2 with summarise() | |
summarise(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut))) | |
##### summary functions within dplyr | |
#first(x) - The first element of vector x. | |
#last(x) - The last element of vector x. | |
#nth(x, n) - The nth element of vector x. | |
#n() - The number of rows in the data.frame or group of observations that summarise() describes. | |
#n_distinct(x) - The number of unique values in vector x. | |
# hflights is available with full names for the carriers | |
#head(hflights) | |
# Generate summarizing statistics for hflights | |
summarise(hflights, n_obs = n(), n_carrier = n_distinct(UniqueCarrier), n_dest = n_distinct(Dest), dest100 = nth(Dest,100)) | |
# Filter hflights to keep all American Airline flights: aa | |
aa = filter(hflights, UniqueCarrier=='American') | |
# Generate summarizing statistics for aa | |
summarise(aa, n_flights = n(), | |
n_canc = sum(Cancelled == 1), | |
p_canc = mean(Cancelled == 1) * 100, | |
avg_delay = mean(ArrDelay, na.rm = TRUE)) | |
########## SECTION 8: CHAINING WITH PIPE OPERATOR | |
# magrittr inside dplyr | |
# %>% passes objcet on left to object on right as FIRST ARGUMENT | |
# eg. c(1, 2, 3, NA) %>% mean(na.rm = TRUE) | |
# Write the 'piped' version of the English sentences. | |
hflights %>% mutate(diff = TaxiOut - TaxiIn) %>% filter(!is.na(diff)) %>% summarise(avg = mean(diff)) | |
# hflights is pre-loaded | |
# Build data frame with 4 columns of hflights and 2 self-defined columns: d | |
d = hflights %>% | |
select(Dest, UniqueCarrier, Distance, ActualElapsedTime) %>% | |
mutate(RealTime = ActualElapsedTime + 100, mph = Distance/RealTime*60) | |
head(d) | |
# Filter and summarise d according to the instructions | |
filter(d, !is.na(mph) & mph<70) %>% | |
summarise(n_less = n(), n_dest = n_distinct(Dest), min_dist = min(Distance), max_dist = max(Distance)) | |
hflights %>% | |
select(Dest, Cancelled, Distance, ActualElapsedTime, Diverted) %>% | |
mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>% | |
filter(mph < 105 | Cancelled == 1 | Diverted == 1) %>% | |
summarise( n_non = n(), | |
p_non = n_non / nrow(hflights) * 100, | |
n_dest = n_distinct(Dest), | |
min_dist = min (Distance), | |
max_dist = max(Distance)) | |
# Count the number of overnight flights | |
hflights %>% | |
filter(!is.na(DepTime), !is.na(ArrTime), DepTime > ArrTime) %>% | |
summarise(n = n()) | |
#### SECTION 9: GROUP BY in SUMMARISE | |
# group_by(tbl, group_var) | |
# Attaches group metadata to dataset which summarise can use | |
# Make an ordered per-carrier summary of hflights | |
hflights %>% | |
group_by(UniqueCarrier) %>% | |
summarise(n_flights = n(), | |
n_canc = sum(Cancelled == 1), | |
p_canc = mean(Cancelled == 1) * 100, | |
avg_delay = mean(ArrDelay, na.rm = TRUE)) %>% | |
arrange(avg_delay, p_canc) | |
# Make an ordered per-day summary of hflights | |
hflights %>% | |
group_by(DayOfWeek) %>% | |
summarise(avg_taxi = mean(TaxiIn + TaxiOut, na.rm=TRUE)) %>% | |
arrange(desc(avg_taxi)) | |
### Using group_by with mutate | |
# dplyr is loaded, hflights is loaded with fancy carrier names | |
# Solution to first instruction | |
hflights %>% | |
group_by(UniqueCarrier) %>% | |
filter(!is.na(ArrDelay)) %>% | |
summarise(p_delay = mean(ArrDelay > 0)) %>% | |
mutate(rank = rank(p_delay)) %>% | |
arrange(rank) | |
# Solution to second instruction | |
hflights %>% | |
group_by(UniqueCarrier) %>% | |
filter(!is.na(ArrDelay), ArrDelay > 0) %>% | |
summarise(avg = mean(ArrDelay)) %>% | |
mutate(rank = rank(avg)) %>% | |
arrange(rank) | |
### ADVANCED COMBINATIONS | |
# Which plane (by tail number) flew out of Houston the most times? How many times? adv1 | |
adv1 <- hflights %>% | |
group_by(TailNum) %>% | |
summarise(n = n()) %>% | |
filter(n == max(n)) | |
# How many airplanes only flew to one destination from Houston? adv2 | |
adv2 <- hflights %>% | |
group_by(TailNum) %>% | |
summarise(ndest = n_distinct(Dest)) %>% | |
filter(ndest == 1) %>% | |
summarise(nplanes = n()) | |
# Find the most visited destination for each carrier: adv3 | |
adv3 <- hflights %>% | |
group_by(UniqueCarrier, Dest) %>% | |
summarise(n = n()) %>% | |
mutate(rank = rank(desc(n))) %>% | |
filter(rank == 1) | |
# Find the carrier that travels to each destination the most: adv4 | |
adv4 <- hflights %>% | |
group_by(Dest, UniqueCarrier) %>% | |
summarise(n = n()) %>% | |
mutate(rank = rank(desc(n))) %>% | |
filter(rank == 1) | |
########### CONNECTING TO DATABASES | |
## SQL | |
# set up a src that connects to the mysql database (src_mysql is provided by dplyr) | |
my_db <- src_mysql(dbname = "dplyr", | |
host = "dplyr.csrrinzqubik.us-east-1.rds.amazonaws.com", | |
port = 3306, | |
user = "dplyr", | |
password = "dplyr") | |
# and reference a table within that src: nycflights is now available as an R object that references to the remote nycflights table | |
nycflights <- tbl(my_db, "dplyr") | |
# glimpse at nycflights | |
glimpse(nycflights) | |
# Calculate the grouped summaries detailed in the instructions | |
dbsumm <- nycflights %>% | |
group_by(carrier) %>% | |
summarise(n_flights = n(), avg_delay = mean(arr_delay)) %>% | |
arrange(avg_delay) | |
dbsumm | |
##################################### | |
# Swinburne: R programming # | |
##################################### | |
#### DISTRIBUTIONS AND MODELLING ### | |
##################################### | |
# Methods of FITTING / ESTIMATING | |
# [1] Classical frequentist | |
# [1.1] Least squares | |
# minimise squared residual | |
# Usually by finding point with minimum error = 0 gradient | |
# from solving sets of linear equations Ax = B | |
# [1.2] Maximum likelihood: ****** | |
# given a function that calculates "likelihood," which means | |
# if _data is fixed_ and parameters vary, what is probability. | |
# usually multiply together f(y,theta) for all y | |
# PICTURE the probability parabola across the various thetas | |
# OR sum the log likelihoods | |
# LS vs MLE: are the same if normally distributed error | |
# LS cannot be used for other distributions, eg binomial | |
# ... | |
# *** | |
# The distribution of ERROR and distribution of Y are independant | |
# You can have NORMALLY DISTRIBUTED error and non-normal Y | |
# --> You can use least squares on NON-NORMAL DATA | |
# ... because you may be adding non-normal variables to get | |
# a non-normal outcome... and that's fine. | |
# ==> you'll have normal error | |
# *** | |
# ** Hence it is the distribution of ERROR we need to attend to | |
## Distributions / random variables | |
# Can be univariate or multivariate | |
# Every model requires assumptions about probability distribution of residuals. | |
# We can do Monte Carlo simulations with our distributions | |
# if residuals are normal and relationship between variables is simple additive | |
# --> simple linear modelling | |
## Discrete RVs | |
# Have a set of values, each has a probability | |
X = c(30, 40, 50, 60, 70, 80) | |
P = c(0.3, 0.3, 0.05, 0.05, 0.1, 0.2) | |
# mean = sum(X*P) # sum product of each value * probabilty of value | |
# of in matrix algebra: mean=t(X)%*%P | |
## Continuous RVs | |
# Must integrate across ranges to get probabilities | |
# mean - integrate over x * Px | |
# variance = integrate over (x-mu)^2 * Px | |
### Uniform distribution ### | |
# Every value has same probability | |
runif(10,3,5) # n, min, max | |
x = runif(100000,3,5) # n, min, max | |
x=data.matrix(x) | |
dim(x) | |
hist(x) | |
# compute theoretical mean | |
(1/2) * (3+5) | |
# compute empirical mean | |
mean(x) | |
# var theoretical | |
(1/12)*(5-3)^2 | |
# var emprical | |
var(x) | |
### Normal #### | |
# Generate 10,000 N(0,1) numbers (i.e., standard normal). | |
# no need to specify the mean and variance. | |
X<-rnorm(10000) | |
# Generate 10,000 N(30,102) numbers. | |
X<-rnorm(10000,30,10) | |
# checking the histogram. | |
hist(X) | |
# checking the mean and variance. | |
mean(X) | |
var(X) | |
### Chi-sq ### | |
# is the SUM of multiple SQUARED NORMAL R.V's | |
# = sum of squared normal RVs | |
# So we could simulate ChiSq with 5 degrees of freedom by generating 5 sets of RNORM, squaring and adding | |
#### F-distribution ### | |
# all ANOVAs are F-tests | |
# F is defined with 2 independant chi-square variables | |
# It is result of ratio between the two | |
x=rf(100,5,2) # number, df of chisq 1, df of chisq 2 | |
hist(x) | |
#### t-distribution ### | |
# variance = v/v-2, where v=df | |
# as df --> infinity (because sample gets huge), then t becomes normal | |
# The square of a t is an F | |
#### Distributions for GLM #### | |
## to account for residual distributions | |
## logistic distribution: similar to normal but with fatter tail | |
# generate 10000 logistic random numbers with location being 0 and scale being 1. | |
X<-rlogis(10000, 0, 1) | |
# generate 10000 standard normal random numbers. | |
Y<-rnorm(10000, 0,1) | |
# compare their histograms | |
hist(X, xlim=c(-20,20), col="red") | |
hist(Y, add=T, col="blue") | |
## Exponential | |
# models time BETWEEN events in Poisson proceess | |
# remember not to confuse RATE with MEAN TIME | |
# generate 10000 random exponential numbers from the example | |
X<-rexp(10000, rate = 1/30) | |
hist(X) | |
## Poisson distribution | |
# Describe probability of a number of events occuring in fixed period of time or space. eg crime | |
# commonly used for COUNT data | |
# lambda = expected # of occurences | |
# if you have TOO MANY zeroes may not an alternative | |
# also, sometimes large count data actually does look normal - in that case, use normal! | |
# generate 10000 random Poisson numbers | |
X<-rpois(10000, 1) | |
Y<-rpois(10000, 10) | |
Z<-rpois(10000, 20) | |
# stack their histograms together | |
hist(X, xlim=c(0,50), col="red") | |
hist(Y, add=T, col="blue") | |
hist(Z, add=T, col="green") | |
# check the mean and variance are indeed very close | |
mean(X) | |
var(X) | |
##### ANOVA ##### | |
library(reshape2) | |
data(tips, package = "reshape2") | |
tips | |
tipAnova <- aov(tip~day+time+sex-1,tips) | |
summary(tipAnova) | |
tipAnova$coefficients | |
# Letâs look at the p-value of gender⦠| |
# In this F distribution: | |
# DF1 = 1, DF2 = 239 | |
# R code for obtaining p-value from an F distribution. | |
pf(0.820,1,238,lower.tail=FALSE) | |
# Recall that when DF1 = 1 the F distribution is the same as the square of a t(DF2). | |
# R code for obtaining p-value from a t distribution. | |
pt(0.820^0.5,238,lower.tail=FALSE)*2 | |
####### MATRIX ALGEBRA ############ | |
# week 8 | |
# X is a matrix, m x n | |
# m is rows, n is coloumns | |
# if n=1, column vector | |
# if m=1, row vector | |
# if m=n, square matrix | |
# Transpose = X' | |
# Column Vector | |
X = matrix(c(3,4,7,8,9,10,3,3),8,1) | |
X | |
# Row Vector | |
X = matrix(c(3,4,7,8,9,10,3,3),1,8) | |
X | |
# a (4x2) matrix | |
X = matrix(c(3,4,7,8,9,10,3,3),4,2) | |
X | |
# a (3x3) square matrix | |
X = matrix(c(3,4,7,8,9,10,3,3,1),3,3) | |
X | |
# transpose of X | |
t(X) | |
## Multiplying matrices | |
# must be CONFORMABLE | |
# (axb)*(bxc) --> col dim of 1st matrix must = row dim of 2nd matrix | |
X = matrix(c(3,4,7,8,9,10,3,3,1),3,3) | |
Y = matrix(c(6,8,2,1,2,1,3,2,1),3,3) | |
O = matrix(c(2,2,3),3,1) | |
# X and O are conformable (3x1) | |
X%*%O | |
# O and X are NOT conformable | |
O%*%X | |
# t(O) and X are conformable (1x3) | |
t(O)%*%X | |
# X and Y are conformable | |
X%*%Y | |
# first element of X%*%Y | |
# is sumproduct of row1 of X and column1 of Y | |
sum(X[1,]*Y[,1]) | |
# last element of X%*%Y | |
# is sumproduct of row3 of X and column3 of Y | |
sum(X[3,]*Y[,3]) | |
# YX is NOT equal to XY | |
Y%*%X==X%*%Y | |
## Matrix inverse | |
# X * X' = I, where I is the identity matrix | |
# Identity matrix = 1's along diagonal | |
I = matrix(diag(3),3,3) | |
I | |
# solve() finds the inverse | |
?solve | |
X = matrix(c(3,4,7,8,9,10,3,3,1),3,3) | |
# in R, matrix inverse is done through solve(). | |
XI = solve(X) | |
X%*%XI | |
# if you foolishly did this | |
XI = X^-1 | |
XI | |
# you have inverted every single element in X. | |
# only square matrices have inverse | |
# So many X's aren't square: obs * variables, not square | |
# Hence we have to use X*X to get square | |
#################### REGRESSION MODELLING ########################### | |
# Y = X*Beta + epsilon | |
# Where X is matrix N * K, B is K x 1 | |
# epsilon is (N x 1). The distribution of epison determines | |
# priperties of Y. Here, we assume multivariate normal. | |
# Beta is what we want to estimate: determines relationship | |
# between X and Y | |
data(cars) | |
attach(cars) | |
plot(speed,dist,xlim=c(0,25)) | |
regmodel=lm(dist~speed) # add -1 to remove intercept | |
summary(regmodel) | |
# generate a sequence of xâs⦠| |
x=seq(min(speed),max(speed),1) | |
ypredicted=regmodel$coefficients[1]+regmodel$coefficients[2]*x | |
lines(x,ypredicted,col="red") | |
## Least squares estimation is fundamental method | |
# Residual e_i = Y_i - Yhat_i | |
# = the distance between line and observation point | |
# Now we want to solve to get minimum residuals | |
# see slides: take the derivative of e_i with respect to B0 and B1 | |
# = two simultaneous equations. Solve for both = 0 | |
# --> solve this with matrix algebra | |
# beta = (X'X)^-1(X'Y) | |
data(cars) | |
attach(cars) | |
Y = matrix(dist) | |
dim(Y) | |
one = matrix(rep(1,50)) | |
X = cbind(one,matrix(speed)) | |
dim(X) | |
b = solve( t(X)%*%X ) %*% ( t(X)%*%Y ) | |
b | |
# It is important to recognize that LS | |
# line of best fit is NOT the only option out there. | |
# If you choose a different penalty function, you will | |
# get a different line... | |
# eg sum of residuals instead of sum SQUARE residuals | |
# LS estimators are BLUE: | |
# BLUE: Best Linear Unbiased Estimator. | |
# Under the following assumptions: | |
# The regressors are INDEPENDENT of the residuals. | |
# The regressors are NOT perfectly collinear with each other. | |
# The residuals are homoscedastic. | |
# The residuals are uncorrelated. | |
# ** | |
# We assume *residuals* must be normally distributed | |
# that DOESN'T MEAN that Y must be normally distributed | |
# as long as ERROR is normal, we can use least squares | |
## Electricity example | |
# Y-variable - electricity usage - is normal with some outliers | |
# By looking at residuals, we see there is something happening which we're missing | |
plot(resid(regmodel)) | |
hist(resid(regmodel)) | |
# ... by adding polynomial (square) terms we get a good model | |
####### GENERALISED LINEAR MODELS ########### | |
# if residuals aren't normal, may need a link function | |
# equation of Beta * X stays same, but link funciton applies to Y: g(Y) | |
# precisely: g(mu), where u is *mean* of response variable | |
## THE RHS MUST ALWAYS BE BETA * X --> LINK IS APPLIED TO Y ONLY! | |
?glm | |
# glm(formula, family = gaussian, data, ...) | |
# glm usually fit using maximum likelihood | |
## Family: binomial | |
# link functions logit & probit | |
# think of logit in terms of threshold from 0 to 1, with the latent variable being some scale of willingness that trips the threshold | |
# ln(p/1-p) = Beta * X | |
data <- read.csv("Homicides.csv") | |
attach(data) | |
regmodel_logit <- glm(HomicideHotSpot~SOUTH+UE90+DV90+MA90+PS90+RD90, family=binomial("logit")) | |
summary(regmodel_logit) | |
# Estimates are | |
## Family: Poisson | |
# the count comes from Poisson distribution, where MEAN of distribution | |
# is defined by linear combination of variables (Beta * X) | |
# be careful of overdispersion (variance>mean) [variance and mean should be same] | |
# be carful of zero inflatio (too many 0's) | |
data <- read.csv("ShipWreck.csv") | |
attach(data) | |
data | |
hist(damage) | |
# examine | |
xtabs(~damage+type) | |
xtabs(~damage+construction) | |
xtabs(~damage+operation) | |
plot(damage~months) | |
# notice that we get different counts by cutting data different ways | |
# --> see which vraibles are predictive | |
# ALSO notice that there is a non-linear relationship between damage & month | |
months2 <- months^2 | |
regmodel_Pois <- glm(damage~type+construction+operation+months+months2-1,family=poisson(link="log")) | |
summary(regmodel_Pois) | |
# Note that link function is log! | |
# BEWARE that we have lots of dummy variables --> We don't need INTERCEPT | |
# Coefficients: negative = less likely | |
######## OPTIMISATION ####### | |
# recall that LS = sum of squared residuals, which is a function of betas | |
# so we vary betas and maximise LS | |
# Can do this in any problem that you can express as error = ... | |
# in R, we define a function to optimise, as well as initial values | |
# we use optim() --> MINIMIZES by default | |
data <- read.csv("NavigationExample.csv") | |
attach(data) | |
# First we must specify the objective function | |
fn = function(P) sum( ( D - ( (X-P[1])^2 + (Y-P[2])^2 )^0.5 )^2 ) | |
# next we specify initial values/guesses to the problem | |
initial = c(0,0) | |
# call on optim to _minimize_ sum of squares | |
# the first argument is the initial value | |
# the second argument is the function to be optimized | |
# the third argument selects the numerical search method | |
# we will talk about Hessian later. | |
out = optim(initial, fn, method = "BFGS", hessian = TRUE) | |
# problem SOLVED⦠| |
out$par | |
# if you want to get a TRACE: | |
out = optim(initial, fn, method = "L-BFGS-B", hessian = TRUE,control=list(trace=6,REPORT=1)) | |
###### Maximum likelihood ###### | |
# ML is a type of OPTIMIZATION | |
# ML is ONE option. Sometimes you MUST use ML, other times you can use LS, ..etc | |
# Y' is a vector of observed values | |
# Theta' = vector of parameters (Betas) | |
# Joint density f(Y;Theta) | |
# Interepreted as EITHER: | |
# [1] Conditional on fixed ð¯ it is the probability of sample outcomes ð. | |
# [2] Conditional on sample outcomes ð it is a function of ð¯. | |
# [2] is LIKELIHOOD function | |
# LIKELIHOOD | |
# probability of this data given assumptions = p(data|assumptions) | |
# = likelihood of each observation | |
# We're assuming all observations are INDEPENDANT: so we can multiply together | |
# Given that it is iid, then joint density function = probability of observing this set of values | |
# = product of marginal probability of each observation | |
# = f(y1, y2, y3... ; Theta) = f(y1;theta) * f(y2;theta) * ... * f(yn;theta) | |
# We're looking for values of theta - theta_hat - that maximise likelihood | |
# To make easier, sum of logs | |
# We take derivative of log-likelihood (called "score") and set to zero | |
# Key assumptions of ML estimator [see slides] | |
# Consistent: big sample converges theta_hat to theta | |
# Asymptotically normal: so we can do hypothesis tests | |
# --> we verify this with Hessian matrix | |
# How to do MLE | |
# First, write down PMF or PDF for your problem: binomial, gaussian, etc | |
# Replace outcome variable in PMF (eg y) with linear combination of betas * X's | |
# Work out joint density = product of each observation's likelihood function | |
# Take log to get log-likelihood | |
# Now we have an equation tying linear model to probability of observation | |
# Optim() to find best values of betas | |
# Here is the data | |
# carbon disulphide concentration | |
x = c(1.6907, 1.7242, 1.7552, 1.7842, 1.8113, 1.8369, 1.8610, 1.8839) | |
# beetles death | |
y = c(6, 13, 18, 28, 52, 53, 61, 60) | |
# beetles experimented on | |
n = c(59, 60, 62, 56, 63, 59, 62, 60) | |
# Note that the default of âoptimâ is to minimize⦠| |
# so we simply reverse the sign of the log likelihood function derived eariler⦠| |
fn = function(B) sum( -y*(B[1]+B[2]*x) + n*log(1+exp(B[1]+B[2]*x)) - lchoose(n, y) ) | |
out = optim(c(-50,20), fn, method = "BFGS", hessian = TRUE) | |
# Hessian matrix | |
# Recall that the inverse of the negative of the Hessian | |
# matrix approximates the variance-covariance matrix of the | |
# estimated coefficient vector (in a minimization problem you | |
# donât have to take the negative of the Hessian). | |
# use solve() to get inverse | |
# Don't care about off-diagonals, because those are the COVARIANCES | |
# Covariances may be interesting if want to understand relationships between variables | |
sqrt(diag(solve(out$hessian))) | |
# [1] 5.180705 2.912137 | |
# These values are STANDARD ERRORS of the ESTIMATES | |
# Therefore, the t-stat for estimated _ð½_1_ is: | |
# 34.27037 / 2.912137 = 11.76 | |
## ML with Gaussian | |
# We assume residuals are normally distributed | |
# Assume Gaussian pdf | |
# (x-mu)^2: means we expect x is normally distributed around mu | |
# for estimation, x is the DEPENDANT variable (y) | |
# mu is our linear combination of variables | |
# WHAT ABOUT SIGMA (VARIANCE)? WE MUST ESTIMATE IT TOO! | |
# include an unknown for sigma in likelihood function | |
lnL <- (n/2)*log(2*3.1415926)+(n/2)*log(p[1])+(1/(2*p[1]))*(t(y-X%*%p[2:4])%*% (y-X%*%p[2:4])) | |
out <- optim(c(rep(1,4)), lnL, hessian=TRUE, method = "BFGS") | |
# estimate / standard error = t-statistic | |
##################################### | |
############ CODE LIBRARY ########## | |
##################################### | |
### highlights to remember | |
# plyr functions: mutate (create new variables), slice, filter... | |
# tapply - to apply a function by group variable | |
## data for playing with | |
data(cars) | |
data(ChickWeight) | |
##### Working with data & structures [week 2] | |
## input/output | |
read.table(filename, header=T, sep=",") | |
# CSVs | |
read.csv(file.choose(), header=T) | |
read.table(file.choose(),header=T, sep=”,”) | |
read.delim(file.choose(), header=T,sep=",") | |
## data types & structures | |
head(tablename) | |
dim(tablename) | |
rownames() | |
colnames() | |
head() # from plyr/dplyr | |
## variable names | |
names() # to get contents of non-dataframe object - eg contents of linear model object | |
rename(tablename, newvariablename = oldvariablenane) # plyr | |
## filtering & subsetting (with PLYR) | |
# filter rows | |
# from plyr or dplyr | |
filter(tablename, Time==0) | |
filter(tablename, Chick>1, Time==0) # multiple filters | |
# slice rows | |
slice(tablename, 1:20) # take first 20 rows | |
# order rows | |
arrange(tablename, time, desc(weight)) # sort by time THEN by weight (descending) | |
# select variables | |
select(tablename, variable1, variable2, ...) | |
select(tablename, -variable1) # drop variable | |
# also mutate... | |
## merging, binding, etc | |
# combine vectors | |
rbind() # rowbind = combine horizontally | |
cbind() # column bind = combine vertivally | |
######### DERIVING variables | |
### APPLYing functions | |
apply(x,1,sum) # sum across rows | |
apply(x,2,sum) # sum down columns | |
# apply by group variable | |
tapply(scores, sex, mean) # get mean score by sex | |
### MUTATING (plyr) | |
mutate(tablename, newvariable = oldvar1 + oldvar2) | |
####### Descriptive statistics [week 2] ######### | |
## crosstab | |
table(gender) | |
xtab(~gender+smoker) # or: table(gender,smoker) | |
table(x,y) | |
# SAME AS | |
xtabs(~x+y) | |
## Describing | |
library(psych) | |
describe(tablename) | |
describe.by(tablename, tablename$gender) | |
## summarising | |
###### PLOTTING [week 03] | |
## histogram | |
hist(cars$speed, main="Distribution of Car Speeds", | |
xlab="Speed (mph)") | |
# use breaks to change bar width | |
hist(cars$speed, breaks=10) | |
## boxplot | |
boxplot(ChickWeight$weight) | |
boxplot(ChickWeight$weight~ChickWeight$Diet, xlab="Diet", ylab="Weight (grams)") | |
## scatterplot | |
plot(cars$speed,cars$dist) | |
plot(cars$speed,cars$dist, pch=3, col="red") | |
## line charts | |
lines(cars$speed,cars$dist) | |
# change xlims | |
plot(cars$speed,cars$dist,xlim=c(0,30)) | |
# add vertical or horizontal lines | |
plot(cars$speed,cars$dist,pch=x) | |
abline(h=40) | |
abline(v=10,lty=2) | |
# legend | |
legend(5,110,c("First Half", "Second Half"),pch=1:2) | |
### par() for paremeters | |
## par() for multiple plots | |
# grid of 4 plots: | |
par(mfrow = c(2, 2)) | |
plot(cars$speed,cars$dist,pch=x) | |
plot(cars$speed,cars$dist) | |
hist(cars$speed) | |
hist(cars$dist) | |
## par() for direction of labels | |
nr.prof <-c(prof.pilots=16,lawyers=11,farmers=10,salesmen=9,physicians=9, | |
mechanics=6,policemen=6,managers=6,engineers=5,teachers=4, | |
housewives=3,students=3,armed.forces=1) | |
par(las = 3) | |
barplot(rbind(nr.prof)) | |
##### GGPLOT2 | |
## qplot = quick plots | |
# qplot histogram | |
qplot(cars$dist) | |
# qplot scatter | |
qplot(cars$speed,cars$dist) | |
# qplot gradient | |
qplot(mtcars$wt, mtcars$mpg, colour=mtcars$cyl) | |
## ggplot layering | |
# geom_point()for scatter/dot plots | |
# geom_line()for line graphs | |
# geom_bar() for bar charts | |
# geom_boxplot() for boxplots | |
# ggplot histogram | |
ggplot(data=diamonds)+geom_histogram(aes(x=carat)) | |
# ggplot density | |
ggplot(data=diamonds)+geom_density(aes(x=carat),fill="grey50") | |
## facet_grid to create grid of 4 plots | |
ggplot(CO2, aes(conc, uptake)) + | |
geom_point() + | |
facet_grid(Treatment ~ Type) + | |
stat_summary(fun.y = mean, colour = "red", geom = "line") | |
## ggthemes | |
# wall street journal theme | |
library(ggthemes) | |
g2<-ggplot(diamonds, aes(x=carat,y=price))+ | |
geom_point(aes(color=color)) | |
g2+theme_wsj | |
## maps with ggplot | |
library(maps) | |
nz <- map_data("nz") | |
nzmap <- ggplot(nz, aes(x=long, y=lat, group=group)) + | |
geom_polygon(fill="white", colour="black") | |
###### INFERENCE AND DISTRIBUTIONS [week 4] | |
# distribution functions - eg. norm | |
# q - for quartile (eg. 0.975) | |
# r - for random number | |
# p for probability of a value (eg. 1.96) = area under curve to that point (integral) | |
# d for density | |
# dnorm(x, mean = 0, sd = 1, log = FALSE) | |
# pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) | |
# qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) | |
# rnorm(n, mean = 0, sd = 1) | |
# work with vectors too | |
qnorm(0.975) | |
# 1.96 | |
pnorm(1.96) | |
# 0.975 | |
rnorm(5,0,1) | |
# 5 values | |
dnorm(1.96) | |
# confidence intervals | |
phat=.4 | |
n=120 | |
ME=qnorm(.975)*sqrt(phat*(1-phat)/n) | |
CI=c(phat-ME,phat+ME) | |
# ttests | |
# remember to think of null hypothesis | |
# one sample t-test | |
# H0: mean of speed = 10 | |
# H1: mean of speed <> 10 | |
t.test(speed,mu=10) | |
# compare sample t-test | |
# H0: Male = female | |
# H1: male <> female | |
scores=rnorm(50,20,5) | |
sex=rep(c("Male", "Female"),c(25,25)) | |
boxplot(scores~sex) | |
t.test(scores~sex) | |
# paired samples t-test | |
before.scores=rnorm(50,15,10) | |
after.scores=rnorm(50,25,10) | |
t.test(before.scores,after.scores,paired=TRUE) | |
########## ANOVA | |
# A good one-way ANOVA example can be found at: http://www.stat.wmich.edu/wang/664/egs/Rrust.html | |
# A very good summary of one and two-way ANOVAs with R can be a found at: http://www.statmethods.net/stats/anova.html | |
# one-way anova example | |
boys=rnorm(50,mean=128,sd=14) | |
girls=rnorm(50,mean=130,sd=14) | |
aliens=rnorm(50,mean=75,sd=14) | |
all.heights=c(boys,girls,aliens) | |
type=rep(c("Boy","Girl","Alien"),rep(50,3)) | |
type=factor(type) | |
boxplot(all.heights~type) | |
myanova=aov(all.heights~type) | |
summary(myanova) | |
layout(matrix(c(1,2,3,4),2,2)) | |
plot(myanova) | |
# this is the same as lm | |
mylmanova=lm(all.heights~type) | |
anova(mylmanova) | |
# use TUKEY to get ADJUSTED multiple comparisons | |
# adjusted = adjusted because we are doing many tests simultaneously | |
# http://en.wikipedia.org/wiki/Multiple_comparisons_problem | |
TukeyHSD(myanova) | |
# 2-way ANOVA example | |
data(ToothGrowth) | |
attach(ToothGrowth) | |
supp=factor(supp) | |
dose=factor(dose) | |
toothanova=aov(len~supp*dose) | |
summary(toothanova) | |
####### REGRESSION | |
# simple lm | |
data(cars) | |
plot(cars$speed,cars$dist) | |
regmodel=lm(cars$dist~cars$speed) | |
summary(regmodel) | |
lines(cars$speed,regmodel$fitted) | |
confint(regmodel) # gives confidence interval and slope | |
# transformations - use the I() | |
lm(y~x+I(x^2)) | |
# GLM - can use various links: | |
glm(counts ~ outcome + treatment, family=poisson()) | |
# binomial(link = "logit") | |
# gaussian(link = "identity") | |
# Gamma(link = "inverse") | |
# inverse.gaussian(link = "1/mu^2") | |
# poisson(link = "log") | |
# quasi(link = "identity", variance = "constant") | |
# quasibinomial(link = "logit") | |
# quasipoisson(link = "log") | |
###### FUNCTIONS [week 5] | |
# simple example | |
myfirstfunctionwithinput=function(input) | |
{ | |
print(input) | |
} | |
myfirstfunctionwithinput("I am getting better at functions") | |
# [1] "I am getting better at functions" | |
myfirstfunctionwithinput=function(input1, input2) | |
{ | |
print(input1) | |
print(input2) | |
} | |
myfirstfunctionwithinput(input1=1,input2=2) | |
# [1] 1 | |
# [1] 2 | |
# DEFAULT values | |
meanormedian=function(variable, stat=”mean”) | |
{ | |
# a function that calculates either mean or median of a vector | |
if (stat=="mean") | |
{ | |
return(mean(variable)) | |
} | |
else | |
{ | |
return(median(variable)) | |
} | |
} | |
# IF statements | |
meanormedian=function(variable, stat) | |
{ | |
# a function that calculates either mean or median of a vector | |
if (stat=="mean") | |
{ | |
return(mean(variable)) | |
} | |
else | |
{ | |
return(median(variable)) | |
} | |
} | |
# FOR loops | |
addsomenumbers1=function(n) | |
{ | |
#returns the sum of the numbers from 1 to n | |
total=0 | |
for (i in 1:n) | |
{ | |
total=total + i | |
} | |
return(total) | |
} | |
addsomenumbers1(100) | |
# [1] 5050 | |
# WHILE loops | |
addsomenumbers2=function(n) | |
{ | |
#returns the sum of the numbers from 1 to n | |
i=1 | |
total=0 | |
while (i<=n) | |
{ | |
total=total+i | |
i=i+1 | |
} | |
return(total) | |
} | |
# RECURSIOn | |
fibo=function(n) | |
{ | |
#function that returns the nth Fibonacci number | |
if (n<2) | |
{ | |
return(n) | |
} | |
else | |
{ | |
return(fibo(n-1)+fibo(n-2)) | |
} | |
} | |
############## SHINY ################### | |
http://shiny.rstudio.com/gallery/ | |
install.packages('shiny', lib="c:\\notbackedup\\") | |
library(shiny, lib="c:\\notbackedup\\") | |
runApp("C:\\NotBackedUp\\R\\shiny\\") | |
# | |
## DIAGRAMS | |
install.packages('diagram') | |
library(diagram) | |
names <- c(".Rnw", ".tex", ".pdf") | |
connect <- c(0, 0, 0, | |
"knitr", 0, 0, | |
0, "pdflatex", 0) | |
M <- matrix(nrow=3, ncol=3, byrow=TRUE, data=connect) | |
pp <- plotmat(M, pos=c(1, 2), name=names, box.col="orange") | |
###### MISC ########## | |
## Generating random datasets | |
# ------------------------------- # | |
# R TUTORIAL CODE # | |
# SESSION 1 # | |
# ------------------------------- # | |
## Meta | |
# use # for comments | |
# use CTRL-ENTER to run a single line or highlighed bit of code | |
# ------------------------------- # | |
# CHAPTER 1: working with basic data objects # | |
# ------------------------------- # | |
## [1.1] simple Variables | |
a = 1 | |
a | |
a <- 1 # <- arrow is same as = | |
a | |
a == 1 # use double arrows for boolean equals (more later) | |
b = 2 | |
a + b | |
a * b | |
c = a + b^2 | |
c | |
d = 1.0 | |
str(d) # structure of object: number, string, ... | |
help(str) # help on any function | |
??structure # search for help on a term --> Google is better | |
# unless you know what you're looking for | |
string # if you make an error, R will tell you in red | |
e = 'string' | |
str(e) | |
paste('hello','world') # join two strings | |
help(paste) | |
paste('hello','world', sep=", ") | |
## Meta | |
# R will interpret end of line as end of command | |
# *except* where it a line is left incomplete, | |
# for example it ends in a + or an unclosed bracket | |
a = | |
2 | |
(1 + 7 + | |
19 + | |
12) | |
# ....................................... # | |
# - EXERCISE - # | |
# caculate 57 * 43 | |
# then calculate the square of the result | |
# ....................................... # | |
# ------------------------------- # | |
## [1.2] Lists | |
c('a','b','c') # Combine elements into a vector or list | |
c(1, 2, 3) # Combine elements into a vector or list | |
f = c(1,2,'word',4) # any list with at least 1 string = all strings | |
f | |
# ------------------------------- # | |
## [1.3] Vectors = a list of numbers OR a one dimensional matrix | |
# Also think of this like a column OR a row from a table - it's all the same to R | |
f = c(1,2,3,4) | |
f | |
0:4 # quick list of integers | |
10:0 # reverse | |
seq(1,50,5) # sequence of number from x to y in steps of size z | |
# VECTORISATION | |
# operations can be applied to vectors in one hit = 'vectorisation' | |
g = 1:10 | |
4*g | |
g/2 | |
h = g^2 | |
plot(g,h) | |
# A vector can be a column from a table or a row from a table, R is indifferent. | |
# R has a huge library of functions which you typically apply to vectors. | |
# Starting with the basics: | |
sum(g) | |
mean(g) | |
median(g) | |
summary(g) | |
# through to nonparametric tests... | |
x = c(4,7,10,15) | |
y = c(1,2,3,5) | |
wilcox.test(y,x) | |
# And anything parametric you might want, including tests... | |
pbinom(q=2, size=10, p=0.5 ) # binomial p-value for getting 2 success in 10 trials | |
# ...and density functions... | |
n_successes = 0:10 | |
prob_successes = dbinom(n_successes, size=10, p=0.5) # distribution for 10 trials with p=0.5 | |
plot(n_successes, prob_successes, type='l') | |
# ... and so on | |
## META | |
# Data in R lives in active memory (RAM), enabling 'vectorisation' | |
# Contrast this with SAS, which opens datasets and reads line-by-line, | |
# hence why PROC SORT exists | |
# - EXERCISE - # | |
# create a vector of numbers 5 through 10 | |
# Multiply, elementwise, the numbers in the vector by 3.2 | |
# Calculate the mean & median of the vector | |
# ....................................... # | |
# ------------------------------- # | |
# CHAPTER 2: Data frames # | |
# ------------------------------- # | |
# Flexible tables of data, like Excel or SAS tables | |
# Can still be vectorised and treated like matrices | |
## creating a dataframe by IMPORTING DATA from csv | |
# NOTE double slashes in paths | |
## Getting data out of text fils, eg. CSV | |
# for different delimiters, use sep="XXX". | |
# eg. pipe-delimited: sep="|" | |
# eg. tab-delimited: sep="\t" | |
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\" | |
FileName = 'baseball_train.csv' | |
File = paste(FileLoc,FileName,sep="") | |
print(File) | |
TBL_BASEBALL <-read.delim(File,header=TRUE,sep=",") | |
# look at it in workspace | |
TBL_BASEBALL # view the table in log | |
dim(TBL_BASEBALL) # dimensions | |
colnames(TBL_BASEBALL) # column names | |
summary(TBL_BASEBALL) # quick summary | |
str(TBL_BASEBALL) # summary of structure | |
# ------------------------------- # | |
# [2.1] Working with dataframes: basic slicing & dicing | |
# ------------------------------- # | |
# in R, we access 'cells' of data using [row, column] indexing | |
TBL_BASEBALL[1,1] | |
TBL_BASEBALL[2,3] | |
TBL_BASEBALL[1,] # access entire row --> a vector | |
TBL_BASEBALL[,1] # access entire column --> a vector | |
mean(TBL_BASEBALL[,1]) # works like the vectors we saw above | |
TBL_BASEBALL[,1:2] # select columns 1 & 2 | |
TBL_BASEBALL[1:5,] # select rows 1 through 5. | |
# remember? 1:5 is the same as c(1,2,3,4,5) | |
# So you can pass R a list as an index: | |
TBL_BASEBALL[c(1,7,99),] # use a list as an index | |
# use the [rows,cols] syntax to do both at once: | |
TBL_BASEBALL[1:2,6:8] # rows 1 & 2 with columns 6,7,8 | |
## selecting columns by name | |
colnames(TBL_BASEBALL) | |
TBL_BASEBALL[,"salary"] # name a single column | |
TBL_BASEBALL[,c("salary","n_runs","n_walks")] # use a LIST of columns | |
cols = c("salary","n_runs","n_walks") | |
TBL_BASEBALL[,cols] | |
# Lazy '$' syntax works if you only want a single column | |
TBL_BASEBALL$salary | |
# subset() syntax can be used if it less confusing for you: | |
subset(TBL_BASEBALL, select=c('salary','n_runs','n_hits')) | |
## combine with row slicing: | |
TBL_BASEBALL[27:32,c('n_runs','n_hits')] | |
# Once more, we can apply any R functions to sliced chunks of data | |
# Many functions can handle chunks just as well as vectors: | |
sum(TBL_BASEBALL[27:32,c('n_runs','n_hits')]) | |
### Now that we can slice & dice, let's use it to get to know BASEBALL data | |
pairs(TBL_BASEBALL) # generic R matrix scatterplot | |
## Too many variables... Let's look at fewer | |
cols = c('salary','batting_avg','n_runs','n_hits','n_homeruns') | |
cols | |
pairs(TBL_BASEBALL[,cols]) | |
# More useful scatterplot matrix: | |
install.packages('ggplot2') # ggplot2 plotting library [if your first install, you may be asked to choose a CRAN server] | |
install.packages('GGally') # Companion tools for ggplot2 | |
library(ggplot2) | |
library(GGally) | |
ggpairs(data=TBL_BASEBALL[cols], | |
upper = list(continuous = "cor"), | |
lower = list(continuous = "smooth"), | |
diag = list(continuous="density"), | |
axisLabels='show') | |
# [2] Move to a higher level of abstraction: correlation heatmap | |
z <- cor(TBL_BASEBALL) # correlation matrix with one command :-) | |
z | |
install.packages('lattice') | |
library(lattice) | |
levelplot(z) # default is hideous | |
col.l <- colorRampPalette(c('blue','red','orange','yellow'))(30) | |
levelplot(z, col.regions=col.l, scales=list(x=list(rot=90, cex=0.8))) | |
# ....................................... # | |
# - EXERCISE - # | |
# select only columns 'n_runs' and 'n_errors' from baseball data | |
# do a summary() on baseball data that includes only columns 'n_runs' and 'n_errors' | |
# ....................................... # | |
## creating a dataframe by IMPORTING DATA from Excel | |
install.packages('RODBC') | |
library(RODBC) | |
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\" | |
FileName = 'branch profiles.xls' | |
File = paste(FileLoc,FileName,sep="") | |
print(File) | |
channel <- odbcConnectExcel(File) | |
TBL_BRANCHES <- sqlFetch(channel, "Sheet1",as.is=TRUE) | |
odbcClose(channel) | |
# look at it | |
TBL_BRANCHES | |
dim(TBL_BRANCHES) | |
colnames(TBL_BRANCHES) | |
summary(TBL_BRANCHES) | |
str(TBL_BRANCHES) | |
# ------------------------------- # | |
## [2.2] working with dataframes: boolean slicing & dicing | |
# ------------------------------- # | |
##### using logical conditions to select things | |
# EQUALS is == | |
# OR is | | |
# NOT is ! | |
# AND is & | |
# NOT EQUALS is != | |
# logical conditions work by creating an index then applying that to dataset | |
# , eg.: | |
TBL_BRANCHES['n_custs']>5000 # returns list of TRUE vs FALSE for each row | |
# --> use that list as an index | |
TBL_BRANCHES[TBL_BRANCHES['n_custs']>30000,] | |
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5,] | |
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5,c('Branch name','Mean_monthly_net_income','mean_A_RAR')] | |
# can write your index lists into variables if it is clearer | |
bigbranches = TBL_BRANCHES['n_custs']>30000 | |
TBL_BRANCHES[bigbranches,] | |
# boolean conditions create the same list of TRUE/FALSE for indexing: | |
TBL_BRANCHES['pct_migrant']>0.5 & TBL_BRANCHES['State']=='NSW' | |
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5 & TBL_BRANCHES['State']=='NSW',] | |
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5 | TBL_BRANCHES['pct_student']>0.5,] # OR | |
# negations - ! can go before an entire condition, or as part of a != | |
TBL_BRANCHES[TBL_BRANCHES['State']!='NSW',] # all states NOT NSW | |
TBL_BRANCHES[!TBL_BRANCHES['pct_migrant']>0.5, ] # every record BUT where pct_migrant > 50% | |
# You can use %in% like SQL 'in' to match to a list of conditions | |
TBL_BRANCHES[,'State'] %in% c('SA','WA') | |
TBL_BRANCHES[TBL_BRANCHES[,'State'] %in% c('SA','WA'),] | |
## Alternative syntax using subset() may be easier for some | |
subset(TBL_BRANCHES, State == 'VIC') | |
subset(TBL_BRANCHES, State == 'VIC' & pct_migrant>0.5) | |
subset(TBL_BRANCHES, State == 'VIC' | pct_migrant>0.5) | |
subset(TBL_BRANCHES, Location %in% c('Regional','Remote')) | |
### now that we can do boolean slicing & dicing, let's use it | |
### to get to know BRANCHES data better | |
# Map it | |
install.packages('ggmap') # package for query google maps & plot with ggplot2 | |
library(ggmap) | |
qmap('Australia', zoom=4, maptype='satellite') # query google maps | |
qmap('Australia', zoom=4, maptype='terrain') | |
# download map into 'mapAus' variable | |
mapAus <- qmap('Australia', zoom=4, maptype='terrain', color="bw") | |
# Overlay points onto mapAus using TBL_BRANCHES data | |
mapAus + geom_point(aes(x=Long, y=Lat), color='red', data=TBL_BRANCHES, alpha=0.5) | |
# Overlay points where size is proportional to N_CUSTS | |
mapAus + geom_point(aes(x=Long, y=Lat, size=n_custs), color='red', data=TBL_BRANCHES, alpha=0.5) | |
## let's use boolean slicing: do a map of high % Affluent 50+ around Melbourne | |
affluentbranches = TBL_BRANCHES['pct_affluent_50+']>0.1 | |
# Get map centered on Melbourne | |
MyMap <- qmap(location = 'Melbourne, Australia', zoom=9, maptype='terrain',color="bw") | |
# plot migrant branches, scale colour to mean RAR | |
MyMap + | |
geom_point(data=TBL_BRANCHES[affluentbranches,], aes(x=Long, y=Lat, colour=as.numeric(mean_A_RAR)), alpha=0.7, size=5) + | |
scale_colour_gradient(low="blue", high="red",name='Mean RAR') | |
# ....................................... # | |
### EXERCISE | |
# Select all branches with >20% affluent 50+ | |
# get average customer RAR for all branches in VIC | |
# ....................................... # | |
# ------------------------------- # | |
# CHAPTER 3: R LIBRARIES | |
# ------------------------------- # | |
## R Libraries make it easy to do stuff | |
# C-RAN: http://cran.r-project.org/ | |
# packages must be downloaded once (this will also update them): | |
# then called in a given script you are using them for: | |
# you can see packages & versions in package tab of rstudio | |
## wordcloud | |
install.packages("wordcloud") # download package | |
library(wordcloud) # call package into memory | |
text = "There are five main ways water vapor can be added to the air. | |
Increased vapor content can result from wind convergence over water or | |
moist ground into areas of upward motion. Precipitation or virga falling | |
from above also enhances moisture content. Daytime heating causes water | |
to evaporate from the surface of oceans, water bodies or wet land. | |
Transpiration from plants is another typical source of water vapor. | |
Lastly, cool or dry air moving over warmer water will become more humid. | |
As with daytime heating, the addition of moisture to the air increases | |
its heat content and instability and helps set into motion those processes | |
that lead to the formation of cloud or fog" | |
wordcloud(text,min.freq=1,random.order=FALSE) | |
# make it look prettier (?) with colours | |
library(RColorBrewer) | |
pal <- brewer.pal(8,"Dark2") | |
wordcloud(text, scale=c(0.3,2), min.freq=1, | |
colors=pal, rot.per=0, fixed.asp=FALSE, | |
vfont=c("gothic english","plain")) | |
## randomForest | |
install.packages("randomForest") | |
library(randomForest) | |
rf <- randomForest(TBL_BASEBALL[,c('n_homeruns','batting_avg','n_errors')], | |
TBL_BASEBALL$salary, # target variable | |
n_tree=500, # 500 trees | |
mtry=2) # try 2 variables (at random) for each split | |
rf | |
importance(rf) | |
TBL_BASEBALL$yhat = predict(rf, TBL_BASEBALL) # predict function | |
plot(TBL_BASEBALL$salary,TBL_BASEBALL$yhat) | |
## sqldf - use SQL to operate on dataframes | |
install.packages('sqldf') | |
library('sqldf') | |
sqldf(' select State | |
, count(*) | |
from TBL_BRANCHES | |
group by State') | |
## Other neat libraries | |
# Interactive charts with Shiny: http://www.rstudio.com/shiny/ | |
# twittR | |
# RODBC - for ODBC connections | |
# XML - for webpage scraping | |
# ....................................... # | |
# - EXERCISE - # | |
# download and install wordcloud library using install.packages() | |
# then make it active with library() | |
# make a wordcloud from a piece of text | |
# if necessary, help(wordcloud) | |
# ....................................... # | |
# ------------------------------- # | |
# CHAPTER 4: PLOTTING | |
# ------------------------------- # | |
# generic R plotting is drab but quick | |
plot(TBL_BASEBALL$batting_avg, TBL_BASEBALL$salary) | |
# ggplot2 library is awesome: http://docs.ggplot2.org/current/ | |
install.packages('ggplot2') | |
library(ggplot2) | |
# build plots logically by adding layers | |
p <- ggplot(TBL_BASEBALL, aes(x=batting_avg, y=salary)) # create a blank canvas in 'p' | |
p <- p + geom_point() # add simple points | |
p | |
p <- p + geom_point(aes(colour = salary), size=5, alpha=0.5 ) | |
p # aes statement defines how data links to plot elements | |
# you can go on and on adding layers and tweaking settings... | |
p <- p + stat_density2d(colour='red') | |
p | |
p <- p + scale_colour_gradient(low = "green", high="yellow") | |
p | |
p <- p + theme_bw() | |
p | |
# ... can layer up endlessly. lines, points, contours; sized, coloured, shaped by variables.. | |
# various add-on libaries for all sorts of charts, eg GGally has scatterplot matrix | |
###### histogram / density example | |
p <- ggplot(TBL_BRANCHES, aes(mean_A_RAR)) | |
p + geom_histogram() | |
# split by location | |
p <- ggplot(TBL_BRANCHES, aes(mean_A_RAR, fill = Location)) | |
p + geom_histogram() # not so good | |
p <- p + geom_density(alpha=0.2) | |
p | |
p <- p + theme_bw() | |
p | |
# use facet_grid to break plots up by a categorical variable | |
# syntax: rows ~ columns | |
p + facet_grid(Location ~ .) | |
p + facet_grid(Location ~ State) | |
### Maps + ggplot2 = lots of possibilities | |
library(ggmap) | |
MyMap <- qmap(location = 'Niddrie, Australia', zoom=10, color='bw') | |
MyMap + | |
geom_point(aes(x=Long, y=Lat, size=as.numeric(n_custs), | |
colour=mean_A_RAR), alpha=0.8, data=TBL_BRANCHES) + | |
scale_size_continuous(range = c(2,10), name='# Custs') + | |
scale_colour_gradient(low="blue", high="red",name='Mean $ RAR') | |
# using a different map source: | |
MyMap <- qmap(location = 'Niddrie, Australia', zoom=10, source = 'osm') | |
MyMap + | |
geom_point(aes(x=Long, y=Lat, size=as.numeric(n_custs), | |
colour=mean_A_RAR), alpha=0.8, data=TBL_BRANCHES) + | |
scale_size_continuous(range = c(2,10), name='# Custs') + | |
scale_colour_gradient(low="blue", high="red",name='Mean $ RAR') | |
## Map of branches coloured by cluster | |
TBL_BRANCHES_VIC = subset(TBL_BRANCHES, State == 'VIC') | |
MyMap <- qmap(location = 'Melbourne, Australia', zoom=10, source = 'osm', color='bw') | |
MyMap + | |
geom_point(aes(x=Long, y=Lat, colour=as.factor(Cluster)), size=5, | |
show_guide = FALSE, data=TBL_BRANCHES_VIC) | |
# add border to points, make colour gradient have more contrast: | |
cols = rainbow(100) # take 100 colours from rainbow | |
cols = sample(cols, 100) # use sample to get them in a random order | |
MyMap + | |
geom_point(aes(x=Long, y=Lat, colour=as.factor(Cluster)), size=5, | |
show_guide = FALSE, data=TBL_BRANCHES_VIC) + | |
geom_point(aes(x=Long, y=Lat), size=5, pch=21 , data=TBL_BRANCHES_VIC) + | |
scale_colour_manual(values=cols) | |
# See list of inbuilt spectrums here: http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/ | |
# Kate also created this ANZ colourset: | |
ANZcolours <- c( | |
"#B2B2B2", | |
"#DDDDDD", | |
"#CCE5F5", | |
"#66B2E0", | |
"#007FCC", | |
"#002B3C", | |
"#D7E7DE", | |
"#E6E1B6", | |
"#F2F285", | |
"#B2A58D", | |
"#87E10C", | |
"#F2AF00", | |
"#F27300", | |
"#BF3500") | |
pie(rep(1,14), col=ANZcolours) | |
# ... ggplot is awesome | |
# virtually any element of the plot can be customised | |
# See galleries of examples at http://docs.ggplot2.org/current/ | |
#### Final awesome example: Houston crime data density | |
str(crime) # the 'crime' dataset is contained in the ggmap library | |
# limit to violent crimes | |
violent_crimes <- subset(crime, offense != "auto theft" & | |
offense != "theft" & offense != "burglary") | |
HoustonMap <- qmap("houston", zoom = 14, color = "bw") | |
theme_set(theme_bw(16)) # changes default plot theme for ggplot2 | |
HoustonMap + | |
stat_density2d(aes(x = lon, y = lat, fill = ..level.., alpha = ..level..), | |
bins = 5, geom = "polygon", | |
data = violent_crimes, show_guide = FALSE) + | |
scale_fill_gradient(low = "black", high= "red", guide=FALSE) + | |
facet_wrap(~ day) | |
theme_set(theme_grey()) # revert to default ggplot2 theme | |
# ....................................... # | |
# - EXERCISE - # | |
# [1] Scatterplot of branch customer FUM vs RAR | |
# hint: 1st try plot(x_values, y_values) | |
# next, try ggplot: | |
# p <- ggplot(data=TABLENAME, aes(x=X_VALUES, y=Y_VALUES)) | |
# p <- p + geom_point() | |
# p | |
# [2] Change the scatterplot so branches are coloured by Location | |
# --> insert colour=VARIABLE into the ggplot aes statement | |
# [3] Make it look prettier | |
# Change the size by adding size=4 into the geom_point() statement | |
# Change the transparency by adding alpha=0.4 into the geom_point() statement | |
# Change the colour to be a function of the MOSAIC_prosperity | |
# .. and so on. look at examples at http://docs.ggplot2.org/current/ and tinker | |
# ---------------------------------------------------------------- # | |
# ---------------------------------------------------------------- # | |
# ------------------------------- # | |
# R TUTORIAL CODE # | |
# SESSION 2 # | |
# ------------------------------- # | |
# refresher on what we did last week | |
# vectors, variables, functions | |
f = c(1,2,3,4) | |
sum(f) | |
# importing data from csv | |
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\" | |
FileName = 'baseball_train.csv' | |
File = paste(FileLoc,FileName,sep="") | |
print(File) | |
TBL_BASEBALL <-read.delim(File,header=TRUE,sep=",") | |
# importing data from Excel | |
install.packages('RODBC') | |
library(RODBC) | |
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\" | |
FileName = 'branch profiles.xls' | |
File = paste(FileLoc,FileName,sep="") | |
print(File) | |
channel <- odbcConnectExcel(File) | |
TBL_BRANCHES <- sqlFetch(channel, "Sheet1",as.is=TRUE) | |
odbcClose(channel) | |
# dataframes, slicing & dicing | |
colnames(TBL_BASEBALL) | |
summary(TBL_BASEBALL) | |
TBL_BASEBALL[1:5,] # select rows 1 through 5. | |
TBL_BASEBALL[,c("salary","n_runs","n_walks")] # use a LIST of columns | |
TBL_BRANCHES[TBL_BRANCHES['pct_migrant']>0.5,] # boolean indexing | |
## libraries | |
install.packages("wordcloud") # download package | |
library(wordcloud) # call package into memory | |
text = "There are five main ways water vapor can be added to the air. | |
Increased vapor content can result from wind convergence over water or | |
moist ground into areas of upward motion. Precipitation or virga falling | |
from above also enhances moisture content. Daytime heating causes water | |
to evaporate from the surface of oceans, water bodies or wet land. | |
Transpiration from plants is another typical source of water vapor. | |
Lastly, cool or dry air moving over warmer water will become more humid. | |
As with daytime heating, the addition of moisture to the air increases | |
its heat content and instability and helps set into motion those processes | |
that lead to the formation of cloud or fog" | |
wordcloud(text,min.freq=1,random.order=FALSE) | |
## plotting & mapping with ggplot2 | |
install.packages('ggplot2') | |
library(ggplot2) | |
p <- ggplot(TBL_BRANCHES, aes(mean_A_RAR, fill = Location)) | |
p <- p + geom_density(alpha=0.2) | |
p <- p + theme_bw() | |
p + facet_grid(Location ~ State) | |
# on to new stuff... | |
# ------------------------------- # | |
# [2.3] aggregating data - like PROC SUMMARY | |
aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State), FUN=mean) | |
# multiple variables | |
aggcols = c('mean_FUM','mean_n_prods','mean_A_RAR') | |
aggregate(TBL_BRANCHES[aggcols], by=list(TBL_BRANCHES$State), FUN=mean) | |
# multiple functions | |
aggregate(TBL_BRANCHES[aggcols], by=list(TBL_BRANCHES$State), | |
FUN=function(x) c(mn = mean(x), md = median(x) )) | |
# custom aggregate functions - anything that would work on a vector | |
aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State), | |
FUN=function(x) interest=sum(x*0.18/12) ) | |
# we can simplify syntax by loading formula into a variable | |
# ** virtually anything can be loaded into a variable in R ** | |
f <- function(x) interest=sum(x*0.18/12) | |
aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State), FUN=f ) | |
# You can write your aggregation into a dataframe for further maniupation, plots, etc | |
aggdf = aggregate(TBL_BRANCHES['mean_FUM'], by=list(TBL_BRANCHES$State), FUN=f ) | |
colnames(aggdf) | |
ggplot(data=aggdf, aes(x=Group.1, y=mean_FUM, fill=Group.1)) + geom_bar(stat="identity") | |
ggplot(data=aggdf, aes(x=Group.1, y=mean_FUM, fill=mean_FUM)) + geom_bar(stat="identity") | |
# ....................................... # | |
# - EXERCISE - # | |
# Aggregate branch data by Location: | |
# calculate mean branch customer FUM AND mean RAR | |
# ....................................... # | |
# ------------------------------- # | |
## [2.4] TRANSFORMING DATA | |
## creating a new field | |
TBL_BASEBALL['new'] = 1 | |
TBL_BASEBALL['new'] | |
TBL_BASEBALL['new'] = c(1,2) # error: vectors are different sizes | |
TBL_BASEBALL['new'] = log(TBL_BASEBALL['salary']) | |
TBL_BASEBALL['new'] | |
TBL_BASEBALL['power'] = TBL_BASEBALL['n_homeruns'] + TBL_BASEBALL['n_stolenbases'] | |
TBL_BASEBALL['power'] | |
## linear transformations of scale | |
# say we have a skewed distribution... | |
hist(TBL_BASEBALL$salary) | |
hist(TBL_BASEBALL$salary, breaks=30) | |
summary(TBL_BASEBALL['salary']) | |
# yes, very skewed. | |
# we can transform it... | |
TBL_BASEBALL$log_salary = log(TBL_BASEBALL$salary) | |
hist(TBL_BASEBALL$log_salary, breaks=30) | |
# ...or we could band it: | |
## banding & categorising | |
# equal spaced cut points based on value field | |
cut(TBL_BASEBALL$n_runs, 5) # 5 groups | |
# [] brackets are inclusive | |
# Labels can be set- see help(cut) | |
# custom cut points | |
cuts = c(0,10,50,90) | |
cut(TBL_BASEBALL$n_runs, cuts) | |
# doing cuts based on # of records, eg deciles, quartiles | |
quantile(TBL_BASEBALL$batting_avg,0.5) # quantile returns the point at X% | |
quantile(TBL_BASEBALL$batting_avg,c(0.25,0.5,0.75)) # quantile returns the point at X% | |
cutpoints = quantile(TBL_BASEBALL$batting_avg,c(0,0.25,0.5,0.75,1.0)) # quantile returns the point at X% | |
cut(TBL_BASEBALL$batting_avg, cutpoints) | |
# deciles | |
(0:10) # remember this? | |
(0:10)/10 # 0 to 1 by 0.1, eg deciles | |
decpoints = quantile(TBL_BASEBALL$batting_avg,(0:10)/10) | |
decpoints | |
TBL_BASEBALL['batting_decile'] = cut(TBL_BASEBALL$batting_avg, decpoints, include.lowest=TRUE) | |
TBL_BASEBALL[,c('batting_decile','batting_avg')] | |
# now we can aggregate | |
aggcols = c('n_hits','n_runs','n_strikeouts','salary') | |
aggregate(TBL_BASEBALL[aggcols], by=list(TBL_BASEBALL$batting_decile), FUN=mean) | |
# ....................................... # | |
# - EXERCISE - # | |
## with TBL_BRANCHES | |
# create a new column which is sum of % migrants and % students | |
# aggregate the table to see average FUM in each FUM decile | |
# ....................................... # | |
## [2.5] Custom functions to transform data | |
# there's lots of ways to recode data... | |
TBL_BRANCHES$c_age <- ifelse(TBL_BRANCHES$mean_age > 45, c("old"), c("young")) | |
TBL_BRANCHES[,c('c_age','mean_age')] | |
# ...which I'll leave you to figure out | |
# let's look at one advanced method: custom functions | |
# define an R function and apply it: | |
fn <- function(x) x/1000 | |
fn | |
fn(2000) | |
# use 'sapply' to apply across a vector | |
sapply(TBL_BRANCHES[,'mean_FUM'],fn) | |
TBL_BRANCHES[,'mean_FUM_k'] = sapply(TBL_BRANCHES[,'mean_FUM'],fn) | |
# functions can be complex, can contain if/then logic, | |
# can be used for categorising: | |
fn <- function(x) { | |
if(x<20) {'young'} | |
else if(x<45) {'mid age'} | |
else {'old'} | |
} # curly brackets allow the function to run over many lines | |
fn(40) | |
sapply(TBL_BRANCHES[,'mean_age'], fn) | |
# ....................................... # | |
# ------------------------------- # | |
# CHAPTER 6: EASY DATA MINING | |
# ------------------------------- # | |
## [6.1] regression: linear regression | |
plot(TBL_BASEBALL$n_homeruns, TBL_BASEBALL$salary) | |
lm.fit = lm(salary~n_homeruns, data=TBL_BASEBALL) | |
lm.fit | |
abline(lm.fit) # overlays regression line onto chart | |
summary(lm.fit) | |
plot(lm.fit) # various residuals and goodness-of-fit plots | |
# we can easily predict back onto another dataset using predict() | |
TBL_BASEBALL['salary_predict'] = predict(lm.fit, TBL_BASEBALL) | |
TBL_BASEBALL[,c('salary','salary_predict')] | |
TBL_BASEBALL[231,] | |
# if you want to use ALL variables, here's a shorthand: | |
lm(salary~., data=TBL_BASEBALL) | |
# lm can be extended into complex linear models, polynomials, etc: | |
lm.fit = lm(salary~n_homeruns + n_errors * n_strikeouts + I(n_hits^2), | |
data=TBL_BASEBALL) | |
summary(lm.fit) | |
# use glm() for generalised linear models, eg logistic | |
## [6.2] unsupervised learning: k-means clustering | |
# what if we automatically cluster branches together using | |
# MOSAIC factors? Do we get anything useful or meaningful? | |
ggplot(TBL_BRANCHES, aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity)) + | |
geom_point() | |
# this way makes it clearer where dense areas are: | |
ggplot(TBL_BRANCHES, aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity)) + | |
geom_point(alpha=0.4, size=5, colour='blue') | |
# use kMeans clustering to find 2 clusters | |
km.out = kmeans(TBL_BRANCHES[,c('MOSAIC_prosperity','MOSAIC_cultural_diversity')], 2) | |
km.out | |
TBL_BRANCHES$cluster = km.out$cluster # write clusters back to dataset | |
# plot | |
ggplot(TBL_BRANCHES, | |
aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity, | |
colour=as.factor(cluster))) + | |
geom_point(alpha=0.4, size=5) | |
# Clustering has done a pretty good job of automatically detecting | |
# metro vs. regional branches: | |
ggplot(TBL_BRANCHES, | |
aes(x=MOSAIC_cultural_diversity, y=MOSAIC_prosperity, colour=Location)) + | |
geom_point(alpha=0.4, size=5) | |
## [6.3] classification: kNN classification | |
# k-nearest neighbour | |
colnames(TBL_BRANCHES) | |
# make three income bands: low, medium, high: | |
cutpoints = quantile(TBL_BRANCHES$Mean_monthly_net_income, c(0,0.33,0.66,1)) | |
TBL_BRANCHES$band_income = cut(TBL_BRANCHES$Mean_monthly_net_income, cutpoints, | |
labels=c(1,2,3), include.lowest=TRUE) | |
TBL_BRANCHES$band_income | |
# first 2 MOSAIC factors are decent discriminators: | |
ggplot(TBL_BRANCHES, | |
aes(x=MOSAIC_prosperity, y=MOSAIC_dependants, colour=band_income)) + | |
geom_point(size=4, alpha=0.5) | |
# use kNN to predict back onto our dataset: | |
install.packages('class') | |
library(class) | |
X = TBL_BRANCHES[,c('MOSAIC_prosperity','MOSAIC_dependants')] | |
y = TBL_BRANCHES[,'band_income'] | |
TBL_BRANCHES$income_predicted = knn(train = X, # Dataset to learn from | |
test = X, # dataset to predict | |
cl = y, # target variable to learn | |
k=10) # number of neighbours to use | |
ggplot(TBL_BRANCHES, | |
aes(x=MOSAIC_prosperity, y=MOSAIC_dependants, colour=income_predicted)) + | |
geom_point(size=4, alpha=0.5) | |
# we can draw a decision boundary plot | |
# first we create a 100x100 grid that spans the range of two variables | |
x1lim = range(TBL_BRANCHES$MOSAIC_prosperity) # gets min and max of variable | |
x2lim = range(TBL_BRANCHES$MOSAIC_dependants) | |
x1range = seq(x1lim[1],x1lim[2],len=100) # creates a sequence of values | |
x2range = seq(x2lim[1],x2lim[2],len=100) | |
grid = data.frame(X1 = rep(x1range, 100), X2 = rep(x2range, each=100)) | |
# use KNN to predict income band of each point in grid | |
grid$predict = knn(train = X, # Dataset to learn from | |
test = grid, # Dataset to predict | |
cl = y, # Target variable to learn | |
k=10) # number of neighbours to use | |
# plot decision grid, then overlay datapoints | |
g <- ggplot(data=grid, aes(X1, X2)) + | |
geom_point(aes(colour=predict), alpha=0.3) + | |
theme_bw() | |
g | |
g + geom_point(data=TBL_BRANCHES, aes(x=MOSAIC_prosperity, | |
y=MOSAIC_dependants, | |
colour=band_income), size=4) | |
# ....................................... # | |
# ------------------------------- # | |
# CHAPTER 5: CREATING HTML NOTEBOOKS OF YOUR ANALYSIS | |
# ------------------------------- # | |
# a small notebook with code & output you can email to anyone... | |
# Steps: | |
# run this | |
install.packages('knitr') # requires knitr package is installed | |
### COPY THIS INTO A NEW R SCRIPT ANDin RSTUDIO AND HIT THE | |
### 'COMPILE NOTEBOOK'BUTTON AT THE TOP RIGHT OF THE SCRIPT PANEL | |
######################################################################################### | |
#### Example of compiling HTML notebook | |
# For HTML notebook creation to work, | |
# code must run standalone from end to end | |
library(knitr) | |
library(RODBC) | |
library(ggmap) | |
library(ggplot2) | |
library(GGally) | |
# import data | |
FileLoc = "C:\\Documents and Settings\\mencec1\\Desktop\\training\\" | |
FileName = 'branch profiles.xls' | |
File = paste(FileLoc,FileName,sep="") | |
print(File) | |
channel <- odbcConnectExcel(File) | |
TBL_BRANCHES <- sqlFetch(channel, "Sheet1",as.is=TRUE) | |
odbcClose(channel) | |
# summarise useful data by state | |
aggcols = c('mean_FUM','mean_n_prods','mean_A_RAR') | |
aggregate(TBL_BRANCHES[aggcols], by=list(TBL_BRANCHES$State), FUN=mean) | |
# plot average customer RAR histogram by branch location | |
ggplot(TBL_BRANCHES, aes(mean_A_RAR, fill = Location)) + geom_density(alpha=0.2) + theme_bw() | |
# plot matrix scatterplot of ... | |
ggpairs(data=TBL_BRANCHES[aggcols], | |
upper = list(continuous = "cor"), | |
lower = list(continuous = "smooth"), | |
diag = list(continuous="density"), | |
axisLabels='show') | |
######################################################################################### | |
# ....................................... # | |
# - EXERCISE - # | |
## DO [1] or [2]: | |
# as you go, remember to visualise your data | |
# [1] Data mining competition: predict baseball salary | |
# you might try: | |
# a linear model: | |
lm.fit = lm(salary~., data=TBL_BASEBALL) | |
predict(lm.fit, TBL_BASEBALL) | |
# a randomforest: | |
install.packages("randomForest") | |
library(randomForest) | |
predictors = c('n_homeruns','batting_avg','n_errors') | |
rf = randomForest(TBL_BASEBALL[predictors], | |
TBL_BASEBALL$salary, | |
n_tree=500, | |
mtry=3) | |
predict(rf,TBL_BASEBALL) | |
# clustering to create derived variables | |
km.out = kmeans(TBL_BASEBALL[predictors], 2) | |
TBL_BASEBALL$cluster = km.out$cluster | |
# [2] predict branch customer profitability: mean_A_RAR | |
# you might try: | |
# a linear model: | |
colnames(TBL_BRANCHES) | |
lm.fit=lm(mean_A_RAR~mean_FUM + Mean_monthly_net_income + | |
mean_n_prods + pct_affluent_50, data=TBL_BRANCHES) | |
predict(lm.fit, TBL_BRANCHES) | |
# a randomforest: | |
install.packages("randomForest") | |
library(randomForest) | |
predictors = c('mean_FUM','Mean_monthly_net_income','mean_n_prods') | |
rf = randomForest(TBL_BRANCHES[predictors], | |
TBL_BRANCHES$mean_A_RAR, | |
n_tree=500, | |
mtry=3) | |
predict(rf,TBL_BRANCHES) | |
# clustering to create derived variables | |
km.out = kmeans(TBL_BRANCHES[predictors], 2) | |
TBL_BRANCHES$cluster = km.out$cluster | |
# ....................................... # | |
# ------------------------------- # | |
# APPENDIX: useful things we skipped | |
# ------------------------------- # | |
## merging dataframes | |
# see rbind, cbind, merge | |
# here are a few examples: https://rhandbook.wordpress.com/tag/slicing-and-dicing-of-data-frame/ | |
## Exporting data | |
# write.table(mydata,"c:/mydata2.txt", sep="\t") | |
## Generate random sequence of integers | |
# sample(20) | |
## Generate distributions | |
# ... | |
############# OPTIMAL BINNING BUCKETING ############### | |
# with decision tree | |
install.packages("party", lib="c:\\notbackedup") | |
library("party", lib='\\notbackedup') | |
?ctree | |
str(iris) | |
iris_ctree <- ctree(Species ~ Sepal.Length, | |
controls = ctree_control(minbucket=30, maxdepth=2), data=iris) | |
print(iris_ctree) | |
# tag node: | |
iris$predicted_value = predict(data=iris, iris_ctree, type = "node") | |
prop.table(xtabs( ~ F_RESPONSE + treenode, campaigndf),2) | |
# discussion of bucketing | |
# http://www.r-bloggers.com/package-party-conditional-inference-trees/ | |
??ctree | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## System & file management | |
objects() # workspace | |
getwd() # working directory | |
setwd("/mnt/resource/training3") | |
dataPath = "/mnt/resource/BigData/data" | |
bigDataPath = "/mnt/resource/BigData/BigData" | |
outdir = "output" | |
# Check & create directories | |
if(!file.exists(outdir)) dir.create(outdir) | |
# list files in directory | |
dir(dataPath) | |
dir(bigDataPath) | |
# example | |
## | |
setwd("/mnt/resource/training3") | |
dataPath = "/mnt/resource/BigData/data" | |
outdir = "output" | |
outPath = "output" | |
if(!file.exists(outdir)) dir.create(outdir) | |
bigDataPath = "/mnt/resource/BigData/BigData" | |
big.data.path = "/mnt/resource/BigData/BigData" | |
data.path = "/mnt/resource/BigData/data" | |
output.path = 'output' | |
sample.data.dir = rxGetOption("sampleDataDir") | |
## Read data | |
# from internet: | |
## Selecting & slicing | |
# which() = identify records meet a list of criteria | |
# which(mtcars$hp == max(mtcars$hp)) | |
## also SQLDF | |
# repeat something | |
replicate() | |
## sort and order | |
# sort() = sort a vector | |
# order() = return element order of a sorted vector | |
# for a dataframe rows use order: mtcars[order(mtcars$mpg, decreasing=TRUE), ] | |
## Merging | |
# use merge() | |
# inner join (only keep matches): merge(x,y,all=FALSE) | |
# outer join (keep all records): merge(x,y,all=TRUE) | |
# left or right join: merge(x,y,all.x=TRUE) | |
# BINDING = 1 to 1 rows = cbind() | |
# UNION ALL = stack, use rbind | |
## also SQLDF | |
## regex and strings | |
# richest.nations$Amount <- gsub(",", "", richest.nations$Amount) | |
# richest.nations$Amount <- sapply(richest.nations$Amount, function(x) substr(x, 2, nchar(x))) | |
## connecting to databases, SQL | |
# note: SQLite = free, open-source SQL DB | |
library("RSQLite") | |
db = dbConnect(SQLite(), dbname = file.path(bigDataPath, "airlines.sqlite")) | |
dbListTables(db) # list all tables | |
dbListFields(db, "planes") | |
# run a query | |
query <- "SELECT count(*) FROM carriers" | |
dbGetQuery(db, query) | |
# more queries | |
query <- "SELECT * FROM airlines where Dest=\"DAL\"" | |
query <- "SELECT * FROM carriers where Description LIKE \"American%\"" | |
query <- 'SELECT * FROM carriers where Description LIKE "%American%"' | |
## Sequential reads: taking data one chunk at a time = fetch() | |
query <- "SELECT * FROM carriers" | |
res <- dbSendQuery(db, query) | |
x <- fetch(res, n = 1000) # then use rbind() to stack | |
str(x) | |
dbClearResult(res) # close connection | |
# Misc | |
# put brackets around assignment ot make it print | |
# eg (test = 2) | |
## Apply | |
# apply(df, MARGIN, FUN, ...) | |
# where Margin is a vector. 1 is rows, 2 is columns | |
# FUN = function | |
# eg. apply(mtcars, 2, mean) # mean for each column | |
## Pass parameters to the FUN in the apply syntax | |
# apply(mtcars, 2, quantile, c(0.25,.75)) | |
# 2th, 75th centile | |
## Other applies | |
# lapply() = to each element of a list | |
## by() | |
# split dataframe into defined groups, then apply function | |
# eg. by(mtcars, mtcars$cyl, colMeans) # colMeans for each factor of cyl | |
## tables & xtabs | |
xtabs(~gear + cyl, mtcars) | |
aggregate() | |
# Note: write your OWN function to return multiple aggregates if you want multiple | |
# repeat | |
replicate() | |
#### RevoR & Teradata | |
# xdf seems to be distributed form of dataframe | |
### data munging | |
## library(dplyrXdf) | |
## filter, select, distinct, transmute, mutate, arrange, rename, | |
## Summarise | |
# bankSmry <- group_by(bankXdf, marital, housing) %>% | |
# summarise(mean_age=mean(age), mean_balance=mean(balance), .method=3) | |
## doXdf | |
# bankModels <- group_by(bankXdf, housing) %>% | |
# doXdf(model=rxLogit(y ~ job + marital + education, data=.)) | |
> outdir = "~" | |
> cars.mod = cars | |
> save(cars.mod, file=file.path(outdir, "cars_mod.RData")) | |
paste() = concatenate strings | |
can include a sep | |
paste("row",1:5,sep="_") | |
## boolean | |
# is.na() for missing values | |
# x[is.na(x)] = 412 | |
## VEctors | |
letters[-c(1:5]) = drop first 5 letters | |
# giving elements names - can access with names | |
x = 1:5 | |
names(x) = letters[1:5] | |
x["c"] | |
## Matrices | |
# Assing ROW and COLUMN names, then you can query on them | |
# mat[c('a','b'),c('f','g')] | |
## Dataframes | |
# str() | |
# summary() | |
# data.frame() = create | |
## Vectorising | |
# ifelse(condition, ifvalue, elsevalue) | |
# eg. mtcars$type = ifelse(mtcars$hp>mean(mtcars$hp),'highhp','lowhp') | |
Scatterplot: use pairs() | |
## R Functions | |
test <- function(a, b) return(a + b) | |
test | |
function.name <- function(arguments) { | |
# everything the function does in here | |
} | |
## Modelling | |
# Where there is a big spike at 0 before smaller skewed distribution | |
# --> use Tweedie distribution power 1.5 = 'compound gamma poisson' = zero spike like poisson plus gamma | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### ggvis | |
mtcars %>% ggvis(~ht) %>% layer_histograms() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### RevoR | |
# xdf = "extermal data frame" | |
## import | |
args(rxImport) | |
colClasses <- c("integer", rep("factor", 4), "integer", rep("factor",3), "integer", "factor", rep("integer", 4), rep("factor", 2)) | |
names(colClasses) <- c("age", "job", "marital", "education", "default","balance", "housing", "loan", "contact", "day", "month", "duration","campaign", "pdays", "previous", "poutcome", "y") | |
BankXDF <- rxImport(inData = infile, outFile = outfile, colClasses = colClasses,rowsPerRead = 10000, overwrite = TRUE) | |
# from text: RxTextData() | |
## Info & statistics | |
rxGetInfo(inADS, getVarInfo = TRUE) | |
rxGetInfo(bankXDF, getVarInfo = TRUE, numRows = 6) | |
rxGetVarInfo() # names, max, min | |
rxGetVarNames() # variable names | |
rxSummary() | |
rxSummary(~default + balance, data = BankDS) | |
BankSummary2 <- rxSummary(balance ~ F(age), data = BankDS) | |
# quantiles | |
args(rxQuantile) | |
# crosstabs | |
args(rxCrossTabs) | |
rxCrossTabs(balance ~ F(age, low = 20, high = 30):F(campaign, low = 1, high = 5), data = BankDS) | |
#### manipulation & transformation | |
## use rxDataStep to do transforms, drop, keep, etc | |
args(rxDataStep) | |
BankSubDS <- rxDataStep(inData = origBankSubDS, outFile = outfile, | |
transforms = list(logBalance = log(newBalance)), overwrite = TRUE) | |
rxDataStep(inData = BankDS, outFile = BankDS, transforms = list(newBalance = balance + 8019 + 1), overwrite = TRUE) | |
## using custom functions | |
manualFactorRecoding <- function(dataList) { | |
dataList[["F_n.devices"]] <- factor(dataList[["n.devices"]], levels = seq(1, | |
10)) | |
return(dataList) | |
} | |
ChurnDS <- rxDataStep(inData = origChurnDS, outFile = churnoutfile, | |
transformFunc = manualFactorRecoding, overwrite = TRUE) | |
## creating factors - use factorise() or F() inside functions | |
bankXdf2 <- factorise(bankXdf, age) | |
rxFactors(inData = ChurnDS, outFile = ChurnDS, varsToKeep = rxGetVarNames(origChurnDS), | |
factorInfo = list(F_n.devices2 = list(levels = seq(1:10), varName = "n.devices")), | |
overwrite = TRUE) | |
## subset cols | |
BankDS <- rxDataStep(inData = origBankDS, outFile = outfile, varsToKeep = c("balance", | |
"age", "y", "poutcome"), overwrite = TRUE) | |
## subset rows | |
BankDS.rs <- rxDataStep(inData = origBankDS, outFile = outfile.rs, | |
varsToKeep = c("balance", "age", "y", "poutcome"), rowSelection = age < 21, overwrite = TRUE) | |
## Sort: rxSort | |
## Dedupe | |
BankDS2 <- rxSort(inData = BankDS, outFile = outfile2, sortByVars = c("balance", | |
"age"), decreasing = c(FALSE, TRUE), removeDupKeys = TRUE, dupFreqVar = "Dup_Count", | |
overwrite = TRUE) | |
#### manipulation with dplyrXdf | |
### can use: | |
## filter, select, distinct, transmute, mutate, arrange, rename, | |
## group_by, summarise, do | |
## left_join, right_join, full_join, inner_join | |
## these functions supported by rx: sum, n, mean, sd, var, min, max | |
bankSmry <- group_by(bankXdf, marital, housing) %>% | |
summarise(mean_age=mean(age), mean_balance=mean(balance)) | |
## Plots | |
rxHistogram(~logBalance, data = BankSubDS, xnNumTicks = 15) | |
# scatterplot | |
rxLinePlot(newBalance ~ age, type = "p", data = BankDS) | |
## Linear models | |
inADS <- RxXdfData(file.path(sample.data.dir, "AirlineDemoSmall.xdf")) | |
rxGetInfo(inADS, getVarInfo = TRUE) | |
# create model | |
model <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = inADS) | |
model | |
model | |
coef(model) | |
names(model) # components of model object | |
# predict | |
'?'(rxPredict) | |
airline.xdf <- file.path(output.path, "airline.xdf") | |
rxPredict(modelObject = model, data = inADS, outData = airline.xdf, | |
writeModelVars = TRUE, computeResiduals = TRUE, overwrite = TRUE) | |
rxGetVarInfo(airline.xdf) | |
# view residuals | |
rxHistogram(formula = ~ArrDelay_Resid, data = airline.xdf) | |
# zoom in | |
rxHistogram(formula = ~ArrDelay_Resid, data = airline.xdf, numBreaks = 100, startVal = -100, endVal = 100) | |
## example | |
bankXDF <- file.path(data.path, "BankXDF.xdf") | |
rxGetInfo(bankXDF, getVarInfo = TRUE, numRows = 6) | |
model <- rxLinMod(balance ~ age + duration + marital + housing, data = bankXDF) | |
model | |
test = rxPredict(modelObject = model, data = bankXDF, computeResiduals = TRUE, writeModelVars = TRUE, overwrite=TRUE) | |
rxGetVarInfo(test) | |
## Optional: estimating standard errors | |
model <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = inADS, covCoef = TRUE) | |
model$covCoef | |
# append to output | |
rxPredict(modelObject = model, data = airline.xdf, outData = airline.xdf, | |
computeStdErrors = TRUE, overwrite = TRUE) | |
rxGetVarInfo(airline.xdf) | |
# calc confidence itnervals | |
rxPredict(modelObject = model, data = airline.xdf, outData = airline.xdf, | |
computeStdErrors = TRUE, interval = "confidence", overwrite = TRUE) | |
rxGetVarInfo(airline.xdf) | |
### model Validation: split test & train | |
# Construct a variable to identify training data | |
set.seed(100) | |
rxDataStep(inData = airline.xdf, | |
outFile = airline.xdf, | |
transforms = list( | |
TrainValidate = factor( | |
ifelse(rbinom(length(DayOfWeek), size=1, prob=0.8), "train", "validate") | |
) | |
), | |
overwrite = TRUE) | |
rxGetVarInfo(airline.xdf) | |
rxHistogram(formula = ~TrainValidate, data = airline.xdf) | |
# splitting into two datasets | |
# Build a training data set | |
splitDS <- rxSplit(inData = airline.xdf, outFilesBase = file.path(output.path, | |
"airline"), splitByFactor = "TrainValidate", overwrite = TRUE) | |
splitDS # two files are located in here | |
train <- splitDS[[1]] | |
validate <- splitDS[[2]] | |
# Build a model on the training data | |
model <- rxLinMod(formula = ArrDelay ~ DayOfWeek + F(CRSDepTime), | |
data = train) | |
summary(model) | |
# use predict function to retrieve residuals from model object | |
rxPredict(modelObject = model, data = validate, outData = validate, | |
computeResiduals = TRUE, predVarNames = "ArrDelay_PredOutOfSample", | |
overwrite = TRUE) | |
rxGetInfo(validate, getVarInfo = TRUE, numRows = 5) | |
# correlation matrix - correlate actual & predicted | |
rxCor(formula = ~ArrDelay_PredOutOfSample + ArrDelay, data = validate) | |
### kfold cross validation | |
k = 10; | |
rxDataStep(inData = airline.xdf, | |
outFile = airline.xdf, | |
transforms = list( | |
kSplits = factor(sample(LETTERS[1:k], size = .rxNumRows, replace = TRUE)) | |
), | |
transformObjects = list(LETTERS = LETTERS, k = k), | |
append = "rows", | |
overwrite = TRUE) | |
rxHistogram(~kSplits, data = airline.xdf) | |
# split into k-files | |
kSplits <- rxSplit(inData = airline.xdf, outFilesBase = file.path(output.path, | |
"airline"), splitByFactor = "kSplits", overwrite = TRUE) | |
# write a function | |
myLinModWrapper <- function(holdoutlevel, splitFiles) { | |
## first, estimate the model on all data point but those including | |
## holdoutlevel | |
print(holdoutlevel) | |
myMod <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = airline.xdf, | |
rowSelection = kSplits != holdout, transformObjects = list(holdout = holdoutlevel), | |
) | |
## Then, generate predictions | |
curHoldOut <- grep(paste("kSplits", holdoutlevel, "xdf", sep = "."), | |
names(splitFiles), value = TRUE) | |
rxPredict(myMod, data = splitFiles[[curHoldOut]], overwrite = TRUE, | |
predVarNames = "ArrDelay_kFold_Pred") | |
## Generate cross-validation metric for this hold out | |
curCor <- rxCor(~ArrDelay + ArrDelay_kFold_Pred, data = curHoldOut) | |
return(curCor["ArrDelay", "ArrDelay_kFold_Pred"]) | |
} | |
# estimate all holdout samples | |
allCors <- vapply(LETTERS[1:k], myLinModWrapper, splitFiles = kSplits, | |
FUN.VALUE = numeric(1)) | |
# visualise | |
dotplot(allCors, cex = 2, scales = list(y = list(cex = 1), x = list(cex = 1.5)), | |
xlab = list("Out-of-Sample Correlation", cex = 2), ylab = list("Hold out Sample", | |
cex = 2), panel = function(x, y, ...) { | |
panel.dotplot(x, y, ...) | |
panel.abline(v = mean(x), lty = 2, lwd = 3) | |
}) | |
#### Model selection, stepwise, etc | |
## rxStepControl() | |
inADS <- RxXdfData(file.path(sample.data.dir, "AirlineDemoSmall.xdf")) | |
rxGetInfo(inADS, getVarInfo = TRUE) | |
# fit simple model, main effects only | |
model.main <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), data = inADS) | |
length(coef(model.main)) | |
summary(model.main) | |
# fit model with interactions | |
model.interaction <- rxLinMod(ArrDelay ~ DayOfWeek * F(CRSDepTime), | |
data = inADS, dropMain = FALSE) | |
length(coef(model.interaction)) | |
summary(model.interaction) | |
## comparing the models | |
# AIC: 2k - 2ln(L) [# of paremeters - log likelihood] | |
model.main$aic | |
model.interaction$aic | |
# compare relative likelihood | |
exp((model.interaction$aic - model.main$aic)/2) | |
## or do f test | |
Df <- c(model.interaction$df[1] - model.main$df[1], model.interaction$df[2]) | |
SS <- c((model.main$residual.squares - model.interaction$residual.squares), | |
model.interaction$residual.squares) | |
MS <- SS/Df | |
fVal <- c(MS[-length(MS)]/MS[length(MS)], NA) | |
pVal <- pf(fVal, df1 = Df[-length(Df)], df2 = Df[length(Df)], lower = FALSE) | |
anova.tbl <- data.frame(Df, SS, MS, fVal, pVal) | |
dimnames(anova.tbl) <- list(c("modelDiff", "Residuals"), c("Df", "Sum Sq", | |
"Mean Sq", "F value", "Pr(>F)")) | |
stats:::print.anova(anova.tbl) | |
## automatic model selection | |
args(rxLinMod) | |
args(rxStepControl) | |
#method: method for performing model selection | |
#scope: the range of models to explore (see below) | |
myStepControl <- rxStepControl(method = "stepwise", scope = list(lower = ~DayOfWeek + | |
F(CRSDepTime), upper = ~DayOfWeek * F(CRSDepTime)), test = "F") | |
# define starting model and then run | |
model.step1 <- rxLinMod(ArrDelay ~ DayOfWeek + F(CRSDepTime), variableSelection = myStepControl, | |
data = inADS, dropMain = FALSE) | |
# model sequence | |
model.step1$anova | |
summary(model.step1) # final model | |
## Another example | |
bigger.ADS <- file.path(big.data.path, "AirOnTime7Pct.xdf") | |
rxGetInfo(bigger.ADS, getVarInfo = TRUE) | |
mySel <- rxStepControl(method = "stepwise", scope = ~DayOfWeek * F(CRSDepTime) + | |
DepDelay + TaxiOut + UniqueCarrier + Distance) | |
model.complex1 <- rxLinMod(ArrDelay ~ 1, data = bigger.ADS, variableSelection = mySel) | |
summary(model.complex1) | |
model.complex1$anova | |
### LOgistic regression | |
args(rxLogit) | |
mortgages <- file.path(sample.data.dir, "mortDefaultSmall.xdf") | |
rxGetVarInfo(mortgages) | |
myformula <- formula(default ~ F(year) + ccDebt + creditScore + houseAge + | |
yearsEmploy) | |
model = rxLogit(myformula, data=mortgages) | |
# predict for some new mortgages | |
new_mortgages <- data.frame(year = rep(c(2006, 2009), each = 4), ccDebt = rep(c(1000, | |
10000), 4), creditScore = rep(c(700, 800), 4), houseAge = rep(c(1, | |
5, 10, 20), 2), yearsEmploy = rep(7, 8)) | |
str(new_mortgages) | |
rxPredict(modelObject=model, data=new_mortgages) | |
#### GLMs - have a family argument: logic, probit, log... | |
# rxGlm() | |
args(rxGlm) | |
census2000 <- file.path(big.data.path, "Census5PCT2000.xdf") | |
census_insurance <- file.path(output.path, "Census5PCT2000_insurance.xdf") | |
if (!file.exists(census_insurance)) { | |
rxDataStep(inData = census2000, outFile = census_insurance, rowSelection = (related == | |
"Head/Householder") & (age > 20) & (age < 90), varsToKeep = c("propinsr", | |
"age", "sex", "region", "perwt"), blocksPerRead = 1, overwrite = TRUE) | |
} | |
# --> this is about 10g | |
rxGetVarInfo(census_insurance) | |
p <- rxSummary(~region, data = census_insurance) | |
print(p, head = FALSE) | |
rxHistogram(~propinsr, data = census_insurance, pweights = "perwt") | |
propinGlm <- rxGlm(propinsr ~ sex + F(age) + region, pweights = "perwt", | |
data = census_insurance, family = rxTweedie(var.power = 1.5), | |
dropFirst = TRUE) | |
p <- summary(propinGlm) | |
print(p, head = FALSE) | |
rxPredict(propinGlm, data = census_insurance, outData=outData, overwrite =TRUE) | |
rxGetInfo(outData, getVarInfo = TRUE, numRows = 5) | |
rxHistogram(~propinsr_Pred, data = outData, pweights = "perwt") | |
### logistic with rxGLM | |
propinGlm <- rxGlm(default ~ ccDebt + F(year) + creditScore + houseAge + yearsEmploy, | |
data = mortgages, family = binomial()) | |
#### Decision trees | |
# rxDTree(formula, data, ...) | |
# control: maxDepth, minSplit, minBucket, maxNumBins, cost, | |
# outFile = write out terminal node for each observation | |
# parms: loss - specify loss matrix | |
infile <- file.path(data.path, "BankXDF.xdf") | |
BankDS <- RxXdfData(file = infile) | |
myBankDS <- file.path(output.path, "mybank.xdf") | |
Tree <- rxDTree(housing ~ age + balance, data = BankDS, cp = 0.001) | |
# cp = 'complexity parameter' = minimum decrease in lack-of-fit reduction allowed by a split | |
# visualise | |
library(RevoTreeView) | |
plot(createTreeView(Tree)) | |
### Regression tree | |
RTree <- rxDTree(housing_num ~ age + balance, data = BankDS, outFile = myBankDS, | |
writeModelVars = TRUE, overwrite = TRUE, extraVarsToWrite = "housing_num", | |
transforms = list(housing_num = housing == "yes"), cp = 0.005) | |
plot(createTreeView(RTree)) | |
# using loss matrix in parms | |
curlvls <- rxGetVarInfo(BankDS, varsToKeep = "housing")[["housing"]]$levels | |
(loss.mat <- matrix(c(0, 1.5, 1, 0), ncol = 2, dimnames = list(actual = curlvls, | |
predicted = curlvls))) | |
## predicted | |
## actual yes no | |
## yes 0.0 1 | |
## no 1.5 0 | |
TreeLossMat <- rxDTree(formula = housing ~ age + balance, data = BankDS, | |
parms = list(loss = loss.mat), cp = 0.001) | |
plot(createTreeView(TreeLossMat)) | |
# predictions: rxPredict() - can have prob (probability for class), class (predicted terminal node), ... | |
args(RevoScaleR:::rxPredict.rxDTree) | |
rxPredict(Tree, data = BankDS, outData = myBankDS, overwrite = TRUE, | |
writeModelVars = TRUE) | |
rxGetVarInfo(myBankDS) | |
rxPredict(RTree, BankDS, myBankDS, overwrite = TRUE, writeModelVars = TRUE, | |
predVarNames = c("predicted_housingnum_dtree")) | |
## visualization - ROC/AUC | |
rxRocCurve(actualVarName = "housing_num", predVarNames = "predicted_housingnum_dtree", | |
data = myBankDS) | |
roc.df <- rxRoc(actualVarName = "housing_num", predVarNames = "predicted_housingnum_dtree", | |
data = myBankDS) | |
head(roc.df) | |
rxAuc(roc.df) | |
#### Randomforests | |
infile <- file.path(data.path, "BankXDF.xdf") | |
BankDS <- RxXdfData(file = infile) | |
Forest <- rxDForest(housing ~ balance + age, data = BankDS, seed = 10, | |
cp = 0.01, nTree = 50, mTry = 2, overwrite = TRUE) | |
Forest | |
# we can access each tree | |
t(all.imp <- vapply(Forest$forest, getElement, "variable.importance", | |
FUN.VALUE = numeric(2))) | |
# notes | |
# out-of-bag error = every round we fit tree to 2/3rds dataset. Then we run | |
# remaining 1/3 through to see error | |
## determining variable importance | |
# choose an impurity measure, like Gini Index | |
# calculate Gini Index before and after each split | |
# sum up Gini improvements for each variable | |
Forest <- rxDForest(housing ~ balance + age, data = BankDS, seed = 10, | |
cp = 0.01, nTree = 50, mTry = 1, maxDepth = 5, importance = TRUE, | |
overwrite = TRUE) | |
Forest$importance | |
rxVarImpPlot(Forest) | |
## prediction types: response, prob, vote | |
# category = majority prediction | |
# regression = average of predictions | |
#### GOODNESS OF FIT MEASURES ##### | |
rxRocCurve() | |
rxAuc(roc.df) | |
############ UNSUPERVISED LEARNING ################## | |
### Clustering | |
args(rxKmeans) | |
infile <- file.path(data.path, "BankXDF.xdf") | |
BankDS <- RxXdfData(file = infile) | |
mybankfile <- file.path(output.path, "mybankXDF.xdf") | |
bank_clusters <- rxKmeans(~duration + balance, data = BankDS, numClusters = 3, | |
outFile = mybankfile, writeModelVars = TRUE, extraVarsToWrite = "randSamp", | |
transforms = list(randSamp = sample(10, .rxNumRows, replace = TRUE)), | |
overwrite = TRUE) | |
bank_clusters | |
bank_clusters$centers | |
# plot 1 | |
library(latticeExtra) | |
plt = rxLinePlot(balance ~ duration, data = BankDS, type = "p") | |
plt + as.layer(xyplot(balance ~ duration, data = as.data.frame(bank_clusters$centers), | |
col = "red")) | |
# plot 2 | |
intern.km.df <- rxXdfToDataFrame(file = mybankfile, rowSelection = randSamp == 1) | |
plot(balance ~ duration, data = intern.km.df, col = intern.km.df$.rxCluster,pch = 19) | |
with(as.data.frame(bank_clusters$centers), points(duration, balance, col = "lightblue", pch = "x", cex = 2)) | |
## Scaling variables | |
bank_summary <- rxSummary(~ duration + balance, data = BankDS) | |
bank_clusters <- | |
rxKmeans(~ duration_scl + balance_scl, | |
data = BankDS, writeModelVars = TRUE, | |
outFile = mybankfile, outColName = ".rxCluster_scl", | |
transforms = list( | |
duration_scl = (duration - duration_mean)/ | |
duration_stdev, | |
balance_scl = (balance - balance_mean)/ | |
balance_stdev), | |
transformObjects = list( | |
duration_mean=bank_summary$sDataFrame$Mean[1], | |
duration_stdev=bank_summary$sDataFrame$StdDev[1], | |
balance_mean=bank_summary$sDataFrame$Mean[2], | |
balance_stdev=bank_summary$sDataFrame$StdDev[2]), | |
numClusters = 3, overwrite = TRUE) | |
bank_clusters | |
bank_clusters$centers | |
# plot 1 | |
unscaled_centers <- t(t(bank_clusters$centers) * bank_summary$sDataFrame$StdDev + | |
bank_summary$sDataFrame$Mean) | |
plt = rxLinePlot(balance ~ duration, data = BankDS, type = "p") | |
plt + as.layer(xyplot(balance_scl ~ duration_scl, data = as.data.frame(unscaled_centers), | |
col = "red", pch = 19)) | |
# plot 2 | |
intern.km.df2 <- rxXdfToDataFrame(file = mybankfile, rowSelection = randSamp ==1) | |
plot(balance ~ duration, data = intern.km.df2, col = intern.km.df2$.rxCluster_scl,pch = 19) | |
with(as.data.frame(unscaled_centers), points(duration_scl, balance_scl,col = "lightblue", pch = "x", cex = 2)) | |
## How many clustes? | |
SS <- rep(NA, 20) | |
for (i in 2:20) SS[i] <- rxKmeans(~duration + balance, data = BankDS, | |
numClusters = i)$tot.withinss | |
plot(1:20, SS, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares") | |
## k-means can be sensitive to startingcentroid positions | |
## use numStarts to repeat multiple times and refit | |
########## PRINCIPAL COMPONENTS ############## | |
## no rx function: must be run locally | |
## Goal: Simple data reduction technique. | |
## Main idea: Sequentially create variables that are linearly | |
## independent (i.e. uncorrelated) that capture as much variance in the | |
## dataset as possible. Create the variables based on linear | |
## combinations of the variables already in your dataset. | |
# how to build our own in-db function | |
airDS <- RxXdfData(file.path(big.data.path, "AirOnTime7Pct.xdf")) | |
rxGetInfo(airDS, getVarInfo = TRUE) | |
# We will explore the relationship among Year, DepDelay, ArrDelay, | |
# TaxiOut, TaxiIn, CRSDepTime, and Distance. | |
vars2use <- c("Year", "DepDelay", "ArrDelay", "TaxiOut", "TaxiIn", | |
"CRSDepTime", "Distance") | |
rxGetVarInfo(airDS, varsToKeep = vars2use) | |
# Step 1. Compute the correlation matrix. | |
# rxCor() uses complete method – if any values are missing, then that entire row is dropped. | |
airformula <- as.formula(paste("~", paste(vars2use, collapse = "+"))) | |
(cormat <- rxCor(airformula, data = airDS)) | |
# Step 2. apply open-source princomp() function | |
# remember to use covmat parameter | |
args(stats:::princomp.default) | |
(myair.pca <- princomp(covmat = cormat)) | |
summary(myair.pca) | |
plot(myair.pca) | |
# The loadings correspond to the eigen vectors (i.e. the linear | |
# combinations for each of the components). | |
myair.pca$loadings | |
# Now, apply: Create new factor scores associated with the first K principal components. | |
myAirXDF <- file.path(output.path, "myAirDSpca.xdf") | |
rxDataStep(airDS, outFile = myAirXDF, varsToKeep = vars2use, overwrite = TRUE, | |
removeMissings = TRUE) | |
rxGetInfo(myAirXDF, getVarInfo = TRUE) | |
# verify correlations | |
cormat2 <- rxCor(airformula, data = myAirXDF) | |
cormat - cormat2 | |
# summarise dataset | |
summaryResults <- rxSummary(airformula, data = myAirXDF)$sDataFrame | |
curMeans <- setNames(summaryResults$Mean, summaryResults$Name) | |
curStd <- setNames(summaryResults$StdDev, summaryResults$Name) | |
# finally, create a transformfunc | |
# note: matrix multiplication | |
createPCAFactorVars <- function(mylist){ | |
## scale the variables | |
scaledvars <- as.data.frame( | |
mapply(function(curvar, curmean, cursd){ scale(curvar, center = curmean, scale = cursd)}, | |
mylist[curvars], | |
mymeans[curvars], | |
mysds[curvars] | |
) | |
) | |
colnames(scaledvars) <- paste0(curvars,"Z") | |
Components <- as.matrix(scaledvars) %*% mypca$loadings | |
return(c(mylist, as.list(scaledvars), as.list(as.data.frame(Components)))) | |
} | |
# apply transformfunc | |
rxDataStep(myAirXDF, | |
outFile = myAirXDF, | |
varsToKeep = vars2use, | |
append = "cols", | |
overwrite = TRUE, | |
transformFunc = createPCAFactorVars, | |
transformObjects = list(curvars = vars2use, mymeans = curMeans, | |
mysds = curStd, mypca = myair.pca) | |
) | |
# let's look at it | |
(newSummary <- rxSummary(~., data = myAirXDF)$sDataFrame) | |
####### Elastic net / ridge regression / other variable selection... ############ | |
# stepwise = too much chance. coefficients too large, dependant on pre-specified model. overfits | |
# apply shrinkage or regularisation to reduce estimates | |
# - ridge | |
# - lasso (L1 constraint) | |
# basically: minimise[(lack of fit criterion) + (overfitting penalty)] | |
# Ridge: penalty is lambda * sum(Beta^2) <-- overfitting | |
# added to lack of fit = sum(y - x_i%*%Beta_i)^2 | |
# choose lambda with crossvalidation, etc | |
# disadvantages: coefficients still always remain non-zero: are never removed | |
# Lasso | |
# as before, but overfitting = lambda * sum(|Beta|) <-- no coefficients can become zero and drop out | |
# disadvantages: can be volatile with correlated predictors: which one to pick? | |
# poor with wide data (n<p) | |
# Elastic net = combines ridge and lasso | |
# overfitting = lambda * sum(P * Beta) | |
# where P = 1/2(1-alpha)beta etc etc | |
# in R: use glmnet & glmnetUtils package | |
# see slides in gmail | |
### Neural networks | |
# nnet(y ~ x1 + x2 + ..., data, size, linest, entropy, softmax, maxit, decay, skip, rang) | |
#size: number of hidden nodes | |
#linest: TRUE if a regression network, FALSE for classification | |
#entropy: TRUE if a classification network and y is numeric, FALSE for regression | |
#softmax: TRUE if a classification network and y is a factor, FALSE for regression | |
#maxit: maximum number of iterations | |
#decay: weight decay parameter | |
#skip: TRUE to use skip-layer connections (often a good idea) | |
#rang: initial range for weights (if random starting values used) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################# | |
##### CODE NOTES FROM SWINBURNE | |
############################################# | |
######## RANDOM VARIABLES ######### | |
#### DISCRETE ##### | |
# Your mark for this subject | |
X = c(30, 40, 50, 60, 70, 80) | |
# The probability of you getting these marks | |
P = c(0.3, 0.3, 0.05, 0.05, 0.1, 0.2) | |
# Checking the sum of probabilities equal to 1 | |
sum(P) | |
### Calculating means and variance | |
# can be done vectorwise or matrix algebra | |
mean=sum(X*P) | |
mean | |
sum( (X-mean)^2*P ) | |
# MATRIX | |
mean=t(X)%*%P | |
mean | |
####### UNIFORM ######## | |
# Generate 10 uniform random numbers between 3 and 5. | |
runif(10, 3, 5) | |
# Generate 100000 uniform random numbers between 3 and 5. | |
X <- runif(100000,3,5) | |
# Look at its histogram. | |
hist(X) | |
# compute its mean using the formula. | |
(1/2)*(3+5) | |
# verify using the mean function. | |
mean(X) | |
# compute its variance using the formula. | |
(1/12)*(5-3)^2 | |
# verify using the var function. | |
var(X) | |
########### NORMAL | |
# Generate 10,000 N(0,1) numbers (i.e., standard normal). | |
# no need to specify the mean and variance. | |
X<-rnorm(10000) | |
# Generate 10,000 N(30,102) numbers. | |
X<-rnorm(10000,30,10) | |
# checking the histogram. | |
hist(X) | |
# checking the mean and variance. | |
mean(X) | |
var(X) | |
########## CHI | |
# Generate 10,000 Chi-squared numbers with 5 degrees of freedom. | |
X<-rchisq(10000,5) | |
# Look at its histogram. | |
hist(X) | |
# Verify that the mean and variance are indeed as n and 2n respectively… | |
mean(X) | |
var(X) | |
##### F-DISTRIBUTION | |
# Generate 100 F numbers with df1 = 5, df2 = 2. | |
X<-rf(100,5,2) | |
# Look at its histogram. | |
hist(X) | |
############### BETA | |
## BETA | |
# mean = a/(a+b) | |
# median = qbeta(0.5,a,b) | |
############### BAYES EXAMPLES | |
### BETA BINOMIAL COIN TOSS - large sample (n=11) | |
a <- 5 | |
b <- 5 | |
s <- 70 # which is y in our previous slides | |
f <- 30 # which is n – y in our previous slides | |
# The Prior | |
curve(dbeta(x,a,b),lty=3,lwd=4, col="red", ylim = range(0,10)) | |
# The Likelihood | |
curve(dbeta(x,s+1,f+1),add=TRUE, lty=2,lwd=3, col="blue") | |
# The Posterior | |
curve(dbeta(x,a+s,b+f), from=0, to=1, add=TRUE, xlab="p",ylab="Density",lty=1,lwd=3, col="green") | |
legend(.1,8,c("Posterior","Likelihood","Prior"),lty=c(1,2,3),lwd=c(3,3,3), | |
col=c( "green", "blue", "red")) | |
### BETA BINOMIAL COIN TOSS - small sample | |
a <- 5 | |
b <- 5 | |
s <- 7 # which is y in our previous slides | |
f <- 3 # which is n - y in our previous slides | |
# The Prior | |
curve(dbeta(x,a,b),lty=3,lwd=4, col="red", ylim = range(0,5)) | |
# The Likelihood | |
curve(dbeta(x,s+1,f+1),add=TRUE, lty=2,lwd=3, col="blue") | |
# The Posterior | |
curve(dbeta(x,a+s,b+f), from=0, to=1, add=TRUE, xlab="p",ylab="Density",lty=1,lwd=3, col="green") | |
legend(.1,8,c("Posterior","Likelihood","Prior"),lty=c(1,2,3),lwd=c(3,3,3), | |
col=c( "green", "blue", "red")) | |
# Posterior Mean | |
qbeta(0.5, a + s, b + f) | |
# Bayesian 90% confidence interval | |
qbeta(c(0.05,0.95), a + s, b + f) | |
#### What if all ten tosses are heads? | |
a <- 5 | |
b <- 5 | |
s <- 10 # which is y in our previous slides | |
f <- 0 # which is n - y in our previous slides | |
# The Prior | |
curve(dbeta(x,a,b),lty=3,lwd=4, col="red", ylim = range(0,8)) | |
# The Likelihood | |
curve(dbeta(x,s+1,f+1),add=TRUE, lty=2,lwd=3, col="blue") | |
# The Posterior | |
curve(dbeta(x,a+s,b+f), from=0, to=1, add=TRUE, xlab="p",ylab="Density",lty=1,lwd=3, col="green") | |
legend(.1,8,c("Posterior","Likelihood","Prior"),lty=c(1,2,3),lwd=c(3,3,3), | |
col=c( "green", "blue", "red")) | |
################ FITTING MAXIMUM LIKELIHOOD ################### | |
# Assume gaussian pdf for epsilon | |
# y = XB + epsilon, where epsilon ~ N(0, sigma^2) | |
# import US_Homicides_1990.csv | |
data <- read.csv(file.choose()) | |
data<-data.matrix(data) | |
one <- rep(1,dim(data)[1]) # create constant | |
one <- data.matrix(one) | |
y <- data[,1] | |
y<-data.matrix(y) | |
X <- data[,(-1)] | |
X <- cbind(one,X) | |
X<- data.matrix(X) | |
# define log-likelihood function | |
# Note: SIGNS HAVE BEEN REVERSED as optim() only finds MINIMUM | |
lnL <- function(p) | |
(dim(data)[1]/2)*log(2*3.1415926)+(dim(data)[1]/2)*log(p[1])+(1/(2*p[1]))*(t(y-X%*%p[2:8])%*% (y-X%*%p[2:8])) | |
# Find maximum - NOTE: OPTIM finds MINIMUM, so we have to REVERSE all signs | |
out <- optim(c(rep(1,8)), lnL, hessian=TRUE, method = "BFGS") | |
beta <- matrix(out$par) | |
# Note: first value is variance --> ignore | |
# Second value is for intercept | |
# remainder are coefficients for parameters | |
stdev <- matrix(sqrt(diag(solve(out$hessian)))) | |
t_beta <- beta/stdev # t-statistics | |
cbind(beta,t_beta) # bind coefficient estimate & t-statistics | |
# Interpret coefficients positive or negative | |
# Interpret t: | |
# Remember that you are controlling for all other variables | |
##### LME EXAMPLE FROM ASSIGNMENT 1 ############ | |
# just estimating mean & SD | |
y=c(6.1,5.9,6.2,5.7,5.1,6.7,5.8,6.4,6.8,4.5,6.3,6.1,6.6,6.1,5.6,6.3,6.6,5.4,6.1,6.2,5.2,6.2, | |
6.3,6.7,6.5,5.8,5.4,4.8,6.8,6.3,6.5,6.2,6,5.8,5.2,6.3,6,6.1,4.8,6) | |
n = length(y) | |
## Define function | |
# note that we switch signs (minus'es to plus'es) to accomodate | |
# the optim() function, which _minimises_ by default | |
# p[1] will be the variance, p[2] will be the mean | |
lnL <- function(p) | |
(n/2)*log(2*pi*p[1]) + (1/(2*p[1]))*(t(y-p[2])%*%(y-p[2])) | |
out <- optim(c(rep(1,2)), lnL, hessian=TRUE, method = "BFGS") | |
# print variance | |
print(out$par[1]) | |
# print mean | |
print(out$par[2]) | |
############# LECTURE 4 | |
#### Slides #2: "The bayesian computation problem" on | |
## Example: Waiting until a hit in baseball - using beta-binomial conjugates | |
# Note that observations are WAIT TIMES, but we are estimating p = probability of hit | |
# observations= wait time until hit baseball | |
# note that 0's = non-hits | |
baseball <- c(0,2,0,4,1,0,2,0,1,0,0,1,1,3,1,0,0,0,1,6,0,9,0,4,1,9,1,0,3,4,5,5,1,0,2, | |
4,0,4,0,3,2,1,0,1,3,7,0,3,1,2,14,4,0,1,6,1,10,1,2,0,1,0,4,5,0,7,3,1,2,1,2,1,2,2, | |
4,3,3,1,1,2,1,2,7,0,3,1,2,2,2,2,0,3,4,1,1,0,0,1,1,1,11,2,2,1,3,1,0,1,2,1,1,1,0,0, | |
2,0,10,1,2,2,1,1,3,1,1,0,0,1,0,1,0,1,1,0,1,0,0,0,2,1,4,5,5,0,0,0,0,2,0,8,5,2,11, | |
8,0,7,1,3,1) | |
n <- length(baseball) # [1] 159 | |
s <- sum(baseball) # [1] 336 | |
# prior = beta distribution | |
a = 44 | |
b = 102 | |
curve(dbeta(x,a,b), from=0.2, to=0.5, | |
xlab="p",ylab="Density",lty=1,lwd=3, main= "Prior") | |
# The likelihood function | |
# A function of p, defined as: | |
likelihood = function(p) | |
{ | |
p^n*(1-p)^s | |
} | |
curve(likelihood, from=0.2, to=0.5, xlab="p", ylab="Density", main= "Likelihood Function") | |
abline(v=0.321, lty=2) | |
# POSTERIOR - using beta-binomial known conjugacy | |
a=44 | |
b=102 | |
n=159 | |
s=336 | |
A=n+a | |
B=s+b | |
curve(dbeta(x,A,B), from=0.2, to=0.5, | |
xlab="p",ylab="Density",lty=1,lwd=3, main= "Posterior Density Function (1)") | |
# --> note, no normalisation = not a proper density, but still correct shape | |
## or alternatively we can define the posterior function explicitly | |
posterior = function(p) | |
{ | |
p^(n+a-1)*(1-p)^(s+b-1) | |
} | |
#plot of the posterior density function | |
curve(posterior, from=0.2, to=0.5, xlab="p", ylab="Density",lty=1,lwd=3 , main= "Posterior Density Function (2)") | |
# Using the qbeta function, find the median of the posterior density of p. | |
a=44 | |
b=102 | |
n=159 | |
s=336 | |
A=n+a | |
B=s+b | |
posterior.median = qbeta(p = 0.5, A, B) | |
posterior.median | |
# Using the qbeta function, construct a 95% Bayesian interval estimate for p. | |
Bayesian_95Interval = qbeta(c(0.025,0.975), A, B) | |
Bayesian_95Interval | |
# Using the rbeta function, simulate 100 values from the posterior density of p. | |
a=44 | |
b=102 | |
n=159 | |
s=336 | |
A=n+a | |
B=s+b | |
sim.posterior=rbeta(100, A, B) | |
# Use the hist function on the simulated sample to display the posterior density. | |
hist(sim.posterior, col="gray" , main="Simulation for Posterior Density", xlab="p", freq=FALSE ) | |
curve(dbeta(x,a,b), col="red", lty=2, add=TRUE) | |
curve(dbeta(x,A,B), col="blue", lty=2, add=TRUE) | |
## Use simulation to get mean & SD | |
# Using the simulated draws, approximate the mean and S.D. of the posterior density | |
mean(sim.posterior) | |
# very close to A/(A+B) | |
sd(sim.posterior) | |
# Using the simulated draws, construct a 95% Bayesian interval estimate. | |
sim_posterior_95Interval = quantile(sim.posterior, c(0.025,0.975)) | |
sim_posterior_95Interval | |
######################################################### | |
######### Oct 1 - MARKOV CHAIN MONTE CARLO ############# | |
######################################################### | |
#### EXAMPLE #### | |
# Demonstrate bayes | |
install.packages('LearnBayes', lib='c:\\notbackedup\\R\\') | |
library(LearnBayes, lib='c:\\notbackedup\\R\\')# a Bayesian package in R | |
## Define prior | |
# a sequence of mid-points | |
midpt <- seq(0.05, 0.95, by = 0.1) # a sequence of mid-points | |
# a sequence of "heights" for the prior density | |
# these are clearly not proper densities. | |
prior <- c(1, 5.2, 8, 7.2, 4.6, 2.1, 0.7, 0.1, 0, 0) | |
# Divide by their sum total, they now become proper densities. | |
prior <- prior/sum(prior) | |
# histprior is a function in LearnBayes package that plots histogram | |
curve(histprior(x,midpt,prior), from=0, to=1,ylab="Prior density",ylim=c(0,.3)) | |
## now find posterior | |
s <- 4 # 4 heads (successes) | |
f <- 6 # 6 tails (failures) | |
# define "myfn" as the product of the likelihood and the prior | |
myfn <- function(x) | |
histprior(x,midpt,prior)*dbeta(x,s+1,f+1) | |
curve(myfn(x), from=0, to=1, ylab="Posterior density") | |
##--> SHAPE is BIZARRE, doesn't correspond to any nice, known distributions | |
# Not a proper density because doesn't sum to 1 | |
## In this case, easy to solve | |
# we can just numerically evaluate area under curve to get normalising constant | |
pvals <- seq(0, 1, length=1000) | |
# evaluating "myfn" over the space of [0,1] | |
mean(myfn(pvals)) | |
## --> this is our normalising constant --> 0.1725680 | |
post <- function(x) | |
histprior(x,midpt,prior)*dbeta(x,s+1,f+1)/0.1725680 | |
## simulate values to get mean, etc. | |
# 1000 posterior values evaluated at 0.001 intervals | |
pev <- post(pvals) | |
# the above "pev" is close but not entire accurate due to rounding. | |
# standardize it | |
pev <- pev/sum(pev) | |
# sample WITH replacement from pvals using pev as probability input. | |
ps <- sample(pvals, replace = TRUE, prob = pev) | |
# The Mean of the simulated sample | |
mean(ps) | |
# The Median of the simulated sample | |
quantile(ps, 0.5) | |
# 95% confidence interval | |
quantile(ps, c(0.05,0.95)) | |
### MARKOV CHAINS #### | |
# The sunny/rainy example | |
# for 1 day | |
P = matrix(c(0.90,0.50,0.10,0.50), 2,2) | |
# 2 days | |
P%*%P | |
# long term equilibrium - N days | |
N = 20 | |
P = matrix(c(0.90,0.50,0.10,0.50),2,2) | |
P11 = matrix(0,N,1) | |
P12 = matrix(0,N,1) | |
P21 = matrix(0,N,1) | |
P22 = matrix(0,N,1) | |
P11[1] = P[1,1] | |
P12[1] = P[1,2] | |
P21[1] = P[2,1] | |
P22[1] = P[2,2] | |
Temp = P | |
for (i in 1:(N-1)) | |
{ | |
Temp = Temp%*%P | |
P11[i+1] = Temp[1,1] | |
P12[i+1] = Temp[1,2] | |
P21[i+1] = Temp[2,1] | |
P22[i+1] = Temp[2,2] | |
} | |
EP = Temp # this is the equilibrium probability | |
par(mfrow=c(2,2)) | |
plot(P11) | |
plot(P12) | |
plot(P21) | |
plot(P22) | |
# notice the burn-in period? Must throw that away | |
## Eventually markov chain converges to EQUILIBRIUM DISTRIBUTION | |
## Equilibrium is what we're interested in, | |
# This is the distribution ignoring the conditionality of time steps | |
#### MCMC ##### | |
## Example of ages | |
dval <- c(200,180,170,165,190,195,184,183,150,157) | |
# log-prior = a function of mu. Note: log taken | |
lprior <- function(mu) dnorm(mu, 175, 100,log = TRUE) | |
# log-likelihdood = sum of logs of normal probabilities across all observations | |
llike <- function(mu) sum(dnorm(dval, mean = mu, sd = 15, log = TRUE)) | |
# log-posterior function - normalising constant not needed | |
lpost <- function(mu) lprior(mu) + llike(mu) | |
#now, do MCMC = simulate values and save if accepted | |
mu0 <- 180 | |
s <- 10 # this controls the standard deviation of normal distribution from which "jumps" are drawn | |
n <- 10000 # number of steps | |
mus <- c(mu0, numeric(n)) | |
for(i in 1:n) | |
{ | |
# generating proposal value of mu = current value + random draw value | |
mup <- mus[i] + rnorm(1,0,s) # s is SD of the jump | |
p <- exp(lpost(mup) - lpost(mus[i])) # proposal probability = RATIO ofposteriors | |
# either accept or reject the proposal | |
if(p < runif(1)) mus[i+1] <- mus[i] else mus[i+1] <- mup | |
} | |
# discard burn-in | |
musb <- mus[-(1:1001)] # discards burn-in values | |
# Examine our chain with code package | |
install.packages('coda', lib='c:\\notbackedup\\R\\') | |
library(coda, lib='c:\\notbackedup\\R\\')# a Bayesian package in R | |
musc <- as.mcmc(musb) # converts to MCMC object | |
rejectionRate(musc) # get rejection rate | |
# 0.53 = 53% of proposals rejected | |
## PLOT | |
# plot posterior | |
par(mfrow=c(2,1)) # 2 graphs stacked on top of each other | |
traceplot(musc) | |
densplot(musc) | |
summary(musc) | |
##################### END OCT 1 LEC ######################## | |
##### OCT 8 LEC: MCMC ################ | |
install.packages('ismev', lib='c:\\notbackedup\\R\\') | |
library(ismev, lib='c:\\notbackedup\\R\\')# a Bayesian package in R | |
install.packages('LearnBayes', lib='c:\\notbackedup\\R\\') | |
library(LearnBayes, lib='c:\\notbackedup\\R\\')# a Bayesian package in R | |
data(engine) | |
plot(engine$Corr,engine$Time) | |
# as corosion increases, time gets shorter | |
# We model as exponential distribution | |
# x = corrosion | |
# y = a * x ^b | |
# so, theta = (a,b) | |
# Exponential distirbutions are parameterised by RATE parameter | |
# if we just use MLE | |
# First specify the log likelihood function | |
fn = function(p, engine) | |
{ | |
n = nrow(engine) # number of observations | |
y = engine$Time # engine failure time | |
x = engine$Corrosion # corrosion as a predictor | |
-n*log(p[1]) - p[2]*sum(log(x)) + p[1]*sum(x^p[2] * y) | |
} | |
out = optim(c(1.16, 0), fn, method = "BFGS", | |
hessian = TRUE, engine = engine) | |
round(out$par,3) | |
round(sqrt(diag(solve(out$hessian))), 3) | |
plot(engine$Corr,engine$Time) | |
CorrLev = seq(0, 4.2, len=100) | |
lines(CorrLev, 1/(1.133* CorrLev^0.479)) # these are coefficients | |
# "The rate parameter of an exponential distribution is the reciprocal of the mean" - so we need to use reciprocal | |
# so if we predict rate, we take reciprocal to get mean | |
## The bayesian method | |
# define the log of the posterior | |
lpost <- function(p, engine) | |
{ | |
y <- engine$Time | |
x <- engine$Corrosion | |
# the log of the joint prior is equal to the sum of log marginal | |
# Note: this is because we defined that as independant so ab = a*b | |
lprior <- dnorm(p[1],0,100,log=TRUE) + dnorm(p[2],0,100,log=TRUE) | |
# the likelihood uses the exponential function | |
# note that the "a" coefficient is "exp(a)" | |
llike <- sum(dexp(y, rate = exp(p[1])*x^p[2], log = TRUE)) | |
# the log posterior is the sum of log prior and log likelihood | |
lprior + llike | |
} | |
start <- c(log(1.16),0) # starting values. We've assumed p[1] is logged | |
# see next slide for details of gibbs() | |
ftmcmc <- gibbs(lpost, start, 100000, c(0.25,0.25), engine = engine) | |
# get the accept rate. remember, we want this to be ~50-60% | |
ftmcmc$accept | |
# get the sample chain | |
ftmcmc$par | |
hist(ftmcmc$par[,1]) # hist of b (remmeber, we are sampling from log) | |
hist(exp(ftmcmc$par[,1])) | |
# get means | |
mean(exp(ftmcmc$par[-(1:10000),1])) | |
mean(ftmcmc$par[-(1:10000),2]) | |
########### ELICITATION | |
quantile1 <- list(p=.05,x=0.25) # 5%tile at x = 0.25 | |
quantile2 <- list(p=.95,x=0.75) # 95%tile at x = 0.75 | |
# select parameters for a Beta distribution with the above quantiles. | |
beta.select(quantile1,quantile2) | |
########## COIN SELECTION EXAMPLE | |
# drawing pennies from jar then tossing | |
# each penny could have different probability of landing on heads | |
# But we know it has a uniform distribution from 0-1 | |
# we select a coin at random, flip it 10 times, get 3 heads | |
# Q. we toss it 12 more times, what is p(4 heads)? | |
# two parts | |
# p(heads) for chosen coin | |
# p(4 heads | 12 tosses) for that coin | |
# So what is joint density? | |
# {see slides} | |
random.coin.gibbs = function (p=0.5, m = 1000) | |
{ | |
S = matrix(0,m,2) | |
dimnames(S)[[2]] = c("p","y") | |
for (j in 1:m) | |
{ | |
y = rbinom(1, size=12, prob=p) | |
p = rbeta(1, y+4, 20-y) | |
S[j,] = c(p,y) | |
} | |
return(S) | |
} | |
sim.values = random.coin.gibbs() | |
plot(sim.values) | |
# p(y=4) | |
table(sim.values[,"y"]) | |
# so, reading off the table, y=4 = number of 4's / total sample 1000 gives us p(y=4) | |
# p: | |
mean(sim.values[,"p"]) | |
################################################################ | |
### PART 2 of course ########################################### | |
################################################################ | |
############################################# | |
##### MARKOV CHAIN ###### | |
############################################# | |
## Finding long-term steady-state | |
transmat<-matrix(c(0.9, 0.2, 0.1, 0.8), nrow=2, byrow=2) | |
s0<-matrix(c(3000,4000), nrow=2) | |
s2<-transmat%*%transmat%*%s0 | |
steady50<-transmat | |
for(i in 1:50) | |
steady50<-transmat%*%steady50 | |
steady50 | |
steady51<-transmat | |
for(i in 1:51) | |
steady51<-transmat%*%steady51 | |
steady51 | |
#### Generate sample for Gibbs | |
n<-50 #Data | |
x<-rnorm(n,5,2) # mean is 5 and sd=2 | |
N<-1000 #Run MC=2 chains of length N=1000 | |
simmat<-matrix(0, ncol=2, nrow=N) # store the results | |
for (j in 1:2) { #Loop over chains | |
tau<-rgamma(1,n/2,1/2) #Starting value for tau | |
for (i in (1:N)) { #Gibbs iterations | |
mu<-rnorm(1,mean(x),sqrt(1/(tau*n))) #Update mu | |
tau<-rgamma(1,n/2,sum((x-mu)^2)/2) #Update tau | |
simmat[i,1]<-mu #Save results | |
simmat[i,2]<-tau | |
} | |
} | |
par(mfrow=c(2,2)) | |
hist(simmat[,1]) | |
hist(simmat[,2]) | |
plot(simmat[,1], type="l") | |
plot(simmat[,2], type="l") | |
##### Metropolis algorithm | |
## example with uniform distribution | |
metropolis1<-function(n, a, b) | |
{ | |
sim<-numeric(n) | |
sim[1]<-x<-0 | |
for (i in 2:n) { | |
prob=runif(1,a,b) | |
cand=x+prob | |
ratio=min(1,dnorm(cand)/dnorm(x)) | |
u=runif(1) | |
if (u < ratio) | |
x=cand | |
sim[i]=x | |
} | |
sim | |
} | |
par(mfrow=c(1,1)) | |
res<-metropolis1(1000,-1, 1 ) | |
plot(res, type="l", col="dark red") | |
####### Metropolis-Hasting | |
## also fix to make log-likelihood | |
metropolis2<-function(n, a, b) { | |
sim<-numeric(n) | |
sim[1]<-x<-0 | |
logllkelihood.old<-dnorm(x, log=TRUE) | |
for (i in 2:n) { | |
cand<-x+ runif(1,a,b) | |
logllkelihood.new<-dnorm(cand, log=TRUE) | |
ratio<-min(1, exp(logllkelihood.new- logllkelihood.old)) | |
u<-runif(1) | |
if (u < ratio){ | |
x<-cand | |
logllkelihood.new <- logllkelihood.old | |
} | |
sim[i]<-x | |
} | |
sim | |
} | |
res<-metropolis2(1000,-1, 1 ) | |
plot(res, type="l", col="dark red") | |
#### Generalised Extreme Value distribution example | |
# log-likelihood function | |
# theta is 3-value vector | |
loglikelihood<-function(theta, mdata){ | |
mu<-theta[1] | |
shape<-theta[2] | |
sigma<-theta[3] | |
n<-length(mdata) | |
zz<-1+shape*(mdata-mu)/sigma | |
# note we zero any values that are <0 | |
# not quite zero, as can't log 0 | |
if(any(zz<=0)) { | |
return(-1e16) | |
} | |
llikelihood<- -log(sigma)-(1+1/shape)*log(zz)-(zz)^(-1/shape) | |
sum(llikelihood) | |
} | |
# mcmc function | |
# std is for accept/reject values | |
# if STD is TOO SMALL then we'll accept everything | |
# we want 40-60% accept rate | |
mcmcevd<- function(nn, init,std, mdata, burn=0) { | |
theta<-init | |
simmat<- matrix(NA,nn+1, ncol=length(theta)) | |
colnames(simmat) <- c("loc","shape","scale") | |
simmat[1,]<-init | |
acc.counts<-numeric(3) | |
llikelihood.old<-loglikelihood(theta, mdata) | |
for(i in 1:nn){ | |
logposterior.old<-llikelihood.old+dnorm(theta[1], sd=100, log=TRUE) #Prior and logLikelihood | |
# Proposal value for location | |
loc.old<-theta[1] | |
loc.new<-rnorm(1,loc.old,std[1]) #Simulating a value from a normal distribution | |
theta[1]<-loc.new | |
llikelihood.new<-loglikelihood(theta, mdata) | |
logposterior.new<-llikelihood.new+dnorm(theta[1], sd=100, log=TRUE) #Prior and logLikelihood | |
##DECISION WHETHER REJECT/ACCEPT | |
alpha<- min(exp(logposterior.new-logposterior.old),1) # remove log | |
u<-runif(1,0,1) # generate new value | |
if(alpha > u) # decision | |
{ | |
simmat[(i+1),1]<-loc.new | |
acc.counts[1]<-acc.counts[1]+1 | |
llikelihood.old<-llikelihood.new | |
} | |
else | |
simmat[(i+1),1]<-loc.old | |
theta[1]<-simmat[(i+1),1] | |
logposterior.old<-llikelihood.old+dnorm(theta[2], sd=100, log=TRUE) #Prior and logLikelihood | |
# Proposal value for shape | |
shape.old<-theta[2] | |
shape.new<-rnorm(1,shape.old,std[2]) #Simulating a value from a normal distribution | |
theta[2]<-shape.new | |
llikelihood.new<-loglikelihood(theta, mdata) | |
logposterior.new<-llikelihood.new+dnorm(theta[2], sd=100, log=TRUE) #Prior and logLikelihood | |
##DECISION WHETHER REJECT/ACCEPT | |
alpha<- min(exp(logposterior.new-logposterior.old),1) | |
u<-runif(1,0,1) | |
if(alpha > u) | |
{ | |
simmat[(i+1),2]<-shape.new | |
acc.counts[2]<-acc.counts[2]+1 | |
llikelihood.old<-llikelihood.new | |
} | |
else | |
simmat[(i+1),2]<-shape.old | |
theta[2]<-simmat[(i+1),2] | |
## Third parameter: scale | |
# this is special: cannot have negative value | |
# We take value from LOG-NORMAL | |
# This is not a symmetric distribuiton | |
# so we need special non-symmetric component in accept/reject | |
sigma.old<-theta[3] | |
sigma.new<-exp(rnorm(1,log(sigma.old),std[3])) #Simulating a value from a lognormal distribution | |
theta[3]<-sigma.new | |
llikelihood.new<-loglikelihood(theta, mdata) | |
logposterior.new<-llikelihood.new+dnorm(theta[2], sd=100, log=TRUE) #Prior and logLikelihood | |
##DECISION WHETHER REJECT/ACCEPT | |
nonsymm<-log(sigma.new/sigma.old) | |
alpha<- min(exp(logposterior.new-logposterior.old+nonsymm),1) | |
u<-runif(1,0,1) | |
if(alpha > u) | |
{ | |
simmat[(i+1),3]<-sigma.new | |
acc.counts[3]<-acc.counts[3]+1 | |
llikelihood.old<-llikelihood.new | |
} | |
else | |
simmat[(i+1),3]<-sigma.old | |
theta[3]<-simmat[(i+1),3] | |
} ##End of simulation loop | |
simmat<-simmat[(burn+1):nn,] | |
acc.rates<-c(acc.counts)/nn | |
list(simmat=simmat, acc.rates=acc.rates) | |
} #End of Function | |
mdata<-rnorm(25, 25, 2) # this is our observed dataset | |
theta<-c(2,0.5,5) # starting value | |
loglikelihood(theta, mdata) | |
init<-c(5,3,1) | |
std<-c(0.7, 0.6, 0.3) | |
xx<-mcmcevd(nn=10000, init,std, mdata=mdata, burn=1000) | |
temp<-xx$simmat | |
xx$acc.rates | |
par(mfrow=c(3,1)) | |
plot(temp[,1], type="l") | |
plot(temp[,2], type="l") | |
plot(temp[,3], type="l") | |
###### GLM with Bayes | |
library("MASS") | |
install.packages("mvtnorm") | |
install.packages("mfp") | |
library(mvtnorm) # multivariate normal distributions | |
library(mfp) # | |
# import dataset from folder | |
attach(bodyfat) | |
summary(bodyfat) | |
x1<-Abdomen | |
x2<-Age | |
x3<-Height | |
x4<-Neck | |
y<-Siri | |
#=======USING MLE============================ | |
RegModel.1 <- lm(y~x1+x2+x3+x4) | |
summary(RegModel.1) | |
#==============USING BAYES=============== | |
X<-cbind(1, x1, x2, x3, x4) | |
dim(X) # 252 rows, 5 columns | |
Vbeta<-solve(t(X)%*%X) # solve = find inverse | |
Vbeta # diagnoals are variance, off-diagonals are co-variance | |
betahat<-solve(t(X)%*%X)%*%t(X)%*%y | |
betahat | |
df<-nrow(X)-ncol(X) # degrees of freedom | |
df | |
ssquared<-(sum(t(y-X%*%betahat)%*%(y-X%*%betahat)))/df | |
#simulate from the posterior | |
n.sims<-10000 | |
sigma<-sqrt(1/rgamma(n.sims, df/2, df*ssquared/2)) | |
# sigma | |
bbeta<-matrix(betahat, n.sims, ncol(X), byrow=T)+sigma*rmvnorm(n.sims, rep(0, ncol(X)), Vbeta) | |
apply(bbeta, 2, quantile, probs=c(0.025, 0.5, 0.975)) | |
# can now plot trace, density, etc | |
#==================USING LEARNBAYES PACKAGE==== | |
install.packages("LearnBayes") | |
library(LearnBayes) | |
bmod <- blinreg(y,X, 100000) # prior is null by default | |
str(bmod) # contains two vectors: for Beta and Sigma | |
summary(bmod$beta) # see the mean & median estimates: very close to above | |
betas<-bmod$beta | |
par(mfrow=c(1,1)) | |
plot(betas[,1], type="l") | |
plot(density(betas[,1]), type="l") | |
#==================USING MCMCpack PACKAGE==== | |
# The model formula is the same, but here we have some new options. | |
# for example: starting value, can incorporate more info, | |
install.packages("MCMCpack") | |
library(MCMCpack) | |
model.mcmc <- MCMCregress(y~x1+x2+x3+x4, | |
data = bodyfat, | |
burnin = 2000, | |
mcmc = 10000, | |
thin = 1, | |
seed = NA, | |
beta.start = NA) | |
str(model.mcmc) | |
# mcmc = chain | |
summary(model.mcmc) | |
###### logistic regression with Bayes | |
## Basic MLE approach | |
x<-c(0,1,2) | |
ca<-c(10,12,85) | |
cl<-c(10,65,165) | |
logitmod<-glm(cbind(ca,cl)~x,family="binomial") | |
str(logitmod) | |
thetahat<-logitmod$coeff[2] | |
exp(thetahat) | |
# 1.18: so odds of diseased increase 18% with each extra x | |
var<-vcov(logitmod)[2,2] | |
# 95% confidence: | |
exp(thetahat-1.96*sqrt(var)) | |
exp(thetahat+1.96*sqrt(var)) | |
#### logistic via Bayesian - use INLA package | |
## INLA: | |
install.packages(c("fields","numDeriv","pixmap", "mvtnorm","R.utils")) | |
source("http://www.math.ntnu.no/inla/givemeINLA.R") | |
library(INLA) | |
inla.upgrade(testing=TRUE) | |
# First, no priors | |
# 1. create a list of values [?], convert to dataframe | |
dat<???list(x=x,ca=ca,cl=cl) | |
dat<???as.data.frame(rbind(ca,cl,x)) | |
mod.inla<???inla(ca~x, family="binomial" ,data=dat,Ntrials=ca+cl) | |
str(mod.inla) | |
# so many parameters... | |
# mainly interested in marginal distributions for parameters | |
summary(mod.inla) | |
vals1 = mod.inla$marginals.fixed | |
intercept = vals1$'(Intercept)' | |
x=vals1$x | |
par(mfrow=c(2,1)) | |
plot(intercept, type="l", main="Intercept") | |
plot(x, type="l", main='x') | |
# don't really care about intercept | |
# get summary statistics for x, x values | |
# median: 0.16 --> 16% increase odds for each extra 'a' | |
#=====Now with informative priors===== | |
Upper975<???log(1.5) | |
W<???(log(Upper975)/1.96)^2 | |
mod.inla1<???inla(ca~x,family='binomial',data=dat,Ntrials=ca+cl, control.fixed=list(mean.intercept=c(0),prec.intercept=c(.1), mean=c(0),prec=c(1/W))) | |
summary(mod.inla1) | |
#=======MULTIPLE LINEAR REG====== | |
Res.inla<-inla(y~x1+x2+x3+x4, data=bodyfat, family="gaussian") | |
summary ( Res.inla) | |
# estimated values = marginal distributions | |
vals<-Res.inla$margianls.fixed | |
ex1 = vals$x1 | |
ex2 = vals$x2 | |
ex3 = vals$x3 | |
ex4 = vals$x4 | |
# then plot them with plot, type="l" | |
################ ONE-WAY ANOVA | |
sigma.mu<-0.3 | |
sigma.e<-0.2 | |
sample<-10 | |
trt<-5 | |
beta0<-0.5 | |
mu<-rnorm(sample, mean=0, sd=sigma.mu) | |
e<-rnorm(sample*trt, mean=0, sd=sigma.e) | |
yvec<-beta0 + rep (mu , each=trt) + e | |
simdata<-data.frame(y=yvec, trt=rep(1:sample, each=trt)) | |
# basic ANOVA | |
fit1 <- aov(y ~ trt, data=simdata) | |
summary(fit1) | |
# | |
# Bayesian | |
fit2<-inla(y~f(trt, model="iid"), data=simdata) | |
summary(fit2) | |
########################################################## | |
### PART 5: BAYES FACTOR & TESTING #################### | |
########################################################### | |
install.packages('LearnBayes') | |
# Using Bayes factor to compare models with different | |
# assumptions of sampling distribution | |
# See slide see slide 32: "Models for Fire Calls" | |
fire.counts <- c(75, 88, 84, 99, 79, 68, 86, 109, 73, 85, 101, 85, | |
75, 81, 64, 77, 83, 83, 88, 83, 78, 83, 78, 80, | |
82, 90, 74, 72, 69, 72, 76, 76, 104, 86, 92, 88) | |
# Step 1: use histograms to compare different posterior sampling distributions | |
hist(fire.counts, probability=TRUE, ylim=c(0, .08)) | |
x <- 60:110 | |
lines(x, dpois(x, lambda=mean(fire.counts)), col="red") | |
lines(x, dnorm(x, mean=mean(fire.counts), sd=12), col="blue") | |
lines(x, dnorm(x, mean=mean(fire.counts), sd=6), col="green") | |
legend("topright", legend=c("M1: Poisson(theta)", "M2: N(theta, 12)", "M3: N(theta, 6)"), col=c("red", "blue", "green"), lty=1) | |
# which is best? | |
# We need to calculate log likelihood for each | |
# use LearnBayes to do integration | |
#==========Model M1========= | |
model.1 <- function(theta, y){ | |
sum(log(dpois(y, theta))) + dgamma(theta, shape=280, rate=4) | |
} | |
library(LearnBayes) | |
log.pred.1 <- laplace(model.1, 80, fire.counts)$int | |
log.pred.1 | |
#==========Model M2========= | |
model.2 <- function(theta, y){ | |
sum(log(dnorm(y, theta, 6))) + dgamma(theta, shape=280, rate=4) | |
} | |
log.pred.2 <- laplace(model.2, 80, fire.counts)$int | |
#==========Model M3========= | |
model.3 <- function(theta, y){ | |
sum(log(dnorm(y, theta, 12))) + dgamma(theta, shape=280, rate=4) | |
} | |
log.pred.3 <- laplace(model.3, 80, fire.counts)$int | |
## Calculate Bayes Factors - subtract due to logs | |
exp(log.pred.1-log.pred.2) | |
# very large: model 1 better than 2 | |
exp(log.pred.2-log.pred.3) | |
############ Linear regression comparison ############ | |
install.packages('BayesFactor') | |
library(BayesFactor) | |
data(attitude) | |
head(attitude) | |
lmObj = lm(rating ~ ., data = attitude) | |
summary(lmObj) | |
# only complaints is significant variable | |
# RegressionBF calculates bayes factors for many models at once | |
bf = regressionBF(rating ~ ., data = attitude) | |
length(bf) # 63 models = all possible combinations. 2^6 = 64 - 1 = 63 | |
head(bf, n=6) # displays only the 6 models | |
################ HYPOTHESIS TESTING ########################## | |
### One sample t-test ### | |
# classical approach | |
# first, import dataset with load workspace | |
t.test(x=BodyTemperature$Temperature-98.4) | |
# Bayes method | |
onesam.test1 = ttestBF(x=BodyTemperature$Temperature-98.4) | |
onesam.test1 | |
### Independant samples t-test | |
# is there a difference in IQ between male vs. female | |
# import data | |
classical.test = t.test(iqf ~ sex, data = lead_environ,var.equal = TRUE) | |
# p=0.5 --> reject alternative | |
# bayes factor | |
bf = ttestBF(formula = iqf ~ sex, data = lead_environ,var.equal = TRUE ) | |
bf | |
# bayes factor = 0.26 where null: mu1-mu2 = 0 | |
# ??? therefore, we DO NOT accept alternative hypothesis. | |
### Paired sample t-test | |
# import data | |
PrisonStressSport = sp | |
attach(PrisonStressSport) | |
ccpaired.test = t.test(x=PSSbefore, y=PSSafter, paired=TRUE, data=PrisonStressSport) | |
# p=0.0178 = reject Null | |
ttestBF(x=PSSbefore, y=PSSafter, paired=TRUE) | |
# Bayes factor = 3.48 = reject Null | |
########################################################## | |
### PART 6: Matrix, conjugates, markov #################### | |
########################################################### | |
# MAtrix | |
dataset<-cbind(c(1991:2015), rnorm(25, 40, 2)) | |
t<-dataset[,1] | |
#design matrix | |
designmat<-cbind(1, t) | |
#lets assume a0<-0.2 and a1<-0.8 | |
a0<-0.2; a1<-0.8 # intercept, slope | |
coeff<-c(a0, a1) | |
beta<-designmat%*%coeff # calculate predictions | |
beta | |
################################################ | |
### IMPUTING MISSING VALUES | |
################################################ | |
install.packages("mice", dependencies=TRUE) | |
library(mice) | |
## Create a dataset with missing values | |
# 1. create a dataset | |
set.seed(100) | |
x1.r<-runif(200) | |
x2.r<-runif(200) | |
y.r<-runif(200) | |
# 2. we randomly introduce missing values | |
miss.x1<-rbinom(200, 1, prob=0.9) | |
miss.x2<-rbinom(200, 1, prob=0.9) | |
miss.y<-rbinom(200, 1, prob=0.9) | |
x1<-ifelse(miss.x1==1, x1.r, NA) | |
x2<-ifelse(miss.x2==1, x2.r, NA) | |
y<-ifelse(miss.y==1, y.r, NA) | |
datmat<-cbind(x1, x2, y) | |
# use mice to create 5 different datasets with filled missings | |
mice.dat<-mice(datmat, m=5, maxit=20) | |
#bwplot(mice.dat) | |
# fit regression line to 5 mice-filled datasets | |
fit<-with(mice.dat, lm(y~x1+x2)) | |
# compare our model with what we'd get if we just used original dataset | |
summary(pool(fit)) | |
summary(lm(y~x1+x2), datmat) | |
# fitting a model to 5 imputed datasets | |
# gives us more information that fitting | |
# to original dataset with missing values | |
# | |
########################## | |
# example 2: make 30% missing | |
set.seed(100) | |
x1.r<-runif(200) | |
x2.r<-runif(200) | |
y.r<-runif(200) | |
#we randomly generate missing values in the dataset | |
miss.x1<-rbinom(200, 1, prob=0.7) | |
miss.x2<-rbinom(200, 1, prob=0.7) | |
miss.y<-rbinom(200, 1, prob=0.7) | |
x1<-ifelse(miss.x1==1, x1.r, NA) | |
x2<-ifelse(miss.x2==1, x2.r, NA) | |
y<-ifelse(miss.y==1, y.r, NA) | |
datmat<-cbind(x1, x2, y) | |
mice.dat<-mice(datmat, m=5, maxit=20) | |
# Compare model with imputed data to that without | |
fit<-with(mice.dat, lm(y~x1+x2)) | |
summary(pool(fit)) | |
summary(lm(y~x1+x2), datmat) | |
########################### | |
# Example 3: introduce interactions, make 10% missing | |
x1.r<-runif(200) | |
x2.r<-runif(200) | |
y.r<-x1.r+x2.r+x1.r*x2.r+rnorm(200, sd=1) | |
#we randomly generate missing values in the dataset | |
miss.x1<-rbinom(200, 1, prob=0.9) | |
miss.x2<-rbinom(200, 1, prob=0.9) | |
miss.y<-rbinom(200, 1, prob=0.9) | |
x1<-ifelse(miss.x1==1, x1.r, NA) | |
x2<-ifelse(miss.x2==1, x2.r, NA) | |
y<-ifelse(miss.y==1, y.r, NA) | |
x1x2<-x1.r*x2.r | |
datmat<-cbind(x1, x2, x1x2, y) | |
# impute values to 5 datasets | |
mice.dat<-mice(datmat, m=5, maxit=20) | |
# fit model to imputed data | |
fit<-with(mice.dat, lm(y~x1+x2+I(x1*x2))) | |
summary(pool(fit)) | |
# compare to model fit to dataset with missings | |
summary(lm(y~x1+x2+I(x1*x2)), datmat) | |
# ... has higher errors | |
##################### | |
# example 4: categorical data | |
# Mice package uses POisson | |
# Create a dataset for explanation | |
set.seed(100) | |
b0 <- 1 | |
b1 <- .75 | |
b2 <- -.25 | |
b3 <- .5 | |
N <- 200 | |
x1 <- rnorm(N) | |
x2 <- rnorm(N) | |
x3 <- rnorm(N) | |
lam <- exp( b0+b1*x1+b2*x2+b3*x3) | |
y <- rpois( N, lam ) | |
POIS <- data.frame( y, x1, x2, x3 ) | |
# We introduce MAR missingness in y using the following function: | |
generatemissingy<- function( data, pos = 1, Z = 2, pmis = .5, strength = c( .5, .5 ) ) { | |
total <- round( pmis * nrow(data) ) | |
sm <- which( data[,Z] < mean( data[,Z] ) ) | |
gr <- which( data[,Z] > mean( data[,Z] ) ) | |
sel.sm <- sample( sm, round( strength[1] * total ) ) | |
sel.gr <- sample( gr, round( strength[2] * total ) ) | |
sel <- c( sel.sm, sel.gr ) | |
data[sel,pos] <- NA | |
return(data) | |
} | |
mdat <- generatemissingy( POIS, pmis = .2, strength = c( .2, .8) ) | |
library(mice) | |
imp <- mice( mdat, seed = 100) | |
res <- with(imp, glm( y ~ x1 + x2 + x3, family = "poisson")) | |
print( pool.res <- summary( pool( res ) ) ) | |
res<-glm( y ~ x1 + x2 + x3, family = "poisson") | |
summary(res) | |
# Error is higher if we don't impute values... | |
## also see "mi" package - allows specifying prior, etc | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################## | |
# RevoScaleR RxTeradata Getting Started Guide Examples | |
############################################################################## | |
############################################################################## | |
# General setup/cleanup: Run this code before proceeding | |
############################################################################## | |
# To follow the examples in this guide, you will need the following | |
# comma-delimited text data files loaded into your Teradata database: | |
# ccFraud.csv as RevoTestDB.ccFraud10 | |
# ccFraudScore.csv as RevoTestDB.ccFraudScore10 | |
# These data files are available from Revolution FTP site. | |
# You can use the 'fastload' Teradata command to load the data sets into your | |
# data base. | |
# Modify and uncomment the following code for your connection to Teradata, | |
# and other information about your Teradata setup: | |
tdConnString <- "DRIVER=Teradata;DBCNAME=10.4.208.180;DATABASE=RevoTestDB;UID=SysDba;PWD=SysDba;" | |
tdShareDir = paste("c:\\users\\", Sys.getenv("USERNAME"), "\\tdShare", sep="") | |
tdRemoteShareDir = "/tmp/revoJobs" | |
tdRevoPath = "/usr/lib64/Revo-7.3/R-3.1.3/lib64/R" | |
# Make sure that the local share dir exists: | |
if (!file.exists(tdShareDir)) dir.create(tdShareDir, recursive = TRUE) | |
# If your DATABASE is not named RevoTestDB, you will need | |
# to modify the SQL queries in the following code to your | |
# DATABASE name | |
########################################################## | |
# Using a Teradata Data Source and Compute Context | |
########################################################## | |
# Creating an RxTeradata Data Source | |
tdQuery <- "SELECT * FROM ccFraud10" | |
teradataDS <- RxTeradata(connectionString = tdConnString, | |
sqlQuery = tdQuery, rowsPerRead = 50000) | |
# NOTE: this query is run every time it is called | |
# Extracting basic information about your data | |
rxGetVarInfo(data = teradataDS) | |
# Specifying column information in your data source | |
stateAbb <- c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", | |
"DE", "FL", "GA", "HI","IA", "ID", "IL", "IN", "KS", "KY", "LA", | |
"MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NB", "NC", "ND", | |
"NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI","SC", | |
"SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY") | |
ccColInfo <- list( | |
gender = list( | |
type = "factor", | |
levels = c("1", "2"), | |
newLevels = c("Male", "Female")), | |
cardholder = list( | |
type = "factor", | |
levels = c("1", "2"), | |
newLevels = c("Principal", "Secondary")), | |
state = list( | |
type = "factor", | |
levels = as.character(1:51), | |
newLevels = stateAbb) | |
) | |
teradataDS <- RxTeradata(connectionString = tdConnString, | |
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 50000) | |
rxGetVarInfo(data = teradataDS) | |
tdWait <- TRUE | |
tdConsoleOutput <- FALSE | |
tdCompute <- RxInTeradata( | |
connectionString = tdConnString, | |
shareDir = tdShareDir, | |
remoteShareDir = tdRemoteShareDir, | |
revoPath = tdRevoPath, | |
wait = tdWait, | |
consoleOutput = tdConsoleOutput) | |
rxGetNodeInfo(tdCompute) | |
tdComputeTrace <- RxInTeradata( | |
connectionString = tdConnString, | |
shareDir = tdShareDir, | |
remoteShareDir = tdRemoteShareDir, | |
revoPath = tdRevoPath, | |
wait = tdWait, | |
consoleOutput = tdConsoleOutput, | |
traceEnabled = TRUE, | |
traceLevel = 7) | |
########################################################## | |
# High-Performance In-Database Analytics in Teradata | |
########################################################## | |
# Compute summary statistics in Teradata | |
# Set the compute context to compute in Teradata | |
rxSetComputeContext(tdCompute) | |
rxSummary(formula = ~gender + balance + numTrans + numIntlTrans + | |
creditLine, data = teradataDS) | |
# Set the compute context to compute locally | |
rxSetComputeContext ("local") | |
# Refining the RxTeradata data source | |
ccColInfo <- list( | |
gender = list( | |
type = "factor", | |
levels = c("1", "2"), | |
newLevels = c("Male", "Female")), | |
cardholder = list( | |
type = "factor", | |
levels = c("1", "2"), | |
newLevels = c("Principal", "Secondary")), | |
state = list( | |
type = "factor", | |
levels = as.character(1:51), | |
newLevels = stateAbb), | |
balance = list(low = 0, high = 41485), | |
numTrans = list(low = 0, high = 100), | |
numIntlTrans = list(low = 0, high = 60), | |
creditLine = list(low = 1, high = 75) | |
) | |
teradataDS <- RxTeradata(connectionString = tdConnString, | |
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 50000) | |
# Visualizing your data using rxHistogram and rxCube | |
rxSetComputeContext(tdCompute) | |
rxHistogram(~creditLine|gender, data = teradataDS, | |
histType = "Percent") | |
cube1 <- rxCube(fraudRisk~F(numTrans):F(numIntlTrans), | |
data = teradataDS) | |
cubePlot <- rxResultsDF(cube1) | |
levelplot(fraudRisk~numTrans*numIntlTrans, data = cubePlot) | |
# Analyzing your data with rxLinMod | |
linModObj <- rxLinMod(balance ~ gender + creditLine, | |
data = teradataDS) | |
summary(linModObj) | |
# Analyzing your data with rxLogit | |
logitObj <- rxLogit(fraudRisk ~ state + gender + cardholder + balance + | |
numTrans + numIntlTrans + creditLine, data = teradataDS, | |
dropFirst = TRUE) | |
summary(logitObj) | |
# Scoring a Data Set | |
tdQuery <- "SELECT * FROM ccFraudScore10" | |
teradataScoreDS <- RxTeradata(connectionString = tdConnString, | |
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 50000) | |
teradataOutDS <- RxTeradata(table = "ccScoreOutput", | |
connectionString = tdConnString, rowsPerRead = 50000 ) | |
rxSetComputeContext(tdCompute) | |
if (rxTeradataTableExists("ccScoreOutput")) | |
rxTeradataDropTable("ccScoreOutput") | |
rxPredict(modelObject = logitObj, | |
data = teradataScoreDS, | |
outData = teradataOutDS, | |
predVarNames = "ccFraudLogitScore", | |
type = "link", | |
writeModelVars = TRUE, | |
overwrite = TRUE) | |
if (rxTeradataTableExists("ccScoreOutput")) | |
rxTeradataDropTable("ccScoreOutput") | |
rxPredict(modelObject = logitObj, | |
data = teradataScoreDS, | |
outData = teradataOutDS, | |
predVarNames = "ccFraudLogitScore", | |
type = "link", | |
writeModelVars = TRUE, | |
extraVarsToWrite = "custID", | |
overwrite = TRUE) | |
tdMinMax <- RxOdbcData(sqlQuery = paste( | |
"SELECT MIN(ccFraudLogitScore),", | |
"MAX(ccFraudLogitScore) FROM ccScoreOutput"), | |
connectionString = tdConnString) | |
minMaxVals <- rxImport(tdMinMax) | |
minMaxVals <- as.vector(unlist(minMaxVals)) | |
teradataScoreDS <- RxTeradata(sqlQuery = | |
"Select ccFraudLogitScore FROM ccScoreOutput", | |
connectionString = tdConnString, rowsPerRead = 50000, | |
colInfo = list(ccFraudLogitScore = list( | |
low = floor(minMaxVals[1]), | |
high = ceiling(minMaxVals[2])))) | |
rxSetComputeContext(tdCompute) | |
rxHistogram(~ccFraudLogitScore, data = teradataScoreDS) | |
########################################################## | |
# Using rxDataStep and rxImport | |
########################################################## | |
teradataScoreDS <- RxTeradata( | |
sqlQuery = "Select * FROM ccScoreOutput", | |
connectionString = tdConnString, rowsPerRead = 50000 ) | |
teradataOutDS2 <- RxTeradata(table = " ccScoreOutput2", | |
connectionString = tdConnString, rowsPerRead = 50000) | |
rxSetComputeContext(tdCompute) | |
if (rxTeradataTableExists("ccScoreOutput2")) | |
rxTeradataDropTable("ccScoreOutput2") | |
rxDataStep(inData = teradataScoreDS, outFile = teradataOutDS2, | |
transforms = list(ccFraudProb = inv.logit(ccFraudLogitScore)), | |
transformPackages = "boot", overwrite = TRUE) | |
rxGetVarInfo(teradataOutDS2) | |
# Using rxImport to Extract a Subsample | |
rxSetComputeContext("local") | |
teradataProbDS <- RxTeradata( | |
sqlQuery = paste( | |
"Select * FROM ccScoreOutput2", | |
"WHERE (ccFraudProb > .99)"), | |
connectionString = tdConnString) | |
highRisk <- rxImport(teradataProbDS) | |
orderedHighRisk <- highRisk[order(-highRisk$ccFraudProb),] | |
row.names(orderedHighRisk) <- NULL # Reset row numbers | |
head(orderedHighRisk) | |
# Using rxDataStep to Create a Teradata Table | |
rxSetComputeContext("local") | |
xdfAirDemo <- RxXdfData(file.path(rxGetOption("sampleDataDir"), | |
"AirlineDemoSmall.xdf")) | |
rxGetVarInfo(xdfAirDemo) | |
teradataAirDemo <- RxTeradata(table = " AirDemoSmallTest", | |
connectionString = tdConnString) | |
if (rxTeradataTableExists("AirDemoSmallTest", | |
connectionString = tdConnString)) | |
rxTeradataDropTable("AirDemoSmallTest", | |
connectionString = tdConnString) | |
rxDataStep(inData = xdfAirDemo, outFile = teradataAirDemo, | |
transforms = list( | |
DayOfWeek = as.integer(DayOfWeek), | |
rowNum = .rxStartRow : (.rxStartRow + .rxNumRows - 1) | |
), overwrite = TRUE | |
) | |
rxSetComputeContext(tdCompute) | |
teradataAirDemo <- RxTeradata(sqlQuery = | |
"SELECT * FROM AirDemoSmallTest", | |
connectionString = tdConnString, | |
rowsPerRead = 50000, | |
colInfo = list(DayOfWeek = list(type = "factor", | |
levels = as.character(1:7)))) | |
rxSummary(~., data = teradataAirDemo) | |
# Performing Your Own Chunking Analysis | |
ProcessChunk <- function( dataList) | |
{ | |
# Convert the input list to a data frame and | |
# call the 'table' function to compute the | |
# contingency table | |
chunkTable <- table(as.data.frame(dataList)) | |
# Convert table output to data frame with single row | |
varNames <- names(chunkTable) | |
varValues <- as.vector(chunkTable) | |
dim(varValues) <- c(1, length(varNames)) | |
chunkDF <- as.data.frame(varValues) | |
names(chunkDF) <- varNames | |
# Return the data frame, which has a single row | |
return( chunkDF ) | |
} | |
rxSetComputeContext( tdCompute ) | |
tdQuery <- | |
"select DayOfWeek from AirDemoSmallTest" | |
inDataSource <- RxTeradata(sqlQuery = tdQuery, | |
rowsPerRead = 50000, | |
colInfo = list(DayOfWeek = list(type = "factor", | |
levels = as.character(1:7)))) | |
iroDataSource = RxTeradata(table = "iroResults", | |
connectionString = tdConnString) | |
if (rxTeradataTableExists(table = "iroResults", | |
connectionString = tdConnString)) | |
{ | |
rxTeradataDropTable( table = "iroResults", | |
connectionString = tdConnString) | |
} | |
rxDataStep( inData = inDataSource, outFile = iroDataSource, | |
transformFunc = ProcessChunk, | |
overwrite = TRUE) | |
iroResults <- rxImport(iroDataSource) | |
finalResults <- colSums(iroResults) | |
finalResults | |
rxTeradataDropTable( table = " iroResults", | |
connectionString = tdConnString) | |
########################################################## | |
# Using rxDataStep for By-Group Analyses | |
########################################################## | |
# Creating a simulated data set | |
set.seed(10) | |
numObs <- 100000 | |
testData <- data.frame( | |
SKU = as.integer(runif(n = numObs, min = 1, max = 11)), | |
INCOME = as.integer(runif(n = numObs, min = 10, max = 21))) | |
# SKU is also underlying coefficient | |
testData$SALES <- as.integer(testData$INCOME * testData$SKU + | |
25*rnorm(n = numObs)) | |
# Uploading into a Teradata table | |
rxSetComputeContext("local") | |
if (rxTeradataTableExists("sku_sales_100k", | |
connectionString = tdConnString)) | |
{ | |
rxTeradataDropTable("sku_sales_100k", | |
connectionString = tdConnString) | |
} | |
teradataDS <- RxTeradata(table = "sku_sales_100k", | |
connectionString = tdConnString) | |
rxDataStep(inData = testData, outFile = teradataDS) | |
# The By-Group Transformation Function | |
EstimateModel <- function(dataList) | |
{ | |
# Convert the input list to a data frame | |
chunkDF <- as.data.frame(dataList) | |
# Estimate a linear model on this chunk | |
# Don't include the model frame in the model object | |
model <- lm( SALES~INCOME, chunkDF, model=FALSE ) | |
# Print statements can be useful for debugging | |
# print( model$coefficients ) | |
# Remove unneeded parts of model object | |
model$residuals <- NULL | |
model$effects <- NULL | |
model$fitted.values <- NULL | |
model$assign <- NULL | |
model$qr[[1]] <- NULL | |
attr(model$terms, ".Environment") <- NULL | |
# Convert the model to a character string | |
modelString <- | |
rawToChar(serialize(model, connect=NULL, ascii=TRUE)) | |
# Pad to a fixed length | |
modelString <- paste( modelString, | |
paste(rep("X", 2000 - nchar(modelString)), collapse = ""), | |
sep="") | |
# Create the entry for the results table | |
resultDF <- data.frame( | |
SKU = chunkDF[1,]$SKU, # Assumes SKU's all the same | |
COEF = model$coefficients[2], # Slope | |
MODEL = modelString ) | |
return( resultDF ) | |
} | |
# Specifying the input by-group data source | |
tdQuery <- "SELECT * FROM sku_sales_100k" | |
partitionKeyword <- "PARTITION-WITH-VIEW" | |
partitionClause <- paste( partitionKeyword, "BY sku_sales_100k.SKU" ) | |
inDataSource <- RxTeradata(sqlQuery = tdQuery, | |
tableOpClause = partitionClause, | |
rowsPerRead = 100000) | |
# Setting up the results data source | |
resultsDataSource = RxTeradata(table = "models", | |
connectionString = tdConnString ) | |
rxOptions(transformPackages = c("stats")) | |
# Make sure compute context is in-database | |
rxSetComputeContext(tdCompute) | |
# Run the by-group data step | |
rxDataStep( inData = inDataSource, outFile = resultsDataSource, | |
transformFunc = EstimateModel, reportProgress = 0, | |
overwrite = TRUE ) | |
# Showing the estimated slopes | |
lmResults <- rxImport(resultsDataSource) | |
lmResults[order(lmResults$SKU),c("SKU", "COEF")] | |
# The Scoring transformation function | |
ScoreChunk <- function(dataList) | |
{ | |
chunkDF <- as.data.frame(dataList) | |
# Extract the model string for the first observation | |
# and convert back to model object | |
# All observations in this chunk have the same SKU, | |
# so they all share the same model | |
modelString <- as.character(chunkDF[1,]$MODEL) | |
model <- unserialize( charToRaw( modelString ) ) | |
resultDF <- data.frame( | |
SKU = chunkDF$SKU, | |
INCOME = chunkDF$INCOME, | |
# Use lm's predict method | |
PREDICTED_SALES = predict(model, chunkDF) ) | |
return( resultDF ) | |
} | |
# Setting up the input data source for predictions | |
predictQuery <- paste("SELECT sku_sales_100k.SKU,", | |
"sku_sales_100k.INCOME, models.MODEL", | |
"FROM sku_sales_100k JOIN models ON", | |
"sku_sales_100k.SKU = models.SKU") | |
inDataSource <- RxTeradata(sqlQuery = predictQuery, | |
connectionString = tdConnString, | |
tableOpClause = partitionClause, rowsPerRead = 10000) | |
# Setting up the output data set for predictions | |
scoresDataSource = RxTeradata(table = "scores", | |
connectionString = tdConnString) | |
# Perform scoring in-database | |
rxSetComputeContext(tdCompute) | |
rxDataStep( inData = inDataSource, outFile = scoresDataSource, | |
transformFunc = ScoreChunk, reportProgress = 0, | |
overwrite = TRUE ) | |
# Extracting summary results | |
predSum <- rxImport( RxTeradata( | |
sqlQuery = paste("SELECT SKU, INCOME, MIN(PREDICTED_SALES),", | |
"MAX(PREDICTED_SALES), COUNT(PREDICTED_SALES)", | |
"FROM scores GROUP BY SKU, INCOME", | |
"ORDER BY SKU, INCOME"), | |
connectionString = tdConnString ) ) | |
options(width = 120) # Set display width | |
predSum[1:15,] | |
# Clean-up | |
rxTeradataDropTable("models") | |
rxTeradataDropTable("scores") | |
rxSetComputeContext("local") | |
########################################################## | |
# Performing Simulations In-Database | |
########################################################## | |
playCraps <- function() | |
{ | |
result <- NULL | |
point <- NULL | |
count <- 1 | |
while (is.null(result)) | |
{ | |
roll <- sum(sample(6, 2, replace=TRUE)) | |
if (is.null(point)) | |
{ | |
point <- roll | |
} | |
if (count == 1 && (roll == 7 || roll == 11)) | |
{ | |
result <- "Win" | |
} | |
else if (count == 1 && | |
(roll == 2 || roll == 3 || roll == 12)) | |
{ | |
result <- "Loss" | |
} | |
else if (count > 1 && roll == 7 ) | |
{ | |
result <- "Loss" | |
} | |
else if (count > 1 && point == roll) | |
{ | |
result <- "Win" | |
} | |
else | |
{ | |
count <- count + 1 | |
} | |
} | |
result | |
} | |
# Calling rxExec | |
teradataExec <- rxExec(playCraps, timesToRun=1000, RNGseed="auto") | |
length(teradataExec) | |
table(unlist(teradataExec)) | |
########################################################## | |
# Analyzing Your Data Alongside Teradata | |
########################################################## | |
# Set the compute context to compute locally | |
rxSetComputeContext ("local") | |
tdQuery <- "SELECT * FROM ccFraudScore10" | |
# Perform an rxSummary using a local compute context | |
teradataDS1 <- RxTeradata(connectionString = tdConnString, | |
sqlQuery = tdQuery, colInfo = ccColInfo, rowsPerRead = 500000) | |
rxSummary(formula = ~gender + balance + numTrans + numIntlTrans + | |
creditLine, data = teradataDS1) | |
statesToKeep <- sapply(c("CA", "OR", "WA"), grep, stateAbb) | |
statesToKeep | |
# Fast import of iata from a Teradata Database | |
importQuery <- paste("SELECT gender,cardholder,balance,state", | |
"FROM ccFraud10", | |
"WHERE (state = 5 OR state = 38 OR state = 48)") | |
importColInfo <- list( | |
gender = list( | |
type = "factor", | |
levels = c("1", "2"), | |
newLevels = c("Male", "Female")), | |
cardholder = list( | |
type = "factor", | |
levels = c("1", "2"), | |
newLevels = c("Principal", "Secondary")), | |
state = list( | |
type = "factor", | |
levels = as.character(statesToKeep), | |
newLevels = names(statesToKeep)) | |
) | |
rxSetComputeContext("local") | |
teradataImportDS <- RxTeradata(connectionString = tdConnString, | |
sqlQuery = importQuery, colInfo = importColInfo) | |
localDS <- rxImport(inData = teradataImportDS, | |
outFile = "ccFraudSub.xdf", | |
overwrite = TRUE) | |
# Using the imported data | |
rxGetVarInfo(data = localDS) | |
rxSummary(~gender + cardholder + balance + state, data = localDS) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment