Skip to content

Instantly share code, notes, and snippets.

Created August 27, 2016 12:11
Show Gist options
  • Save anonymous/14cb87d5f8dd3bad8170f04e07b36a94 to your computer and use it in GitHub Desktop.
Save anonymous/14cb87d5f8dd3bad8170f04e07b36a94 to your computer and use it in GitHub Desktop.
R Code: Analytics Boot Camp
# R is a scripting language
# Arithmetic Operators
5+5
# 10
# Arithmetic Operators
# Addition: +
# Subtraction: -
# Multiplication: *
# Division: /
# Exponentiation: ^
# Modulo: %%
# Assign the value 5 to the variable called 'my_apples'
my_apples <- 5
# Print out the value of the variable 'my_apples'
my_apples
# Adding text to text, or text to a number, gives an error (`+` is not defined for character strings)
"Sdfsdf"+"sdfs" # gives error
"sdfsdf" + 5 # gives error
# Decimals values like 4.5 are called numerics.
# Whole numbers written with an L suffix (4L) are called integers; a bare 4 is stored as a numeric (double). Integers are also numerics.
# Boolean values (TRUE or FALSE) are called logical (TRUE can be abbreviated to T and FALSE to F).
# Text (or string) values are called characters.
my_character <- "forty-two" #This value is string
remove(my_var) # Removes variable
class(some_variable_name) # gives type of variable, Note- return type is string
# Vectors
numeric_v <- c(1, 2, 3) #In R, you create a vector with the combine function c()
# You can give a name to the elements of a vector with the names() function
some_vector <- c("Johnny", "Poker Player")
names(some_vector) <- c("Name", "Profession")
# if you sum two vectors in R, it takes the element-wise sum.
c(1, 2, 3) + c(4, 5, 6)
c(1 + 4, 2 + 5, 3 + 6)
c(5, 7, 9)
poker_vector <- c(1,2,-3,4,5)
names(poker_vector) <- c("Sunday", "Monday", "Tuesday", "Wen", "Thu")
# It calculates the sum of all elements of a vector
total_poker <- sum(poker_vector)
# if you add a vector and number , it will add it to all elements
# for example, to select the first element of the vector, you type poker_vector[1]
poker_midweek <- poker_vector[c(2,3,4)] # gets data of all the indices in the vector inside
#also this might work
poker_midweek <- poker_vector[2:4]
poker_vector["Monday"]
poker_vector[c("Monday","Tuesday")]
# To get the mean, use the mean() function
# < for less than
# > for greater than
# >= for greater than or equal to
# == for equal to each other
# != not equal to each other
# for comparing with all elements you can use directly these operators
# What days of the week did you make money on poker?
selection_vector <- poker_vector > 0
# Select from poker_vector these days
poker_winning_days <- poker_vector[selection_vector]
# Matrix
# You can construct a matrix in R with the matrix() function.
mat = matrix(1:9, byrow = TRUE, nrow = 3)
mat
# The argument byrow indicates that the matrix is filled by the rows. If we want the vector to be filled by the columns, we just place byrow = FALSE
matrix(10,nrow=3,ncol=5) # Creates matrix with all elements 10
# Similar to vectors, you can add names for the rows and the columns of a matrix
rownames(my_matrix) <- row_names_vector
colnames(my_matrix) <- col_names_vector
# nrow() returns the number of rows
nrow(mat)
# ncol() returns the number of columns.
# Pass the matrix object `mat`: `matrix` is the constructor function, and
# ncol(matrix) just returns NULL because a function has no dimensions.
ncol(mat)
# In R, the function rowSums() conveniently calculates the totals for each row of a matrix
sum_of_rows_vector <- rowSums(my_matrix)
# Similarly we also have colSums()
# use cbind(mat,vec/mat) to add more columns in a matrix
# similarly you can use rbind()
# my_matrix[1,2] selects from the first row the second element.
# my_matrix[1:3,2:4] selects rows 1,2,3 and columns 2,3,4.
# my_matrix[,1] selects all elements of the first column.
# my_matrix[1,] selects all elements of the first row.
# the standard operators like +, -, /, *, etc. work in an element-wise way on matrices in R.
# note applying * on matrices does element wise. To do standard matrix multiplication use %*% For more use this http://www.statmethods.net/advstats/matrix.html
# Factors
gender_vector <- c("Male", "Female", "Female", "Male", "Male")
# Define factor_gender_vector using 'factor()'
factor_gender_vector <- factor(gender_vector)
# There are 2 types of Factor Variables
animals_vector <- c("Elephant", "Giraffe", "Donkey", "Horse")
temperature_vector <- c("High", "Low", "High","Low", "Medium")
factor_animals_vector <- factor(animals_vector) # Unordered stored alphabetically
factor_animals_vector
factor_temperature_vector <- factor(temperature_vector, order = TRUE, levels = c("Low", "Medium", "High"))
factor_temperature_vector
# levels(factor_vector) <- c("name1","name2",...) # Renames the factor levels; give the new names in the same order as levels(factor_vector)
# This will give you a quick overview of some_variable:
summary(some_variable)
# You cannot compare 2 nominal factors eg Male and Female
# But you can compare 2 ordinal factors example High and Low
# Remember matrix and vectors, all elements are of same data type
# Dataset
# Remember: in matrices and vectors all elements share one data type; a data frame can hold a different type in each column
head(carset) # first observations of a data frame (or any R object you pass to it).
tail(carset) # last observations of a data frame (or any R object you pass to it).
# The second argument, n, is the number of observations to show (10 is just an example value).
head(carset, n = 10) # first n observations of a data frame (or any R object you pass to it).
str(mtcars) # The function str() shows you the structure of your data set.
data.frame(vectors) # Creates a data frame from the vectors passed as arguments (one column per vector)
# remember rbind and cbind works here but using rbind will also merge the factor levels.
# In general, remember these three lines.
dataset[a,b] # For row and column
dataset[a,] # For row/s
dataset[,b] # For column/s
# All data from the rows
furthest_planets_df <- planets_df[6:8,]
# Instead of using numerics to select elements of a data frame, you can also use the variable names to select columns of a data frame
furthest_planets_diameter <- planets_df[3:8,"diameter"]
data_frame_name$variable_name # access to a specific column of a given dataset ($ selects columns, not rows)
planets_with_rings_df <- planets_df[rings_vector,] # selects the rows where rings_vector is TRUE. Remember the trailing ',' is important: it means "all columns".
# For cutting data use subsets instead
subset(my_data_frame, subset = some_condition)
# order() is a function that returns the indices that put a variable in sorted order (the first index points at the smallest element)
a <- c(100,9,101)
order(a) # returns the vector {2, 1, 3}
# What is the correct ordering based on the planets_df$diameter variable?
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
positions <- order(planets_df$diameter, decreasing = TRUE)
# Create new "ordered" data frame (largest diameter first):
largest_first_df <- planets_df[positions, ]
dim(obj) # returns dimension of object i.e number of rows and column
#Conditionals
# Comparison of character strings
"user"=="useR" # False
# True compares to 1 and false compares to 0
# Compare a logical with a numeric
TRUE==1 # True
# Comparison of character strings
"raining" <= "raining dogs" # True because alphabetic order
"hello">"Goodbye" # True
# Comparison of logicals
TRUE>FALSE # True
# Sum of a boolean array gives number of trues
# AND and OR: in R the AND operator is & and the OR operator is |
# Is last between 0 and 5 or between 10 and 15?
(last > 0 & last < 5) | (last > 10 & last < 15)
# ! is NOT operator
# Syntax of if / else if / else (condition, expr, ... are placeholders).
# `else` must start on the same line as the preceding closing `}`: at top
# level R treats the `if` as complete at the end of that line, so a following
# line that begins with `else` is a syntax error.
# The final `else` takes no condition.
if (condition) {
  expr
} else if (condition2) {
  expr2
} else {
  expr3
}
if (num_views > 15) {
  print("You're popular!") # Notice using print statement
}
# For getting detail of any syntax use ?<keyword> Example
?print
# Loops
# While loop
while(condition) {
expr
}
# You can use break statement similar way
if (speed>80) break # remember inside the loop
# In R instead of using continue you can use next
# For loop
primes <- c(2, 3, 5, 7, 11, 13)
# loop version 1
for(p in primes) {
print(p)
}
# Splitting a string into its individual characters
chars <- strsplit(rquote, split = "")[[1]]
number <- sample(1:6, size = 1) # returns a sample number between 1 to 6
# Functions in R
# A quick hack to simply see the arguments of the sample() function is the args() function
args(sample)
# mean function with different arguments
mean(x, trim = 0, na.rm = FALSE, ...)
# na.rm = TRUE drops missing values (NA and NaN) before computing; infinite values (Inf / -Inf) are not removed
speed <- 31
print(paste("Your speed is",speed)) # String concatenation with paste()
#Writing your own functions
# Template for defining a function. `body`, `something` and `value` are
# placeholders to be replaced with real code.
my_fun <- function(arg1, arg2) {
body
# This only creates a local variable named my_fun inside the function; it does
# not set what the function returns.
# NOTE(review): presumably shown as a contrast with languages where assigning
# to the function name sets the result - confirm intent.
my_fun=something
return(value) # return() hands `value` back to the caller; prefer this over assigning to the function name
}
# Scoping implies that variables that are defined inside a function, are not accessible outside that function
# R passes by value, so the R objects you pass to a function can never change unless you do an explicit assignment
# Inside R functions, if return() is not called, the value of the last evaluated expression is returned
# Syntax
# Plotting in R
# Inspect the structure of the built-in mtcars data frame
str(mtcars)
# Scatterplot matrix of every pair of columns. The data frame has to be passed
# as the first argument: plot(data = mtcars) fails because `x` is missing.
plot(mtcars)
# Scatterplot with mpg on the x axis and wt on the y axis
plot(mtcars$mpg, mtcars$wt)
# k-NN
# -----------------------------------------------------------------------
# k-nearest-neighbour classification of the iris3 data with class::knn()
library(class)
data(iris3)
# 1. using a separate test set
# iris3 is a 50 x 4 x 3 array (cases x measurements x species): the first 25
# cases of each species form the training set, the remaining 25 the test set.
train <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3])
test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3])
# Species labels in the row order of `train`. `test` is stacked in the same
# order, so the same vector also serves as the true labels of the test set.
cl <- factor(c(rep("s", 25), rep("c", 25), rep("v", 25)))
myknn <- knn(train, test, cl, k = 3, prob = TRUE)
# Show the attributes of the prediction (levels, class and the "prob" vote
# proportions). The result is referenced by name because .Last.value is only
# maintained for interactive top-level evaluation, not when the file is sourced.
attributes(myknn)
# Confusion matrix: predicted class (rows) against true class (columns)
tab <- table(myknn, cl)
# Overall accuracy: correctly classified cases (the diagonal) over all cases
sum(diag(tab)) / sum(tab)
# Naive-Bayes
# ---------------------------------------------------
library(e1071)
## Naive Bayes classifier for discrete predictors, fitted to the 1984
## Congressional Voting Records (HouseVotes84 from package mlbench).
# Refusals to vote are treated as missing values.
data("HouseVotes84", package = "mlbench")
# Class is the response; every other column is a predictor.
model <- naiveBayes(Class ~ ., data = HouseVotes84)
# Predicted class for the first 20 records (column 1, Class, is dropped)
predict(model, head(HouseVotes84[, -1], 20))
# Same records, but showing the posterior probabilities instead
predict(model, head(HouseVotes84[, -1], 20), type = "raw")
# Predict every record the model was trained on: this gives the
# resubstitution error
pred <- predict(model, newdata = HouseVotes84[, -1])
# Confusion matrix (predicted vs. actual class) and overall accuracy
tab <- table(pred, HouseVotes84$Class)
tab
sum(diag(tab)) / sum(tab)
# Source: http://www.cs.upc.edu/~belanche/Docencia/mineria/English-september-2008/Practical-work/Labo-NBayes-kNN.R
# http://www.cs.upc.edu/~belanche/Docencia/mineria/English-september-2008/Practical-work/Labo-SVMs.R
# SVMs
# ----------------------------------------------------
# In a second example, we use the glass data (available in package mlbench) for classification
# The task is to predict the type of a glass on basis of its chemical analysis
library(e1071)
library(rpart)
library(mlbench)
data(Glass)
## split data into a training (2/3) and test set (1/3)
# seq_len() stays correct for an empty data frame, where 1:nrow() would give c(1, 0)
index <- seq_len(nrow(Glass))
# No seed is set, so the split (and every result below) differs between runs.
testindex <- sample(index, trunc(length(index) / 3))
testset <- Glass[testindex, ]
trainset <- Glass[-testindex, ]
# fit the model
svm.model <- svm(Type ~ ., data = trainset, cost = 100, gamma = 1)
# make the prediction; the dependent variable, Type, is selected by name
# rather than by a hard-coded column number
svm.pred <- predict(svm.model, testset[, names(testset) != "Type"])
table(pred = svm.pred, true = testset$Type)
# The function svm() returns an object of class "svm", which partly includes the following components:
# SV: matrix of support vectors found;
# labels: their labels in classification mode;
# index: index of the support vectors in the input data (could be used e.g., for visualization)
# Other important parameters:
# class.weights: allows to introduce class weighing, useful for very asymmetric classes
# cross: (default 0) for k-fold CV
# A nice tool in package e1071 is the possibility of tuning the parameters by 10-CV grid search:
mytunedsvm <- tune.svm(Type ~ ., data = trainset, gamma = 2^(-1:1), cost = 2^(2:4))
summary(mytunedsvm)
plot(mytunedsvm, transform.x = log10, xlab = expression(log[10](gamma)), ylab = "C")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment