-
-
Save somnathrakshit/a1f0385cfaf299639d12816e22637c9b to your computer and use it in GitHub Desktop.
R Code: Analytics Boot Camp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# R is a scripting language

# Arithmetic Operators
5 + 5
# 10

# Arithmetic Operators
# Addition: +
# Subtraction: -
# Multiplication: *
# Division: /
# Exponentiation: ^
# Modulo: %%

# Assign the value 5 to the variable called 'my_apples'
my_apples <- 5
# Print out the value of the variable 'my_apples'
my_apples

# `+` only works on numbers: adding text to text, or text to a number, is an
# error. try() shows the error message but lets the rest of the script run.
try("Sdfsdf" + "sdfs") # gives error
try("sdfsdf" + 5) # gives error

# Decimal values like 4.5 are called numerics.
# Whole numbers are called integers. Integers are also numerics. Note that a
# literal 4 is stored as a double; write 4L to get a true integer.
# Boolean values (TRUE or FALSE) are called logical. T and F exist as
# abbreviations, but they are ordinary variables that can be reassigned, so
# prefer TRUE and FALSE.
# Text (or string) values are called characters.
my_character <- "forty-two" # This value is a string

my_var <- 42
remove(my_var) # Removes the variable 'my_var' from the workspace

# class(x) gives the type of a variable x. Note: the return value is itself
# a string, here "character".
class(my_character)
# Vectors
# In R, you create a vector with the combine function c()
numeric_v <- c(1, 2, 3)

# You can give a name to the elements of a vector with the names() function
some_vector <- c("Johnny", "Poker Player")
names(some_vector) <- c("Name", "Profession")

# If you sum two vectors in R, it takes the element-wise sum, so the
# following three expressions all give the same result.
c(1, 2, 3) + c(4, 5, 6)
c(1 + 4, 2 + 5, 3 + 6)
c(5, 7, 9)

poker_vector <- c(1, 2, -3, 4, 5)
names(poker_vector) <- c("Sunday", "Monday", "Tuesday", "Wen", "Thu")

# sum() calculates the sum of all elements of a vector
total_poker <- sum(poker_vector)

# If you add a vector and a number, the number is added to every element.

# Select elements with square brackets; for example, to select the first
# element of the vector, you type poker_vector[1]
poker_midweek <- poker_vector[c(2, 3, 4)] # elements at all the given indices
# The same selection written with a range
poker_midweek <- poker_vector[2:4]
# Elements can also be selected by name
poker_vector["Monday"]
poker_vector[c("Monday", "Tuesday")]

# For getting the mean use the mean() function

# Comparison operators:
# <  for less than
# >  for greater than
# <= for less than or equal to
# >= for greater than or equal to
# == for equal to each other
# != for not equal to each other
# These operators work element-wise, so you can compare a whole vector with
# a value directly.

# What days of the week did you make money on poker?
selection_vector <- poker_vector > 0
# Select from poker_vector these days
poker_winning_days <- poker_vector[selection_vector]
# Matrix
# You can construct a matrix in R with the matrix() function.
mat <- matrix(1:9, byrow = TRUE, nrow = 3)
mat
# The argument byrow indicates that the matrix is filled by the rows. If we
# want the matrix to be filled by the columns, we just place byrow = FALSE
matrix(10, nrow = 3, ncol = 5) # Creates a 3 x 5 matrix with all elements 10

# Similar to vectors, you can add names for the rows and the columns of a
# matrix
my_matrix <- mat
row_names_vector <- c("row1", "row2", "row3")
col_names_vector <- c("col1", "col2", "col3")
rownames(my_matrix) <- row_names_vector
colnames(my_matrix) <- col_names_vector

# nrow returns number of rows
nrow(mat)
# ncol - number of columns
ncol(mat)

# In R, the function rowSums() conveniently calculates the totals for each
# row of a matrix
sum_of_rows_vector <- rowSums(my_matrix)
# Similarly we also have colSums()

# use cbind(mat, vec/mat) to add more columns to a matrix
# similarly you can use rbind() to add more rows

# my_matrix[1,2] selects from the first row the second element.
# my_matrix[1:3,2:4] selects rows 1,2,3 and columns 2,3,4.
# my_matrix[,1] selects all elements of the first column.
# my_matrix[1,] selects all elements of the first row.

# The standard operators like +, -, /, *, etc. work in an element-wise way
# on matrices in R.
# Note: applying * to matrices multiplies element-wise. To do standard matrix
# multiplication use %*%.
# For more see http://www.statmethods.net/advstats/matrix.html
# Factors
gender_vector <- c("Male", "Female", "Female", "Male", "Male")
# Define factor_gender_vector using 'factor()'
factor_gender_vector <- factor(gender_vector)

# There are 2 types of factor variables: nominal (unordered categories) and
# ordinal (categories with a natural order)
animals_vector <- c("Elephant", "Giraffe", "Donkey", "Horse")
temperature_vector <- c("High", "Low", "High", "Low", "Medium")

# Nominal: unordered, levels are stored alphabetically
factor_animals_vector <- factor(animals_vector)
factor_animals_vector

# Ordinal: pass ordered = TRUE and list the levels from lowest to highest
factor_temperature_vector <- factor(
  temperature_vector,
  ordered = TRUE,
  levels = c("Low", "Medium", "High")
)
factor_temperature_vector

# levels(factor_vector) <- c("name1", "name2", ...)
# You can change the names of the factor levels according to your convenience

# summary() gives you a quick overview of a variable:
summary(factor_gender_vector)

# You cannot compare 2 nominal factor values, e.g. Male and Female
# But you can compare 2 ordinal factor values, for example High and Low
# Remember: in a matrix or a vector, all elements are of the same data type
# Dataset
# Remember: in a matrix or a vector, all elements are of the same data type
# NOTE: carset, vectors, dataset, a, b, planets_df, rings_vector,
# data_frame_name, my_data_frame and obj below are placeholders for your own
# objects; the lines show the syntax only.
head(carset) # first observations of a data frame (or any R object you pass to it).
tail(carset) # last observations of a data frame (or any R object you pass to it).
head(carset, n = number_of_elements) # first n observations of a data frame (or any R object you pass to it).
str(mtcars) # The function str() shows you the structure of your data set.
data.frame(vectors) # Creates a data set given vectors as arguments
# Remember rbind and cbind work here too, but using rbind will also merge the
# factor levels.

# In general, remember these three lines.
dataset[a, b] # For row and column
dataset[a, ] # For row/s
dataset[, b] # For column/s

# All data from the rows
furthest_planets_df <- planets_df[6:8, ]
# Instead of using numerics to select elements of a data frame, you can also
# use the variable names to select columns of a data frame
furthest_planets_diameter <- planets_df[3:8, "diameter"]
data_frame_name$variable_name # access to a specific column of a given dataset
# Select all rows for which rings_vector is TRUE. Remember the ',' is
# important: it marks a row selection.
planets_with_rings_df <- planets_df[rings_vector, ]

# For cutting data use subset() instead
subset(my_data_frame, subset = some_condition)

# order() returns the positions of the elements in sorted order: the index
# of the smallest element first, then the next smallest, and so on
a <- c(100, 9, 101)
order(a) # returns the vector {2, 1, 3}

# What is the correct ordering based on the planets_df$diameter variable?
positions <- order(planets_df$diameter, decreasing = TRUE)
# Create new "ordered" data frame:
largest_first_df <- planets_df[positions, ]

dim(obj) # returns the dimensions of an object, i.e. number of rows and columns
# Conditionals
# NOTE: last, condition, condition2, expr, expr2, expr3 and num_views below
# are placeholders for your own objects and code.

# Comparison of character strings
"user" == "useR" # FALSE
# TRUE compares to 1 and FALSE compares to 0
# Compare a logical with a numeric
TRUE == 1 # TRUE
# Comparison of character strings
"raining" <= "raining dogs" # TRUE because of alphabetic order
"hello" > "Goodbye" # TRUE
# Comparison of logicals
TRUE > FALSE # TRUE
# The sum of a logical vector gives the number of TRUEs

# AND and OR
# In R the AND operator is & and the OR operator is |
# Is last between 0 and 5 or between 10 and 15?
(last > 0 & last < 5) | (last > 10 & last < 15)
# ! is the NOT operator

# Remember this is the syntax: keep `else` on the same line as the closing
# brace, and note that a plain `else` takes no condition.
if (condition) {
  expr
} else if (condition2) {
  expr2
} else {
  expr3
}

if (num_views > 15) {
  print("You're popular!") # Notice using print statement
}
# For getting details of any syntax use ?<keyword>. Example: ?mean

# Loops
# NOTE: condition, expr, speed and rquote below are placeholders for your
# own objects and code.

# While loop
while (condition) {
  expr
}

# You can use the break statement in a similar way to other languages
if (speed > 80) break # remember: only valid inside a loop
# In R instead of using continue you can use next

# For loop
primes <- c(2, 3, 5, 7, 11, 13)
# loop version 1
for (p in primes) {
  print(p)
}

# Splitting a string into its individual characters
chars <- strsplit(rquote, split = "")[[1]]

# Random sampling
number <- sample(1:6, size = 1) # returns one random number between 1 and 6
# Functions in R
# A quick hack to simply see the arguments of the sample() function is the
# args() function
args(sample)

# Signature of the mean function, showing its arguments and their defaults:
#   mean(x, trim = 0, na.rm = FALSE, ...)
# na.rm = TRUE removes missing values (NA and NaN) before computing the
# mean. Infinite values are not removed.

speed <- 31
print(paste("Your speed is", speed)) # String concatenation

# Writing your own functions
# Template: `body` and `value` are placeholders for your own code.
my_fun <- function(arg1, arg2) {
  body
  # Unlike in some other languages, assigning to the function's own name
  # (my_fun = something) does NOT return a value in R; it only creates a
  # local variable. Hand the result back with return().
  return(value)
}

# Scoping implies that variables that are defined inside a function are not
# accessible outside that function
# R passes by value, so the R objects you pass to a function can never change
# unless you do an explicit assignment
# If return() is not called, an R function returns the value of its last
# evaluated expression
# Syntax
# Plotting in R
str(mtcars)
# plot() takes the object to draw as its first argument; passing only
# data = ... fails because x is missing. Plotting a whole data frame draws a
# scatterplot of every pair of columns.
plot(mtcars)
# Scatterplot of two columns: first argument on the x axis, second on the y
plot(mtcars$mpg, mtcars$wt)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# k-NN
# -----------------------------------------------------------------------
library(class)
data(iris3)

# 1. using a separate test set
# Rows 1-25 of each of the three slices of iris3 form the training set,
# rows 26-50 form the test set.
train <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3])
test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3])
# Class labels of the training rows. The test rows are stacked in the same
# order (25 per class), so cl is also used below as the true test labels.
cl <- factor(c(rep("s", 25), rep("c", 25), rep("v", 25)))

myknn <- knn(train, test, cl, k = 3, prob = TRUE)
# Inspect the result (levels, class, and the "prob" vote proportions).
# Uses myknn rather than .Last.value, which is only reliable at the
# interactive prompt.
attributes(myknn)

# Confusion matrix and overall accuracy (correct predictions are on the
# diagonal)
tab <- table(myknn, cl)
sum(diag(tab)) / sum(tab)
# Naive-Bayes
# ---------------------------------------------------
library(e1071)

## Naive Bayes Classifier for Discrete Predictors: we use again the Congressional Voting Records of 1984
# Note refusals to vote have been treated as missing values!
data(HouseVotes84, package = "mlbench")

# Class is the response; all other columns are predictors
model <- naiveBayes(Class ~ ., data = HouseVotes84)

# predict the outcome of the first 20 records (column 1 is dropped before
# predicting)
predict(model, HouseVotes84[1:20, -1])
# same but displaying posteriors
predict(model, HouseVotes84[1:20, -1], type = "raw")

# now all of them: this is the resubstitution error
pred <- predict(model, HouseVotes84[, -1])

# form and display confusion matrix & overall accuracy
tab <- table(pred, HouseVotes84$Class)
tab
sum(diag(tab)) / sum(tab)

# Source: http://www.cs.upc.edu/~belanche/Docencia/mineria/English-september-2008/Practical-work/Labo-NBayes-kNN.R
#         http://www.cs.upc.edu/~belanche/Docencia/mineria/English-september-2008/Practical-work/Labo-SVMs.R
# SVMs
# ----------------------------------------------------
# In a second example, we use the glass data (available in package mlbench) for classification
# The task is to predict the type of a glass on basis of its chemical analysis
library(e1071)
library(rpart)
library(mlbench)
data(Glass)

## split data into a training (2/3) and test set (1/3)
# The split is random, so results differ between runs unless a seed is set.
index <- seq_len(nrow(Glass))
testindex <- sample(index, trunc(length(index) / 3))
testset <- Glass[testindex, ]
trainset <- Glass[-testindex, ]

# fit the model
svm.model <- svm(Type ~ ., data = trainset, cost = 100, gamma = 1)

# make the prediction (the dependent variable, Type, has column number 10)
svm.pred <- predict(svm.model, testset[, -10])
table(pred = svm.pred, true = testset[, 10])

# The function svm() returns an object of class "svm", which partly includes the following components:
# SV: matrix of support vectors found;
# labels: their labels in classification mode;
# index: index of the support vectors in the input data (could be used e.g., for visualization)
# Other important parameters:
# class.weights: allows to introduce class weighing, useful for very asymmetric classes
# cross: (default 0) for k-fold CV

# A nice tool in package e1071 is the possibility of tuning the parameters by 10-CV grid search:
mytunedsvm <- tune.svm(Type ~ ., data = trainset, gamma = 2^(-1:1), cost = 2^(2:4))
summary(mytunedsvm)
plot(mytunedsvm, transform.x = log10, xlab = expression(log[10](gamma)), ylab = "C")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment