somnathrakshit/basics.R

## basics.R

# R is a scripting language

# Arithmetic Operators
5+5
# 10

# Arithmetic Operators
# Addition: +
# Subtraction: -
# Multiplication: *
# Division: /
# Exponentiation: ^
# Modulo: %%

# Assign the value 5 to the variable called 'my_apples'
my_apples <- 5

# Print out the value of the variable 'my_apples'
my_apples

# adding txt and number will give you error
"Sdfsdf"+"sdfs" # gives error
"sdfsdf" + 5 # gives error

# Decimal values like 4.5 are called numerics.
# Whole numbers written with an L suffix, like 4L, are called integers (a plain 4 is stored as a numeric). Integers are also numerics.
# Boolean values (TRUE or FALSE) are called logical (T and F are predefined aliases, but they can be reassigned, so prefer TRUE and FALSE).
# Text (or string) values are called characters.

my_character <- "forty-two" #This value is string

remove(my_var) # Removes variable
class(some_variable_name) # gives type of variable, Note- return type is string

# Vectors

numeric_v <- c(1, 2, 3) #In R, you create a vector with the combine function c()

# You can give a name to the elements of a vector with the names() function
some_vector <- c("Johnny", "Poker Player")
names(some_vector) <- c("Name", "Profession")


# if you sum two vectors in R, it takes the element-wise sum.
c(1, 2, 3) + c(4, 5, 6)
c(1 + 4, 2 + 5, 3 + 6)
c(5, 7, 9)

poker_vector <- c(1,2,-3,4,5)
names(poker_vector) <- c("Sunday", "Monday", "Tuesday", "Wen", "Thu")

# It calculates the sum of all elements of a vector
total_poker <- sum(poker_vector)


# if you add a vector and number , it will add it to all elements

# for example, to select the first element of the vector, you type poker_vector[1]

poker_midweek <- poker_vector[c(2,3,4)] # gets data of all the indices in the vector inside

# This also works: 2:4 is shorthand for c(2, 3, 4)

poker_midweek <- poker_vector[2:4]

poker_vector["Monday"]

poker_vector[c("Monday","Tuesday")]

# To get the mean (average) of a vector, use the mean() function

# <  for less than
# >  for greater than
# <= for less than or equal to
# >= for greater than or equal to
# == for equal to each other
# != not equal to each other

# These operators are vectorized: comparing a vector with a single value compares every element

# What days of the week did you make money on poker?
selection_vector <- poker_vector > 0

# Select from poker_vector these days
poker_winning_days <- poker_vector[selection_vector]

# Matrix

# You can construct a matrix in R with the matrix() function.

mat = matrix(1:9, byrow = TRUE, nrow = 3)
mat

# The argument byrow indicates that the matrix is filled by the rows. If we want the vector to be filled by the columns, we just place byrow = FALSE

matrix(10,nrow=3,ncol=5) # Creates matrix with all elements 10

# Similar to vectors, you can add names for the rows and the columns of a matrix
rownames(my_matrix) <- row_names_vector
colnames(my_matrix) <- col_names_vector


# nrow() returns the number of rows of a matrix
nrow(mat)

# ncol() returns the number of columns. Pass the matrix object `mat`, not the
# function `matrix`: ncol(matrix) evaluates to NULL because a function has no dim
ncol(mat)

# In R, the function rowSums() conveniently calculates the totals for each row of a matrix
sum_of_rows_vector <- rowSums(my_matrix)
# Similarly we also have colSums()
# use cbind(mat,vec/mat) to add more columns in a matrix
# similarly you can use rbind()

# my_matrix[1,2] selects from the first row the second element.
# my_matrix[1:3,2:4] selects rows 1,2,3 and columns 2,3,4.
# my_matrix[,1] selects all elements of the first column.
# my_matrix[1,] selects all elements of the first row.

# the standard operators like +, -, /, *, etc. work in an element-wise way on matrices in R.

# note applying * on matrices does element wise. To do standard matrix multiplication use %*% For more use this http://www.statmethods.net/advstats/matrix.html

# Factors


gender_vector <- c("Male", "Female", "Female", "Male", "Male")

# Define factor_gender_vector using 'factor()'
factor_gender_vector <- factor(gender_vector)

# There are 2 types of Factor Variables

animals_vector <- c("Elephant", "Giraffe", "Donkey", "Horse")
temperature_vector <- c("High", "Low", "High","Low", "Medium")

factor_animals_vector <- factor(animals_vector) # Unordered stored alphabetically
factor_animals_vector
factor_temperature_vector <- factor(temperature_vector, order = TRUE, levels = c("Low", "Medium", "High"))
factor_temperature_vector

# levels(factor_vector) <- c("name1","name2",...) % You can change names of the factor according to your convenience

# This will give you a quick overview of some_variable:
summary(some_variable)


# You cannot compare 2 nominal factors eg Male and Female

# But you can compare 2 ordinal factors example High and Low

# Remember matrix and vectors, all elements are of same data type

# Dataset
# Remember matrix and vectors, all elements are of same data type

head(carset) # first observations (6 by default) of a data frame (or any R object you pass to it).

tail(carset) # last observations (6 by default) of a data frame (or any R object you pass to it).

head(carset, n = 10) # first n observations; replace 10 with the number of elements you want.

str(mtcars) # The function str() shows you the structure of your data set.

data.frame(vectors) #Creates a data sets given vectors as argument

# remember rbind and cbind works here but using rbind will also merge the factor levels.

# In general, remember there three lines.

dataset[a,b] # For row and column
dataset[a,] # For row/s
dataset[,b] # For column/s

# All data from the rows
furthest_planets_df <- planets_df[6:8,]

# Instead of using numerics to select elements of a data frame, you can also use the variable names to select columns of a data frame
furthest_planets_diameter <- planets_df[3:8,"diameter"]

data_frame_name$variable_name # extracts one column (variable) of a data frame; $ does not select rows
planets_with_rings_df <- planets_df[rings_vector,] # keeps the rows selected by rings_vector (presumably a logical vector, TODO confirm); the trailing ',' makes this a row selection, without it the index would pick columns

# For cutting data use subsets instead
subset(my_data_frame, subset = some_condition)

# order() returns the index permutation that sorts its argument: element i of the result is the position, in the input, of the i-th smallest value
a <- c(100,9,101)
order(a) # returns c(2, 1, 3), so a[order(a)] is c(9, 100, 101)

# Row positions of planets_df sorted by the diameter column, largest first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
positions <- order(planets_df$diameter, decreasing = TRUE)

# Create new "ordered" data frame by indexing the rows with those positions:
largest_first_df <- planets_df[positions, ]


dim(obj) # returns dimension of object i.e number of rows and column


#Conditionals

# Comparison of character strings
"user"=="useR" # False

# True compares to 1 and false compares to 0
# Compare a logical with a numeric
TRUE==1 # True

# Comparison of character strings
"raining" <= "raining dogs" # True because alphabetic order

"hello">"Goodbye" # True

# Comparison of logicals # True
TRUE>FALSE

# Sum of a boolean array gives number of trues

# AND and OR: in R the element-wise AND operator is & and the element-wise OR operator is |

# Is last between 0 and 5 or between 10 and 15?
(last > 0 & last < 5) | (last > 10 & last < 15)

# ! is NOT operator

# if / else if / else template (condition, expr, ... are placeholders).
# The final `else` takes no condition of its own. At top level each `else` must
# sit on the same line as the closing `}` of the previous branch; otherwise R
# considers the `if` statement complete and then fails on the stray `else`.
if (condition) {
  expr
} else if (condition2) {
  expr2
} else {
  expr3
}

if (num_views>15){
    print("You're popular!") # Notice using print statement
}

# For getting detail of any syntax use ?<keyword> Example
?print


# Loops

# While loop
while(condition) {
  expr
}


# break leaves the enclosing loop immediately. It is only valid inside a loop
# body (for / while / repeat); calling it at top level is an error.
speed <- 88
while (speed > 30) {
  if (speed > 80) break # stops the loop on the first pass, before speed changes
  speed <- speed - 7
}

# In R instead of using continue you can use next

# For loop: visit the elements of a vector one at a time
primes <- c(2, 3, 5, 7, 11, 13)

# loop version 1: `p` takes each value of `primes` in turn and is printed
for (p in primes) print(p)

# Spliting String
chars <- strsplit(rquote, split = "")[[1]]

number <- sample(1:6, size = 1) # returns a sample number between 1 to 6


# Functions in R

# A quick hack to simply see the arguments of the sample() function is the args() function
args(sample)

# Signature of the default mean() method. It is shown as a comment because it
# is not a runnable call: `...` cannot be used outside a function.
#   mean(x, trim = 0, na.rm = FALSE, ...)
# na.rm = TRUE drops missing values (NA and NaN) before averaging; infinite
# values (Inf, -Inf) are NOT removed.
speed <- 31
print(paste("Your speed is", speed)) # paste() concatenates its arguments into one string, separated by a space by default

# Writing your own functions (template: body, something and value are placeholders)
my_fun <- function(arg1, arg2) {
  body
  my_fun=something # only creates a local variable called my_fun; in R this does NOT set the return value
  return(value) # return() hands `value` back to the caller explicitly
}
# Scoping: variables that are defined inside a function are not accessible outside that function
# R passes arguments by value, so an object you pass to a function is not changed for the caller unless the result is explicitly assigned back
# If return() is never called, a function returns the value of the last expression it evaluated; printing inside the function does not affect the return value
# Syntax


# Plotting in R
str(mtcars)
# plot() needs the data as its first argument (plot(data = mtcars) fails because
# `x` is missing); a whole data frame is drawn as a scatterplot matrix
plot(mtcars)
# Scatterplot of two columns: mpg on the x axis, wt on the y axis
plot(mtcars$mpg, mtcars$wt)

## classify.R
# k-NN ----
library(class)
data(iris3)

# 1. using a separate test set: rows 1-25 of each of the three iris3 slices are
# used for training, rows 26-50 of each slice for testing
train <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3])
test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3])
# One label per row, 25 per slice; train and test share the same row layout, so
# `cl` serves both as training labels and as the true labels of the test rows
cl <- factor(c(rep("s", 25), rep("c", 25), rep("v", 25)))
myknn <- knn(train, test, cl, k = 3, prob = TRUE)
# Inspect the result by name; .Last.value does not track individual statements
# when the file is run through source()
attributes(myknn)
# Confusion matrix (predicted vs. true) and overall accuracy (diagonal share)
tab <- table(myknn, cl)
sum(diag(tab)) / sum(tab)

# Naive Bayes ----
library(e1071)

## Naive Bayes Classifier for Discrete Predictors: we use again the Congressional Voting Records of 1984
# Note refusals to vote have been treated as missing values!

data(HouseVotes84, package = "mlbench")
model <- naiveBayes(Class ~ ., data = HouseVotes84)

# predict the outcome of the first 20 records (column 1 is dropped from the predictors)
predict(model, HouseVotes84[1:20, -1])

# same but displaying posteriors
predict(model, HouseVotes84[1:20, -1], type = "raw")

# now all of them: this is the resubstitution error (predicting the training data itself)
pred <- predict(model, HouseVotes84[, -1])

# form and display confusion matrix & overall accuracy (share of counts on the diagonal)
tab <- table(pred, HouseVotes84$Class)
tab
sum(diag(tab)) / sum(tab)

# Source: http://www.cs.upc.edu/~belanche/Docencia/mineria/English-september-2008/Practical-work/Labo-NBayes-kNN.R
#         http://www.cs.upc.edu/~belanche/Docencia/mineria/English-september-2008/Practical-work/Labo-SVMs.R

# SVMs ----
# In a second example, we use the glass data (available in package mlbench) for classification
# The task is to predict the type of a glass on basis of its chemical analysis

library(e1071)
library(rpart)
library(mlbench)
data(Glass)

## split data into a training (2/3) and test set (1/3)

# seq_len() stays correct even for a zero-row data frame, unlike 1:nrow()
index <- seq_len(nrow(Glass))
testindex <- sample(index, trunc(length(index) / 3))
testset <- Glass[testindex, ]
trainset <- Glass[-testindex, ]

# fit the model

svm.model <- svm(Type ~ ., data = trainset, cost = 100, gamma = 1)

# make the prediction; the dependent variable, Type (column number 10), is
# dropped by name so the code does not depend on the column position

svm.pred <- predict(svm.model, testset[, names(testset) != "Type"])

# confusion matrix: predicted class vs. true class
table(pred = svm.pred, true = testset$Type)

# The function svm() returns an object of class "svm", which partly includes the following components:
#	SV: matrix of support vectors found;
#	labels: their labels in classification mode;
#	index: index of the support vectors in the input data (could be used e.g., for visualization)

# Other important parameters:
# 	class.weights: allows to introduce class weighing, useful for very asymmetric classes
#	cross: (default 0) for k-fold CV

# A nice tool in package e1071 is the possibility of tuning the parameters by 10-CV grid search:

mytunedsvm <- tune.svm(Type ~ ., data = trainset, gamma = 2^(-1:1), cost = 2^(2:4))
summary(mytunedsvm)
plot(mytunedsvm, transform.x = log10, xlab = expression(log[10](gamma)), ylab = "C")

	# R is a scripting language

	# Arithmetic Operators
	5+5
	# 10

	# Arithmetic Operators
	# Addition: +
	# Subtraction: -
	# Multiplication: *
	# Division: /
	# Exponentiation: ^
	# Modulo: %%

	# Assign the value 5 to the variable called 'my_apples'
	my_apples <- 5

	# Print out the value of the variable 'my_apples'
	my_apples

	# adding txt and number will give you error
	"Sdfsdf"+"sdfs" # gives error
	"sdfsdf" + 5 # gives error

	# Decimal values like 4.5 are called numerics.
	# Whole numbers written with an L suffix, like 4L, are called integers (a plain 4 is stored as a numeric). Integers are also numerics.
	# Boolean values (TRUE or FALSE) are called logical (T and F are predefined aliases, but they can be reassigned, so prefer TRUE and FALSE).
	# Text (or string) values are called characters.

	my_character <- "forty-two" #This value is string

	remove(my_var) # Removes variable
	class(some_variable_name) # gives type of variable, Note- return type is string

	# Vectors

	numeric_v <- c(1, 2, 3) #In R, you create a vector with the combine function c()

	# You can give a name to the elements of a vector with the names() function
	some_vector <- c("Johnny", "Poker Player")
	names(some_vector) <- c("Name", "Profession")



	# if you sum two vectors in R, it takes the element-wise sum.
	c(1, 2, 3) + c(4, 5, 6)
	c(1 + 4, 2 + 5, 3 + 6)
	c(5, 7, 9)

	poker_vector <- c(1,2,-3,4,5)
	names(poker_vector) <- c("Sunday", "Monday", "Tuesday", "Wen", "Thu")

	# It calculates the sum of all elements of a vector
	total_poker <- sum(poker_vector)


	# if you add a vector and number , it will add it to all elements

	# for example, to select the first element of the vector, you type poker_vector[1]

	poker_midweek <- poker_vector[c(2,3,4)] # gets data of all the indices in the vector inside

	# This also works: 2:4 is shorthand for c(2, 3, 4)

	poker_midweek <- poker_vector[2:4]

	poker_vector["Monday"]

	poker_vector[c("Monday","Tuesday")]

	# To get the mean (average) of a vector, use the mean() function

	# <  for less than
	# >  for greater than
	# <= for less than or equal to
	# >= for greater than or equal to
	# == for equal to each other
	# != not equal to each other

	# These operators are vectorized: comparing a vector with a single value compares every element

	# What days of the week did you make money on poker?
	selection_vector <- poker_vector > 0

	# Select from poker_vector these days
	poker_winning_days <- poker_vector[selection_vector]

	# Matrix

	# You can construct a matrix in R with the matrix() function.

	mat = matrix(1:9, byrow = TRUE, nrow = 3)
	mat

	# The argument byrow indicates that the matrix is filled by the rows. If we want the vector to be filled by the columns, we just place byrow = FALSE

	matrix(10,nrow=3,ncol=5) # Creates matrix with all elements 10

	# Similar to vectors, you can add names for the rows and the columns of a matrix
	rownames(my_matrix) <- row_names_vector
	colnames(my_matrix) <- col_names_vector


	# nrow() returns the number of rows of a matrix
	nrow(mat)

	# ncol() returns the number of columns. Pass the matrix object `mat`, not the
	# function `matrix`: ncol(matrix) evaluates to NULL because a function has no dim
	ncol(mat)

	# In R, the function rowSums() conveniently calculates the totals for each row of a matrix
	sum_of_rows_vector <- rowSums(my_matrix)
	# Similarly we also have colSums()
	# use cbind(mat,vec/mat) to add more columns in a matrix
	# similarly you can use rbind()

	# my_matrix[1,2] selects from the first row the second element.
	# my_matrix[1:3,2:4] selects rows 1,2,3 and columns 2,3,4.
	# my_matrix[,1] selects all elements of the first column.
	# my_matrix[1,] selects all elements of the first row.

	# the standard operators like +, -, /, *, etc. work in an element-wise way on matrices in R.

	# note applying * on matrices does element wise. To do standard matrix multiplication use %*% For more use this http://www.statmethods.net/advstats/matrix.html

	# Factors


	gender_vector <- c("Male", "Female", "Female", "Male", "Male")

	# Define factor_gender_vector using 'factor()'
	factor_gender_vector <- factor(gender_vector)

	# There are 2 types of Factor Variables

	animals_vector <- c("Elephant", "Giraffe", "Donkey", "Horse")
	temperature_vector <- c("High", "Low", "High","Low", "Medium")

	factor_animals_vector <- factor(animals_vector) # Unordered stored alphabetically
	factor_animals_vector
	factor_temperature_vector <- factor(temperature_vector, order = TRUE, levels = c("Low", "Medium", "High"))
	factor_temperature_vector

	# levels(factor_vector) <- c("name1","name2",...) % You can change names of the factor according to your convenience

	# This will give you a quick overview of some_variable:
	summary(some_variable)


	# You cannot compare 2 nominal factors eg Male and Female

	# But you can compare 2 ordinal factors example High and Low

	# Remember matrix and vectors, all elements are of same data type

	# Dataset
	# Remember matrix and vectors, all elements are of same data type

	head(carset) # first observations (6 by default) of a data frame (or any R object you pass to it).

	tail(carset) # last observations (6 by default) of a data frame (or any R object you pass to it).

	head(carset, n = 10) # first n observations; replace 10 with the number of elements you want.

	str(mtcars) # The function str() shows you the structure of your data set.

	data.frame(vectors) #Creates a data sets given vectors as argument

	# remember rbind and cbind works here but using rbind will also merge the factor levels.

	# In general, remember there three lines.

	dataset[a,b] # For row and column
	dataset[a,] # For row/s
	dataset[,b] # For column/s

	# All data from the rows
	furthest_planets_df <- planets_df[6:8,]

	# Instead of using numerics to select elements of a data frame, you can also use the variable names to select columns of a data frame
	furthest_planets_diameter <- planets_df[3:8,"diameter"]

	data_frame_name$variable_name # extracts one column (variable) of a data frame; $ does not select rows
	planets_with_rings_df <- planets_df[rings_vector,] # keeps the rows selected by rings_vector (presumably a logical vector, TODO confirm); the trailing ',' makes this a row selection, without it the index would pick columns

	# For cutting data use subsets instead
	subset(my_data_frame, subset = some_condition)

	# order() returns the index permutation that sorts its argument: element i of the result is the position, in the input, of the i-th smallest value
	a <- c(100,9,101)
	order(a) # returns c(2, 1, 3), so a[order(a)] is c(9, 100, 101)

	# Row positions of planets_df sorted by the diameter column, largest first.
	# TRUE is spelled out because T is an ordinary variable that can be reassigned.
	positions <- order(planets_df$diameter, decreasing = TRUE)

	# Create new "ordered" data frame by indexing the rows with those positions:
	largest_first_df <- planets_df[positions, ]


	dim(obj) # returns dimension of object i.e number of rows and column


	#Conditionals

	# Comparison of character strings
	"user"=="useR" # False

	# True compares to 1 and false compares to 0
	# Compare a logical with a numeric
	TRUE==1 # True

	# Comparison of character strings
	"raining" <= "raining dogs" # True because alphabetic order

	"hello">"Goodbye" # True

	# Comparison of logicals # True
	TRUE>FALSE

	# Sum of a boolean array gives number of trues

	# AND and OR: in R the element-wise AND operator is & and the element-wise OR operator is |
	# (the operator is a bare | character; a backslash before it is not valid R)

	# Is last between 0 and 5 or between 10 and 15?
	(last > 0 & last < 5) | (last > 10 & last < 15)

	# ! is NOT operator

	# if / else if / else template (condition, expr, ... are placeholders).
	# The final `else` takes no condition of its own. At top level each `else` must
	# sit on the same line as the closing `}` of the previous branch; otherwise R
	# considers the `if` statement complete and then fails on the stray `else`.
	if (condition) {
	  expr
	} else if (condition2) {
	  expr2
	} else {
	  expr3
	}

	if (num_views>15){
	print("You're popular!") # Notice using print statement
	}

	# For getting detail of any syntax use ?<keyword> Example
	?print


	# Loops

	# While loop
	while(condition) {
	expr
	}


	# break leaves the enclosing loop immediately. It is only valid inside a loop
	# body (for / while / repeat); calling it at top level is an error.
	speed <- 88
	while (speed > 30) {
	  if (speed > 80) break # stops the loop on the first pass, before speed changes
	  speed <- speed - 7
	}

	# In R instead of using continue you can use next

	# For loop: visit the elements of a vector one at a time
	primes <- c(2, 3, 5, 7, 11, 13)

	# loop version 1: `p` takes each value of `primes` in turn and is printed
	for (p in primes) print(p)

	# Spliting String
	chars <- strsplit(rquote, split = "")[[1]]

	number <- sample(1:6, size = 1) # returns a sample number between 1 to 6


	# Functions in R

	# A quick hack to simply see the arguments of the sample() function is the args() function
	args(sample)

	# Signature of the default mean() method. It is shown as a comment because it
	# is not a runnable call: `...` cannot be used outside a function.
	#   mean(x, trim = 0, na.rm = FALSE, ...)
	# na.rm = TRUE drops missing values (NA and NaN) before averaging; infinite
	# values (Inf, -Inf) are NOT removed.
	speed <- 31
	print(paste("Your speed is", speed)) # paste() concatenates its arguments into one string, separated by a space by default

	# Writing your own functions (template: body, something and value are placeholders)
	my_fun <- function(arg1, arg2) {
	body
	my_fun=something # only creates a local variable called my_fun; in R this does NOT set the return value
	return(value) # return() hands `value` back to the caller explicitly
	}
	# Scoping: variables that are defined inside a function are not accessible outside that function
	# R passes arguments by value, so an object you pass to a function is not changed for the caller unless the result is explicitly assigned back
	# If return() is never called, a function returns the value of the last expression it evaluated; printing inside the function does not affect the return value
	# Syntax



	# Plotting in R
	str(mtcars)
	# plot() needs the data as its first argument (plot(data = mtcars) fails because
	# `x` is missing); a whole data frame is drawn as a scatterplot matrix
	plot(mtcars)
	# Scatterplot of two columns: mpg on the x axis, wt on the y axis
	plot(mtcars$mpg, mtcars$wt)
	# k-NN ----
	library(class)
	data(iris3)

	# 1. using a separate test set: rows 1-25 of each of the three iris3 slices are
	# used for training, rows 26-50 of each slice for testing
	train <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3])
	test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3])
	# One label per row, 25 per slice; train and test share the same row layout, so
	# `cl` serves both as training labels and as the true labels of the test rows
	cl <- factor(c(rep("s", 25), rep("c", 25), rep("v", 25)))
	myknn <- knn(train, test, cl, k = 3, prob = TRUE)
	# Inspect the result by name; .Last.value does not track individual statements
	# when the file is run through source()
	attributes(myknn)
	# Confusion matrix (predicted vs. true) and overall accuracy (diagonal share)
	tab <- table(myknn, cl)
	sum(diag(tab)) / sum(tab)

	# Naive Bayes ----
	library(e1071)

	## Naive Bayes Classifier for Discrete Predictors: we use again the Congressional Voting Records of 1984
	# Note refusals to vote have been treated as missing values!

	data(HouseVotes84, package = "mlbench")
	model <- naiveBayes(Class ~ ., data = HouseVotes84)

	# predict the outcome of the first 20 records (column 1 is dropped from the predictors)
	predict(model, HouseVotes84[1:20, -1])

	# same but displaying posteriors
	predict(model, HouseVotes84[1:20, -1], type = "raw")

	# now all of them: this is the resubstitution error (predicting the training data itself)
	pred <- predict(model, HouseVotes84[, -1])

	# form and display confusion matrix & overall accuracy (share of counts on the diagonal)
	tab <- table(pred, HouseVotes84$Class)
	tab
	sum(diag(tab)) / sum(tab)

	# Source: http://www.cs.upc.edu/~belanche/Docencia/mineria/English-september-2008/Practical-work/Labo-NBayes-kNN.R
	#         http://www.cs.upc.edu/~belanche/Docencia/mineria/English-september-2008/Practical-work/Labo-SVMs.R

	# SVMs ----
	# In a second example, we use the glass data (available in package mlbench) for classification
	# The task is to predict the type of a glass on basis of its chemical analysis

	library(e1071)
	library(rpart)
	library(mlbench)
	data(Glass)

	## split data into a training (2/3) and test set (1/3)

	# seq_len() stays correct even for a zero-row data frame, unlike 1:nrow()
	index <- seq_len(nrow(Glass))
	testindex <- sample(index, trunc(length(index) / 3))
	testset <- Glass[testindex, ]
	trainset <- Glass[-testindex, ]

	# fit the model

	svm.model <- svm(Type ~ ., data = trainset, cost = 100, gamma = 1)

	# make the prediction; the dependent variable, Type (column number 10), is
	# dropped by name so the code does not depend on the column position

	svm.pred <- predict(svm.model, testset[, names(testset) != "Type"])

	# confusion matrix: predicted class vs. true class
	table(pred = svm.pred, true = testset$Type)

	# The function svm() returns an object of class "svm", which partly includes the following components:
	# SV: matrix of support vectors found;
	# labels: their labels in classification mode;
	# index: index of the support vectors in the input data (could be used e.g., for visualization)

	# Other important parameters:
	# class.weights: allows to introduce class weighing, useful for very asymmetric classes
	# cross: (default 0) for k-fold CV

	# A nice tool in package e1071 is the possibility of tuning the parameters by 10-CV grid search:

	mytunedsvm <- tune.svm(Type ~ ., data = trainset, gamma = 2^(-1:1), cost = 2^(2:4))
	summary(mytunedsvm)
	plot(mytunedsvm, transform.x = log10, xlab = expression(log[10](gamma)), ylab = "C")