# iCHAIT/R-cheats.r

## R-cheats.r
# Install packages
# Only packages that are not yet available are installed, so re-running the
# script does not download and reinstall them every time.
required_pkgs <- c("data.table", "ggplot2")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# import library
library(data.table)
library(ggplot2)

# Assigning vectors
# x holds the doubles 0..4, y holds the integers 1..5
x <- as.numeric(0:4)
y <- 1:5

# Defining a function in R
# square_root(x) returns sqrt(x); x defaults to 2 when the argument is omitted.
square_root <- function(x = 2) {
  sqrt(x)
}

# Creating a data table with a character column (name) and a numeric column (age)
dt <- data.table(
  name = c("Himanshu", "Abhishek"),
  age = c(22, 23)
)

# Viewing a data table
#Reading And Writing Tables, removing row names

# Load the diamonds data set shipped with ggplot2 as a data.table.
# attach() is not used: every later lookup goes through `diamonds$...` or
# `diamonds[...]`, and attaching would only put column names such as `cut`
# on the search path ahead of base functions of the same name.
diamonds <- data.table(ggplot2::diamonds)

# Column names of the table
colnames(diamonds)

# Number of rows
nrow(diamonds)

# Number of columns
ncol(diamonds)

# Both at once: c(rows, columns)
dim(diamonds)

# Renaming columns in place: setnames(table, old, new)
setnames(diamonds, old = c("carat", "cut"), new = c("Carat", "Cut"))

colnames(diamonds)

# Restore the original names, which the rest of the script relies on
setnames(diamonds, old = c("Carat", "Cut"), new = c("carat", "cut"))


# Distinct cut values, kept in a variable
unique_cuts <- unique(diamonds$cut)
unique_cuts

# How many distinct cuts there are (reuses the vector computed above)
len_unique_cuts <- length(unique_cuts)
len_unique_cuts

# One row per distinct (cut, clarity) combination.
# NOTE: the variable keeps its original name for compatibility, although the
# de-duplication is on clarity, not color.
unique_cuts_and_colors <- unique(diamonds, by = c("cut", "clarity"))

# Show the columns the table was de-duplicated on
unique_cuts_and_colors[, c("cut", "clarity"), with = FALSE]

# Combinations for the Ideal cut. `color` here is simply the color of the row
# that unique() kept for each (cut, clarity) pair.
unique_cuts_and_colors[cut == "Ideal", c("cut", "color", "clarity"), with = FALSE]


# Filtering: keep the rows whose cut is Ideal and whose color is E
IdealCut_EColor_diamonds <- diamonds[color == "E" & cut == "Ideal"]
IdealCut_EColor_diamonds

# Grouping: for every cut, the number of rows and the mean price
grouping_on_cut <- diamonds[
  ,
  .(Total_instance = .N, mean_price = mean(price)),
  by = "cut"
]

# Defining a new column.
# `:=` adds price_category to diamonds itself (by reference) and sets it to
# "Cheap" on the matching rows; price_cat is then the same table as diamonds,
# not an independent copy.
diamonds[cut == "Ideal" & price < 3457.42, price_category := "Cheap"]
price_cat <- diamonds
View(price_cat)

# The cut(s) with the highest mean price
max_mean_price <- max(grouping_on_cut$mean_price)
cut_highest_price <- grouping_on_cut[mean_price == max_mean_price, unique(cut)]


# Exercise
# 1. Count the rows and compute the mean depth of every cut/color combination.
# 2. Print the cut/color combination with the lowest mean depth, and that value.
# 3. Print the combination with the highest mean depth among the Premium cut.

grouping_on_cut_color <- diamonds[
  ,
  list(Total_instance = .N, mean_depth = mean(depth)),
  by = c("cut", "color")
]

# Lowest mean depth over all combinations; mean_depth is part of the output
# because the exercise asks for the value as well.
min_depth <- min(grouping_on_cut_color$mean_depth)
grouping_on_cut_color[
  mean_depth == min_depth,
  c("cut", "color", "mean_depth"),
  with = FALSE
]

# Highest mean depth within the Premium cut. The filter repeats
# cut == "Premium" so that a combination of another cut that happens to have
# the same mean depth cannot be reported.
max_depth_Premium <- max(grouping_on_cut_color[cut == "Premium"]$mean_depth)
grouping_on_cut_color[
  cut == "Premium" & mean_depth == max_depth_Premium,
  c("cut", "color", "mean_depth"),
  with = FALSE
]

# Number of Ideal-cut diamonds priced below 350
nrow(diamonds[cut == "Ideal" & price < 350])


# Merging in a data.table
cuts <- unique(diamonds$cut)

# Lookup table giving every cut a quality label Q1, Q2, ...
# Labels are generated from the number of cuts instead of being hard-coded.
cutQuality <- data.table(cut = cuts, quality = paste0("Q", seq_along(cuts)))

# Lookup table that deliberately covers only the first four cuts, so the joins
# below have unmatched rows to demonstrate. head() avoids the NA entries that
# cuts[1:4] would produce if there were fewer than four cuts.
priced_cuts <- head(cuts, 4)
cutPriceType <- data.table(
  cut = priced_cuts,
  priceType = paste0("P", seq_along(priced_cuts))
)

# Left join: every diamonds row is kept
diamondsWithQuality <- merge(diamonds, cutQuality, all.x = TRUE, by = "cut")

# Drop the price_category helper column, if an earlier step added it
# (deleting a column that does not exist makes data.table warn).
if ("price_category" %in% names(diamondsWithQuality)) {
  diamondsWithQuality[, price_category := NULL]
}
diamondsWithPrice <- merge(diamonds, cutPriceType, all.x = TRUE, by = "cut")

# Rows of the left join whose cut has no price type
naPriceType <- diamondsWithPrice[is.na(priceType)]

# Inner join: only rows whose cut appears in cutPriceType
diamondsWithPriceOnlyExisting <- merge(diamonds, cutPriceType, by = "cut")

# Cartesian join - build every combination of cut and color.
# Each table gets a constant `key` column; merging on it pairs every cut with
# every color.
cutTable <- data.table(cut = unique(diamonds$cut))
cutTable[, key := 1]

colorTable <- data.table(color = unique(diamonds$color))
colorTable[, key := 1]

# allow.cartesian = TRUE permits the many-to-many match on `key`
cutColorCombo <- merge(cutTable, colorTable, by = "key", allow.cartesian = TRUE)

# Deleting a column: assign NULL to it
cutColorCombo[, key := NULL]

# Exercise 3
# Does the data set contain every possible combination of cut, color and
# clarity?
# If not, list the combinations that are missing from diamonds.
# Which combination(s) have the most records?
# Which one(s) have the fewest? Print them in your own way.

# One single-column table per attribute, each with a constant join column
cutTable <- data.table(cut = unique(diamonds$cut))[, key := 1]
colorTable <- data.table(color = unique(diamonds$color))[, key := 1]
clarityTable <- data.table(clarity = unique(diamonds$clarity))[, key := 1]

# Cross the three tables through `key` to get the full grid of combinations
cutColorTable <- merge(cutTable, colorTable, by = "key", allow.cartesian = TRUE)
cutColorClarityTable <- merge(
  cutColorTable,
  clarityTable,
  by = "key",
  allow.cartesian = TRUE
)

# One row per combination that occurs in diamonds, flagged with present = 1
diamonds1 <- unique(diamonds, by = c("cut", "color", "clarity"))
diamonds1[, present := 1]

# Left join from the full grid: a combination absent from diamonds gets
# present = NA
final <- merge(
  cutColorClarityTable,
  diamonds1,
  by = c("cut", "color", "clarity"),
  all.x = TRUE
)
View(final)
View(final[is.na(present)])


# Convert the built-in iris data set to a data.table. The new `iris` lives in
# the workspace and masks the data set of the same name.
iris <- data.table(iris)
View(iris)
summary(iris)
str(iris)

# Plotting
# qplot() is deprecated as of ggplot2 3.4.0, so the plots are built with
# ggplot() and an explicit geom.

# Scatter plot of petal length against sepal length, coloured by species
ggplot(iris, aes(x = Sepal.Length, y = Petal.Length, colour = Species)) +
  geom_point()

# Histogram of sepal length, bars outlined by species
ggplot(iris, aes(x = Sepal.Length, colour = Species)) +
  geom_histogram()


# Modelling

# copy() gives an independent table, so nothing below can modify iris by
# reference
FeatureSetTrain <- copy(iris)

# Keep only the response and the single predictor
FeatureSetTrain <- FeatureSetTrain[, c("Sepal.Length", "Petal.Length"), with = FALSE]

# First 100 rows. NOTE(review): the fit below uses all rows, not `train`.
train <- FeatureSetTrain[1:100]

# Observed response values
Target <- FeatureSetTrain$Sepal.Length

FeatureSetTrain <- data.frame(FeatureSetTrain)

# Linear regression of sepal length on petal length
fit1 <- lm(Sepal.Length ~ Petal.Length, data = FeatureSetTrain)

print(coef(fit1))

# First coefficient of the fit only, wrapped in a one-cell table
coeff <- data.table(coef(fit1)[[1]])

# Predictions for the same rows the model was fitted on
res <- predict(fit1, newdata = FeatureSetTrain)
inp <- data.table(Sepal.Length = Target)
res <- data.table(Sepal.Length.Pred = res)
# Observed minus predicted
View(inp - res)

fin_res <- cbind(inp, res)
View(fin_res)
# iris already contains Sepal.Length, so only the prediction column is
# appended; binding fin_res would leave FinalRes with two columns named
# Sepal.Length.
FinalRes <- cbind(iris, res)
View(FinalRes)

# MAE By Mean and ME By Mean Analysis

# Exercise 4
# Calculate the Mean Error, Mean Absolute Error and Mean Sepal Length for each species type
# Then calculate Mean Error By Mean and Mean Absolute Error by Mean for each species type

# 30 x 3 matrix whose columns are normal draws with means 0, 2 and 5
m <- cbind(rnorm(30, 0), rnorm(30, 2), rnorm(30, 5))
View(m)

# apply(X, MARGIN, FUN): MARGIN = 1 works over rows, MARGIN = 2 over columns.
# Here: the number of negative entries in each column.
apply(m, 2, function(col) sum(col < 0))

# sapply and lapply
# sapply simplifies its result to a vector where it can
sapply(seq_len(3), function(v) v^2)

# lapply applies the function the same way but always returns a list
lapply(seq_len(3), function(v) v^2)

# Cube every sepal length. lapply() returns a list with one element per value;
# do.call(rbind, ...) stacks those elements into a one-column matrix.
# (Calling rbind() on the list itself would only wrap it in a one-row list
# matrix.)
do.call(rbind, lapply(iris$Sepal.Length, function(x) x^3))

# Add the squared sepal length as column xyz. `:=` works on the whole column
# at once; running it inside lapply() with a single value would overwrite the
# entire column on every iteration and leave only the last value's square.
iris[, xyz := Sepal.Length^2]
	# NOTE: everything from this point to the end of the file was a verbatim,
	# tab-indented copy of the script above. Running it repeated every step a
	# second time (package installation, attach(diamonds), all joins, the model
	# fit and every View() call) without adding anything new, so the duplicate
	# has been removed.