# Source: GitHub Gist by @iCHAIT, created Jul 1, 2016.
# (Web-page navigation text from the gist page was removed so the file parses as R.)
# Install the required packages only when they are not available yet, so that
# re-running the script does not reinstall them on every execution.
for (pkg in c("data.table", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# Attach the libraries used throughout the script
library(data.table)
library(ggplot2)
# Two small example vectors: x holds the doubles 0..4, y the integers 1..5
x <- as.numeric(0:4)
y <- 1:5
# Example of a user-defined function.
# Returns the square root of `x`; `x` defaults to 2 when omitted.
square_root <- function(x = 2) {
  sqrt(x)
}
# Build a two-row data.table with a `name` column and an `age` column
dt <- data.table(
  name = c("Himanshu", "Abhishek"),
  age = c(22, 23)
)
# Viewing a data table
#Reading And Writing Tables, removing row names
# Convert the `diamonds` data set that ships with ggplot2 into a data.table.
# attach() is deliberately not used: every later statement reaches the columns
# through `diamonds[...]` or `diamonds$...`, and attaching would only place a
# stale copy of the columns on the search path (its `x` and `y` columns would
# additionally be masked by the global vectors `x` and `y` defined above).
diamonds <- as.data.table(diamonds)
# List every column name of diamonds
names(diamonds)
# Number of rows
nrow(diamonds)
# Number of columns
ncol(diamonds)
# Both at once: c(rows, columns)
dim(diamonds)
# Rename columns of a data.table: capitalise `carat` and `cut`, show the
# resulting column names, then restore the original names.
setnames(diamonds, old = c("carat", "cut"), new = c("Carat", "Cut"))
colnames(diamonds)
setnames(diamonds, old = c("Carat", "Cut"), new = c("carat", "cut"))
# Distinct cut grades present in the data, and how many of them there are
unique_cuts <- unique(diamonds[["cut"]])
unique_cuts
len_unique_cuts <- length(unique_cuts)
len_unique_cuts
# Find the unique combinations of cut and clarity.
# unique(..., by = ) keeps the first row of every (cut, clarity) pair.
# NOTE: the variable name still says "colors"; it is kept unchanged so that any
# code referring to it keeps working.
unique_cuts_and_colors <- unique(diamonds, by = c("cut", "clarity"))
# Show the deduplicated pairs. `color` is intentionally not shown: it is not
# part of the uniqueness key, so its value would merely be that of whichever
# row came first for each pair.
unique_cuts_and_colors[, c("cut", "clarity"), with = FALSE]
# Clarities that occur together with the Ideal cut
unique_cuts_and_colors[cut == "Ideal", c("cut", "clarity"), with = FALSE]
# Filtering: keep only the diamonds that have an Ideal cut and colour grade E
IdealCut_EColor_diamonds <- diamonds[color == "E" & cut == "Ideal"]
IdealCut_EColor_diamonds
# Grouping: for every cut, count its rows (.N) and average its price.
# The cut with the highest mean price is derived from this table further below.
grouping_on_cut <- diamonds[
  ,
  .(Total_instance = .N, mean_price = mean(price)),
  by = "cut"
]
# Defining a new column with `:=`.
# `:=` adds `price_category` to `diamonds` itself (by reference): rows with an
# Ideal cut and a price below the threshold are labelled "Cheap", all other
# rows are left as NA.
diamonds[cut == "Ideal" & price < 3457.42, price_category := "Cheap"]
# price_cat refers to the same, complete table - it is not a filtered subset
price_cat <- diamonds
View(price_cat)
# Highest mean price over all cuts, and the cut(s) that reach it
max_mean_price <- max(grouping_on_cut[["mean_price"]])
cut_highest_price <- unique(grouping_on_cut[mean_price == max_mean_price, cut])
# Exercise
# Find the count of instances and mean of depth of every cut and color combination
# Find and print the cut and color combination with least depth and its value
# Find the combination with highest mean depth among the Premium Cut
grouping_on_cut_color <- diamonds[
  ,
  list(Total_instance = .N, mean_depth = mean(depth)),
  by = c("cut", "color")
]
# Combination with the smallest mean depth, printed together with that value
min_depth <- min(grouping_on_cut_color$mean_depth)
grouping_on_cut_color[
  mean_depth == min_depth,
  c("cut", "color", "mean_depth"),
  with = FALSE
]
# Highest mean depth among the Premium rows only. The row filter repeats
# cut == "Premium" so that a different cut which happens to have the same mean
# depth can never be returned.
max_depth_Premium <- max(grouping_on_cut_color[cut == "Premium"]$mean_depth)
grouping_on_cut_color[
  cut == "Premium" & mean_depth == max_depth_Premium,
  c("cut", "color", "mean_depth"),
  with = FALSE
]
# Number of Ideal-cut diamonds priced below 350
nrow(diamonds[cut == "Ideal" & price < 350])
# Merging in a Data Table
# Two small lookup tables keyed on `cut`. The labels are generated from the
# number of cuts instead of being typed out, so the label vectors always have
# the same length as the cut column.
cuts <- unique(diamonds$cut)
cutQuality <- data.table(cut = cuts, quality = paste0("Q", seq_along(cuts)))
# Only the first four cuts get a price type, which leaves rows without a match
# for the left join below.
cutPriceType <- data.table(cut = head(cuts, 4))
cutPriceType[, priceType := paste0("P", seq_len(.N))]
# Left join: keep every diamonds row and add the matching quality label
diamondsWithQuality <- merge(diamonds, cutQuality, by = "cut", all.x = TRUE)
# Drop the price_category column from the merged table
diamondsWithQuality[, price_category := NULL]
# Left join against the partial lookup: cuts without an entry get NA priceType
diamondsWithPrice <- merge(diamonds, cutPriceType, by = "cut", all.x = TRUE)
naPriceType <- diamondsWithPrice[is.na(priceType)]
# Inner join (merge's default): only rows whose cut has a priceType remain
diamondsWithPriceOnlyExisting <- merge(diamonds, cutPriceType, by = "cut")
# Cartesian join - create every combination of cut and color.
# Both helper tables receive a constant `key` column; merging on it pairs
# every cut with every color.
cutTable <- data.table(cut = unique(diamonds[["cut"]]))
colorTable <- data.table(color = unique(diamonds[["color"]]))
cutTable[, key := 1]
colorTable[, key := 1]
# allow.cartesian = TRUE permits the many-to-many match on the constant key
cutColorCombo <- merge(cutTable, colorTable, by = "key", allow.cartesian = TRUE)
# Deleting a column: the helper key is no longer needed
cutColorCombo[, key := NULL]
# Exercise 3
# Does the data set contain all possible combinations of these
# cut, color, clarity types?
# If no, list the values of missing combinations in the diamonds set.
# Which combination(s) has/have the most records?
# Which one(s) has/have the least? Print them in your own way

# One single-column table per attribute, each with a constant join key
cutTable <- data.table(cut = unique(diamonds[["cut"]]))
colorTable <- data.table(color = unique(diamonds[["color"]]))
clarityTable <- data.table(clarity = unique(diamonds[["clarity"]]))
cutTable[, key := 1]
colorTable[, key := 1]
clarityTable[, key := 1]

# Chain two cartesian joins to get every cut x color x clarity combination
cutColorTable <- merge(cutTable, colorTable, by = "key", allow.cartesian = TRUE)
cutColorClarityTable <- merge(
  cutColorTable, clarityTable,
  by = "key", allow.cartesian = TRUE
)

# Combinations that actually occur in the data, flagged with present = 1
diamonds1 <- unique(diamonds, by = c("cut", "color", "clarity"))
diamonds1[, present := 1]

# Left join from the full grid: combinations absent from the data keep
# present = NA
final <- merge(
  cutColorClarityTable, diamonds1,
  by = c("cut", "color", "clarity"), all.x = TRUE
)
View(final)
View(final[is.na(present)])
# Turn the built-in iris data set into a data.table and inspect it:
# spreadsheet-style viewer, per-column summary, and structure
iris <- as.data.table(iris)
View(iris)
summary(iris)
str(iris)
# Plotting
# qplot() is deprecated in current ggplot2 releases, so both plots are built
# from ggplot() plus an explicit geom layer.
# Scatter plot of petal length against sepal length, coloured by species
ggplot(iris, aes(x = Sepal.Length, y = Petal.Length, colour = Species)) +
  geom_point()
# Histogram of sepal length, outlined by species
ggplot(iris, aes(x = Sepal.Length, colour = Species)) +
  geom_histogram()
# Modelling
# use the copy function: copy() yields an independent table, so changes made
# to FeatureSetTrain do not affect iris
FeatureSetTrain <- copy(iris)
FeatureSetTrain <- FeatureSetTrain[, c("Sepal.Length", "Petal.Length"), with = FALSE]
# First 100 rows. NOTE: the model below is fitted on all rows, not on `train`.
train <- FeatureSetTrain[1:100]
# Observed response values
Target <- FeatureSetTrain$Sepal.Length
FeatureSetTrain <- data.frame(FeatureSetTrain)
# Simple linear regression of sepal length on petal length
fit1 <- lm(Sepal.Length ~ Petal.Length, data = FeatureSetTrain)
print(coef(fit1))
# First coefficient (the intercept) as a one-cell data.table
coeff <- data.table(coef(fit1)[[1]])
# Predictions for the same rows the model was fitted on
res <- predict(fit1, newdata = FeatureSetTrain)
inp <- data.table(Sepal.Length = Target)
res <- data.table(Sepal.Length.Pred = res)
# Residuals: observed minus predicted
View(inp - res)
fin_res <- cbind(inp, res)
View(fin_res)
# iris already contains Sepal.Length, so only the prediction column is
# appended; binding fin_res instead would produce two columns with the same
# name Sepal.Length.
FinalRes <- cbind(iris, res)
View(FinalRes)
# MAE by Mean and ME by Mean Analysis
# Exercise 4
# Calculate the Mean Error, Mean Absolute Error and Mean Sepal Length for each species type
# Then calculate Mean Error by Mean and Mean Absolute Error by Mean for each species type
# 30 x 3 matrix of normal draws; the three columns have means 0, 2 and 5
m <- cbind(rnorm(30, mean = 0), rnorm(30, mean = 2), rnorm(30, mean = 5))
View(m)
# See the help of apply(): MARGIN = 1 works over rows, MARGIN = 2 over columns.
# Count the negative entries in each column.
apply(m, 2, function(column) sum(column < 0))
# sapply and lapply
# sapply: applies the function to each element and simplifies the result,
# here to a numeric vector
sapply(seq_len(3), function(value) value^2)
# lapply: does the same work but always returns a list instead of a vector
lapply(seq_len(3), function(value) value^2)
# Cube every sepal length with lapply and stack the results into a one-column
# matrix. do.call() hands each list element to rbind() as a separate argument;
# calling rbind() on the list itself would only wrap the list in a one-row
# list-matrix instead of stacking the values.
do.call(rbind, lapply(iris$Sepal.Length, function(x) x^3))
# Add a column with the squared sepal lengths. `:=` is vectorised and updates
# the table by reference, so one call covers all rows; assigning inside a
# per-element loop would overwrite the whole column with a single scalar on
# every iteration and leave only the last value behind.
iris[, xyz := Sepal.Length^2]
# End of gist (GitHub page footer text removed so the file parses as R).