huberflores/R_Common_Commands.txt

## R_Common_Commands.txt
###DO NOT delete header###
#
# author Huber Flores
#

#clear screen
> command
Ctrl + l
>

#close console
> quit()

#get the working directory
> getwd()
[1] "/home/huber/Desktop/TechnicalInformation/R/examples/loadData"

#set folder as working directory
> setwd("/path/")

#load a CSV file
#this command loads a comma-separated file
#if the file is separated by tabs, then create a new file in OpenOffice and save it with the new delimiter
> mydata = read.csv("data.csv") # data.csv is a file located in the directory obtained with getwd()
> mydata
  Col1 Col2 Col3
1  10   a1  b1
2  20   a2  b2
3  30   a3  b3
>

#search for command
> apropos("mean")
 [1] "colMeans"        ".colMeans"       "kmeans"          "mean"
 [5] "mean.data.frame" "mean.Date"       "mean.default"    "mean.difftime"
 [9] "mean.POSIXct"    "mean.POSIXlt"    "rowMeans"        ".rowMeans"
[13] "weighted.mean"

#help for a specific command
> help(mean)

#ignore missing (NA) values when calculating a function
> xbar = mean(mydata$X69.4, na.rm = TRUE)

#search for packages and install them
> install.packages()

#standard deviation
> sapply(mydata[2],sd) # Remember sd(mydata) is deprecated already for higher versions of R > 2.14.0

#sort table based on a specific column
> sort1.mydata <- mydata[order(mydata$columnname),]

#write data to a file
> write.csv(mydata, "file.csv")

#cut interval
>  a <- runif(100) # a few values for example
>  b <- cut(a, seq(from=0, to =1, by = 0.2))

#search if a keyword is in a column and return a logical vector
# vector <- grepl("keyword", mydata$words)

#create a subset from a specific keyword into a new vector
> chess <- mydata[grepl("chess", mydata$key),]

#extract columns from data frame
> slice1 <- mydata[, c("CPU.cores", "Col.sample")]

#show labels/name of the columns
>colnames(mydata)

#clustering and plotting
#k-means
>
km <- kmeans(mycluster, centers=2, nstart=4)
plot(mycluster, col=km$cluster)

#hierarchical cluster (unsupervised)
>
d <- dist(mycluster)
hc <- hclust(d, method= "complete")
plot(hc)    #dendrogram
rect.hclust(hc, k=4) #select area in the dendrogram (draws rectangles around the 4 clusters)

#DBSCAN
>
library(fpc) #if package not installed, then install via install.packages("fpc")
db <- dbscan(mycluster, eps=.3, MinPts=6)
str(db)
plot(mycluster, col=db$cluster+1L)

#kNN (k-nearest neighbors)
>
#install.packages("class") #although it is installed by default
library(class)
A1 = c(0,0)
A2 = c(1,1)
...
B2 = c(5.5, 7)

train=rbind(A1,A2,A3, B1,B2,B3)
#  [,1] [,2]
#A1  0.0    0
#A2  1.0    1
#...
#B2  5.5    7

cl=factor(c(rep("A",3),rep("B",3))) # This means the train set has 6 rows (A1,A2,A3,B1,B2,B3)
test = c(3.5, 3.5) #test can be also a Matrix - test = matrix (c(4,4,3,3,5,6,7,7), ncol=2, byrow=TRUE)
summary(knn(train, test, cl, k=1))
#A B
#0 1 #classified in group B

#calculate percentiles
#calculates the 33rd, 57th and 95th percentiles
> appenergy <- mydata$ev
> quantile(appenergy, c(.33, .57, .95))

#number of rows and cols
> nrow(mydata)
> ncol(mydata)

#select rows with a particular string
> myapp <- mydata[grep("chess", mydata$key),]

#select all the rows except the one with the keyword
> mycommunity <- mydata[!grepl("chess", mydata$key),]


#Run an R script from the system shell (not from the R console)
> Rscript file.R

#ifthen and apply
> sapply(list, function(x) ifelse(is.numeric(x), mean(x), NA))

#Iterate a list with name
> myhistory
$a1
  timestamp rtt acceleration userId
1         3 789           a1  user1
2         3 586           a1  user2
$a2
  timestamp  rtt acceleration userId
3         3  748           a2  user3

>
for (name in names(myhistory)) {
    print(name)
    print(myhistory[[name]])
}

>
for (name in names(myhistory)) {
   acc1 <- myhistory[[name]] #element of the list
   acc2 <- myhistory[name] #new list based on the index of the element
   print(acc1)
}


### Install package locally without online repository
install.packages("https://cran.r-project.org/src/contrib/Archive/actuar/actuar_1.2-0.tar.gz", repos=NULL, type="source")


#here you can find some potential errors found by trying the commands above
FAQ
====

***Error 1***
/usr/bin/ld: cannot find -lgfortran
/usr/bin/ld: cannot find -lquadmath
....
[SOLVED]: first, figure out the version of the gfortran compiler you are using
> gfortran --version
GNU Fortran (Ubuntu/Linaro 4.7.2-5ubuntu1) 4.7.2
Copyright (C) 2012 Free Software Foundation, Inc.
...
#Another way to check
> gfortran -print-file-name=libgfortran.so
/usr/lib/gcc/x86_64-linux-gnu/4.7/libgfortran.so

In order to solve this issue, the gcc and g++ compilers MUST be the same version as gfortran. In other words, gcc and g++ should be 4.7 as well.
	###DO NOT delete header###
	#
	# author Huber Flores
	#

	#clear screen
	> command
	Ctrl + l
	>

	#close console
	> quit()

	#get the working directory
	> getwd()
	[1] "/home/huber/Desktop/TechnicalInformation/R/examples/loadData"

	#set folder as working directory
	> setwd("/path/")

	#load a CSV file
	#this command loads a comma-separated file
	#if the file is separated by tabs, then create a new file in OpenOffice and save it with the new delimiter
	> mydata = read.csv("data.csv") # data.csv is a file located in the directory obtained with getwd()
	> mydata
	Col1 Col2 Col3
	1 10 a1 b1
	2 20 a2 b2
	3 30 a3 b3
	>

	#search for command
	> apropos("mean")
	[1] "colMeans" ".colMeans" "kmeans" "mean"
	[5] "mean.data.frame" "mean.Date" "mean.default" "mean.difftime"
	[9] "mean.POSIXct" "mean.POSIXlt" "rowMeans" ".rowMeans"
	[13] "weighted.mean"

	#help for a specific command
	> help(mean)

	#ignore missing (NA) values when calculating a function
	> xbar = mean(mydata$X69.4, na.rm = TRUE)

	#search for packages and install them
	> install.packages()

	#standard deviation
	> sapply(mydata[2],sd) # Remember sd(mydata) is deprecated already for higher versions of R > 2.14.0

	#sort table based on a specific column
	> sort1.mydata <- mydata[order(mydata$columnname),]

	#write data to a file
	> write.csv(mydata, "file.csv")

	#cut interval
	> a <- runif(100) # a few values for example
	> b <- cut(a, seq(from=0, to =1, by = 0.2))

	#search if a keyword is in a column and return a logical vector
	# vector <- grepl("keyword", mydata$words)

	#create a subset from a specific keyword into a new vector
	> chess <- mydata[grepl("chess", mydata$key),]

	#extract columns from data frame
	> slice1 <- mydata[, c("CPU.cores", "Col.sample")]

	#show labels/name of the columns
	>colnames(mydata)

	#clustering and plotting
	#k-means
	>
	km <- kmeans(mycluster, centers=2, nstart=4)
	plot(mycluster, col=km$cluster)

	#hierarchical cluster (unsupervised)
	>
	d <- dist(mycluster)
	hc <- hclust(d, method= "complete")
	plot(hc) #dendrogram
	rect.hclust(hc, k=4) #select area in the dendrogram (draws rectangles around the 4 clusters)

	#DBSCAN
	>
	library(fpc) #if package not installed, then install via install.packages("fpc")
	db <- dbscan(mycluster, eps=.3, MinPts=6)
	str(db)
	plot(mycluster, col=db$cluster+1L)

	#kNN (k-nearest neighbors)
	>
	#install.packages("class") #although it is installed by default
	library(class)
	A1 = c(0,0)
	A2 = c(1,1)
	...
	B2 = c(5.5, 7)

	train=rbind(A1,A2,A3, B1,B2,B3)
	# [,1] [,2]
	#A1 0.0 0
	#A2 1.0 1
	#...
	#B2 5.5 7

	cl=factor(c(rep("A",3),rep("B",3))) # This means the train set has 6 rows (A1,A2,A3,B1,B2,B3)
	test = c(3.5, 3.5) #test can be also a Matrix - test = matrix (c(4,4,3,3,5,6,7,7), ncol=2, byrow=TRUE)
	summary(knn(train, test, cl, k=1))
	#A B
	#0 1 #classified in group B

	#calculate percentiles
	#calculates the 33rd, 57th and 95th percentiles
	> appenergy <- mydata$ev
	> quantile(appenergy, c(.33, .57, .95))

	#number of rows and cols
	> nrow(mydata)
	> ncol(mydata)

	#select rows with a particular string
	> myapp <- mydata[grep("chess", mydata$key),]

	#select all the rows except the one with the keyword
	> mycommunity <- mydata[!grepl("chess", mydata$key),]


	#Run an R script from the system shell (not from the R console)
	> Rscript file.R

	#ifthen and apply
	> sapply(list, function(x) ifelse(is.numeric(x), mean(x), NA))

	#Iterate a list with name
	> myhistory
	$a1
	timestamp rtt acceleration userId
	1 3 789 a1 user1
	2 3 586 a1 user2
	$a2
	timestamp rtt acceleration userId
	3 3 748 a2 user3

	>
	for (name in names(myhistory)) {
	print(name)
	print(myhistory[[name]])
	}

	>
	for (name in names(myhistory)) {
	acc1 <- myhistory[[name]] #element of the list
	acc2 <- myhistory[name] #new list based on the index of the element
	print(acc1)
	}


	### Install package locally without online repository
	install.packages("https://cran.r-project.org/src/contrib/Archive/actuar/actuar_1.2-0.tar.gz", repos=NULL, type="source")



	#here you can find some potential errors found by trying the commands above
	FAQ
	====

	*Error 1*
	/usr/bin/ld: cannot find -lgfortran
	/usr/bin/ld: cannot find -lquadmath
	....
	[SOLVED]: first, figure out the version of the gfortran compiler you are using
	> gfortran --version
	GNU Fortran (Ubuntu/Linaro 4.7.2-5ubuntu1) 4.7.2
	Copyright (C) 2012 Free Software Foundation, Inc.
	...
	#Another way to check
	> gfortran -print-file-name=libgfortran.so
	/usr/lib/gcc/x86_64-linux-gnu/4.7/libgfortran.so

	In order to solve this issue, the gcc and g++ compilers MUST be the same version as gfortran. In other words, gcc and g++ should be 4.7 as well.