zunaira zunman

## cdh_prereq10.txt
$ sudo service cloudera-scm-server start
$ sudo tail -f /var/log/cloudera-scm-server/cloudera-scm-server.log

## cdh_prereq11_hosts.txt
127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
10.240.0.3 cluster-dn1.c.symmetric-rune-115401.internal cluster-dn1
10.240.0.4 cluster-dn2.c.symmetric-rune-115401.internal cluster-dn2
10.240.0.5 cluster-dn3.c.symmetric-rune-115401.internal cluster-dn3
10.240.0.2 cluster-cm.c.symmetric-rune-115401.internal cluster-cm  # Added by Google
~

## tweets_removeRtUrlMentions.R
# Remove retweets and tweets that contain URLs.
# The two terms are joined into the regex "RT|http"; grepl() is case-sensitive
# by default and unanchored, so a row is dropped when its text contains either
# term anywhere (including inside a longer word such as "ALERT").
CaseSensitive_FilterTerms <- c("RT", "http")
filter_regex<- paste(CaseSensitive_FilterTerms, collapse = "|")
# NOTE(review): filter() is presumably dplyr::filter -- no library(dplyr) call
# is visible in this snippet; confirm it is attached before this line runs.
df <- filter(df, !grepl(filter_regex, df$text))

# Create the result data frame, seeded with the first row of df.
df.noMentions <- df[1,]
# Intended to append the rows whose text does not start with an @mention.
# NOTE(review): the loop below is truncated in this snippet -- the brace it
# opens is never closed in the visible text and the append step is missing.
# Also note that 1:nrow(df) iterates over c(1, 0) when df has zero rows.
for(j in 1:nrow(df)){
  text <- c(df$text[j])

## UPDATED_tweets_removeRtUrlMentions.R
# Remove retweets and tweets that contain URLs.
# The terms are joined into the regex "RT|http"; grepl() is case-sensitive by
# default and unanchored, so a tweet is dropped when its text contains either
# term anywhere.
# NOTE(review): "RT" also matches inside longer upper-case words (e.g.
# "ALERT") -- confirm that dropping those tweets is acceptable.
CaseSensitive_FilterTerms <- c("RT", "http")
filter_regex <- paste(CaseSensitive_FilterTerms, collapse = "|")
df <- filter(df, !grepl(filter_regex, df$text))

# Remove mentions: drop only the tweets whose text starts with "@".
# Anchoring the pattern with "^" tests the first character directly, so no
# temporary helper column has to be added to df and removed again afterwards.
# grepl() returns FALSE for NA or empty text, so those rows are kept.
df <- filter(df, !grepl("^@", df$text))

## smote_svm_libraries.R
library(caret)
library(e1071)
library(rpart)
library(RTextTools)
library(tm)
library(DMwR)
set.seed(1234)

## smote_svm_1.R
# Build the corpus from the text column of `data`, one document per element,
# with the reader language set to English.
MyCorpus <- VCorpus(
  VectorSource(data$text),
  readerControl = list(language = "en")
)
# Print the first document as loaded, for a quick visual check.
content(MyCorpus[[1]])

# Preprocessing: lower-case every document in the corpus.
MyCorpus <- tm_map(
  MyCorpus,
  content_transformer(tolower)
)
# Print the first document again to inspect the effect of the transformation.
content(MyCorpus[[1]])

## smote_svm_2.R
# Create the Document-Term matrix from the corpus.
# The global bounds c(0, Inf) are passed through to tm unchanged.
# NOTE(review): presumably this means "keep every term regardless of how many
# documents contain it" -- confirm against the tm documentation.
DTM <- DocumentTermMatrix(
  MyCorpus,
  control = list(bounds = list(global = c(0, Inf)))
)
# Print the dimensions of the matrix.
dim(DTM)

# Re-pack the i / j / v components of the DTM into a sparse matrix that keeps
# the same dimensions and row/column names, to feed into the SVM.
# sparseMatrix() is provided by the Matrix package, which none of the
# library() calls visible in this script attach, so it is called through an
# explicit namespace to avoid a "could not find function" error.
sparse_DTM <- Matrix::sparseMatrix(
  i = DTM$i,
  j = DTM$j,
  x = DTM$v,
  dims = dim(DTM),
  dimnames = list(rownames(DTM), colnames(DTM))
)

## smote_svm_3.R
# Densify the sparse document-term matrix and turn it into a data.frame
# (one column per term).
data.DTM <- as.data.frame(
  as.matrix(sparse_DTM)
)

# Attach the labels from `data` as an extra column.
# NOTE(review): if the vocabulary contains the literal term "label", that term
# column is overwritten by this assignment -- confirm this cannot happen.
data.DTM[["label"]] <- data$label

## smote_svm_4.R
# Draw a single 50% partition of the row indices based on the label column.
# list = FALSE makes createDataPartition() return the indices as a matrix
# rather than a list.
splitIndex <- createDataPartition(
  data.DTM$label,
  p = 0.5,
  list = FALSE,
  times = 1
)

# Split the document-term data: selected rows train, remaining rows test.
trainset <- data.DTM[splitIndex, ]
testset <- data.DTM[-splitIndex, ]

# Apply the same row split to the original (untransformed) data.
traindata <- data[splitIndex, ]
testdata <- data[-splitIndex, ]

## smote_svm_5.R
# Print the relative frequency of each label value in the training set and in
# the test set, so the class balance of the two partitions can be compared.
prop.table(table(trainset$label))
prop.table(table(testset$label))
	$ sudo service cloudera-scm-server start
	$ sudo tail -f /var/log/cloudera-scm-server/cloudera-scm-server.log
	127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
	::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
	10.240.0.3 cluster-dn1.c.symmetric-rune-115401.internal cluster-dn1
	10.240.0.4 cluster-dn2.c.symmetric-rune-115401.internal cluster-dn2
	10.240.0.5 cluster-dn3.c.symmetric-rune-115401.internal cluster-dn3
	10.240.0.2 cluster-cm.c.symmetric-rune-115401.internal cluster-cm # Added by Google
	~
	# Remove retweets and tweets that contain URLs.
	# The terms are joined with the regex alternation operator "|". It must be
	# written as a plain "|": "\|" is not a valid escape sequence in an R string
	# literal and stops the script from parsing.
	# grepl() is case-sensitive by default and unanchored, so a tweet is dropped
	# when its text contains either term anywhere.
	CaseSensitive_FilterTerms <- c("RT", "http")
	filter_regex <- paste(CaseSensitive_FilterTerms, collapse = "|")
	df <- filter(df, !grepl(filter_regex, df$text))

	# Create the result data frame, seeded with the first row of df.
	df.noMentions <- df[1,]
	# Intended to append the rows whose text does not start with an @mention.
	# NOTE(review): the loop below is truncated -- the brace it opens is never
	# closed in the visible text and the append step is missing, so the lines
	# that follow parse as part of the loop body.
	# Also note that 1:nrow(df) iterates over c(1, 0) when df has zero rows.
	for(j in 1:nrow(df)){
	text <- c(df$text[j])
	library(caret)
	library(e1071)
	library(rpart)
	library(RTextTools)
	library(tm)
	library(DMwR)
	set.seed(1234)
	# Build the corpus from the text column of `data`, one document per element,
	# with the reader language set to English.
	MyCorpus <- VCorpus(
	  VectorSource(data$text),
	  readerControl = list(language = "en")
	)
	# Print the first document as loaded, for a quick visual check.
	content(MyCorpus[[1]])

	# Preprocessing: lower-case every document in the corpus.
	MyCorpus <- tm_map(
	  MyCorpus,
	  content_transformer(tolower)
	)
	# Print the first document again to inspect the effect of the transformation.
	content(MyCorpus[[1]])
	# Create the Document-Term matrix from the corpus.
	# The global bounds c(0, Inf) are passed through to tm unchanged.
	# NOTE(review): presumably this means "keep every term regardless of how
	# many documents contain it" -- confirm against the tm documentation.
	DTM <- DocumentTermMatrix(
	  MyCorpus,
	  control = list(bounds = list(global = c(0, Inf)))
	)
	# Print the dimensions of the matrix.
	dim(DTM)

	# Re-pack the i / j / v components of the DTM into a sparse matrix that
	# keeps the same dimensions and row/column names, to feed into the SVM.
	# sparseMatrix() is provided by the Matrix package, which none of the
	# library() calls visible in this script attach, so it is called through an
	# explicit namespace to avoid a "could not find function" error.
	sparse_DTM <- Matrix::sparseMatrix(
	  i = DTM$i,
	  j = DTM$j,
	  x = DTM$v,
	  dims = dim(DTM),
	  dimnames = list(rownames(DTM), colnames(DTM))
	)
	# Densify the sparse document-term matrix and turn it into a data.frame
	# (one column per term).
	data.DTM <- as.data.frame(
	  as.matrix(sparse_DTM)
	)

	# Attach the labels from `data` as an extra column.
	# NOTE(review): if the vocabulary contains the literal term "label", that
	# term column is overwritten by this assignment -- confirm this cannot happen.
	data.DTM[["label"]] <- data$label
	# Draw a single 50% partition of the row indices based on the label column.
	# list = FALSE makes createDataPartition() return the indices as a matrix
	# rather than a list.
	splitIndex <- createDataPartition(
	  data.DTM$label,
	  p = 0.5,
	  list = FALSE,
	  times = 1
	)

	# Split the document-term data: selected rows train, remaining rows test.
	trainset <- data.DTM[splitIndex, ]
	testset <- data.DTM[-splitIndex, ]

	# Apply the same row split to the original (untransformed) data.
	traindata <- data[splitIndex, ]
	testdata <- data[-splitIndex, ]
	# Print the relative frequency of each label value in the training set and
	# in the test set, so the class balance of the two partitions can be compared.
	prop.table(table(trainset$label))
	prop.table(table(testset$label))