Skip to content

Instantly share code, notes, and snippets.

# Start the cloudera-scm-server service (requires root, hence sudo).
$ sudo service cloudera-scm-server start
# Follow the server's log file; tail -f keeps printing lines as they are appended.
$ sudo tail -f /var/log/cloudera-scm-server/cloudera-scm-server.log
127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
10.240.0.3 cluster-dn1.c.symmetric-rune-115401.internal cluster-dn1
10.240.0.4 cluster-dn2.c.symmetric-rune-115401.internal cluster-dn2
10.240.0.5 cluster-dn3.c.symmetric-rune-115401.internal cluster-dn3
10.240.0.2 cluster-cm.c.symmetric-rune-115401.internal cluster-cm # Added by Google
~
# Remove retweets and tweets that contain URLs.
#
# The terms are regular expressions matched case-sensitively against df$text.
# "RT" is anchored on word boundaries so that only the stand-alone retweet
# marker matches, not capitalised words that merely contain those letters
# (e.g. "ALERT", "SPORTS"). "http" is deliberately left unanchored so that a
# URL glued to the preceding text is still caught.
CaseSensitive_FilterTerms <- c("\\bRT\\b", "http")
filter_regex <- paste(CaseSensitive_FilterTerms, collapse = "|")
# grepl() returns FALSE for NA text, so rows with missing text are kept.
df <- filter(df, !grepl(filter_regex, df$text))
#create new dataframe and initialize with first record
df.noMentions <- df[1,]
#append rows that do not start with @mention
for(j in 1:nrow(df)){
text <- c(df$text[j])
# Remove retweets and tweets that contain URLs.
#
# The terms are regular expressions matched case-sensitively against df$text.
# "RT" is anchored on word boundaries so that only the stand-alone retweet
# marker matches, not capitalised words that merely contain those letters
# (e.g. "ALERT", "SPORTS"). "http" is deliberately left unanchored so that a
# URL glued to the preceding text is still caught.
CaseSensitive_FilterTerms <- c("\\bRT\\b", "http")
filter_regex <- paste(CaseSensitive_FilterTerms, collapse = "|")
# grepl() returns FALSE for NA text, so rows with missing text are kept.
df <- filter(df, !grepl(filter_regex, df$text))
# Remove mentions: drop only the tweets whose text starts with "@".
#
# The first character of each text is tested directly, so no temporary helper
# column has to be added to df and deleted again (assigning a scalar to a new
# column also fails on a data frame with zero rows). The match is a literal
# one (fixed = TRUE). NA text yields FALSE from grepl(), so such rows are kept.
df <- filter(df, !grepl("@", substr(df$text, 1, 1), fixed = TRUE))
# Packages attached for this script.
# NOTE(review): which package supplies which call is not visible from here;
# confirm before pruning any of them.
library(caret)
library(e1071)
library(rpart)
library(RTextTools)
library(tm)
library(DMwR)
# Fix the random seed so that later random steps give the same result on
# every run.
set.seed(1234)
# Build the corpus: one document per element of data$text, with the reader
# language declared as English.
MyCorpus <- VCorpus(
  VectorSource(data$text),
  readerControl = list(language = "en")
)
content(MyCorpus[[1]])  # look at the first document as loaded

# Preprocessing: lower-case the content of every document.
MyCorpus <- tm_map(MyCorpus, content_transformer(tolower))
content(MyCorpus[[1]])  # the same document after lower-casing

# Build the document-term matrix from the preprocessed corpus, passing
# global bounds of c(0, Inf) in the control list.
DTM <- DocumentTermMatrix(
  MyCorpus,
  control = list(bounds = list(global = c(0, Inf)))
)
dim(DTM)
# Re-express the document-term matrix as a Matrix-package sparse matrix
# (intended as SVM input), built from the DTM's triplet fields: i = row
# indices, j = column indices, v = values.
#
# sparseMatrix() is called with an explicit Matrix:: prefix so the call does
# not depend on some other package happening to attach Matrix.
sparse_DTM <- Matrix::sparseMatrix(
  i = DTM$i,
  j = DTM$j,
  x = DTM$v,
  dims = dim(DTM),
  dimnames = list(rownames(DTM), colnames(DTM))
)

# Densify and convert to a data frame with one column per term.
data.DTM <- as.data.frame(as.matrix(sparse_DTM))

# Append the label column taken from the original data.
# NOTE(review): if the vocabulary contains the term "label", this assignment
# overwrites that term's column. Confirm whether that can happen.
data.DTM$label <- data$label
# Split the rows in half, partitioning on the label column. With
# list = FALSE and times = 1 a single set of row indices is returned.
splitIndex <- createDataPartition(
  data.DTM$label,
  p = .50,
  list = FALSE,
  times = 1
)

# Rows picked by the partition form the training set; the rest form the test
# set.
trainset <- data.DTM[splitIndex, ]
testset <- data.DTM[-splitIndex, ]

# Apply the same row split to the original data.
traindata <- data[splitIndex, ]
testdata <- data[-splitIndex, ]
@zunman
zunman / smote_svm_5.R
Last active September 11, 2016 21:35
# Share of each label value in the training and test sets, shown one after
# the other so the two distributions can be compared.
train_label_counts <- table(trainset$label)
prop.table(train_label_counts)
test_label_counts <- table(testset$label)
prop.table(test_label_counts)