Skip to content

Instantly share code, notes, and snippets.

@Zoldin
Last active July 21, 2017 20:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Zoldin/9e79c047fd8ad7aa6596b0682aca83c6 to your computer and use it in GitHub Desktop.
Save Zoldin/9e79c047fd8ad7aa6596b0682aca83c6 to your computer and use it in GitHub Desktop.
featurization.R
#!/usr/bin/Rscript
library(text2vec)
library(MASS)
library(Matrix)
# Build TF-IDF feature matrices for a train and a test data set.
#
# Usage:
#   Rscript featurization.R <train.csv> <test.csv> <train_out> <test_out>
#
# Both CSV files are read with read.csv() and must provide a `text` and a
# `label` column; an `ID` column, when present, is used as the document id.
# The vocabulary and the TF-IDF model are fitted on the train set only and
# then applied to the test set. Each output file is a sparse matrix in
# MatrixMarket format whose first column is the label.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 4) {
  stop("Four arguments must be supplied ( train file (csv format) ,test data set (csv format), train output file name and test output file name - txt files ).\n", call. = FALSE)
}

# Fail early with a clear message when a required column is absent.
# Without this check a missing `label` column makes `df$label` NULL, and
# cbind() would silently write a matrix that has no label column at all.
# `exact = FALSE` mirrors the partial matching that `$` performs below.
require_column <- function(df, column, path) {
  if (is.null(df[[column, exact = FALSE]])) {
    stop("Input file '", path, "' has no '", column, "' column.", call. = FALSE)
  }
  invisible(df)
}

# Read input files.
df_train <- read.csv(args[1], stringsAsFactors = FALSE)
df_test <- read.csv(args[2], stringsAsFactors = FALSE)
for (column in c("text", "label")) {
  require_column(df_train, column, args[1])
  require_column(df_test, column, args[2])
}

# `stop_words` is not provided by text2vec, MASS or Matrix, so unless the
# session already defines it create_vocabulary() fails with
# "object 'stop_words' not found". Fall back to a short English list
# (illustrative only -- extend it to suit the corpus).
if (!exists("stop_words")) {
  stop_words <- c(
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours"
  )
}

# Create the vocabulary (single words) from the train set.
prep_fun <- tolower
tok_fun <- word_tokenizer
it_train <- itoken(
  df_train$text,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = df_train$ID,
  progressbar = FALSE
)
vocab <- create_vocabulary(it_train, stopwords = stop_words)

# Prune the vocabulary - keep at most 5000 terms.
pruned_vocab <- prune_vocabulary(vocab, max_number_of_terms = 5000)
vectorizer <- vocab_vectorizer(pruned_vocab)
dtm_train <- create_dtm(it_train, vectorizer)

# Fit the TF-IDF model on the train set and transform it in one step.
tfidf <- TfIdf$new()
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)

# Create the test TF-IDF matrix, reusing the vocabulary and the TF-IDF
# model that were built on the train set.
it_test <- itoken(
  df_test$text,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = df_test$ID,
  progressbar = FALSE
)
dtm_test <- create_dtm(it_test, vectorizer)
dtm_test_tfidf <- transform(dtm_test, tfidf)

# Prepend the label as the first column of each matrix.
dtm_train_tfidf <- Matrix(
  cbind(label = df_train$label, dtm_train_tfidf),
  sparse = TRUE
)
dtm_test_tfidf <- Matrix(
  cbind(label = df_test$label, dtm_test_tfidf),
  sparse = TRUE
)

# Write output - TF-IDF matrices in MatrixMarket format.
writeMM(dtm_train_tfidf, args[3])
writeMM(dtm_test_tfidf, args[4])
print("Two matrices were created - one for train and one for test data set")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment