@suryadutta
Last active November 13, 2017 02:20
Extract 10,000 useful words from CSV
#install.packages('tm')
library(tm)   #text mining framework
#install.packages('slam')
library(slam) #sparse matrix utilities: row_sums(), col_sums()
#import merged Stats StackExchange posts; CleanBody holds the post text used as documents
alldata <- read.csv('stackexchange/20161215StatsPostsMerged.csv', header = TRUE, stringsAsFactors = FALSE)
#make corpus
corp <- Corpus(VectorSource(alldata$CleanBody))
#collapse extra whitespace; lowercasing, stemming, etc. happen in the DTM control below
corp <- tm_map(corp, stripWhitespace)
dtm <- DocumentTermMatrix(corp,
                          control = list(
                            tolower = TRUE,
                            stemming = TRUE,
                            stopwords = TRUE,
                            wordLengths = c(3, Inf), #tm >= 0.6 ignores minWordLength; wordLengths replaces it
                            removeNumbers = TRUE,
                            removePunctuation = TRUE))
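#optional sanity peek (not part of the original workflow): size of the raw
#DTM as (documents, terms) before any filtering
#print(dim(dtm))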
#compute a mean tf-idf score per term: average within-document relative frequency
#(over the documents containing the term) times log2 of the inverse document frequency
term_tfidf <-
  tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
  log2(nDocs(dtm)/col_sums(dtm > 0))
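#quick sanity check (safe by construction: every DTM column has at least one
#nonzero entry, so tapply yields exactly one score per term)
stopifnot(length(term_tfidf) == nTerms(dtm))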
#keep only terms with tf-idf at or above the median; this drops terms that are
#frequent across many documents and carry little signal
dtm <- dtm[, term_tfidf >= median(term_tfidf)]
#drop documents (rows, not terms) left empty by the term filtering
dtm <- dtm[row_sums(dtm) > 0, ]
#build a term-frequency table from the reduced DTM, sorted by total count
FreqMat <- data.frame(ST = colnames(dtm),
                      Freq = col_sums(dtm),
                      row.names = NULL)
FreqMat <- FreqMat[order(-FreqMat$Freq), ]
#drop any plain stopwords that survived stemming
FreqMat <- FreqMat[!FreqMat$ST %in% stopwords("en"), ]
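#optional: eyeball the most frequent surviving terms before truncating
#head(FreqMat, 10)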
#keep the 10,000 most frequent terms as the vocabulary
vocabList <- head(FreqMat, 10000)
#write the vocabulary to CSV; row.names = FALSE avoids an extra index column
write.csv(vocabList, file = 'vocab.csv', row.names = FALSE)
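#example downstream use (hypothetical, not from the original gist): reload the
#vocabulary as a plain character vector
#vocab <- read.csv('vocab.csv', stringsAsFactors = FALSE)$ST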