Skip to content

Instantly share code, notes, and snippets.

@s13731105
Created July 25, 2014 14:05
Show Gist options
  • Save s13731105/238be2e70933a2744245 to your computer and use it in GitHub Desktop.
Save s13731105/238be2e70933a2744245 to your computer and use it in GitHub Desktop.
basic Text Mining
# Download the UNT policy 3-5 page and extract the lines that follow the
# " TOTAL UNIVERSITY </div>" marker, with their <p>/</p> tags stripped.
# The only object this stage leaves in the workspace is the character
# vector `text.d`; every intermediate is removed.
policy.HTML.page <- readLines("http://policy.unt.edu/policy/3-5")
length(policy.HTML.page)
policy.HTML.page[186:202]  # exploratory peek at a fixed slice of the raw HTML

# Locate the marker line by exact string match. Without the checks below a
# missing marker surfaces later as an opaque "argument of length 0" error
# from `:`, and a repeated marker is resolved to its first hit with only a
# generic warning.
marker.hits <- which(policy.HTML.page == " TOTAL UNIVERSITY </div>")
if (length(marker.hits) == 0L) {
  stop(
    "Marker line ' TOTAL UNIVERSITY </div>' not found in the downloaded ",
    "page; the page layout may have changed.",
    call. = FALSE
  )
}
if (length(marker.hits) > 1L) {
  warning(
    "Marker line ' TOTAL UNIVERSITY </div>' found ", length(marker.hits),
    " times; using the first occurrence.",
    call. = FALSE
  )
  marker.hits <- marker.hits[1L]
}

# The wanted text starts 3 lines after the marker and spans 6 lines.
first.line <- marker.hits + 3L
last.line <- first.line + 5L
text.data <- policy.HTML.page[first.line:last.line]
rm(policy.HTML.page, marker.hits, first.line, last.line)
text.data

# Strip opening tags first, then closing tags (case-insensitive regex).
text.d <- gsub(
  pattern = "<p>", replacement = "", x = text.data, ignore.case = TRUE
)
text.d <- gsub(
  pattern = "</p>", replacement = "", x = text.d, ignore.case = TRUE
)
rm(text.data)
text.d
library(tm)

# Wrap the cleaned character vector `text.d` in a tm corpus. The source
# vector is dropped from the workspace as soon as the corpus has been built.
txt.corpus <- Corpus(VectorSource(text.d))
rm(text.d)
inspect(txt.corpus)

# Cleaning passes that take no extra arguments, applied in this order:
# lower-casing, punctuation removal, digit removal. tolower is wrapped in
# content_transformer(); an earlier unwrapped tm_map(txt.corpus, tolower)
# call was disabled in favour of this form.
basic.cleaners <- list(
  content_transformer(tolower),
  removePunctuation,
  removeNumbers
)
for (cleaner in basic.cleaners) {
  txt.corpus <- tm_map(txt.corpus, cleaner)
}
rm(basic.cleaners, cleaner)

# Drop English stop words.
english.stopwords <- stopwords("english")
txt.corpus <- tm_map(txt.corpus, removeWords, english.stopwords)
rm(english.stopwords)

# SnowballC is attached just for the stemming pass and detached again
# straight afterwards.
library(SnowballC)
txt.corpus <- tm_map(txt.corpus, stemDocument)
detach("package:SnowballC")
inspect(txt.corpus)

# Collapse runs of whitespace left behind by the removals above, then show
# the final state of the corpus.
txt.corpus <- tm_map(txt.corpus, stripWhitespace)
inspect(txt.corpus)
# Build the term-document matrix from the cleaned corpus `txt.corpus` and
# explore it: a preview of the first terms, frequent terms, associations for
# one query word, and two sparsity-filtered views.
tdm <- TermDocumentMatrix(txt.corpus)

# Preview at most the first 20 terms. A fixed 1:20 index reaches past the
# last row whenever the matrix holds fewer than 20 terms.
inspect(tdm[seq_len(min(20L, nrow(tdm))), ])

# Terms occurring at least 8 times overall (no upper bound).
findFreqTerms(x = tdm, lowfreq = 8, highfreq = Inf)

# The corpus was stemmed before this matrix was built, so its rows are word
# stems. Push the query word through the same stemmer; otherwise the surface
# form "compute" may not correspond to any row and the lookup comes back
# empty. Arguments are passed positionally to avoid relying on partial
# matching of `term` against findAssocs()'s `terms` parameter.
query.term <- stemDocument("compute")
findAssocs(tdm, query.term, 0.6)
rm(query.term)

# Two reduced matrices at sparsity thresholds 0.60 and 0.20; the lower the
# threshold, the fewer sparse terms survive.
tdm.common.60 <- removeSparseTerms(x = tdm, sparse = 0.60)
tdm.common.20 <- removeSparseTerms(x = tdm, sparse = 0.20)

# Summaries of the full and reduced matrices, then the reduced contents.
tdm
tdm.common.60
tdm.common.20
inspect(tdm.common.60)
inspect(tdm.common.20)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment