Instantly share code, notes, and snippets.

Embed
What would you like to do?
### Corpus preparation -------------------------------------------------------
# Read every transcript under "episodes/" into a tm corpus and normalise it:
# lower-case, stopword removal, punctuation stripping, whitespace squashing.
library(tm)

dirname <- "episodes"
rawCorpus <- Corpus(DirSource(dirname, recursive = TRUE),
                    readerControl = list(language = "en"))

# Extra stopwords on top of tm's English list: transcript colloquialisms
# that would otherwise dominate the term counts.
my.stopwords <- c(stopwords("english"), "ain't", "just", "can", "get", "got", "will")
my.stopwords <- rev(my.stopwords) # Hack to apply "i'll" etc. before "i"
my.stopwords <- my.stopwords[my.stopwords != "who"] # "who" is data here, not noise

my.corpus <- rawCorpus
# FIX: with tm >= 0.6, mapping a plain base function such as tolower() over a
# corpus corrupts the document structure; wrap it in content_transformer().
my.corpus <- tm_map(my.corpus, content_transformer(tolower))
my.corpus <- tm_map(my.corpus, removeWords, my.stopwords)
my.corpus <- tm_map(my.corpus, removePunctuation)
my.corpus <- tm_map(my.corpus, stripWhitespace)
#my.corpus <- tm_map(my.corpus, stemDocument)

# Document-term matrix: one row per episode file, one column per term.
my.dtm <- DocumentTermMatrix(my.corpus)
### Dendrogram ---------------------------------------------------------------
# Cluster episodes by their (scaled) term profiles and draw a fan dendrogram.
library(ape)
library(RColorBrewer) # FIX: brewer.pal() was used before the package was loaded

dend.dtm <- removeSparseTerms(my.dtm, sparse = 0.65)
# FIX: inspect() is for printing; as.matrix() is the supported conversion.
dend.df <- as.data.frame(as.matrix(dend.dtm)) # rows are input files, cols are words
# Strip the ".txt" suffix from labels (anchored so only a trailing suffix goes).
rownames(dend.df) <- gsub("\\.txt$", "", rownames(dend.df))
#dend.df <- as.data.frame( t(as.matrix(dend.dtm)))
dend.df.scale <- scale(dend.df)                          # scale and center files/words
dend.distm <- dist(dend.df.scale, method = "euclidean")  # distance matrix of files
# FIX: method = "ward" is deprecated since R 3.1; "ward.D" reproduces the old
# behaviour (see ?hclust).
dend.fit <- hclust(dend.distm, method = "ward.D")

# Via http://gastonsanchez.wordpress.com/2012/10/03/7-ways-to-plot-dendrograms-in-r/
dend.nclust <- 7
dend.pal <- brewer.pal(dend.nclust, "Dark2")
dend.clust <- cutree(dend.fit, dend.nclust)
#op = par(bg="#E8DDCB")
png("ep_dendogram.png", width = 1200, height = 1200)
# svg("ep_dendogram.svg", width=3000,height=3000)
# plot(dend.fit) # display dendrogram
# plot(dend.fit, axes=F, ylab="")
plot(as.phylo(dend.fit), type = "fan", tip.color = dend.pal[dend.clust],
     label.offset = 1, col = "red", cex = 1.1)
dev.off()
### Top 10 words -------------------------------------------------------------
# Bar chart of the ten most frequent non-sparse terms across all episodes.
top10.dtm <- removeSparseTerms(my.dtm, 0.65)
top10 <- as.matrix(top10.dtm)
# FIX: colSums() replaces the slower, less idiomatic apply(x, 2, sum).
v <- sort(colSums(top10), decreasing = TRUE)
v1 <- sort(v[1:10]) # ascending so barplot draws the biggest bar on top
png("terms_frequency.png", width = 500, height = 400)
op <- par(mar = c(3.5, 2, 0, 0) + 2) # bottom, left, top, right
barplot(v1, horiz = TRUE, cex.names = 1.0, las = 1,
        col = rev(grey.colors(10)), main = "Terms Frequency",
        sub = "Doctor Who (2005) – Seasons 1 to 7")
par(op)
dev.off()
# vdtm <- as.data.frame(v)
### Wordcloud ----------------------------------------------------------------
# Word cloud of overall term frequencies in the cleaned corpus.
library(RColorBrewer)
library(wordcloud)

# Frequency of every term, most frequent first.
wc.tdm <- TermDocumentMatrix(my.corpus)
wc.freq <- sort(rowSums(as.matrix(wc.tdm)), decreasing = TRUE)
wc.words <- data.frame(word = names(wc.freq), freq = wc.freq)
#table(wc.words$freq)
# Drop the three lightest blues so faint terms remain legible.
wc.palette <- brewer.pal(9, "Blues")[-c(1:3)]
png("wordcloud_all.png", width = 1500, height = 900)
wordcloud(wc.words$word, wc.words$freq, scale = c(13, .5), min.freq = 3,
          max.words = 150, random.order = FALSE, rot.per = .15,
          colors = wc.palette)
dev.off()
### N-grams ------------------------------------------------------------------
# Bar chart of the most frequent n-grams (phrases) across all episodes.
library(textcat)
library(tau)

cnt.n <- 3     # n-gram size
cnt.topn <- 10 # how many of the most frequent n-grams to plot

# Re-clean from the raw corpus: keep most stopwords (they carry meaning inside
# phrases) and strip only the articles.
cnt.corpus <- rawCorpus
# FIX: with tm >= 0.6 base functions must be wrapped in content_transformer().
cnt.corpus <- tm_map(cnt.corpus, content_transformer(tolower))
cnt.corpus <- tm_map(cnt.corpus, removeWords, c("a", "an", "the"))
cnt.corpus <- tm_map(cnt.corpus, removePunctuation)
cnt.corpus <- tm_map(cnt.corpus, stripWhitespace)

# FIX: tau::textcnt() expects character data, not a corpus object; with
# tm >= 0.6 the document text must be extracted explicitly.
cnt.text <- textcnt(unlist(lapply(cnt.corpus, as.character)),
                    method = "string", n = cnt.n, decreasing = TRUE)
#cnt.text[1:50]
cnt.d <- data.frame(word = names(cnt.text), freq = unclass(cnt.text))
cnt.v1 <- sort(head(cnt.text, n = cnt.topn)) # ascending for barplot ordering
png(sprintf("ngrams_%s_plot.png", cnt.n), width = 700, height = cnt.topn * 30)
# Set margins to prevent names overflow
op <- par(mar = c(4, cnt.n * 2, 0, 0) + 2) # bottom, left, top, right
barplot(cnt.v1, horiz = TRUE, main = sprintf("Top %s-grams", cnt.n),
        sub = "Doctor Who (2005) – Seasons 1 to 7",
        cex.names = 1, beside = FALSE, las = 1,
        col = rev(grey.colors(cnt.topn)))
par(op)
dev.off()
### Sentences
library(NLP)
library(openNLP)
# Via http://stackoverflow.com/a/18790617/240963
# Via http://stackoverflow.com/a/18790617/240963
# Rebuild a corpus after transforming each document's text with FUN.
#
# current.corpus: a tm corpus.
# FUN: function applied to each document's text (e.g. a sentence splitter);
#      may return several strings per document.
# ...: forwarded to FUN.
# Returns a new Corpus with one document per string FUN produced.
reshape_corpus <- function(current.corpus, FUN, ...) {
  # Extract the plain text from each document in the corpus into a list.
  # FIX: the accessor in tm >= 0.6 is content(); `Content` is undefined there.
  text <- lapply(current.corpus, content)
  # Transform each document's text (may expand one document into many chunks)
  docs <- lapply(text, FUN, ...)
  docs <- as.vector(unlist(docs))
  # Create a new corpus structure and return it
  new.corpus <- Corpus(VectorSource(docs))
  return(new.corpus)
}
# Split a text into sentences with the Apache OpenNLP Maxent sentence
# detector, using the default model for the given language.
#
# text: character input (anything as.String() accepts).
# lang: language code passed to the annotator (default "en").
# Returns a character vector, one element per detected sentence.
convert_text_to_sentences <- function(text, lang = "en") {
  annotator <- Maxent_Sent_Token_Annotator(language = lang)
  # NLP's String class supports extraction by annotation spans.
  text <- as.String(text)
  # Locate the sentence boundaries, then slice the sentences out.
  boundaries <- annotate(text, annotator)
  text[boundaries]
}
# Extract the text from each document in the corpus and put into a list
# rawText <- lapply(rawCorpus, Content)
# head(toString(rawCorpus))
# rawText <- paste0(rawText[1],rawText[2],rawText[3])
# txt1 <- rawText[[1]]
# paste0(as.String(txt1))
# sentences <- lapply(rawText, convert_text_to_sentences)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment