library(tm)
dirname <- "episodes"
rawCorpus <- Corpus(DirSource(dirname, recursive=TRUE), readerControl=list(language="en"))
my.corpus <- rawCorpus
my.stopwords <- c(stopwords("english"), "ain't", "just", "can", "get", "got", "will")
my.stopwords <- rev(my.stopwords) # Hack to apply "i'll" etc. before "i"
my.stopwords <- my.stopwords[my.stopwords != "who"] # Not a stopword. Not here.
my.corpus <- tm_map(my.corpus, content_transformer(tolower)) # content_transformer() keeps the corpus class intact in tm >= 0.6
my.corpus <- tm_map(my.corpus, removeWords, my.stopwords)
my.corpus <- tm_map(my.corpus, removePunctuation)
my.corpus <- tm_map(my.corpus, stripWhitespace)
#my.corpus <- tm_map(my.corpus, stemDocument)
my.dtm <- DocumentTermMatrix(my.corpus)
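# Optional sanity check (a sketch; the threshold of 100 is an arbitrary example):
# list the terms that survived cleanup and occur at least that often overall.
findFreqTerms(my.dtm, lowfreq=100)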
### Dendrogram
library(ape)
library(RColorBrewer) # needed for brewer.pal() below
dend.dtm <- removeSparseTerms(my.dtm, sparse=0.65)
dend.df <- as.data.frame(as.matrix(dend.dtm)) # Rows are input files, cols are words; as.matrix() instead of inspect(), which only prints in tm >= 0.6
rn <- rownames(dend.df)
rownames(dend.df) <- gsub("\\.txt", "", rn) # remove suffix from labels
rm(rn)
#dend.df <- as.data.frame(t(as.matrix(dend.dtm)))
dend.df.scale <- scale(dend.df) # scale and center the files/words data frame
dend.distm <- dist(dend.df.scale, method="euclidean") # distance matrix of files
dend.fit <- hclust(dend.distm, method="ward.D") # "ward" was renamed to "ward.D" in R >= 3.1
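# Optional sketch before settling on a cluster count below: eyeball the largest
# merge heights of the fit; a clear jump suggests a natural cut point.
plot(rev(dend.fit$height)[1:15], type="b", ylab="Merge height")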
# Via http://gastonsanchez.wordpress.com/2012/10/03/7-ways-to-plot-dendrograms-in-r/
dend.nclust <- 7
dend.pal <- brewer.pal(dend.nclust, "Dark2")
dend.clust <- cutree(dend.fit, dend.nclust)
#op <- par(bg="#E8DDCB")
png("ep_dendrogram.png", width=1200, height=1200)
# svg("ep_dendrogram.svg", width=3000, height=3000)
# plot(dend.fit) # display dendrogram
# plot(dend.fit, axes=F, ylab="")
plot(as.phylo(dend.fit), type="fan", tip.color=dend.pal[dend.clust], label.offset=1, col="red", cex=1.1)
dev.off()
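# Alternative rendering (a sketch): a plain rectangular dendrogram with the same
# clusters boxed via rect.hclust() from base stats; the file name is hypothetical.
png("ep_dendrogram_rect.png", width=1200, height=600)
plot(dend.fit, cex=0.8)
rect.hclust(dend.fit, k=dend.nclust, border=dend.pal)
dev.off()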
### Top 10 words
top10.dtm <- removeSparseTerms(my.dtm, 0.65)
top10 <- as.matrix(top10.dtm)
v <- apply(top10, 2, sum) # total frequency of each term across all episodes
v <- sort(v, decreasing=TRUE)
v1 <- sort(v[1:10])
png("terms_frequency.png", width=500, height=400)
op <- par(mar = c(3.5,2,0,0) + 2) # bottom, left, top, right
barplot(v1, horiz=TRUE, cex.names=1.0, las=1, col=rev(grey.colors(10)), main="Terms Frequency", sub="Doctor Who (2005) – Seasons 1 to 7")
par(op)
dev.off()
# vdtm <- as.data.frame(v)
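# Optional sketch: dump the full frequency table for use outside R
# ("term_frequency.csv" is a hypothetical file name).
write.csv(data.frame(term=names(v), freq=v), "term_frequency.csv", row.names=FALSE)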
### Wordcloud
library(RColorBrewer)
library(wordcloud)
ap.tdm <- TermDocumentMatrix(my.corpus)
ap.m <- as.matrix(ap.tdm)
ap.v <- sort(rowSums(ap.m), decreasing=TRUE)
ap.d <- data.frame(word=names(ap.v), freq=ap.v)
#table(ap.d$freq)
pal <- brewer.pal(9, "Blues")[-c(1:3)] # drop the lightest shades; "pal" avoids masking base palette()
png("wordcloud_all.png", width=1500, height=900)
wordcloud(ap.d$word, ap.d$freq, scale=c(13,.5), min.freq=3,
          max.words=150, random.order=FALSE, rot.per=.15, colors=pal)
dev.off()
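# A possible follow-up (sketch): terms correlated with a word of interest in the
# document-term matrix; "doctor" and the 0.7 correlation cutoff are arbitrary examples.
findAssocs(my.dtm, "doctor", 0.7)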
### N-grams
cnt.n <- 3
cnt.topn <- 10
library(tau) # provides textcnt(); library(textcat) is not needed here
cnt.corpus <- rawCorpus
cnt.corpus <- tm_map(cnt.corpus, content_transformer(tolower))
cnt.corpus <- tm_map(cnt.corpus, removeWords, c("a","an","the"))
cnt.corpus <- tm_map(cnt.corpus, removePunctuation)
cnt.corpus <- tm_map(cnt.corpus, stripWhitespace)
cnt.text <- textcnt(lapply(cnt.corpus, as.character), method="string", n=cnt.n, decreasing=TRUE) # textcnt() expects (a list of) character vectors, not a corpus
#cnt.text[1:50]
cnt.d <- data.frame(word=names(cnt.text), freq=unclass(cnt.text))
cnt.v1 <- sort(head(cnt.text, n=cnt.topn))
png(sprintf("ngrams_%s_plot.png", cnt.n), width=700, height=cnt.topn*30)
# Set margins to prevent names overflowing
op <- par(mar = c(4, cnt.n*2, 0, 0) + 2) # bottom, left, top, right
barplot(cnt.v1, horiz=TRUE, main=sprintf("Top %s-grams", cnt.n), sub="Doctor Who (2005) – Seasons 1 to 7",
        cex.names=1, beside=F,
        las=1, col=rev(grey.colors(cnt.topn)))
par(op)
dev.off()
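# The same pipeline can be rerun for other n-gram sizes (a sketch): e.g. bigrams,
# reusing cnt.corpus from above; cnt.text2 is a hypothetical name.
cnt.text2 <- textcnt(lapply(cnt.corpus, as.character), method="string", n=2, decreasing=TRUE)
head(cnt.text2, 10)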
### Sentences
library(NLP)
library(openNLP)
# Via http://stackoverflow.com/a/18790617/240963
reshape_corpus <- function(current.corpus, FUN, ...) {
  # Extract the text from each document in the corpus and put it into a list
  text <- lapply(current.corpus, NLP::content) # tm >= 0.6 exposes content(); older versions had Content()
  # Transform the text with the supplied function
  docs <- lapply(text, FUN, ...)
  docs <- as.vector(unlist(docs))
  # Create a new corpus structure and return it
  new.corpus <- Corpus(VectorSource(docs))
  return(new.corpus)
}
convert_text_to_sentences <- function(text, lang = "en") {
  # Compute sentence annotations with the Apache OpenNLP Maxent sentence
  # detector, using the default model for the given language
  sentence_token_annotator <- Maxent_Sent_Token_Annotator(language = lang)
  # Convert the text to class String from package NLP
  text <- as.String(text)
  # Find sentence boundaries in the text
  sentence.boundaries <- annotate(text, sentence_token_annotator)
  # Extract and return the sentences
  sentences <- text[sentence.boundaries]
  return(sentences)
}
# Extract the text from each document in the corpus and put it into a list
# rawText <- lapply(rawCorpus, NLP::content)
# head(toString(rawCorpus))
# rawText <- paste0(rawText[1], rawText[2], rawText[3])
# txt1 <- rawText[[1]]
# paste0(as.String(txt1))
# sentences <- lapply(rawText, convert_text_to_sentences)
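# Putting the two helpers together (a sketch of the intended usage, per the
# Stack Overflow answer above): split the raw corpus into one-sentence documents.
# sentCorpus is a hypothetical name.
sentCorpus <- reshape_corpus(rawCorpus, convert_text_to_sentences)
inspect(sentCorpus[1:3]) # peek at the first few sentences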