Instantly share code, notes, and snippets.

Embed
What would you like to do?
### Corpus preparation -------------------------------------------------------
# Read every transcript under "episodes/" into a tm corpus and normalise it:
# lower-case, stopword removal, punctuation stripping, whitespace squashing.
library(tm)

dirname <- "episodes"
rawCorpus <- Corpus(DirSource(dirname, recursive = TRUE),
                    readerControl = list(language = "en"))

# Extra stopwords on top of tm's English list: transcript colloquialisms
# that would otherwise dominate the term counts.
my.stopwords <- c(stopwords("english"), "ain't", "just", "can", "get", "got", "will")
my.stopwords <- rev(my.stopwords) # Hack to apply "i'll" etc. before "i"
my.stopwords <- my.stopwords[my.stopwords != "who"] # "who" is data here, not noise

my.corpus <- rawCorpus
# FIX: with tm >= 0.6, mapping a plain base function such as tolower() over a
# corpus corrupts the document structure; wrap it in content_transformer().
my.corpus <- tm_map(my.corpus, content_transformer(tolower))
my.corpus <- tm_map(my.corpus, removeWords, my.stopwords)
my.corpus <- tm_map(my.corpus, removePunctuation)
my.corpus <- tm_map(my.corpus, stripWhitespace)
#my.corpus <- tm_map(my.corpus, stemDocument)

# Document-term matrix: one row per episode file, one column per term.
my.dtm <- DocumentTermMatrix(my.corpus)
### Dendrogram ---------------------------------------------------------------
# Cluster episodes by their (scaled) term profiles and draw a fan dendrogram.
library(ape)
library(RColorBrewer) # FIX: brewer.pal() was used before the package was loaded

dend.dtm <- removeSparseTerms(my.dtm, sparse = 0.65)
# FIX: inspect() is for printing; as.matrix() is the supported conversion.
dend.df <- as.data.frame(as.matrix(dend.dtm)) # rows are input files, cols are words
# Strip the ".txt" suffix from labels (anchored so only a trailing suffix goes).
rownames(dend.df) <- gsub("\\.txt$", "", rownames(dend.df))
#dend.df <- as.data.frame( t(as.matrix(dend.dtm)))
dend.df.scale <- scale(dend.df)                          # scale and center files/words
dend.distm <- dist(dend.df.scale, method = "euclidean")  # distance matrix of files
# FIX: method = "ward" is deprecated since R 3.1; "ward.D" reproduces the old
# behaviour (see ?hclust).
dend.fit <- hclust(dend.distm, method = "ward.D")

# Via http://gastonsanchez.wordpress.com/2012/10/03/7-ways-to-plot-dendrograms-in-r/
dend.nclust <- 7
dend.pal <- brewer.pal(dend.nclust, "Dark2")
dend.clust <- cutree(dend.fit, dend.nclust)
#op = par(bg="#E8DDCB")
png("ep_dendogram.png", width = 1200, height = 1200)
# svg("ep_dendogram.svg", width=3000,height=3000)
# plot(dend.fit) # display dendrogram
# plot(dend.fit, axes=F, ylab="")
plot(as.phylo(dend.fit), type = "fan", tip.color = dend.pal[dend.clust],
     label.offset = 1, col = "red", cex = 1.1)
dev.off()
### Top 10 words -------------------------------------------------------------
# Bar chart of the ten most frequent non-sparse terms across all episodes.
top10.dtm <- removeSparseTerms(my.dtm, 0.65)
top10 <- as.matrix(top10.dtm)
# FIX: colSums() replaces the slower, less idiomatic apply(x, 2, sum).
v <- sort(colSums(top10), decreasing = TRUE)
v1 <- sort(v[1:10]) # ascending so barplot draws the biggest bar on top
png("terms_frequency.png", width = 500, height = 400)
op <- par(mar = c(3.5, 2, 0, 0) + 2) # bottom, left, top, right
barplot(v1, horiz = TRUE, cex.names = 1.0, las = 1,
        col = rev(grey.colors(10)), main = "Terms Frequency",
        sub = "Doctor Who (2005) – Seasons 1 to 7")
par(op)
dev.off()
# vdtm <- as.data.frame(v)
### Wordcloud ----------------------------------------------------------------
# Word cloud of overall term frequencies in the cleaned corpus.
library(RColorBrewer)
library(wordcloud)

# Frequency of every term, most frequent first.
wc.tdm <- TermDocumentMatrix(my.corpus)
wc.freq <- sort(rowSums(as.matrix(wc.tdm)), decreasing = TRUE)
wc.words <- data.frame(word = names(wc.freq), freq = wc.freq)
#table(wc.words$freq)
# Drop the three lightest blues so faint terms remain legible.
wc.palette <- brewer.pal(9, "Blues")[-c(1:3)]
png("wordcloud_all.png", width = 1500, height = 900)
wordcloud(wc.words$word, wc.words$freq, scale = c(13, .5), min.freq = 3,
          max.words = 150, random.order = FALSE, rot.per = .15,
          colors = wc.palette)
dev.off()
### N-grams ------------------------------------------------------------------
# Bar chart of the most frequent n-grams (phrases) across all episodes.
library(textcat)
library(tau)

cnt.n <- 3     # n-gram size
cnt.topn <- 10 # how many of the most frequent n-grams to plot

# Re-clean from the raw corpus: keep most stopwords (they carry meaning inside
# phrases) and strip only the articles.
cnt.corpus <- rawCorpus
# FIX: with tm >= 0.6 base functions must be wrapped in content_transformer().
cnt.corpus <- tm_map(cnt.corpus, content_transformer(tolower))
cnt.corpus <- tm_map(cnt.corpus, removeWords, c("a", "an", "the"))
cnt.corpus <- tm_map(cnt.corpus, removePunctuation)
cnt.corpus <- tm_map(cnt.corpus, stripWhitespace)

# FIX: tau::textcnt() expects character data, not a corpus object; with
# tm >= 0.6 the document text must be extracted explicitly.
cnt.text <- textcnt(unlist(lapply(cnt.corpus, as.character)),
                    method = "string", n = cnt.n, decreasing = TRUE)
#cnt.text[1:50]
cnt.d <- data.frame(word = names(cnt.text), freq = unclass(cnt.text))
cnt.v1 <- sort(head(cnt.text, n = cnt.topn)) # ascending for barplot ordering
png(sprintf("ngrams_%s_plot.png", cnt.n), width = 700, height = cnt.topn * 30)
# Set margins to prevent names overflow
op <- par(mar = c(4, cnt.n * 2, 0, 0) + 2) # bottom, left, top, right
barplot(cnt.v1, horiz = TRUE, main = sprintf("Top %s-grams", cnt.n),
        sub = "Doctor Who (2005) – Seasons 1 to 7",
        cex.names = 1, beside = FALSE, las = 1,
        col = rev(grey.colors(cnt.topn)))
par(op)
dev.off()
### Sentences
library(NLP)
library(openNLP)
# Via http://stackoverflow.com/a/18790617/240963
# Via http://stackoverflow.com/a/18790617/240963
# Rebuild a corpus after transforming each document's text with FUN.
#
# current.corpus: a tm corpus.
# FUN: function applied to each document's text (e.g. a sentence splitter);
#      may return several strings per document.
# ...: forwarded to FUN.
# Returns a new Corpus with one document per string FUN produced.
reshape_corpus <- function(current.corpus, FUN, ...) {
  # Extract the plain text from each document in the corpus into a list.
  # FIX: the accessor in tm >= 0.6 is content(); `Content` is undefined there.
  text <- lapply(current.corpus, content)
  # Transform each document's text (may expand one document into many chunks)
  docs <- lapply(text, FUN, ...)
  docs <- as.vector(unlist(docs))
  # Create a new corpus structure and return it
  new.corpus <- Corpus(VectorSource(docs))
  return(new.corpus)
}
# Split a text into sentences with the Apache OpenNLP Maxent sentence
# detector, using the default model for the given language.
#
# text: character input (anything as.String() accepts).
# lang: language code passed to the annotator (default "en").
# Returns a character vector, one element per detected sentence.
convert_text_to_sentences <- function(text, lang = "en") {
  annotator <- Maxent_Sent_Token_Annotator(language = lang)
  # NLP's String class supports extraction by annotation spans.
  text <- as.String(text)
  # Locate the sentence boundaries, then slice the sentences out.
  boundaries <- annotate(text, annotator)
  text[boundaries]
}
# Extract the text from each document in the corpus and put into a list
# rawText <- lapply(rawCorpus, Content)
# head(toString(rawCorpus))
# rawText <- paste0(rawText[1],rawText[2],rawText[3])
# txt1 <- rawText[[1]]
# paste0(as.String(txt1))
# sentences <- lapply(rawText, convert_text_to_sentences)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment