Skip to content

Instantly share code, notes, and snippets.

@Guibrich
Last active December 17, 2015 11:39
Show Gist options
  • Save Guibrich/5604146 to your computer and use it in GitHub Desktop.
Save Guibrich/5604146 to your computer and use it in GitHub Desktop.
# 18/05/2013
# Key words : TextMining, Elections, France, Debate, 2nd Round
# We use the qdap package (https://cran.r-project.org/package=qdap) and
# tm to perform the text-mining analysis, plus the usual graphics
# packages such as ggplot2 and RColorBrewer to make the plots look good.
suppressPackageStartupMessages(require(twitteR))
suppressPackageStartupMessages(require(XML))
suppressPackageStartupMessages(require(tm))
suppressPackageStartupMessages(require(rgdal))
suppressPackageStartupMessages(require(ggplot2))
suppressPackageStartupMessages(require(qdap))
suppressPackageStartupMessages(require(rJava))
suppressPackageStartupMessages(library(wordcloud))
library(Rstem)
# Work from the tutorial directory so the relative paths below resolve.
setwd("D:/PERSO/R_Working/Tutoriels/TextMining")

# Hollande
# Read the debate transcript into a table with the columns
# "person" and "dialogue".
debate <- read.transcript(
  "./Data/debat2tours.docx",
  col.names = c("person", "dialogue")
)

# Show a truncated preview of the transcript.
htruncdf(debate, 5, 50)

# Keep only the rows whose speaker is Hollande. which() drops NA matches,
# which is what subset() does as well.
Hollande <- debate[which(debate[["person"]] == "HOLLANDE"), , drop = FALSE]

# Stop words: the French list shipped with tm, every single letter, and a
# few extra fillers and contractions specific to this transcript.
sw <- c(
  "a", "ou",
  tm::stopwords("fr"),
  "c'est", "n'est", "s'y", "qu'on", "s'il", "ah",
  letters,
  "ca", "n'y", "d'un", "monsieur"
)
# Build a cleaned tm corpus from `df`.
#
# Arguments:
#   df            Object handed unchanged to tm::VectorSource().
#   my.stopwords  Character vector of additional words to remove, on top of
#                 the French list from tm::stopwords("fr"). Defaults to none.
#
# Returns:
#   The corpus after lower-casing, stop-word removal, punctuation removal and
#   number removal. No stemming is applied.
#
# NOTE(review): the call site passes a whole data frame, so every column (not
# only the dialogue text) is offered to VectorSource() -- confirm that this is
# intended.
generateCorpus <- function(df, my.stopwords = c()) {
  corpus <- tm::Corpus(
    tm::VectorSource(df),
    readerControl = list(language = "fr")
  )

  # Lower-case first so stop-word matching is case-insensitive. tolower is
  # wrapped in content_transformer() (tm >= 0.6) so the documents stay tm text
  # documents; a bare character function breaks the later steps.
  corpus <- tm::tm_map(corpus, tm::content_transformer(tolower))

  # Remove stop words BEFORE stripping punctuation: entries such as "c'est"
  # or "qu'on" contain an apostrophe and can no longer match once the
  # punctuation is gone. stopwords() is qualified with tm:: so the tm word
  # list is used even if another attached package masks the name.
  corpus <- tm::tm_map(corpus, tm::removeWords, tm::stopwords("fr"))
  if (length(my.stopwords) > 0) {
    corpus <- tm::tm_map(corpus, tm::removeWords, my.stopwords)
  }

  corpus <- tm::tm_map(corpus, tm::removePunctuation)
  corpus <- tm::tm_map(corpus, tm::removeNumbers)

  # Stemming is left disabled here:
  # corpus <- tm::tm_map(corpus, tm::stemDocument, language = "french")

  # Return the corpus explicitly instead of relying on a trailing assignment.
  corpus
}
# Clean the Hollande subset using the custom stop-word list.
HollandeCorpus <- generateCorpus(Hollande, sw)

# We build a Term Document Matrix and sum each term's row to get its total
# frequency, most frequent terms first.
H.tdm <- TermDocumentMatrix(HollandeCorpus)
H.m <- as.matrix(H.tdm)
H.v <- sort(rowSums(H.m), decreasing = TRUE)
H.d <- data.frame(word = names(H.v), freq = H.v)

# Keep the terms that occur between 3 and 90 times (bounds included).
H.d <- subset(H.d, freq >= 3 & freq <= 90)

# Stem every remaining term. The terms are also copied from the row names
# into a regular column, otherwise they would be lost when aggregating.
H.d$stem <- wordStem(row.names(H.d), language = "french")
H.d$word <- row.names(H.d)

# Per stem: total frequency, and the first term found as its display label.
agg_freq <- stats::aggregate(freq ~ stem, data = H.d, FUN = sum)
agg_word <- stats::aggregate(
  word ~ stem,
  data = H.d,
  FUN = function(x) x[1]
)
forW <- cbind(freq = agg_freq[["freq"]], agg_word)

# Sort by frequency, highest first.
forW <- forW[order(forW$freq, decreasing = TRUE), ]

# Wordcloud, written to a PNG file.
col <- brewer.pal(8, "Dark2")
png("wordcloud_Hollande.png", width = 1280, height = 800)
wordcloud(
  forW$word,
  forW$freq,
  scale = c(8, .2),
  min.freq = 5,
  max.words = Inf,
  random.order = FALSE,
  rot.per = .20,
  colors = col
)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment