Last active
August 29, 2015 14:15
-
-
Save shawngraham/4eca2de3a4222057a7ed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## --- Setup: working directory, JVM memory, data import -----------------------
## NOTE(review): setwd() with a relative, machine-specific path is fragile;
## prefer running from the project directory or using an absolute path.
## Kept for compatibility with the original workflow.
setwd("desktop/beals-new")

## Give the JVM as much memory as you've got. mallet runs on Java via rJava,
## and java.parameters must be set BEFORE rJava is loaded to take effect.
options(java.parameters = "-Xmx5120m")
library(rJava)

## MALLET topic-modelling wrapper; see
## http://cran.r-project.org/web/packages/mallet/mallet.pdf
library(mallet)

## CND xml file transformed in browser into csv table; fetched over https.
library(RCurl)
x <- getURL("https://raw.githubusercontent.com/shawngraham/exercise/gh-pages/CND.csv")

## Read every column as character. (The original passed rep("character", 3)
## for 11 columns, relying on recycling; a single "character" is unambiguous
## and equivalent.) quote = "" disables quote handling so stray quotation
## marks inside article text do not break the parse.
documents <- read.csv(
  text = x,
  col.names = c("Article_ID", "Newspaper Title", "Newspaper City",
                "Newspaper Province", "Newspaper Country", "Year", "Month",
                "Day", "Article Type", "Text", "Keywords"),
  colClasses = "character",
  sep = ",",
  quote = ""
)
## --- Import documents into MALLET and create the model -----------------------
## Tokenize the article text with a stopword list, using a regex to strain out
## punctuation etc.: tokens are two or more letters, with internal punctuation
## (hyphens, apostrophes) allowed.
## NOTE(review): the stoplist path is machine-specific — point it at your own
## stopword file.
mallet.instances <- mallet.import(
  documents$Article_ID,
  documents$Text,
  "/Users/shawngraham/Desktop/data mining and tools/TextAnalysisWithR/data/stoplist.csv",
  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}"
)

## Set the number of desired topics and build the LDA trainer.
num.topics <- 50
topic.model <- MalletLDA(num.topics)

## Load our documents. We could also pass in the filename of a saved
## instance-list file built with the command-line tools.
topic.model$loadDocuments(mallet.instances)
## --- Vocabulary, word frequencies, and training ------------------------------
## Get the vocabulary and some statistics about word frequencies. These may be
## useful in further curating the stopword list.
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
head(word.freqs)
write.csv(word.freqs, "cnd-word-freqs.csv")

## Optimize hyperparameters every 20 iterations, after 50 burn-in iterations.
topic.model$setAlphaOptimization(20, 50)

## Now train the model. Hyperparameter optimization is on by default; we
## specify the number of sampling iterations — a large-ish round number.
topic.model$train(1000)
## --- Posterior estimates -----------------------------------------------------
## Run through a few iterations where we pick the best topic for each token,
## rather than sampling from the posterior distribution.
topic.model$maximize(10)

## Get the probability of topics in documents (documents x topics) and the
## probability of words in topics (topics x vocabulary). By default these
## functions return raw word counts; normalized = TRUE converts them to
## probabilities and smoothed = TRUE adds "smoothing" so nothing has exactly
## 0 probability. (TRUE/FALSE spelled out — T/F are reassignable variables.)
doc.topics <- mallet.doc.topics(topic.model, smoothed = TRUE, normalized = TRUE)
topic.words <- mallet.topic.words(topic.model, smoothed = TRUE, normalized = TRUE)

## What are the top words in topic 7? Notice that R indexes from 1, so this
## will be the topic that mallet called topic 6.
mallet.top.words(topic.model, topic.words[7, ])

## Show the first few documents with at least 5% probability in BOTH topic 7
## and topic 10. doc.topics has one row per document and one column per topic,
## so we select COLUMNS here; the original indexed rows ([7, ] / [10, ]),
## which compared a single document's 50 topic weights against the full
## document list — a bug.
head(documents[doc.topics[, 7] > 0.05 & doc.topics[, 10] > 0.05, ])
## --- Topic-by-document table and topic labels --------------------------------
## (From my other script; the code above was mimno's example script.)
## Transpose to a topics x documents matrix and normalize each topic's row so
## it sums to 1 across documents.
topic.docs <- t(doc.topics)
topic.docs <- topic.docs / rowSums(topic.docs)
write.csv(topic.docs, "cnd-topics-docs.csv")  ## e.g. "C:\\Malletopic-docs.csv"

## Get a vector containing short names for the topics: the top 5 words of
## each. (seq_len() instead of 1:num.topics — safe if num.topics were 0.)
topics.labels <- rep("", num.topics)
for (topic in seq_len(num.topics)) {
  topics.labels[topic] <- paste(
    mallet.top.words(topic.model, topic.words[topic, ], num.top.words = 5)$words,
    collapse = " "
  )
}

## Have a look at the keywords for each topic.
topics.labels
write.csv(topics.labels, "cnd-topics-labels.csv")  ## e.g. "C:\\Mallet-2.0.7\\topics-labels.csv"
## --- Word clouds of the topics -----------------------------------------------
library(wordcloud)
for (i in seq_len(num.topics)) {
  ## Top 25 words and their weights for topic i.
  topic.top.words <- mallet.top.words(topic.model, topic.words[i, ], 25)
  print(wordcloud(topic.top.words$words,
                  topic.top.words$weights,
                  c(4, .8),              # scale: largest / smallest word size
                  rot.per = 0,           # no rotated words
                  random.order = FALSE)) # FALSE spelled out, not F
}
## --- Marwick-type outputs: document distance matrix --------------------------
topic_docs <- data.frame(topic.docs)
## Label the columns with the article ids. NOTE: the original used
## documents$id, but no such column exists (it is named Article_ID per the
## read.csv col.names), so the names were silently set to NULL.
names(topic_docs) <- documents$Article_ID

## Cluster topics based on shared words.
plot(hclust(dist(topic.words)), labels = topics.labels)

library(cluster)
## Euclidean distance between documents (columns of topic_docs), standardized.
topic_df_dist <- as.matrix(daisy(t(topic_docs), metric = "euclidean", stand = TRUE))

## Change row values to zero if less than row minimum plus row standard
## deviation: keep only closely related documents and avoid a dense spaghetti
## diagram that's difficult to interpret
## (hat-tip: http://stackoverflow.com/a/16047196/1036500).
topic_df_dist[sweep(topic_df_dist, 1,
                    (apply(topic_df_dist, 1, min) +
                     apply(topic_df_dist, 1, sd))) > 0] <- 0
## --- k-means groups of similar documents -------------------------------------
## Use kmeans to identify groups of similar articles.
km <- kmeans(topic_df_dist, num.topics)

## Collect the article ids belonging to each cluster.
## (seq_len() instead of 1:num.topics; list is preallocated.)
allnames <- vector("list", length = num.topics)
for (i in seq_len(num.topics)) {
  allnames[[i]] <- names(km$cluster[km$cluster == i])
}
allnames
## --- Network of closely related documents ------------------------------------
library(igraph)
## Build an undirected graph from the thresholded distance matrix.
## NOTE(review): graph.adjacency on a numeric matrix is not weighted by
## default — confirm the intended reading of the distance values, or pass
## weighted = TRUE explicitly.
g <- as.undirected(graph.adjacency(topic_df_dist))
layout1 <- layout.fruchterman.reingold(g, niter = 500)
plot(g,
     layout = layout1,
     edge.curved = TRUE,
     vertex.size = 1,
     vertex.color = "grey",
     edge.arrow.size = 0,
     vertex.label.dist = 0.5,
     vertex.label = NA)
## Export for Gephi or other graph tools.
write.graph(g, file = "cnd.graphml", format = "graphml")
## --- Some other visualizations -----------------------------------------------
## Articles per city. (Fixed axis labels: the original put "Number of
## Articles" on the x axis, but for a vertical barplot the count is on y.)
counts <- table(documents$Newspaper.City)
barplot(counts, main = "Cities", xlab = "City", ylab = "Number of Articles")

## Articles per publication year.
years <- table(documents$Year)
barplot(years, main = "Publication Year", xlab = "Year",
        ylab = "Number of Articles")

## If you're using RStudio to explore this data, you can export the images as
## pdfs, and open them in Inkscape or Illustrator to pretty them up.
## In Inkscape, import -> pdf. Then object -> ungroup all (shift + ctrl + g).
## Then each element can be selected and manipulated.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### groups of similar articles, by article id; 'allnames'
[[1]] | |
[1] "44" "191" "203" | |
[[2]] | |
[1] "82" "134" "142" "169" | |
[[3]] | |
[1] "16" "197" "200" | |
[[4]] | |
[1] "18" "42" "92" "136" | |
[[5]] | |
[1] "7" "47" "87" "116" "120" "170" | |
[[6]] | |
[1] "29" "104" "108" "122" "126" "127" "129" "151" | |
[[7]] | |
[1] "11" "111" "112" "178" "179" "202" "252" | |
[[8]] | |
[1] "26" "27" "56" "95" "168" | |
[[9]] | |
[1] "17" "34" "41" "186" | |
[[10]] | |
[1] "14" "217" "281" "324" "331" | |
[[11]] | |
[1] "59" "81" "101" "110" | |
[[12]] | |
[1] "114" "225" "271" "275" "282" "283" "294" "296" "297" "300" "315" "326" "349" | |
[[13]] | |
[1] "28" "69" | |
[[14]] | |
[1] "63" | |
[[15]] | |
[1] "77" "190" "196" "198" "211" | |
[[16]] | |
[1] "12" "22" "32" "33" "36" "49" "55" | |
[[17]] | |
[1] "39" "67" "73" "78" "254" "261" | |
[[18]] | |
[1] "243" "270" "273" | |
[[19]] | |
[1] "262" "303" "311" "327" "332" "351" | |
[[20]] | |
[1] "150" "247" "249" "277" "295" | |
[[21]] | |
[1] "3" "54" "222" "240" "260" "293" "309" "310" | |
[[22]] | |
[1] "50" "74" "98" "103" "106" "115" "213" "305" "352" | |
[[23]] | |
[1] "31" "139" "299" "312" "313" "317" "320" | |
[[24]] | |
[1] "1" "6" "9" "10" "21" "24" "30" "40" "45" "46" "48" "52" "58" "60" "61" "64" "68" "70" "71" "72" | |
[21] "75" "79" "84" "91" "99" "105" "107" "113" "118" "124" "125" "130" "131" "132" "135" "140" "141" "143" "146" "147" | |
[41] "152" "154" "155" "156" "157" "158" "159" "161" "162" "163" "164" "165" "166" "167" "176" "177" "180" "181" "184" "185" | |
[61] "189" "192" "193" "195" "199" "201" "205" "206" "207" "209" "210" "212" "219" "228" "229" "231" "233" "236" "239" "241" | |
[81] "244" "246" "255" "258" "259" "267" "268" "272" "274" "278" "279" "288" "289" "290" "291" "292" "304" "306" "307" "308" | |
[101] "314" "321" "325" "333" "334" "335" "338" "340" "345" "346" "354" "357" "359" | |
[[25]] | |
[1] "96" "214" "224" "226" | |
[[26]] | |
[1] "2" "221" "237" "266" "319" | |
[[27]] | |
[1] "5" "13" "15" "83" "172" "174" | |
[[28]] | |
[1] "57" "160" "187" "208" "223" "230" "232" | |
[[29]] | |
[1] "227" "238" "250" "257" "285" "316" "341" | |
[[30]] | |
[1] "358" | |
[[31]] | |
[1] "204" "245" "339" "342" | |
[[32]] | |
[1] "137" "145" "148" | |
[[33]] | |
[1] "280" "318" "323" "353" | |
[[34]] | |
[1] "220" "242" | |
[[35]] | |
[1] "4" "35" "53" "119" "121" "133" "276" "286" "287" | |
[[36]] | |
[1] "251" "322" "337" "343" "344" "348" "356" | |
[[37]] | |
[1] "171" "183" "216" "263" "265" | |
[[38]] | |
[1] "8" "144" "173" "175" "182" | |
[[39]] | |
[1] "248" "329" | |
[[40]] | |
[1] "25" "38" "43" "51" "76" "90" "194" "350" | |
[[41]] | |
[1] "89" | |
[[42]] | |
[1] "66" "138" "188" "256" | |
[[43]] | |
[1] "23" "37" "65" "102" | |
[[44]] | |
[1] "94" "355" | |
[[45]] | |
[1] "19" "234" "235" "269" "301" "302" "328" "347" | |
[[46]] | |
[1] "284" "298" "336" | |
[[47]] | |
[1] "62" "215" "218" "253" "264" "330" | |
[[48]] | |
[1] "80" "88" "93" "97" "100" "109" "117" "128" "153" | |
[[49]] | |
[1] "86" "149" | |
[[50]] | |
[1] "20" "85" "123" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment