setwd("desktop/beals-new")
# give yourself as much memory as you've got
options(java.parameters = "-Xmx5120m")
library(rJava)
## from http://cran.r-project.org/web/packages/mallet/mallet.pdf
library(mallet)
# The CND XML file was transformed in the browser into a CSV table, pasted into
# Excel, and saved as CSV. The column headers were cut from the file and are
# supplied explicitly to read.csv below:
library(RCurl)
x <- getURL("https://raw.githubusercontent.com/shawngraham/exercise/gh-pages/CND.csv")
documents <- read.csv(text = x,
                      col.names = c("Article_ID", "Newspaper Title", "Newspaper City",
                                    "Newspaper Province", "Newspaper Country", "Year",
                                    "Month", "Day", "Article Type", "Text", "Keywords"),
                      colClasses = rep("character", 11), # one class per column: 11 columns, not 3
                      sep = ",", quote = "")
# note: read.csv converts the spaces in these names to dots, so the columns
# appear below as e.g. documents$Newspaper.City
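## A quick sanity check on the import (an added aside, not in the original script):
## confirm the row count and that every column came in as character.
str(documents)
nrow(documents)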
# now we import the important bits. Use a stopwords list. use regex to strain out punctuation etc.
mallet.instances <- mallet.import(documents$Article_ID, documents$Text,
                                  "/Users/shawngraham/Desktop/data mining and tools/TextAnalysisWithR/data/stoplist.csv",
                                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
#set the number of desired topics
num.topics <- 50
topic.model <- MalletLDA(num.topics)
## Load our documents. We could also pass in the filename of a
## saved instance list file that we build from the command-line tools.
topic.model$loadDocuments(mallet.instances)
## Get the vocabulary, and some statistics about word frequencies.
## These may be useful in further curating the stopword list.
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
head(word.freqs)
write.csv(word.freqs, "cnd-word-freqs.csv" )
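## Aside (not in the original script): one way to curate the stoplist is to sort
## the frequency table and eyeball the most common words. Column names differ
## between versions of the mallet package, so we sort by position here.
head(word.freqs[order(word.freqs[, 2], decreasing = TRUE), ], 20)
## corpus-specific noise words that surface here can be appended to stoplist.csv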
## Optimize hyperparameters every 20 iterations,
## after 50 burn-in iterations.
topic.model$setAlphaOptimization(20, 50)
## Now train a model. Note that hyperparameter optimization is on by default.
## We can specify the number of iterations. Here we'll use a large-ish round number.
topic.model$train(1000)
## NEW: run through a few iterations where we pick the best topic for each token,
## rather than sampling from the posterior distribution.
topic.model$maximize(10)
## Get the probability of topics in documents and the probability of words in topics.
## By default, these functions return raw word counts. Here we want probabilities,
## so we normalize, and add "smoothing" so that nothing has exactly 0 probability.
doc.topics <- mallet.doc.topics(topic.model, smoothed=T, normalized=T)
topic.words <- mallet.topic.words(topic.model, smoothed=T, normalized=T)
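## Sanity check (an added aside): with normalized=T, each document's topic
## proportions should sum to (approximately) 1.
summary(rowSums(doc.topics))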
## What are the top words in topic 7?
## Notice that R indexes from 1, so this will be the topic that mallet called topic 6.
mallet.top.words(topic.model, topic.words[7,])
## Show the first few documents with at least 5% of their tokens assigned to
## both topic 7 and topic 10. doc.topics has documents in rows and topics in
## columns, so we index the columns, not the rows.
head(documents[ doc.topics[,7] > 0.05 & doc.topics[,10] > 0.05, ])
### From my other script; the code above was Mimno's example script.
topic.docs <- t(doc.topics)
topic.docs <- topic.docs / rowSums(topic.docs)
write.csv(topic.docs, "cnd-topics-docs.csv")
## Get a vector containing short names for the topics
topics.labels <- rep("", num.topics)
for (topic in 1:num.topics) {
  topics.labels[topic] <- paste(mallet.top.words(topic.model, topic.words[topic,],
                                                 num.top.words = 5)$words, collapse = " ")
}
# have a look at keywords for each topic
topics.labels
write.csv(topics.labels, "cnd-topics-labels.csv")
### do word clouds of the topics
library(wordcloud)
for (i in 1:num.topics) {
  topic.top.words <- mallet.top.words(topic.model, topic.words[i,], 25)
  print(wordcloud(topic.top.words$words, topic.top.words$weights,
                  c(4, .8), rot.per = 0, random.order = FALSE))
}
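## Aside (not in the original script): to keep a copy of each cloud, wrap the
## plotting call in a png device. The filenames here are illustrative.
for (i in 1:num.topics) {
  topic.top.words <- mallet.top.words(topic.model, topic.words[i,], 25)
  png(paste0("topic-", i, "-wordcloud.png"), width = 800, height = 800)
  wordcloud(topic.top.words$words, topic.top.words$weights,
            c(4, .8), rot.per = 0, random.order = FALSE)
  dev.off()
}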
##marwick-type outputs
topic_docs <- data.frame(topic.docs)
names(topic_docs) <- documents$Article_ID # there is no 'id' column; Article_ID is the document identifier
## cluster based on shared words
plot(hclust(dist(topic.words)), labels=topics.labels)
library(cluster)
topic_df_dist <- as.matrix(daisy(t(topic_docs), metric = "euclidean", stand = TRUE))
# Zero out any distance greater than the row minimum plus the row standard deviation,
# keeping only closely related documents and avoiding a dense spaghetti diagram
# that's difficult to interpret (hat-tip: http://stackoverflow.com/a/16047196/1036500)
topic_df_dist[ sweep(topic_df_dist, 1, (apply(topic_df_dist, 1, min) + apply(topic_df_dist, 1, sd))) > 0 ] <- 0
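## To see what the sweep() call is doing, here is a toy example (illustrative only):
## sweep() subtracts (row min + row sd) from every value in its row, so anything
## left positive -- i.e. more than one sd beyond the row minimum -- gets zeroed.
m <- matrix(c(0, 2, 9, 2, 0, 3, 9, 3, 0), nrow = 3)
m[ sweep(m, 1, (apply(m, 1, min) + apply(m, 1, sd))) > 0 ] <- 0
m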
# Use kmeans to identify groups of similar articles
km <- kmeans(topic_df_dist, num.topics)
# get names for each cluster
allnames <- vector("list", length = num.topics)
for (i in 1:num.topics) {
  allnames[[i]] <- names(km$cluster[km$cluster == i])
}
allnames
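## Aside (an added convenience, not in the original): the same memberships can be
## exported as a flat table for use outside R.
cluster.membership <- data.frame(article = names(km$cluster), cluster = km$cluster)
write.csv(cluster.membership, "cnd-cluster-membership.csv", row.names = FALSE)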
library(igraph)
g <- as.undirected(graph.adjacency(topic_df_dist, weighted = TRUE)) # weighted=TRUE: the matrix holds real-valued distances, not integer edge counts
layout1 <- layout.fruchterman.reingold(g, niter=500)
plot(g, layout = layout1, edge.curved = TRUE, vertex.size = 1, vertex.color = "grey",
     edge.arrow.size = 0, vertex.label.dist = 0.5, vertex.label = NA)
write.graph(g, file="cnd.graphml", format="graphml")
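## Aside (an illustrative variant, not in the original): colour the vertices by
## their kmeans cluster so the plot can be compared with the 'allnames' groups.
plot(g, layout = layout1, edge.curved = TRUE, vertex.size = 3,
     vertex.color = km$cluster, edge.arrow.size = 0, vertex.label = NA)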
# some other visualizations
counts <- table(documents$Newspaper.City)
barplot(counts, main="Articles per City", xlab="City", ylab="Number of Articles")
years <- table(documents$Year)
barplot(years, main="Publication Year", xlab="Year", ylab="Number of Articles")
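## Aside (not in the original): a quick cross-tabulation of city by year can flag
## gaps in coverage before reading too much into the topic trends.
table(documents$Newspaper.City, documents$Year)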
# If you're using RStudio to explore this data, you can export the images as PDFs
# and open them in Inkscape or Illustrator to pretty them up.
# In Inkscape: File -> Import the PDF, then Object -> Ungroup (shift + ctrl + g);
# each element can then be selected and manipulated individually.
### Output: groups of similar articles, by article ID (the contents of 'allnames')
[[1]]
[1] "44" "191" "203"
[[2]]
[1] "82" "134" "142" "169"
[[3]]
[1] "16" "197" "200"
[[4]]
[1] "18" "42" "92" "136"
[[5]]
[1] "7" "47" "87" "116" "120" "170"
[[6]]
[1] "29" "104" "108" "122" "126" "127" "129" "151"
[[7]]
[1] "11" "111" "112" "178" "179" "202" "252"
[[8]]
[1] "26" "27" "56" "95" "168"
[[9]]
[1] "17" "34" "41" "186"
[[10]]
[1] "14" "217" "281" "324" "331"
[[11]]
[1] "59" "81" "101" "110"
[[12]]
[1] "114" "225" "271" "275" "282" "283" "294" "296" "297" "300" "315" "326" "349"
[[13]]
[1] "28" "69"
[[14]]
[1] "63"
[[15]]
[1] "77" "190" "196" "198" "211"
[[16]]
[1] "12" "22" "32" "33" "36" "49" "55"
[[17]]
[1] "39" "67" "73" "78" "254" "261"
[[18]]
[1] "243" "270" "273"
[[19]]
[1] "262" "303" "311" "327" "332" "351"
[[20]]
[1] "150" "247" "249" "277" "295"
[[21]]
[1] "3" "54" "222" "240" "260" "293" "309" "310"
[[22]]
[1] "50" "74" "98" "103" "106" "115" "213" "305" "352"
[[23]]
[1] "31" "139" "299" "312" "313" "317" "320"
[[24]]
[1] "1" "6" "9" "10" "21" "24" "30" "40" "45" "46" "48" "52" "58" "60" "61" "64" "68" "70" "71" "72"
[21] "75" "79" "84" "91" "99" "105" "107" "113" "118" "124" "125" "130" "131" "132" "135" "140" "141" "143" "146" "147"
[41] "152" "154" "155" "156" "157" "158" "159" "161" "162" "163" "164" "165" "166" "167" "176" "177" "180" "181" "184" "185"
[61] "189" "192" "193" "195" "199" "201" "205" "206" "207" "209" "210" "212" "219" "228" "229" "231" "233" "236" "239" "241"
[81] "244" "246" "255" "258" "259" "267" "268" "272" "274" "278" "279" "288" "289" "290" "291" "292" "304" "306" "307" "308"
[101] "314" "321" "325" "333" "334" "335" "338" "340" "345" "346" "354" "357" "359"
[[25]]
[1] "96" "214" "224" "226"
[[26]]
[1] "2" "221" "237" "266" "319"
[[27]]
[1] "5" "13" "15" "83" "172" "174"
[[28]]
[1] "57" "160" "187" "208" "223" "230" "232"
[[29]]
[1] "227" "238" "250" "257" "285" "316" "341"
[[30]]
[1] "358"
[[31]]
[1] "204" "245" "339" "342"
[[32]]
[1] "137" "145" "148"
[[33]]
[1] "280" "318" "323" "353"
[[34]]
[1] "220" "242"
[[35]]
[1] "4" "35" "53" "119" "121" "133" "276" "286" "287"
[[36]]
[1] "251" "322" "337" "343" "344" "348" "356"
[[37]]
[1] "171" "183" "216" "263" "265"
[[38]]
[1] "8" "144" "173" "175" "182"
[[39]]
[1] "248" "329"
[[40]]
[1] "25" "38" "43" "51" "76" "90" "194" "350"
[[41]]
[1] "89"
[[42]]
[1] "66" "138" "188" "256"
[[43]]
[1] "23" "37" "65" "102"
[[44]]
[1] "94" "355"
[[45]]
[1] "19" "234" "235" "269" "301" "302" "328" "347"
[[46]]
[1] "284" "298" "336"
[[47]]
[1] "62" "215" "218" "253" "264" "330"
[[48]]
[1] "80" "88" "93" "97" "100" "109" "117" "128" "153"
[[49]]
[1] "86" "149"
[[50]]
[1] "20" "85" "123"