setwd("desktop/beals-new")
# give yourself as much memory as you've got
options(java.parameters = "-Xmx5120m")
library(rJava)
## from http://cran.r-project.org/web/packages/mallet/mallet.pdf
library(mallet)
# The CND XML file was transformed in the browser into a CSV table, pasted into
# Excel, and saved as CSV. The column headers were cut from the file and are
# supplied explicitly to read.csv below:
library(RCurl)
x <- getURL("https://raw.githubusercontent.com/shawngraham/exercise/gh-pages/CND.csv")
documents <- read.csv(text = x,
                      col.names = c("Article_ID", "Newspaper Title", "Newspaper City",
                                    "Newspaper Province", "Newspaper Country", "Year",
                                    "Month", "Day", "Article Type", "Text", "Keywords"),
                      colClasses = rep("character", 11), # one class per column: 11 columns, not 3
                      sep = ",", quote = "")
# note: read.csv converts the spaces in these names to dots, so the columns
# appear below as e.g. documents$Newspaper.City
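## A quick sanity check on the import (an added aside, not in the original script):
## confirm the row count and that every column came in as character.
str(documents)
nrow(documents)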
# now we import the important bits. Use a stopwords list. use regex to strain out punctuation etc.
mallet.instances <- mallet.import(documents$Article_ID, documents$Text,
                                  "/Users/shawngraham/Desktop/data mining and tools/TextAnalysisWithR/data/stoplist.csv",
                                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
#set the number of desired topics
num.topics <- 50
topic.model <- MalletLDA(num.topics)
## Load our documents. We could also pass in the filename of a
## saved instance list file that we build from the command-line tools.
topic.model$loadDocuments(mallet.instances)
## Get the vocabulary, and some statistics about word frequencies.
## These may be useful in further curating the stopword list.
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
head(word.freqs)
write.csv(word.freqs, "cnd-word-freqs.csv" )
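## Aside (not in the original script): one way to curate the stoplist is to sort
## the frequency table and eyeball the most common words. Column names differ
## between versions of the mallet package, so we sort by position here.
head(word.freqs[order(word.freqs[, 2], decreasing = TRUE), ], 20)
## corpus-specific noise words that surface here can be appended to stoplist.csv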
## Optimize hyperparameters every 20 iterations,
## after 50 burn-in iterations.
topic.model$setAlphaOptimization(20, 50)
## Now train a model. Note that hyperparameter optimization is on by default.
## We can specify the number of iterations. Here we'll use a large-ish round number.
topic.model$train(1000)
## NEW: run through a few iterations where we pick the best topic for each token,
## rather than sampling from the posterior distribution.
topic.model$maximize(10)
## Get the probability of topics in documents and the probability of words in topics.
## By default, these functions return raw word counts. Here we want probabilities,
## so we normalize, and add "smoothing" so that nothing has exactly 0 probability.
doc.topics <- mallet.doc.topics(topic.model, smoothed=T, normalized=T)
topic.words <- mallet.topic.words(topic.model, smoothed=T, normalized=T)
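## Sanity check (an added aside): with normalized=T, each document's topic
## proportions should sum to (approximately) 1.
summary(rowSums(doc.topics))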
## What are the top words in topic 7?
## Notice that R indexes from 1, so this will be the topic that mallet called topic 6.
mallet.top.words(topic.model, topic.words[7,])
## Show the first few documents with at least 5% of their tokens assigned to
## both topic 7 and topic 10. doc.topics has documents in rows and topics in
## columns, so we index the columns, not the rows.
head(documents[ doc.topics[,7] > 0.05 & doc.topics[,10] > 0.05, ])
### From my other script; the code above was Mimno's example script.
topic.docs <- t(doc.topics)
topic.docs <- topic.docs / rowSums(topic.docs)
write.csv(topic.docs, "cnd-topics-docs.csv")
## Get a vector containing short names for the topics
topics.labels <- rep("", num.topics)
for (topic in 1:num.topics) {
  topics.labels[topic] <- paste(mallet.top.words(topic.model, topic.words[topic,],
                                                 num.top.words = 5)$words, collapse = " ")
}
# have a look at keywords for each topic
topics.labels
write.csv(topics.labels, "cnd-topics-labels.csv")
### do word clouds of the topics
library(wordcloud)
for (i in 1:num.topics) {
  topic.top.words <- mallet.top.words(topic.model, topic.words[i,], 25)
  print(wordcloud(topic.top.words$words, topic.top.words$weights,
                  c(4, .8), rot.per = 0, random.order = FALSE))
}
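## Aside (not in the original script): to keep a copy of each cloud, wrap the
## plotting call in a png device. The filenames here are illustrative.
for (i in 1:num.topics) {
  topic.top.words <- mallet.top.words(topic.model, topic.words[i,], 25)
  png(paste0("topic-", i, "-wordcloud.png"), width = 800, height = 800)
  wordcloud(topic.top.words$words, topic.top.words$weights,
            c(4, .8), rot.per = 0, random.order = FALSE)
  dev.off()
}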
##marwick-type outputs
topic_docs <- data.frame(topic.docs)
names(topic_docs) <- documents$Article_ID # there is no 'id' column; Article_ID is the document identifier
## cluster based on shared words
plot(hclust(dist(topic.words)), labels=topics.labels)
library(cluster)
topic_df_dist <- as.matrix(daisy(t(topic_docs), metric = "euclidean", stand = TRUE))
# Zero out any distance greater than the row minimum plus the row standard deviation,
# keeping only closely related documents and avoiding a dense spaghetti diagram
# that's difficult to interpret (hat-tip: http://stackoverflow.com/a/16047196/1036500)
topic_df_dist[ sweep(topic_df_dist, 1, (apply(topic_df_dist, 1, min) + apply(topic_df_dist, 1, sd))) > 0 ] <- 0
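## To see what the sweep() call is doing, here is a toy example (illustrative only):
## sweep() subtracts (row min + row sd) from every value in its row, so anything
## left positive -- i.e. more than one sd beyond the row minimum -- gets zeroed.
m <- matrix(c(0, 2, 9, 2, 0, 3, 9, 3, 0), nrow = 3)
m[ sweep(m, 1, (apply(m, 1, min) + apply(m, 1, sd))) > 0 ] <- 0
m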
# Use kmeans to identify groups of similar articles
km <- kmeans(topic_df_dist, num.topics)
# get names for each cluster
allnames <- vector("list", length = num.topics)
for (i in 1:num.topics) {
  allnames[[i]] <- names(km$cluster[km$cluster == i])
}
allnames
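## Aside (an added convenience, not in the original): the same memberships can be
## exported as a flat table for use outside R.
cluster.membership <- data.frame(article = names(km$cluster), cluster = km$cluster)
write.csv(cluster.membership, "cnd-cluster-membership.csv", row.names = FALSE)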
library(igraph)
g <- as.undirected(graph.adjacency(topic_df_dist, weighted = TRUE)) # weighted=TRUE: the matrix holds real-valued distances, not integer edge counts
layout1 <- layout.fruchterman.reingold(g, niter=500)
plot(g, layout = layout1, edge.curved = TRUE, vertex.size = 1, vertex.color = "grey",
     edge.arrow.size = 0, vertex.label.dist = 0.5, vertex.label = NA)
write.graph(g, file="cnd.graphml", format="graphml")
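## Aside (an illustrative variant, not in the original): colour the vertices by
## their kmeans cluster so the plot can be compared with the 'allnames' groups.
plot(g, layout = layout1, edge.curved = TRUE, vertex.size = 3,
     vertex.color = km$cluster, edge.arrow.size = 0, vertex.label = NA)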
# some other visualizations
counts <- table(documents$Newspaper.City)
barplot(counts, main="Articles per City", xlab="City", ylab="Number of Articles")
years <- table(documents$Year)
barplot(years, main="Publication Year", xlab="Year", ylab="Number of Articles")
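## Aside (not in the original): a quick cross-tabulation of city by year can flag
## gaps in coverage before reading too much into the topic trends.
table(documents$Newspaper.City, documents$Year)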
# If you're using RStudio to explore this data, you can export the images as PDFs
# and open them in Inkscape or Illustrator to pretty them up.
# In Inkscape: File -> Import the PDF, then Object -> Ungroup (shift + ctrl + g);
# each element can then be selected and manipulated individually.
### Output: groups of similar articles, by article ID (the contents of 'allnames')
[[1]]
[1] "44" "191" "203"
[[2]]
[1] "82" "134" "142" "169"
[[3]]
[1] "16" "197" "200"
[[4]]
[1] "18" "42" "92" "136"
[[5]]
[1] "7" "47" "87" "116" "120" "170"
[[6]]
[1] "29" "104" "108" "122" "126" "127" "129" "151"
[[7]]
[1] "11" "111" "112" "178" "179" "202" "252"
[[8]]
[1] "26" "27" "56" "95" "168"
[[9]]
[1] "17" "34" "41" "186"
[[10]]
[1] "14" "217" "281" "324" "331"
[[11]]
[1] "59" "81" "101" "110"
[[12]]
[1] "114" "225" "271" "275" "282" "283" "294" "296" "297" "300" "315" "326" "349"
[[13]]
[1] "28" "69"
[[14]]
[1] "63"
[[15]]
[1] "77" "190" "196" "198" "211"
[[16]]
[1] "12" "22" "32" "33" "36" "49" "55"
[[17]]
[1] "39" "67" "73" "78" "254" "261"
[[18]]
[1] "243" "270" "273"
[[19]]
[1] "262" "303" "311" "327" "332" "351"
[[20]]
[1] "150" "247" "249" "277" "295"
[[21]]
[1] "3" "54" "222" "240" "260" "293" "309" "310"
[[22]]
[1] "50" "74" "98" "103" "106" "115" "213" "305" "352"
[[23]]
[1] "31" "139" "299" "312" "313" "317" "320"
[[24]]
[1] "1" "6" "9" "10" "21" "24" "30" "40" "45" "46" "48" "52" "58" "60" "61" "64" "68" "70" "71" "72"
[21] "75" "79" "84" "91" "99" "105" "107" "113" "118" "124" "125" "130" "131" "132" "135" "140" "141" "143" "146" "147"
[41] "152" "154" "155" "156" "157" "158" "159" "161" "162" "163" "164" "165" "166" "167" "176" "177" "180" "181" "184" "185"
[61] "189" "192" "193" "195" "199" "201" "205" "206" "207" "209" "210" "212" "219" "228" "229" "231" "233" "236" "239" "241"
[81] "244" "246" "255" "258" "259" "267" "268" "272" "274" "278" "279" "288" "289" "290" "291" "292" "304" "306" "307" "308"
[101] "314" "321" "325" "333" "334" "335" "338" "340" "345" "346" "354" "357" "359"
[[25]]
[1] "96" "214" "224" "226"
[[26]]
[1] "2" "221" "237" "266" "319"
[[27]]
[1] "5" "13" "15" "83" "172" "174"
[[28]]
[1] "57" "160" "187" "208" "223" "230" "232"
[[29]]
[1] "227" "238" "250" "257" "285" "316" "341"
[[30]]
[1] "358"
[[31]]
[1] "204" "245" "339" "342"
[[32]]
[1] "137" "145" "148"
[[33]]
[1] "280" "318" "323" "353"
[[34]]
[1] "220" "242"
[[35]]
[1] "4" "35" "53" "119" "121" "133" "276" "286" "287"
[[36]]
[1] "251" "322" "337" "343" "344" "348" "356"
[[37]]
[1] "171" "183" "216" "263" "265"
[[38]]
[1] "8" "144" "173" "175" "182"
[[39]]
[1] "248" "329"
[[40]]
[1] "25" "38" "43" "51" "76" "90" "194" "350"
[[41]]
[1] "89"
[[42]]
[1] "66" "138" "188" "256"
[[43]]
[1] "23" "37" "65" "102"
[[44]]
[1] "94" "355"
[[45]]
[1] "19" "234" "235" "269" "301" "302" "328" "347"
[[46]]
[1] "284" "298" "336"
[[47]]
[1] "62" "215" "218" "253" "264" "330"
[[48]]
[1] "80" "88" "93" "97" "100" "109" "117" "128" "153"
[[49]]
[1] "86" "149"
[[50]]
[1] "20" "85" "123"