Sketch of a look at the 'grand challenges' of Kintigh et al. 2014 (http://www.pnas.org/content/111/3/879.full) in five archaeology journals
library(devtools)
install_github("benmarwick/JSTORr")
library(JSTORr)
# load data for the five journals, including American Antiquity,
# Journal of World Prehistory and World Archaeology
load("~/teamviewer/five_journals/five_journals.RData")
# major groups of challenges
emergence <- c("emergence", "communities", "complexity")
JSTOR_1word(unpack_multi, "emergence", span = 0.5, se = FALSE)
resilience <- c("resilience", "persistence", "transformation", "collapse", "decline", "complexity")
JSTOR_1word(unpack_multi, resilience, span = 0.5, se = FALSE)
movement <- c("movement", "mobility", "migration", "migrant", "adaptation")
JSTOR_1word(unpack_multi, movement, span = 0.5, se = FALSE)
cognition <- c("cognition", "behavior", "identity", "identities", "societal")
JSTOR_1word(unpack_multi, cognition, span = 0.5, se = FALSE)
human_environment <- c("environment", "health", "domestication", "agriculture", "environmental", "climate")
JSTOR_1word(unpack_multi, human_environment, span = 0.5, se = FALSE)
other <- c("gender", "indigeneity", "postprocessual", "postcolonial", "phenomenology")
JSTOR_1word(unpack_multi, other, span = 0.5, se = FALSE)
states <- c("state", "empires", "imperial", "domination")
JSTOR_1word(unpack_multi, states, span = 0.5, se = FALSE)
# numbers
challenges <- list(emergence = emergence, resilience = resilience, movement = movement,
                   cognition = cognition, human_environment = human_environment, other = other)
numbers <- data.frame(challenge = names(challenges), word_count = NA, article_count = NA)
for(i in seq_along(challenges)){
  tmp <- as.matrix(unpack_multi$wordcounts[, unpack_multi$wordcounts$dimnames$Terms %in% unlist(unname(challenges[i])) ])
  # total number of occurrences of the group's words
  numbers$word_count[i] <- sum(colSums(tmp))
  # number of nonzero article-word cells, i.e. one count per word per article it appears in
  numbers$article_count[i] <- sum(tmp != 0)
  print(i)
}
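# An alternative count (a sketch using the same objects as above, and assuming
# the rows of unpack_multi$wordcounts are articles): sum(tmp != 0) counts an
# article once per group word it contains, so to count distinct articles that
# contain at least one word from each group, use rowSums instead:
articles_with_any_word <- sapply(challenges, function(ch){
  tmp <- as.matrix(unpack_multi$wordcounts[, unpack_multi$wordcounts$dimnames$Terms %in% ch])
  sum(rowSums(tmp) != 0)
})
articles_with_any_word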
# plot
devtools::install_github("karthik/wesanderson")
library(ggplot2); library(reshape2); library(wesanderson)
numbers_m <- melt(numbers, id.vars = "challenge")
ggplot(numbers_m, aes(x = reorder(challenge, -value), y = value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(values = wes.palette(2, "GrandBudapest")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 14)) +
  xlab("challenge")
nouns <- JSTOR_dtmofnouns(unpack_multi, sparse = 0.75, POStag = TRUE)
K <- 30
# a high alpha value leads to documents being more similar in terms of the topics they contain
topics <- JSTOR_lda(unpack_multi, nouns, K, alpha = 1000/K)
hotncold <- JSTOR_lda_hotncoldtopics(topics, pval = 0.05, ma = 10, size = 12)
## just do it for American Antiquity
load("~/teamviewer/kretzler_and_marwick.RData")
# major groups of challenges just in American Antiquity
emergence <- c("emergence", "communities", "complexity")
JSTOR_1word(unpack1grams, "emergence", span = 0.5, se = FALSE)
resilience <- c("resilience", "persistence", "transformation", "collapse", "decline", "complexity")
JSTOR_1word(unpack1grams, resilience, span = 0.5, se = FALSE)
movement <- c("movement", "mobility", "migration", "migrant", "adaptation")
JSTOR_1word(unpack1grams, movement, span = 0.5, se = FALSE)
cognition <- c("cognition", "behavior", "identity", "identities", "societal")
JSTOR_1word(unpack1grams, cognition, span = 0.5, se = FALSE)
human_environment <- c("environment", "health", "domestication", "agriculture", "environmental", "climate")
JSTOR_1word(unpack1grams, human_environment, span = 0.5, se = FALSE)
other <- c("gender", "indigeneity", "postprocessual", "postcolonial", "phenomenology")
JSTOR_1word(unpack1grams, other, span = 0.5, se = FALSE)
states <- c("state", "empires", "imperial", "domination")
JSTOR_1word(unpack1grams, states, span = 0.5, se = FALSE)
# numbers
challenges <- list(emergence = emergence, resilience = resilience, movement = movement,
                   cognition = cognition, human_environment = human_environment, other = other)
numbers <- data.frame(challenge = names(challenges), word_count = NA, article_count = NA)
for(i in seq_along(challenges)){
  tmp <- as.matrix(unpack1grams$wordcounts[, unpack1grams$wordcounts$dimnames$Terms %in% unlist(unname(challenges[i])) ])
  # total number of occurrences of the group's words
  numbers$word_count[i] <- sum(colSums(tmp))
  # number of nonzero article-word cells, i.e. one count per word per article it appears in
  numbers$article_count[i] <- sum(tmp != 0)
  print(i)
}
# plot
devtools::install_github("karthik/wesanderson")
library(ggplot2); library(reshape2); library(wesanderson)
numbers_m <- melt(numbers, id.vars = "challenge")
ggplot(numbers_m, aes(x = reorder(challenge, -value), y = value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(values = wes.palette(2, "GrandBudapest")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 14)) +
  xlab("challenge")
## Topic model the grand challenges raw data in the tDAR report
# https://core.tdar.org/document/391233
# convert PDF to text
# windows: see https://gist.github.com/benmarwick/11333467
# linux: sudo apt-get install poppler-utils
report <- "~/teamviewer/five_journals/GrandChallengeCrowdSourcingReport.pdf"
system(paste('"pdftotext"', report, '"out.txt"'), wait = TRUE) # wait for pdftotext to finish so out.txt exists before we read it
report <- scan('out.txt', what = character())
# where do the responses start?
grep("Responses", report) # 350
# where do they stop
grep("SECTOR", report) # 3268
# get responses only
responses <- report[350:3268]
# join into one big vector
responses <- paste(responses, collapse = " ")
# get only the text between 'Challenge' and 'Justification',
# i.e. exclude the justifications
responses <- regmatches(responses, gregexpr("(?<=Challenge).*?(?=Justification)", responses, perl=TRUE))
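# toy illustration (not part of the analysis) of how the lookaround pattern works:
# it captures only the text between 'Challenge' and the next 'Justification'
toy <- "Challenge How did farming spread? Justification Because... Challenge Why collapse? Justification ..."
regmatches(toy, gregexpr("(?<=Challenge).*?(?=Justification)", toy, perl = TRUE))
# should return the two challenge statements, without their justifications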
# join into one big vector (again)
responses <- paste(responses, collapse = " ")
# remove anything that's not a letter (i.e. punctuation, numbers)
responses <- gsub('[^[:alpha:]]',' ', responses)
# split into words
responses <- unlist(strsplit(responses, " "))
# remove empty items
responses <- responses[responses != ""]
# remove words with one or two letters
responses <- responses[nchar(responses) >= 3]
# remove a few stopwords
responses <- responses[!(responses %in% tm::stopwords(kind = "en"))]
## pre-process
# split into n-word documents
n <- 100
docs <- split(responses, ceiling(seq_along(responses)/n))
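# quick sanity check (illustrative only, not needed for the analysis): how many
# ~100-word pseudo-documents were created, and how long each one is
length(docs)
table(sapply(docs, length))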
# make corpus
library(tm)
corpus <- Corpus(VectorSource(docs))
# make term document matrix
tdm <- TermDocumentMatrix(corpus,
                          control = list(removePunctuation = TRUE,
                                         stopwords = TRUE,
                                         removeNumbers = TRUE,
                                         stripWhitespace = TRUE,
                                         tolower = TRUE))
# isolate terms to remove non-nouns
terms <- tdm$dimnames$Terms
# remove punctuation (again)
terms <- gsub('[[:punct:]]','', terms)
terms <- gsub('\\\\','', terms)
library(NLP); library(data.table); library(openNLP)
tagPOS <- function(x) {
  s <- NLP::as.String(x)
  ## Need sentence and word token annotations.
  a1 <- NLP::Annotation(1L, "sentence", 1L, nchar(s))
  a2 <- NLP::annotate(s, openNLP::Maxent_Word_Token_Annotator(), a1)
  a3 <- NLP::annotate(s, openNLP::Maxent_POS_Tag_Annotator(), a2)
  ## Determine the distribution of POS tags for word tokens.
  a3w <- a3[a3$type == "word"]
  POStags <- unlist(lapply(a3w$features, `[[`, "POS"))
  ## Extracting token/POS pairs is possible but not needed here:
  # POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
  return(POStags)
}
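# quick check of the tagger on a short phrase (illustrative only; the exact
# tags returned depend on the openNLP English model)
tagPOS("excavation reveals pottery")
# expected to return something like c("NN", "VBZ", "NN")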
# divide the terms into chunks of 1000, because tagging too many at once can
# cause memory problems
terms_split <- split(terms, ceiling(seq_along(terms)/1000))
# loop over each chunk of 1000 terms to do the POS tagging; tagging 10,000 or
# more terms at once can exhaust Java's memory, so this is a conservative way
# to avoid filling memory
terms_split_chunks <- plyr::llply(terms_split, function(i){
  tmp <- paste(gsub("[^[:alnum:]]", " ", i), collapse = " ")
  tmp <- tagPOS(tmp)
  tmp <- tmp[!tmp %in% c(",", "``", "''", ".")]
}, .progress = "text")
# get all the tags in a vector
terms_split_chunks_out <- unname(c(unlist(terms_split_chunks)))
# subset the term-document matrix to keep only terms tagged as nouns (NN)
tdm_nouns <- tdm[ c(tdm$dimnames$Terms[terms_split_chunks_out == "NN"]), ]
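# defensive check (a sketch, not in the original workflow): the POS tagger can
# return a different number of tags than there are terms if a term is tokenised
# into more than one piece, which would mis-align the NN subsetting above
length(terms) == length(terms_split_chunks_out)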
# drop specific words relevant to the grand challenge
specific_words <- c("grand", "challenge", "challenges", "archaeology", "archeology",
"archaeological", "archaeologists", "discipline", "disciplines",
"can", "like", "pay", "many", "well", "way", "non", "yet", "world",
"broad", "future", "across", "questions", "don", "dacs", "understanding",
"still", "make", "made", "researchers", "upon", "among", "ever", "good",
"conduct", "year", "cultural", "projects", "provide", "means", "high",
"get", "next", "coming", "available", "kind", "geo", "insights", "different",
"differences", "ambitious", "continue", "another", "currently" , "nearly",
"particular", "certain", "current", "critical", "era", "taking", "happened",
"changing", "idea", "first", "whether", "key", "facing", "justify", "leveraging",
"identified", "collected", "record", "sciences","framework", "chose", "whatever",
"nothing", "terms", "result", "remains", "scientific", "scientists", "activities",
"perspective", "burn", "american", "doesn", "concepts", "faced", "societies",
"focus", "forms", "either", "adot", "enormous", "fully","largely", "despite",
"similar", "probably", "act", "known", "changes", "partial", "five", "quickly",
"following", "hoc", "relevant", "shift", "life", "linked", "useful", "set",
"developed", "response", "gaining", "hardly", "refine", "patterns", "investigate",
"ground", "sufficient", "argued", "entirely", "proposed", "scholars", "main",
"around", "incomplete", "azsite", "assign", "agree", "oldest", "practices" ,
"innovations", "several", "specifically", "standard","late", "early", "return",
"seem", "return", "constructed", "investigation", "diagnostic", "altered",
"program", "minimum", "minimal","articulate", "consequences", "somehow", "factors",
"seems", "never", "effectively", "sophisticated", "unique", "cnh", "creating",
"knowing", "discovering" , "longer", "problems", "paradigms", "modern", "efficiency",
"foundations", "graphics", "leads", "scripps", "huge", "extensive", "potential",
"contexts", "near", "sense", "scope", "rising", "significant", "occurred",
"manner", "compare", "extend", "exist", "advantage", "interested", "space",
"evident", "chimp", "heck", "pick", "illustrated","space", "tired", "hopes",
"vanity", "stop", "plays", "fundamentally", "slightly", "reasons", "fresh",
"suitably", "volume", "shut", "kept", "togeter", "commented", "latter",
"suggests", "boyd", "say", "hence", "mos", "bruse", "waiting", "steps",
"assuring", "sorely", "anxious", "undergone", "submit", "touted", "assorted",
"equally", "legendary", "sadly", "guesses", "hardesty", "placing", "avoided",
"stored", "boserup", "commonly", "need", "problem", "term", "will", "information",
"help", "basis", "ability", "move", "address", "change", "access",
"barbara", "via", "attempt", "post", "etc", "norm", "cross", "comment",
"step", "compromising", "benefit","gain", "sake", "entity", "chance",
"issue", "couple", "trying", "detail", "ways", "redundancy", "example",
"using")
tdm_nouns <- tdm_nouns[ (!tdm_nouns$dimnames$Terms %in% specific_words), ]
# tdm_nouns$dimnames$Terms[tdm_nouns$dimnames$Terms %in% specific_words]
# tdm_nouns$dimnames$Terms[!tdm_nouns$dimnames$Terms %in% specific_words]
# most frequent words
(freq_words <- findFreqTerms(tdm_nouns, 5))
# find words associated with those high freq words
assocs <- lapply(freq_words, function(i) findAssocs(tdm_nouns, i, 0.3))
names(assocs) <- freq_words
assocs
######## generate topic model ###############
library(topicmodels)
K <- 20 # set number of topics
# drop pseudo-documents with empty names
tdm_nouns <- tdm_nouns[ , tdm_nouns$dimnames$Docs != "" ]
# drop terms that no longer occur in any document
tdm_nouns <- tdm_nouns[ rowSums(as.matrix(tdm_nouns)) != 0, ]
# drop documents that no longer contain any terms
tdm_nouns <- tdm_nouns[ , colSums(as.matrix(tdm_nouns)) != 0 ]
# generate topic model, v. low alpha value
response_topics <- LDA(t(tdm_nouns), K, control = list(alpha = 0.0001))
# inspect results
Topic <- topics(response_topics, 1)
Terms <- terms(response_topics, 20)
Terms[,1:K]
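# optionally inspect the per-document topic probabilities (a sketch using
# topicmodels' posterior() method; 'doc_topic_probs' is a new name, not part of
# the original script)
doc_topic_probs <- posterior(response_topics)$topics
round(head(doc_topic_probs), 3)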
library(lda)
ldafmt <- dtm2ldaformat(t(tdm_nouns))
wc <- word.counts(ldafmt$documents) #get word counts
# generate topic model using the lda package's collapsed Gibbs sampler
K <- 10
result <- lda.collapsed.gibbs.sampler(ldafmt$documents,
                                      K,     # number of topics
                                      ldafmt$vocab,
                                      100,   # number of iterations
                                      0.001, # alpha, after Griffiths & Steyvers 2004
                                      0.1,   # eta
                                      burnin = 100,
                                      compute.log.likelihood = FALSE)
# fits a latent Dirichlet allocation (LDA) model; consider 0.01 for alpha, cf.
# http://www.bytemining.com/2011/08/sigkdd-2011-conference-day-1-graph-mining-and-david-bleitopic-models/
# find top five words per topic to label topics
top.words <- top.topic.words(result$topics, 10, by.score=TRUE) # assign top.words, following the demo(lda)
# make data frame of topics (columns), documents (rows) and probabilities (cells)
topic.proportions <- t(result$document_sums) / colSums(result$document_sums) # assign topic.proportions
topic.proportions[is.na(topic.proportions)] <- 1 / K # etc. from demo(lda)
colnames(topic.proportions) <- apply(top.words, 2, paste, collapse=" ") # assign col names...
topic.proportions <- data.frame(topic.proportions)
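# a sketch of one way to visualise the topic proportions per pseudo-document,
# in the spirit of demo(lda); ggplot2 and reshape2 are already loaded earlier
# in this script, and 'tp' is a new name introduced here for illustration
tp <- topic.proportions
tp$document <- factor(seq_len(nrow(tp)))
tp_m <- melt(tp, id.vars = "document")
ggplot(tp_m, aes(variable, value)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ document) +
  coord_flip() +
  xlab("topic (top words)") +
  ylab("proportion")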