# Sketch of a look at the 'grand challenges' of Kintigh et al. 2014
# (http://www.pnas.org/content/111/3/879.full) in five archaeology journals
library(devtools)
install_github("benmarwick/JSTORr")
load("~/teamviewer/five_journals/five_journals.RData")
# looking at American Antiquity, Journal of World Prehistory, World Archaeology, ...
library(JSTORr)
# major groups of challenges
emergence <- c("emergence", "communities", "complexity")
JSTOR_1word(unpack_multi, emergence, span = 0.5, se = FALSE)
resilience <- c("resilience", "persistence", "transformation", "collapse", "decline", "complexity")
JSTOR_1word(unpack_multi, resilience, span = 0.5, se = FALSE)
movement <- c("movement", "mobility", "migration", "migrant", "adaptation")
JSTOR_1word(unpack_multi, movement, span = 0.5, se = FALSE)
cognition <- c("cognition", "behavior", "identity", "identities", "societal")
JSTOR_1word(unpack_multi, cognition, span = 0.5, se = FALSE)
human_environment <- c("environment", "health", "domestication", "agriculture", "environmental", "climate")
JSTOR_1word(unpack_multi, human_environment, span = 0.5, se = FALSE)
other <- c("gender", "indigeneity", "postprocessual", "postcolonial", "phenomenology")
JSTOR_1word(unpack_multi, other, span = 0.5, se = FALSE)
states <- c("state", "empires", "imperial", "domination")
JSTOR_1word(unpack_multi, states, span = 0.5, se = FALSE)
# numbers
challenges <- list(emergence = emergence, resilience = resilience, movement = movement,
                   cognition = cognition, human_environment = human_environment, other = other)
numbers <- data.frame(challenge = names(challenges), word_count = NA, article_count = NA)
for(i in seq_along(challenges)){
  tmp <- as.matrix(unpack_multi$wordcounts[, unpack_multi$wordcounts$dimnames$Terms %in% unlist(unname(challenges[i])) ])
  # how many instances of the words in this group
  numbers$word_count[i] <- sum(colSums(tmp))
  # how many articles contain at least one of the words in this group
  numbers$article_count[i] <- sum(rowSums(tmp) > 0)
  print(i)
}
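# (optional sketch) express the article counts as a share of all articles in the corpus;
# this assumes unpack_multi$wordcounts is a standard tm DocumentTermMatrix with one row per article
data.frame(challenge = numbers$challenge,
           article_prop = round(numbers$article_count / nrow(unpack_multi$wordcounts), 3))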
# plot
devtools::install_github("karthik/wesanderson")
library(ggplot2); library(reshape2); library(wesanderson)
numbers_m <- melt(numbers, id.vars = "challenge")
ggplot(numbers_m, aes(x = reorder(challenge, -value), y = value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(values = wes.palette(2, "GrandBudapest")) + # NB: newer wesanderson releases use wes_palette("GrandBudapest1", 2)
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 14)) +
  xlab("challenge")
nouns <- JSTOR_dtmofnouns(unpack_multi, sparse = 0.75, POStag = TRUE)
K <- 30
# a high alpha value will lead to documents being more similar in terms of what topics they contain
topics <- JSTOR_lda(unpack_multi, nouns, K, alpha = 1000/K)
hotncold <- JSTOR_lda_hotncoldtopics(topics, pval = 0.05, ma = 10, size = 12)
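# quick look at the structure of the hot/cold topics result (exact shape depends on the JSTORr version)
str(hotncold, max.level = 1)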
## just do it for American Antiquity
load("~/teamviewer/kretzler_and_marwick.RData")
# major groups of challenges, just in American Antiquity
emergence <- c("emergence", "communities", "complexity")
JSTOR_1word(unpack1grams, emergence, span = 0.5, se = FALSE)
resilience <- c("resilience", "persistence", "transformation", "collapse", "decline", "complexity")
JSTOR_1word(unpack1grams, resilience, span = 0.5, se = FALSE)
movement <- c("movement", "mobility", "migration", "migrant", "adaptation")
JSTOR_1word(unpack1grams, movement, span = 0.5, se = FALSE)
cognition <- c("cognition", "behavior", "identity", "identities", "societal")
JSTOR_1word(unpack1grams, cognition, span = 0.5, se = FALSE)
human_environment <- c("environment", "health", "domestication", "agriculture", "environmental", "climate")
JSTOR_1word(unpack1grams, human_environment, span = 0.5, se = FALSE)
other <- c("gender", "indigeneity", "postprocessual", "postcolonial", "phenomenology")
JSTOR_1word(unpack1grams, other, span = 0.5, se = FALSE)
states <- c("state", "empires", "imperial", "domination")
JSTOR_1word(unpack1grams, states, span = 0.5, se = FALSE)
# numbers
challenges <- list(emergence = emergence, resilience = resilience, movement = movement,
                   cognition = cognition, human_environment = human_environment, other = other)
numbers <- data.frame(challenge = names(challenges), word_count = NA, article_count = NA)
for(i in seq_along(challenges)){
  tmp <- as.matrix(unpack1grams$wordcounts[, unpack1grams$wordcounts$dimnames$Terms %in% unlist(unname(challenges[i])) ])
  # how many instances of the words in this group
  numbers$word_count[i] <- sum(colSums(tmp))
  # how many articles contain at least one of the words in this group
  numbers$article_count[i] <- sum(rowSums(tmp) > 0)
  print(i)
}
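# (optional sketch) as above, article counts as a share of all American Antiquity articles;
# assumes unpack1grams$wordcounts is a tm DocumentTermMatrix with one row per article
data.frame(challenge = numbers$challenge,
           article_prop = round(numbers$article_count / nrow(unpack1grams$wordcounts), 3))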
# plot
devtools::install_github("karthik/wesanderson") # already installed above; repeated so this block runs standalone
library(ggplot2); library(reshape2); library(wesanderson)
numbers_m <- melt(numbers, id.vars = "challenge")
ggplot(numbers_m, aes(x = reorder(challenge, -value), y = value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(values = wes.palette(2, "GrandBudapest")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 14)) +
  xlab("challenge")
## Topic model the Grand Challenges raw data in the tDAR report
# https://core.tdar.org/document/391233
# convert PDF to text
# windows: see https://gist.github.com/benmarwick/11333467
# linux: sudo apt-get install poppler-utils
report <- "~/teamviewer/five_journals/GrandChallengeCrowdSourcingReport.pdf"
system(paste("pdftotext", shQuote(report), "out.txt"), wait = TRUE) # wait so out.txt exists before we read it
report <- scan('out.txt', what = character())
# where do the responses start?
grep("Responses", report) # 350
# where do they stop?
grep("SECTOR", report) # 3268
# get the responses only
responses <- report[350:3268]
# join into one big vector
responses <- paste(responses, collapse = " ")
# keep only the text between 'Challenge' and 'Justification',
# i.e. exclude the justifications
responses <- regmatches(responses, gregexpr("(?<=Challenge).*?(?=Justification)", responses, perl = TRUE))
# join into one big vector (again)
responses <- paste(responses, collapse = " ")
# remove anything that's not a letter (i.e. punctuation, numbers)
responses <- gsub('[^[:alpha:]]', ' ', responses)
# split into words
responses <- unlist(strsplit(responses, " "))
# remove empty items
responses <- responses[responses != ""]
# remove words with only one or two letters
responses <- responses[nchar(responses) >= 3]
# remove a few stopwords
responses <- responses[!(responses %in% tm::stopwords(kind = "en"))]
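# sanity check on the cleaned text: how many words survive, and which are most frequent
# (exact numbers depend on the pdftotext output)
length(responses)
head(sort(table(responses), decreasing = TRUE), 20)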
## pre-process
# split the cleaned words into n-word pseudo-documents
n <- 100
docs <- split(responses, ceiling(seq_along(responses)/n))
# make corpus
library(tm)
corpus <- Corpus(VectorSource(docs))
# make term-document matrix
tdm <- TermDocumentMatrix(corpus,
                          control = list(removePunctuation = TRUE,
                                         removeNumbers = TRUE,
                                         stopwords = TRUE,
                                         stripWhitespace = TRUE,
                                         tolower = TRUE))
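# inspect the size and sparsity of the term-document matrix before POS filtering
tdm
dim(tdm)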
# isolate the terms so we can POS-tag them and remove non-nouns
terms <- tdm$dimnames$Terms
# remove punctuation (again)
terms <- gsub('[[:punct:]]', '', terms)
terms <- gsub('\\\\', '', terms)
library(NLP); library(data.table); library(openNLP)
tagPOS <- function(x) {
  s <- NLP::as.String(x)
  ## Need sentence and word token annotations.
  a1 <- NLP::Annotation(1L, "sentence", 1L, nchar(s))
  a2 <- NLP::annotate(s, openNLP::Maxent_Word_Token_Annotator(), a1)
  a3 <- NLP::annotate(s, openNLP::Maxent_POS_Tag_Annotator(), a2)
  ## Determine the distribution of POS tags for word tokens.
  a3w <- a3[a3$type == "word"]
  POStags <- unlist(lapply(a3w$features, `[[`, "POS"))
  ## Extracting token/POS pairs is easy but not needed here
  # POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
  return(unlist(POStags))
}
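# quick check that the tagger works on a toy string before running it over all terms
# (uses the same openNLP annotators as above; example input is illustrative)
tagPOS("archaeologists study ancient settlements")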
# divide the terms into chunks of 1000 each, because tagging many more than that
# at once can cause memory problems
terms_split <- split(terms, ceiling(seq_along(terms)/1000))
# loop over each chunk of 1000 terms to do the POS tagging; trying to do 10,000
# terms or more at once caused Java memory problems, so this is a conservative
# way to keep memory use down
terms_split_chunks <- plyr::llply(terms_split, function(i){
  tmp <- paste(gsub("[^[:alnum:]]", " ", i), collapse = " ")
  tmp <- tagPOS(tmp)
  tmp <- tmp[!tmp %in% c(",", "``", "''", ".")]
}, .progress = "text")
# collect all the tags into one vector
terms_split_chunks_out <- unname(c(unlist(terms_split_chunks)))
# subset the term-document matrix to keep only nouns
# (assumes the tagger returns exactly one tag per term, in the same order as the Terms)
tdm_nouns <- tdm[ c(tdm$dimnames$Terms[terms_split_chunks_out == "NN"]), ]
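# how many candidate noun terms remain after the POS filtering
length(tdm_nouns$dimnames$Terms)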
# drop specific words relevant to the grand challenge | |
specific_words <- c("grand", "challenge", "challenges", "archaeology", "archeology", | |
"archaeological", "archaeologists", "discipline", "disciplines", | |
"can", "like", "pay", "many", "well", "way", "non", "yet", "world", | |
"broad", "future", "across", "questions", "don", "dacs", "understanding", | |
"still", "make", "made", "researchers", "upon", "among", "ever", "good", | |
"conduct", "year", "cultural", "projects", "provide", "means", "high", | |
"get", "next", "coming", "available", "kind", "geo", "insights", "different", | |
"differences", "ambitious", "continue", "another", "currently" , "nearly", | |
"particular", "certain", "current", "critical", "era", "taking", "happened", | |
"changing", "idea", "first", "whether", "key", "facing", "justify", "leveraging", | |
"identified", "collected", "record", "sciences","framework", "chose", "whatever", | |
"nothing", "terms", "result", "remains", "scientific", "scientists", "activities", | |
"perspective", "burn", "american", "doesn", "concepts", "faced", "societies", | |
"focus", "forms", "either", "adot", "enormous", "fully","largely", "despite", | |
"similar", "probably", "act", "known", "changes", "partial", "five", "quickly", | |
"following", "hoc", "relevant", "shift", "life", "linked", "useful", "set", | |
"developed", "response", "gaining", "hardly", "refine", "patterns", "investigate", | |
"ground", "sufficient", "argued", "entirely", "proposed", "scholars", "main", | |
"around", "incomplete", "azsite", "assign", "agree", "oldest", "practices" , | |
"innovations", "several", "specifically", "standard","late", "early", "return", | |
"seem", "return", "constructed", "investigation", "diagnostic", "altered", | |
"program", "minimum", "minimal","articulate", "consequences", "somehow", "factors", | |
"seems", "never", "effectively", "sophisticated", "unique", "cnh", "creating", | |
"knowing", "discovering" , "longer", "problems", "paradigms", "modern", "efficiency", | |
"foundations", "graphics", "leads", "scripps", "huge", "extensive", "potential", | |
"contexts", "near", "sense", "scope", "rising", "significant", "occurred", | |
"manner", "compare", "extend", "exist", "advantage", "interested", "space", | |
"evident", "chimp", "heck", "pick", "illustrated","space", "tired", "hopes", | |
"vanity", "stop", "plays", "fundamentally", "slightly", "reasons", "fresh", | |
"suitably", "volume", "shut", "kept", "togeter", "commented", "latter", | |
"suggests", "boyd", "say", "hence", "mos", "bruse", "waiting", "steps", | |
"assuring", "sorely", "anxious", "undergone", "submit", "touted", "assorted", | |
"equally", "legendary", "sadly", "guesses", "hardesty", "placing", "avoided", | |
"stored", "boserup", "commonly", "need", "problem", "term", "will", "information", | |
"help", "basis", "ability", "move", "address", "change", "access", | |
"barbara", "via", "attempt", "post", "etc", "norm", "cross", "comment", | |
"step", "compromising", "benefit","gain", "sake", "entity", "chance", | |
"issue", "couple", "trying", "detail", "ways", "redundancy", "example", | |
"using") | |
tdm_nouns <- tdm_nouns[ (!tdm_nouns$dimnames$Terms %in% specific_words), ] | |
# tdm_nouns$dimnames$Terms[tdm_nouns$dimnames$Terms %in% specific_words] | |
# tdm_nouns$dimnames$Terms[!tdm_nouns$dimnames$Terms %in% specific_words] | |
# most frequent words
(freq_words <- findFreqTerms(tdm_nouns, 5))
# find words associated with those high-frequency words
assocs <- lapply(freq_words, function(i) findAssocs(tdm_nouns, i, 0.3))
names(assocs) <- freq_words
assocs
######## generate topic model ###############
library(topicmodels)
K <- 20 # set number of topics
# drop documents (columns) with empty names and terms (rows) that never occur
tdm_nouns <- tdm_nouns[ , tdm_nouns$dimnames$Docs != "" ]
tdm_nouns <- tdm_nouns[ rowSums(as.matrix(tdm_nouns)) != 0, ]
# drop documents (columns) that are left with no terms
tdm_nouns <- tdm_nouns[ , colSums(as.matrix(tdm_nouns)) != 0 ]
# generate topic model with a very low alpha value (each document dominated by few topics)
response_topics <- LDA(t(tdm_nouns), K, control = list(alpha = 0.0001))
# inspect results
Topic <- topics(response_topics, 1)
Terms <- terms(response_topics, 20)
Terms[, 1:K]
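# (optional) save the topic-term table for later inspection; filename is illustrative
write.csv(Terms, "grand_challenge_response_topics.csv", row.names = FALSE)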
library(lda)
ldafmt <- dtm2ldaformat(t(tdm_nouns))
wc <- word.counts(ldafmt$documents) # get word counts
# generate topic model
K <- 10
result <- lda.collapsed.gibbs.sampler(ldafmt$documents,
                                      K,            # number of topics
                                      ldafmt$vocab,
                                      100,          # number of iterations
                                      0.001,        # alpha (very sparse; cf. Griffiths & Steyvers 2004, who suggest 50/K)
                                      0.1,          # eta, after Griffiths & Steyvers 2004
                                      burnin = 100,
                                      compute.log.likelihood = FALSE)
# lda.collapsed.gibbs.sampler uses a collapsed Gibbs sampler to fit a latent Dirichlet
# allocation (LDA) model; consider 0.01 for alpha, cf.
# http://www.bytemining.com/2011/08/sigkdd-2011-conference-day-1-graph-mining-and-david-bleitopic-models/
# find the top words per topic to label the topics
top.words <- top.topic.words(result$topics, 10, by.score = TRUE) # following demo(lda)
# make a data frame of topics (columns), documents (rows) and probabilities (cells)
topic.proportions <- t(result$document_sums) / colSums(result$document_sums)
topic.proportions[is.na(topic.proportions)] <- 1 / K # empty documents get uniform proportions, as in demo(lda)
colnames(topic.proportions) <- apply(top.words, 2, paste, collapse = " ")
topic.proportions <- data.frame(topic.proportions)
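# (optional sketch, not part of demo(lda)) visualise the mean proportion of each topic
# across the 100-word pseudo-documents; topic labels are the top words assigned above
library(ggplot2)
topic_means <- colMeans(topic.proportions)
topic_means_df <- data.frame(topic = names(topic_means), mean_proportion = unname(topic_means))
ggplot(topic_means_df, aes(x = reorder(topic, mean_proportion), y = mean_proportion)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("topic (top words)") +
  ylab("mean proportion across pseudo-documents") +
  theme_minimal()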