Sketch of a look at the 'grand challenges' of Kintigh et al. 2014 (http://www.pnas.org/content/111/3/879.full) in five archaeology journals
library(devtools)
install_github("benmarwick/JSTORr")
load("~/teamviewer/five_journals/five_journals.RData")
# looking at five journals, including American Antiquity, Journal of World Prehistory, and World Archaeology
library(JSTORr)
# major groups of challenges
emergence <- c("emergence", "communities", "complexity")
JSTOR_1word(unpack_multi, emergence, span = 0.5, se = FALSE)
resilience <- c("resilience", "persistence", "transformation", "collapse", "decline", "complexity")
JSTOR_1word(unpack_multi, resilience, span = 0.5, se = FALSE)
movement <- c("movement", "mobility", "migration", "migrant", "adaptation")
JSTOR_1word(unpack_multi, movement, span = 0.5, se = FALSE)
cognition <- c("cognition", "behavior", "identity", "identities", "societal")
JSTOR_1word(unpack_multi, cognition, span = 0.5, se = FALSE)
human_environment <- c("environment", "health", "domestication", "agriculture", "environmental", "climate")
JSTOR_1word(unpack_multi, human_environment, span = 0.5, se = FALSE)
other <- c("gender", "indigeneity", "postprocessual", "postcolonial", "phenomenology")
JSTOR_1word(unpack_multi, other, span = 0.5, se = FALSE)
states <- c("state", "empires", "imperial", "domination")
JSTOR_1word(unpack_multi, states, span = 0.5, se = FALSE)
# numbers
challenges <- list(emergence=emergence, resilience=resilience, movement=movement, cognition=cognition, human_environment=human_environment, other=other)
numbers <- data.frame(challenge = names(challenges), word_count = NA, article_count = NA)
for(i in 1:length(challenges)){
  tmp <- as.matrix(unpack_multi$wordcounts[, unpack_multi$wordcounts$dimnames$Terms %in% unlist(unname(challenges[i])) ])
  numbers$word_count[i] <- sum(colSums(tmp))
  # how many instances of the word
  numbers$article_count[i] <- sum(tmp!=0)
  # how many articles with the word
  print(i)
}
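# The same table could also be built without the explicit loop. This is only a
# sketch of an equivalent computation, assuming (as in the loop above) that
# unpack_multi$wordcounts is a documents-x-terms matrix from JSTORr; numbers_alt
# is a new name introduced here for illustration only:
numbers_alt <- do.call(rbind, lapply(names(challenges), function(ch) {
  m <- as.matrix(unpack_multi$wordcounts[, unpack_multi$wordcounts$dimnames$Terms %in% challenges[[ch]]])
  data.frame(challenge = ch, word_count = sum(m), article_count = sum(m != 0))
}))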
# plot
devtools::install_github("karthik/wesanderson")
library(ggplot2); library(reshape2); library(wesanderson)
numbers_m <- melt(numbers)
ggplot(numbers_m, aes(x=reorder(challenge, -value), value, fill = variable)) +
  geom_bar(position = "dodge", stat="identity") +
  scale_fill_manual(values = wes.palette(2, "GrandBudapest")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 14)) +
  xlab("challenge")
nouns <- JSTOR_dtmofnouns(unpack_multi, sparse = 0.75, POStag = TRUE)
K <- 30
# high alpha-value will lead to documents being more similar in terms of what topics they contain
topics <- JSTOR_lda(unpack_multi, nouns, K, alpha = 1000/K)
hotncold <- JSTOR_lda_hotncoldtopics(topics, pval = 0.05, ma = 10, size = 12)
## just do it for American Antiquity
load("~/teamviewer/kretzler_and_marwick.RData")
# major groups of challenges just in American Antiquity
emergence <- c("emergence", "communities", "complexity")
JSTOR_1word(unpack1grams, "emergence", span = 0.5, se = FALSE) | |
resilience <- c("resilience", "persistence", "transformation", "collapse", "decline", "complexity")
JSTOR_1word(unpack1grams, resilience, span = 0.5, se = FALSE)
movement <- c("movement", "mobility", "migration", "migrant", "adaptation")
JSTOR_1word(unpack1grams, movement, span = 0.5, se = FALSE)
cognition <- c("cognition", "behavior", "identity", "identities", "societal")
JSTOR_1word(unpack1grams, cognition, span = 0.5, se = FALSE)
human_environment <- c("environment", "health", "domestication", "agriculture", "environmental", "climate")
JSTOR_1word(unpack1grams, human_environment, span = 0.5, se = FALSE)
other <- c("gender", "indigeneity", "postprocessual", "postcolonial", "phenomenology")
JSTOR_1word(unpack1grams, other, span = 0.5, se = FALSE)
states <- c("state", "empires", "imperial", "domination")
JSTOR_1word(unpack1grams, states, span = 0.5, se = FALSE)
# numbers
challenges <- list(emergence=emergence, resilience=resilience, movement=movement, cognition=cognition, human_environment=human_environment, other=other)
numbers <- data.frame(challenge = names(challenges), word_count = NA, article_count = NA)
for(i in 1:length(challenges)){
  tmp <- as.matrix(unpack1grams$wordcounts[, unpack1grams$wordcounts$dimnames$Terms %in% unlist(unname(challenges[i])) ])
  numbers$word_count[i] <- sum(colSums(tmp))
  # how many instances of the word
  numbers$article_count[i] <- sum(tmp!=0)
  # how many articles with the word
  print(i)
}
# plot
devtools::install_github("karthik/wesanderson")
library(ggplot2); library(reshape2); library(wesanderson)
numbers_m <- melt(numbers)
ggplot(numbers_m, aes(x=reorder(challenge, -value), value, fill = variable)) +
  geom_bar(position = "dodge", stat="identity") +
  scale_fill_manual(values = wes.palette(2, "GrandBudapest")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 14)) +
  xlab("challenge")
## Topic model the Grand challenges raw data in tDAR report
# https://core.tdar.org/document/391233
# convert PDF to text
# windows: see https://gist.github.com/benmarwick/11333467
# linux: sudo apt-get install poppler-utils
report <- "~/teamviewer/five_journals/GrandChallengeCrowdSourcingReport.pdf"
# run pdftotext and wait for it to finish before reading the output file
system(paste('"pdftotext"', report, '"out.txt"'), wait = TRUE)
report <- scan('out.txt', what = character())
# where do the responses start?
grep("Responses", report) # 350
# where do they stop?
grep("SECTOR", report) # 3268
# get responses only
responses <- report[350:3268]
# join into one big vector
responses <- paste(responses, collapse = " ")
# get only text between 'Challenge' and 'Justification'
# i.e. exclude the justifications
responses <- regmatches(responses, gregexpr("(?<=Challenge).*?(?=Justification)", responses, perl=TRUE))
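# A toy check of the lookaround pattern above (illustrative only; the input
# string here is made up, not taken from the report):
toy <- "Challenge date sites better Justification because chronology"
regmatches(toy, gregexpr("(?<=Challenge).*?(?=Justification)", toy, perl = TRUE))
# should return " date sites better "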
# join into one big vector (again)
responses <- paste(responses, collapse = " ")
# remove anything that's not a letter (i.e. punctuation, numbers)
responses <- gsub('[^[:alpha:]]',' ', responses)
# split into words
responses <- unlist(strsplit(responses, " "))
# remove empty items
responses <- responses[responses != ""]
# remove words with one or two letters
responses <- responses[!nchar(responses) < 3 ]
# remove a few stopwords
responses <- responses[!(responses %in% tm::stopwords(kind = "en"))]
## pre-process
# split into n-word documents
n <- 100
docs <- split(responses, ceiling(seq_along(responses)/n))
# make corpus
library(tm)
corpus <- Corpus(VectorSource(docs))
# make term document matrix
tdm <- TermDocumentMatrix(corpus,
                          control = list(removePunctuation = TRUE,
                                         removeNumbers = TRUE,
                                         stopwords = TRUE,
                                         stripWhitespace = TRUE,
                                         tolower = TRUE))
# isolate terms to remove non-nouns
terms <- tdm$dimnames$Terms
# remove punctuation (again)
terms <- gsub('[[:punct:]]','', terms)
terms <- gsub('\\\\','', terms)
library(NLP); library(data.table); library(openNLP)
tagPOS <- function(x) {
  s <- NLP::as.String(x)
  ## Need sentence and word token annotations.
  a1 <- NLP::Annotation(1L, "sentence", 1L, nchar(s))
  a2 <- NLP::annotate(s, openNLP::Maxent_Word_Token_Annotator(), a1)
  a3 <- NLP::annotate(s, openNLP::Maxent_POS_Tag_Annotator(), a2)
  ## Determine the distribution of POS tags for word tokens.
  a3w <- a3[a3$type == "word"]
  POStags <- unlist(lapply(a3w$features, `[[`, "POS"))
  ## Extract token/POS pairs (all of them): easy - not needed
  # POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
  return(unlist(POStags))
}
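# Quick illustrative check of tagPOS() on a made-up phrase (requires the
# openNLP English models from openNLPmodels.en; the exact tags returned are
# not guaranteed here, but there should be one tag per token):
tagPOS("settlement patterns change through time")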
# divide the terms into chunks of 1000 because larger batches can cause
# memory problems
terms_split <- split(terms, ceiling(seq_along(terms)/1000))
# loop over each chunk of 1000 terms to do the POS tagging; tagging 10,000 or
# more terms at once can exhaust Java's memory, so this is a conservative way
# to avoid filling memory
terms_split_chunks <- plyr::llply(terms_split, function(i){
  tmp <- paste(gsub("[^[:alnum:]]", " ", i), collapse = " ")
  tmp <- tagPOS(tmp)
  tmp <- tmp[!tmp %in% c(",", "``", "''", ".")]
}, .progress = "text")
# get all the tags in a vector
terms_split_chunks_out <- unname(c(unlist(terms_split_chunks)))
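# A simple sanity check (a sketch only): the subsetting below assumes the tag
# vector lines up one-to-one with the Terms of the term-document matrix
stopifnot(length(terms_split_chunks_out) == length(tdm$dimnames$Terms))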
# subset document term matrix terms to keep only nouns
tdm_nouns <- tdm[ c(tdm$dimnames$Terms[terms_split_chunks_out == "NN"]), ]
# drop specific words relevant to the grand challenge
specific_words <- c("grand", "challenge", "challenges", "archaeology", "archeology",
                    "archaeological", "archaeologists", "discipline", "disciplines",
                    "can", "like", "pay", "many", "well", "way", "non", "yet", "world",
                    "broad", "future", "across", "questions", "don", "dacs", "understanding",
                    "still", "make", "made", "researchers", "upon", "among", "ever", "good",
                    "conduct", "year", "cultural", "projects", "provide", "means", "high",
                    "get", "next", "coming", "available", "kind", "geo", "insights", "different",
                    "differences", "ambitious", "continue", "another", "currently", "nearly",
                    "particular", "certain", "current", "critical", "era", "taking", "happened",
                    "changing", "idea", "first", "whether", "key", "facing", "justify", "leveraging",
                    "identified", "collected", "record", "sciences", "framework", "chose", "whatever",
                    "nothing", "terms", "result", "remains", "scientific", "scientists", "activities",
                    "perspective", "burn", "american", "doesn", "concepts", "faced", "societies",
                    "focus", "forms", "either", "adot", "enormous", "fully", "largely", "despite",
                    "similar", "probably", "act", "known", "changes", "partial", "five", "quickly",
                    "following", "hoc", "relevant", "shift", "life", "linked", "useful", "set",
                    "developed", "response", "gaining", "hardly", "refine", "patterns", "investigate",
                    "ground", "sufficient", "argued", "entirely", "proposed", "scholars", "main",
                    "around", "incomplete", "azsite", "assign", "agree", "oldest", "practices",
                    "innovations", "several", "specifically", "standard", "late", "early", "return",
                    "seem", "return", "constructed", "investigation", "diagnostic", "altered",
                    "program", "minimum", "minimal", "articulate", "consequences", "somehow", "factors",
                    "seems", "never", "effectively", "sophisticated", "unique", "cnh", "creating",
                    "knowing", "discovering", "longer", "problems", "paradigms", "modern", "efficiency",
                    "foundations", "graphics", "leads", "scripps", "huge", "extensive", "potential",
                    "contexts", "near", "sense", "scope", "rising", "significant", "occurred",
                    "manner", "compare", "extend", "exist", "advantage", "interested", "space",
                    "evident", "chimp", "heck", "pick", "illustrated", "space", "tired", "hopes",
                    "vanity", "stop", "plays", "fundamentally", "slightly", "reasons", "fresh",
                    "suitably", "volume", "shut", "kept", "togeter", "commented", "latter",
                    "suggests", "boyd", "say", "hence", "mos", "bruse", "waiting", "steps",
                    "assuring", "sorely", "anxious", "undergone", "submit", "touted", "assorted",
                    "equally", "legendary", "sadly", "guesses", "hardesty", "placing", "avoided",
                    "stored", "boserup", "commonly", "need", "problem", "term", "will", "information",
                    "help", "basis", "ability", "move", "address", "change", "access",
                    "barbara", "via", "attempt", "post", "etc", "norm", "cross", "comment",
                    "step", "compromising", "benefit", "gain", "sake", "entity", "chance",
                    "issue", "couple", "trying", "detail", "ways", "redundancy", "example",
                    "using")
tdm_nouns <- tdm_nouns[ (!tdm_nouns$dimnames$Terms %in% specific_words), ]
# tdm_nouns$dimnames$Terms[tdm_nouns$dimnames$Terms %in% specific_words]
# tdm_nouns$dimnames$Terms[!tdm_nouns$dimnames$Terms %in% specific_words]
# most frequent words
(freq_words <- findFreqTerms(tdm_nouns, 5))
# find words associated with those high freq words
assocs <- lapply(freq_words, function(i) findAssocs(tdm_nouns, i, 0.3))
names(assocs) <- freq_words
assocs
######## generate topic model ###############
library(topicmodels)
K <- 20 # set number of topics
# drop documents with empty names and terms (rows) with zero counts
tdm_nouns <- tdm_nouns[ , tdm_nouns$dimnames$Docs != "" ]
tdm_nouns <- tdm_nouns[ rowSums(as.matrix(tdm_nouns)) != 0, ]
# drop documents (columns) with zero counts
tdm_nouns <- tdm_nouns[ , colSums(as.matrix(tdm_nouns)) != 0 ]
# generate topic model with a very low alpha value
response_topics <- LDA(t(tdm_nouns), K, control = list(alpha = 0.0001))
# inspect results
Topic <- topics(response_topics, 1)
Terms <- terms(response_topics, 20)
Terms[, 1:K]
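# The per-document topic probabilities could also be inspected (a sketch only;
# posterior() is the topicmodels accessor, doc_topic_probs is a name introduced
# here for illustration):
doc_topic_probs <- posterior(response_topics)$topics
round(head(doc_topic_probs), 2)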
library(lda)
ldafmt <- dtm2ldaformat(t(tdm_nouns))
wc <- word.counts(ldafmt$documents) # get word counts
# generate topic model
K <- 10
result <- lda.collapsed.gibbs.sampler(ldafmt$documents,
                                      K, # number of topics
                                      ldafmt$vocab,
                                      100, # number of iterations
                                      0.001, # alpha, after Griffiths & Steyvers 2004
                                      0.1, # eta
                                      burnin = 100,
                                      compute.log.likelihood = FALSE
) # uses a collapsed Gibbs sampler to fit a latent Dirichlet allocation (LDA) model;
# consider 0.01 for alpha: http://www.bytemining.com/2011/08/sigkdd-2011-conference-day-1-graph-mining-and-david-bleitopic-models/
# find the top ten words per topic to label the topics
top.words <- top.topic.words(result$topics, 10, by.score=TRUE) # assign top.words, following the demo(lda)
# make data frame of topics (columns), documents (rows) and probabilities (cells)
topic.proportions <- t(result$document_sums) / colSums(result$document_sums) # assign topic.proportions
topic.proportions[is.na(topic.proportions)] <- 1 / K # etc. from demo(lda)
colnames(topic.proportions) <- apply(top.words, 2, paste, collapse=" ") # assign col names...
topic.proportions <- data.frame(topic.proportions)
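# A possible next step (a sketch only, in the spirit of demo(lda)): visualise
# the topic proportions of the first few 100-word documents as faceted bar
# charts; tp and tp_m are names introduced here for illustration.
library(reshape2); library(ggplot2)
tp <- topic.proportions
tp$document <- seq_len(nrow(tp))
tp_m <- melt(tp[1:5, ], id.vars = "document",
             variable.name = "topic", value.name = "proportion")
ggplot(tp_m, aes(x = topic, y = proportion)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~ document) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 6))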