A sketch of how the 'grand challenges' of Kintigh et al. 2014 (http://www.pnas.org/content/111/3/879.full) appear in five archaeology journals
library(devtools)
install_github("benmarwick/JSTORr")
load("~/teamviewer/five_journals/five_journals.RData")
# data from five journals, including American Antiquity, Journal of World Prehistory, and World Archaeology
library(JSTORr)
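# quick optional check of what the unpacked object contains; the steps below
# index into its wordcounts element (a document-term matrix)
str(unpack_multi, max.level = 1)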
# major groups of challenges
emergence <- c("emergence", "communities", "complexity")
JSTOR_1word(unpack_multi, "emergence", span = 0.5, se = FALSE)
resilience <- c("resilience", "persistence", "transformation", "collapse", "decline", "complexity")
JSTOR_1word(unpack_multi, resilience, span = 0.5, se = FALSE)
movement <- c("movement", "mobility", "migration", "migrant", "adaptation")
JSTOR_1word(unpack_multi, movement, span = 0.5, se = FALSE)
cognition <- c("cognition", "behavior", "identity", "identities", "societal")
JSTOR_1word(unpack_multi, cognition, span = 0.5, se = FALSE)
human_environment <- c("environment", "health", "domestication", "agriculture", "environmental", "climate")
JSTOR_1word(unpack_multi, human_environment, span = 0.5, se = FALSE)
other <- c("gender", "indigeneity", "postprocessual", "postcolonial", "phenomenology")
JSTOR_1word(unpack_multi, other, span = 0.5, se = FALSE)
states <- c("state", "empires", "imperial", "domination")
JSTOR_1word(unpack_multi, states, span = 0.5, se = FALSE)
# numbers
challenges <- list(emergence=emergence, resilience=resilience, movement=movement, cognition=cognition, human_environment=human_environment, other=other)
numbers <- data.frame(challenge = names(challenges), word_count = NA, article_count = NA)
for(i in seq_along(challenges)){
  tmp <- as.matrix(unpack_multi$wordcounts[, unpack_multi$wordcounts$dimnames$Terms %in% unlist(unname(challenges[i])) ])
  # how many instances of the words in this group
  numbers$word_count[i] <- sum(colSums(tmp))
  # how many articles contain at least one of the words
  numbers$article_count[i] <- sum(rowSums(tmp) != 0)
  print(i)
}
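# an equivalent vectorised version of the loop above (a sketch; same counting
# logic, just without the explicit loop or the pre-built data frame)
counts <- t(sapply(challenges, function(words){
  tmp <- as.matrix(unpack_multi$wordcounts[, unpack_multi$wordcounts$dimnames$Terms %in% words])
  c(word_count = sum(tmp), article_count = sum(rowSums(tmp) != 0))
}))
counts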
# plot
devtools::install_github("karthik/wesanderson")
library(ggplot2); library(reshape2); library(wesanderson)
numbers_m <- melt(numbers)
ggplot(numbers_m, aes(x = reorder(challenge, -value), value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(values = wes.palette(2, "GrandBudapest")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 14)) +
  xlab("challenge")
nouns <- JSTOR_dtmofnouns(unpack_multi, sparse = 0.75, POStag = TRUE)
K <- 30
# a high alpha value leads to documents containing a more even mixture of topics
topics <- JSTOR_lda(unpack_multi, nouns, K, alpha = 1000/K)
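# to see why (an illustration only, assuming the gtools package is installed):
# document-topic proportions drawn with a large alpha are close to uniform,
# while a small alpha concentrates each document on a few topics
library(gtools)
set.seed(1)
round(rdirichlet(3, rep(1000/30, 30))[, 1:5], 2)  # large alpha (as above): even mixtures
round(rdirichlet(3, rep(0.1, 30))[, 1:5], 2)      # small alpha: a few dominant topics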
hotncold <- JSTOR_lda_hotncoldtopics(topics, pval = 0.05, ma = 10, size = 12)
## just do it for American Antiquity
load("~/teamviewer/kretzler_and_marwick.RData")
# major groups of challenges just in American Antiquity
emergence <- c("emergence", "communities", "complexity")
JSTOR_1word(unpack1grams, "emergence", span = 0.5, se = FALSE)
resilience <- c("resilience", "persistence", "transformation", "collapse", "decline", "complexity")
JSTOR_1word(unpack1grams, resilience, span = 0.5, se = FALSE)
movement <- c("movement", "mobility", "migration", "migrant", "adaptation")
JSTOR_1word(unpack1grams, movement, span = 0.5, se = FALSE)
cognition <- c("cognition", "behavior", "identity", "identities", "societal")
JSTOR_1word(unpack1grams, cognition, span = 0.5, se = FALSE)
human_environment <- c("environment", "health", "domestication", "agriculture", "environmental", "climate")
JSTOR_1word(unpack1grams, human_environment, span = 0.5, se = FALSE)
other <- c("gender", "indigeneity", "postprocessual", "postcolonial", "phenomenology")
JSTOR_1word(unpack1grams, other, span = 0.5, se = FALSE)
states <- c("state", "empires", "imperial", "domination")
JSTOR_1word(unpack1grams, states, span = 0.5, se = FALSE)
# numbers
challenges <- list(emergence=emergence, resilience=resilience, movement=movement, cognition=cognition, human_environment=human_environment, other=other)
numbers <- data.frame(challenge = names(challenges), word_count = NA, article_count = NA)
for(i in seq_along(challenges)){
  tmp <- as.matrix(unpack1grams$wordcounts[, unpack1grams$wordcounts$dimnames$Terms %in% unlist(unname(challenges[i])) ])
  # how many instances of the words in this group
  numbers$word_count[i] <- sum(colSums(tmp))
  # how many articles contain at least one of the words
  numbers$article_count[i] <- sum(rowSums(tmp) != 0)
  print(i)
}
# plot
devtools::install_github("karthik/wesanderson")
library(ggplot2); library(reshape2); library(wesanderson)
numbers_m <- melt(numbers)
ggplot(numbers_m, aes(x = reorder(challenge, -value), value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(values = wes.palette(2, "GrandBudapest")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 14)) +
  xlab("challenge")
## Topic model the Grand challenges raw data in tDAR report
# https://core.tdar.org/document/391233
# convert PDF to text
# windows: see https://gist.github.com/benmarwick/11333467
# linux: sudo apt-get install poppler-utils
report <- "~/teamviewer/five_journals/GrandChallengeCrowdSourcingReport.pdf"
system(paste('"pdftotext"', report, '"out.txt"'), wait = FALSE )
report <- scan('out.txt', what = character())
# where do the responses start?
grep("Responses", report) # 350
# where do they stop?
grep("SECTOR", report) # 3268
# get responses only
responses <- report[350:3268]
# join into one big vector
responses <- paste(responses, collapse = " ")
# get only the text between 'Challenge' and 'Justification',
# i.e. exclude the justifications
responses <- regmatches(responses, gregexpr("(?<=Challenge).*?(?=Justification)", responses, perl=TRUE))
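# toy example of the lookaround pattern used above, on a made-up string:
# it keeps only the spans between "Challenge" and "Justification"
x <- "Challenge explain migration Justification because Challenge date sites Justification"
regmatches(x, gregexpr("(?<=Challenge).*?(?=Justification)", x, perl = TRUE))
# returns " explain migration " and " date sites "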
# join into one big vector (again)
responses <- paste(responses, collapse = " ")
# remove anything that's not a letter (i.e. punctuation, numbers)
responses <- gsub('[^[:alpha:]]',' ', responses)
# split into words
responses <- unlist(strsplit(responses, " "))
# remove empty items
responses <- responses[responses != ""]
# remove words with one or two letters
responses <- responses[!nchar(responses) < 3 ]
# remove a few stopwords
responses <- responses[!(responses %in% tm::stopwords(kind = "en"))]
## pre-process
# split into n-word documents
n <- 100
docs <- split(responses, ceiling(seq_along(responses)/n))
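# how the chunking works: ceiling(seq_along(x)/n) labels consecutive elements
# with the same group number, so split() returns blocks of n elements, e.g.
split(letters, ceiling(seq_along(letters) / 10))  # blocks of 10, 10 and 6 letters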
# make corpus
library(tm)
corpus <- Corpus(VectorSource(docs))
# make term document matrix
tdm <- TermDocumentMatrix(corpus,
                          control = list(removePunctuation = TRUE,
                                         removeNumbers = TRUE,
                                         stopwords = TRUE,
                                         stripWhitespace = TRUE,
                                         tolower = TRUE))
# isolate terms to remove non-nouns
terms <- tdm$dimnames$Terms
# remove punctation (again)
terms <- gsub('[[:punct:]]','', terms)
terms <- gsub('\\\\','', terms)
library(NLP); library(data.table); library(openNLP)
tagPOS <- function(x) {
  s <- NLP::as.String(x)
  ## Need sentence and word token annotations.
  a1 <- NLP::Annotation(1L, "sentence", 1L, nchar(s))
  a2 <- NLP::annotate(s, openNLP::Maxent_Word_Token_Annotator(), a1)
  a3 <- NLP::annotate(s, openNLP::Maxent_POS_Tag_Annotator(), a2)
  ## Keep only the word tokens and extract their POS tags.
  a3w <- a3[a3$type == "word"]
  POStags <- unlist(lapply(a3w$features, `[[`, "POS"))
  ## Token/POS pairs are not needed here:
  # POStagged <- paste(sprintf("%s/%s", s[a3w], POStags), collapse = " ")
  return(POStags)
}
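# example call (assumes the openNLP English models are installed, e.g. the
# openNLPmodels.en package); returns one Penn Treebank tag per token,
# something like "NN" "NN" "VBD" "RB"
tagPOS("pottery production changed rapidly")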
# divide Terms into chunks of 1000 terms each because more than that can cause
# memory problems
terms_split <- split(terms, ceiling(seq_along(terms)/1000))
# loop over each chunk of 1000 terms to do the POS tagging; tagging 10,000 or
# more terms in one go can exhaust the Java heap, so working in small chunks
# keeps memory use safe
terms_split_chunks <- plyr::llply(terms_split, function(i){
  tmp <- paste(gsub("[^[:alnum:]]", " ", i), collapse = " ")
  tmp <- tagPOS(tmp)
  # drop punctuation tags so only word tags remain
  tmp[!tmp %in% c(",", "``", "''", ".")]
}, .progress = "text")
# get all the tags in a vector
terms_split_chunks_out <- unname(c(unlist(terms_split_chunks)))
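# optional sanity check: the subsetting below assumes the tagger returned
# exactly one tag per term; if any term was split into several tokens the two
# vectors will differ in length and the noun filter will be misaligned
length(terms) == length(terms_split_chunks_out)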
# subset document term matrix terms to keep only nouns
tdm_nouns <- tdm[ c(tdm$dimnames$Terms[terms_split_chunks_out == "NN"]), ]
# drop specific words relevant to the grand challenge
specific_words <- c("grand", "challenge", "challenges", "archaeology", "archeology",
"archaeological", "archaeologists", "discipline", "disciplines",
"can", "like", "pay", "many", "well", "way", "non", "yet", "world",
"broad", "future", "across", "questions", "don", "dacs", "understanding",
"still", "make", "made", "researchers", "upon", "among", "ever", "good",
"conduct", "year", "cultural", "projects", "provide", "means", "high",
"get", "next", "coming", "available", "kind", "geo", "insights", "different",
"differences", "ambitious", "continue", "another", "currently" , "nearly",
"particular", "certain", "current", "critical", "era", "taking", "happened",
"changing", "idea", "first", "whether", "key", "facing", "justify", "leveraging",
"identified", "collected", "record", "sciences","framework", "chose", "whatever",
"nothing", "terms", "result", "remains", "scientific", "scientists", "activities",
"perspective", "burn", "american", "doesn", "concepts", "faced", "societies",
"focus", "forms", "either", "adot", "enormous", "fully","largely", "despite",
"similar", "probably", "act", "known", "changes", "partial", "five", "quickly",
"following", "hoc", "relevant", "shift", "life", "linked", "useful", "set",
"developed", "response", "gaining", "hardly", "refine", "patterns", "investigate",
"ground", "sufficient", "argued", "entirely", "proposed", "scholars", "main",
"around", "incomplete", "azsite", "assign", "agree", "oldest", "practices" ,
"innovations", "several", "specifically", "standard","late", "early", "return",
"seem", "return", "constructed", "investigation", "diagnostic", "altered",
"program", "minimum", "minimal","articulate", "consequences", "somehow", "factors",
"seems", "never", "effectively", "sophisticated", "unique", "cnh", "creating",
"knowing", "discovering" , "longer", "problems", "paradigms", "modern", "efficiency",
"foundations", "graphics", "leads", "scripps", "huge", "extensive", "potential",
"contexts", "near", "sense", "scope", "rising", "significant", "occurred",
"manner", "compare", "extend", "exist", "advantage", "interested", "space",
"evident", "chimp", "heck", "pick", "illustrated","space", "tired", "hopes",
"vanity", "stop", "plays", "fundamentally", "slightly", "reasons", "fresh",
"suitably", "volume", "shut", "kept", "togeter", "commented", "latter",
"suggests", "boyd", "say", "hence", "mos", "bruse", "waiting", "steps",
"assuring", "sorely", "anxious", "undergone", "submit", "touted", "assorted",
"equally", "legendary", "sadly", "guesses", "hardesty", "placing", "avoided",
"stored", "boserup", "commonly", "need", "problem", "term", "will", "information",
"help", "basis", "ability", "move", "address", "change", "access",
"barbara", "via", "attempt", "post", "etc", "norm", "cross", "comment",
"step", "compromising", "benefit","gain", "sake", "entity", "chance",
"issue", "couple", "trying", "detail", "ways", "redundancy", "example",
"using")
tdm_nouns <- tdm_nouns[ (!tdm_nouns$dimnames$Terms %in% specific_words), ]
# tdm_nouns$dimnames$Terms[tdm_nouns$dimnames$Terms %in% specific_words]
# tdm_nouns$dimnames$Terms[!tdm_nouns$dimnames$Terms %in% specific_words]
# most frequent words
(freq_words <- findFreqTerms(tdm_nouns, 5))
# find words associated with those high freq words
assocs <- lapply(freq_words, function(i) findAssocs(tdm_nouns, i, 0.3))
names(assocs) <- freq_words
assocs
######## generate topic model ###############
library(topicmodels)
K <- 20 # set number of topics
# drop documents with empty names, then terms that no longer occur in any document
tdm_nouns <- tdm_nouns[ , tdm_nouns$dimnames$Docs != "" ]
tdm_nouns <- tdm_nouns[ rowSums(as.matrix(tdm_nouns)) != 0, ]
# drop documents that contain none of the remaining terms
tdm_nouns <- tdm_nouns[ , colSums(as.matrix(tdm_nouns)) != 0 ]
# generate topic model with a very low alpha, so each document is dominated by few topics
response_topics <- LDA(t(tdm_nouns), K, control = list(alpha = 0.0001))
# inspect results
Topic <- topics(response_topics, 1)
Terms <- terms(response_topics, 20)
Terms[,1:K]
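# the fitted document-topic probabilities can also be inspected directly
# via topicmodels::posterior()
doc_topic_probs <- posterior(response_topics)$topics
round(head(doc_topic_probs), 3)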
library(lda)
ldafmt <- dtm2ldaformat(t(tdm_nouns))
wc <- word.counts(ldafmt$documents) #get word counts
# generate topic model
K <- 10
result <- lda.collapsed.gibbs.sampler(ldafmt$documents,
                                      K,             # number of topics
                                      ldafmt$vocab,
                                      100,           # number of Gibbs sweeps
                                      0.001,         # alpha (Griffiths & Steyvers 2004 suggest 50/K; this much
                                                     # smaller value loads each document on only a few topics)
                                      0.1,           # eta
                                      burnin = 100,
                                      compute.log.likelihood = FALSE)
# fits a latent Dirichlet allocation (LDA) model with a collapsed Gibbs sampler;
# an alpha of around 0.01 is also worth trying, see
# http://www.bytemining.com/2011/08/sigkdd-2011-conference-day-1-graph-mining-and-david-bleitopic-models/
# find top five words per topic to label topics
top.words <- top.topic.words(result$topics, 10, by.score=TRUE) # assign top.words, following the demo(lda)
# make data frame of topics (columns), documents (rows) and probabilities (cells)
topic.proportions <- t(result$document_sums) / colSums(result$document_sums) # assign topic.proportions
topic.proportions[is.na(topic.proportions)] <- 1 / K # etc. from demo(lda)
colnames(topic.proportions) <- apply(top.words, 2, paste, collapse=" ") # assign col names...
topic.proportions <- data.frame(topic.proportions)
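# a quick visual check of the fitted mixtures, along the lines of demo(lda):
# melt the first few documents to long format and plot their topic proportions
topic_props_m <- melt(cbind(topic.proportions[1:5, ], document = 1:5),
                      id.vars = "document", variable.name = "topic")
ggplot(topic_props_m, aes(topic, value, fill = factor(document))) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~ document) +
  theme_minimal() +
  theme(legend.position = "none")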