Skip to content

Instantly share code, notes, and snippets.

@agoldst
Created July 11, 2014 12:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save agoldst/d9782993efa8c977b2a6 to your computer and use it in GitHub Desktop.
Save agoldst/d9782993efa8c977b2a6 to your computer and use it in GitHub Desktop.
R code used to produce the slides for this DH 2014 presentation: http://andrewgoldstone.com/blog/2014/07/02/dh2014/ . Generated by knitr::purl()
opts_chunk$set(echo=F,warning=F,prompt=F,comment="",
autodep=T,cache=T,dev="tikz",
fig.width=4.5,fig.height=3,size ='footnotesize',
dev.args=list(pointsize=12))
options(width=70)
options(tikzDefaultEngine="xetex")
options(tikzXelatexPackages=c(
"\\usepackage{tikz}\n",
"\\usepackage[active,tightpage,xetex]{preview}\n",
"\\usepackage{fontspec,xunicode}\n",
"\\setmainfont{Gill Sans}\n",
"\\PreviewEnvironment{pgfpicture}\n",
"\\setlength\\PreviewBorder{0pt}\n"))
library("xtable")
library("lubridate")
library("stringr")
library("dfrtopics")
dep_auto()
smoother <- function (...) {
geom_smooth(method="loess",span=0.5,fill="grey60",se=F,...)
}
mdir <- "/Users/agoldst/Documents/research/20c/hls/tmhls/dh2014/hls_v10K_k120"
meta <- read_metadata(file.path(mdir, "dfr-data",
c("elh_ci_all",
"mlr1905-1970",
"mlr1971-2013",
"modphil_all",
"nlh_all",
"pmla_all",
"res1925-1980",
"res1981-2012"),
"citations.CSV"))
m <- list()
m$keys <- read.csv(file.path(mdir,"keys.csv"),as.is=T)
m$doctops <- read.csv(file.path(mdir,"doc_topics.csv"),as.is=T)
m$n <- max(m$keys$topic)
m$vocab <- readLines(file.path(mdir,"vocab.txt"))
meta <- meta[meta$id %in% m$doctops$id,]
topic_classes <- read.csv(file.path(mdir,"keys_classed.csv"),as.is=T)
m$dtw <- doc_topics_wide(m$doctops,meta)
m$series <- topic_proportions_series_frame(topic_year_matrix(m$dtw))
m$series$decade <- cut.Date(as.Date(m$series$year),
breaks=seq.Date(from=as.Date("1880-01-01"),
to=as.Date("2020-01-01"),
by="10 years"))
m$dtw$decade <- cut.Date(as.Date(m$dtw$pubdate),
breaks=seq.Date(from=as.Date("1880-01-01"),
to=as.Date("2020-01-01"),
by="10 years"))
js <- ddply(meta,"journaltitle",summarize,
name=str_trim(unique(journaltitle)),start=min(pubdate),
end=max(pubdate))
js <- js[order(js$start),]
js$name <- str_c("*",js$name,"*")
cat(str_c(str_c(js$name," (",year(js$start),"--",year(js$end),")"),
collapse=" \n"))
top_words <- ddply(m$keys,"topic",transform,rank=order(weight,decreasing=T))
top_words <- top_words[top_words$rank <= 3,]
top_words$x <- (top_words$topic - 1) %% 12
top_words$y <- floor((top_words$topic - 1) / 12)
top_words$y <- top_words$y + rep(c(0,-0.25,0.25),times=nrow(top_words) / 3)
top_words$y <- -top_words$y
ggplot(top_words,aes(x=x,y=y,label=word,size=weight)) + geom_text() +
scale_size_continuous(range=c(2,3)) +
theme(legend.position="none",
line=element_blank(),
rect=element_blank(),
title=element_blank(),
axis.text=element_blank())
slope_discrete <- function (decade, frac, cutoff,
earliest="1920-01-01",
latest="2010-01-01") {
earlier <- frac[as.Date(decade) >= as.Date(earliest) &
as.Date(decade) < as.Date(cutoff)]
later <- frac[as.Date(decade) > as.Date(cutoff) &
as.Date(decade) <= as.Date(latest)]
score <- sum(rep(earlier,times=length(later)) <
rep(later,each=length(earlier)))
score / (length(earlier) * length(later))
}
recency_cutoff <- "1980-01-01"
topic_decades <- t(daply(m$dtw[,-1], "decade", function (d) colSums(d[,1:m$n])))
series_dec <- topic_proportions_series_frame(topic_decades)
recent_topics <- function (s,cutoff) {
topic_slopes <- ddply(s,"topic", function (d) {
data.frame(topic=d$topic[1],
slope=slope_discrete(d$year,d$weight,cutoff))
})
topic_slopes$topic[topic_slopes$slope == 1]
}
recents <- recent_topics(series_dec,recency_cutoff)
recent_series <- m$series[m$series$topic %in% recents,]
topic_names <- daply(m$keys[m$keys$topic %in% recents,],"topic",
function (d) {
paste(paste(d$word[1:5],collapse=" "),
sep="")
})
recent_series$topic <- factor(recent_series$topic,labels=topic_names)
m$series$recent <- m$series$topic %in% recents
ggplot(m$series,aes(year,weight,
color=recent)) +
smoother() +
facet_wrap(~ topic,nrow=12,scales="free_y") +
theme(legend.position="none",
rect=element_blank(),
title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
strip.text=element_blank(),
panel.grid=element_blank()) +
scale_color_manual(values=c("blue","orange"))
tnames <- str_c("• ",levels(recent_series$topic))
cat(str_c(tnames,collapse="\\\\ "))
topic_classes$recent <- ifelse(topic_classes$topic %in% recents,
"recent","not recent")
tally <- as.matrix(with(topic_classes,table(code,recent)))
print(xtable(tally,digits=0),comment=F)
decs <- seq.Date(from=as.Date("1950-01-01"),
to=as.Date("2000-01-01"),
by="10 years")
slps <- list()
rs <- list()
socs <- matrix(nrow=length(decs),ncol=3,
dimnames=list(as.character(decs),
c("total recent",
"S not recent",
"S recent")))
for (d in seq_along(decs)) {
rs[[d]] <- recent_topics(series_dec,decs[d]) # series_dec calc'd above
slps[[d]] <- ddply(series_dec,.(topic),summarize,
cutoff=decs[d],
slope=slope_discrete(year,weight,decs[d]))
tab <- table(topic_classes$code,topic_classes$topic %in% rs[[d]])
socs[d,1] <- length(rs[[d]])
socs[d,2:3] <- tab["S",]
}
socs <- cbind(year(decs),socs)
colnames(socs)[1] <- "cutoff year"
print(xtable(socs,digits=0),include.rownames=F,comment=F)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment