agoldst/dh2014-slides.R

## dh2014-slides.R


opts_chunk$set(echo=F,warning=F,prompt=F,comment="",
               autodep=T,cache=T,dev="tikz",
               fig.width=4.5,fig.height=3,size ='footnotesize',
               dev.args=list(pointsize=12))
options(width=70)
options(tikzDefaultEngine="xetex")
options(tikzXelatexPackages=c(
    "\\usepackage{tikz}\n",
    "\\usepackage[active,tightpage,xetex]{preview}\n",
    "\\usepackage{fontspec,xunicode}\n",
    "\\setmainfont{Gill Sans}\n",
    "\\PreviewEnvironment{pgfpicture}\n",
    "\\setlength\\PreviewBorder{0pt}\n"))
library("xtable")
library("lubridate")
library("stringr")
library("dfrtopics")
dep_auto()


smoother <- function (...) {
  geom_smooth(method="loess",span=0.5,fill="grey60",se=F,...)
}


mdir <- "/Users/agoldst/Documents/research/20c/hls/tmhls/dh2014/hls_v10K_k120"
meta <- read_metadata(file.path(mdir, "dfr-data",
                                c("elh_ci_all",
                                  "mlr1905-1970",
                                   "mlr1971-2013",
                                   "modphil_all",
                                   "nlh_all",
                                   "pmla_all",
                                   "res1925-1980",
                                   "res1981-2012"),
                                "citations.CSV"))

m <- list()
m$keys <- read.csv(file.path(mdir,"keys.csv"),as.is=T)
m$doctops <- read.csv(file.path(mdir,"doc_topics.csv"),as.is=T)
m$n <- max(m$keys$topic)
m$vocab <- readLines(file.path(mdir,"vocab.txt"))
meta <- meta[meta$id %in% m$doctops$id,]

topic_classes <- read.csv(file.path(mdir,"keys_classed.csv"),as.is=T)
m$dtw <- doc_topics_wide(m$doctops,meta)
m$series <- topic_proportions_series_frame(topic_year_matrix(m$dtw))

m$series$decade <- cut.Date(as.Date(m$series$year),
                            breaks=seq.Date(from=as.Date("1880-01-01"),
                                            to=as.Date("2020-01-01"),
                                            by="10 years"))

m$dtw$decade <- cut.Date(as.Date(m$dtw$pubdate),
                         breaks=seq.Date(from=as.Date("1880-01-01"),
                                         to=as.Date("2020-01-01"),
                                         by="10 years"))


js <- ddply(meta,"journaltitle",summarize,
            name=str_trim(unique(journaltitle)),start=min(pubdate),
            end=max(pubdate))
js <- js[order(js$start),]
js$name <- str_c("*",js$name,"*")
cat(str_c(str_c(js$name," (",year(js$start),"--",year(js$end),")"),
          collapse="  \n"))


top_words <- ddply(m$keys,"topic",transform,rank=order(weight,decreasing=T))
top_words <- top_words[top_words$rank <= 3,]
top_words$x <- (top_words$topic - 1) %% 12
top_words$y <- floor((top_words$topic - 1) / 12)
top_words$y <- top_words$y + rep(c(0,-0.25,0.25),times=nrow(top_words) / 3)
top_words$y <- -top_words$y

ggplot(top_words,aes(x=x,y=y,label=word,size=weight)) + geom_text() +
    scale_size_continuous(range=c(2,3)) +
    theme(legend.position="none",
          line=element_blank(),
          rect=element_blank(),
          title=element_blank(),
          axis.text=element_blank())


slope_discrete <- function (decade, frac, cutoff,
                            earliest="1920-01-01",
                            latest="2010-01-01") {
    earlier <- frac[as.Date(decade) >= as.Date(earliest) &
                    as.Date(decade) < as.Date(cutoff)]
    later <- frac[as.Date(decade) > as.Date(cutoff) &
                  as.Date(decade) <= as.Date(latest)]
    score <- sum(rep(earlier,times=length(later)) <
        rep(later,each=length(earlier)))

    score / (length(earlier) * length(later))
}


recency_cutoff <- "1980-01-01"


topic_decades <- t(daply(m$dtw[,-1], "decade", function (d) colSums(d[,1:m$n])))
series_dec <- topic_proportions_series_frame(topic_decades)

recent_topics <- function (s,cutoff) {
    topic_slopes <- ddply(s,"topic", function (d) {
        data.frame(topic=d$topic[1],
                   slope=slope_discrete(d$year,d$weight,cutoff))
    })
    topic_slopes$topic[topic_slopes$slope == 1]
}
recents <- recent_topics(series_dec,recency_cutoff)


recent_series <- m$series[m$series$topic %in% recents,]
topic_names <- daply(m$keys[m$keys$topic %in% recents,],"topic",
                     function (d) {
                         paste(paste(d$word[1:5],collapse=" "),
                               sep="")
                     })
recent_series$topic <- factor(recent_series$topic,labels=topic_names)


m$series$recent <- m$series$topic %in% recents
ggplot(m$series,aes(year,weight,
                    color=recent)) +
    smoother() +
    facet_wrap(~ topic,nrow=12,scales="free_y") +
    theme(legend.position="none",
          rect=element_blank(),
          title=element_blank(),
          axis.text=element_blank(),
          axis.ticks=element_blank(),
          strip.text=element_blank(),
          panel.grid=element_blank()) +
      scale_color_manual(values=c("blue","orange"))


tnames <- str_c("• ",levels(recent_series$topic))
cat(str_c(tnames,collapse="\\\\ "))


topic_classes$recent <- ifelse(topic_classes$topic %in% recents,
                               "recent","not recent")
tally <- as.matrix(with(topic_classes,table(code,recent)))
print(xtable(tally,digits=0),comment=F)


decs <- seq.Date(from=as.Date("1950-01-01"),
                 to=as.Date("2000-01-01"),
                 by="10 years")

slps <- list()
rs <- list()
socs <- matrix(nrow=length(decs),ncol=3,
               dimnames=list(as.character(decs),
                             c("total recent",
                               "S not recent",
                               "S recent")))
for (d in seq_along(decs)) {
    rs[[d]] <- recent_topics(series_dec,decs[d]) # series_dec calc'd above
    slps[[d]] <- ddply(series_dec,.(topic),summarize,
                       cutoff=decs[d],
                       slope=slope_discrete(year,weight,decs[d]))
    tab <- table(topic_classes$code,topic_classes$topic %in% rs[[d]])
    socs[d,1] <- length(rs[[d]])
    socs[d,2:3] <- tab["S",]
}
socs <- cbind(year(decs),socs)
colnames(socs)[1] <- "cutoff year"
print(xtable(socs,digits=0),include.rownames=F,comment=F)


	opts_chunk$set(echo=F,warning=F,prompt=F,comment="",
	autodep=T,cache=T,dev="tikz",
	fig.width=4.5,fig.height=3,size ='footnotesize',
	dev.args=list(pointsize=12))
	options(width=70)
	options(tikzDefaultEngine="xetex")
	options(tikzXelatexPackages=c(
	"\\usepackage{tikz}\n",
	"\\usepackage[active,tightpage,xetex]{preview}\n",
	"\\usepackage{fontspec,xunicode}\n",
	"\\setmainfont{Gill Sans}\n",
	"\\PreviewEnvironment{pgfpicture}\n",
	"\\setlength\\PreviewBorder{0pt}\n"))
	library("xtable")
	library("lubridate")
	library("stringr")
	library("dfrtopics")
	dep_auto()



	smoother <- function (...) {
	geom_smooth(method="loess",span=0.5,fill="grey60",se=F,...)
	}



	mdir <- "/Users/agoldst/Documents/research/20c/hls/tmhls/dh2014/hls_v10K_k120"
	meta <- read_metadata(file.path(mdir, "dfr-data",
	c("elh_ci_all",
	"mlr1905-1970",
	"mlr1971-2013",
	"modphil_all",
	"nlh_all",
	"pmla_all",
	"res1925-1980",
	"res1981-2012"),
	"citations.CSV"))

	m <- list()
	m$keys <- read.csv(file.path(mdir,"keys.csv"),as.is=T)
	m$doctops <- read.csv(file.path(mdir,"doc_topics.csv"),as.is=T)
	m$n <- max(m$keys$topic)
	m$vocab <- readLines(file.path(mdir,"vocab.txt"))
	meta <- meta[meta$id %in% m$doctops$id,]

	topic_classes <- read.csv(file.path(mdir,"keys_classed.csv"),as.is=T)
	m$dtw <- doc_topics_wide(m$doctops,meta)
	m$series <- topic_proportions_series_frame(topic_year_matrix(m$dtw))

	m$series$decade <- cut.Date(as.Date(m$series$year),
	breaks=seq.Date(from=as.Date("1880-01-01"),
	to=as.Date("2020-01-01"),
	by="10 years"))

	m$dtw$decade <- cut.Date(as.Date(m$dtw$pubdate),
	breaks=seq.Date(from=as.Date("1880-01-01"),
	to=as.Date("2020-01-01"),
	by="10 years"))



	js <- ddply(meta,"journaltitle",summarize,
	name=str_trim(unique(journaltitle)),start=min(pubdate),
	end=max(pubdate))
	js <- js[order(js$start),]
	js$name <- str_c("",js$name,"")
	cat(str_c(str_c(js$name," (",year(js$start),"--",year(js$end),")"),
	collapse=" \n"))



	top_words <- ddply(m$keys,"topic",transform,rank=order(weight,decreasing=T))
	top_words <- top_words[top_words$rank <= 3,]
	top_words$x <- (top_words$topic - 1) %% 12
	top_words$y <- floor((top_words$topic - 1) / 12)
	top_words$y <- top_words$y + rep(c(0,-0.25,0.25),times=nrow(top_words) / 3)
	top_words$y <- -top_words$y

	ggplot(top_words,aes(x=x,y=y,label=word,size=weight)) + geom_text() +
	scale_size_continuous(range=c(2,3)) +
	theme(legend.position="none",
	line=element_blank(),
	rect=element_blank(),
	title=element_blank(),
	axis.text=element_blank())



	slope_discrete <- function (decade, frac, cutoff,
	earliest="1920-01-01",
	latest="2010-01-01") {
	earlier <- frac[as.Date(decade) >= as.Date(earliest) &
	as.Date(decade) < as.Date(cutoff)]
	later <- frac[as.Date(decade) > as.Date(cutoff) &
	as.Date(decade) <= as.Date(latest)]
	score <- sum(rep(earlier,times=length(later)) <
	rep(later,each=length(earlier)))

	score / (length(earlier) * length(later))
	}



	recency_cutoff <- "1980-01-01"



	topic_decades <- t(daply(m$dtw[,-1], "decade", function (d) colSums(d[,1:m$n])))
	series_dec <- topic_proportions_series_frame(topic_decades)

	recent_topics <- function (s,cutoff) {
	topic_slopes <- ddply(s,"topic", function (d) {
	data.frame(topic=d$topic[1],
	slope=slope_discrete(d$year,d$weight,cutoff))
	})
	topic_slopes$topic[topic_slopes$slope == 1]
	}
	recents <- recent_topics(series_dec,recency_cutoff)



	recent_series <- m$series[m$series$topic %in% recents,]
	topic_names <- daply(m$keys[m$keys$topic %in% recents,],"topic",
	function (d) {
	paste(paste(d$word[1:5],collapse=" "),
	sep="")
	})
	recent_series$topic <- factor(recent_series$topic,labels=topic_names)



	m$series$recent <- m$series$topic %in% recents
	ggplot(m$series,aes(year,weight,
	color=recent)) +
	smoother() +
	facet_wrap(~ topic,nrow=12,scales="free_y") +
	theme(legend.position="none",
	rect=element_blank(),
	title=element_blank(),
	axis.text=element_blank(),
	axis.ticks=element_blank(),
	strip.text=element_blank(),
	panel.grid=element_blank()) +
	scale_color_manual(values=c("blue","orange"))



	tnames <- str_c("• ",levels(recent_series$topic))
	cat(str_c(tnames,collapse="\\\\ "))



	topic_classes$recent <- ifelse(topic_classes$topic %in% recents,
	"recent","not recent")
	tally <- as.matrix(with(topic_classes,table(code,recent)))
	print(xtable(tally,digits=0),comment=F)



	decs <- seq.Date(from=as.Date("1950-01-01"),
	to=as.Date("2000-01-01"),
	by="10 years")

	slps <- list()
	rs <- list()
	socs <- matrix(nrow=length(decs),ncol=3,
	dimnames=list(as.character(decs),
	c("total recent",
	"S not recent",
	"S recent")))
	for (d in seq_along(decs)) {
	rs[[d]] <- recent_topics(series_dec,decs[d]) # series_dec calc'd above
	slps[[d]] <- ddply(series_dec,.(topic),summarize,
	cutoff=decs[d],
	slope=slope_discrete(year,weight,decs[d]))
	tab <- table(topic_classes$code,topic_classes$topic %in% rs[[d]])
	socs[d,1] <- length(rs[[d]])
	socs[d,2:3] <- tab["S",]
	}
	socs <- cbind(year(decs),socs)
	colnames(socs)[1] <- "cutoff year"
	print(xtable(socs,digits=0),include.rownames=F,comment=F)