Skip to content

Instantly share code, notes, and snippets.

@agoldst
Created February 8, 2013 17:25
Show Gist options
  • Save agoldst/4740526 to your computer and use it in GitHub Desktop.
Save agoldst/4740526 to your computer and use it in GitHub Desktop.
quickie plots for the Woolf-Bennett ULTIMATE SHOWDOWN
# Load the citation metadata for the Bennett and Woolf searches and combine
# them into one de-duplicated data frame, `abvw`, with logical columns AB and
# VW recording which search(es) each item turned up in.
#
# the metadata.R script (for read.citations()) is part of
# this git repository:
# http://github.com/agoldst/dfr-analysis
# So change this path as needed
source("~/Developer/dfr-analysis/metadata.R")
bennett.df <- read.citations("bennett.csv")
woolf.df <- read.citations("woolf.csv")

# Now bind the two together, using columns to flag AB and VW hits.
# An item found by both searches gets AB = TRUE and VW = TRUE in *both* data
# frames, so its two rows are meant to collapse into one under unique() below.
# NOTE(review): that relies on read.citations() producing identical rows for
# the same id in both files -- confirm.
bennett.df <- cbind(bennett.df, AB = TRUE, VW = bennett.df$id %in% woolf.df$id)
woolf.df <- cbind(woolf.df, AB = woolf.df$id %in% bennett.df$id, VW = TRUE)
abvw <- unique(rbind(bennett.df, woolf.df))

# number of overlaps
# (computed only now that abvw exists; after de-duplication each item that
# matched both searches is counted once)
in.both <- abvw$AB & abvw$VW
num.overlaps <- sum(in.both)
# NOTE(review): the trailing tab in "fla\t" presumably mirrors how the type
# field comes out of read.citations() -- confirm before "tidying" it away.
num.overlaps.fla <- sum(abvw$type[in.both] == "fla\t")

# keep just pubdate years
abvw$pubdate <- pubdate.to.years(abvw$pubdate)
# now let's explore the data
# cross-tabulate how many VW and AB hits in each year
# A full cross-tabulation would be xtabs(~ pubdate + AB + VW, data = abvw)
# (table() itself has no data argument), but a long data frame with one row
# per (year, author) pair is easier to plot, so build that instead.

# Count how many items fall in each publication year.
#   years:  vector of publication years, one element per item
#   author: label stored in the `author` column of every returned row
# Returns a data frame with columns pubdate (numeric year), hitcount (number
# of items in that year) and author, in that order. Years with no items do
# not appear.
hits.by.year <- function(years, author) {
  counts <- as.data.frame(table(years))
  data.frame(
    # table() turns the years into factor levels; go through character to
    # recover the year values rather than the integer level codes
    pubdate = as.numeric(as.character(counts[[1]])),
    hitcount = counts$Freq,
    # rep() rather than relying on recycling, so an empty selection gives an
    # empty data frame instead of an error
    author = rep(author, nrow(counts)),
    stringsAsFactors = FALSE
  )
}

ab.hits <- hits.by.year(abvw$pubdate[abvw$AB], "AB")
vw.hits <- hits.by.year(abvw$pubdate[abvw$VW], "VW")
# stack 'em up
hits <- rbind(ab.hits, vw.hits)
# Now for some plots
library(ggplot2)
# there are some problems in the data, because
# there are 46 articles from the teens and twenties by "Arnold Bennett Hall"
# for now, just going to cut out pre-1930 hits
# also stop at 2000, because those counts are artificially low
# (things enter JSTOR slowly)
# plot the two time series of hitcounts on one chart
# (built with ggplot() rather than qplot(), which is deprecated in current
# ggplot2; xlim() discards the years outside 1930-2000, with a warning)
hits.plot <- ggplot(hits, aes(x = pubdate, y = hitcount, colour = author)) +
  geom_point() +
  xlim(1930, 2000) +
  ggtitle("Number of JSTOR items containing full name") +
  xlab("Publication year") +
  ylab("Number of items")
print(hits.plot)
# name the plot explicitly instead of relying on ggsave()'s last_plot() default
ggsave("ab-vw-hits.png", plot = hits.plot)
# plot the ratio
# Give the two count columns distinct names so the per-author tables can be
# joined side by side on publication year. (This renames the columns of
# ab.hits and vw.hits themselves, as before.)
names(ab.hits) <- c("pubdate", "AB.hitcount", "author")
names(vw.hits) <- c("pubdate", "VW.hitcount", "author")
# merge() keeps only the years that appear in both tables, so the ratio
# below always has both a numerator and a denominator
hits.recast <- merge(ab.hits[, c("pubdate", "AB.hitcount")],
                     vw.hits[, c("pubdate", "VW.hitcount")],
                     by = "pubdate")
# built with ggplot() rather than qplot(), which is deprecated in current
# ggplot2; xlim() discards the years outside 1930-2000, with a warning
ratio.plot <- ggplot(hits.recast,
                     aes(x = pubdate, y = VW.hitcount / AB.hitcount)) +
  geom_line() +
  geom_smooth() +
  xlim(1930, 2000) +
  ggtitle("Ratio of Woolf-mentioning to Bennett-mentioning items") +
  xlab("Publication year") +
  ylab("VW/AB")
print(ratio.plot)
# name the plot explicitly instead of relying on ggsave()'s last_plot() default
ggsave("ab-vw-ratio.png", plot = ratio.plot)
# Stacked bars: of the items mentioning Bennett in each two-year bin, how
# many also mention Woolf?
# Per-year counts of Bennett items that do / do not also mention Woolf.
# as.data.frame(table(...)) yields columns Var1 (year, as a factor) and Freq.
ab.vw.hits <- cbind(as.data.frame(table(abvw$pubdate[abvw$AB & abvw$VW])),
                    VW = "yes")
ab.novw.hits <- cbind(as.data.frame(table(abvw$pubdate[abvw$AB & !abvw$VW])),
                      VW = "no")
ab.yesno <- rbind(ab.vw.hits, ab.novw.hits)
# back-convert from factor to numeric year
ab.yesno$year <- as.numeric(as.character(ab.yesno$Var1))
# geom_histogram() does the two-year binning: since ggplot2 2.0 geom_bar()
# counts distinct x values and no longer takes a binwidth. The weight
# aesthetic makes each row contribute its Freq rather than 1 to its bin.
yesno.plot <- ggplot(ab.yesno, aes(x = year, weight = Freq, fill = VW)) +
  geom_histogram(binwidth = 2) +
  xlim(1930, 2000) +
  ggtitle("Which JSTOR Bennett mentions are also Woolf Mentions?") +
  xlab("Publication year") +
  ylab("Number of articles mentioning Bennett")
print(yesno.plot)
# name the plot explicitly instead of relying on ggsave()'s last_plot() default
ggsave("ab-vw-yesno.png", plot = yesno.plot)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment