Created
February 8, 2013 17:25
-
-
Save agoldst/4740526 to your computer and use it in GitHub Desktop.
quickie plots for the Woolf-Bennett ULTIMATE SHOWDOWN
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the Bennett and Woolf citation exports and merge them into one
# de-duplicated data frame, abvw, with logical columns AB and VW recording
# which search(es) each item turned up in.
#
# the metadata.R script (for read.citations()) is part of
# this git repository:
# http://github.com/agoldst/dfr-analysis
# So change this path as needed
# NOTE(review): pubdate.to.years(), used below, is presumably defined in
# metadata.R as well -- confirm.
source("~/Developer/dfr-analysis/metadata.R")

bennett.df <- read.citations("bennett.csv")
woolf.df <- read.citations("woolf.csv")

# Flag every row: AB is TRUE when the item's id is among the Bennett hits,
# VW is TRUE when it is among the Woolf hits.
# (TRUE is spelled out because T is an ordinary, reassignable variable.)
bennett.df <- cbind(bennett.df, AB = TRUE, VW = bennett.df$id %in% woolf.df$id)
woolf.df <- cbind(woolf.df, AB = woolf.df$id %in% bennett.df$id, VW = TRUE)

# Bind the two together. An item returned by both searches appears once in
# each frame with AB and VW both TRUE, so unique() collapses the pair into a
# single row -- provided the remaining columns of the two copies are identical.
abvw <- unique(rbind(bennett.df, woolf.df))

# number of overlaps
# These must come after abvw has been built; they were previously evaluated
# before abvw existed, which fails with "object 'abvw' not found".
num.overlaps <- sum(abvw$AB & abvw$VW)
# NOTE(review): "fla\t" includes a trailing tab; presumably the type field in
# the export carries one -- verify against the csv files.
num.overlaps.fla <- sum(abvw$type[abvw$AB & abvw$VW] == "fla\t")

# keep just pubdate years
abvw$pubdate <- pubdate.to.years(abvw$pubdate)
# Explore the data: count, per publication year, how many items mention each
# author. The result is kept in long format -- one row per (year, author) --
# with columns pubdate, hitcount and author.
# (A single cross-tabulation over pubdate, AB and VW would be the tidier way
# to get these counts, but separate per-author tables are easier to index
# into for plotting.)

# Bennett: table() counts the items per year; the table's names are the year
# labels, converted here back to numeric years.
ab.tab <- table(abvw$pubdate[abvw$AB])
ab.hits <- data.frame(pubdate = as.numeric(names(ab.tab)),
                      hitcount = as.vector(ab.tab),
                      author = "AB")

# Woolf: same construction
vw.tab <- table(abvw$pubdate[abvw$VW])
vw.hits <- data.frame(pubdate = as.numeric(names(vw.tab)),
                      hitcount = as.vector(vw.tab),
                      author = "VW")

# stack the two per-author frames into one
hits <- rbind(ab.hits, vw.hits)
# Now for some plots
library(ggplot2)

# Scatter plot of the two yearly hit-count series on one chart, saved to
# ab-vw-hits.png.
#
# there are some problems in the data, because
# there are 46 articles from the teens and twenties by "Arnold Bennett Hall"
# for now, just going to cut out pre-1930 hits
# also stop at 2000, because those counts are artificially low
# (things enter JSTOR slowly)
#
# Built with ggplot() layers rather than qplot(), which is deprecated in
# current ggplot2; the layers below are what the qplot() call expanded to.
hits.plot <- ggplot(hits, aes(x = pubdate, y = hitcount, colour = author)) +
  geom_point() +
  xlim(1930, 2000) +
  ggtitle("Number of JSTOR items containing full name") +
  xlab("Publication year") +
  ylab("Number of items")

# a bare top-level reference displays the plot wherever the old top-level
# qplot() call would have
hits.plot

# pass the plot explicitly instead of relying on ggsave()'s last_plot() default
ggsave("ab-vw-hits.png", plot = hits.plot)
# plot the ratio
# Line plot (with a smoother) of the yearly ratio of Woolf-mentioning to
# Bennett-mentioning items, saved to ab-vw-ratio.png.

# Rename the count columns so the two frames can sit side by side.
# Note that this changes ab.hits and vw.hits themselves.
names(ab.hits) <- c("pubdate", "AB.hitcount", "author")
names(vw.hits) <- c("pubdate", "VW.hitcount", "author")

# Join on year after dropping the author label. merge() keeps only rows with
# a match in both inputs, so years lacking hits for either author are left out
# (which also means AB.hitcount is never a missing value in the ratio).
hits.recast <- merge(subset(ab.hits, select = -author),
                     subset(vw.hits, select = -author),
                     by = "pubdate")

# ggplot() layers replace qplot(), which is deprecated in current ggplot2;
# geom = c("line", "smooth") corresponds to geom_line() + geom_smooth().
ratio.plot <- ggplot(hits.recast,
                     aes(x = pubdate, y = VW.hitcount / AB.hitcount)) +
  geom_line() +
  geom_smooth() +
  xlim(1930, 2000) +
  ggtitle("Ratio of Woolf-mentioning to Bennett-mentioning items") +
  xlab("Publication year") +
  ylab("VW/AB")

# a bare top-level reference displays the plot wherever the old top-level
# qplot() call would have
ratio.plot

# pass the plot explicitly instead of relying on ggsave()'s last_plot() default
ggsave("ab-vw-ratio.png", plot = ratio.plot)
# Of the items mentioning Bennett, how many also mention Woolf?
# Stacked bars in two-year bins, saved to ab-vw-yesno.png.

# per-year counts of Bennett items that do / do not also mention Woolf;
# as.data.frame(table(...)) yields the columns Var1 (year, as a factor) and Freq
ab.vw.hits <- cbind(as.data.frame(table(abvw$pubdate[abvw$AB & abvw$VW])),
                    VW = "yes")
ab.novw.hits <- cbind(as.data.frame(table(abvw$pubdate[abvw$AB & !abvw$VW])),
                      VW = "no")

# The rows are already yearly counts, so Freq is supplied as the weight
# aesthetic and the bins sum those counts. geom_histogram() is used because
# binning by binwidth is not something geom_bar() does in current ggplot2
# (it counts rather than bins); ggplot() layers likewise replace the
# deprecated qplot().
yesno.plot <- ggplot(rbind(ab.vw.hits, ab.novw.hits),
                     aes(x = as.numeric(as.character(Var1)),
                         weight = Freq,
                         fill = VW)) +
  geom_histogram(binwidth = 2) +
  xlim(1930, 2000) +
  ggtitle("Which JSTOR Bennett mentions are also Woolf Mentions?") +
  xlab("Publication year") +
  ylab("Number of articles mentioning Bennett")

# a bare top-level reference displays the plot wherever the old top-level
# qplot() call would have
yesno.plot

# pass the plot explicitly instead of relying on ggsave()'s last_plot() default
ggsave("ab-vw-yesno.png", plot = yesno.plot)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment