# agoldst/woolf-bennett.R

## woolf-bennett.R
# the metadata.R script (for read.citations()) is part of
# this git repository:
# http://github.com/agoldst/dfr-analysis
# So change this path as needed
source("~/Developer/dfr-analysis/metadata.R")

# Load the two citation sets: one for the Bennett (AB) search, one for the
# Woolf (VW) search. read.citations() is provided by the metadata.R script
# sourced above.
bennett.df <- read.citations("bennett.csv")
woolf.df <- read.citations("woolf.csv")

# Flag every row with the searches it matched. An item whose id occurs in
# both files ends up with AB=TRUE and VW=TRUE in both data frames.
bennett.df <- cbind(bennett.df,AB=TRUE,VW=bennett.df$id %in% woolf.df$id)
woolf.df <- cbind(woolf.df,AB=woolf.df$id %in% bennett.df$id,VW=TRUE)

# Stack the two sets and de-duplicate the overlaps. unique() only drops rows
# that are identical in every column.
abvw <- unique(rbind(bennett.df,woolf.df))

# number of overlaps
# (must come after abvw is built: computing these any earlier stops a fresh
# session with "object 'abvw' not found")
num.overlaps <- sum(abvw$AB & abvw$VW)
# Overlapping items whose type field equals "fla\t". The trailing tab is kept
# exactly as written; presumably it mirrors how the type column comes out of
# read.citations() -- TODO confirm.
num.overlaps.fla <- sum(abvw$type[abvw$AB & abvw$VW]=="fla\t")

# keep just pubdate years
# (pubdate.to.years() is not defined in this file; presumably it also comes
# from metadata.R)
abvw$pubdate <- pubdate.to.years(abvw$pubdate)

# now let's explore the data

# Per-year hit counts for each author, one row per year and author.
# A single three-way cross-tabulation, e.g.
#   xtabs(~ pubdate + AB + VW, data=abvw)
# would carry the same information, but separate per-author data frames are
# easier to hand to the plotting calls further down.

# Bennett: count the rows flagged AB within each publication year
ab.hits <- as.data.frame(table(abvw$pubdate[abvw$AB]))
ab.hits <- setNames(cbind(ab.hits,"AB"),c("pubdate","hitcount","author"))
# table() turns the years into factor levels; go through character so we get
# the year values back rather than the factor's integer codes
ab.hits$pubdate <- as.numeric(as.character(ab.hits$pubdate))

# Woolf: same recipe on the rows flagged VW
vw.hits <- as.data.frame(table(abvw$pubdate[abvw$VW]))
vw.hits <- setNames(cbind(vw.hits,"VW"),c("pubdate","hitcount","author"))
vw.hits$pubdate <- as.numeric(as.character(vw.hits$pubdate))

# long format: both authors in one data frame, told apart by the author column
hits <- rbind(ab.hits,vw.hits)

# Now for some plots
library(ggplot2)

# Hit counts per year for both authors, drawn as points on one chart.
# The x axis is restricted to 1930-2000 for two reasons:
#   - the earlier data are unreliable: there are 46 articles from the teens
#     and twenties by "Arnold Bennett Hall", so for now pre-1930 hits are
#     simply cut out
#   - counts after 2000 are artificially low (things enter JSTOR slowly)
qplot(x=pubdate,y=hitcount,
      data=hits,
      color=author,
      geom="point",
      xlim=c(1930,2000),
      xlab="Publication year",
      ylab="Number of items",
      main="Number of JSTOR items containing full name")

# no plot object is passed, so ggsave() falls back to the most recent plot
ggsave("ab-vw-hits.png")

# Ratio of Woolf hits to Bennett hits, year by year.
# Give the two count columns distinct names so the per-author tables can sit
# side by side after the join. Note that this renames the columns of ab.hits
# and vw.hits themselves.
colnames(ab.hits) <- c("pubdate","AB.hitcount","author")
colnames(vw.hits) <- c("pubdate","VW.hitcount","author")

# join on publication year, leaving the author label out of both sides;
# merge() keeps only the years that occur in both tables
hits.recast <- merge(ab.hits[c("pubdate","AB.hitcount")],
                     vw.hits[c("pubdate","VW.hitcount")],
                     by="pubdate")

qplot(x=pubdate,y=VW.hitcount/AB.hitcount,
      data=hits.recast,
      geom=c("line","smooth"),
      xlim=c(1930,2000),
      xlab="Publication year",
      ylab="VW/AB",
      main="Ratio of Woolf-mentioning to Bennett-mentioning items")

# no plot object is passed, so ggsave() falls back to the most recent plot
ggsave("ab-vw-ratio.png")


# Split the Bennett hits by whether the same item was also a Woolf hit, and
# count each group per publication year.
ab.vw.hits <- as.data.frame(table(abvw$pubdate[abvw$AB & abvw$VW]))
ab.vw.hits <- cbind(ab.vw.hits,VW="yes")
ab.novw.hits <- as.data.frame(table(abvw$pubdate[abvw$AB & !abvw$VW]))
ab.novw.hits <- cbind(ab.novw.hits,VW="no")

# Bar chart of the per-year counts (Freq is used as the weight), filled by the
# yes/no Woolf flag. Var1 is the year as a factor, so it is converted back to
# a number for the x axis.
# NOTE(review): binwidth=2 is handed on to the bar geom; how (or whether) it is
# honoured depends on the installed ggplot2 version -- confirm.
qplot(x=as.numeric(as.character(Var1)),
      data=rbind(ab.vw.hits,ab.novw.hits),
      weight=Freq,
      fill=VW,
      geom="bar",
      binwidth=2,
      xlim=c(1930,2000),
      xlab="Publication year",
      ylab="Number of articles mentioning Bennett",
      main="Which JSTOR Bennett mentions are also Woolf Mentions?")

# no plot object is passed, so ggsave() falls back to the most recent plot
ggsave("ab-vw-yesno.png")
	# the metadata.R script (for read.citations()) is part of
	# this git repository:
	# http://github.com/agoldst/dfr-analysis
	# So change this path as needed
	source("~/Developer/dfr-analysis/metadata.R")

	# Load the two citation sets: one for the Bennett (AB) search, one for the
	# Woolf (VW) search. read.citations() is provided by the metadata.R script
	# sourced above.
	bennett.df <- read.citations("bennett.csv")
	woolf.df <- read.citations("woolf.csv")

	# Flag every row with the searches it matched. An item whose id occurs in
	# both files ends up with AB=TRUE and VW=TRUE in both data frames.
	bennett.df <- cbind(bennett.df,AB=TRUE,VW=bennett.df$id %in% woolf.df$id)
	woolf.df <- cbind(woolf.df,AB=woolf.df$id %in% bennett.df$id,VW=TRUE)

	# Stack the two sets and de-duplicate the overlaps. unique() only drops rows
	# that are identical in every column.
	abvw <- unique(rbind(bennett.df,woolf.df))

	# number of overlaps
	# (must come after abvw is rebuilt from the data frames just loaded;
	# computing these earlier would read a stale or missing abvw)
	num.overlaps <- sum(abvw$AB & abvw$VW)
	# Overlapping items whose type field equals "fla\t". The trailing tab is kept
	# exactly as written; presumably it mirrors how the type column comes out of
	# read.citations() -- TODO confirm.
	num.overlaps.fla <- sum(abvw$type[abvw$AB & abvw$VW]=="fla\t")

	# keep just pubdate years
	# (pubdate.to.years() is not defined in this file; presumably it also comes
	# from metadata.R)
	abvw$pubdate <- pubdate.to.years(abvw$pubdate)

	# now let's explore the data

	# Per-year hit counts for each author, one row per year and author.
	# A single three-way cross-tabulation, e.g.
	#   xtabs(~ pubdate + AB + VW, data=abvw)
	# would carry the same information, but separate per-author data frames are
	# easier to hand to the plotting calls further down.

	# Bennett: count the rows flagged AB within each publication year
	ab.hits <- as.data.frame(table(abvw$pubdate[abvw$AB]))
	ab.hits <- setNames(cbind(ab.hits,"AB"),c("pubdate","hitcount","author"))
	# table() turns the years into factor levels; go through character so we get
	# the year values back rather than the factor's integer codes
	ab.hits$pubdate <- as.numeric(as.character(ab.hits$pubdate))

	# Woolf: same recipe on the rows flagged VW
	vw.hits <- as.data.frame(table(abvw$pubdate[abvw$VW]))
	vw.hits <- setNames(cbind(vw.hits,"VW"),c("pubdate","hitcount","author"))
	vw.hits$pubdate <- as.numeric(as.character(vw.hits$pubdate))

	# long format: both authors in one data frame, told apart by the author column
	hits <- rbind(ab.hits,vw.hits)

	# Now for some plots
	library(ggplot2)

	# Hit counts per year for both authors, drawn as points on one chart.
	# The x axis is restricted to 1930-2000 for two reasons:
	#   - the earlier data are unreliable: there are 46 articles from the teens
	#     and twenties by "Arnold Bennett Hall", so for now pre-1930 hits are
	#     simply cut out
	#   - counts after 2000 are artificially low (things enter JSTOR slowly)
	qplot(x=pubdate,y=hitcount,
	      data=hits,
	      color=author,
	      geom="point",
	      xlim=c(1930,2000),
	      xlab="Publication year",
	      ylab="Number of items",
	      main="Number of JSTOR items containing full name")

	# no plot object is passed, so ggsave() falls back to the most recent plot
	ggsave("ab-vw-hits.png")

	# Ratio of Woolf hits to Bennett hits, year by year.
	# Give the two count columns distinct names so the per-author tables can sit
	# side by side after the join. Note that this renames the columns of ab.hits
	# and vw.hits themselves.
	colnames(ab.hits) <- c("pubdate","AB.hitcount","author")
	colnames(vw.hits) <- c("pubdate","VW.hitcount","author")

	# join on publication year, leaving the author label out of both sides;
	# merge() keeps only the years that occur in both tables
	hits.recast <- merge(ab.hits[c("pubdate","AB.hitcount")],
	                     vw.hits[c("pubdate","VW.hitcount")],
	                     by="pubdate")

	qplot(x=pubdate,y=VW.hitcount/AB.hitcount,
	      data=hits.recast,
	      geom=c("line","smooth"),
	      xlim=c(1930,2000),
	      xlab="Publication year",
	      ylab="VW/AB",
	      main="Ratio of Woolf-mentioning to Bennett-mentioning items")

	# no plot object is passed, so ggsave() falls back to the most recent plot
	ggsave("ab-vw-ratio.png")


	# Split the Bennett hits by whether the same item was also a Woolf hit, and
	# count each group per publication year.
	ab.vw.hits <- as.data.frame(table(abvw$pubdate[abvw$AB & abvw$VW]))
	ab.vw.hits <- cbind(ab.vw.hits,VW="yes")
	ab.novw.hits <- as.data.frame(table(abvw$pubdate[abvw$AB & !abvw$VW]))
	ab.novw.hits <- cbind(ab.novw.hits,VW="no")

	# Bar chart of the per-year counts (Freq is used as the weight), filled by the
	# yes/no Woolf flag. Var1 is the year as a factor, so it is converted back to
	# a number for the x axis.
	# NOTE(review): binwidth=2 is handed on to the bar geom; how (or whether) it is
	# honoured depends on the installed ggplot2 version -- confirm.
	qplot(x=as.numeric(as.character(Var1)),
	      data=rbind(ab.vw.hits,ab.novw.hits),
	      weight=Freq,
	      fill=VW,
	      geom="bar",
	      binwidth=2,
	      xlim=c(1930,2000),
	      xlab="Publication year",
	      ylab="Number of articles mentioning Bennett",
	      main="Which JSTOR Bennett mentions are also Woolf Mentions?")

	# no plot object is passed, so ggsave() falls back to the most recent plot
	ggsave("ab-vw-yesno.png")