Created
October 7, 2019 20:50
-
-
Save nucholab/e6313beb98fbb1d4ee26cdc2b850c31c to your computer and use it in GitHub Desktop.
Plotting and Stats Calculations for 'Librarian Impact on Systematic Reviews' poster for MCMLA2019
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# srr-analyze
#
# Plots and statistics computed after collecting the SR-AMC data.
#
# This script is part of:
# A Bibliographic Analysis of librarian assistance on SRs at CUAnschutz
# (continuation of the Craven/Palmer/Piper project)
# Initialize | |
library(tidyverse) | |
library(magrittr) | |
library(RefManageR) | |
# Load the combined papers table, then force the free-text columns to
# character (older R versions hand strings back from read.csv as factors).
srr_dataset <- read.csv("combo_papers_table.csv") %>%
  mutate(
    url = as.character(url),
    author = as.character(author),
    title = as.character(title),
    journal = as.character(journal),
    doi = as.character(doi),
    id = as.character(id)
  )
# Exploratory histograms over every record (default binning):
# impact factor, journal category rank, and citation count.
srr_dataset %>%
  ggplot(aes(x = JIF)) +
  geom_histogram(colour = "blue", fill = "blue", alpha = 0.5)

srr_dataset %>%
  ggplot(aes(x = maxRank)) +
  geom_histogram(colour = "blue", fill = "blue", alpha = 0.5)

srr_dataset %>%
  ggplot(aes(x = times.cited)) +
  geom_histogram(colour = "blue", fill = "blue", alpha = 0.5)
# Same three histograms, overlaid by librarian involvement:
# blue layer = records with librarian == FALSE, red layer = librarian == TRUE.
srr_dataset %>%
  filter(librarian == FALSE) %>%
  ggplot(aes(x = JIF)) +
  geom_histogram(colour = "blue", fill = "blue", alpha = 0.5) +
  geom_histogram(
    data = filter(srr_dataset, librarian == TRUE),
    mapping = aes(x = JIF),
    colour = "red", fill = "red", alpha = 0.5
  )

srr_dataset %>%
  filter(librarian == FALSE) %>%
  ggplot(aes(x = maxRank)) +
  geom_histogram(colour = "blue", fill = "blue", alpha = 0.5) +
  geom_histogram(
    data = filter(srr_dataset, librarian == TRUE),
    mapping = aes(x = maxRank),
    colour = "red", fill = "red", alpha = 0.5
  )

srr_dataset %>%
  filter(librarian == FALSE) %>%
  ggplot(aes(x = times.cited)) +
  geom_histogram(colour = "blue", fill = "blue", alpha = 0.5) +
  geom_histogram(
    data = filter(srr_dataset, librarian == TRUE),
    mapping = aes(x = times.cited),
    colour = "red", fill = "red", alpha = 0.5
  )
# Density-scaled frequency polygons, one line per librarian status, to see
# whether a difference between the groups is more visible than in raw counts.
#
# Rows with a missing `librarian` flag are dropped with is.na(); comparing
# against the string "NA" only removed them as a side effect of filter()
# discarding rows whose condition evaluates to NA.
# after_stat() replaces the deprecated stat() helper for computed variables.
ggplot(
  data = srr_dataset %>% filter(!is.na(librarian)),
  mapping = aes(JIF, after_stat(density), colour = librarian)
) +
  geom_freqpoly(na.rm = TRUE)

ggplot(
  data = srr_dataset %>% filter(!is.na(librarian)),
  mapping = aes(maxRank, after_stat(density), colour = librarian)
) +
  geom_freqpoly(na.rm = TRUE)

ggplot(
  data = srr_dataset %>% filter(!is.na(librarian)),
  mapping = aes(times.cited, after_stat(density), colour = librarian)
) +
  geom_freqpoly(na.rm = TRUE)
# Box plots of each metric split by the `coauthor` flag, with the raw points
# jittered on top (box-plot outlier markers are hidden so points are not
# drawn twice).
# Rows with a missing `librarian` flag are excluded via is.na() rather than a
# comparison against the string "NA".
# NOTE(review): the FALSE box holds every record whose coauthor flag is FALSE,
# which presumably includes acknowledged-librarian papers as well as
# no-librarian papers -- confirm this is the intended comparison.
ggplot(srr_dataset %>% filter(!is.na(librarian)), aes(factor(coauthor), JIF)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.15, alpha = 0.5)

ggplot(
  srr_dataset %>% filter(!is.na(librarian)),
  aes(factor(coauthor), maxRank)
) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.15, alpha = 0.5)

ggplot(
  srr_dataset %>% filter(!is.na(librarian)),
  aes(factor(coauthor), times.cited)
) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.15, alpha = 0.5)
# Experimental "normalization": citations divided by journal impact factor,
# split first by librarian involvement and then by the coauthor flag.
# Rows with a missing `librarian` flag are excluded via is.na() rather than a
# comparison against the string "NA".
ggplot(
  srr_dataset %>% filter(!is.na(librarian)),
  aes(factor(librarian), times.cited / JIF)
) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.15, alpha = 0.5)

ggplot(
  srr_dataset %>% filter(!is.na(librarian)),
  aes(factor(coauthor), times.cited / JIF)
) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.15, alpha = 0.5)
## Pie chart of dataset
# Number of reviews in each librarian-involvement category.
# NOTE(review): the counts are hard-coded -- presumably tallied from
# srr_dataset; confirm they still match the data.
#
# Columns are named with `=`. Using `<-` inside data.frame() assigned `group`
# and `value` in the calling environment and left the data frame with mangled
# column names, so the pipeline below only worked through global lookup.
srr_chart <- data.frame(
  group = c(
    "No Librarian Involvement",
    "Librarian is Co-author",
    "Librarian Acknowledged"
  ),
  value = c(243, 37, 46)
)

# Sort descending by group so the cumulative label positions follow the order
# in which ggplot2 stacks the slices; ypos is the midpoint of each slice.
srr_chart <- srr_chart %>%
  arrange(desc(group)) %>%
  mutate(
    prop = value / sum(value) * 100,
    ypos = cumsum(prop) - 0.5 * prop
  )

# A single stacked bar wrapped into polar coordinates gives the pie; the raw
# count is printed in white at the middle of each slice.
srr_pie <- ggplot(srr_chart, aes(x = "", y = prop, fill = group)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = pi / 2) +
  theme_void() +
  theme(
    legend.position = "bottom",
    legend.direction = "vertical",
    legend.text = element_text(face = "bold", size = 14)
  ) +
  scale_fill_manual("", values = c("#007C7C", "#773D84", "#215297")) +
  geom_text(aes(y = ypos, label = value), color = "white", size = 8)
## Overlaid histograms of dataset (poster versions)
# JIF: one-unit bins; no-librarian records in blue, librarian records in
# orange-red, with in-plot text annotations acting as the legend.
jifhist <- srr_dataset %>%
  filter(librarian == FALSE) %>%
  ggplot(aes(x = JIF)) +
  geom_histogram(
    colour = "#215297", fill = "#215297", alpha = 0.65, binwidth = 1
  ) +
  geom_histogram(
    data = filter(srr_dataset, librarian == TRUE),
    mapping = aes(x = JIF),
    colour = "#BE5039", fill = "#BE5039", alpha = 0.65, binwidth = 1
  ) +
  theme_minimal() +
  labs(title = "Histogram of Journal Impact Factors in dataset") +
  theme(plot.title = element_text(size = 20, face = "bold")) +
  annotate(
    "text", x = 30, y = 50, label = "No Librarian",
    colour = "#215297", size = 7, fontface = "bold"
  ) +
  annotate(
    "text", x = 30, y = 45, label = "Librarian",
    colour = "#BE5039", size = 7, fontface = "bold"
  )
# maxRank: same overlay for journal category rank, using 2.5-wide bins.
rankhist <- srr_dataset %>%
  filter(librarian == FALSE) %>%
  ggplot(aes(x = maxRank)) +
  geom_histogram(
    colour = "#215297", fill = "#215297", alpha = 0.65, binwidth = 2.5
  ) +
  geom_histogram(
    data = filter(srr_dataset, librarian == TRUE),
    mapping = aes(x = maxRank),
    colour = "#BE5039", fill = "#BE5039", alpha = 0.65, binwidth = 2.5
  ) +
  theme_minimal() +
  labs(title = "Histogram of Journal Category Rankings in dataset") +
  theme(plot.title = element_text(size = 20, face = "bold")) +
  annotate(
    "text", x = 40, y = 15, label = "No Librarian",
    colour = "#215297", size = 7, fontface = "bold"
  ) +
  annotate(
    "text", x = 40, y = 14, label = "Librarian",
    colour = "#BE5039", size = 7, fontface = "bold"
  )
# Citations: same overlay for per-article citation counts, using 2-wide bins.
cithist <- srr_dataset %>%
  filter(librarian == FALSE) %>%
  ggplot(aes(x = times.cited)) +
  geom_histogram(
    colour = "#215297", fill = "#215297", alpha = 0.65, binwidth = 2
  ) +
  geom_histogram(
    data = filter(srr_dataset, librarian == TRUE),
    mapping = aes(x = times.cited),
    colour = "#BE5039", fill = "#BE5039", alpha = 0.65, binwidth = 2
  ) +
  theme_minimal() +
  labs(title = "Histogram of Article Citations in dataset") +
  theme(plot.title = element_text(size = 20, face = "bold")) +
  annotate(
    "text", x = 150, y = 15, label = "No Librarian",
    colour = "#215297", size = 7, fontface = "bold"
  ) +
  annotate(
    "text", x = 150, y = 12, label = "Librarian",
    colour = "#BE5039", size = 7, fontface = "bold"
  )
## Box plots: librarian versus no librarian
# One fill colour per box. The fill is passed as a fixed two-element vector,
# so these plots assume exactly two boxes (librarian FALSE and TRUE).
libscale <- c("#215297", "#BE5039")

# Rows with a missing `librarian` flag are excluded with is.na(); the earlier
# comparison against the string "NA" only dropped them as a side effect of
# filter() discarding rows whose condition evaluates to NA.
# Points are drawn first so the semi-transparent boxes sit on top of them.
jifbox <- ggplot(
  srr_dataset %>% filter(!is.na(librarian)),
  aes(factor(librarian), JIF)
) +
  geom_jitter(width = 0.15, alpha = 0.5) +
  geom_boxplot(
    outlier.shape = NA, alpha = 0.8, colour = "black", fill = libscale
  ) +
  theme_minimal() +
  ggtitle("Journal Impact Factor versus Librarian Involvement") +
  xlab("Librarian Involved?")

rankbox <- ggplot(
  srr_dataset %>% filter(!is.na(librarian)),
  aes(factor(librarian), maxRank)
) +
  geom_jitter(width = 0.15, alpha = 0.5) +
  geom_boxplot(
    outlier.shape = NA, alpha = 0.8, colour = "black", fill = libscale
  ) +
  theme_minimal() +
  ggtitle("Journal Category Rank versus Librarian Involvement") +
  xlab("Librarian Involved?")

citbox <- ggplot(
  srr_dataset %>% filter(!is.na(librarian)),
  aes(factor(librarian), times.cited)
) +
  geom_jitter(width = 0.15, alpha = 0.5) +
  geom_boxplot(
    outlier.shape = NA, alpha = 0.8, colour = "black", fill = libscale
  ) +
  theme_minimal() +
  ggtitle("Number of Citations versus Librarian Involvement") +
  xlab("Librarian Involved?")
## Prep for co-author vs. in-text-only comparisons
# Fill colours for the two librarian-involvement boxes.
authscale <- c("#357A7A", "#713F7E")

# Derive a three-way involvement label. case_when() takes the first matching
# condition, so a record flagged both coauthor and intext is labelled
# "Co-Author"; records matching none of the conditions get NA.
srr_dataset <- srr_dataset %>%
  mutate(
    libtype = case_when(
      librarian == FALSE ~ "None",
      coauthor == TRUE ~ "Co-Author",
      intext == TRUE ~ "Acknowledgment"
    )
  ) %>%
  mutate(libtype = as.factor(libtype))
## Box plots: co-author versus acknowledgement
# Only librarian-involved records are plotted, grouped by libtype; the fill
# vector has two colours, so exactly two libtype groups are expected here.
jifauthbox <- srr_dataset %>%
  filter(librarian == TRUE) %>%
  ggplot(aes(x = libtype, y = JIF)) +
  geom_jitter(width = 0.15, alpha = 0.5) +
  geom_boxplot(
    outlier.shape = NA, alpha = 0.8, colour = "black", fill = authscale
  ) +
  theme_minimal() +
  labs(
    title = "Journal Impact Factor versus Librarian Involvement",
    x = "Librarian Involvement"
  )

rankauthbox <- srr_dataset %>%
  filter(librarian == TRUE) %>%
  ggplot(aes(x = libtype, y = maxRank)) +
  geom_jitter(width = 0.15, alpha = 0.5) +
  geom_boxplot(
    outlier.shape = NA, alpha = 0.8, colour = "black", fill = authscale
  ) +
  theme_minimal() +
  labs(
    title = "Journal Category Ranking versus Librarian Involvement",
    x = "Librarian Involvement"
  )

citauthbox <- srr_dataset %>%
  filter(librarian == TRUE) %>%
  ggplot(aes(x = libtype, y = times.cited)) +
  geom_jitter(width = 0.15, alpha = 0.5) +
  geom_boxplot(
    outlier.shape = NA, alpha = 0.8, colour = "black", fill = authscale
  ) +
  theme_minimal() +
  labs(
    title = "Number of Citations versus Librarian Involvement",
    x = "Librarian Involvement"
  )
# Welch two-sample t-tests (unequal variances): librarian vs. no librarian.
#
# The *_lib / *_nonlib objects stay one-column data frames because later
# sections unlist() them; the numeric column is extracted explicitly for
# t.test() instead of handing it a data frame. `alternative` is spelled out
# rather than relying on partial argument matching of `alt`.
# Trailing p-values are the results recorded by the original author.
JIF_lib <- srr_dataset %>% filter(librarian == TRUE) %>% select(JIF)
JIF_nonlib <- srr_dataset %>% filter(librarian == FALSE) %>% select(JIF)
t.test(
  JIF_lib[["JIF"]], JIF_nonlib[["JIF"]],
  alternative = "two.sided", var.equal = FALSE
) # p-value = 0.06408

RANK_lib <- srr_dataset %>% filter(librarian == TRUE) %>% select(maxRank)
RANK_nonlib <- srr_dataset %>% filter(librarian == FALSE) %>% select(maxRank)
t.test(
  RANK_lib[["maxRank"]], RANK_nonlib[["maxRank"]],
  alternative = "two.sided", var.equal = FALSE
) # p-value = 0.00201

CIT_lib <- srr_dataset %>% filter(librarian == TRUE) %>% select(times.cited)
CIT_nonlib <- srr_dataset %>%
  filter(librarian == FALSE) %>%
  select(times.cited)
t.test(
  CIT_lib[["times.cited"]], CIT_nonlib[["times.cited"]],
  alternative = "two.sided", var.equal = FALSE
) # p-value = 0.4686
# Wilcoxon rank-sum tests for the same librarian vs. no-librarian samples.
# unlist() flattens the one-column data frames into plain numeric vectors;
# `alternative` is spelled out instead of the partially matched `alt`.
# Trailing p-values are the results recorded by the original author.
wilcox.test(
  as.numeric(unlist(JIF_lib)), as.numeric(unlist(JIF_nonlib)),
  alternative = "two.sided"
) # p-value = 0.038
wilcox.test(
  as.numeric(unlist(RANK_lib)), as.numeric(unlist(RANK_nonlib)),
  alternative = "two.sided"
) # p-value = 0.00458
wilcox.test(
  as.numeric(unlist(CIT_lib)), as.numeric(unlist(CIT_nonlib)),
  alternative = "two.sided"
) # p-value = 0.8387
# t-tests (t.test defaults): librarian co-author vs. librarian mentioned
# in text.
#
# The numeric column is extracted explicitly for t.test() instead of passing
# one-column data frames, and the logical flags are compared with TRUE/FALSE
# rather than the strings "TRUE"/"FALSE".
# The *_nonauth samples are not used by any test in this section; they are
# kept in case other code relies on them.
# NOTE(review): a record with both coauthor and intext set would land in both
# samples here, whereas libtype assigns it to "Co-Author" only -- confirm the
# two groups are meant to be disjoint.
# Trailing p-values are the results recorded by the original author.
JIF_auth <- srr_dataset %>% filter(coauthor == TRUE) %>% select(JIF)
JIF_nonauth <- srr_dataset %>% filter(coauthor == FALSE) %>% select(JIF)
JIF_intext <- srr_dataset %>% filter(intext == TRUE) %>% select(JIF)
t.test(JIF_auth[["JIF"]], JIF_intext[["JIF"]]) # p-value = 0.7126

RANK_auth <- srr_dataset %>% filter(coauthor == TRUE) %>% select(maxRank)
RANK_nonauth <- srr_dataset %>% filter(coauthor == FALSE) %>% select(maxRank)
RANK_intext <- srr_dataset %>% filter(intext == TRUE) %>% select(maxRank)
t.test(RANK_auth[["maxRank"]], RANK_intext[["maxRank"]]) # p-value = 0.6542

CIT_auth <- srr_dataset %>% filter(coauthor == TRUE) %>% select(times.cited)
CIT_nonauth <- srr_dataset %>%
  filter(coauthor == FALSE) %>%
  select(times.cited)
CIT_intext <- srr_dataset %>% filter(intext == TRUE) %>% select(times.cited)
t.test(
  CIT_auth[["times.cited"]], CIT_intext[["times.cited"]]
) # p-value = 0.6874
# Wilcoxon rank-sum tests between the two librarian-involvement types
# (co-author vs. mentioned in text). unlist() flattens the one-column data
# frames; `alternative` is spelled out instead of the partially matched `alt`.
wilcox.test(
  as.numeric(unlist(JIF_auth)), as.numeric(unlist(JIF_intext)),
  alternative = "two.sided"
)
wilcox.test(
  as.numeric(unlist(RANK_auth)), as.numeric(unlist(RANK_intext)),
  alternative = "two.sided"
)
wilcox.test(
  as.numeric(unlist(CIT_auth)), as.numeric(unlist(CIT_intext)),
  alternative = "two.sided"
)
## For funsies: compare the references in StraussLibrarians.txt against the
## no-librarian records.
# The bibliography is read with RefManageR, converted to a data frame, and
# joined to the main dataset on title to pick up JIF, maxRank and times.cited.
# NOTE(review): the join needs titles to match exactly; unmatched references
# would carry NA metrics and a title matching several dataset rows would be
# duplicated -- verify the match count before trusting the tests.
strauss_table <- as.data.frame(ReadBib("StraussLibrarians.txt"))
strauss_table <- left_join(strauss_table, srr_dataset, by = "title")

# Each metric gets a t-test (t.test defaults) and a Wilcoxon rank-sum test.
# unlist() turns the one-column data frames into numeric vectors so t.test()
# is not handed a data frame; `alternative` is spelled out instead of the
# partially matched `alt`.
# Trailing p-values are the results recorded by the original author.
JIFstrauss <- strauss_table %>% select(JIF)
t.test(
  as.numeric(unlist(JIFstrauss)), as.numeric(unlist(JIF_nonlib))
) # p-value = 0.6473
wilcox.test(
  as.numeric(unlist(JIFstrauss)), as.numeric(unlist(JIF_nonlib)),
  alternative = "two.sided"
) # p-value = 0.6196

RANKstrauss <- strauss_table %>% select(maxRank)
t.test(
  as.numeric(unlist(RANKstrauss)), as.numeric(unlist(RANK_nonlib))
) # p-value = 0.6519
wilcox.test(
  as.numeric(unlist(RANKstrauss)), as.numeric(unlist(RANK_nonlib)),
  alternative = "two.sided"
) # p-value = 0.9661

CITstrauss <- strauss_table %>% select(times.cited)
t.test(
  as.numeric(unlist(CITstrauss)), as.numeric(unlist(CIT_nonlib))
) # p-value = 0.1012
wilcox.test(
  as.numeric(unlist(CITstrauss)), as.numeric(unlist(CIT_nonlib)),
  alternative = "two.sided"
) # p-value = 0.3177
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment