First, let's get some data
library (alm)
library (rImpactStory)
library (rAltmetric)
library (ggplot2)
library (rplos)
library (reshape2)
library (httr)
library (reshape)
library (lubridate)
Define a function to parse ImpactStory results
parse_is <- function(input) {
metrics <- input$metrics
delicious_bookmarks <- try(metrics$`delicious:bookmarks`$values[[" raw" ]])
mendeley_readers <- try(metrics$`mendeley:readers`$values[[" raw" ]])
plosalm_html_views <- try(metrics$`plosalm:html_views`$values[[" raw" ]])
plosalm_pdf_views <- try(metrics$`plosalm:pdf_views`$values[[" raw" ]])
plosalm_scopus <- try(metrics$`plosalm:scopus`$values[[" raw" ]])
# pmc_citations <- try(metrics$`pubmed:pmc_citations`$values[[' raw' ]])
pmc_citations_one <- try(metrics$`plosalm:pmc_full-text`$values[[" raw" ]])
pmc_citations_two <- try(metrics$`plosalm:pmc_pdf`$values[[" raw" ]])
if (is.null(pmc_citations_one) & is.null(pmc_citations_one)) {
pmc_citations <- NULL
} else {
pmc_citations <- pmc_citations_one + pmc_citations_two
}
topsy_tweets <- try(metrics$`topsy:tweets`$values[[" raw" ]])
results <- list(delicious_bookmarks = delicious_bookmarks, mendeley_readers = mendeley_readers,
plosalm_html_views = plosalm_html_views, plosalm_pdf_views = plosalm_pdf_views,
plosalm_scopus = plosalm_scopus, pmc_citations = pmc_citations, topsy_tweets = topsy_tweets)
results[sapply(results, is.null)] <- NA
data.frame(results, date_modified = input$last_modified)
}
Define a function to collect altmetrics from three providers
compare_altmet_prov <- function(doi, isaddifnot = FALSE, fileout = NULL, sleep = 0) {
Sys .sleep (sleep)
# plos
plos_dat <- alm(doi = doi, total_details = TRUE)
plos_dat2 <- plos_dat[, c(" citeulike_total" , " connotea_total" , " scopus_total" ,
" counter_total" , " pmc_total" , " facebook_shares" , " mendeley_shares" ,
" twitter_total" , " date_modified" )]
plos_dat2$date_modified <- as.Date(plos_dat2$date_modified)
names(plos_dat2) <- c(" citeulike" , " connotea" , " scopus" , " ploscounter" ,
" pmc" , " facebook" , " mendeley" , " twitter" , " date_modified" )
# altmetric
alt_dat <- altmetric_data(altmetrics(doi = doi))
if (is.null(alt_dat)) {
alt_dat2 <- data.frame(999, 999, 999, 999, 999, 999, 999, 999, " 1970-01-01" )
} else {
alt_dat2 <- alt_dat[, c(" cited_by_fbwalls_count" , " cited_by_posts_count" ,
" cited_by_tweeters_count" , " cited_by_delicious_count" , " mendeley" ,
" connotea" , " citeulike" , " readers_count" , " last_updated" )]
alt_dat2$last_updated <- as.Date(as.POSIXct(alt_dat2$last_updated, origin = " 1970-01-01" ,
tz = " GMT" ))
}
names(alt_dat2) <- c(" facebook" , " cite_by_posts_count" , " twitter" , " delicious" ,
" mendeley" , " connotea" , " citeulike" , " readers_count" , " date_modified" )
# impactstory is_dat <- metrics(doi)
is_dat <- metrics(doi, addifnot = isaddifnot, sleep = 1)
is_dat <- parse_is(is_dat)
is_dat$date_modified <- as.Date(is_dat$date_modified)
names(is_dat) <- c(" delicious" , " mendeley" , " ploshtml" , " plosviews" , " scopus" ,
" pmc" , " twitter" , " date_modified" )
is_dat$ploscounter <- is_dat$ploshtml + is_dat$plosviews
is_dat <- is_dat[, -c(3, 4)]
# put it together
df <- data.frame(provider = c(" PLoS ALM" , " Altmetric" , " ImpactStory" ), doi = doi,
rbind.fill(plos_dat2, alt_dat2, is_dat))[, -c(12:14)]
if (is.null(fileout)) {
df
} else {
write .table (df, file = fileout, append = TRUE, row .names = FALSE, col .names = FALSE,
sep = " ," )
}
}
Safe version in case of errors
compare_altmet_prov_safe <- plyr::failwith(NULL, compare_altmet_prov)
Search for and get DOIs for 50 papers (we used more in the paper)
res <- searchplos(terms = " *:*" , fields = " id" , toquery = list(" publication_date:[2011-06-30T00:00:00Z TO 2012-06-01T23:59:59Z] " ,
" doc_type:full" ), start = 0, limit = 50)
Get data from each provider
dat <- llply(as.character(res[, 1]), compare_altmet_prov_safe, isaddifnot = TRUE,
sleep = 1, .progress = " text" )
dat <- ldply(dat)
Great, we have data! Next, let's make a plot of the difference between max and min value across the three providers for 7 article-level metrics.
alldat <- sort_df(dat, " doi" )
dat_split <- split(alldat, f = alldat$doi)
dat_split <- dat_split[!sapply(dat_split, nrow) == 0]
dat_split <- compact(lapply(dat_split, function(x) {
if (x$citeulike[2 ] == 999 ) {
NULL
} else (x)
}))
dat_split_df <- ldply(dat_split)[, -1]
calcdiff <- function(x) {
max (na .omit (x)) - min (na .omit (x))
}
dat_split_df_1 <- ddply(dat_split_df, .(doi), numcolwise(calcdiff))
dat_split_df_melt <- melt(dat_split_df_1)
dat_split_df_melt <- dat_split_df_melt[!dat_split_df_melt$variable %in% " connotea" ,
]
ggplot(dat_split_df_melt, aes(x = log10(value), fill = variable)) + theme_bw(base_size = 14) +
geom_bar() + scale_fill_discrete(name = " Almetric" ) + facet_grid(variable ~
., scales = " free" ) + labs(x = expression(log[10](Value)), y = " Count" ) +
theme(strip.text.y = element_text(angle = 0), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), legend.position = " none" , panel.border = element_rect(size = 1))
Okay, so there are some inconsistencies in data from different providers. Perhaps the same metric (e.g., tweets) were collected on different days for each provider? Let's take a look. We first define a function to get the difference in days between a vector of values, then apply that function over the data for each metric. We then reshape the data a bit using the reshape2
package, and make the plot.
datediff <- function(x) {
datesorted <- sort(x)
round(as.numeric(difftime(datesorted[3], datesorted[1])), 0)
}
dat_split_df_1 <- ddply(dat_split_df, .(doi), numcolwise(calcdiff))
dat_split_df_2 <- ddply(dat_split_df, .(doi), summarise, datediff = datediff(date_modified))
dat_split_df_melt <- melt(dat_split_df_1)
dat_split_df_ <- merge(dat_split_df_melt, dat_split_df_2, by = " doi" )
dat_split_df_melt <- dat_split_df_[!dat_split_df_$variable %in% " connotea" ,
] # connotea isn' t shared among the providers
ggplot(dat_split_df_melt, aes(x = datediff, y = log10(value + 1), colour = variable)) +
theme_bw(base_size = 14) + geom_point(size = 3, alpha = 0.6) + scale_colour_brewer("Source",
type = "qual", palette = 3) + theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), legend.position = c(0.65, 0.9), panel.border = element_rect(size = 1),
legend.key = element_blank(), panel.background = element_rect(colour = "white")) +
guides(col = guide_legend(nrow = 2, override.aes = list(size = 4))) + labs(x = "\n Date difference (no. of days)",
y = "Value of difference between max and min\n ")
Let's dig in to indivdual articles. Here, we reshape some data, selecting just 20 DOIs (i.e., papers), and plot the values of each metric for each DOI, coloring points by provider. Note that points are slightly offset horizontally to make them easier to see.
alldat_m <- melt(dat_split_df[1:60,], id.vars = c(1,2,11))
alldat_m <- alldat_m[!alldat_m$variable %in% " connotea" ,]
ggplot(na.omit(alldat_m[,-3]), aes(x=doi, y = value, colour = provider)) +
theme_bw(base_size = 14) +
geom_point(size=4, alpha = 0.4, position = position_jitter(width=0.15)) +
scale_colour_manual(values = c(' #FC1D00' ,' #FD8A00' ,' #0D71A5' ,' #2CCC00' )) +
facet_grid(variable~., scales = ' free' ) +
theme(axis.text.x=element_blank(),
strip.text.y = element_text(angle = 0),
legend.position = " top" ,
legend.key = element_blank(),
panel.border = element_rect(size=1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
guides(col = guide_legend(title=" " , override.aes = list(size=5),
nrow = 1, byrow = TRUE))