sckott/almetrics_functions.md

## almetrics_functions.md

      
    Raw
  

              almetrics_functions.md
            
          
    First, let's get some data

library(alm)
library(rImpactStory)
library(rAltmetric)
library(ggplot2)
library(rplos)
library(reshape2)
library(httr)
library(reshape)
library(lubridate)
Define a function to parse ImpactStory results

parse_is <- function(input) {
    metrics <- input$metrics
    delicious_bookmarks <- try(metrics$`delicious:bookmarks`$values[["raw"]])
    mendeley_readers <- try(metrics$`mendeley:readers`$values[["raw"]])
    plosalm_html_views <- try(metrics$`plosalm:html_views`$values[["raw"]])
    plosalm_pdf_views <- try(metrics$`plosalm:pdf_views`$values[["raw"]])
    plosalm_scopus <- try(metrics$`plosalm:scopus`$values[["raw"]])
    # pmc_citations <- try(metrics$`pubmed:pmc_citations`$values[['raw']])
    pmc_citations_one <- try(metrics$`plosalm:pmc_full-text`$values[["raw"]])
    pmc_citations_two <- try(metrics$`plosalm:pmc_pdf`$values[["raw"]])
    if (is.null(pmc_citations_one) & is.null(pmc_citations_one)) {
        pmc_citations <- NULL
    } else {
        pmc_citations <- pmc_citations_one + pmc_citations_two
    }
    topsy_tweets <- try(metrics$`topsy:tweets`$values[["raw"]])
    results <- list(delicious_bookmarks = delicious_bookmarks, mendeley_readers = mendeley_readers, 
        plosalm_html_views = plosalm_html_views, plosalm_pdf_views = plosalm_pdf_views, 
        plosalm_scopus = plosalm_scopus, pmc_citations = pmc_citations, topsy_tweets = topsy_tweets)
    results[sapply(results, is.null)] <- NA
    data.frame(results, date_modified = input$last_modified)
}
Define a function to collect altmetrics from three providers

compare_altmet_prov <- function(doi, isaddifnot = FALSE, fileout = NULL, sleep = 0) {
    Sys.sleep(sleep)
    
    # plos
    plos_dat <- alm(doi = doi, total_details = TRUE)
    plos_dat2 <- plos_dat[, c("citeulike_total", "connotea_total", "scopus_total", 
        "counter_total", "pmc_total", "facebook_shares", "mendeley_shares", 
        "twitter_total", "date_modified")]
    plos_dat2$date_modified <- as.Date(plos_dat2$date_modified)
    names(plos_dat2) <- c("citeulike", "connotea", "scopus", "ploscounter", 
        "pmc", "facebook", "mendeley", "twitter", "date_modified")
    # altmetric
    alt_dat <- altmetric_data(altmetrics(doi = doi))
    if (is.null(alt_dat)) {
        alt_dat2 <- data.frame(999, 999, 999, 999, 999, 999, 999, 999, "1970-01-01")
    } else {
        alt_dat2 <- alt_dat[, c("cited_by_fbwalls_count", "cited_by_posts_count", 
            "cited_by_tweeters_count", "cited_by_delicious_count", "mendeley", 
            "connotea", "citeulike", "readers_count", "last_updated")]
        alt_dat2$last_updated <- as.Date(as.POSIXct(alt_dat2$last_updated, origin = "1970-01-01", 
            tz = "GMT"))
    }
    names(alt_dat2) <- c("facebook", "cite_by_posts_count", "twitter", "delicious", 
        "mendeley", "connotea", "citeulike", "readers_count", "date_modified")
    # impactstory is_dat <- metrics(doi)
    is_dat <- metrics(doi, addifnot = isaddifnot, sleep = 1)
    is_dat <- parse_is(is_dat)
    is_dat$date_modified <- as.Date(is_dat$date_modified)
    names(is_dat) <- c("delicious", "mendeley", "ploshtml", "plosviews", "scopus", 
        "pmc", "twitter", "date_modified")
    is_dat$ploscounter <- is_dat$ploshtml + is_dat$plosviews
    is_dat <- is_dat[, -c(3, 4)]
    
    # put it together
    df <- data.frame(provider = c("PLoS ALM", "Altmetric", "ImpactStory"), doi = doi, 
        rbind.fill(plos_dat2, alt_dat2, is_dat))[, -c(12:14)]
    if (is.null(fileout)) {
        df
    } else {
        write.table(df, file = fileout, append = TRUE, row.names = FALSE, col.names = FALSE, 
            sep = ",")
    }
}
Safe version in case of errors

compare_altmet_prov_safe <- plyr::failwith(NULL, compare_altmet_prov)
Search for and get DOIs for 50 papers (we used more in the paper)

res <- searchplos(terms = "*:*", fields = "id", toquery = list("publication_date:[2011-06-30T00:00:00Z TO 2012-06-01T23:59:59Z] ", 
    "doc_type:full"), start = 0, limit = 50)
Get data from each provider

dat <- llply(as.character(res[, 1]), compare_altmet_prov_safe, isaddifnot = TRUE, 
    sleep = 1, .progress = "text")
dat <- ldply(dat)
Great, we have data! Next, let's make a plot of the difference between max and min value across the three providers for 7 article-level metrics.

alldat <- sort_df(dat, "doi")

dat_split <- split(alldat, f = alldat$doi)
dat_split <- dat_split[!sapply(dat_split, nrow) == 0]
dat_split <- compact(lapply(dat_split, function(x) {
    if (x$citeulike[2] == 999) {
        NULL
    } else (x)
}))
dat_split_df <- ldply(dat_split)[, -1]

calcdiff <- function(x) {
    max(na.omit(x)) - min(na.omit(x))
}

dat_split_df_1 <- ddply(dat_split_df, .(doi), numcolwise(calcdiff))
dat_split_df_melt <- melt(dat_split_df_1)

dat_split_df_melt <- dat_split_df_melt[!dat_split_df_melt$variable %in% "connotea", 
    ]

ggplot(dat_split_df_melt, aes(x = log10(value), fill = variable)) + theme_bw(base_size = 14) + 
    geom_bar() + scale_fill_discrete(name = "Almetric") + facet_grid(variable ~ 
    ., scales = "free") + labs(x = expression(log[10](Value)), y = "Count") + 
    theme(strip.text.y = element_text(angle = 0), panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), legend.position = "none", panel.border = element_rect(size = 1))

Okay, so there are some inconsistencies in data from different providers. Perhaps the same metric (e.g., tweets) were collected on different days for each provider? Let's take a look. We first define a function to get the difference in days between a vector of values, then apply that function over the data for each metric. We then reshape the data a bit using the reshape2 package, and make the plot.

datediff <- function(x) {
    datesorted <- sort(x)
    round(as.numeric(difftime(datesorted[3], datesorted[1])), 0)
}

dat_split_df_1 <- ddply(dat_split_df, .(doi), numcolwise(calcdiff))
dat_split_df_2 <- ddply(dat_split_df, .(doi), summarise, datediff = datediff(date_modified))
dat_split_df_melt <- melt(dat_split_df_1)
dat_split_df_ <- merge(dat_split_df_melt, dat_split_df_2, by = "doi")
dat_split_df_melt <- dat_split_df_[!dat_split_df_$variable %in% "connotea", 
    ]  # connotea isn't shared among the providers

ggplot(dat_split_df_melt, aes(x = datediff, y = log10(value + 1), colour = variable)) + 
    theme_bw(base_size = 14) + geom_point(size = 3, alpha = 0.6) + scale_colour_brewer("Source", 
    type = "qual", palette = 3) + theme(panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(), legend.position = c(0.65, 0.9), panel.border = element_rect(size = 1), 
    legend.key = element_blank(), panel.background = element_rect(colour = "white")) + 
    guides(col = guide_legend(nrow = 2, override.aes = list(size = 4))) + labs(x = "\nDate difference (no. of days)", 
    y = "Value of difference between max and min\n")

Let's dig in to indivdual articles. Here, we reshape some data, selecting just 20 DOIs (i.e., papers), and plot the values of each metric for each DOI, coloring points by provider. Note that points are slightly offset horizontally to make them easier to see.

alldat_m <- melt(dat_split_df[1:60,], id.vars=c(1,2,11))
alldat_m <- alldat_m[!alldat_m$variable %in% "connotea",]

ggplot(na.omit(alldat_m[,-3]), aes(x=doi, y=value, colour=provider)) +
  theme_bw(base_size = 14) + 
  geom_point(size=4, alpha=0.4, position=position_jitter(width=0.15)) +
  scale_colour_manual(values = c('#FC1D00','#FD8A00','#0D71A5','#2CCC00')) +
  facet_grid(variable~., scales='free') +
  theme(axis.text.x=element_blank(),
        strip.text.y=element_text(angle = 0),
        legend.position="top", 
        legend.key = element_blank(),
        panel.border = element_rect(size=1),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  guides(col = guide_legend(title="", override.aes=list(size=5), 
                            nrow = 1, byrow = TRUE))