Skip to content

Instantly share code, notes, and snippets.

@sckott
Last active December 20, 2015 13:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sckott/6136591 to your computer and use it in GitHub Desktop.
Save sckott/6136591 to your computer and use it in GitHub Desktop.
A few functions to use in consuming altmetrics data

First, let's get some data

library(alm)
library(rImpactStory)
library(rAltmetric)
library(ggplot2)
library(rplos)
library(reshape2)
library(httr)
library(reshape)
library(lubridate)

Define a function to parse ImpactStory results

parse_is <- function(input) {
    metrics <- input$metrics
    delicious_bookmarks <- try(metrics$`delicious:bookmarks`$values[["raw"]])
    mendeley_readers <- try(metrics$`mendeley:readers`$values[["raw"]])
    plosalm_html_views <- try(metrics$`plosalm:html_views`$values[["raw"]])
    plosalm_pdf_views <- try(metrics$`plosalm:pdf_views`$values[["raw"]])
    plosalm_scopus <- try(metrics$`plosalm:scopus`$values[["raw"]])
    # pmc_citations <- try(metrics$`pubmed:pmc_citations`$values[['raw']])
    pmc_citations_one <- try(metrics$`plosalm:pmc_full-text`$values[["raw"]])
    pmc_citations_two <- try(metrics$`plosalm:pmc_pdf`$values[["raw"]])
    if (is.null(pmc_citations_one) & is.null(pmc_citations_one)) {
        pmc_citations <- NULL
    } else {
        pmc_citations <- pmc_citations_one + pmc_citations_two
    }
    topsy_tweets <- try(metrics$`topsy:tweets`$values[["raw"]])
    results <- list(delicious_bookmarks = delicious_bookmarks, mendeley_readers = mendeley_readers, 
        plosalm_html_views = plosalm_html_views, plosalm_pdf_views = plosalm_pdf_views, 
        plosalm_scopus = plosalm_scopus, pmc_citations = pmc_citations, topsy_tweets = topsy_tweets)
    results[sapply(results, is.null)] <- NA
    data.frame(results, date_modified = input$last_modified)
}

Define a function to collect altmetrics from three providers

compare_altmet_prov <- function(doi, isaddifnot = FALSE, fileout = NULL, sleep = 0) {
    Sys.sleep(sleep)
    
    # plos
    plos_dat <- alm(doi = doi, total_details = TRUE)
    plos_dat2 <- plos_dat[, c("citeulike_total", "connotea_total", "scopus_total", 
        "counter_total", "pmc_total", "facebook_shares", "mendeley_shares", 
        "twitter_total", "date_modified")]
    plos_dat2$date_modified <- as.Date(plos_dat2$date_modified)
    names(plos_dat2) <- c("citeulike", "connotea", "scopus", "ploscounter", 
        "pmc", "facebook", "mendeley", "twitter", "date_modified")
    # altmetric
    alt_dat <- altmetric_data(altmetrics(doi = doi))
    if (is.null(alt_dat)) {
        alt_dat2 <- data.frame(999, 999, 999, 999, 999, 999, 999, 999, "1970-01-01")
    } else {
        alt_dat2 <- alt_dat[, c("cited_by_fbwalls_count", "cited_by_posts_count", 
            "cited_by_tweeters_count", "cited_by_delicious_count", "mendeley", 
            "connotea", "citeulike", "readers_count", "last_updated")]
        alt_dat2$last_updated <- as.Date(as.POSIXct(alt_dat2$last_updated, origin = "1970-01-01", 
            tz = "GMT"))
    }
    names(alt_dat2) <- c("facebook", "cite_by_posts_count", "twitter", "delicious", 
        "mendeley", "connotea", "citeulike", "readers_count", "date_modified")
    # impactstory is_dat <- metrics(doi)
    is_dat <- metrics(doi, addifnot = isaddifnot, sleep = 1)
    is_dat <- parse_is(is_dat)
    is_dat$date_modified <- as.Date(is_dat$date_modified)
    names(is_dat) <- c("delicious", "mendeley", "ploshtml", "plosviews", "scopus", 
        "pmc", "twitter", "date_modified")
    is_dat$ploscounter <- is_dat$ploshtml + is_dat$plosviews
    is_dat <- is_dat[, -c(3, 4)]
    
    # put it together
    df <- data.frame(provider = c("PLoS ALM", "Altmetric", "ImpactStory"), doi = doi, 
        rbind.fill(plos_dat2, alt_dat2, is_dat))[, -c(12:14)]
    if (is.null(fileout)) {
        df
    } else {
        write.table(df, file = fileout, append = TRUE, row.names = FALSE, col.names = FALSE, 
            sep = ",")
    }
}

Safe version in case of errors

compare_altmet_prov_safe <- plyr::failwith(NULL, compare_altmet_prov)

Search for and get DOIs for 50 papers (we used more in the paper)

res <- searchplos(terms = "*:*", fields = "id", toquery = list("publication_date:[2011-06-30T00:00:00Z TO 2012-06-01T23:59:59Z] ", 
    "doc_type:full"), start = 0, limit = 50)

Get data from each provider

dat <- llply(as.character(res[, 1]), compare_altmet_prov_safe, isaddifnot = TRUE, 
    sleep = 1, .progress = "text")
dat <- ldply(dat)

Great, we have data! Next, let's make a plot of the difference between max and min value across the three providers for 7 article-level metrics.

alldat <- sort_df(dat, "doi")

dat_split <- split(alldat, f = alldat$doi)
dat_split <- dat_split[!sapply(dat_split, nrow) == 0]
dat_split <- compact(lapply(dat_split, function(x) {
    if (x$citeulike[2] == 999) {
        NULL
    } else (x)
}))
dat_split_df <- ldply(dat_split)[, -1]

calcdiff <- function(x) {
    max(na.omit(x)) - min(na.omit(x))
}

dat_split_df_1 <- ddply(dat_split_df, .(doi), numcolwise(calcdiff))
dat_split_df_melt <- melt(dat_split_df_1)

dat_split_df_melt <- dat_split_df_melt[!dat_split_df_melt$variable %in% "connotea", 
    ]

ggplot(dat_split_df_melt, aes(x = log10(value), fill = variable)) + theme_bw(base_size = 14) + 
    geom_bar() + scale_fill_discrete(name = "Almetric") + facet_grid(variable ~ 
    ., scales = "free") + labs(x = expression(log[10](Value)), y = "Count") + 
    theme(strip.text.y = element_text(angle = 0), panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), legend.position = "none", panel.border = element_rect(size = 1))

center

Okay, so there are some inconsistencies in data from different providers. Perhaps the same metric (e.g., tweets) were collected on different days for each provider? Let's take a look. We first define a function to get the difference in days between a vector of values, then apply that function over the data for each metric. We then reshape the data a bit using the reshape2 package, and make the plot.

datediff <- function(x) {
    datesorted <- sort(x)
    round(as.numeric(difftime(datesorted[3], datesorted[1])), 0)
}

dat_split_df_1 <- ddply(dat_split_df, .(doi), numcolwise(calcdiff))
dat_split_df_2 <- ddply(dat_split_df, .(doi), summarise, datediff = datediff(date_modified))
dat_split_df_melt <- melt(dat_split_df_1)
dat_split_df_ <- merge(dat_split_df_melt, dat_split_df_2, by = "doi")
dat_split_df_melt <- dat_split_df_[!dat_split_df_$variable %in% "connotea", 
    ]  # connotea isn't shared among the providers

ggplot(dat_split_df_melt, aes(x = datediff, y = log10(value + 1), colour = variable)) + 
    theme_bw(base_size = 14) + geom_point(size = 3, alpha = 0.6) + scale_colour_brewer("Source", 
    type = "qual", palette = 3) + theme(panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(), legend.position = c(0.65, 0.9), panel.border = element_rect(size = 1), 
    legend.key = element_blank(), panel.background = element_rect(colour = "white")) + 
    guides(col = guide_legend(nrow = 2, override.aes = list(size = 4))) + labs(x = "\nDate difference (no. of days)", 
    y = "Value of difference between max and min\n")

center

Let's dig in to indivdual articles. Here, we reshape some data, selecting just 20 DOIs (i.e., papers), and plot the values of each metric for each DOI, coloring points by provider. Note that points are slightly offset horizontally to make them easier to see.

alldat_m <- melt(dat_split_df[1:60,], id.vars=c(1,2,11))
alldat_m <- alldat_m[!alldat_m$variable %in% "connotea",]

ggplot(na.omit(alldat_m[,-3]), aes(x=doi, y=value, colour=provider)) +
  theme_bw(base_size = 14) + 
  geom_point(size=4, alpha=0.4, position=position_jitter(width=0.15)) +
  scale_colour_manual(values = c('#FC1D00','#FD8A00','#0D71A5','#2CCC00')) +
  facet_grid(variable~., scales='free') +
  theme(axis.text.x=element_blank(),
        strip.text.y=element_text(angle = 0),
        legend.position="top", 
        legend.key = element_blank(),
        panel.border = element_rect(size=1),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  guides(col = guide_legend(title="", override.aes=list(size=5), 
                            nrow = 1, byrow = TRUE))

center

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment