Skip to content

Instantly share code, notes, and snippets.

@thoughtfulbloke
Last active August 5, 2017 02:15
Show Gist options
  • Save thoughtfulbloke/5aaa4536c3950d1f3307b12554d53a10 to your computer and use it in GitHub Desktop.
Save thoughtfulbloke/5aaa4536c3950d1f3307b12554d53a10 to your computer and use it in GitHub Desktop.
Demonstrates analysing a bunch of abstracts using tidypvals, fulltext, and tidytext in R
# Packages attached for the whole analysis.
library(tidypvals)
library(dplyr)
library(fulltext)
library(tidytext)
library(tidyr)
# The plotting calls further down (ggplot(), aes(), geom_density(), ggtitle())
# belong to ggplot2; `library(ggplot)` does not attach them.
library(ggplot2)
library(parallel)
# Keep only the rows of `allp` that carry a DOI and whose operator is "equals".
hasDOI <- filter(allp, !is.na(doi), operator == "equals")
# Of those, keep the rows whose DOI contains "pone"; the rest of the script
# treats these as the PLOS articles.
is_pone <- grepl("pone", hasDOI$doi)
plosDOI <- hasDOI[is_pone, ]
# Look up the abstract for a single DOI through ft_abstract().
#
# in theory error handling isn't needed if only targetting PLOS articles,
# but any error, warning or message raised during the lookup is still treated
# as "no abstract available".
#
# x: a single DOI.
# Returns: a length-one character vector; NA_character_ when the lookup
#   failed or produced no abstract. Always returning a character scalar keeps
#   the result type stable when the function is mapped over many DOIs.
fltx_seek_abstract <- function(x) {
  abstract <- tryCatch(
    ft_abstract(x)[["plos"]][[1]][["abstract"]],
    error = function(cond) NA_character_,
    warning = function(cond) NA_character_,
    message = function(cond) NA_character_
  )
  # A missing abstract element comes back as NULL (length zero).
  if (length(abstract) == 0L) {
    return(NA_character_)
  }
  # Defensive: collapse an unexpectedly multi-part abstract into one string.
  if (length(abstract) > 1L) {
    abstract <- paste(abstract, collapse = " ")
  }
  as.character(abstract)
}
# rather than using my download individuals code, I strongly urge
# people to have a look at Scott Chamberlain's (@sckottie) GIST
# https://gist.github.com/sckott/ee32e58b0b2fe2f722a5b5112234c893
#
# Fetch one abstract per DOI. vapply() pins the column to character: sapply()
# would silently return a list (when some lookup yields NULL) or a logical
# vector (when every lookup yields NA) instead of a text column.
# as.character(...)[1L] turns NULL or a logical NA into NA_character_.
plosDOI$Absfltx <- vapply(
  plosDOI$doi,
  function(doi) as.character(fltx_seek_abstract(doi))[1L],
  character(1)
)
# I have just spent 36 hours with the computer downloading 348542 abstracts,
# Let's save what I have done
save(plosDOI, file = "PLOSDOI.RData")
# The unfiltered DOI table is no longer needed.
rm(hasDOI)
# Build one row per (doi, pvalue, animal) for abstracts that mention cats
# and/or dogs but no dental vocabulary.
#
# Steps: split each abstract into one word per row, flag per (doi, pvalue)
# whether any word belongs to each vocabulary, drop the dental papers, then
# reshape so the `animal` column holds "is_cat" or "is_dog".
# An abstract mentioning both animals contributes a row to each group.
cat_words <- c("cats", "cat", "feline", "felines")
dog_words <- c("dog", "dogs", "canine", "canines")
# canine can refer to teeth, so dental vocabulary is used to screen those out
teeth_words <- c("tooth", "teeth", "dentistry", "dental", "oral")

cats_dogs_living_together <- plosDOI %>%
  select(doi, pvalue, Absfltx) %>%
  unnest_tokens(word, Absfltx) %>%
  group_by(doi, pvalue) %>%
  # %in% never yields NA, so a missing word simply counts as "no match".
  summarise(
    is_cat = any(word %in% cat_words),
    is_dog = any(word %in% dog_words),
    is_teeth = any(word %in% teeth_words)
  ) %>%
  ungroup() %>%
  filter(!is_teeth & (is_cat | is_dog)) %>%
  select(-is_teeth) %>%
  gather(animal, relevant, is_cat:is_dog) %>%
  filter(relevant) %>%
  select(-relevant)
# Density of the reported p-values, one curve per animal group.
ggplot(cats_dogs_living_together, aes(x = pvalue, colour = animal)) +
  geom_density() +
  ggtitle(
    "Cats vs Dogs pvalues",
    # Row count is taken from the data (2667 in the original run) so the
    # subtitle cannot drift from what is plotted.
    subtitle = paste(
      "source: Abstracts of", nrow(cats_dogs_living_together), "PLOS articles"
    )
  )
# Mean p-value and number of rows per animal; computed once and reused below.
animal_summary <- cats_dogs_living_together %>%
  group_by(animal) %>%
  summarise(mpv = mean(pvalue), number = n())
animal_summary
# 757 cat studies, 1910 dog studies suggests dogs are just easier to do experiments on
# Difference between the first and second group mean (row order of the summary).
animal_summary %>%
  summarise(diff_in_mean = mpv[1] - mpv[2])
# difference in mean pvals 0.02344332
# Significance test: the distribution is irregular, so use simulation.
# If we assume that cats and dogs are being drawn from a common distribution
# of "research on household pets" that is represented by the combined
# distribution, how likely is the observed difference in means (or a larger
# one) to occur by chance?
# Number of simulated draws.
num_sim <- 1e7
# One simulated draw for the resampling test: sample `n_total` values from
# `pvals` with replacement, split them into a first group of `n_first` values
# and a second group holding the rest, and return the absolute difference of
# the two group means.
#
# x:       ignored; present so the function can be mapped over an index vector.
# pvals:   numeric vector of p-values to resample from (must be non-empty).
# n_first: size of the first group (default 757, the cat count noted above).
# n_total: total number of values drawn (default 2667, cats plus dogs).
# Returns: a single non-negative number.
a_diff_of_means <- function(x, pvals, n_first = 757, n_total = 2667) {
  stopifnot(
    length(pvals) > 0,
    n_first >= 1,
    n_first < n_total
  )
  # Index-based sampling consumes the same random stream as sample(pvals, ...)
  # but does not reinterpret a length-one numeric `pvals` as the range 1:pvals.
  picks <- sample.int(length(pvals), n_total, replace = TRUE)
  catdog <- pvals[picks]
  first_group <- catdog[seq_len(n_first)]
  second_group <- catdog[(n_first + 1):n_total]
  abs(mean(first_group) - mean(second_group))
}
pvalsVec <- cats_dogs_living_together$pvalue
# Observed statistic: absolute difference between the mean cat and mean dog
# p-value, taken from the data rather than pasted in as a literal
# (it was 0.02344332 in the original run).
group_means <- tapply(
  cats_dogs_living_together$pvalue,
  cats_dogs_living_together$animal,
  mean
)
observed_diff <- abs(group_means[["is_cat"]] - group_means[["is_dog"]])
# this is going parallel for doing a lot of simulations YMMV,
# but at 10000000 my computers fan revs up and the room warms
# Calculate the number of cores; detectCores() may report NA or 1, so never
# ask for fewer than one worker.
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
# Initiate cluster
cl <- makeCluster(no_cores)
sim <- tryCatch(
  {
    # Give the workers reproducible, independent random-number streams.
    clusterSetRNGStream(cl, iseed = 2017)
    clusterExport(cl, varlist = c("num_sim", "a_diff_of_means", "pvalsVec"))
    parSapply(cl, seq_len(num_sim), a_diff_of_means, pvals = pvalsVec)
  },
  # Shut the workers down even when the simulation fails part-way.
  finally = stopCluster(cl)
)
# Share of simulated differences at least as large as the observed one.
mean(sim >= observed_diff)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment