# thoughtfulbloke/catsAndDogs.R

## catsAndDogs.R
# Packages used below: data wrangling (dplyr, tidyr), abstract retrieval
# (fulltext), tokenising (tidytext), plotting (ggplot2) and multi-core
# simulation (parallel). `allp` is not defined in this file; presumably it
# is the p-value table shipped with tidypvals.
library(tidypvals)
library(dplyr)
library(fulltext)
library(tidytext)
library(tidyr)
# ggplot(), geom_density() and ggtitle(subtitle = ) are provided by ggplot2.
library(ggplot2)
library(parallel)

# Keep only rows that have a DOI and whose p-value was reported with the
# "equals" operator, then narrow to DOIs containing "pone".
hasDOI <- allp %>% filter(!is.na(doi), operator == "equals")
plosDOI <- hasDOI[grep("pone", hasDOI$doi),]

# Fetch the abstract text for a single DOI with ft_abstract().
#
# x: a DOI; it is passed straight to ft_abstract().
# Returns the "abstract" field of the first entry under "plos" in the
# result, or NA_character_ when the lookup signals an error, warning or
# message, or when that field is absent. Always handing back a character
# value keeps the result of sapply() over many DOIs a character vector
# instead of a list or a logical vector.
#
# In theory the handlers are not needed when only targeting PLOS articles;
# they are kept so a long download run survives an occasional failure.
fltx_seek_abstract <- function(x) {
  give_up <- function(cond) NA_character_
  abstract <- tryCatch(
    ft_abstract(x)[["plos"]][[1]][["abstract"]],
    error = give_up,
    warning = give_up,
    message = give_up
  )
  # A missing field comes back as NULL (length zero) rather than an error.
  if (length(abstract) == 0) {
    return(NA_character_)
  }
  abstract
}

# Download one abstract per DOI, one request at a time. This is slow; rather
# than reusing this approach, have a look at Scott Chamberlain's (@sckottie)
# gist: https://gist.github.com/sckott/ee32e58b0b2fe2f722a5b5112234c893
plosDOI$Absfltx <- sapply(
  X = plosDOI[["doi"]],
  FUN = fltx_seek_abstract
)

# The original run spent 36 hours downloading 348542 abstracts, so the
# result is written to disk straight away and the intermediate table dropped.
save(list = "plosDOI", file = "PLOSDOI.RData")
rm(list = "hasDOI")

# Build one row per (doi, pvalue, animal flag):
#   1. split each abstract into one word per row;
#   2. flag cat words, dog words and dental words ("canine" can refer to
#      teeth, so dental vocabulary is used to rule those abstracts out);
#   3. collapse to one row per (doi, pvalue) recording whether any word of
#      each kind occurred;
#   4. keep rows with a cat or dog word and no dental word;
#   5. stack the is_cat / is_dog flags, so a row flagged for both animals
#      appears once under each.
# %in% is used for the word sets because it never yields NA, so a missing
# token cannot turn a whole group's flag into NA.
cats_dogs_living_together <- plosDOI %>%
  select(doi, pvalue, Absfltx) %>%
  unnest_tokens(word, Absfltx) %>%
  mutate(
    cats = word %in% c("cats", "cat", "feline", "felines"),
    dogs = word %in% c("dog", "dogs", "canine", "canines"),
    teeth = word %in% c("tooth", "teeth", "dentistry", "dental", "oral")
  ) %>%
  group_by(doi, pvalue) %>%
  summarise(
    is_cat = any(cats),
    is_dog = any(dogs),
    is_teeth = any(teeth)
  ) %>%
  ungroup() %>%
  filter(!is_teeth, is_cat | is_dog) %>%
  select(-is_teeth) %>%
  gather(animal, mentioned, is_cat:is_dog) %>%
  filter(mentioned) %>%
  select(-mentioned)

# Density of the reported p-values, one coloured curve per animal flag.
ggplot(
  data = cats_dogs_living_together,
  mapping = aes(x = pvalue, colour = animal)
) +
  geom_density() +
  labs(
    title = "Cats vs Dogs pvalues",
    subtitle = "source: Abstracts of 2667 PLOS articles"
  )

# Mean p-value and row count for each animal flag.
cats_dogs_living_together %>%
  group_by(animal) %>%
  summarise(mpv = mean(pvalue), number = n())
# Recorded result: 757 cat studies, 1910 dog studies, which suggests dogs
# are just easier to do experiments on.

# Difference between the two group means: first group minus second group,
# in the order summarise() returns them.
cats_dogs_living_together %>%
  group_by(animal) %>%
  summarise(mpv = mean(pvalue), number = n()) %>%
  summarise(diff_in_mean = mpv[1] - mpv[2])
# Recorded result: difference in mean pvals 0.02344332

# significance test - irregular distribution so will use simulation.
# if we assume that cats and dogs are being drawn from a common distribution
# of "research on household pets" that is represented by the combined distribution,
# how likely is the observed difference in means, or a more extreme one, to occur by chance?

# Number of simulated draws used for the significance test.
num_sim <- 10000000

# Simulate one absolute difference in group means under a pooled null.
#
# x: ignored; present so the function can be mapped over an index vector
#   (for example by sapply() or parSapply()).
# pvals: numeric vector of pooled p-values to resample from.
# n_first: size of the first simulated group; the default 757 is the cat
#   count recorded for the original run.
# n_total: combined size of both simulated groups; default 2667.
# Returns one non-negative number: the absolute difference between the mean
#   of the first n_first draws and the mean of the remaining draws, where
#   the n_total draws are taken from pvals with replacement.
a_diff_of_means <- function(x, pvals, n_first = 757, n_total = 2667) {
  stopifnot(length(pvals) > 0, n_first >= 1, n_total > n_first)
  # Draw positions with sample.int() so a length-one pvals is never
  # expanded to 1:pvals, which is what sample() does for a single number.
  catdog <- pvals[sample.int(length(pvals), n_total, replace = TRUE)]
  first_group <- seq_len(n_first)
  abs(mean(catdog[first_group]) - mean(catdog[-first_group]))
}
# Pooled p-values (cat and dog rows together) that the simulation resamples.
pvalsVec <- cats_dogs_living_together$pvalue

# The simulation is spread over a local cluster. YMMV: at 10000000 draws
# the original author reports the fan revving up and the room warming.

# Leave one core free, but never ask for fewer than one worker:
# detectCores() may return 1 or NA, and a cluster of zero workers fails.
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(no_cores)
# finally = guarantees the workers are shut down even if a draw errors.
sim <- tryCatch(
  {
    clusterExport(cl, varlist = c("num_sim", "a_diff_of_means", "pvalsVec"))
    parSapply(cl, seq_len(num_sim), a_diff_of_means, pvals = pvalsVec)
  },
  finally = stopCluster(cl)
)

# Share of simulated differences at least as large as the recorded
# difference in means (0.02344332).
mean(sim >= 0.02344332)
	# Packages used below: data wrangling (dplyr, tidyr), abstract retrieval
	# (fulltext), tokenising (tidytext), plotting (ggplot2) and multi-core
	# simulation (parallel). `allp` is not defined in this file; presumably it
	# is the p-value table shipped with tidypvals.
	library(tidypvals)
	library(dplyr)
	library(fulltext)
	library(tidytext)
	library(tidyr)
	# ggplot(), geom_density() and ggtitle(subtitle = ) are provided by ggplot2.
	library(ggplot2)
	library(parallel)

	# Keep only rows that have a DOI and whose p-value was reported with the
	# "equals" operator, then narrow to DOIs containing "pone".
	hasDOI <- allp %>% filter(!is.na(doi), operator == "equals")
	plosDOI <- hasDOI[grep("pone", hasDOI$doi),]

	# Fetch the abstract text for a single DOI with ft_abstract().
	#
	# x: a DOI; it is passed straight to ft_abstract().
	# Returns the "abstract" field of the first entry under "plos" in the
	# result, or NA_character_ when the lookup signals an error, warning or
	# message, or when that field is absent. Always handing back a character
	# value keeps the result of sapply() over many DOIs a character vector
	# instead of a list or a logical vector.
	#
	# In theory the handlers are not needed when only targeting PLOS articles;
	# they are kept so a long download run survives an occasional failure.
	fltx_seek_abstract <- function(x) {
	  give_up <- function(cond) NA_character_
	  abstract <- tryCatch(
	    ft_abstract(x)[["plos"]][[1]][["abstract"]],
	    error = give_up,
	    warning = give_up,
	    message = give_up
	  )
	  # A missing field comes back as NULL (length zero) rather than an error.
	  if (length(abstract) == 0) {
	    return(NA_character_)
	  }
	  abstract
	}

	# Download one abstract per DOI, one request at a time. This is slow; rather
	# than reusing this approach, have a look at Scott Chamberlain's (@sckottie)
	# gist: https://gist.github.com/sckott/ee32e58b0b2fe2f722a5b5112234c893
	plosDOI$Absfltx <- sapply(
	  X = plosDOI[["doi"]],
	  FUN = fltx_seek_abstract
	)

	# The original run spent 36 hours downloading 348542 abstracts, so the
	# result is written to disk straight away and the intermediate table dropped.
	save(list = "plosDOI", file = "PLOSDOI.RData")
	rm(list = "hasDOI")

	# Build one row per (doi, pvalue, animal flag):
	#   1. split each abstract into one word per row;
	#   2. flag cat words, dog words and dental words ("canine" can refer to
	#      teeth, so dental vocabulary is used to rule those abstracts out);
	#   3. collapse to one row per (doi, pvalue) recording whether any word of
	#      each kind occurred;
	#   4. keep rows with a cat or dog word and no dental word;
	#   5. stack the is_cat / is_dog flags, so a row flagged for both animals
	#      appears once under each.
	# %in% is used for the word sets because it never yields NA, so a missing
	# token cannot turn a whole group's flag into NA.
	cats_dogs_living_together <- plosDOI %>%
	  select(doi, pvalue, Absfltx) %>%
	  unnest_tokens(word, Absfltx) %>%
	  mutate(
	    cats = word %in% c("cats", "cat", "feline", "felines"),
	    dogs = word %in% c("dog", "dogs", "canine", "canines"),
	    teeth = word %in% c("tooth", "teeth", "dentistry", "dental", "oral")
	  ) %>%
	  group_by(doi, pvalue) %>%
	  summarise(
	    is_cat = any(cats),
	    is_dog = any(dogs),
	    is_teeth = any(teeth)
	  ) %>%
	  ungroup() %>%
	  filter(!is_teeth, is_cat | is_dog) %>%
	  select(-is_teeth) %>%
	  gather(animal, mentioned, is_cat:is_dog) %>%
	  filter(mentioned) %>%
	  select(-mentioned)

	# Density of the reported p-values, one coloured curve per animal flag.
	ggplot(
	  data = cats_dogs_living_together,
	  mapping = aes(x = pvalue, colour = animal)
	) +
	  geom_density() +
	  labs(
	    title = "Cats vs Dogs pvalues",
	    subtitle = "source: Abstracts of 2667 PLOS articles"
	  )

	# Mean p-value and row count for each animal flag.
	cats_dogs_living_together %>%
	  group_by(animal) %>%
	  summarise(mpv = mean(pvalue), number = n())
	# Recorded result: 757 cat studies, 1910 dog studies, which suggests dogs
	# are just easier to do experiments on.

	# Difference between the two group means: first group minus second group,
	# in the order summarise() returns them.
	cats_dogs_living_together %>%
	  group_by(animal) %>%
	  summarise(mpv = mean(pvalue), number = n()) %>%
	  summarise(diff_in_mean = mpv[1] - mpv[2])
	# Recorded result: difference in mean pvals 0.02344332

	# significance test - irregular distribution so will use simulation.
	# if we assume that cats and dogs are being drawn from a common distribution
	# of "research on household pets" that is represented by the combined distribution,
	# how likely is the observed difference in means, or a more extreme one, to occur by chance?

	# Number of simulated draws used for the significance test.
	num_sim <- 10000000

	# Simulate one absolute difference in group means under a pooled null.
	#
	# x: ignored; present so the function can be mapped over an index vector
	#   (for example by sapply() or parSapply()).
	# pvals: numeric vector of pooled p-values to resample from.
	# n_first: size of the first simulated group; the default 757 is the cat
	#   count recorded for the original run.
	# n_total: combined size of both simulated groups; default 2667.
	# Returns one non-negative number: the absolute difference between the mean
	#   of the first n_first draws and the mean of the remaining draws, where
	#   the n_total draws are taken from pvals with replacement.
	a_diff_of_means <- function(x, pvals, n_first = 757, n_total = 2667) {
	  stopifnot(length(pvals) > 0, n_first >= 1, n_total > n_first)
	  # Draw positions with sample.int() so a length-one pvals is never
	  # expanded to 1:pvals, which is what sample() does for a single number.
	  catdog <- pvals[sample.int(length(pvals), n_total, replace = TRUE)]
	  first_group <- seq_len(n_first)
	  abs(mean(catdog[first_group]) - mean(catdog[-first_group]))
	}
	# Pooled p-values (cat and dog rows together) that the simulation resamples.
	pvalsVec <- cats_dogs_living_together$pvalue

	# The simulation is spread over a local cluster. YMMV: at 10000000 draws
	# the original author reports the fan revving up and the room warming.

	# Leave one core free, but never ask for fewer than one worker:
	# detectCores() may return 1 or NA, and a cluster of zero workers fails.
	no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
	cl <- makeCluster(no_cores)
	# finally = guarantees the workers are shut down even if a draw errors.
	sim <- tryCatch(
	  {
	    clusterExport(cl, varlist = c("num_sim", "a_diff_of_means", "pvalsVec"))
	    parSapply(cl, seq_len(num_sim), a_diff_of_means, pvals = pvalsVec)
	  },
	  finally = stopCluster(cl)
	)

	# Share of simulated differences at least as large as the recorded
	# difference in means (0.02344332).
	mean(sim >= 0.02344332)