# ikashnitsky/sci-hub-latest-reads.r

## sci-hub-latest-reads.r
#===============================================================================
# 2022-02-12 -- twitter
# sci-hub latest reads -- look years
# Ilya Kashnitsky, ilya.kashnitsky@gmail.com, @ikashnitsky
#===============================================================================

library(tidyverse)
library(magrittr)
library(xml2)
library(rvest)
library(glue)
library(lubridate)

# When you visit sci-hub.ru, it displays the last 32 papers read by anyone
# Here I continuously scrape the website and plot the years when papers were written

# Download the page once and return the papers it lists as a tibble.
#
# Args:
#   iter: label of this scrape; stored unchanged in the `iter_n` column
#   url:  address of the page to download
#
# Returns a tibble with columns `paper`, `dt`, `year` (numeric) and `iter_n`.
get_32 <- function(iter, url = "https://sci-hub.ru") {
    page <- read_html(url)

    # text content of every node matched by an XPath query on the page
    span_text <- function(xpath) {
        rvest::html_text(xml2::xml_find_all(page, xpath))
    }

    years  <- span_text("//span[contains(@class, 'year')]")
    titles <- span_text("//span[contains(@class, 'title')]")
    stamps <- span_text("//span[contains(@class, 'dt')]")

    # keep title matches 2..33; the first match is skipped -- presumably it is
    # not one of the listed papers (TODO confirm against the page markup)
    tibble(
        paper = titles[2:33],
        dt = stamps,
        year = as.numeric(years),
        iter_n = iter
    )
}


# Scrape the page 100 times (takes about a minute) and stack the results.
# distinct() keeps only paper/dt/year, so an entry that shows up in several
# scrapes is stored once.
raw <- distinct(map_df(1:100, get_32), paper, dt, year)

# first and last distinct `dt` values, in the order they were scraped
times <- pull(distinct(raw, dt), dt)
bounds <- c(first(times), last(times))
# only the digits of both bounds -- used as a suffix for the output files
time_data <- str_remove_all(toString(unlist(bounds)), '[^0-9]+')

saveRDS(raw, file = glue("raw-{time_data}.rds"))

nr <- nrow(raw)

# number of papers per year
count <- summarise(group_by(raw, year), n = n())

max_n <- max(count$n)

# years before 1980 fall outside the x range shown in the plot; they are
# printed as text labels instead, most recent year first
old <- arrange(filter(count, year < 1980), desc(year))

ggplot(count, aes(year, n)) +
    geom_col(fill = "#339999", color = "#ffffff") +
    geom_hline(yintercept = 0, color = "#333333", size = 0.75) +
    # pre-1980 years, stacked vertically at the left edge of the panel
    geom_text(
        data = old,
        aes(label = year, y = (max_n / 10) + seq_along(year) * (max_n / 20)),
        x = 1980, size = 3
    ) +
    # the matching counts, placed right next to the year labels
    geom_text(
        data = old,
        aes(label = n, y = (max_n / 10) + seq_along(year) * (max_n / 20)),
        x = 1982, size = 2.5, fontface = "bold", color = 2
    ) +
    scale_y_continuous(position = "right") +
    # zoom on 1980-2022 without dropping the older rows from the data
    coord_cartesian(xlim = c(1980, 2022)) +
    labs(
        title = glue("Last {nr} papers read at sci-hub: distribution of years"),
        subtitle = glue("Website scraped between: {bounds[1]} and {bounds[2]} GMT+3"),
        caption = "@ikashnitsky",
        y = "# of papers"
    ) +
    theme_minimal() +
    theme(
        panel.grid.minor = element_blank(),
        plot.title = element_text(face = "bold", hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)
    )

# no `plot` argument: ggsave() writes the most recently built plot
ggsave(
    filename = glue("years-{time_data}.png"),
    width = 6.4, height = 3.6,
    type = "cairo-png", bg = "#ffffff"
)
	#===============================================================================
	# 2022-02-12 -- twitter
	# sci-hub latest reads -- look years
	# Ilya Kashnitsky, ilya.kashnitsky@gmail.com, @ikashnitsky
	#===============================================================================

	library(tidyverse)
	library(magrittr)
	library(xml2)
	library(rvest)
	library(glue)
	library(lubridate)

	# When you visit sci-hub.ru, it displays the last 32 papers read by anyone
	# Here I continuously scrape the website and plot the years when papers were written

	# Download the page once and return the papers it lists as a tibble.
	#
	# Args:
	#   iter: label of this scrape; stored unchanged in the `iter_n` column
	#   url:  address of the page to download
	#
	# Returns a tibble with columns `paper`, `dt`, `year` (numeric) and `iter_n`.
	get_32 <- function(iter, url = "https://sci-hub.ru") {
	    page <- read_html(url)

	    # text content of every node matched by an XPath query on the page
	    span_text <- function(xpath) {
	        rvest::html_text(xml2::xml_find_all(page, xpath))
	    }

	    years  <- span_text("//span[contains(@class, 'year')]")
	    titles <- span_text("//span[contains(@class, 'title')]")
	    stamps <- span_text("//span[contains(@class, 'dt')]")

	    # keep title matches 2..33; the first match is skipped -- presumably it is
	    # not one of the listed papers (TODO confirm against the page markup)
	    tibble(
	        paper = titles[2:33],
	        dt = stamps,
	        year = as.numeric(years),
	        iter_n = iter
	    )
	}


	# Scrape the page 100 times (takes about a minute) and stack the results.
	# distinct() keeps only paper/dt/year, so an entry that shows up in several
	# scrapes is stored once.
	raw <- distinct(map_df(1:100, get_32), paper, dt, year)

	# first and last distinct `dt` values, in the order they were scraped
	times <- pull(distinct(raw, dt), dt)
	bounds <- c(first(times), last(times))
	# only the digits of both bounds -- used as a suffix for the output files
	time_data <- str_remove_all(toString(unlist(bounds)), '[^0-9]+')

	saveRDS(raw, file = glue("raw-{time_data}.rds"))

	nr <- nrow(raw)

	# number of papers per year
	count <- summarise(group_by(raw, year), n = n())

	max_n <- max(count$n)

	# years before 1980 fall outside the x range shown in the plot; they are
	# printed as text labels instead, most recent year first
	old <- arrange(filter(count, year < 1980), desc(year))

	ggplot(count, aes(year, n)) +
	    geom_col(fill = "#339999", color = "#ffffff") +
	    geom_hline(yintercept = 0, color = "#333333", size = 0.75) +
	    # pre-1980 years, stacked vertically at the left edge of the panel
	    geom_text(
	        data = old,
	        aes(label = year, y = (max_n / 10) + seq_along(year) * (max_n / 20)),
	        x = 1980, size = 3
	    ) +
	    # the matching counts, placed right next to the year labels
	    geom_text(
	        data = old,
	        aes(label = n, y = (max_n / 10) + seq_along(year) * (max_n / 20)),
	        x = 1982, size = 2.5, fontface = "bold", color = 2
	    ) +
	    scale_y_continuous(position = "right") +
	    # zoom on 1980-2022 without dropping the older rows from the data
	    coord_cartesian(xlim = c(1980, 2022)) +
	    labs(
	        title = glue("Last {nr} papers read at sci-hub: distribution of years"),
	        subtitle = glue("Website scraped between: {bounds[1]} and {bounds[2]} GMT+3"),
	        caption = "@ikashnitsky",
	        y = "# of papers"
	    ) +
	    theme_minimal() +
	    theme(
	        panel.grid.minor = element_blank(),
	        plot.title = element_text(face = "bold", hjust = 0.5),
	        plot.subtitle = element_text(hjust = 0.5)
	    )

	# no `plot` argument: ggsave() writes the most recently built plot
	ggsave(
	    filename = glue("years-{time_data}.png"),
	    width = 6.4, height = 3.6,
	    type = "cairo-png", bg = "#ffffff"
	)