Skip to content

Instantly share code, notes, and snippets.

@ikashnitsky
Created February 12, 2022 22:22
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ikashnitsky/8855e339caae46a46a3c345cf96e6204 to your computer and use it in GitHub Desktop.
Save ikashnitsky/8855e339caae46a46a3c345cf96e6204 to your computer and use it in GitHub Desktop.
Scrape 'latest read' papers from sci-hub home page and plot the years when these papers were written -- https://twitter.com/ikashnitsky/status/1492624193141288965
#===============================================================================
# 2022-02-12 -- twitter
# sci-hub latest reads -- look years
# Ilya Kashnitsky, ilya.kashnitsky@gmail.com, @ikashnitsky
#===============================================================================
library(tidyverse)
library(magrittr)
library(xml2)
library(rvest)
library(glue)
library(lubridate)
# When you visit sci-hub.ru, it displays the last 32 papers read by anyone
# Here I continuously scrape the website and plot the years when papers were written
# get 32 papers from one download iteration of the website
get_32 <- function(iter, url = "https://sci-hub.ru") {
  # iter: iteration label, stored unchanged in the `iter_n` column
  # url:  address of the page to download and parse
  # Returns a tibble with the columns paper, dt, year (numeric) and iter_n.
  page <- read_html(url)

  # text content of all nodes matched by an XPath query on the downloaded page
  span_text <- function(xpath) {
    rvest::html_text(xml2::xml_find_all(page, xpath))
  }

  titles <- span_text("//span[contains(@class, 'title')]")
  stamps <- span_text("//span[contains(@class, 'dt')]")
  years <- span_text("//span[contains(@class, 'year')]")

  tibble(
    # only matches 2..33 are kept (32 entries); presumably the first 'title'
    # match is not a paper -- TODO confirm against the page markup
    paper = titles[2:33],
    dt = stamps,
    year = as.numeric(years),
    iter_n = iter
  )
}
# Scrape the website repeatedly (100 iterations take about a minute) and keep
# every distinct paper / timestamp / year combination once
raw <- distinct(map_df(1:100, get_32), paper, dt, year)

# distinct `dt` values, in the order they first appear in the scraped data
times <- pull(distinct(raw, dt), dt)
# first and last of them -- reused in the plot subtitle
bounds <- c(first(times), last(times))
# both bounds reduced to their digits only; serves as the output file suffix
time_data <- str_remove_all(toString(unlist(bounds)), '[^0-9]+')

# keep the scraped records on disk next to the plot
saveRDS(raw, file = glue("raw-{time_data}.rds"))
# number of distinct records, shown in the plot title
nr <- nrow(raw)

# number of records per publication year
count <- summarise(group_by(raw, year), n = n())

# tallest bar; used to position the text labels of the old papers
max_n <- max(count$n)

# years before 1980, most recent first
old <- arrange(filter(count, year < 1980), desc(year))
# Bar chart of records per publication year. The x axis is limited to
# 1980-2022 via coord_cartesian(); the years before 1980 (table `old`) are
# written as text labels instead.
ggplot(count, aes(year, n)) +
  geom_col(fill = "#339999", color = "#ffffff") +
  geom_hline(yintercept = 0, color = "#333333", size = 3/4) +
  # the pre-1980 years, stacked upwards in steps of max_n/20 at x = 1980 ...
  geom_text(
    data = old,
    aes(label = year, y = (max_n/10) + seq_along(year) * (max_n/20)),
    x = 1980, size = 3
  ) +
  # ... and their counts at the same heights, at x = 1982
  geom_text(
    data = old,
    aes(label = n, y = (max_n/10) + seq_along(year) * (max_n/20)),
    x = 1982, size = 2.5, fontface = 2, color = 2
  ) +
  scale_y_continuous(position = "right") +
  coord_cartesian(xlim = c(1980, 2022)) +
  labs(
    title = glue("Last {nr} papers read at sci-hub: distribution of years"),
    subtitle = glue("Website scraped between: {bounds[1]} and {bounds[2]} GMT+3"),
    caption = "@ikashnitsky",
    y = "# of papers"
  ) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = 2, hjust = .5),
    plot.subtitle = element_text(hjust = .5)
  )

# write the plot built above to a PNG named after the scraped time bounds
ggsave(
  glue("years-{time_data}.png"),
  width = 6.4, height = 3.6, type = "cairo-png", bg = "#ffffff"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment