@giocomai
Created June 20, 2017 07:26
Create a graph of pageviews to Turkish-language Wikipedia projects (April-June 2017)
library("rvest")
library("tidyverse")
library("lubridate")
library("scales")
Sys.setlocale(category = "LC_TIME", locale = "en_IE") # English locale, so date labels on the plot render in English
dumpListApril <- read_html("https://dumps.wikimedia.org/other/pageviews/2017/2017-04/")
linksApril <- data_frame(filename = html_attr(html_nodes(dumpListApril, "a"), "href")) %>% # extracting links
  filter(grepl(x = filename, pattern = "projectviews")) %>% # keeping only aggregated data by project
  mutate(link = paste0("https://dumps.wikimedia.org/other/pageviews/2017/2017-04/", filename))
dumpListMay <- read_html("https://dumps.wikimedia.org/other/pageviews/2017/2017-05/")
linksMay <- data_frame(filename = html_attr(html_nodes(dumpListMay, "a"), "href")) %>% # extracting links
  filter(grepl(x = filename, pattern = "projectviews")) %>% # keeping only aggregated data by project
  mutate(link = paste0("https://dumps.wikimedia.org/other/pageviews/2017/2017-05/", filename))
dumpListJune <- read_html("https://dumps.wikimedia.org/other/pageviews/2017/2017-06/")
linksJune <- data_frame(filename = html_attr(html_nodes(dumpListJune, "a"), "href")) %>% # extracting links
  filter(grepl(x = filename, pattern = "projectviews")) %>% # keeping only aggregated data by project
  mutate(link = paste0("https://dumps.wikimedia.org/other/pageviews/2017/2017-06/", filename))
links <- bind_rows(linksApril, linksMay, linksJune)
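# A possible refactoring sketch: the three month-specific blocks above differ
# only in the month, so the same table could be built with one helper
# (the name get_month_links is just illustrative), assuming each monthly index
# page lists the hourly "projectviews-*" files as plain links, as above.
get_month_links <- function(month) {
  base_url <- paste0("https://dumps.wikimedia.org/other/pageviews/2017/", month, "/")
  data_frame(filename = html_attr(html_nodes(read_html(base_url), "a"), "href")) %>%
    filter(grepl(x = filename, pattern = "projectviews")) %>%
    mutate(link = paste0(base_url, filename))
}
# e.g. links <- map_df(c("2017-04", "2017-05", "2017-06"), get_month_links)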
# create folder to store files
dir.create("wikipediaStats", showWarnings = FALSE)
# download stats by project
for (i in seq_along(links$link)) {
  if (!file.exists(file.path("wikipediaStats", paste0(links$filename[i], ".txt")))) {
    download.file(url = links$link[i], destfile = file.path("wikipediaStats", paste0(links$filename[i], ".txt")))
  }
}
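# The bind_cols(links) step below relies on list.files() returning the
# downloaded files in the same (chronological) order as the rows of `links`;
# a cheap sanity check of that assumption (the ".txt" suffix is the one added
# at download time above):
stopifnot(identical(list.files(path = "wikipediaStats"),
                    paste0(links$filename, ".txt")))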
pageviews <- list.files(path = file.path("wikipediaStats"), full.names = TRUE) %>%
  map_df(.f = read_delim, delim = " ", col_names = FALSE, trim_ws = TRUE, .id = "id") %>%
  transmute(id = as.integer(id), source = X1, visits = X3) %>% # X1 = project code, X3 = hourly view count
  filter(grepl(x = source, pattern = "^tr")) # keeping only Turkish-language projects (tr, tr.m, ...)
pageviews <- pageviews %>%
  group_by(id) %>%
  tally(visits) %>% # summing hourly visits across all Turkish-language projects
  bind_cols(links) %>%
  mutate(date = gsub(x = filename, replacement = "", pattern = "projectviews-")) %>%
  select(-filename, -link) %>%
  mutate(date = lubridate::ymd_hms(date, tz = "GMT")) %>%
  mutate(date = date + lubridate::hours(3)) # shifting from GMT to Istanbul time (UTC+3)
ggplot(data = pageviews, mapping = aes(x = date, y = n)) +
  geom_line() +
  scale_y_continuous(name = "", labels = comma) +
  scale_x_datetime(name = "") +
  theme_minimal() +
  labs(title = "Number of pageviews per hour on Turkish-language Wikipedia projects")
ggsave(filename = "1.png", width = 10, height = 5)