Last active
June 5, 2017 13:41
-
-
Save giocomai/43712740a468dc5435a93cdb01457a69 to your computer and use it in GitHub Desktop.
Extracts dumps of pageviews for Turkish language version of Wikipedia for the month of April 2017 and creates basic graphs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages: rvest for scraping the dump index, tidyverse for wrangling
# and plotting, lubridate for timestamps, scales for axis labels.
library("rvest")
library("tidyverse")
library("lubridate")
library("scales")

# Force English month/day names in plot axes regardless of system locale.
Sys.setlocale(category = "LC_TIME", locale = "en_IE")
# Scrape the Wikimedia dump index for April 2017 and build a table of
# downloadable "projectviews" files (hourly counts aggregated per project,
# as opposed to the much larger per-page "pageviews" files).
base_url <- "https://dumps.wikimedia.org/other/pageviews/2017/2017-04/"
dumpList <- read_html(base_url)

# One row per hourly dump: `filename` as listed on the page, `link` = full URL.
links <- tibble(filename = html_attr(html_nodes(dumpList, "a"), "href")) %>%
  # keep only the per-project aggregates
  filter(grepl(pattern = "projectviews", x = filename)) %>%
  mutate(link = paste0(base_url, filename))
# Local cache folder for the hourly dump files.
dir.create("wikipediaStats", showWarnings = FALSE)

# Download each projectviews file once; files already on disk are skipped,
# so the script can be re-run without re-fetching ~720 hourly dumps.
for (i in seq_len(nrow(links))) {
  destfile <- file.path("wikipediaStats", paste0(links$filename[i], ".txt"))
  if (!file.exists(destfile)) {
    download.file(url = links$link[i], destfile = destfile)
  }
}
# Read every cached dump (one file per hour; space-delimited, no header),
# keeping only rows for Turkish-language projects.
# Columns in the raw files: X1 = project code, X2 = label, X3 = view count.
pageviews <- list.files(path = file.path("wikipediaStats"), full.names = TRUE) %>%
  map_df(.f = read_delim, delim = " ", col_names = FALSE, trim_ws = TRUE, .id = "id") %>%
  transmute(id = as.integer(id), source = X1, visits = X3) %>%
  # "^tr" matches tr plus all suffixed projects (tr.b, tr.d, tr.m, ...)
  filter(grepl(x = source, pattern = "^tr"))

# Collapse to one row per hourly file (total Turkish-project visits in `n`)
# and recover each file's timestamp from its filename.
# NOTE(review): bind_cols() pairs rows purely by position; this assumes
# list.files() returns the cached files in the same order as the scraped
# `links` table — TODO confirm.
pageviews <- pageviews %>%
  group_by(id) %>%
  summarise(n = sum(visits)) %>%
  bind_cols(links) %>%
  mutate(date = gsub(x = filename, replacement = "", pattern = "projectviews-")) %>%
  select(-filename, -link) %>%
  mutate(date = lubridate::ymd_hms(date, tz = "GMT")) %>%
  # shift towards Istanbul local time
  # NOTE(review): Istanbul was UTC+3 in April 2017, so a fixed +4h offset
  # looks one hour off; lubridate::with_tz(date, "Europe/Istanbul") would
  # be exact — confirm before changing, as the plots depend on it.
  mutate(date = date + (3600 * 4))
# Plot 1: hourly pageviews across the whole month.
ggplot(pageviews, aes(date, n)) +
  geom_line() +
  scale_x_datetime(name = "") +
  scale_y_continuous(name = "", labels = comma) +
  theme_minimal() +
  labs(title = "Number of pageviews per hour on Turkish-language Wikipedia projects")
ggsave(filename = "1.png")
# Plot 2: zoom in on 23 April onward.
after_23 <- pageviews %>% filter(date > as.POSIXct("2017-04-23"))
ggplot(after_23, aes(date, n)) +
  geom_line() +
  scale_x_datetime(name = "") +
  scale_y_continuous(name = "", labels = comma) +
  theme_minimal() +
  labs(title = "Number of pageviews per hour on Turkish-language Wikipedia projects")
ggsave(filename = "2.png")
# Plot 3: zoom in further, 28 April onward.
after_28 <- pageviews %>% filter(date > as.POSIXct("2017-04-28"))
ggplot(after_28, aes(date, n)) +
  geom_line() +
  scale_x_datetime(name = "") +
  scale_y_continuous(name = "", labels = comma) +
  theme_minimal() +
  labs(title = "Number of pageviews per hour on Turkish-language Wikipedia projects")
ggsave(filename = "3.png")
# Hour-of-day profile for the last days of April: one colored line per day,
# so the daily traffic shapes can be compared directly.
pageviews %>%
  filter(date > lubridate::ymd("2017-04-23") & date < lubridate::ymd("2017-05-01")) %>%
  mutate(Hour = hour(date), Day = factor(day(date))) %>%
  ggplot(mapping = aes(x = Hour, y = n, color = Day)) +
  geom_line(size = 1.5) +
  scale_x_continuous(name = "Hour of the day") +
  scale_y_continuous(name = "", labels = comma) +
  scale_color_brewer(type = "qual") +
  theme_minimal() +
  # fixed: April has 30 days (original subtitle read "31 April")
  labs(title = "Number of pageviews per hour on Turkish-language Wikipedia projects",
       subtitle = "23 April-30 April 2017")
ggsave(filename = "hour.png", width = 8, height = 6)
######
# Project-code suffixes as used in the dump files. A bare language code
# (e.g. "tr") is desktop Wikipedia itself, which is why the script
# filters with the prefix pattern "^tr".
# wikibooks: ".b"
# wiktionary: ".d"
# wikimedia: ".m"
# wikipedia mobile: ".mw"
# wikinews: ".n"
# wikiquote: ".q"
# wikisource: ".s"
# wikiversity: ".v"
# mediawiki: ".w"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment