Skip to content

Instantly share code, notes, and snippets.

@giocomai
Last active June 5, 2017 13:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save giocomai/43712740a468dc5435a93cdb01457a69 to your computer and use it in GitHub Desktop.
Extracts dumps of pageviews for Turkish language version of Wikipedia for the month of April 2017 and creates basic graphs
# Setup: packages and locale ----
library("rvest")      # HTML scraping of the dumps index page
library("tidyverse")  # dplyr/purrr/readr/ggplot2 pipeline
library("lubridate")  # date-time parsing (ymd_hms, hour, day)
library("scales")     # comma() label formatting for axes
# Force an English locale so date labels on the plots are not localized
Sys.setlocale(category = "LC_TIME", locale = "en_IE")
# Scrape the Wikimedia dumps index for April 2017 and build a table of
# download links for the hourly "projectviews" files (counts aggregated
# per project, one file per hour).
base_url <- "https://dumps.wikimedia.org/other/pageviews/2017/2017-04/"
dumpList <- read_html(base_url)
links <- tibble(filename = html_attr(html_nodes(dumpList, "a"), "href")) %>%
  # keep only the per-project aggregates, not the per-page "pageviews" files
  filter(grepl(pattern = "projectviews", x = filename)) %>%
  mutate(link = paste0(base_url, filename))
# Create a local cache folder and download every projectviews dump that is
# not already present, so re-running the script does not re-fetch files.
dir.create("wikipediaStats", showWarnings = FALSE)
for (i in seq_along(links$link)) {
  destination <- file.path("wikipediaStats", paste0(links$filename[i], ".txt"))
  if (!file.exists(destination)) {
    download.file(url = links$link[i], destfile = destination)
  }
}
# Read every cached dump (space-delimited: project, page, views, bytes),
# keep only Turkish-language projects ("tr", "tr.m", "tr.b", ...), and tag
# each row with the index of the file it came from (.id = "id").
pageviews <- list.files(path = file.path("wikipediaStats"), full.names = TRUE) %>%
map_df(.f = read_delim, delim = " ", col_names = FALSE, trim_ws = TRUE, .id = "id") %>% transmute(id = as.integer(id), source = X1, visits = X3) %>% filter(grepl(x = source, pattern = "^tr"))
# Sum visits per hourly file, then attach the filename/link columns.
# NOTE(review): bind_cols(links) assumes the alphabetical order returned by
# list.files() matches the row order of `links` — true for these
# timestamp-named files, but fragile; a join on filename would be safer.
# The date is recovered from the filename ("projectviews-YYYYMMDD-HHMMSS").
pageviews <- pageviews %>% group_by(id) %>% tally(visits) %>% bind_cols(links) %>% mutate(date = gsub(x = filename, replacement = "", pattern = "projectviews-")) %>% select(-filename, -link) %>% mutate(date = lubridate::ymd_hms(date, tz = "GMT")) %>%
mutate(date = date+(3600*4)) # adjusting by four hours to Istanbul time
# NOTE(review): Istanbul was UTC+3 in April 2017 (Turkey stayed on UTC+3
# year-round from September 2016), so +4 hours may overshoot by one — confirm.
# The three charts below are identical apart from the date filter and the
# output file, so the shared ggplot specification is factored into a helper.
#
# Build the hourly-pageviews time series chart for the given data.
plot_hourly_pageviews <- function(data) {
  ggplot(data = data, mapping = aes(x = date, y = n)) +
    geom_line() +
    scale_y_continuous(name = "", labels = comma) +
    scale_x_datetime(name = "") +
    theme_minimal() +
    labs(title = "Number of pageviews per hour on Turkish-language Wikipedia projects")
}
# print() makes the plot the "last plot" even under Rscript, so the
# argument-less ggsave() calls below save the intended chart.
# Full month of April 2017
print(plot_hourly_pageviews(pageviews))
ggsave(filename = "1.png")
# From 23 April onwards
print(plot_hourly_pageviews(pageviews %>% filter(date > as.POSIXct("2017-04-23"))))
ggsave(filename = "2.png")
# Last days of April only
print(plot_hourly_pageviews(pageviews %>% filter(date > as.POSIXct("2017-04-28"))))
ggsave(filename = "3.png")
# Overlay one line per day (23-30 April) against the hour of the day, to
# compare the daily traffic profile across the period.
pageviews %>%
  filter(date > lubridate::ymd("2017-04-23") & date < lubridate::ymd("2017-05-01")) %>%
  mutate(Hour = hour(date), Day = factor(day(date))) %>%
  ggplot(mapping = aes(x = Hour, y = n, color = Day)) +
  scale_y_continuous(name = "", labels = comma) +
  scale_x_continuous(name = "Hour of the day") +
  geom_line(size = 1.5) +
  scale_color_brewer(type = "qual") +
  theme_minimal() +
  labs(title = "Number of pageviews per hour on Turkish-language Wikipedia projects",
       # April has only 30 days; the data filtered above ends on 30 April
       subtitle = "23-30 April 2017")
ggsave(filename = "hour.png", width = 8, height = 6)
######
# wikipedia projects as listed in the dump files:
# wikibooks: ".b"
# wiktionary: ".d"
# wikimedia: ".m"
# wikipedia mobile: ".mw"
# wikinews: ".n"
# wikiquote: ".q"
# wikisource: ".s"
# wikiversity: ".v"
# mediawiki: ".w"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment