# How does wikipedia see archaeology?
# @benmarwick, last active September 22, 2015
# install.packages("devtools")
# devtools::install_github("hadley/httr")
# devtools::install_github("petermeissner/wikipediatrend")
# http://www.r-datacollection.com/blog/Using-wikipediatrend/
# devtools::install_github("Ironholds/WikipediR")
# devtools::install_github("chainsawriot/pediarr")
#- sites per country
#- sites per time period
#- distribution of article sizes (biggest)
#- distribution of numbers of authors (most)
#- distribution of edit frequency (most)
#- distribution of view frequency (most)
library(httr)
library(WikipediR)
library(pediarr)
library(wikipediatrend)
# geographical coverage: sites per country
country <- pediacategory("Category:Archaeology by country")
sites_by_country <- pediacategory("Category:Archaeological sites by country")
country_names <- gsub("Category:Archaeological sites in ", "", sites_by_country)
country_names <- gsub("the ", "", country_names)
pages_in_sites_by_country <- lapply(sites_by_country, function(i) pediacategory(i))
names(pages_in_sites_by_country) <- country_names
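# quick look at the structure: each element mixes plain page titles with
# "Category:..." sub-category entries, which is what the grepl("Category", ...)
# filters below separate out
str(head(pages_in_sites_by_country, 2))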
# some countries have categories we need to drill down into... which countries?
# just get the list items that contain categories
cats_in_sites_by_country <- lapply(pages_in_sites_by_country, function(i) i[grepl("Category", i)])
# remove the empty list items
cats_in_sites_by_country <- cats_in_sites_by_country[unname(sapply(cats_in_sites_by_country, function(i) length(i) != 0))] # 88 here
## first pass... drill down and flatten out so we just add sites to the
## country level of the list, rather than creating increasingly nested
## lists in lists
tmp2 <- vector("list", length = length(cats_in_sites_by_country))
for (i in seq_along(cats_in_sites_by_country)) {
  tmp <- cats_in_sites_by_country[[i]]
  # look in each list item and get sites in each cat
  tmp1 <- vector("list", length = length(tmp))
  for (j in seq_along(tmp)) {
    tmp1[[j]] <- pediacategory(tmp[[j]])
  }
  tmp2[[i]] <- unlist(tmp1)
}
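# the fetch-and-flatten above repeats in the later passes; a minimal helper
# sketch (hypothetical name, not part of pediarr) that does the same thing,
# assuming pediacategory() returns a character vector of category members:
pages_in_cats <- function(cats) {
  # fetch the members of each category and flatten into one vector
  unlist(lapply(cats, pediacategory))
}
# e.g. tmp2[[i]] <- pages_in_cats(cats_in_sites_by_country[[i]])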
# attach country names
names(tmp2) <- names(cats_in_sites_by_country)
## second pass... drill down again to catch sites in states
cats_in_tmp2 <- lapply(tmp2, function(i) i[grepl("Category", i)])
cats_in_tmp2 <- cats_in_tmp2[sapply(cats_in_tmp2, function(i) length(i) != 0)]
# drill down again a second level
tmp3 <- vector("list", length = length(cats_in_tmp2))
for (i in seq_along(cats_in_tmp2)) {
  tmp_name <- names(cats_in_tmp2[i])
  tmp <- unlist(unname(cats_in_tmp2[i]))
  # look in each list item and get sites in each cat
  tmp1 <- vector("list", length = length(tmp))
  for (j in seq_along(tmp)) {
    tmp1[[j]] <- pediacategory(tmp[[j]])
  }
  tmp3[[i]] <- unlist(tmp1)
  names(tmp3)[i] <- tmp_name # name the list itself, not a copy of an element
}
# attach country names
names(tmp3) <- names(cats_in_tmp2)
# are there any meaningful categories left at this level?
cats_in_tmp3 <- lapply(tmp3, function(i) i[grepl("Category", i)])
cats_in_tmp3 <- cats_in_tmp3[sapply(cats_in_tmp3, function(i) length(i) != 0)]
# yes, seems there are more, but we need to be discerning
## third pass...
archy_cats <- c("sites", "ruins", "castles", "fortresses", "forts",
                "monuments", "roads", "villas", "petroglyphs", "mounds",
                "shipwrecks", "bridges", "amphitheatres", "dolmens",
                "stupas", "theatres", "city", "baths", "settlements",
                "necropolis", "circles") # lower case, to match the tolower()'d titles
tmp4 <- vector("list", length = length(cats_in_tmp3))
for (i in seq_along(cats_in_tmp3)) {
  idx <- tolower(cats_in_tmp3[[i]])
  tmp4[[i]] <- cats_in_tmp3[[i]][grepl(paste0(archy_cats, collapse = "|"), idx)]
}
# get rid of the empties
names(tmp4) <- names(cats_in_tmp3)
tmp4 <- tmp4[sapply(tmp4, function(i) length(i) != 0)]
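# this filter-by-keyword-then-drop-empties step recurs in the fifth pass;
# a minimal helper sketch (hypothetical name) under the same assumptions:
keep_archy_cats <- function(cat_list, keywords = archy_cats) {
  pattern <- paste0(keywords, collapse = "|")
  out <- lapply(cat_list, function(i) i[grepl(pattern, tolower(i))])
  out[sapply(out, length) > 0] # drop countries with no matching categories
}
# e.g. tmp4 <- keep_archy_cats(cats_in_tmp3)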
## fourth pass... now get the pages for these archy categories... drill down again another level
tmp5 <- vector("list", length = length(tmp4))
for (i in seq_along(tmp4)) {
  tmp_name <- names(tmp4[i])
  tmp <- unlist(unname(tmp4[i]))
  # look in each list item and get sites in each cat
  tmp1 <- vector("list", length = length(tmp))
  for (j in seq_along(tmp)) {
    tmp1[[j]] <- pediacategory(tmp[[j]])
  }
  tmp5[[i]] <- unlist(tmp1)
  names(tmp5)[i] <- tmp_name
}
# attach country names
names(tmp5) <- names(tmp4)
## fifth pass... probably just drill down one more level of categories then that should do...
# just get the items that are categories (keep tmp5 itself intact so its
# pages can be pooled with the other passes at the end)
cats_in_tmp5 <- lapply(tmp5, function(i) i[grepl("Category", i)])
cats_in_tmp5 <- cats_in_tmp5[sapply(cats_in_tmp5, function(i) length(i) != 0)]
# subset categories that are relevant to archy...
tmp6 <- vector("list", length = length(cats_in_tmp5))
for (i in seq_along(cats_in_tmp5)) {
  idx <- tolower(cats_in_tmp5[[i]])
  tmp6[[i]] <- cats_in_tmp5[[i]][grepl(paste0(archy_cats, collapse = "|"), idx)]
}
# get rid of the empties
names(tmp6) <- names(cats_in_tmp5)
tmp6 <- tmp6[sapply(tmp6, function(i) length(i) != 0)]
# now get the pages for these archy categories...do the drill
tmp7 <- vector("list", length = length(tmp6))
for (i in seq_along(tmp6)) {
  tmp_name <- names(tmp6[i])
  tmp <- unlist(unname(tmp6[i]))
  # look in each list item and get sites in each cat
  tmp1 <- vector("list", length = length(tmp))
  for (j in seq_along(tmp)) {
    tmp1[[j]] <- pediacategory(tmp[[j]])
  }
  tmp7[[i]] <- unlist(tmp1)
  names(tmp7)[i] <- tmp_name
}
# attach country names
names(tmp7) <- names(tmp6)
# that's deep enough down the rabbit hole... let's put the five passes together into one big list...
tmp0a <- lapply(pages_in_sites_by_country, function(i) i[!grepl("Category", i)])
tmp2a <- lapply(tmp2, function(i) i[!grepl("Category", i)])
tmp3a <- lapply(tmp3, function(i) i[!grepl("Category", i)])
tmp5a <- lapply(tmp5, function(i) i[!grepl("Category", i)])
# tmp4 and tmp6 hold only categories, so they contribute no pages here
tmp7a <- lapply(tmp7, function(i) i[!grepl("Category", i)])
l <- list(tmp0a, tmp2a, tmp3a, tmp5a, tmp7a)
keys <- unique(unlist(lapply(l, names)))
one_list_country <- setNames(do.call(mapply, c(FUN=c, lapply(l, `[`, keys))), keys)
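# the line above concatenates, key by key, every list that shares a name;
# a toy illustration with made-up values:
#   l <- list(list(a = 1, b = 2), list(a = 3, c = 4))
#   keys <- unique(unlist(lapply(l, names)))                     # "a" "b" "c"
#   setNames(do.call(mapply, c(FUN = c, lapply(l, `[`, keys))), keys)
#   # $a 1 3   $b 2   $c 4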
rm(tmp, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, l, keys,
   tmp0a, tmp2a, tmp3a, tmp5a, tmp7a)
rm(list=(ls()[ls()!="one_list_country"]))
save(one_list_country, file = "one_list_country.RData")
# how many sites per country?
how_many <- sapply(one_list_country, length)
how_many <- data.frame(country = names(how_many),
                       site_count = unname(how_many))
how_many <- how_many[with(how_many, order(site_count)), ]
how_many[how_many$country == "Thailand", ] # spot-check one country
# map it
library(rworldmap)
names(how_many) <- c("country", "value")
how_many$country <- gsub("the ", "", how_many$country)
# create a map-shaped window
mapDevice('x11')
# join to a coarse-resolution map
spdf <- joinCountryData2Map(how_many, joinCode="NAME", nameJoinColumn="country")
mapCountryData(spdf, nameColumnToPlot="value", catMethod="fixedWidth")
# length of each article...
# using WikipediR
page_metadata_length_vec <- vector("list", length = length(one_list_country))
for (i in seq_along(one_list_country)) {
  pages <- one_list_country[[i]]
  page_metadata <- lapply(pages, function(j) page_info("en", "wikipedia", page = j))
  # extract the page length by name rather than by position, which is fragile
  page_metadata_length <- lapply(page_metadata, function(k) k$query$pages[[1]]$length)
  page_metadata_length_vec[[i]] <- unlist(page_metadata_length)
}
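# with thousands of pages these API calls can fail intermittently; a minimal
# sketch of a safer wrapper (hypothetical helper, not part of WikipediR):
safe_page_length <- function(j) {
  tryCatch(
    page_info("en", "wikipedia", page = j)$query$pages[[1]]$length,
    error = function(e) NA_integer_ # record a miss instead of aborting the loop
  )
}
# e.g. page_metadata_length <- lapply(pages, safe_page_length)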
names(page_metadata_length_vec) <- names(one_list_country)
stack_pages <- stack(page_metadata_length_vec)
library(dplyr)
pages_summary <- stack_pages %>%
  group_by(ind) %>%
  filter(!grepl("Category", ind)) %>%
  summarise(mean = mean(values),
            n = n()) %>%
  arrange(-n)
## plot
library(ggplot2)
# numbers of articles per country
ggplot(pages_summary[1:25, ], aes(reorder(ind, n), n)) +
  geom_point(size = 4) +
  coord_flip() +
  theme_minimal(base_size = 14) +
  xlab("Country") +
  ylab("approximate number of archaeological \nsites with wikipedia pages")
# distribution of article lengths
ggplot(stack_pages, aes(reorder(ind, values), values)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.05) +
  coord_flip() +
  theme_minimal() +
  xlab("Country") +
  ylab("distribution of page sizes for \narchaeological sites \nwith wikipedia pages")
# relationship between number of articles and article length
ggplot(pages_summary, aes(mean, n)) +
  geom_text(aes(label = ind), size = 3) +
  geom_smooth() +
  scale_y_log10() +
  theme_minimal(base_size = 14) +
  xlab("Average size of page") +
  ylab("approximate number of archaeological \nsites with wikipedia pages")
## What pages are the most accessed?
sum_year_page_views <- function(page_name) {
  # total the daily view counts for one page over the last year
  page_views <- wp_trend(page = page_name,
                         lang = "en",
                         from = Sys.Date() - 365)
  sum(page_views$count)
}
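# quick check on a single page ("Stonehenge" is just an example title,
# not from the original script):
sum_year_page_views("Stonehenge")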
page_accesses_vec <- vector("list", length = length(one_list_country))
for (i in seq_along(one_list_country)) {
  pages <- one_list_country[[i]]
  page_accesses <- lapply(pages, sum_year_page_views)
  page_accesses_vec[[i]] <- unlist(page_accesses)
}
# still some 'Category' and 'List' pages that could be drilled into, and
# 'Template' and 'File' pages that could be ignored
## how many editors per country?
# how many unique editors on a page? (exploratory scratch; `page` was
# undefined in the original -- "Stonehenge" here is just an example title)
page <- "Stonehenge"
page_changes <- recent_changes(page)
length(unique(sapply(page_changes$query$recentchanges, function(i) i$user)))
library(WikipediR)
# try the category-listing API on an arbitrary test category
archy_cat <- pages_in_category("en", "wikipedia", categories = "Minerals")
length(archy_cat$query$categorymembers)
sapply(archy_cat$query$categorymembers, function(i) i$title)
library(wikipediatrend)
page_views <- wp_trend("main_page")
library(ggplot2)
ggplot(page_views, aes(x = date, y = count)) +
  geom_line(size = 1.5, colour = "steelblue") +
  geom_smooth(method = "loess", colour = "#00000000", fill = "#001090", alpha = 0.1) +
  scale_y_continuous(breaks = seq(5e6, 50e6, 5e6),
                     labels = paste(seq(5, 50, 5), "M")) +
  theme_bw()
page_views <- wp_trend(page = c("Objetivos_de_Desarrollo_del_Milenio",
                                "Millennium_Development_Goals"),
                       lang = c("es", "en"),
                       from = Sys.Date() - 100)
ggplot(page_views, aes(x = date, y = count, group = lang, color = lang, fill = lang)) +
  geom_smooth(size = 1.5) +
  geom_point() +
  theme_bw()