# How does wikipedia see archaeology?
# @benmarwick, last active September 22, 2015
# install.packages("devtools")
# devtools::install_github("hadley/httr")
# devtools::install_github("petermeissner/wikipediatrend")
# http://www.r-datacollection.com/blog/Using-wikipediatrend/
# devtools::install_github("Ironholds/WikipediR")
# devtools::install_github("chainsawriot/pediarr")
#- sites per country
#- sites per time period
#- distribution of article sizes (biggest)
#- distribution of numbers of authors (most)
#- distribution of edit frequency (most)
#- distribution of view frequency (most)
library(httr)
library(WikipediR)
library(pediarr)
library(wikipediatrend)
# geographical coverage: sites per country
country <- pediacategory("Category:Archaeology by country")
sites_by_country <- pediacategory("Category:Archaeological sites by country")
country_names <- gsub("Category:Archaeological sites in ", "", sites_by_country)
country_names <- gsub("the ", "", country_names)
pages_in_sites_by_country <- lapply(sites_by_country, function(i) pediacategory(i))
names(pages_in_sites_by_country) <- country_names
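# quick look at the structure: each element mixes plain page titles with
# "Category:..." sub-category entries, which is what the grepl("Category", ...)
# filters below separate out
str(head(pages_in_sites_by_country, 2))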
# some countries have categories we need to drill down into... which countries?
# just get the list items that contain categories
cats_in_sites_by_country <- lapply(pages_in_sites_by_country, function(i) i[grepl("Category", i)])
# remove the empty list items
cats_in_sites_by_country <- cats_in_sites_by_country[unname(sapply(cats_in_sites_by_country, function(i) length(i) != 0))] # 88 here
## first pass... drill down and flatten out so we just add sites to the
## country level of the list, rather than creating increasingly nested
## lists in lists
tmp2 <- vector("list", length = length(cats_in_sites_by_country))
for (i in seq_along(cats_in_sites_by_country)) {
  tmp <- cats_in_sites_by_country[[i]]
  # look in each list item and get sites in each cat
  tmp1 <- vector("list", length = length(tmp))
  for (j in seq_along(tmp)) {
    tmp1[[j]] <- pediacategory(tmp[[j]])
  }
  tmp2[[i]] <- unlist(tmp1)
}
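# the fetch-and-flatten above repeats in the later passes; a minimal helper
# sketch (hypothetical name, not part of pediarr) that does the same thing,
# assuming pediacategory() returns a character vector of category members:
pages_in_cats <- function(cats) {
  # fetch the members of each category and flatten into one vector
  unlist(lapply(cats, pediacategory))
}
# e.g. tmp2[[i]] <- pages_in_cats(cats_in_sites_by_country[[i]])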
# attach country names
names(tmp2) <- names(cats_in_sites_by_country)
## second pass... drill down again to catch sites in states
cats_in_tmp2 <- lapply(tmp2, function(i) i[grepl("Category", i)])
cats_in_tmp2 <- cats_in_tmp2[sapply(cats_in_tmp2, function(i) length(i) != 0)]
# drill down again a second level
tmp3 <- vector("list", length = length(cats_in_tmp2))
for (i in seq_along(cats_in_tmp2)) {
  tmp_name <- names(cats_in_tmp2[i])
  tmp <- unlist(unname(cats_in_tmp2[i]))
  # look in each list item and get sites in each cat
  tmp1 <- vector("list", length = length(tmp))
  for (j in seq_along(tmp)) {
    tmp1[[j]] <- pediacategory(tmp[[j]])
  }
  tmp3[[i]] <- unlist(tmp1)
  names(tmp3)[i] <- tmp_name # name the list itself, not a copy of an element
}
# attach country names
names(tmp3) <- names(cats_in_tmp2)
# are there any meaningful categories left at this level?
cats_in_tmp3 <- lapply(tmp3, function(i) i[grepl("Category", i)])
cats_in_tmp3 <- cats_in_tmp3[sapply(cats_in_tmp3, function(i) length(i) != 0)]
# yes, seems there are more, but we need to be discerning
## third pass...
archy_cats <- c("sites", "ruins", "castles", "fortresses", "forts",
                "monuments", "roads", "villas", "petroglyphs", "mounds",
                "shipwrecks", "bridges", "amphitheatres", "dolmens",
                "stupas", "theatres", "city", "baths", "settlements",
                "necropolis", "circles") # lower case, to match the tolower()'d titles
tmp4 <- vector("list", length = length(cats_in_tmp3))
for (i in seq_along(cats_in_tmp3)) {
  idx <- tolower(cats_in_tmp3[[i]])
  tmp4[[i]] <- cats_in_tmp3[[i]][grepl(paste0(archy_cats, collapse = "|"), idx)]
}
# get rid of the empties
names(tmp4) <- names(cats_in_tmp3)
tmp4 <- tmp4[sapply(tmp4, function(i) length(i) != 0)]
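# this filter-by-keyword-then-drop-empties step recurs in the fifth pass;
# a minimal helper sketch (hypothetical name) under the same assumptions:
keep_archy_cats <- function(cat_list, keywords = archy_cats) {
  pattern <- paste0(keywords, collapse = "|")
  out <- lapply(cat_list, function(i) i[grepl(pattern, tolower(i))])
  out[sapply(out, length) > 0] # drop countries with no matching categories
}
# e.g. tmp4 <- keep_archy_cats(cats_in_tmp3)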
## fourth pass... now get the pages for these archy categories... drill down again another level
tmp5 <- vector("list", length = length(tmp4))
for (i in seq_along(tmp4)) {
  tmp_name <- names(tmp4[i])
  tmp <- unlist(unname(tmp4[i]))
  # look in each list item and get sites in each cat
  tmp1 <- vector("list", length = length(tmp))
  for (j in seq_along(tmp)) {
    tmp1[[j]] <- pediacategory(tmp[[j]])
  }
  tmp5[[i]] <- unlist(tmp1)
  names(tmp5)[i] <- tmp_name
}
# attach country names
names(tmp5) <- names(tmp4)
## fifth pass... probably just drill down one more level of categories then that should do...
# just get the items that are categories (keep tmp5 itself intact so its
# pages can be pooled with the other passes at the end)
cats_in_tmp5 <- lapply(tmp5, function(i) i[grepl("Category", i)])
cats_in_tmp5 <- cats_in_tmp5[sapply(cats_in_tmp5, function(i) length(i) != 0)]
# subset categories that are relevant to archy...
tmp6 <- vector("list", length = length(cats_in_tmp5))
for (i in seq_along(cats_in_tmp5)) {
  idx <- tolower(cats_in_tmp5[[i]])
  tmp6[[i]] <- cats_in_tmp5[[i]][grepl(paste0(archy_cats, collapse = "|"), idx)]
}
# get rid of the empties
names(tmp6) <- names(cats_in_tmp5)
tmp6 <- tmp6[sapply(tmp6, function(i) length(i) != 0)]
# now get the pages for these archy categories...do the drill
tmp7 <- vector("list", length = length(tmp6))
for (i in seq_along(tmp6)) {
  tmp_name <- names(tmp6[i])
  tmp <- unlist(unname(tmp6[i]))
  # look in each list item and get sites in each cat
  tmp1 <- vector("list", length = length(tmp))
  for (j in seq_along(tmp)) {
    tmp1[[j]] <- pediacategory(tmp[[j]])
  }
  tmp7[[i]] <- unlist(tmp1)
  names(tmp7)[i] <- tmp_name
}
# attach country names
names(tmp7) <- names(tmp6)
# that's deep enough down the rabbit hole... let's put the five passes together into one big list...
tmp0a <- lapply(pages_in_sites_by_country, function(i) i[!grepl("Category", i)])
tmp2a <- lapply(tmp2, function(i) i[!grepl("Category", i)])
tmp3a <- lapply(tmp3, function(i) i[!grepl("Category", i)])
tmp5a <- lapply(tmp5, function(i) i[!grepl("Category", i)])
# tmp4 and tmp6 hold only categories, so they contribute no pages here
tmp7a <- lapply(tmp7, function(i) i[!grepl("Category", i)])
l <- list(tmp0a, tmp2a, tmp3a, tmp5a, tmp7a)
keys <- unique(unlist(lapply(l, names)))
one_list_country <- setNames(do.call(mapply, c(FUN=c, lapply(l, `[`, keys))), keys)
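# the line above concatenates, key by key, every list that shares a name;
# a toy illustration with made-up values:
#   l <- list(list(a = 1, b = 2), list(a = 3, c = 4))
#   keys <- unique(unlist(lapply(l, names)))                     # "a" "b" "c"
#   setNames(do.call(mapply, c(FUN = c, lapply(l, `[`, keys))), keys)
#   # $a 1 3   $b 2   $c 4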
rm(tmp, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, l, keys,
   tmp0a, tmp2a, tmp3a, tmp5a, tmp7a)
rm(list=(ls()[ls()!="one_list_country"]))
save(one_list_country, file = "one_list_country.RData")
# how many sites per country?
how_many <- sapply(one_list_country, length)
how_many <- data.frame(country = names(how_many),
                       site_count = unname(how_many))
how_many <- how_many[with(how_many, order(site_count)), ]
how_many[how_many$country == "Thailand", ] # spot-check one country
# map it
library(rworldmap)
names(how_many) <- c("country", "value")
how_many$country <- gsub("the ", "", how_many$country)
# create a map-shaped window
mapDevice('x11')
# join to a coarse-resolution map
spdf <- joinCountryData2Map(how_many, joinCode="NAME", nameJoinColumn="country")
mapCountryData(spdf, nameColumnToPlot="value", catMethod="fixedWidth")
# length of each article...
# using WikipediR
page_metadata_length_vec <- vector("list", length = length(one_list_country))
for (i in seq_along(one_list_country)) {
  pages <- one_list_country[[i]]
  page_metadata <- lapply(pages, function(j) page_info("en", "wikipedia", page = j))
  # extract the page length by name rather than by position, which is fragile
  page_metadata_length <- lapply(page_metadata, function(k) k$query$pages[[1]]$length)
  page_metadata_length_vec[[i]] <- unlist(page_metadata_length)
}
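# with thousands of pages these API calls can fail intermittently; a minimal
# sketch of a safer wrapper (hypothetical helper, not part of WikipediR):
safe_page_length <- function(j) {
  tryCatch(
    page_info("en", "wikipedia", page = j)$query$pages[[1]]$length,
    error = function(e) NA_integer_ # record a miss instead of aborting the loop
  )
}
# e.g. page_metadata_length <- lapply(pages, safe_page_length)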
names(page_metadata_length_vec) <- names(one_list_country)
stack_pages <- stack(page_metadata_length_vec)
library(dplyr)
pages_summary <- stack_pages %>%
  group_by(ind) %>%
  filter(!grepl("Category", ind)) %>%
  summarise(mean = mean(values),
            n = n()) %>%
  arrange(-n)
## plot
library(ggplot2)
# numbers of articles per country
ggplot(pages_summary[1:25, ], aes(reorder(ind, n), n)) +
  geom_point(size = 4) +
  coord_flip() +
  theme_minimal(base_size = 14) +
  xlab("Country") +
  ylab("approximate number of archaeological \nsites with wikipedia pages")
# distribution of article lengths
ggplot(stack_pages, aes(reorder(ind, values), values)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.05) +
  coord_flip() +
  theme_minimal() +
  xlab("Country") +
  ylab("distribution of page sizes for \narchaeological sites \nwith wikipedia pages")
# relationship between number of articles and article length
ggplot(pages_summary, aes(mean, n)) +
  geom_text(aes(label = ind), size = 3) +
  geom_smooth() +
  scale_y_log10() +
  theme_minimal(base_size = 14) +
  xlab("Average size of page") +
  ylab("approximate number of archaeological \nsites with wikipedia pages")
## What pages are the most accessed?
sum_year_page_views <- function(page_name) {
  # total the daily view counts for one page over the last year
  page_views <- wp_trend(page = page_name,
                         lang = "en",
                         from = Sys.Date() - 365)
  sum(page_views$count)
}
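# quick check on a single page ("Stonehenge" is just an example title,
# not from the original script):
sum_year_page_views("Stonehenge")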
page_accesses_vec <- vector("list", length = length(one_list_country))
for (i in seq_along(one_list_country)) {
  pages <- one_list_country[[i]]
  page_accesses <- lapply(pages, sum_year_page_views)
  page_accesses_vec[[i]] <- unlist(page_accesses)
}
# still some 'Category' and 'List' pages that could be drilled into, and
# 'Template' and 'File' pages that could be ignored
## how many editors per country?
# how many unique editors on a page? (exploratory scratch; `page` was
# undefined in the original -- "Stonehenge" here is just an example title)
page <- "Stonehenge"
page_changes <- recent_changes(page)
length(unique(sapply(page_changes$query$recentchanges, function(i) i$user)))
library(WikipediR)
# try the category-listing API on an arbitrary test category
archy_cat <- pages_in_category("en", "wikipedia", categories = "Minerals")
length(archy_cat$query$categorymembers)
sapply(archy_cat$query$categorymembers, function(i) i$title)
library(wikipediatrend)
page_views <- wp_trend("main_page")
library(ggplot2)
ggplot(page_views, aes(x = date, y = count)) +
  geom_line(size = 1.5, colour = "steelblue") +
  geom_smooth(method = "loess", colour = "#00000000", fill = "#001090", alpha = 0.1) +
  scale_y_continuous(breaks = seq(5e6, 50e6, 5e6),
                     labels = paste(seq(5, 50, 5), "M")) +
  theme_bw()
page_views <- wp_trend(page = c("Objetivos_de_Desarrollo_del_Milenio",
                                "Millennium_Development_Goals"),
                       lang = c("es", "en"),
                       from = Sys.Date() - 100)
ggplot(page_views, aes(x = date, y = count, group = lang, color = lang, fill = lang)) +
  geom_smooth(size = 1.5) +
  geom_point() +
  theme_bw()