Last active
December 15, 2016 04:51
-
-
Save ronnyli/e57c92de00818978a029fb2840f26c0b to your computer and use it in GitHub Desktop.
Wikipedia Article Sampler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#######################
# Randomly select N_RAND Wikipedia articles that are up to 3 links away from
# a root article, in this case "Educational Technology"
#
# The purpose is to discover topics you may be interested in
# (because they are all linked to a root article that you provide)
# that might never cross your path under normal circumstances
#######################
library(magrittr)
library(data.table)

WEIGHT <- 10  # lower number increases probability of sampling closer articles
N_RAND <- 5   # number of articles to randomly select

# Load CSVs that were downloaded from Quarry into R.
# Quarry output has the following structure:
#   - page_id: the linked article's id
#   - page_namespace: see the Wikipedia database schema for details
#   - page_title: the linked article's title
#   - link_distance: how many links you have to click from the root article
#     to reach this article
csv_dir <- "~/Documents/open_data/Wikipedia-QuarryQuery"

# d1 is built from this query: https://quarry.wmflabs.org/query/13582
d1 <- read.csv(file.path(csv_dir, "Educational_technology_distance1.csv")) %>%
  data.table()
# d2 is built from this query: https://quarry.wmflabs.org/query/13581
d2 <- read.csv(file.path(csv_dir, "Educational_technology_distance2.csv")) %>%
  data.table()
# d3 is built from this query: https://quarry.wmflabs.org/query/13584
d3 <- read.csv(file.path(csv_dir, "Educational_technology_distance3.csv")) %>%
  data.table()

# Combine CSVs into one data.table of ~25k articles
articles <- rbind(d1, d2, d3)

# Remove duplicates: an article reachable at several distances keeps only its
# smallest link_distance (equivalent to "leaving only the first instance" when
# the inputs are ordered by distance)
articles <- articles[, list(link_distance = min(link_distance)),
                     by = c("page_id", "page_namespace", "page_title")]

# Sample articles, weighting the probability of selection down as
# link_distance increases (this is to prevent the articles at greater
# link_distances from overwhelming the articles at smaller link_distances
# with their sheer numbers)
out <- sample(x = articles[, page_title],
              size = N_RAND,
              prob = 1 / WEIGHT ^ articles[, link_distance])

# Print the selected articles to screen; explicit print() so the result also
# appears when the script is run via source(), not just interactively
print(articles[page_title %in% out])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment