Last active
December 15, 2016 04:51
-
-
Save ronnyli/e57c92de00818978a029fb2840f26c0b to your computer and use it in GitHub Desktop.
Wikipedia Article Sampler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#######################
# Randomly select N_RAND Wikipedia articles that are up to 3 links away from
# a root article, in this case "Educational Technology"
#
# The purpose is to discover topics you may be interested in
# (because they are all linked to a root article that you provide)
# that might never cross your path under normal circumstances
#######################
library(magrittr)
library(data.table)

WEIGHT <- 10  # lower number increases probability of sampling closer articles
N_RAND <- 5   # number of articles to randomly select

# Load CSVs that were downloaded from Quarry into R.
# Quarry output has the following structure:
#   - page_id: the linked article's id
#   - page_namespace: see the Wikipedia database schema for details
#   - page_title: the linked article's title
#   - link_distance: how many links you have to click from the root article
#     to reach this article
csv_dir <- "~/Documents/open_data/Wikipedia-QuarryQuery"

# d1 is built from this query: https://quarry.wmflabs.org/query/13582
d1 <- read.csv(file.path(csv_dir, "Educational_technology_distance1.csv")) %>%
  data.table()
# d2 is built from this query: https://quarry.wmflabs.org/query/13581
d2 <- read.csv(file.path(csv_dir, "Educational_technology_distance2.csv")) %>%
  data.table()
# d3 is built from this query: https://quarry.wmflabs.org/query/13584
d3 <- read.csv(file.path(csv_dir, "Educational_technology_distance3.csv")) %>%
  data.table()

# Combine CSVs into one data.table of ~25k articles
articles <- rbind(d1, d2, d3)

# Remove duplicates: an article reachable at several distances keeps only its
# smallest link_distance (equivalent to "leaving only the first instance" when
# the inputs are ordered by distance)
articles <- articles[, list(link_distance = min(link_distance)),
                     by = c("page_id", "page_namespace", "page_title")]

# Sample articles, weighting the probability of selection down as
# link_distance increases (this is to prevent the articles at greater
# link_distances from overwhelming the articles at smaller link_distances
# with their sheer numbers)
out <- sample(x = articles[, page_title],
              size = N_RAND,
              prob = 1 / WEIGHT ^ articles[, link_distance])

# Print the selected articles to screen; explicit print() so the result also
# appears when the script is run via source(), not just interactively
print(articles[page_title %in% out])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment