Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ajdamico/c24dd25c8b0cab3203c39b8a47d7b1a0 to your computer and use it in GitHub Desktop.
Save ajdamico/c24dd25c8b0cab3203c39b8a47d7b1a0 to your computer and use it in GitHub Desktop.
library(rvest)
# paths for the compressed download (`tf`) and its uncompressed copy (`tf2`)
tf <- tempfile()
tf2 <- tempfile()

# download and unzip
# R's default `timeout` option (60 seconds) is often too short for a large
# file, so raise it for this download only and restore the previous value
# afterwards, even when the download fails.
old_options <- options( timeout = max( 3600 , getOption( "timeout" ) ) )
download_status <- tryCatch(
  download.file( "https://dumps.wikimedia.org/other/pagecounts-ez/merged/2018/2018-05/pagecounts-2018-05-27.bz2" , tf , mode = 'wb' ) ,
  finally = options( old_options )
)
# download.file() returns a non-zero code when the transfer did not succeed
if( download_status != 0 ) stop( "download of the pagecounts file failed" )

# `remove = FALSE` keeps the compressed download next to the uncompressed copy
R.utils::bunzip2( tf , tf2 , remove = FALSE )

# import and restrict to english
# the first 25 lines are skipped -- presumably the dump's header comments;
# confirm against the file format.  with `header = FALSE` the columns are
# named V1, V2, V3, ...
x <- data.table::fread( tf2 , skip = 25 , header = FALSE )
y <- subset( x , V1 == 'en.z' )

# an empty subset would silently give a NaN average below, so fail early
if( nrow( y ) == 0 ) stop( "no 'en.z' rows found in the pagecounts file" )

# how many average pageviews overall?
average_pageviews <- mean( y$V3 )
# sample fifty thousand articles and record each one's pageviews from `y`
this_n <- 50000

# preallocate rather than growing the vector with c() on every iteration.
# slots stay NA until they are filled, so an interrupted run cannot be
# mistaken for a complete one.
sampled_pageviews <- rep( NA_real_ , this_n )

# a failed page request is retried this many times before the run stops
max_tries <- 5

for( i in seq_len( this_n ) ){

  # request a random article page and pull the text of its <title> element.
  # html_text() returns the decoded text; serializing the node with
  # as.character() would leave entities such as `&amp;` in the name, and
  # those names could never match a row of `y`.
  this_title <- NA_character_
  for( this_try in seq_len( max_tries ) ){
    this_title <- tryCatch(
      html_text( html_nodes( read_html( "https://en.wikipedia.org/wiki/Special:Random" ) , "title" ) )[ 1 ] ,
      error = function( e ) NA_character_
    )
    if( !is.na( this_title ) ) break
    # short, growing pause before the next attempt
    if( this_try < max_tries ) Sys.sleep( this_try )
  }
  if( is.na( this_title ) ) stop( "could not read a random article title after " , max_tries , " attempts" )

  # drop the trailing " - Wikipedia" and use underscores instead of spaces,
  # the form that is compared against column V2 of `y`
  this_article_name <- gsub( " " , "_" , sub( " - Wikipedia$" , "" , this_title ) , fixed = FALSE )

  # look inside `y` for the sampled article's pageviews
  this_pageviews <- subset( y , V2 == this_article_name )$V3

  # more than one matching row is unexpected; an article without any row
  # counts as zero pageviews
  if( length( this_pageviews ) > 1 ) stop( "problem" )
  sampled_pageviews[ i ] <- if( length( this_pageviews ) == 1 ) this_pageviews else 0
}
# print the two estimates side by side: first the mean pageviews across the
# randomly sampled articles, then the mean across every english row of the
# dump.  these two numbers get close as this_n gets bigger
mean( sampled_pageviews )
average_pageviews
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment