# ajdamico/wikipedia random articles are indeed random.R

## wikipedia random articles are indeed random.R
library(rvest)

# scratch paths: `tf` receives the compressed dump, `tf2` the decompressed text
tf <- tempfile()
tf2 <- tempfile()

# download and unzip
# download.file() gives up after getOption( "timeout" ) seconds (60 by default),
# which R's documentation describes as often insufficient for large files, so
# raise the limit for this one transfer and put the previous value back afterwards
old_options <- options( timeout = max( 3600 , getOption( "timeout" ) ) )
invisible(
    tryCatch(
        download.file( "https://dumps.wikimedia.org/other/pagecounts-ez/merged/2018/2018-05/pagecounts-2018-05-27.bz2" , tf , mode = 'wb' ) ,
        finally = options( old_options )
    )
)
# remove = FALSE keeps the compressed download at `tf` after writing `tf2`
R.utils::bunzip2( tf , tf2 , remove = FALSE )

# import and restrict to english
# the first 25 lines are skipped (presumably the dump's comment header -- confirm);
# with header = FALSE the columns get fread's default names V1, V2, V3, ...
x <- data.table::fread( tf2 , skip = 25 , header = FALSE )
# keep only the rows whose first column is 'en.z'
y <- subset( x , V1 == 'en.z' )

# how many average pageviews overall?
# the sampling loop further down matches article names against V2 and reads
# the pageview count from V3
average_pageviews <- mean( y$V3 )


# sample fifty thousand articles
this_n <- 50000

# one slot per draw, allocated up front so the vector is not re-copied on every
# iteration; a slot stays NA until its draw has completed
sampled_pageviews <- rep( NA_real_ , this_n )

# NOTE(review): a failed request makes read_html() error and ends the loop early
for( i in seq_len( this_n ) ){

    # request Special:Random and take the text of the first <title> element.
    # html_text() returns decoded text, so `&` is not left behind as `&amp;`
    this_page <- html_text( html_node( read_html( "https://en.wikipedia.org/wiki/Special:Random" ) , "title" ) )

    # drop the trailing " - Wikipedia" and swap spaces for underscores before
    # comparing against `y$V2`
    this_article_name <- gsub( " " , "_" , sub( " - Wikipedia$" , "" , this_page ) )

    # look inside `y` for the sampled article's pageviews
    # which() discards NA comparisons, the same way subset() does
    this_pageviews <- y$V3[ which( y$V2 == this_article_name ) ]

    # more than one row for a single article name is unexpected
    if( length( this_pageviews ) > 1 ) stop( "problem" )

    # an article name that is absent from `y` counts as zero pageviews
    sampled_pageviews[ i ] <- if( length( this_pageviews ) == 1 ) this_pageviews else 0

}

# these two numbers get close as this_n gets bigger
mean( sampled_pageviews )
average_pageviews
	library(rvest)

	# scratch paths: `tf` receives the compressed dump, `tf2` the decompressed text
	tf <- tempfile()
	tf2 <- tempfile()

	# download and unzip
	# download.file() gives up after getOption( "timeout" ) seconds (60 by default),
	# which R's documentation describes as often insufficient for large files, so
	# raise the limit for this one transfer and put the previous value back afterwards
	old_options <- options( timeout = max( 3600 , getOption( "timeout" ) ) )
	invisible(
		tryCatch(
			download.file( "https://dumps.wikimedia.org/other/pagecounts-ez/merged/2018/2018-05/pagecounts-2018-05-27.bz2" , tf , mode = 'wb' ) ,
			finally = options( old_options )
		)
	)
	# remove = FALSE keeps the compressed download at `tf` after writing `tf2`
	R.utils::bunzip2( tf , tf2 , remove = FALSE )

	# import and restrict to english
	# the first 25 lines are skipped (presumably the dump's comment header -- confirm);
	# with header = FALSE the columns get fread's default names V1, V2, V3, ...
	x <- data.table::fread( tf2 , skip = 25 , header = FALSE )
	# keep only the rows whose first column is 'en.z'
	y <- subset( x , V1 == 'en.z' )

	# how many average pageviews overall?
	# the sampling loop further down matches article names against V2 and reads
	# the pageview count from V3
	average_pageviews <- mean( y$V3 )


	# sample fifty thousand articles
	this_n <- 50000

	# one slot per draw, allocated up front so the vector is not re-copied on every
	# iteration; a slot stays NA until its draw has completed
	sampled_pageviews <- rep( NA_real_ , this_n )

	# NOTE(review): a failed request makes read_html() error and ends the loop early
	for( i in seq_len( this_n ) ){

		# request Special:Random and take the text of the first <title> element.
		# html_text() returns decoded text, so `&` is not left behind as `&amp;`
		this_page <- html_text( html_node( read_html( "https://en.wikipedia.org/wiki/Special:Random" ) , "title" ) )

		# drop the trailing " - Wikipedia" (whatever the length of the article
		# name) and swap spaces for underscores before comparing against `y$V2`
		this_article_name <- gsub( " " , "_" , sub( " - Wikipedia$" , "" , this_page ) )

		# look inside `y` for the sampled article's pageviews
		# which() discards NA comparisons, the same way subset() does
		this_pageviews <- y$V3[ which( y$V2 == this_article_name ) ]

		# more than one row for a single article name is unexpected
		if( length( this_pageviews ) > 1 ) stop( "problem" )

		# an article name that is absent from `y` counts as zero pageviews
		sampled_pageviews[ i ] <- if( length( this_pageviews ) == 1 ) this_pageviews else 0

	}

	# these two numbers get close as this_n gets bigger
	mean( sampled_pageviews )
	average_pageviews