# ajdamico/top 25 words weighted by download.R

## top 25 words weighted by download.R

# devtools::install_github( "ajdamico/lodown" )

library(tm)
library(tidyverse)
library(rvest)

# read the cran-logs index page and collect the href of every link on it
cranlogs_html <- read_html( "http://cran-logs.rstudio.com/" )

gz_files <- html_attr( html_nodes( cranlogs_html , "a" ) , "href" )

# a single temporary file, overwritten by every download
tf <- tempfile()

# per-year results are gathered in a list and stacked once after the loop,
# rather than re-copying a growing data frame with rbind() on every pass
year_package_counts <- NULL
yearly_counts <- list()

for( this_year in 2013:2017 ){

	# links that start with this year, minus anything matching "-r.csv.gz"
	# ( presumably the logs of R itself rather than of packages -- confirm )
	this_year_files <- grep( paste0( "^" , this_year ) , gz_files , value = TRUE )
	this_year_files <- this_year_files[ !grepl( "-r\\.csv\\.gz" , this_year_files ) ]

	# nothing to summarize for this year: skip it instead of failing below
	if( length( this_year_files ) == 0 ){
		warning( "no daily log files found for " , this_year , call. = FALSE )
		next
	}

	daily_counts <- vector( "list" , length( this_year_files ) )

	for( i in seq_along( this_year_files ) ){

		this_day_file <- this_year_files[ i ]

		# fetch the day's log into the temporary file
		lodown::cachaca( paste0( "http://cran-logs.rstudio.com/" , this_day_file ) , tf , mode = 'wb' )

		this_gz_file <- read_csv( gzfile( tf ) )

		# one row per ( date , package ) holding the number of log records
		daily_counts[[ i ]] <-
			this_gz_file %>%
			group_by( date , package ) %>%
			summarize( count = n() ) %>%
			ungroup()

	}

	day_package_counts <- bind_rows( daily_counts )

	# collapse the daily counts into one row per ( year , package )
	yearly_counts[[ length( yearly_counts ) + 1 ]] <-
		day_package_counts %>%
		group_by( year = substr( date , 1 , 4 ) , package ) %>%
		summarize( count = sum( count ) ) %>%
		ungroup()

}

year_package_counts <- bind_rows( yearly_counts )


# reshape the long ( year , package , count ) table so that every year
# becomes its own column of counts, then convert to a plain data.frame
year_package_weights <-
	year_package_counts %>%
	spread( year , count ) %>%
	data.frame()

# a package without a count in some year comes out as NA; store a zero there
# ( every column except the first is treated as a count column )
year_package_weights[ , -1 ] <-
	replace(
		year_package_weights[ , -1 ] ,
		is.na( year_package_weights[ , -1 ] ) ,
		0
	)


# download the CRAN package metadata file; the function is called through its
# namespace because lodown is never attached with library()
lodown::cachaca( "https://cloud.r-project.org/web/packages/packages.rds" , tf , mode = 'wb' )
package_title_description <- data.frame( readRDS( tf ) )[ c( 'Package' , 'Title' , 'Description' ) ]

# lower-case `package` column as a character vector, used as the merge key later
package_title_description$package <- as.character( package_title_description$Package )

# join title and description with a space so the last word of the title
# does not fuse with the first word of the description
package_title_description$text <- paste( package_title_description$Title , package_title_description$Description )

# normalize the text before it is split into words:
# newlines and tabs become spaces and everything is lower-cased ,
package_title_description$text <- tolower( gsub( "\\n|\\t" , " " , package_title_description$text ) )
# english stopwords are removed ,
package_title_description$text <- removeWords( package_title_description$text , stopwords( "english" ) )
# punctuation is replaced by spaces ,
package_title_description$text <- str_replace_all( package_title_description$text , "[[:punct:]]", " " )
# and double spaces are collapsed into single ones
package_title_description$text <- gsub( "  " , " " , package_title_description$text )

# split every package's text on single spaces and keep the non-empty pieces ,
# stored as a one-column data.frame of words per package
word_list <-
	lapply(
		strsplit( package_title_description$text , " " ) ,
		function( tokens ) {
			data.frame( word = tokens[ tokens != "" ] , stringsAsFactors = FALSE )
		}
	)

# pair each package's words with that package's name: the two data.frames
# share no column , so merge() returns one row per word with the name attached
merged_list <-
	Map(
		function( word_df , package_name ) {
			merge( word_df , data.frame( package = package_name , stringsAsFactors = FALSE ) )
		} ,
		word_list ,
		package_title_description$package
	)

# stack the per-package pieces into a single ( word , package ) table
merged_df <- do.call( rbind , merged_list )


# attach the per-year counts to every ( word , package ) row ;
# merge() joins on the column the two tables share
weighted_df <- merge( merged_df , year_package_weights )

# sum the yearly counts across all rows of the same word ; the package
# column is dropped by name so the result does not depend on column order
word_weighted_df <-
	data.frame(
		weighted_df[ , names( weighted_df ) != 'package' ] %>%
			group_by( word ) %>%
			summarize_all( sum )
	)

# remove some other words
# punctuation was replaced by spaces before the text was split , so '<doi'
# itself can never occur as a word ; 'doi' is what actually has to be excluded
word_weighted_df <-
	subset(
		word_weighted_df ,
		!( word %in% c( 'based' , 'also' , 'can' , 'including' , 'provides' , 'provided' , '<doi' , 'doi' , '1' , '10' , 'well' , 'using' , 'use' , 'used' , 'uses' ) )
	)

# for every year , rank the words by that year's summed count ( largest first )
# and keep the first 25 ; each year becomes one column of the result
top_twenty_five <-
	do.call(
		data.frame ,
		lapply(
			c( y2013 = 'X2013' , y2014 = 'X2014' , y2015 = 'X2015' , y2016 = 'X2016' , y2017 = 'X2017' ) ,
			function( year_column ) {
				ranking <- order( -word_weighted_df[[ year_column ]] )
				head( word_weighted_df[ ranking , 'word' ] , 25 )
			}
		)
	)

# print the table
top_twenty_five

	# devtools::install_github( "ajdamico/lodown" )

	library(tm)
	library(tidyverse)
	library(rvest)

	# read the cran-logs index page and collect the href of every link on it
	cranlogs_html <- read_html( "http://cran-logs.rstudio.com/" )

	gz_files <- html_attr( html_nodes( cranlogs_html , "a" ) , "href" )

	# a single temporary file, overwritten by every download
	tf <- tempfile()

	# per-year results are gathered in a list and stacked once after the loop,
	# rather than re-copying a growing data frame with rbind() on every pass
	year_package_counts <- NULL
	yearly_counts <- list()

	for( this_year in 2013:2017 ){

		# links that start with this year, minus anything matching "-r.csv.gz"
		# ( presumably the logs of R itself rather than of packages -- confirm )
		this_year_files <- grep( paste0( "^" , this_year ) , gz_files , value = TRUE )
		this_year_files <- this_year_files[ !grepl( "-r\\.csv\\.gz" , this_year_files ) ]

		# nothing to summarize for this year: skip it instead of failing below
		if( length( this_year_files ) == 0 ){
			warning( "no daily log files found for " , this_year , call. = FALSE )
			next
		}

		daily_counts <- vector( "list" , length( this_year_files ) )

		for( i in seq_along( this_year_files ) ){

			this_day_file <- this_year_files[ i ]

			# fetch the day's log into the temporary file
			lodown::cachaca( paste0( "http://cran-logs.rstudio.com/" , this_day_file ) , tf , mode = 'wb' )

			this_gz_file <- read_csv( gzfile( tf ) )

			# one row per ( date , package ) holding the number of log records
			daily_counts[[ i ]] <-
				this_gz_file %>%
				group_by( date , package ) %>%
				summarize( count = n() ) %>%
				ungroup()

		}

		day_package_counts <- bind_rows( daily_counts )

		# collapse the daily counts into one row per ( year , package )
		yearly_counts[[ length( yearly_counts ) + 1 ]] <-
			day_package_counts %>%
			group_by( year = substr( date , 1 , 4 ) , package ) %>%
			summarize( count = sum( count ) ) %>%
			ungroup()

	}

	year_package_counts <- bind_rows( yearly_counts )


	# reshape the long ( year , package , count ) table so that every year
	# becomes its own column of counts, then convert to a plain data.frame
	year_package_weights <-
		year_package_counts %>%
		spread( year , count ) %>%
		data.frame()

	# a package without a count in some year comes out as NA; store a zero there
	# ( every column except the first is treated as a count column )
	year_package_weights[ , -1 ] <-
		replace(
			year_package_weights[ , -1 ] ,
			is.na( year_package_weights[ , -1 ] ) ,
			0
		)



	# download the CRAN package metadata file; the function is called through its
	# namespace because lodown is never attached with library()
	lodown::cachaca( "https://cloud.r-project.org/web/packages/packages.rds" , tf , mode = 'wb' )
	package_title_description <- data.frame( readRDS( tf ) )[ c( 'Package' , 'Title' , 'Description' ) ]

	# lower-case `package` column as a character vector, used as the merge key later
	package_title_description$package <- as.character( package_title_description$Package )

	# join title and description with a space so the last word of the title
	# does not fuse with the first word of the description
	package_title_description$text <- paste( package_title_description$Title , package_title_description$Description )

	# normalize the text before it is split into words:
	# newlines and tabs become spaces and everything is lower-cased
	# ( the alternation bar must not be backslash-escaped inside an R string ) ,
	package_title_description$text <- tolower( gsub( "\\n|\\t" , " " , package_title_description$text ) )
	# english stopwords are removed ,
	package_title_description$text <- removeWords( package_title_description$text , stopwords( "english" ) )
	# punctuation is replaced by spaces ,
	package_title_description$text <- str_replace_all( package_title_description$text , "[[:punct:]]", " " )
	# and double spaces are collapsed into single ones
	package_title_description$text <- gsub( "  " , " " , package_title_description$text )

	# split every package's text on single spaces and keep the non-empty pieces ,
	# stored as a one-column data.frame of words per package
	word_list <-
		lapply(
			strsplit( package_title_description$text , " " ) ,
			function( tokens ) {
				data.frame( word = tokens[ tokens != "" ] , stringsAsFactors = FALSE )
			}
		)

	# pair each package's words with that package's name: the two data.frames
	# share no column , so merge() returns one row per word with the name attached
	merged_list <-
		Map(
			function( word_df , package_name ) {
				merge( word_df , data.frame( package = package_name , stringsAsFactors = FALSE ) )
			} ,
			word_list ,
			package_title_description$package
		)

	# stack the per-package pieces into a single ( word , package ) table
	merged_df <- do.call( rbind , merged_list )


	# attach the per-year counts to every ( word , package ) row ;
	# merge() joins on the column the two tables share
	weighted_df <- merge( merged_df , year_package_weights )

	# sum the yearly counts across all rows of the same word ; the package
	# column is dropped by name so the result does not depend on column order
	word_weighted_df <-
		data.frame(
			weighted_df[ , names( weighted_df ) != 'package' ] %>%
				group_by( word ) %>%
				summarize_all( sum )
		)

	# remove some other words
	# punctuation was replaced by spaces before the text was split , so '<doi'
	# itself can never occur as a word ; 'doi' is what actually has to be excluded
	word_weighted_df <-
		subset(
			word_weighted_df ,
			!( word %in% c( 'based' , 'also' , 'can' , 'including' , 'provides' , 'provided' , '<doi' , 'doi' , '1' , '10' , 'well' , 'using' , 'use' , 'used' , 'uses' ) )
		)

	# for every year , rank the words by that year's summed count ( largest first )
	# and keep the first 25 ; each year becomes one column of the result
	top_twenty_five <-
		do.call(
			data.frame ,
			lapply(
				c( y2013 = 'X2013' , y2014 = 'X2014' , y2015 = 'X2015' , y2016 = 'X2016' , y2017 = 'X2017' ) ,
				function( year_column ) {
					ranking <- order( -word_weighted_df[[ year_column ]] )
					head( word_weighted_df[ ranking , 'word' ] , 25 )
				}
			)
		)

	# print the table
	top_twenty_five