# twenty-five most common words in CRAN title + description fields, weighted by downloads
# devtools::install_github( "ajdamico/lodown" )
library(tm)
library(tidyverse)
library(rvest)

# Fetch the index page of the CRAN download logs and collect the target of
# every link on it; the per-year loop further down filters these hrefs.
cranlogs_html <- read_html("http://cran-logs.rstudio.com/")
gz_files <-
  cranlogs_html %>%
  html_nodes("a") %>%
  html_attr("href")

# Single scratch file, overwritten by each download in this script.
tf <- tempfile()

# Accumulator for the per-year, per-package download counts.
year_package_counts <- NULL
# Tally package downloads per calendar year from the daily log files.
#
# Per-day and per-year summaries are collected in lists and stacked once with
# bind_rows(); growing a data frame with rbind() inside the loops re-copies all
# accumulated rows on every iteration.
yearly_counts <- list()
for (this_year in 2013:2017) {
  # links whose href starts with this year
  this_year_files <- grep(paste0("^", this_year), gz_files, value = TRUE)
  # drop the "-r.csv.gz" files
  # NOTE(review): presumably logs of R downloads rather than packages -- confirm
  this_year_files <- this_year_files[!grepl("-r\\.csv\\.gz", this_year_files)]

  # nothing listed for this year: skip instead of failing on an empty pipeline
  if (length(this_year_files) == 0) {
    next
  }

  daily_counts <- vector("list", length(this_year_files))
  for (file_index in seq_along(this_year_files)) {
    this_day_file <- this_year_files[file_index]
    lodown::cachaca(
      paste0("http://cran-logs.rstudio.com/", this_day_file),
      tf,
      mode = "wb"
    )
    # tf has no .gz extension, so the compression has to be made explicit
    this_gz_file <- read_csv(gzfile(tf))
    # one row per date/package; date is stored as text so every daily summary
    # has the same column types when stacked, and only its first four
    # characters (the year) are used afterwards
    daily_counts[[file_index]] <-
      this_gz_file %>%
      group_by(date = as.character(date), package) %>%
      summarize(count = n()) %>%
      ungroup()
  }
  day_package_counts <- bind_rows(daily_counts)

  # collapse the daily rows to one row per year/package; ungroup so the
  # stacked result carries no leftover grouping into later steps
  yearly_counts[[length(yearly_counts) + 1]] <-
    day_package_counts %>%
    group_by(year = substr(date, 1, 4), package) %>%
    summarize(count = sum(count)) %>%
    ungroup()
}
year_package_counts <- bind_rows(yearly_counts)
# compute year-package weight:
# reshape to one row per package with one download-count column per year.
# ungroup() first so no grouping left over from the yearly summaries reaches
# spread(); data.frame() then turns the year names into valid column names by
# prefixing them with "X" (the X2013..X2017 columns used further down).
year_package_weights <-
  data.frame(
    year_package_counts %>%
      ungroup() %>%
      spread(year, count)
  )
# overwrite missings with zeroes (a package absent from a year's logs had no
# recorded downloads that year); column 1 is the package name and is left alone
year_package_weights[, -1][is.na(year_package_weights[, -1])] <- 0
# Download the CRAN package metadata table and keep only the three fields
# analysed below.
# lodown is never attached with library(), so cachaca() has to be called with
# its namespace prefix (as the log-download loop already does).
lodown::cachaca(
  "https://cloud.r-project.org/web/packages/packages.rds",
  tf,
  mode = "wb"
)
package_title_description <-
  data.frame(readRDS(tf))[c("Package", "Title", "Description")]
# Build one cleaned, lower-case text string per package from its title and
# description.

# lower-case `package` column, used as the join key against the download weights
package_title_description$package <-
  as.character(package_title_description$Package)

# join title and description with a space: paste0() would glue the last word
# of the title onto the first word of the description
package_title_description$text <-
  paste(
    package_title_description$Title,
    package_title_description$Description
  )

# newlines and tabs become spaces, then everything is lower-cased
package_title_description$text <-
  tolower(gsub("\\n|\\t", " ", package_title_description$text))

# stopwords are removed before punctuation is stripped
package_title_description$text <-
  removeWords(package_title_description$text, stopwords("english"))

package_title_description$text <-
  str_replace_all(package_title_description$text, "[[:punct:]]", " ")

# collapse the runs of spaces left behind by the removals above
package_title_description$text <-
  gsub(" +", " ", package_title_description$text)
# Pair every word occurrence with the package it came from, then attach that
# package's yearly download counts.

# one character vector of non-empty words per package
word_list <- strsplit(package_title_description$text, " ")
word_list <- lapply(word_list, function(words) words[words != ""])

# Repeat each package name once per word it contributed. This yields the same
# word/package rows as cross-joining each package with its own words one
# package at a time, without thousands of merge() and rbind() calls.
# as.character() keeps the `word` column present even when no words remain.
merged_df <-
  data.frame(
    word = as.character(unlist(word_list, use.names = FALSE)),
    package = rep(package_title_description$package, lengths(word_list)),
    stringsAsFactors = FALSE
  )

# inner join on the package name; merge() places the key column first
weighted_df <- merge(merged_df, year_package_weights, by = "package")
# Total the yearly download weights over every occurrence of each word.
# The first column of weighted_df (the merge key, `package`) is dropped so that
# only `word` and the per-year columns reach the aggregation.
word_weighted_df <-
  weighted_df[, -1] %>%
  group_by(word) %>%
  summarize_all(sum) %>%
  data.frame()

# remove some other words
word_weighted_df <-
  word_weighted_df[
    !(word_weighted_df$word %in%
      c(
        'based', 'also', 'can', 'including', 'provides', 'provided',
        '<doi', '1', '10', 'well', 'using', 'use', 'used', 'uses'
      )),
  ]
# For each year, rank the words by that year's download-weighted total and
# keep the first twenty-five; one output column per year.
top_twenty_five <-
  data.frame(
    lapply(
      c(
        y2013 = "X2013",
        y2014 = "X2014",
        y2015 = "X2015",
        y2016 = "X2016",
        y2017 = "X2017"
      ),
      function(year_column) {
        # descending order of the year's weight
        ranking <- order(-word_weighted_df[[year_column]])
        head(word_weighted_df[ranking, "word"], 25)
      }
    )
  )

top_twenty_five
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment