Skip to content

Instantly share code, notes, and snippets.

@ajdamico
Created March 11, 2018 18:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ajdamico/d2b0bf5e6e66ce6e605a1ce14dc995b5 to your computer and use it in GitHub Desktop.
Save ajdamico/d2b0bf5e6e66ce6e605a1ce14dc995b5 to your computer and use it in GitHub Desktop.
twenty-five most common words in CRAN title + description fields, weighted by downloads
# devtools::install_github( "ajdamico/lodown" )
library(tm)
library(tidyverse)
library(rvest)
# scrape the index page of the CRAN download logs and collect every link target
cranlogs_html <- read_html( "http://cran-logs.rstudio.com/" )
gz_files <- html_attr( html_nodes( cranlogs_html , "a" ) , "href" )

# a single temporary file, overwritten by each download (re-used further down the script)
tf <- tempfile()

# one table per year, stacked once after the loop.
# growing a data.frame with rbind() inside the loops re-copies the
# accumulated rows for every daily file, so results are collected in lists instead
year_package_list <- list()

for( this_year in 2013:2017 ){

  # links that start with this year, dropping those that contain "-r.csv.gz"
  # (presumably the logs of R downloads rather than package downloads -- confirm against the index page)
  this_year_files <- grep( paste0( "^" , this_year ) , gz_files , value = TRUE )
  this_year_files <- this_year_files[ !grepl( "-r\\.csv\\.gz" , this_year_files ) ]

  # nothing to tabulate when the index lists no files for this year
  if( length( this_year_files ) == 0 ) next

  # download each daily log and reduce it to one row per date + package
  day_package_list <-
    lapply(
      this_year_files ,
      function( this_day_file ){

        lodown::cachaca( paste0( "http://cran-logs.rstudio.com/" , this_day_file ) , tf , mode = 'wb' )

        read_csv( gzfile( tf ) ) %>%
          group_by( date , package ) %>%
          summarize( count = n() ) %>%
          # drop the leftover grouping so stacking and later verbs see a plain table
          ungroup()

      }
    )

  # collapse the daily counts to one row per year + package
  year_package_list[[ as.character( this_year ) ]] <-
    bind_rows( day_package_list ) %>%
    group_by( year = substr( date , 1 , 4 ) , package ) %>%
    summarize( count = sum( count ) ) %>%
    ungroup()

}

# long table with columns: year , package , count
year_package_counts <- bind_rows( year_package_list )
# reshape the long year / package / count table to one row per package
# with one download-count column per year
year_package_weights <- year_package_counts %>% spread( year , count )

# data.frame() (rather than a tibble) so the year columns receive syntactic names
year_package_weights <- data.frame( year_package_weights )

# a package with no downloads in some year has NA in that year's column after the reshape.
# swap those for zeroes in every column except the first (the package name)
year_package_weights[ , -1 ] <-
  replace(
    year_package_weights[ , -1 ] ,
    is.na( year_package_weights[ , -1 ] ) ,
    0
  )
# download the current CRAN package metadata into the shared temporary file.
# cachaca() is called with its namespace because lodown is never attached with library()
lodown::cachaca( "https://cloud.r-project.org/web/packages/packages.rds" , tf , mode = 'wb' )

# keep only the package name, title, and description fields
package_title_description <- data.frame( readRDS( tf ) )[ c( 'Package' , 'Title' , 'Description' ) ]

# lowercase `package` column, stored as character
package_title_description$package <- as.character( package_title_description$Package )

# join title and description with a space between them, so the final word of the
# title does not fuse with the first word of the description
package_title_description$text <- paste( package_title_description$Title , package_title_description$Description )

# newlines and tabs become spaces, then everything is lowercased
package_title_description$text <- tolower( gsub( "\\n|\\t" , " " , package_title_description$text ) )

# strip english stopwords (tm)
package_title_description$text <- removeWords( package_title_description$text , stopwords( "english" ) )

# punctuation becomes spaces.
# NOTE(review): stringr's regex engine appears to leave symbol characters such as "<"
# in place here (the later filter on '<doi' relies on that) -- confirm before
# swapping this call for base gsub()
package_title_description$text <- str_replace_all( package_title_description$text , "[[:punct:]]", " " )

# collapse runs of spaces into a single space
package_title_description$text <- gsub( " +" , " " , package_title_description$text )

# one single-column data.frame of words per package, without empty strings
word_list <- strsplit( package_title_description$text , " " )
word_list <- lapply( word_list , function( z ) data.frame( word = z[ z != '' ] , stringsAsFactors = FALSE ) )
# pair every package's word table with a one-row table holding that package's name.
# the two tables share no column, so merge() returns their cartesian product:
# each word of the package alongside the package name
merged_list <-
  Map(
    merge ,
    word_list ,
    lapply(
      package_title_description$package ,
      function( this_package ) data.frame( package = this_package , stringsAsFactors = FALSE )
    )
  )

# stack the per-package tables into one long word / package table
merged_df <- do.call( rbind , merged_list )

# attach the yearly download counts; the only column the two tables share is `package`
weighted_df <- merge( merged_df , year_package_weights )

# drop the first column (the merge key) and total the yearly counts for each word
word_weighted_df <-
  weighted_df[ -1 ] %>%
  group_by( word ) %>%
  summarize_all( sum )

word_weighted_df <- data.frame( word_weighted_df )
# discard a hand-picked set of additional words and tokens
# that should not appear in the final ranking
word_weighted_df <-
  word_weighted_df[
    !(
      word_weighted_df$word %in%
        c(
          'based' , 'also' , 'can' , 'including' , 'provides' , 'provided' ,
          '<doi' , '1' , '10' ,
          'well' , 'using' , 'use' , 'used' , 'uses'
        )
    ) ,
    ,
    drop = FALSE
  ]
# for each year, sort the words by that year's download-weighted total (largest first)
# and keep the first twenty-five.  the result has one column of words per year
top_twenty_five <-
  do.call(
    data.frame ,
    lapply(
      c( y2013 = 'X2013' , y2014 = 'X2014' , y2015 = 'X2015' , y2016 = 'X2016' , y2017 = 'X2017' ) ,
      function( weight_column ){
        descending_rows <- order( -word_weighted_df[[ weight_column ]] )
        head( word_weighted_df[ descending_rows , 'word' ] , 25 )
      }
    )
  )

# print the table
top_twenty_five
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment