Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save SPLOpenData/c98052d82e94164e32faac53711232cc to your computer and use it in GitHub Desktop.
Save SPLOpenData/c98052d82e94164e32faac53711232cc to your computer and use it in GitHub Desktop.
#This R script creates wordcloud of CRAN package titles since 2020.
#This R script creates a word cloud of CRAN package titles published since 2020.
library(rvest)
library(ggplot2)
library(dplyr)
library(tokenizers)
library(SnowballC)
library(stopwords)
library(ggwordcloud)
# Address of the CRAN page that holds the package table scraped below.
URL <- "https://cran.r-project.org/web/packages/available_packages_by_date.html"
# Scrape table data from the CRAN website.
getthis <- xml2::read_html(URL)
# html_table() on a whole document returns a list with one entry per <table>;
# pick the first one explicitly instead of handing the whole list to
# data.frame().
cran_df <- data.frame(rvest::html_table(getthis)[[1]])
# Filter to packages published since 2020. `>=` keeps packages dated exactly
# 2020-01-01, and which() drops rows whose date could not be parsed instead of
# carrying them along as all-NA rows.
cran_df_since2020 <- cran_df[
  which(as.Date(cran_df$Date) >= as.Date("2020-01-01")),
]
# Join every package title into one string; toString() separates the titles
# with ", ".
CRAN_text <- toString(cran_df_since2020$Title)
# Turn newlines into spaces, then do the same for parentheses and for the
# ", " separators.
CRAN_text <- gsub("\n", " ", CRAN_text)
CRAN_text <- gsub("\\(|\\)|, ", " ", CRAN_text)
# Tokenize the text into word stems, filtering out English stopwords.
CRAN_text_token <- tokenize_word_stems(
  CRAN_text,
  stopwords = stopwords::stopwords("en")
)
# Create a frequency table of the stems, most frequent first.
# table() is called directly on the CRAN_text_token symbol on purpose: the
# plotting code refers to the resulting label column as CRAN_text_token.
CRAN_text_table <- table(CRAN_text_token) %>%
  data.frame(.) %>%
  arrange(desc(Freq))
# Limit the frequency table to the top 200 rows. head() simply returns fewer
# rows when fewer than 200 stems exist, whereas indexing with 1:200 would pad
# the result with all-NA rows.
CRAN_text_top200 <- head(CRAN_text_table, 200)
# Date stamp shown in the chart subtitle.
today_is <- Sys.Date()
# Build the word cloud: one text label per stem, sized by its frequency.
# theme() comes after theme_minimal() so the title colours and sizes are not
# reset by the complete theme.
p <- ggplot(CRAN_text_top200, aes(label = CRAN_text_token, size = Freq)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 36) +
  labs(
    title = "CRAN package vocabulary ",
    subtitle = paste0(
      "Top 200 words in package titles, sized by frequency as of ",
      today_is
    )
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 32, color = "#6699cc"),
    plot.subtitle = element_text(size = 16, color = "#6699cc")
  )
# Save to file
png(file = "cran_wordcloud.png",
    width = 1200, height = 700)
# Print explicitly: a bare `p` is only auto-printed at top level, so when this
# script is run through source() or wrapped in a function the device would be
# closed with nothing drawn on it.
print(p)
dev.off()
@SPLOpenData
Copy link
Author

cran_wordcloud

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment