Created
April 9, 2021 20:29
-
-
Save SPLOpenData/c98052d82e94164e32faac53711232cc to your computer and use it in GitHub Desktop.
#This R script creates wordcloud of CRAN package titles since 2020.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This R script creates a word cloud of CRAN package titles published since 2020.
library(rvest) | |
library(ggplot2) | |
library(dplyr) | |
library(tokenizers) | |
library(SnowballC) | |
library(stopwords) | |
library(ggwordcloud) | |
# Address of the CRAN page that lists the available packages by date.
URL <- "https://cran.r-project.org/web/packages/available_packages_by_date.html"

# Scrape table data from the CRAN website.
getthis <- xml2::read_html(URL)

# html_table() on a whole document returns one element per <table> found.
# Take the first one explicitly instead of passing the list to data.frame(),
# which only works when the page holds exactly one table.
cran_tables <- rvest::html_table(getthis)
if (length(cran_tables) == 0) {
  stop("No table found at ", URL, call. = FALSE)
}
cran_df <- as.data.frame(cran_tables[[1]])

# Filter to packages published since 2020. The comparison is inclusive so
# that packages dated 2020-01-01 are kept, and rows whose date cannot be
# parsed are dropped rather than turned into all-NA rows by `[`.
pkg_dates <- as.Date(cran_df$Date)
published_since_2020 <- !is.na(pkg_dates) & pkg_dates >= as.Date("2020-01-01")
cran_df_since2020 <- cran_df[published_since_2020, ]
# Concatenate all titles into a single string, then strip line breaks,
# parentheses and the ", " separators that toString() inserts.
# Working on a plain character vector avoids relying on gsub() implicitly
# coercing a one-cell data frame to character.
CRAN_text <- toString(cran_df_since2020$Title)
CRAN_text <- gsub("\n", " ", CRAN_text, fixed = TRUE)
CRAN_text <- gsub("\\(|\\)|, ", " ", CRAN_text)

# Tokenize text by word stems, filtering English stopwords.
CRAN_text_token <- tokenize_word_stems(
  CRAN_text,
  stopwords = stopwords::stopwords("en")
)

# Create a frequency table, most frequent stems first.
# table() is called on the bare symbol so the resulting data frame column is
# named `CRAN_text_token`, alongside the count column `Freq`.
CRAN_text_table <- table(CRAN_text_token) %>%
  as.data.frame() %>%
  arrange(desc(Freq))

# Limit frequency table to the top 200 rows. head() returns fewer rows when
# fewer than 200 stems exist, whereas `[1:200, ]` would pad with NA rows.
CRAN_text_top200 <- head(CRAN_text_table, 200)
# Today's date (for labelling the chart).
today_is <- Sys.Date()

# Build the word cloud: one label per stem, sized by its frequency.
p <- ggplot(CRAN_text_top200, aes(label = CRAN_text_token, size = Freq)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 36) +
  theme_minimal() +
  ggtitle("CRAN package vocabulary ") +
  labs(
    subtitle = paste0(
      "Top 200 words in package titles, ",
      "sized by frequency as of ",
      today_is
    )
  )
p <- p +
  theme(
    plot.title = element_text(size = 32, color = "#6699cc"),
    plot.subtitle = element_text(size = 16, color = "#6699cc")
  )

# Save to file.
# print() is called explicitly: a bare `p` is only drawn by top-level
# auto-printing, so running this script through source() would otherwise
# leave an empty PNG.
png(filename = "cran_wordcloud.png", width = 1200, height = 700)
print(p)
dev.off()
Author
SPLOpenData
commented
Apr 9, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment