## slarge/fedregs_demo.R

## fedregs_demo.R
install.packages("fedregs")
library(fedregs)
library(dplyr)
library(ggplot2)
library(quanteda)
library(magick)

## For more info, check out the fedregs github repository:
## https://github.com/slarge/fedregs


## Fetch the northeastern US fisheries regulations: the 2017 edition of
## Title 50, Chapter 6, Part 648 of the Code of Federal Regulations.
regs <- fedregs::cfr_text(
  year = 2017,
  title_number = 50,
  chapter = 6,
  part = 648,
  # token = "ngrams", # uncomment for ngrams of length 2
  # n = 2,            # uncomment for ngrams of length 2
  return_tidytext = TRUE,
  verbose = FALSE
)

## "Stop words" are common words that don't hold a lot of significance.
## Built with tibble() (re-exported by dplyr) because data_frame() is
## deprecated; the result is the same one-column table used by anti_join().
stop_words <- tibble(word = quanteda::stopwords("english"))

## Clean up words to try and find the most meaningful ones.
## Steps, in order: unnest the regulation text, strip punctuation and leading
## digits, drop stop words, drop numeric / Roman-numeral / one-letter /
## web-address tokens, and finally reduce every remaining word to its stem.
clean_words <- regs %>%
  ## NOTE(review): recent tidyr releases ask for an explicit `cols =` argument;
  ## left as-is because the list-column name is not visible in this script.
  tidyr::unnest() %>%
  mutate(word = gsub("[[:punct:]]", "", word), # remove any remaining punctuation
         word = gsub("^[[:digit:]]*", "", word)) %>%  # remove leading digits (e.g., 1st, 1881a, 15th, etc)
  anti_join(stop_words, by = "word") %>%  # remove "stop words"
  ## as.numeric() yields NA (with a coercion warning) for anything that is not
  ## a number, so is.na() keeps only the non-numeric words.
  filter(is.na(as.numeric(word)),
         ## adios Roman Numerals; every group in the pattern is optional, so it
         ## also matches "" and drops words emptied by the gsub() calls above
         !grepl("^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$",
                word),
         !grepl("\\b[a-z]{1}\\b", word), # get rid of one letter words
         !grepl("\\bwww*.", word)) %>% # get rid of web addresses
  ## Stem the character column directly. This avoids placing a quanteda
  ## `tokens` object inside a data-frame column and avoids depending on
  ## tokens() returning exactly one token per row. Expects one word per row.
  mutate(word = quanteda::char_wordstem(word))

## Tally how often each stemmed word occurs and keep the 50 most frequent.
## top_n() keeps ties at the cut-off, so more than 50 rows may be returned.
## `word` is turned into a factor ordered by its count so the plot is sorted.
count_words <- clean_words %>%
  count(word, name = "n") %>%
  arrange(desc(n)) %>%
  top_n(n = 50, wt = n) %>%
  mutate(word = reorder(word, n))

## Plot: one thick horizontal segment per word, running from 0 to its count,
## with words ordered by frequency on the y axis.
## NOTE(review): ggplot2 >= 3.4 prefers `linewidth` over `size` for segment
## thickness; `size` is kept so older ggplot2 versions render the same way.
gg <- ggplot(count_words, aes(n, word)) +
  geom_segment(aes(xend = 0, yend = word), size = 3.5) +
  labs(y = NULL,
       x = "count",
       title = "Code of Federal Regulations",
       subtitle = "Title 50, Chapter VI, Part 648",
       caption = sprintf("Data accessed on %s from:\n https://www.govinfo.gov/bulkdata/CFR/",
                         format(Sys.Date(), "%d %B %Y"))) +
  ## theme_minimal() is a complete theme: adding it replaces every theme
  ## element set so far. It therefore has to come BEFORE the theme() tweaks,
  ## otherwise the angle, legend and text-size settings below are discarded.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.direction = "horizontal",
        legend.position = "bottom",
        text = element_text(size = 6))

## Save: open a magick graphics device, draw the plot on it, close the
## device, and write the captured image to "final.png".
fig <- magick::image_graph(width = 1016, height = 573, res = 72)
print(gg)
dev.off()

magick::image_write(fig, path = "final.png")
	install.packages("fedregs")
	library(fedregs)
	library(dplyr)
	library(ggplot2)
	library(quanteda)
	library(magick)

	## For more info, check out the fedregs github repository:
	## https://github.com/slarge/fedregs


	## Fetch the northeastern US fisheries regulations: the 2017 edition of
	## Title 50, Chapter 6, Part 648 of the Code of Federal Regulations.
	regs <- fedregs::cfr_text(
	  year = 2017,
	  title_number = 50,
	  chapter = 6,
	  part = 648,
	  # token = "ngrams", # uncomment for ngrams of length 2
	  # n = 2,            # uncomment for ngrams of length 2
	  return_tidytext = TRUE,
	  verbose = FALSE
	)

	## "Stop words" are common words that don't hold a lot of significance.
	## Built with tibble() (re-exported by dplyr) because data_frame() is
	## deprecated; the result is the same one-column table used by anti_join().
	stop_words <- tibble(word = quanteda::stopwords("english"))

	## Clean up words to try and find the most meaningful ones.
	## Steps, in order: unnest the regulation text, strip punctuation and leading
	## digits, drop stop words, drop numeric / Roman-numeral / one-letter /
	## web-address tokens, and finally reduce every remaining word to its stem.
	clean_words <- regs %>%
	  ## NOTE(review): recent tidyr releases ask for an explicit `cols =` argument;
	  ## left as-is because the list-column name is not visible in this script.
	  tidyr::unnest() %>%
	  mutate(word = gsub("[[:punct:]]", "", word), # remove any remaining punctuation
	         word = gsub("^[[:digit:]]*", "", word)) %>% # remove leading digits (e.g., 1st, 1881a, 15th, etc)
	  anti_join(stop_words, by = "word") %>% # remove "stop words"
	  ## as.numeric() yields NA (with a coercion warning) for anything that is not
	  ## a number, so is.na() keeps only the non-numeric words.
	  filter(is.na(as.numeric(word)),
	         ## adios Roman Numerals. The alternation uses a plain `|`: a backslash
	         ## before it is not a valid escape in an R string and fails to parse.
	         ## Every group is optional, so the pattern also matches "" and drops
	         ## words emptied by the gsub() calls above.
	         !grepl("^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$",
	                word),
	         !grepl("\\b[a-z]{1}\\b", word), # get rid of one letter words
	         !grepl("\\bwww*.", word)) %>% # get rid of web addresses
	  ## Stem the character column directly. This avoids placing a quanteda
	  ## `tokens` object inside a data-frame column and avoids depending on
	  ## tokens() returning exactly one token per row. Expects one word per row.
	  mutate(word = quanteda::char_wordstem(word))

	## Tally how often each stemmed word occurs and keep the 50 most frequent.
	## top_n() keeps ties at the cut-off, so more than 50 rows may be returned.
	## `word` is turned into a factor ordered by its count so the plot is sorted.
	count_words <- clean_words %>%
	  count(word, name = "n") %>%
	  arrange(desc(n)) %>%
	  top_n(n = 50, wt = n) %>%
	  mutate(word = reorder(word, n))

	## Plot: one thick horizontal segment per word, running from 0 to its count,
	## with words ordered by frequency on the y axis.
	## NOTE(review): ggplot2 >= 3.4 prefers `linewidth` over `size` for segment
	## thickness; `size` is kept so older ggplot2 versions render the same way.
	gg <- ggplot(count_words, aes(n, word)) +
	  geom_segment(aes(xend = 0, yend = word), size = 3.5) +
	  labs(y = NULL,
	       x = "count",
	       title = "Code of Federal Regulations",
	       subtitle = "Title 50, Chapter VI, Part 648",
	       caption = sprintf("Data accessed on %s from:\n https://www.govinfo.gov/bulkdata/CFR/",
	                         format(Sys.Date(), "%d %B %Y"))) +
	  ## theme_minimal() is a complete theme: adding it replaces every theme
	  ## element set so far. It therefore has to come BEFORE the theme() tweaks,
	  ## otherwise the angle, legend and text-size settings below are discarded.
	  theme_minimal() +
	  theme(axis.text.x = element_text(angle = 45, hjust = 1),
	        legend.direction = "horizontal",
	        legend.position = "bottom",
	        text = element_text(size = 6))

	## Save: open a magick graphics device, draw the plot on it, close the
	## device, and write the captured image to "final.png".
	fig <- magick::image_graph(width = 1016, height = 573, res = 72)
	print(gg)
	dev.off()

	magick::image_write(fig, path = "final.png")