Skip to content

Instantly share code, notes, and snippets.

@ofchurches
Last active April 28, 2020 00:40
Show Gist options
  • Save ofchurches/da902a393ce8e529b33ca0a137cd5ff3 to your computer and use it in GitHub Desktop.
Save ofchurches/da902a393ce8e529b33ca0a137cd5ff3 to your computer and use it in GitHub Desktop.
tidy_tuesday_20200421
library(tidyverse)
library(tidytext)
library(wordcloud)
library(reshape2)
# Plot 1: Sentiment wordcloud.
# Download the GDPR text; the column that is tokenised below is `gdpr_text`.
gdpr_text <- readr::read_tsv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv"
)

# Based on https://www.tidytextmining.com/sentiment.html
gdpr_text %>%
  # Split the text into one row per word.
  unnest_tokens(word, gdpr_text) %>%
  # Keep only the words present in the Bing sentiment lexicon. The join key is
  # named explicitly so the join does not depend on (or print a message about)
  # whichever column names the two tables happen to share.
  inner_join(get_sentiments("bing"), by = "word") %>%
  # Number of occurrences of each word within each sentiment.
  count(word, sentiment, sort = TRUE) %>%
  # Reshape to a word x sentiment matrix of counts (0 where a word does not
  # occur with that sentiment), which is the input comparison.cloud() expects.
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  # One colour per sentiment column, in the column order produced by acast().
  comparison.cloud(
    colors = c("#F8766D", "#7CAE00"),
    max.words = 100,
    match.colors = TRUE
  )
# Plot 2: Network
library(tidygraph)
library(ggraph)
# Download the GDPR violations data (one row per fine, identified by `id`).
gdpr_violations <- readr::read_tsv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv"
)

# Weighted edge list of articles cited together in the same violation:
# one row per unordered article pair, `weight` = number of violations citing
# both articles.
edge_list <- gdpr_violations %>%
  # this part is from https://juliasilge.com/blog/gdpr-violations/
  transmute(
    id,
    articles = str_extract_all(article_violated, "Art.[:digit:]+|Art. [:digit:]+")
  ) %>%
  unnest(articles) %>%
  # The pattern accepts several spellings of the same article ("Art.5",
  # "Art. 5", "Art 5"). Rebuild every label from its number so they collapse
  # into a single node instead of appearing as different articles.
  mutate(articles = str_c("Art. ", str_extract(articles, "[:digit:]+"))) %>%
  # An article can be cited more than once per violation (e.g. different
  # paragraphs); keep it once so combn() cannot pair an article with itself.
  distinct(id, articles) %>%
  # Sort within each violation so every pair comes out in one canonical order;
  # otherwise (A, B) and (B, A) would be counted as two separate edges.
  arrange(id, articles) %>%
  # here the steps to getting an edge list are from https://stackoverflow.com/questions/34670145/generating-an-edge-list-from-id-and-grouping-vectors
  group_by(id) %>%
  # A pair needs at least two distinct articles.
  filter(n() >= 2) %>%
  # All 2-combinations of the articles of one violation, one pair per row
  # (data.frame() names the two columns X1 and X2).
  do(data.frame(t(combn(.$articles, 2)), stringsAsFactors = FALSE)) %>%
  ungroup() %>%
  select(-id) %>%
  rename(from = X1, to = X2) %>%
  # here the steps to getting the edge weight are from: https://www.jessesadler.com/post/network-analysis-with-r/
  count(from, to, name = "weight")
# Create graph using tidygraph
# Co-occurrence has no direction, so the graph is built undirected from the
# start. (`to_undirected` is a morpher intended for morph()/convert(); piping a
# graph straight into it does not give back a plain tbl_graph.)
graph <- as_tbl_graph(edge_list, directed = FALSE) %>%
  activate(nodes) %>%
  # Node-level authority score.
  mutate(centrality = centrality_authority()) %>%
  # Community membership from edge-betweenness clustering, as a factor so it
  # can be mapped to a discrete colour scale.
  mutate(group = as.factor(group_edge_betweenness())) %>%
  group_by(group) %>%
  # Within each community, the name of the node with the highest centrality
  # (last value of `name` when ordered by `centrality`).
  mutate(name_first = last(name, order_by = centrality)) %>%
  ungroup()
# plot network using ggraph
# Nodes are placed on a circle, edges are drawn as arcs whose opacity and
# width both encode the co-occurrence weight, and node labels are coloured by
# community.
network_plot <- graph %>%
  ggraph(layout = "linear", circular = TRUE) +
  geom_edge_arc(aes(alpha = weight, width = weight), show.legend = FALSE) +
  geom_node_label(aes(label = name, colour = group)) +
  theme_graph() +
  # "none" is the supported way to drop a legend; passing FALSE is deprecated.
  guides(colour = "none", size = "none") +
  labs(
    title = str_wrap(
      "Network of GDPR articles that co-occurred in the same violations",
      width = 40
    )
  )

# Show the plot on the active device, as the unassigned expression did.
print(network_plot)

# Save exactly this plot rather than whatever last_plot() happens to hold.
ggsave(
  "gdpr_network.png",
  plot = network_plot,
  scale = 2,
  width = 90,
  height = 90,
  units = "mm",
  dpi = 300
)
# Plot 3: Upset plot
library(UpSetR)
# Wide 0/1 membership table for UpSetR: one row per violation (`id`), one
# column per article, 1 when the violation cites that article and 0 otherwise.
cooccurrence_df <- gdpr_violations %>%
  # this part is from https://juliasilge.com/blog/gdpr-violations/
  transmute(
    id,
    articles = str_extract_all(article_violated, "Art.[:digit:]+|Art. [:digit:]+")
  ) %>%
  unnest(articles) %>%
  # The pattern accepts several spellings of the same article ("Art.5",
  # "Art. 5", "Art 5"). Rebuild every label from its number so each article
  # becomes a single set column.
  mutate(articles = str_c("Art. ", str_extract(articles, "[:digit:]+"))) %>%
  # One row per (violation, article): pivot_wider() needs unique combinations
  # to produce a single 0/1 value per cell.
  distinct(id, articles) %>%
  mutate(value = 1) %>%
  pivot_wider(
    names_from = articles,
    values_from = value,
    values_fill = list(value = 0)
  ) %>%
  # upset() is given a plain data.frame rather than a tibble.
  as.data.frame()

# Intersections are ordered by how often they occur.
upset(cooccurrence_df, order.by = "freq")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment