Skip to content

Instantly share code, notes, and snippets.

@nruigrok
Created October 5, 2023 13:18
Show Gist options
  • Save nruigrok/52d6dffe6ad33f2924895ae909b49f74 to your computer and use it in GitHub Desktop.
Save nruigrok/52d6dffe6ad33f2924895ae909b49f74 to your computer and use it in GitHub Desktop.
conceptmap_2.R
library(tidytext)
library(tidyverse)
library(tidygraph)
library(ggraph)
library(ggiraph)
library(udpipe)
source("conceptmap.R")
# Dutch stopword list (snowball source); used below to drop function words.
stopwords <- stopwords::stopwords(language = "nl", source = "snowball")
#' Tokenise a text file into a table of non-stopword words
#'
#' Reads the whole file as one string, splits it into word tokens, drops
#' every token found in the global `stopwords` vector, and numbers the
#' remaining tokens consecutively.
#'
#' @param fn Path of the file to read.
#' @return A tibble with columns `word` and `offset`, where `offset` is the
#'   1-based position of the word among the tokens kept after filtering.
get_tokens <- function(fn) {
  raw_text <- tibble(text = read_file(fn))
  all_words <- unnest_tokens(raw_text, word, text)
  kept_words <- filter(all_words, !word %in% stopwords)
  # Offsets are assigned after stopword removal, so they are gap-free.
  mutate(kept_words, offset = seq_along(word))
}
# Retrieve the "fraude" articles for each coverage peak from AmCAT ----

# Install amcat4r only when it is missing, so re-running the script does not
# trigger a fresh GitHub install every time.
if (!requireNamespace("amcat4r", quietly = TRUE)) {
  remotes::install_github("ccs-amsterdam/amcat4r")
}
amcat4r::amcat_login("https://amcat4.labs.vu.nl/amcat")

# Query the "pefd_media" index for documents matching "fraude" within the
# given date filter and prepare them for annotation.
#
# date_filter: named list of date bounds passed as the `date` filter, e.g.
#   list(gte = "1992-11-01", lt = "1993-11-01").
# Returns the documents sorted by date, with the title prepended to the text
# and the `.id` column renamed to `doc_id`.
fetch_fraude_peak <- function(date_filter) {
  amcat4r::query_documents(
    "pefd_media",
    queries = "fraude",
    fields = c("title", "text", "date"),
    max_pages = 0,
    filters = list(date = date_filter)
  ) |>
    arrange(date) |>
    # Separate title and body with a blank line; gluing them together
    # directly would fuse the last title word with the first body word.
    mutate(text = paste(title, text, sep = "\n\n")) |>
    rename(doc_id = .id)
}

piek_1 <- fetch_fraude_peak(list(gte = "1992-11-01", lt = "1993-11-01"))
piek_2 <- fetch_fraude_peak(list(gte = "2001-02-01", lt = "2002-05-01"))
piek_3 <- fetch_fraude_peak(list(gte = "2013-04-01", lt = "2014-02-01"))
# The most recent peak has no upper date bound.
piek_4 <- fetch_fraude_peak(list(gte = "2020-01-01"))
# UDPipe: annotate each peak and cache the tokens on disk ----

# saveRDS() does not create missing directories, so make sure the cache
# folder exists before writing into it.
dir.create("data", showWarnings = FALSE, recursive = TRUE)

# Each peak is annotated with the "dutch" model with the parser switched
# off (parser = "none"), then written to data/ so that the analysis
# sections below can reload the result with readRDS().
tokens_1 <- udpipe(piek_1, "dutch", parser = "none")
saveRDS(tokens_1, "data/tokens_1.rds")

tokens_2 <- udpipe(piek_2, "dutch", parser = "none")
saveRDS(tokens_2, "data/tokens_2.rds")

tokens_3 <- udpipe(piek_3, "dutch", parser = "none")
saveRDS(tokens_3, "data/tokens_3.rds")

tokens_4 <- udpipe(piek_4, "dutch", parser = "none")
saveRDS(tokens_4, "data/tokens_4.rds")
# Peak 1 (1992-11 to 1993-11): concept map of the most frequent nouns ----

# Reload the cached annotation written by the UDPipe step.
tokens_1 <- readRDS("data/tokens_1.rds")

# Keep lower-cased nouns and proper nouns that are not stopwords.
# `offset` is assigned before filtering, so it is the token's position in
# the full annotated corpus.
tokens_1 <- tokens_1 |>
  select(token, lemma, upos) |>
  mutate(offset = seq_along(token)) |>
  filter(upos %in% c("NOUN", "PROPN")) |>
  rename(word = token) |>
  mutate(word = tolower(word)) |>
  filter(!word %in% stopwords)

# Frequency table of the remaining words, reduced to the 5 most frequent
# (ties on the cut-off count are kept).
nwords_1 <- tokens_1 |>
  group_by(word) |>
  summarise(n = n()) |>
  arrange(desc(n)) |>
  slice_max(n, n = 5)
nwords_1

# Restrict the token stream to those top words. This used to reference an
# undefined object `nwords`; the table for this peak is `nwords_1`.
tokens_1 <- tokens_1 |>
  filter(word %in% nwords_1$word)

# kde_matrix() and distances() are not defined in this file; presumably they
# come from the sourced conceptmap.R.
kde_1 <- kde_matrix(tokens_1$word, bandwidth = 1000)

# Density curves for a handful of hand-picked words.
kde_1 |>
  filter(word %in% c("fraude", "mensen", "jaar", "uitkering", "procent")) |>
  ggplot(aes(x = offset, y = p, color = word)) +
  geom_line() +
  theme_classic()

# Pairwise distances, keeping only edges that start at a top word.
dist_1 <- distances(kde_1) |>
  filter(from %in% nwords_1$word)

# One node per distinct `from` word, plus a character id.
nodes <- unique(select(dist_1, word = from)) |>
  mutate(id = as.character(seq_along(word)))

g_1 <- tbl_graph(nodes = nodes, edges = dist_1, directed = FALSE)

# Force-directed layout weighted by the `sim` edge column.
g <- g_1 |>
  ggraph(layout = "fr", weights = sim) +
  geom_edge_link(color = "gray") +
  geom_node_text(aes(label = word), repel = FALSE) +
  theme_graph() +
  ggtitle("Piek 1992-1993")

girafe(
  ggobj = g, width_svg = 25, height_svg = 25,
  options = list(opts_sizing(rescale = FALSE))
)
# Peak 2 (2001-02 to 2002-05): concept map over all retained nouns ----

# Reload the cached annotation written by the UDPipe step.
tokens_2 <- readRDS("data/tokens_2.rds")

# Keep lower-cased nouns and proper nouns that are not stopwords.
# `offset` is assigned before filtering, so it is the token's position in
# the full annotated corpus.
tokens_2 <- tokens_2 |>
  select(token, lemma, upos) |>
  mutate(offset = seq_along(token)) |>
  filter(upos %in% c("NOUN", "PROPN")) |>
  rename(word = token) |>
  mutate(word = tolower(word)) |>
  filter(!word %in% stopwords)

# Frequency table reduced to the 50 most frequent words (ties on the
# cut-off count are kept).
nwords_2 <- tokens_2 |>
  group_by(word) |>
  summarise(n = n()) |>
  arrange(desc(n)) |>
  slice_max(n, n = 50)
nwords_2

# Token stream restricted to the top words.
# NOTE(review): tokens_2b is not used below - kde_matrix() receives the
# unfiltered tokens_2$word. Confirm which vocabulary was intended.
tokens_2b <- tokens_2 |>
  filter(word %in% nwords_2$word)

# kde_matrix() and distances() are not defined in this file; presumably they
# come from the sourced conceptmap.R.
kde_2 <- kde_matrix(tokens_2$word, bandwidth = 500)

# Density curves for a handful of hand-picked words.
kde_2 |>
  filter(word %in% c("fraude", "zaken", "economie", "nederland", "marokko")) |>
  ggplot(aes(x = offset, y = p, color = word)) +
  geom_line() +
  theme_classic()

dist_2 <- distances(kde_2)

# One node per distinct `from` word.
nodes <- unique(select(dist_2, word = from))

g_2 <- tbl_graph(nodes = nodes, edges = dist_2, directed = FALSE)

# Force-directed layout weighted by the `sim` edge column.
g <- g_2 |>
  ggraph(layout = "fr", weights = sim) +
  geom_edge_link(color = "gray") +
  geom_node_text(aes(label = word), repel = FALSE) +
  theme_graph() +
  ggtitle("Piek 2002")

girafe(
  ggobj = g, width_svg = 25, height_svg = 25,
  options = list(opts_sizing(rescale = FALSE))
)
# Peak 3 (2013-04 to 2014-02): words connected to "fraude" ----

# Reload the cached annotation written by the UDPipe step.
tokens_3 <- readRDS("data/tokens_3.rds")

# Keep lower-cased nouns and proper nouns that are not stopwords.
# `offset` is assigned before filtering, so it is the token's position in
# the full annotated corpus.
tokens_3 <- tokens_3 |>
  select(token, lemma, upos) |>
  mutate(offset = seq_along(token)) |>
  filter(upos %in% c("NOUN", "PROPN")) |>
  rename(word = token) |>
  mutate(word = tolower(word)) |>
  filter(!word %in% stopwords)

# Frequency table reduced to the 100 most frequent words (ties on the
# cut-off count are kept).
nwords_3 <- tokens_3 |>
  group_by(word) |>
  summarise(n = n()) |>
  arrange(desc(n)) |>
  slice_max(n, n = 100)
nwords_3

# Token stream restricted to the top words. This used to reference an
# undefined object `nwords`; the table for this peak is `nwords_3`.
# NOTE(review): tokens_3b is not used below - kde_matrix() receives the
# unfiltered tokens_3$word. Confirm which vocabulary was intended.
tokens_3b <- tokens_3 |>
  filter(word %in% nwords_3$word)

# kde_matrix() and distances() are not defined in this file; presumably they
# come from the sourced conceptmap.R. No bandwidth is passed here, so
# kde_matrix() uses whatever default it defines.
kde_3 <- kde_matrix(tokens_3$word)

# Density curves for a handful of hand-picked words.
kde_3 |>
  filter(word %in% c("fraude", "mensen", "jaar", "nederland", "zaken")) |>
  ggplot(aes(x = offset, y = p, color = word)) +
  geom_line() +
  theme_classic()

# Keep only the edges that start at "fraude".
dist_3 <- distances(kde_3) |>
  filter(from == "fraude")

# One node per distinct `to` word.
# NOTE(review): nodes come from `to` only; verify that "fraude" itself
# appears there, otherwise the `from` end of each edge has no node.
nodes <- unique(select(dist_3, word = to))

g_3 <- tbl_graph(nodes = nodes, edges = dist_3, directed = FALSE)

# Force-directed layout weighted by the `sim` edge column.
g <- g_3 |>
  ggraph(layout = "fr", weights = sim) +
  geom_edge_link(color = "gray") +
  geom_node_text(aes(label = word), repel = FALSE) +
  theme_graph() +
  ggtitle("Piek Bulgarenfraude")

girafe(
  ggobj = g, width_svg = 25, height_svg = 25,
  options = list(opts_sizing(rescale = FALSE))
)
# Peak 4 (from 2020-01-01): words connected to "fraude" ----

# Reload the cached annotation written by the UDPipe step.
tokens_4 <- readRDS("data/tokens_4.rds")

# Keep lower-cased nouns and proper nouns that are not stopwords.
# `offset` is assigned before filtering, so it is the token's position in
# the full annotated corpus.
tokens_4 <- tokens_4 |>
  select(token, lemma, upos) |>
  mutate(offset = seq_along(token)) |>
  filter(upos %in% c("NOUN", "PROPN")) |>
  rename(word = token) |>
  mutate(word = tolower(word)) |>
  filter(!word %in% stopwords)

# Frequency table reduced to the 100 most frequent words (ties on the
# cut-off count are kept).
nwords_4 <- tokens_4 |>
  group_by(word) |>
  summarise(n = n()) |>
  arrange(desc(n)) |>
  slice_max(n, n = 100)
nwords_4

# Token stream restricted to the top words. This used to reference an
# undefined object `nwords`; the table for this peak is `nwords_4`.
# NOTE(review): tokens_4b is not used below - kde_matrix() receives the
# unfiltered tokens_4$word. Confirm which vocabulary was intended.
tokens_4b <- tokens_4 |>
  filter(word %in% nwords_4$word)

# kde_matrix() and distances() are not defined in this file; presumably they
# come from the sourced conceptmap.R. No bandwidth is passed here, so
# kde_matrix() uses whatever default it defines.
kde_4 <- kde_matrix(tokens_4$word)

# Density curves for a handful of hand-picked words.
kde_4 |>
  filter(word %in% c("fraude", "mensen", "jaar", "nederland", "zaken")) |>
  ggplot(aes(x = offset, y = p, color = word)) +
  geom_line() +
  theme_classic()

# Keep only the edges that start at "fraude".
dist_4 <- distances(kde_4) |>
  filter(from == "fraude")

# One node per distinct `to` word.
# NOTE(review): nodes come from `to` only; verify that "fraude" itself
# appears there, otherwise the `from` end of each edge has no node.
nodes <- unique(select(dist_4, word = to))

g_4 <- tbl_graph(nodes = nodes, edges = dist_4, directed = FALSE)

# Force-directed layout weighted by the `sim` edge column.
g <- g_4 |>
  ggraph(layout = "fr", weights = sim) +
  geom_edge_link(color = "gray") +
  geom_node_text(aes(label = word), repel = FALSE) +
  theme_graph() +
  ggtitle("Toeslagenaffaire")

girafe(
  ggobj = g, width_svg = 25, height_svg = 25,
  options = list(opts_sizing(rescale = FALSE))
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment