Created
October 5, 2023 13:18
-
-
Save nruigrok/52d6dffe6ad33f2924895ae909b49f74 to your computer and use it in GitHub Desktop.
conceptmap_2.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidytext) | |
library(tidyverse) | |
library(tidygraph) | |
library(ggraph) | |
library(ggiraph) | |
library(udpipe) | |
source("conceptmap.R") | |
# Dutch stopword list (snowball source); read by get_tokens() and by the
# per-peak token filters further down.
stopwords <- stopwords::stopwords("nl", source = "snowball")

#' Read a text file and split it into non-stopword word tokens.
#'
#' @param fn Path of the text file to read.
#' @return A tibble with one row per remaining token: `word` holds the token
#'   and `offset` its 1-based position counted after stopword removal.
get_tokens <- function(fn) {
  raw_text <- tibble(text = read_file(fn))
  all_words <- unnest_tokens(raw_text, word, text)
  kept_words <- filter(all_words, !(word %in% stopwords))
  mutate(kept_words, offset = seq_along(word))
}
# Install amcat4r only when it is missing, so re-running the script does not
# reinstall the package from GitHub every time.
if (!requireNamespace("amcat4r", quietly = TRUE)) {
  remotes::install_github("ccs-amsterdam/amcat4r")
}
amcat4r::amcat_login("https://amcat4.labs.vu.nl/amcat")

#' Fetch the "fraude" articles of one coverage peak from the pefd_media index.
#'
#' @param from Lower date bound ("YYYY-MM-DD"), sent as the `gte` filter.
#' @param to Upper date bound, sent as the `lt` filter; `NULL` (default)
#'   sends no upper bound.
#' @return The query result ordered by `date`, with the title prepended to
#'   `text` and the `.id` column renamed to `doc_id`.
fetch_fraud_peak <- function(from, to = NULL) {
  date_range <- list(gte = from)
  if (!is.null(to)) {
    date_range$lt <- to
  }
  amcat4r::query_documents(
    "pefd_media",
    queries = "fraude",
    fields = c("title", "text", "date"),
    max_pages = 0,
    filters = list(date = date_range)
  ) |>
    arrange(date) |>
    # Separate title and body explicitly: gluing them together with paste0()
    # fuses the last title word and the first body word into a single token.
    mutate(text = paste(title, text, sep = "\n\n")) |>
    rename(doc_id = .id)
}

# The four coverage peaks that are analysed below.
piek_1 <- fetch_fraud_peak("1992-11-01", "1993-11-01")
piek_2 <- fetch_fraud_peak("2001-02-01", "2002-05-01")
piek_3 <- fetch_fraud_peak("2013-04-01", "2014-02-01")
piek_4 <- fetch_fraud_peak("2020-01-01")
### UDPIPE: make tokens ----

# Make sure the cache directory exists up front; otherwise saveRDS() fails
# only after the (slow) annotation has already been computed.
dir.create("data", showWarnings = FALSE, recursive = TRUE)

#' Annotate documents with the Dutch udpipe model and store the result.
#'
#' @param docs Data frame of documents handed to `udpipe()`.
#' @param path File the annotation is written to with `saveRDS()`.
#' @return The annotation returned by `udpipe()`.
annotate_and_save <- function(docs, path) {
  annotated <- udpipe(docs, "dutch", parser = "none")
  saveRDS(annotated, path)
  annotated
}

tokens_1 <- annotate_and_save(piek_1, "data/tokens_1.rds")
tokens_2 <- annotate_and_save(piek_2, "data/tokens_2.rds")
tokens_3 <- annotate_and_save(piek_3, "data/tokens_3.rds")
tokens_4 <- annotate_and_save(piek_4, "data/tokens_4.rds")
##### Peak 1: 1992-1993 ----

# Load the cached annotation and keep lower-cased nouns / proper nouns that
# are not stopwords. `offset` is the token position before any filtering.
tokens_1 <- readRDS("data/tokens_1.rds") |>
  select(token, lemma, upos) |>
  mutate(offset = seq_along(token)) |>
  filter(upos %in% c("NOUN", "PROPN")) |>
  rename(word = token) |>
  mutate(word = tolower(word)) |>
  filter(!word %in% stopwords)

# Most frequent words of this peak (ties on the cut-off count are kept).
nwords_1 <- tokens_1 |>
  group_by(word) |>
  summarise(n = n()) |>
  arrange(desc(n)) |>
  slice_max(n, n = 5)
nwords_1

# Restrict the token stream to this peak's own top words. The frequency table
# of this section is `nwords_1`; a bare `nwords` is not defined in this script.
tokens_1 <- tokens_1 |>
  filter(word %in% nwords_1$word)

kde_1 <- kde_matrix(tokens_1$word, bandwidth = 1000)

# Density curves of selected words along the token sequence.
kde_1 |>
  filter(word %in% c("fraude", "mensen", "jaar", "uitkering", "procent")) |>
  ggplot(aes(x = offset, y = p, color = word)) +
  geom_line() +
  theme_classic()

dist_1 <- distances(kde_1) |>
  filter(from %in% nwords_1$word)

# One node per distinct `from` word; `word` stays the first column.
nodes <- unique(select(dist_1, word = from)) |>
  mutate(id = as.character(seq_along(word)))
g_1 <- tbl_graph(nodes = nodes, edges = dist_1, directed = FALSE)

g <- g_1 |>
  ggraph(layout = "fr", weights = sim) +
  geom_edge_link(color = "gray") +
  geom_node_text(aes(label = word), repel = FALSE) +
  theme_graph() +
  ggtitle("Piek 1992-1993")
girafe(
  ggobj = g, width_svg = 25, height_svg = 25,
  options = list(opts_sizing(rescale = FALSE))
)
#### Peak 2: 2001-2002 ----

# Load the cached annotation and keep lower-cased nouns / proper nouns that
# are not stopwords. `offset` is the token position before any filtering.
tokens_2 <- readRDS("data/tokens_2.rds") |>
  select(token, lemma, upos) |>
  mutate(offset = seq_along(token)) |>
  filter(upos %in% c("NOUN", "PROPN")) |>
  rename(word = token) |>
  mutate(word = tolower(word)) |>
  filter(!word %in% stopwords)

# Most frequent words of this peak (ties on the cut-off count are kept).
nwords_2 <- tokens_2 |>
  group_by(word) |>
  summarise(n = n()) |>
  arrange(desc(n)) |>
  slice_max(n, n = 50)
nwords_2

tokens_2b <- tokens_2 |>
  filter(word %in% nwords_2$word)

# Feed the frequency-filtered tokens into the KDE, as the 1992-1993 section
# does; previously `tokens_2b` was built but never used.
# NOTE(review): confirm that restricting to the top words is the intent here.
kde_2 <- kde_matrix(tokens_2b$word, bandwidth = 500)

# Density curves of selected words along the token sequence.
kde_2 |>
  filter(word %in% c("fraude", "zaken", "economie", "nederland", "marokko")) |>
  ggplot(aes(x = offset, y = p, color = word)) +
  geom_line() +
  theme_classic()

dist_2 <- distances(kde_2)

# One node per distinct `from` word.
nodes <- unique(select(dist_2, word = from))
g_2 <- tbl_graph(nodes = nodes, edges = dist_2, directed = FALSE)

g <- g_2 |>
  ggraph(layout = "fr", weights = sim) +
  geom_edge_link(color = "gray") +
  geom_node_text(aes(label = word), repel = FALSE) +
  theme_graph() +
  ggtitle("Piek 2002")
girafe(
  ggobj = g, width_svg = 25, height_svg = 25,
  options = list(opts_sizing(rescale = FALSE))
)
#### Peak 3: Bulgarenfraude ----

# Load the cached annotation and keep lower-cased nouns / proper nouns that
# are not stopwords. `offset` is the token position before any filtering.
tokens_3 <- readRDS("data/tokens_3.rds") |>
  select(token, lemma, upos) |>
  mutate(offset = seq_along(token)) |>
  filter(upos %in% c("NOUN", "PROPN")) |>
  rename(word = token) |>
  mutate(word = tolower(word)) |>
  filter(!word %in% stopwords)

# Most frequent words of this peak (ties on the cut-off count are kept).
nwords_3 <- tokens_3 |>
  group_by(word) |>
  summarise(n = n()) |>
  arrange(desc(n)) |>
  slice_max(n, n = 100)
nwords_3

# Filter on this peak's own frequency table; a bare `nwords` is not defined
# anywhere in this script.
tokens_3b <- tokens_3 |>
  filter(word %in% nwords_3$word)

# Feed the frequency-filtered tokens into the KDE, as the 1992-1993 section
# does; previously `tokens_3b` was built but never used.
# NOTE(review): confirm that restricting to the top words is the intent here.
kde_3 <- kde_matrix(tokens_3b$word)

# Density curves of selected words along the token sequence.
kde_3 |>
  filter(word %in% c("fraude", "mensen", "jaar", "nederland", "zaken")) |>
  ggplot(aes(x = offset, y = p, color = word)) +
  geom_line() +
  theme_classic()

# Keep only the edges that start at "fraude".
dist_3 <- distances(kde_3) |>
  filter(from == "fraude")

# Build the node table from both edge endpoints so the hub "fraude" is always
# a node, whether or not distances() emits a fraude-fraude pair.
nodes <- tibble(word = union(dist_3$from, dist_3$to))
g_3 <- tbl_graph(nodes = nodes, edges = dist_3, directed = FALSE)

g <- g_3 |>
  ggraph(layout = "fr", weights = sim) +
  geom_edge_link(color = "gray") +
  geom_node_text(aes(label = word), repel = FALSE) +
  theme_graph() +
  ggtitle("Piek Bulgarenfraude")
girafe(
  ggobj = g, width_svg = 25, height_svg = 25,
  options = list(opts_sizing(rescale = FALSE))
)
##### Peak 4: Toeslagenaffaire ----

# Load the cached annotation and keep lower-cased nouns / proper nouns that
# are not stopwords. `offset` is the token position before any filtering.
tokens_4 <- readRDS("data/tokens_4.rds") |>
  select(token, lemma, upos) |>
  mutate(offset = seq_along(token)) |>
  filter(upos %in% c("NOUN", "PROPN")) |>
  rename(word = token) |>
  mutate(word = tolower(word)) |>
  filter(!word %in% stopwords)

# Most frequent words of this peak (ties on the cut-off count are kept).
nwords_4 <- tokens_4 |>
  group_by(word) |>
  summarise(n = n()) |>
  arrange(desc(n)) |>
  slice_max(n, n = 100)
nwords_4

# Filter on this peak's own frequency table; a bare `nwords` is not defined
# anywhere in this script.
tokens_4b <- tokens_4 |>
  filter(word %in% nwords_4$word)

# Feed the frequency-filtered tokens into the KDE, as the 1992-1993 section
# does; previously `tokens_4b` was built but never used.
# NOTE(review): confirm that restricting to the top words is the intent here.
kde_4 <- kde_matrix(tokens_4b$word)

# Density curves of selected words along the token sequence.
kde_4 |>
  filter(word %in% c("fraude", "mensen", "jaar", "nederland", "zaken")) |>
  ggplot(aes(x = offset, y = p, color = word)) +
  geom_line() +
  theme_classic()

# Keep only the edges that start at "fraude".
dist_4 <- distances(kde_4) |>
  filter(from == "fraude")

# Build the node table from both edge endpoints so the hub "fraude" is always
# a node, whether or not distances() emits a fraude-fraude pair.
nodes <- tibble(word = union(dist_4$from, dist_4$to))
g_4 <- tbl_graph(nodes = nodes, edges = dist_4, directed = FALSE)

g <- g_4 |>
  ggraph(layout = "fr", weights = sim) +
  geom_edge_link(color = "gray") +
  geom_node_text(aes(label = word), repel = FALSE) +
  theme_graph() +
  ggtitle("Toeslagenaffaire")
girafe(
  ggobj = g, width_svg = 25, height_svg = 25,
  options = list(opts_sizing(rescale = FALSE))
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment