Created
April 25, 2020 12:39
-
-
Save nruigrok/f339b0f6bc61491c3c26b5fe3df45410 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(igraph) | |
library(tidyverse) | |
# Load the document-term matrix of news articles from disk.
dtm <- readRDS("/tmp/nieuws.rds")

# Compare the corpus with itself (`dtm` is passed as both sides of the
# comparison); the result `g` is used below as a network of articles.
# NOTE(review): `date` presumably partial-matches the date-column argument of
# newsflow_compare() -- confirm the exact argument name for the installed
# RNewsflow version.
g <- RNewsflow::newsflow_compare(
  dtm, dtm,
  date = "date",
  # Similarity threshold: weaker matches are not kept.
  min_similarity = 0.75,
  # Time window in hours: from 24 hours before to 24 hours after publication.
  # (An earlier comment described this as "0 to 7 days after the press
  # release", which does not match the values actually used here.)
  hour_window = c(-1 * 24, 1 * 24),
  # Cosine similarity; overlap_pct can be used instead for an asymmetric
  # comparison.
  measure = "cosine",
  # Weight words that occur less frequently more heavily.
  tf_idf = TRUE
)
# `g` is a network of all articles; it can be turned into a data.frame.
# The article meta data is not included, so it still has to be merged in
# separately.
#
# Split the network into its components, take the edge list of every
# component, and stack those edge lists into a single tibble. The `cluster`
# column records which component each edge came from.
clusters <- as_tibble(
  bind_rows(
    lapply(decompose(g), igraph::as_data_frame),
    .id = "cluster"
  )
)
# Sanity check: if `g` were symmetric, every cluster should have the same set
# of `from` nodes as `to` nodes. That does not appear to hold for all clusters
# (?), so the froms and tos may need to be combined.
# The per-cluster summary is computed once and reused for both views below.
cluster_check <- clusters %>%
  group_by(cluster) %>%
  summarize(n = n(), check = setequal(from, to))

# Number of clusters that pass / fail the check.
cluster_check %>% with(table(check))

# The clusters that fail the check.
cluster_check %>% filter(!check)

# Inspect every edge touching a few specific documents (hard-coded ids picked
# for manual inspection). The edge list is extracted once and reused.
edges <- igraph::as_data_frame(g)
edges %>% filter(from %in% c(110922, 366719) | to %in% c(110922, 366719))
edges %>% filter(from %in% c(9651) | to %in% c(9651))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment