Created
May 30, 2022 09:55
-
-
Save vanatteveldt/6fb94b263b35e0b1719746759a84a23a to your computer and use it in GitHub Desktop.
ICA Demo of CAVA dictionary tools by Wouter van Atteveldt, Dafne van Kuppevelt, and Kasper Welbers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################################
#                                                  #
#                 Bonjour a tous!                  #
#                                                  #
#                      Ca va‽️                      #
#                                                  #
####################################################
# Embedding-based tools for (semi-)automatic dictionary
# comprehension, augmentation, validation and analysis
# Wouter van Atteveldt, Dafne van Kuppevelt, Kasper Welbers
# github.com/vanatteveldt/CAVA
# wouter.van.atteveldt@vu.nl
# Remember: making dictionaries is hard,
# and conceptualization and validation are essential
####################################################
#                Let's get started!                #
####################################################
# Packages: the tidyverse is attached; CAVA and quanteda are called via `::`.
library(tidyverse)
# remotes::install_github("vanatteveldt/CAVA")

# Seed terms; they are passed to CAVA::expand_wildcards() further down.
seed = c("immigra*", "migra*", "refugee*", "emigrat*")

# Read the stored data and build a quanteda corpus from its `content` column.
pimpo = as_tibble(readRDS("~/icademo/pimpo_us.rds"))
pimpo_corpus = quanteda::corpus(pimpo, text_field = "content")

# Load the fastText model, handing the corpus to the loader.
vectors = CAVA::load_fasttext(
  "~/icademo/cc.en.300.bin",
  corpus = pimpo_corpus
)

# Inspect what was loaded: the vocabulary and the first 5 x 5 cells of the
# vector matrix.
vectors$vocabulary
vectors$vectors[1:5, 1:5]
####################################################
#            (1) Dictionary expansion              #
####################################################
# Expand the seed terms against the loaded vectors ...
dictionary = seed |> CAVA::expand_wildcards(vectors)
# ... and look up words similar to the resulting dictionary.
candidates = dictionary |> CAVA::similar_words(vectors)
# Browse the candidates in the interactive data viewer.
View(candidates)
####################################################
#             (2) Dictionary cohesion              #
####################################################
# Words of all candidates whose similarity exceeds .3.
expanded = candidates |>
  filter(similarity > .3) |>
  pull(word)

# Print the similarity_to_centroid() result in full, then only its last rows.
CAVA::similarity_to_centroid(expanded, vectors)
tail(CAVA::similarity_to_centroid(expanded, vectors))

# Pairwise similarities between the expanded words, drawn as a static graph
# limited via max_edges = 100.
similarities = CAVA::pairwise_similarities(expanded, vectors)
g = CAVA::similarity_graph(similarities, max_edges = 100)
plot(g)
# static plots are boring, right?
library(networkD3)

# Rebuild the graph, this time with a similarity threshold instead of max_edges.
g = CAVA::similarity_graph(similarities, threshold = .6)

# Vertex table; the node size is derived from the vertex attribute `n`.
nodes = igraph::as_data_frame(g, what = "vertices") |>
  mutate(size = n^.75)

# Edge table; both endpoints are replaced by the zero-based row index of the
# matching vertex in `nodes` (match() is one-based, hence the - 1).
edges = igraph::as_data_frame(g, what = "edges") |>
  mutate(
    from = match(from, nodes$name) - 1,
    to = match(to, nodes$name) - 1
  )

# Source and Target name the endpoint columns of `edges`. They are passed
# explicitly because forceNetwork() declares no default for them.
networkD3::forceNetwork(
  Links = edges, Nodes = nodes,
  Source = "from", Target = "to",
  NodeID = "name", Group = "cluster", Nodesize = "size",
  fontSize = 10, zoom = TRUE, linkDistance = 30,
  opacityNoHover = .75, opacity = .75
)
####################################################
# Manual word-level evaluation with ccs-annotator  #
####################################################
# Hold-one out evaluation
eval = CAVA::evaluate_expansion(seed, vectors, split = 1)

# Plot recall, precision and F4 against the mean similarity, using only the
# rows whose mean rank is below 1000. Every column outside word:rank_mean is
# stacked into Metric/value pairs first.
eval |>
  pivot_longer(-(word:rank_mean), names_to = "Metric") |>
  filter(
    rank_mean < 1000,
    Metric %in% c("recall_mean", "precision_mean", "f4_mean")
  ) |>
  ggplot() +
  geom_line(aes(x = similarity_mean, y = value, color = Metric)) +
  labs(
    x = "Similarity to seed set",
    y = "Evaluation value",
    title = "Hold-one out evaluation results"
  )
####################################################
#   Semi-automatic expansion / human-in-the-loop   #
####################################################
# Select candidates for manual evaluation:
#   * every candidate with a similarity above .4, plus
#   * a random sample of at most 70 candidates from the (.3, .4] band.
# The band is closed at .4 so that a similarity of exactly .4 belongs to one
# of the two strata. slice_sample() returns the whole band when it holds
# fewer than 70 rows, where sample_n() would stop with an error.
manual = bind_rows(
  candidates |> filter(similarity > .4),
  candidates |>
    filter(similarity > .3, similarity <= .4) |>
    slice_sample(n = 70)
) |>
  # ids w1, w2, ... identify the words as coding units
  mutate(id = glue::glue("w{row_number()}"))

# NOTE(review): presumably run once to store the random selection on disk:
# write_csv(manual, "manual.csv")
# The stored selection replaces the freshly sampled one from here on.
manual = read_csv("manual.csv")
### Write and distribute the coding job
# See https://github.com/ccs-amsterdam/ccsAnnotator
# https://t.co/uceHEIwCgP
# (Or go back in time and see Kasper Welbers' presentation yesterday)

### Retrieve Annotations
ccsAnnotator::backend_connect('https://kasperwelbers.com/ica-annotator', 'wouter@vanatteveldt.com')
annotations = ccsAnnotator::download_annotations(job_id = 17)

# Keep the units whose id starts with "w" followed by digits and whose value
# is not "undefined"; code "Yes" as 1 and every other answer as 0; then attach
# the columns of `manual` to each coding.
annotations = annotations |>
  filter(
    str_detect(unit_id, "^w\\d+"),
    value != "undefined"
  ) |>
  mutate(relevant = as.numeric(value == "Yes")) |>
  select(id = unit_id, coder, relevant) |>
  # Explicit key: the join can never silently pick up additional shared
  # columns, and no "Joining with `by = ...`" message is printed.
  left_join(manual, by = "id")

# One row per word: number of codings, mean relevance and its standard
# deviation. `.groups = "drop"` returns an ungrouped tibble, so later filters
# and plots do not inherit a leftover grouping.
coded = annotations |>
  group_by(word, similarity, frequency) |>
  summarize(
    n = n(),
    m = mean(relevant),
    sd = sd(relevant),
    .groups = "drop"
  )
### Similarity vs relevance
# Word cloud of the coded words: x is the mean relevance, y the similarity,
# text area follows log(frequency) and colour the sd of the codings.
# Grid lines and axis tick labels are switched off.
ggplot(data = coded) +
  ggwordcloud::geom_text_wordcloud_area(
    mapping = aes(
      x = m,
      y = similarity,
      label = word,
      size = log(frequency),
      color = sd
    )
  ) +
  scale_size_area(max_size = 8) +
  labs(
    x = "Mean relevance according to coders",
    y = "Similarity to seed set"
  ) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.text = element_blank()
  )
####################################################
#  Manual evaluation of the resulting dictionary   #
####################################################
library(ccsAnnotator)

# Test set: 50 randomly drawn rows per value of `selection`. `id` is the row
# position in `pimpo`; seq_len() stays correct even for a zero-row table,
# where 1:nrow() would yield c(1, 0).
testset = pimpo |>
  add_column(id = seq_len(nrow(pimpo))) |>
  group_by(selection) |>
  sample_n(50) |>
  # drop the grouping so a plain tibble is handed to create_units()
  ungroup() |>
  select(id, selection, content)

# The single question shown for every unit, with one colour per answer.
immigration = ccsAnnotator::question(
  name = "immigration",
  question = "Is this about immigration‽",
  codes = c(No = "red", Yes = "green", Skip = "grey"),
  type = "annotinder"
)

# Coding job: the test-set texts as units plus a codebook holding the question.
codingjob = ccsAnnotator::create_job(
  title = "immigration_evaluation",
  units = ccsAnnotator::create_units(testset, id = "id", text = "content"),
  codebook = ccsAnnotator::create_codebook(immigration)
)

# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
job_db = ccsAnnotator::create_job_db(codingjob, overwrite = TRUE)
job_db = ccsAnnotator::start_annotator(job_db, background = TRUE)
ccsAnnotator::gimme_annotations(job_db)
####################################################
#  Fortunately we had a gold standard already...   #
####################################################
# Dictionaries to compare. Each variant is the seed set extended with
#   auto03 / auto04     : candidates with similarity above .3 / .4
#   manual05 / manual08 : coded words with mean relevance above .5 / .8
dictionaries = list(
  seed = seed,
  auto03 = candidates |> filter(similarity > .3) |> pull(word) |> c(seed),
  auto04 = candidates |> filter(similarity > .4) |> pull(word) |> c(seed),
  manual05 = coded |> filter(m > .5) |> pull(word) |> c(seed),
  manual08 = coded |> filter(m > .8) |> pull(word) |> c(seed)
) |>
  quanteda::dictionary()

# Look the dictionaries up in the corpus: one row per document, one count
# column per dictionary.
dict_results = pimpo_corpus |>
  quanteda::tokens() |>
  quanteda::dfm() |>
  quanteda::dfm_lookup(dictionaries) |>
  quanteda::convert(to = "data.frame") |>
  as_tibble()

# Precision, recall and F1 per dictionary against the gold standard.
# A document is predicted positive when it has at least one dictionary hit
# (value > 0). That is the exact complement of the value == 0 test used for
# the negatives, so every document falls into exactly one of tp/fp/fn/tn.
# docvars() is namespaced because quanteda is never attached in this script.
# NOTE(review): assumes `selection` is coded 0/1 - confirm against the data.
dict_results |>
  add_column(gold = quanteda::docvars(pimpo_corpus, "selection")) |>
  pivot_longer(seed:manual08) |>
  mutate(
    tp = as.numeric(value > 0 & gold == 1),
    fp = as.numeric(value > 0 & gold == 0),
    fn = as.numeric(value == 0 & gold == 1),
    tn = as.numeric(value == 0 & gold == 0)
  ) |>
  group_by(name) |>
  summarize(
    pr = sum(tp) / sum(tp + fp),
    re = sum(tp) / sum(tp + fn),
    f1 = 2 * pr * re / (pr + re)
  )
# Thanks!!!!
# Wouter van Atteveldt, Dafne van Kuppevelt, Kasper Welbers
# github.com/vanatteveldt/CAVA
# wouter.van.atteveldt@vu.nl
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment