Skip to content

Instantly share code, notes, and snippets.

@vanatteveldt
Created May 30, 2022 09:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vanatteveldt/6fb94b263b35e0b1719746759a84a23a to your computer and use it in GitHub Desktop.
Save vanatteveldt/6fb94b263b35e0b1719746759a84a23a to your computer and use it in GitHub Desktop.
ICA Demo of CAVA dictionary tools by Wouter van Atteveldt, Dafne van Kuppevelt, and Kasper Welbers
####################################################
# #
# Bonjour a tous! #
# #
# Ca va‽️ #
# #
####################################################
# Embedding-based tools for (semi-)automatic dictionary
# comprehension, augmentation, validation, and analysis
# Wouter van Atteveldt, Dafne van Kuppevelt, Kasper Welbers
# github.com/vanatteveldt/CAVA
# wouter.van.atteveldt@vu.nl
# Remember: making dictionaries is hard,
# and conceptualization and validation are essential
####################################################
# Let's get started! #
####################################################
library(tidyverse)
# remotes::install_github("vanatteveldt/CAVA")
# Seed terms for the dictionary; the trailing "*" entries are resolved
# later by CAVA::expand_wildcards().
seed <- c("immigra*", "migra*", "refugee*", "emigrat*")

# Read the demo data set and wrap it in a quanteda corpus, using the
# `content` column as the document text.
pimpo <- as_tibble(readRDS("~/icademo/pimpo_us.rds"))
pimpo_corpus <- quanteda::corpus(pimpo, text_field = "content")

# Load the fastText model; the corpus is handed to the loader as well
# (presumably to restrict the vectors to corpus vocabulary - TODO confirm).
vectors <- CAVA::load_fasttext(
  "~/icademo/cc.en.300.bin",
  corpus = pimpo_corpus
)

# Quick look at what was loaded: the vocabulary and a 5 x 5 corner of
# the vector matrix.
vectors$vocabulary
vectors$vectors[1:5, 1:5]
####################################################
# (1) Dictionary expansion #
####################################################
# Resolve the wildcard seed terms against the loaded vectors, then ask
# CAVA for words similar to the resulting dictionary.
dictionary <- seed |> CAVA::expand_wildcards(vectors)
candidates <- dictionary |> CAVA::similar_words(vectors)

# Inspect the candidate table in the data viewer (interactive use).
candidates |> View()
####################################################
# (2) Dictionary cohesion #
####################################################
# Keep the candidate words whose similarity exceeds 0.3.
expanded <- pull(filter(candidates, similarity > .3), word)

# Show the centroid similarities for the expanded list: first the full
# result, then only its last rows.
CAVA::similarity_to_centroid(expanded, vectors)
tail(CAVA::similarity_to_centroid(expanded, vectors))

# Pairwise similarities between the expanded words, drawn as a static
# graph limited via `max_edges = 100`.
similarities <- CAVA::pairwise_similarities(expanded, vectors)
g <- CAVA::similarity_graph(similarities, max_edges = 100)
plot(g)
# Static plots are boring, right? Draw the same similarities as an
# interactive D3 force network instead.
library(networkD3)

# Rebuild the graph, this time selecting edges with `threshold = .6`
# rather than `max_edges`.
g <- CAVA::similarity_graph(similarities, threshold = .6)

# Node table. `n` and `cluster` are vertex attributes coming from the
# graph (presumably word frequency and cluster membership - TODO confirm);
# the node size is a damped version of `n`.
nodes <- igraph::as_data_frame(g, what = "vertices") |>
  mutate(size = n^.75)

# Edge table. forceNetwork() expects zero-based row positions into
# `nodes`, so vertex names are replaced by their position minus one.
edges <- igraph::as_data_frame(g, what = "edges") |>
  mutate(
    from = match(from, nodes$name) - 1,
    to = match(to, nodes$name) - 1
  )

# `Source` and `Target` have no defaults in forceNetwork(); name the
# edge columns explicitly instead of relying on how a missing argument
# happens to index the Links data frame.
networkD3::forceNetwork(
  Links = edges, Nodes = nodes,
  Source = "from", Target = "to",
  NodeID = "name", Group = "cluster", Nodesize = "size",
  fontSize = 10, zoom = TRUE, linkDistance = 30,
  opacityNoHover = .75, opacity = .75
)
####################################################
# Manual word-level evaluation with ccs-annotator #
####################################################
# Hold-one-out evaluation of the seed set (see CAVA::evaluate_expansion
# for the meaning of `split`).
eval <- CAVA::evaluate_expansion(seed, vectors, split = 1)

# Plot recall, precision and F4 against the mean similarity to the seed
# set, for rows with a mean rank below 1000. Columns `word` through
# `rank_mean` are kept as identifiers; the remaining columns are stacked
# into Metric/value pairs.
eval |>
  pivot_longer(-word:-rank_mean, names_to = "Metric") |>
  filter(
    rank_mean < 1000,
    Metric %in% c("recall_mean", "precision_mean", "f4_mean")
  ) |>
  ggplot() +
  geom_line(aes(x = similarity_mean, y = value, color = Metric)) +
  labs(
    x = "Similarity to seed set",
    y = "Evaluation value",
    title = "Hold-one out evaluation results"
  )
####################################################
# Semi-automatic expansion / human-in-the-loop #
####################################################
# Select candidates for manual evaluation:
#   * every candidate with similarity above .4, plus
#   * a random sample of 70 candidates with similarity between .3 and .4.
# Each selected word gets an id of the form "w<row number>".
manual <- bind_rows(
  filter(candidates, similarity > .4),
  sample_n(filter(candidates, similarity > .3, similarity < .4), 70)
) |>
  mutate(id = glue::glue("w{row_number()}"))

# The selection was stored once (left disabled so it is not overwritten):
# write_csv("manual.csv")
# From here on the stored selection is used; it replaces the table built
# above, so the ids match the ones that were sent out for coding.
manual <- read_csv("manual.csv")
### Write and distribute the coding job
# See https://github.com/ccs-amsterdam/ccsAnnotator
# https://t.co/uceHEIwCgP
# (Or go back in time and see Kasper Welbers' presentation yesterday)
### Retrieve annotations
# Connect to the annotation backend and download the annotations of
# coding job 17.
ccsAnnotator::backend_connect(
  "https://kasperwelbers.com/ica-annotator",
  "wouter@vanatteveldt.com"
)
annotations <- ccsAnnotator::download_annotations(job_id = 17)

# Keep the word-level units (ids starting with "w<digits>") that received
# an actual answer, code "Yes" as 1 and anything else as 0, and attach
# the word information from `manual`.
annotations <- annotations |>
  filter(
    str_detect(unit_id, "^w\\d+"),
    value != "undefined"
  ) |>
  mutate(relevant = as.numeric(value == "Yes")) |>
  select(id = unit_id, coder, relevant) |>
  # Name the join key explicitly so the join cannot silently change if
  # `manual` ever gains another column that is also present here.
  left_join(manual, by = "id")

# Per word: number of codings, mean relevance, and disagreement (sd).
# `.groups = "drop"` returns an ungrouped tibble, so later filters and
# plots do not inherit a leftover grouping.
coded <- annotations |>
  group_by(word, similarity, frequency) |>
  summarize(
    n = n(),
    m = mean(relevant),
    sd = sd(relevant),
    .groups = "drop"
  )
### Similarity vs relevance
# Word cloud of the coded words: x = mean relevance given by the coders,
# y = similarity, text size = log frequency, colour = standard deviation
# of the codings. Grid lines and axis tick labels are removed.
coded |>
  ggplot() +
  ggwordcloud::geom_text_wordcloud_area(
    aes(
      x = m,
      y = similarity,
      label = word,
      size = log(frequency),
      color = sd
    )
  ) +
  scale_size_area(max_size = 8) +
  labs(
    x = "Mean relevance according to coders",
    y = "Similarity to seed set"
  ) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.text = element_blank()
  )
####################################################
# Manual evaluation of the resulting dictionary #
####################################################
library(ccsAnnotator)

# Test set: 50 randomly drawn documents for each value of `selection`,
# identified by their row number in `pimpo`.
testset <- pimpo |>
  # seq_len() stays correct (empty) for a zero-row table, unlike 1:nrow()
  add_column(id = seq_len(nrow(pimpo))) |>
  group_by(selection) |>
  sample_n(50) |>
  # Drop the grouping so downstream code receives a plain tibble
  ungroup() |>
  select(id, selection, content)

# The single question shown to coders, with its answer options and the
# colour used for each.
immigration <- ccsAnnotator::question(
  name = "immigration",
  question = "Is this about immigration‽",
  codes = c(No = "red", Yes = "green", Skip = "grey"),
  type = "annotinder"
)

# Bundle the units and the codebook into a coding job.
codingjob <- ccsAnnotator::create_job(
  title = "immigration_evaluation",
  units = ccsAnnotator::create_units(testset, id = "id", text = "content"),
  codebook = ccsAnnotator::create_codebook(immigration)
)

# Create the job database, start the annotator, and fetch the
# annotations. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned.
job_db <- ccsAnnotator::create_job_db(codingjob, overwrite = TRUE)
job_db <- ccsAnnotator::start_annotator(job_db, background = TRUE)
ccsAnnotator::gimme_annotations(job_db)
####################################################
# Fortunately we had a gold standard already... #
####################################################
# Five competing dictionaries, each consisting of the seed terms plus:
#   seed     - nothing else
#   auto03   - all candidates with similarity > .3
#   auto04   - all candidates with similarity > .4
#   manual05 - coded words with mean relevance > .5
#   manual08 - coded words with mean relevance > .8
dictionaries <- quanteda::dictionary(list(
  seed = seed,
  auto03 = c(pull(filter(candidates, similarity > .3), word), seed),
  auto04 = c(pull(filter(candidates, similarity > .4), word), seed),
  manual05 = c(pull(filter(coded, m > .5), word), seed),
  manual08 = c(pull(filter(coded, m > .8), word), seed)
))

# Apply all dictionaries to the corpus: tokenize, build a document-feature
# matrix, look up the dictionaries, and convert the result to a tibble.
dict_results <- pimpo_corpus |>
  quanteda::tokens() |>
  quanteda::dfm() |>
  quanteda::dfm_lookup(dictionaries) |>
  quanteda::convert(to = "data.frame") |>
  as_tibble()
# Score every dictionary against the gold standard stored in the
# `selection` document variable (assumed to be coded 0/1 - TODO confirm).
# A document counts as a dictionary hit when it has at least one match.
dict_results |>
  # quanteda is never attached in this script, so qualify docvars()
  add_column(gold = quanteda::docvars(pimpo_corpus, "selection")) |>
  pivot_longer(seed:manual08) |>
  mutate(
    # Hits are `value > 0` so that, together with `value == 0`, every
    # document lands in exactly one of the four cells. The earlier
    # `value > 1` left documents with a single match uncounted.
    tp = as.numeric(value > 0 & gold == 1),
    fp = as.numeric(value > 0 & gold == 0),
    fn = as.numeric(value == 0 & gold == 1),
    tn = as.numeric(value == 0 & gold == 0)
  ) |>
  group_by(name) |>
  # Precision, recall and F1 per dictionary
  summarize(
    pr = sum(tp) / sum(tp + fp),
    re = sum(tp) / sum(tp + fn),
    f1 = 2 * pr * re / (pr + re)
  )
# Thanks!!!!
# Wouter van Atteveldt, Dafne van Kuppevelt, Kasper Welbers
# github.com/vanatteveldt/CAVA
# wouter.van.atteveldt@vu.nl
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment