# vanatteveldt/atteveldt_icademo.r

## atteveldt_icademo.r

####################################################
#                                                  #
#               Bonjour a tous!                    #
#                                                  #
#                     Ca va‽️                       #
#                                                  #
####################################################

# Embedding-based tools for (semi-)automatic dictionary
# comprehension, augmentation, validation and analysis

# Wouter van Atteveldt, Dafne van Kuppevelt, Kasper Welbers
# github.com/vanatteveldt/CAVA
# wouter.van.atteveldt@vu.nl

# Remember: making dictionaries is hard,
# and conceptualization and validation are essential

####################################################
# Let's get started!                               #
####################################################

library(tidyverse)
# remotes::install_github("vanatteveldt/CAVA")
# Seed terms with trailing * wildcards (expanded further down)
seed <- c("immigra*", "migra*", "refugee*", "emigrat*")

# Read the stored data and wrap it in a quanteda corpus, using the
# "content" column as the document text
pimpo <- as_tibble(readRDS("~/icademo/pimpo_us.rds"))
pimpo_corpus <- quanteda::corpus(pimpo, text_field = "content")

# Load the fastText model; the corpus is handed to the loader as well
vectors <- CAVA::load_fasttext(
  "~/icademo/cc.en.300.bin",
  corpus = pimpo_corpus
)
# Peek at the result: the vocabulary and a 5x5 corner of the vectors
vectors$vocabulary
vectors$vectors[1:5, 1:5]

####################################################
# (1) Dictionary expansion                         #
####################################################

# Expand the wildcard seed terms using the loaded vectors, then look up
# words similar to the resulting dictionary and browse them
dictionary <- CAVA::expand_wildcards(seed, vectors)
candidates <- CAVA::similar_words(dictionary, vectors)
candidates |> View()

####################################################
# (2) Dictionary cohesion                          #
####################################################

# Keep the candidate words whose similarity exceeds .3
expanded <- pull(filter(candidates, similarity > .3), word)
# Similarity of each word to the centroid: full output, then the tail
CAVA::similarity_to_centroid(expanded, vectors)
tail(CAVA::similarity_to_centroid(expanded, vectors))

# Static similarity graph, limited to 100 edges
similarities <- CAVA::pairwise_similarities(expanded, vectors)
g <- CAVA::similarity_graph(similarities, max_edges = 100)
plot(g)

# static plots are boring, right?

library(networkD3)
# Rebuild the graph with a similarity threshold and turn it into the
# node / edge data frames passed to forceNetwork()
g <- CAVA::similarity_graph(similarities, threshold = .6)
nodes <- igraph::as_data_frame(g, what = "vertices") |>
  mutate(size = n^.75)
# match() is 1-based; subtracting 1 gives zero-based node indices
# (networkD3 links are zero-indexed)
edges <- igraph::as_data_frame(g, what = "edges") |>
  mutate(
    from = match(from, nodes$name) - 1,
    to = match(to, nodes$name) - 1
  )
networkD3::forceNetwork(
  Links = edges, Nodes = nodes,
  NodeID = "name", Group = "cluster", Nodesize = "size",
  fontSize = 10, zoom = TRUE, linkDistance = 30,
  opacityNoHover = .75, opacity = .75
)


####################################################
# Manual word-level evaluation with ccs-annotator  #
####################################################
# Hold-one-out evaluation of the expansion
eval <- CAVA::evaluate_expansion(seed, vectors, split = 1)

# Plot mean recall / precision / F4 against mean similarity,
# restricted to rows with rank_mean below 1000
eval |>
  pivot_longer(-word:-rank_mean, names_to = "Metric") |>
  filter(
    rank_mean < 1000,
    Metric %in% c("recall_mean", "precision_mean", "f4_mean")
  ) |>
  ggplot(aes(x = similarity_mean, y = value, color = Metric)) +
  geom_line() +
  labs(
    x = "Similarity to seed set",
    y = "Evaluation value",
    title = "Hold-one out evaluation results"
  )

####################################################
# Semi-automatic expansion / human-in-the-loop     #
####################################################

# Select candidates for manual evaluation: every word above .4, plus a
# random sample of (at most) 70 words from the (.3, .4] band
manual <- bind_rows(
  candidates |> filter(similarity > .4),
  candidates |>
    # <= .4 so a word at exactly .4 is not dropped by both filters
    filter(similarity > .3, similarity <= .4) |>
    # slice_sample() caps n at the rows available instead of erroring
    slice_sample(n = 70)
) |>
  mutate(id = glue::glue("w{row_number()}"))
# write_csv(manual, "manual.csv")

# The sample above is random, so work from the stored selection.
# NOTE(review): presumably manual.csv was produced by the commented-out
# write_csv() call above -- confirm.
manual <- read_csv("manual.csv")

### Write and distribute the coding job
# See https://github.com/ccs-amsterdam/ccsAnnotator
# https://t.co/uceHEIwCgP
# (Or go back in time and see Kasper Welbers' presentation yesterday)


### Retrieve Annotations
ccsAnnotator::backend_connect('https://kasperwelbers.com/ica-annotator', 'wouter@vanatteveldt.com')
annotations <- ccsAnnotator::download_annotations(job_id = 17)
# Keep the word-level units (ids like w1, w2, ...) that received an
# actual answer, code "Yes" as 1 and any other answer as 0, and attach
# the word information from `manual`
annotations <- annotations |>
  filter(
    str_detect(unit_id, "^w\\d+"),
    value != "undefined"
  ) |>
  mutate(relevant = as.numeric(value == "Yes")) |>
  select(id = unit_id, coder, relevant) |>
  # explicit key instead of a natural join on whatever columns overlap
  left_join(manual, by = "id")

# Per word: number of codings, mean relevance and its standard deviation.
# .groups = "drop" returns an ungrouped tibble and avoids the
# regrouping message
coded <- annotations |>
  group_by(word, similarity, frequency) |>
  summarize(
    n = n(),
    m = mean(relevant),
    sd = sd(relevant),
    .groups = "drop"
  )

### Similarity vs relevance
# Word cloud: x = mean coder relevance, y = similarity to the seed set,
# text size by log frequency, colour by the sd of the codings

ggplot(
  coded,
  aes(
    x = m, y = similarity, label = word,
    size = log(frequency), color = sd
  )
) +
  ggwordcloud::geom_text_wordcloud_area() +
  scale_size_area(max_size = 8) +
  labs(
    x = "Mean relevance according to coders",
    y = "Similarity to seed set"
  ) +
  theme_minimal() +
  theme(panel.grid = element_blank(), axis.text = element_blank())

####################################################
# Manual evaluation of the resulting dictionary    #
####################################################


library(ccsAnnotator)
# Draw (up to) 50 documents per value of `selection` for manual coding
testset <- pimpo |>
  # seq_len() stays correct for a zero-row input, unlike 1:nrow()
  add_column(id = seq_len(nrow(pimpo))) |>
  group_by(selection) |>
  slice_sample(n = 50) |>
  # drop the grouping so downstream code receives a plain tibble
  ungroup() |>
  select(id, selection, content)

# Single yes/no/skip question shown for every unit
immigration <- ccsAnnotator::question(
  name = "immigration",
  question = "Is this about immigration‽",
  codes = c(No = "red", Yes = "green", Skip = "grey"),
  type = "annotinder"
)

# Bundle units and codebook into a coding job
codingjob <- ccsAnnotator::create_job(
  title = "immigration_evaluation",
  units = ccsAnnotator::create_units(testset, id = "id", text = "content"),
  codebook = ccsAnnotator::create_codebook(immigration)
)

# TRUE rather than T: T is an ordinary variable that can be reassigned
job_db <- ccsAnnotator::create_job_db(codingjob, overwrite = TRUE)
job_db <- ccsAnnotator::start_annotator(job_db, background = TRUE)

ccsAnnotator::gimme_annotations(job_db)

####################################################
# Fortunately we had a gold standard already...    #
####################################################

# One dictionary entry per strategy; every entry also contains the seed
# patterns. auto* use a similarity cut-off on the candidates, manual*
# use a cut-off on the mean coder relevance.
dictionaries <- quanteda::dictionary(list(
  seed = seed,
  auto03 = c(pull(filter(candidates, similarity > .3), word), seed),
  auto04 = c(pull(filter(candidates, similarity > .4), word), seed),
  manual05 = c(pull(filter(coded, m > .5), word), seed),
  manual08 = c(pull(filter(coded, m > .8), word), seed)
))

# Tokenize the corpus, build a dfm, count the hits per dictionary entry
# and convert the result to a tibble
dict_results <- pimpo_corpus |>
  quanteda::tokens() |>
  quanteda::dfm() |>
  quanteda::dfm_lookup(dictionaries) |>
  quanteda::convert(to = "data.frame") |>
  as_tibble()

# Precision, recall and F1 of every dictionary against the gold
# standard stored in the corpus docvar "selection".
# NOTE(review): assumes `selection` is coded 0/1 -- confirm.
dict_results |>
  # quanteda is never attached in this script, so qualify docvars()
  add_column(gold = quanteda::docvars(pimpo_corpus, "selection")) |>
  pivot_longer(seed:manual08) |>
  mutate(
    # Any hit counts as a positive prediction. With value > 1, documents
    # with exactly one hit fell into none of the four cells.
    tp = as.numeric(value > 0 & gold == 1),
    fp = as.numeric(value > 0 & gold == 0),
    fn = as.numeric(value == 0 & gold == 1),
    tn = as.numeric(value == 0 & gold == 0)
  ) |>
  group_by(name) |>
  summarize(
    pr = sum(tp) / sum(tp + fp),
    re = sum(tp) / sum(tp + fn),
    f1 = 2 * pr * re / (pr + re)
  )


# Thanks!!!!
# Wouter van Atteveldt, Dafne van Kuppevelt, Kasper Welbers
# github.com/vanatteveldt/CAVA
# wouter.van.atteveldt@vu.nl

	####################################################
	# #
	# Bonjour a tous! #
	# #
	# Ca va‽️ #
	# #
	####################################################

	# Embedding-based tools for (semi-)automatic dictionary
	# comprehension, augmentation, validation and analysis

	# Wouter van Atteveldt, Dafne van Kuppevelt, Kasper Welbers
	# github.com/vanatteveldt/CAVA
	# wouter.van.atteveldt@vu.nl

	# Remember: making dictionaries is hard,
	# and conceptualization and validation are essential

	####################################################
	# Let's get started! #
	####################################################

	library(tidyverse)
	# remotes::install_github("vanatteveldt/CAVA")
	# Seed terms with trailing * wildcards (expanded further down); the
	# asterisks had been stripped from this copy of the script
	seed <- c("immigra*", "migra*", "refugee*", "emigrat*")

	# Read the stored data and wrap it in a quanteda corpus, using the
	# "content" column as the document text
	pimpo <- readRDS("~/icademo/pimpo_us.rds") |> as_tibble()
	pimpo_corpus <- quanteda::corpus(pimpo, text_field = "content")

	# Load the fastText model; the corpus is handed to the loader as well
	vectors <- CAVA::load_fasttext("~/icademo/cc.en.300.bin", corpus = pimpo_corpus)
	# Peek at the result: the vocabulary and a 5x5 corner of the vectors
	vectors$vocabulary
	vectors$vectors[1:5, 1:5]

	####################################################
	# (1) Dictionary expansion #
	####################################################

	# Expand the wildcard seed terms using the loaded vectors, then look up
	# words similar to the resulting dictionary and browse them
	dictionary <- CAVA::expand_wildcards(seed, vectors)
	candidates <- CAVA::similar_words(dictionary, vectors)
	View(x = candidates)

	####################################################
	# (2) Dictionary cohesion #
	####################################################

	# Keep the candidate words whose similarity exceeds .3
	# (pipes were escaped as "\|>" in this copy, which does not parse)
	expanded <- candidates |>
	  filter(similarity > .3) |>
	  pull(word)
	# Similarity of each word to the centroid: full output, then the tail
	CAVA::similarity_to_centroid(expanded, vectors)
	CAVA::similarity_to_centroid(expanded, vectors) |> tail()

	# Static similarity graph, limited to 100 edges
	similarities <- CAVA::pairwise_similarities(expanded, vectors)
	g <- CAVA::similarity_graph(similarities, max_edges = 100)
	plot(g)

	# static plots are boring, right?

	library(networkD3)
	# Rebuild the graph with a similarity threshold and turn it into the
	# node / edge data frames passed to forceNetwork()
	g <- CAVA::similarity_graph(similarities, threshold = .6)
	nodes <- igraph::as_data_frame(g, what = "vertices") |>
	  mutate(size = n^.75)
	# match() is 1-based; subtracting 1 gives zero-based node indices
	# (networkD3 links are zero-indexed)
	edges <- igraph::as_data_frame(g, what = "edges") |>
	  mutate(
	    from = match(from, nodes$name) - 1,
	    to = match(to, nodes$name) - 1
	  )
	networkD3::forceNetwork(
	  Links = edges, Nodes = nodes,
	  NodeID = "name", Group = "cluster", Nodesize = "size",
	  fontSize = 10, zoom = TRUE, linkDistance = 30,
	  opacityNoHover = .75, opacity = .75
	)


	####################################################
	# Manual word-level evaluation with ccs-annotator #
	####################################################
	# Hold-one-out evaluation of the expansion
	eval <- CAVA::evaluate_expansion(seed, vectors, split = 1)

	# Plot mean recall / precision / F4 against mean similarity,
	# restricted to rows with rank_mean below 1000
	# (pipes were escaped as "\|>" in this copy, which does not parse)
	eval |>
	  pivot_longer(-word:-rank_mean, names_to = "Metric") |>
	  filter(
	    rank_mean < 1000,
	    Metric %in% c("recall_mean", "precision_mean", "f4_mean")
	  ) |>
	  ggplot() +
	  geom_line(aes(x = similarity_mean, y = value, color = Metric)) +
	  xlab("Similarity to seed set") +
	  ylab("Evaluation value") +
	  ggtitle("Hold-one out evaluation results")

	####################################################
	# Semi-automatic expansion / human-in-the-loop #
	####################################################

	# Select candidates for manual evaluation: every word above .4, plus a
	# random sample of (at most) 70 words from the (.3, .4] band
	manual <- bind_rows(
	  candidates |> filter(similarity > .4),
	  candidates |>
	    # <= .4 so a word at exactly .4 is not dropped by both filters
	    filter(similarity > .3, similarity <= .4) |>
	    # slice_sample() caps n at the rows available instead of erroring
	    slice_sample(n = 70)
	) |>
	  mutate(id = glue::glue("w{row_number()}"))
	# write_csv(manual, "manual.csv")

	# The sample above is random, so work from the stored selection.
	# NOTE(review): presumably manual.csv was produced by the commented-out
	# write_csv() call above -- confirm.
	manual <- read_csv("manual.csv")

	### Write and distribute the coding job
	# See https://github.com/ccs-amsterdam/ccsAnnotator
	# https://t.co/uceHEIwCgP
	# (Or go back in time and see Kasper Welbers' presentation yesterday)


	### Retrieve Annotations
	ccsAnnotator::backend_connect('https://kasperwelbers.com/ica-annotator', 'wouter@vanatteveldt.com')
	annotations <- ccsAnnotator::download_annotations(job_id = 17)
	# Keep the word-level units (ids like w1, w2, ...) that received an
	# actual answer, code "Yes" as 1 and any other answer as 0, and attach
	# the word information from `manual`
	annotations <- annotations |>
	  filter(
	    str_detect(unit_id, "^w\\d+"),
	    value != "undefined"
	  ) |>
	  mutate(relevant = as.numeric(value == "Yes")) |>
	  select(id = unit_id, coder, relevant) |>
	  # explicit key instead of a natural join on whatever columns overlap
	  left_join(manual, by = "id")

	# Per word: number of codings, mean relevance and its standard deviation.
	# .groups = "drop" returns an ungrouped tibble and avoids the
	# regrouping message
	coded <- annotations |>
	  group_by(word, similarity, frequency) |>
	  summarize(
	    n = n(),
	    m = mean(relevant),
	    sd = sd(relevant),
	    .groups = "drop"
	  )

	### Similarity vs relevance
	# Word cloud: x = mean coder relevance, y = similarity to the seed set,
	# text size by log frequency, colour by the sd of the codings

	ggplot(
	  coded,
	  aes(
	    x = m, y = similarity, label = word,
	    size = log(frequency), color = sd
	  )
	) +
	  ggwordcloud::geom_text_wordcloud_area() +
	  scale_size_area(max_size = 8) +
	  labs(
	    x = "Mean relevance according to coders",
	    y = "Similarity to seed set"
	  ) +
	  theme_minimal() +
	  theme(panel.grid = element_blank(), axis.text = element_blank())

	####################################################
	# Manual evaluation of the resulting dictionary #
	####################################################


	library(ccsAnnotator)
	# Draw (up to) 50 documents per value of `selection` for manual coding
	# (pipes were escaped as "\|>" in this copy, which does not parse)
	testset <- pimpo |>
	  # seq_len() stays correct for a zero-row input, unlike 1:nrow()
	  add_column(id = seq_len(nrow(pimpo))) |>
	  group_by(selection) |>
	  slice_sample(n = 50) |>
	  # drop the grouping so downstream code receives a plain tibble
	  ungroup() |>
	  select(id, selection, content)

	# Single yes/no/skip question shown for every unit
	immigration <- ccsAnnotator::question(
	  name = "immigration",
	  question = "Is this about immigration‽",
	  codes = c(No = "red", Yes = "green", Skip = "grey"),
	  type = "annotinder"
	)

	# Bundle units and codebook into a coding job
	codingjob <- ccsAnnotator::create_job(
	  title = "immigration_evaluation",
	  units = ccsAnnotator::create_units(testset, id = "id", text = "content"),
	  codebook = ccsAnnotator::create_codebook(immigration)
	)

	# TRUE rather than T: T is an ordinary variable that can be reassigned
	job_db <- ccsAnnotator::create_job_db(codingjob, overwrite = TRUE)
	job_db <- ccsAnnotator::start_annotator(job_db, background = TRUE)

	ccsAnnotator::gimme_annotations(job_db)

	####################################################
	# Fortunately we had a gold standard already... #
	####################################################

	# One dictionary entry per strategy; every entry also contains the seed
	# patterns. auto* use a similarity cut-off on the candidates, manual*
	# use a cut-off on the mean coder relevance.
	# (pipes were escaped as "\|>" in this copy, which does not parse)
	dictionaries <- list(
	  seed = seed,
	  auto03 = candidates |> filter(similarity > .3) |> pull(word) |> c(seed),
	  auto04 = candidates |> filter(similarity > .4) |> pull(word) |> c(seed),
	  manual05 = coded |> filter(m > .5) |> pull(word) |> c(seed),
	  manual08 = coded |> filter(m > .8) |> pull(word) |> c(seed)
	) |>
	  quanteda::dictionary()

	# Tokenize the corpus, build a dfm, count the hits per dictionary entry
	# and convert the result to a tibble
	dict_results <- pimpo_corpus |>
	  quanteda::tokens() |>
	  quanteda::dfm() |>
	  quanteda::dfm_lookup(dictionaries) |>
	  quanteda::convert(to = "data.frame") |>
	  as_tibble()

	# Precision, recall and F1 of every dictionary against the gold
	# standard stored in the corpus docvar "selection".
	# NOTE(review): assumes `selection` is coded 0/1 -- confirm.
	dict_results |>
	  # quanteda is never attached in this script, so qualify docvars()
	  add_column(gold = quanteda::docvars(pimpo_corpus, "selection")) |>
	  pivot_longer(seed:manual08) |>
	  mutate(
	    # Any hit counts as a positive prediction. With value > 1, documents
	    # with exactly one hit fell into none of the four cells.
	    tp = as.numeric(value > 0 & gold == 1),
	    fp = as.numeric(value > 0 & gold == 0),
	    fn = as.numeric(value == 0 & gold == 1),
	    tn = as.numeric(value == 0 & gold == 0)
	  ) |>
	  group_by(name) |>
	  summarize(
	    pr = sum(tp) / sum(tp + fp),
	    re = sum(tp) / sum(tp + fn),
	    # harmonic mean; the multiplication signs were lost in this copy
	    f1 = 2 * pr * re / (pr + re)
	  )


	# Thanks!!!!
	# Wouter van Atteveldt, Dafne van Kuppevelt, Kasper Welbers
	# github.com/vanatteveldt/CAVA
	# wouter.van.atteveldt@vu.nl