Skip to content

Instantly share code, notes, and snippets.

View vanatteveldt's full-sized avatar

Wouter van Atteveldt vanatteveldt

  • VU University
  • Amsterdam
View GitHub Profile
library(tidyverse)
elections = tribble(
~year,~state_po,~party_Detailed,~candidatevotes,~totalvotes,
1976,"AL","DEMOCRAT",500,1000,
1976,"AL","REPUBLICAN",450,1000,
1976,"AL","x",30,1000,
1976,"AL","x",10,1000,
1976,"AL","x",7,1000,
1976,"AL","x",2,1000,
# Install the parsing packages only when they are not available yet, so that
# re-running the script does not reinstall them every time
if (!requireNamespace("udpipe", quietly = TRUE)) install.packages("udpipe")
if (!requireNamespace("rsyntax", quietly = TRUE)) install.packages("rsyntax")
library(udpipe)
library(rsyntax)  # as_tokenindex() and plot_tree() come from rsyntax, not udpipe

# Parse an example sentence with the 'english' udpipe model and convert the
# parser output into an rsyntax token index
tokens = udpipe('John Doe, who is a great guy, said yesterday that all was well', 'english') |>
  as_tokenindex()

# Plot the dependency tree, showing the token, lemma and upos columns
plot_tree(tokens, token, lemma, upos)
verbs = c("tell", "show", "acknowledge", "admit", "affirm", "allege",
"announce", "assert", "attest", "avow", "call", "claim", "comment",
# Plot histogram and normal curve for (simulated) data
library(tidyverse)
library(moments)
library(glue)
plot_distribution = function (x) {
m = mean(x)
sd = sd(x)
skewness=skewness(x)
#' Run an SVD for collaborative filtering and process the results to be more tidyverse-friendly
#' @param ratingsmatrix An item-user review matrix
#' @param ndimensions the number of dimensions to use, defaults to 10
#' @return a list with the original u, d, and v matrices from the svd function and
#' item_values - a long-format tibble with the values per item per dimension
#' user_values - a long-format tibble with the values per user per dimension
#' predictions - a long-format tibble with the predictions per user per item
#' @note (c) 2022 Wouter van Atteveldt, license: CC-0
run_svd = function(ratingsmatrix, ndimensions=10) {
cat("Arrr")
# demo: scraping
library(httr)
library(tidyverse)  # str_sub() and as_tibble() below are tidyverse functions

# Request the dataset from the CBS open data OData endpoint
r = GET('https://opendata.cbs.nl/ODataApi/odata/85275NED/UntypedDataSet')

# Show the HTTP status code of the response
r$status_code

# Print only the first 500 characters of the raw response body
content(r, as="text") |>
  str_sub(end=500) |>
  cat()

# Let httr parse the response body, then convert the result to a tibble
d = content(r, as="parsed") |>
  as_tibble()
# demo: cbs data
library(tidyverse)
library(readtext)
library(quanteda)
library(topicmodels)
# Read every .tex file found in the chapter directories
d = readtext("/home/wva/ccsbook/chapter*/*.tex")

# Build a document-feature matrix with one document per paragraph:
# reshape the corpus to paragraphs, tokenize while dropping symbols and
# punctuation, remove tokens matching the non-word regex "\\W", then count.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
dfm = corpus(d) |>
  corpus_reshape(to="paragraphs") |>
  tokens(split_tags=FALSE, remove_symbols=TRUE, remove_punct=TRUE) |>
  tokens_remove("\\W", valuetype="regex") |>
  dfm()

# Fix the random seed so that subsequent random steps are reproducible
set.seed(123)
m = dfm |> dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
max_docfreq = 0.1, docfreq_type = "prop") |>
library(tidyverse)
# Download the "1976-2020-president.tab" file of the given Dataverse dataset
# from the Harvard Dataverse server
elections <- dataverse::get_dataframe_by_name(
  filename = "1976-2020-president.tab",
  dataset = "doi:10.7910/DVN/42MVDX",
  server = "dataverse.harvard.edu"
)

# One row per distinct (state, total votes) combination for the 2020 election
totals <- elections |>
  filter(year == 2020) |>
  select(state_po, totalvotes) |>
  distinct()
d = elections |>
filter(year %in% c(2016,2020), party_simplified == "DEMOCRAT", candidatevotes > 1000) |>
mutate(percentage =candidatevotes / totalvotes * 100) |>
# Demo 1: rtweet and word clouds
# install.packages("rtweet")
library(tidyverse)
library(rtweet)
library(quanteda)
library(quanteda.textplots)
library(RColorBrewer)
auth_setup_default()
@vanatteveldt
vanatteveldt / atteveldt_icademo.r
Created May 30, 2022 09:55
ICA Demo of CAVA dictionary tools by Wouter van Atteveldt, Dafne van Kuppevelt, and Kasper Welbers
####################################################
# #
# Bonjour a tous! #
# #
# Ca va‽️ #
# #
####################################################
# Embedding-based tools for (semi-)automatic dictionary