Created
April 17, 2024 12:19
-
-
Save giocomai/7205127b129b02e9dd37c265a1641194 to your computer and use it in GitHub Desktop.
Explore zavtra.ru dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Dataset landing page:
# https://discuss-data.net/dataset/dae0b5b8-157f-471a-bb3c-ea24e0b2a7f7/
# Install castarter from GitHub if it is not available yet:
# remotes::install_github("giocomai/castarter")
library("castarter")

# Corpus identifier; the compressed CSV file name is derived from it.
corpus_name <- "zavtra.ru_ru_2024"
corpus_file <- paste0(corpus_name, ".csv.gz")
## Step 1: Download the corpus ####

# Download `zavtra.ru_ru_2024.csv.gz` from the main repository,
# or rely on the following to download the latest version from GitHub.
library("piggyback")

# Fetch the corpus file from the release tagged with the corpus name.
pb_download(
  file = corpus_file,
  repo = "giocomai/tadadit",
  tag = corpus_name
)
## Step 2: Explore the corpus in an interactive web interface ####

# Load the compressed CSV downloaded in step 1 into memory.
corpus_df <- readr::read_csv(file = corpus_file)

# The default pattern is a regular expression: "коллективн" followed by one or
# more lowercase Cyrillic letters in the а-я range, a space, and "Запад".
castarter::cas_explorer(
  corpus = corpus_df,
  default_pattern = "коллективн[а-я]+ Запад",
  title = corpus_name
)
## Want a speed boost, and use less RAM on your local device?
## Convert corpus to the parquet format, and read via arrow:

# Write the corpus, sorted by its `date` column, into a folder named after the
# corpus. `corpus_df` is already an in-memory data frame, so no `collect()`
# step is needed before writing.
cas_write_corpus(
  corpus = dplyr::arrange(corpus_df, date),
  partition = "year",
  path = corpus_name
)

# Open the folder written above; reusing `corpus_name` keeps the read path in
# sync with the write path if the corpus identifier ever changes.
castarter::cas_explorer(
  corpus = arrow::open_dataset(corpus_name),
  default_pattern = "коллективн[а-я]+ Запад",
  title = corpus_name
)
## Count matches in R

# Run the count once and reuse it, rather than scanning the corpus twice.
pattern_count <- cas_count(
  corpus = corpus_df,
  pattern = "коллективн[а-я]+ Запад"
)

# Show the counts as returned by `cas_count()`.
pattern_count

# Show the same counts summarised by year.
pattern_count |>
  cas_summarise(period = "year")

## Or process this corpus with your favourite tools!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment