Skip to content

Instantly share code, notes, and snippets.

View agricolamz's full-sized avatar

George Moroz agricolamz

View GitHub Profile
library(tidyverse)
library(phonfieldwork)
library(writexl)
read_from_folder(getwd()) |>
mutate(content = str_squish(content)) |>
filter(content != "") |>
mutate(content = str_squish(content),
word = ifelse(tier == 1, str_c(content, "_", time_start, "_", time_end), NA)) |>
fill(word) |>
# based on Stefan Evert's lecture
library(tidyverse)
library(tidytext)
library(stopwords)
# create stopwords list ---------------------------------------------------
map(stopwords_getsources()[-c(3:4, 6, 8)], function(i){
stopwords(language = "ru", source = i)
library(tidyverse)
library(scholar)
my_id <- "ka_iMFQAAAAJ"
my_pubs <- get_publications(my_id)
map_chr(my_pubs$pubid, function(i){
res <- get_publication_abstract(id = my_id, pub_id = i)
ifelse(length(res) > 0, res, "")
library(tidyverse)
library(bib2df)
# Export the bibliography as a TSV file.
# `rowwise()` makes AUTHOR / EDITOR refer to one entry's values at a time,
# so each entry's names are joined into a single " and "-separated string
# before the table is written out.
bib2df("verbal_negation.bib") |>
  rowwise() |>
  mutate(AUTHOR = str_c(AUTHOR, collapse = " and ")) |>
  mutate(EDITOR = str_c(EDITOR, collapse = " and ")) |>
  write_tsv("verbal_negation.tsv")
bib2df("verbal_negation_spec.bib") |>
rowwise() |>
library(tidyverse)
# Build `revizor`: one row per retained line of "revizor.txt".
#   * whitespace in every line is normalised with str_squish();
#   * lines starting with "ДЕЙСТВИЕ" or "Явление" (Russian for "act" and
#     "scene") and empty lines are dropped;
#   * `id` numbers the remaining lines consecutively.
# row_number() is used instead of 1:n() because 1:n() evaluates to c(1, 0)
# when no rows are left, which makes mutate() fail on a size mismatch.
revizor <- read_lines("revizor.txt") |>
  str_squish() |>
  tibble(text = _) |>
  filter(
    !str_detect(text, "^ДЕЙСТВИЕ"),
    !str_detect(text, "^Явление"),
    text != ""
  ) |>
  mutate(id = row_number())
library(tidyverse)
library(phonfieldwork)
files <- list.files(pattern = "TextGrid")
walk(files, function(file){
textgrid <- textgrid_to_df(file)
textgrid |>
mutate(content = str_extract(content, "^.*?-"),
content = str_remove(content, "-"),
content = str_replace_all(content, "SS", "ss"),
library(tidyverse)
t <- pdftools::pdf_ocr_text("Khan 2008 Jewish Neo-Aramaic Dialect of Urmi-465-497.pdf")
tibble(text = str_split(t, "\n\n") |> unlist()) |>
filter(!str_detect(text, "GLOSSARY OF VERBS"),
nchar(text) > 4) |>
slice(-c(1:2)) |>
mutate(verb = str_extract(text, "\\S{1,}\\s"),
verb = str_squish(verb),
text = str_remove_all(text, "\n")) |>
library(tidyverse)
# Tally the rows for each `dedication` value and list the most frequent
# values first.
read_csv("russian_libraries_subdataset.csv") |>
  count(dedication) |>
  arrange(desc(n))
speech_to_text <- function(audio,
output_name = "output",
model_path = "ggml-large-v3.bin"){
library(tidyverse)
library(audio.whisper)
# convert to the format specs ---------------------------------------------
tmp <- tempdir()
str_glue("ffmpeg -i {audio} -ar 16000 -ac 1 -c:a pcm_s16le {tmp}/{output_name}.wav") |>
system()
library(stringi)
coresp <- "
ῶι > ῷ;
ωι > ῳ;
ὧι > ᾧ;
ὦι > ᾦ;
ηι > ῃ;
ῆι > ῇ;
ἦι > ᾖ;