George Moroz agricolamz

## textgrid_analysis_for_Natasha.R
library(tidyverse)
library(phonfieldwork)
library(writexl)

read_from_folder(getwd()) |>
  mutate(content = str_squish(content)) |>
  filter(content != "") |>
  mutate(content = str_squish(content),
         word = ifelse(tier == 1, str_c(content, "_", time_start, "_", time_end), NA)) |>
  fill(word) |>

## collacation_analysis.R
# based on Stefan Evert's lecture

library(tidyverse)
library(tidytext)
library(stopwords)

# create stopwords list ---------------------------------------------------

map(stopwords_getsources()[-c(3:4, 6, 8)], function(i){
  stopwords(language = "ru", source = i)

## get_list_of_articles_and_abstracts.R
library(tidyverse)
library(scholar)

# Profile id passed to the scholar helpers below
# (presumably a Google Scholar user id -- confirm).
my_id <- "ka_iMFQAAAAJ"

# Publication table for that profile; its `pubid` column drives the
# per-publication abstract lookup further down.
my_pubs <- get_publications(my_id)

map_chr(my_pubs$pubid, function(i){
  res <- get_publication_abstract(id = my_id, pub_id = i)
  ifelse(length(res) > 0, res, "")

## bib2tsv.R
library(tidyverse)
library(bib2df)
# Export the BibTeX entries as a flat TSV. AUTHOR and EDITOR may hold several
# names per entry, so each row's names are joined into one " and "-separated
# string (row by row, hence rowwise()) before the table is written.
bib2df("verbal_negation.bib") |>
  rowwise() |>
  mutate(AUTHOR = str_c(AUTHOR, collapse = " and ")) |>
  mutate(EDITOR = str_c(EDITOR, collapse = " and ")) |>
  write_tsv("verbal_negation.tsv")

bib2df("verbal_negation_spec.bib")  |>
  rowwise() |>

## moving_embedings.R
library(tidyverse)

# Load the text one line per row, normalise whitespace, and drop the lines that
# start with "ДЕЙСТВИЕ" or "Явление" (act / scene headings) as well as empty
# lines. Every remaining line gets a running integer `id`.
revizor <- read_lines("revizor.txt") |>
  str_squish() |>
  tibble(text = _) |>
  filter(!str_detect(text, "^ДЕЙСТВИЕ"),
         !str_detect(text, "^Явление"),
         text != "") |>
  # row_number() instead of 1:n(): with zero rows 1:0 is c(1, 0), which makes
  # mutate() fail with a size mismatch; row_number() yields integer(0).
  mutate(id = row_number())

## convert_by_segment.R
library(tidyverse)
library(phonfieldwork)

# TextGrid files in the working directory. `pattern` is a regular expression,
# so it is anchored to the ".TextGrid" extension; a bare "TextGrid" would also
# pick up any other file whose name merely contains that substring (backups,
# scripts, exports), and those are not valid input for textgrid_to_df().
files <- list.files(pattern = "\\.TextGrid$")
walk(files, function(file){
  textgrid <- textgrid_to_df(file)
  textgrid |>
    mutate(content = str_extract(content, "^.*?-"),
           content = str_remove(content, "-"),
           content = str_replace_all(content, "SS", "ss"),

## parse_khan.R
library(tidyverse)
t <- pdftools::pdf_ocr_text("Khan 2008 Jewish Neo-Aramaic Dialect of Urmi-465-497.pdf")

tibble(text = str_split(t, "\n\n") |> unlist()) |>
  filter(!str_detect(text, "GLOSSARY OF VERBS"),
         nchar(text) > 4) |>
  slice(-c(1:2)) |>
  mutate(verb = str_extract(text, "\\S{1,}\\s"),
         verb = str_squish(verb),
         text = str_remove_all(text, "\n")) |>

## code.R
library(tidyverse)

# Frequency table of the `dedication` column, most frequent value first.
count(read_csv("russian_libraries_subdataset.csv"), dedication, sort = TRUE)

## speech_to_text_with_audio.whisper.R
speech_to_text <- function(audio,
                           output_name = "output",
                           model_path = "ggml-large-v3.bin"){
  library(tidyverse)
  library(audio.whisper)

# convert to the format specs ---------------------------------------------
  tmp <- tempdir()
  str_glue("ffmpeg -i {audio} -ar 16000 -ac 1 -c:a pcm_s16le {tmp}/{output_name}.wav") |>
    system()

## stri_trans_general.R
library(stringi)

coresp <- "
    ῶι > ῷ;
    ωι > ῳ;
    ὧι > ᾧ;
    ὦι > ᾦ;
    ηι > ῃ;
    ῆι > ῇ;
    ἦι > ᾖ;
	library(tidyverse)
	library(phonfieldwork)
	library(writexl)

	read_from_folder(getwd()) \|>
	mutate(content = str_squish(content)) \|>
	filter(content != "") \|>
	mutate(content = str_squish(content),
	word = ifelse(tier == 1, str_c(content, "_", time_start, "_", time_end), NA)) \|>
	fill(word) \|>
	# based on Stefan Evert's lecture

	library(tidyverse)
	library(tidytext)
	library(stopwords)

	# create stopwords list ---------------------------------------------------

	map(stopwords_getsources()[-c(3:4, 6, 8)], function(i){
	stopwords(language = "ru", source = i)
	library(tidyverse)
	library(scholar)

	my_id <- "ka_iMFQAAAAJ"

	my_pubs <- get_publications(my_id)

	map_chr(my_pubs$pubid, function(i){
	res <- get_publication_abstract(id = my_id, pub_id = i)
	ifelse(length(res) > 0, res, "")
	library(tidyverse)
	library(bib2df)
	bib2df("verbal_negation.bib") \|>
	rowwise() \|>
	mutate(AUTHOR = str_c(AUTHOR, collapse =" and "),
	EDITOR = str_c(EDITOR, collapse =" and ")) \|>
	write_tsv("verbal_negation.tsv")

	bib2df("verbal_negation_spec.bib") \|>
	rowwise() \|>
	library(tidyverse)

	read_lines("revizor.txt") \|>
	str_squish() \|>
	tibble(text = _) \|>
	filter(!str_detect(text, "^ДЕЙСТВИЕ"),
	!str_detect(text, "^Явление"),
	text != "") \|>
	mutate(id = 1:n()) ->
	revizor
	library(tidyverse)
	library(phonfieldwork)

	files <- list.files(pattern = "TextGrid")
	walk(files, function(file){
	textgrid <- textgrid_to_df(file)
	textgrid \|>
	mutate(content = str_extract(content, "^.*?-"),
	content = str_remove(content, "-"),
	content = str_replace_all(content, "SS", "ss"),
	library(tidyverse)
	t <- pdftools::pdf_ocr_text("Khan 2008 Jewish Neo-Aramaic Dialect of Urmi-465-497.pdf")

	tibble(text = str_split(t, "\n\n") \|> unlist()) \|>
	filter(!str_detect(text, "GLOSSARY OF VERBS"),
	nchar(text) > 4) \|>
	slice(-c(1:2)) \|>
	mutate(verb = str_extract(text, "\\S{1,}\\s"),
	verb = str_squish(verb),
	text = str_remove_all(text, "\n")) \|>
	library(tidyverse)

	read_csv("russian_libraries_subdataset.csv") \|>
	count(dedication, sort = TRUE)
	speech_to_text <- function(audio,
	output_name = "output",
	model_path = "ggml-large-v3.bin"){
	library(tidyverse)
	library(audio.whisper)

	# convert to the format specs ---------------------------------------------
	tmp <- tempdir()
	str_glue("ffmpeg -i {audio} -ar 16000 -ac 1 -c:a pcm_s16le {tmp}/{output_name}.wav") \|>
	system()
	library(stringi)

	coresp <- "
	ῶι > ῷ;
	ωι > ῳ;
	ὧι > ᾧ;
	ὦι > ᾦ;
	ηι > ῃ;
	ῆι > ῇ;
	ἦι > ᾖ;