Skip to content

Instantly share code, notes, and snippets.

@jmclawson
Last active May 4, 2023 13:44
Show Gist options
  • Save jmclawson/0773f1200ee4ec47cf25f0a2acecaa26 to your computer and use it in GitHub Desktop.
Save jmclawson/0773f1200ee4ec47cf25f0a2acecaa26 to your computer and use it in GitHub Desktop.
Reads all files in a directory that match a certain naming pattern, returning a one-word-per-row table, with stanza and line numbers for poetry and with paragraph numbers for prose. Set word=FALSE to retain one line per row.
# library(tidyverse)
# library(tidytext)
##### Use the following function for reading a folder of prose text files. #####
## Put all the text files you want to read in the same folder. If that folder's
## called, for example, "project2", here's the function in practice:
##
## my_table <- tidy_prose_texts(folder = "project2")
##
## Read every file in `folder` whose name matches `name` and combine them into
## one tidy table.
##
## Arguments:
##   folder  Directory that holds the text files.
##   name    Regular expression passed to list.files(pattern = ) to choose
##           which files are read.
##   word    TRUE (default) returns one word per row via unnest_tokens();
##           FALSE keeps one non-blank line per row in a `text` column.
##
## Value: a tibble with the columns `document` (file name without its final
## extension), `par_num` (paragraph number within the document), and either
## `word` or `text`. If no file matches, a warning is raised and a zero-row
## tibble with those columns is returned.
tidy_prose_texts <- function(folder = "data", name = ".txt", word = TRUE) {
  tidy_one_text <- function(file, directory) {
    text <- readLines(file.path(directory, file))
    is_blank <- text == ""
    # A paragraph starts on a non-blank line that either opens the file or
    # follows a blank line. Counting starts (rather than blank lines) keeps the
    # numbering at 1, 2, 3, ... even when a file begins with a blank line or
    # separates paragraphs with several blank lines.
    starts_paragraph <- !is_blank & c(TRUE, head(is_blank, -1))
    tibble(
      # Strip only the final extension so "vol.1.txt" and "vol.2.txt" remain
      # distinct documents.
      document = tools::file_path_sans_ext(file),
      par_num = as.numeric(cumsum(starts_paragraph)),
      text = text
    ) |>
      filter(text != "")
  }

  the_files <- list.files(path = folder, pattern = name)

  if (length(the_files) == 0) {
    warning(
      "No files in '", folder, "' match the pattern '", name, "'.",
      call. = FALSE
    )
    empty <- tibble(document = character(), par_num = numeric())
    empty[[if (word) "word" else "text"]] <- character()
    return(empty)
  }

  full_prose <- bind_rows(lapply(the_files, tidy_one_text, directory = folder))

  if (word) {
    full_prose <- full_prose |>
      unnest_tokens(word, text)
  }

  full_prose
}
##### Use the following function for reading a folder of poetry text files. #####
## Put all the text files you want to read in the same folder. If that folder's
## called, for example, "project2", here's the function in practice:
##
## my_table <- tidy_poetry_texts(folder = "project2")
##
## Read every file in `folder` whose name matches `name` and combine them into
## one tidy table of poetry.
##
## Arguments:
##   folder  Directory that holds the text files.
##   name    Regular expression passed to list.files(pattern = ) to choose
##           which files are read.
##   word    TRUE (default) returns one word per row via unnest_tokens();
##           FALSE keeps one non-blank line per row in a `text` column.
##
## Value: a tibble with the columns `document` (file name without its final
## extension), `stanza_num` (stanza number within the document), `line_num`
## (running count of non-blank lines within the document), and either `word`
## or `text`. If no file matches, a warning is raised and a zero-row tibble
## with those columns is returned.
tidy_poetry_texts <- function(folder = "data", name = ".txt", word = TRUE) {
  tidy_one_text <- function(file, directory) {
    text <- readLines(file.path(directory, file))
    is_blank <- text == ""
    # A stanza starts on a non-blank line that either opens the file or follows
    # a blank line. Counting starts (rather than blank lines) keeps the
    # numbering at 1, 2, 3, ... even when a file begins with a blank line or
    # separates stanzas with several blank lines.
    starts_stanza <- !is_blank & c(TRUE, head(is_blank, -1))
    tibble(
      # Strip only the final extension so "poem.v1.txt" and "poem.v2.txt"
      # remain distinct documents.
      document = tools::file_path_sans_ext(file),
      stanza_num = as.numeric(cumsum(starts_stanza)),
      text = text
    ) |>
      filter(text != "") |>
      # Number lines only after blank lines are gone, so they are not counted.
      mutate(line_num = row_number(), .after = stanza_num)
  }

  the_files <- list.files(path = folder, pattern = name)

  if (length(the_files) == 0) {
    warning(
      "No files in '", folder, "' match the pattern '", name, "'.",
      call. = FALSE
    )
    empty <- tibble(
      document = character(),
      stanza_num = numeric(),
      line_num = integer()
    )
    empty[[if (word) "word" else "text"]] <- character()
    return(empty)
  }

  full_poetry <- bind_rows(lapply(the_files, tidy_one_text, directory = folder))

  if (word) {
    full_poetry <- full_poetry |>
      unnest_tokens(word, text)
  }

  full_poetry
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment