Created
October 17, 2020 09:45
-
-
Save rexarski/6a6c4ef5a41b1f1f7de4ea33e1dd9d64 to your computer and use it in GitHub Desktop.
Read all documents within the sub-folders of a folder into a single data frame. Reference: https://www.tidytextmining.com/usenet.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(tidyr) | |
library(purrr) | |
library(readr) | |
# Root folder that is scanned below; each entry directly inside it is passed
# to read_folder().
training_folder <- "data-folder"
# Read every file located directly inside `infolder` into one long tibble.
#
# Args:
#   infolder: path of the folder whose files should be read.
#
# Returns:
#   A tibble with one row per line of text: `id` is the base name of the file
#   the line came from and `text` is the line itself, as returned by
#   read_lines().
read_folder <- function(infolder) {
  paths <- dir(infolder, full.names = TRUE)
  # dir() lists sub-directories as well as files, and read_lines() cannot open
  # a directory, so keep only the entries that are not directories.
  paths <- paths[!dir.exists(paths)]

  tibble(file = paths) %>%
    mutate(text = map(file, read_lines)) %>%
    transmute(id = basename(file), text) %>%
    unnest(cols = text)
}
# List the entries of `training_folder`, run read_folder() on each of them,
# and flatten the per-folder results into a single tibble. `newsgroup` is the
# base name of the entry a line came from; `id` and `text` are the columns
# produced by read_folder().
raw_text <- training_folder %>%
  list.files(full.names = TRUE) %>%
  tibble(folder = ., folder_out = map(folder, read_folder)) %>%
  unnest(cols = folder_out) %>%
  transmute(newsgroup = basename(folder), id, text)

# Show the combined result.
raw_text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment