# jmclawson/corpus_micusp.R

## corpus_micusp.R
# helper function get_if_needed for downloading online documents exactly once: https://gist.github.com/jmclawson/65899e2de6bfee692b08141a98422240
source("https://gist.githubusercontent.com/jmclawson/65899e2de6bfee692b08141a98422240/raw/7c5590377332e427691f2331b69abd58be2141ec/get_if_needed.R")

#' Fetch and load the MICUSP metadata table
#'
#' Asks `get_if_needed()` to place the metadata CSV export from
#' elicorpora.info in `micusp_dir` (that helper is described in this file as
#' downloading each online document only once), then reads the local copy.
#'
#' @param micusp_dir Directory in which `micusp_metadata.csv` is stored.
#' @return A tibble of the metadata with column names normalised by
#'   `janitor::clean_names()`.
get_micusp_metadata <- function(micusp_dir = "micusp") {
  metadata_file <- "micusp_metadata.csv"

  get_if_needed(
    "https://elicorpora.info/browse?mode=download&start=1&sort=dept&direction=desc",
    filename = metadata_file,
    destdir = micusp_dir
  )

  # Read from the directory the file was saved to, not a hard-coded "micusp/"
  # path, so that a non-default `micusp_dir` works.
  readr::read_csv(
    file.path(micusp_dir, metadata_file),
    show_col_types = FALSE
  ) |>
    janitor::clean_names()
}

#' Extract and cache the plain text of one MICUSP paper
#'
#' Builds the paper's file names by replacing every "." in `paperid` with "_".
#' If no cached `.txt` file exists in `textdir`, the text of the
#' `div#paperBody` element is pulled from the saved HTML page in `htmldir` and
#' written to the cache. The cached file is then read back.
#'
#' @param paperid A single paper identifier.
#' @param htmldir Directory holding the saved HTML pages.
#' @param textdir Directory for cached plain-text files; created (including
#'   missing parent directories) when absent.
#' @return A length-one character vector with the paper's lines joined by
#'   "\n", or `NA_character_` (with a warning) when the HTML page has no
#'   `div#paperBody` element.
parse_micusp_paper <- function(paperid,
                               htmldir = "micusp/corpus_html",
                               textdir = "micusp/corpus") {
  file_stem <- stringr::str_replace_all(paperid, "[.]", "_")
  filename_text <- file.path(textdir, paste0(file_stem, ".txt"))
  filename_html <- file.path(htmldir, paste0(file_stem, ".html"))

  if (!file.exists(filename_text)) {
    paper_text <- filename_html |>
      rvest::read_html() |>
      rvest::html_element(css = "div#paperBody") |>
      rvest::html_text()

    # A page without the expected element yields NA; do not cache that as the
    # literal text "NA".
    if (is.na(paper_text)) {
      warning(
        "No div#paperBody element found in ", filename_html,
        call. = FALSE
      )
      return(NA_character_)
    }

    # recursive = TRUE so a nested cache directory can be created in one go.
    if (!dir.exists(textdir)) {
      dir.create(textdir, recursive = TRUE)
    }
    readr::write_lines(paper_text, filename_text)
  }

  # Always return what is in the cache so first and later calls agree.
  readr::read_lines(filename_text) |>
    paste0(collapse = "\n")
}

#' Build a data frame of MICUSP papers with their full text
#'
#' Filters the metadata, asks `get_if_needed()` to save each matching paper's
#' HTML page under "micusp/corpus_html", and adds the text returned by
#' `parse_micusp_paper()` as a `text` column.
#'
#' @param ... Filtering expressions forwarded to `dplyr::filter()` and
#'   evaluated against the metadata columns.
#' @return The filtered metadata as a row-wise data frame (see
#'   `dplyr::rowwise()`) with an added character column `text`. Call
#'   `dplyr::ungroup()` on the result before column-wise summaries.
get_micusp_corpus <- function(...) {
  the_df <- get_micusp_metadata() |>
    dplyr::filter(...)

  # With no matching papers there is nothing to fetch. Returning early matters:
  # paste0() turns a zero-length id vector into one bogus URL and file name.
  if (nrow(the_df) == 0L) {
    return(
      the_df |>
        dplyr::mutate(text = character()) |>
        dplyr::rowwise()
    )
  }

  paper_ids <- dplyr::pull(the_df, paper_id)
  the_urls <- paste0("https://elicorpora.info/view?pid=", paper_ids)
  the_filenames <- paper_ids |>
    stringr::str_replace_all("[.]", "_") |>
    paste0(".html")

  # Pass the vectors directly; piping `the_urls` in as well would hand it to
  # the callback as a stray extra argument.
  purrr::walk2(
    the_urls,
    the_filenames,
    \(url, filename) {
      get_if_needed(url, filename, destdir = "micusp/corpus_html")
    }
  )

  the_df |>
    dplyr::rowwise() |>
    dplyr::mutate(text = parse_micusp_paper(paper_id))
}
	# helper function get_if_needed for downloading online documents exactly once: https://gist.github.com/jmclawson/65899e2de6bfee692b08141a98422240
	source("https://gist.githubusercontent.com/jmclawson/65899e2de6bfee692b08141a98422240/raw/7c5590377332e427691f2331b69abd58be2141ec/get_if_needed.R")

	#' Fetch and load the MICUSP metadata table
	#'
	#' Asks `get_if_needed()` to place the metadata CSV export from
	#' elicorpora.info in `micusp_dir` (that helper is described in this file as
	#' downloading each online document only once), then reads the local copy.
	#'
	#' @param micusp_dir Directory in which `micusp_metadata.csv` is stored.
	#' @return A tibble of the metadata with column names normalised by
	#'   `janitor::clean_names()`.
	get_micusp_metadata <- function(micusp_dir = "micusp") {
	  metadata_file <- "micusp_metadata.csv"

	  get_if_needed(
	    "https://elicorpora.info/browse?mode=download&start=1&sort=dept&direction=desc",
	    filename = metadata_file,
	    destdir = micusp_dir
	  )

	  # Read from the directory the file was saved to, not a hard-coded
	  # "micusp/" path, so that a non-default `micusp_dir` works.
	  readr::read_csv(
	    file.path(micusp_dir, metadata_file),
	    show_col_types = FALSE
	  ) |>
	    janitor::clean_names()
	}

	#' Extract and cache the plain text of one MICUSP paper
	#'
	#' Builds the paper's file names by replacing every "." in `paperid` with
	#' "_". If no cached `.txt` file exists in `textdir`, the text of the
	#' `div#paperBody` element is pulled from the saved HTML page in `htmldir`
	#' and written to the cache. The cached file is then read back.
	#'
	#' @param paperid A single paper identifier.
	#' @param htmldir Directory holding the saved HTML pages.
	#' @param textdir Directory for cached plain-text files; created (including
	#'   missing parent directories) when absent.
	#' @return A length-one character vector with the paper's lines joined by
	#'   "\n", or `NA_character_` (with a warning) when the HTML page has no
	#'   `div#paperBody` element.
	parse_micusp_paper <- function(paperid,
	                               htmldir = "micusp/corpus_html",
	                               textdir = "micusp/corpus") {
	  file_stem <- stringr::str_replace_all(paperid, "[.]", "_")
	  filename_text <- file.path(textdir, paste0(file_stem, ".txt"))
	  filename_html <- file.path(htmldir, paste0(file_stem, ".html"))

	  if (!file.exists(filename_text)) {
	    paper_text <- filename_html |>
	      rvest::read_html() |>
	      rvest::html_element(css = "div#paperBody") |>
	      rvest::html_text()

	    # A page without the expected element yields NA; do not cache that as
	    # the literal text "NA".
	    if (is.na(paper_text)) {
	      warning(
	        "No div#paperBody element found in ", filename_html,
	        call. = FALSE
	      )
	      return(NA_character_)
	    }

	    # recursive = TRUE so a nested cache directory can be created in one go.
	    if (!dir.exists(textdir)) {
	      dir.create(textdir, recursive = TRUE)
	    }
	    readr::write_lines(paper_text, filename_text)
	  }

	  # Always return what is in the cache so first and later calls agree.
	  readr::read_lines(filename_text) |>
	    paste0(collapse = "\n")
	}

	#' Build a data frame of MICUSP papers with their full text
	#'
	#' Filters the metadata, asks `get_if_needed()` to save each matching paper's
	#' HTML page under "micusp/corpus_html", and adds the text returned by
	#' `parse_micusp_paper()` as a `text` column.
	#'
	#' @param ... Filtering expressions forwarded to `dplyr::filter()` and
	#'   evaluated against the metadata columns.
	#' @return The filtered metadata as a row-wise data frame (see
	#'   `dplyr::rowwise()`) with an added character column `text`. Call
	#'   `dplyr::ungroup()` on the result before column-wise summaries.
	get_micusp_corpus <- function(...) {
	  the_df <- get_micusp_metadata() |>
	    dplyr::filter(...)

	  # With no matching papers there is nothing to fetch. Returning early
	  # matters: paste0() turns a zero-length id vector into one bogus URL and
	  # file name.
	  if (nrow(the_df) == 0L) {
	    return(
	      the_df |>
	        dplyr::mutate(text = character()) |>
	        dplyr::rowwise()
	    )
	  }

	  paper_ids <- dplyr::pull(the_df, paper_id)
	  the_urls <- paste0("https://elicorpora.info/view?pid=", paper_ids)
	  the_filenames <- paper_ids |>
	    stringr::str_replace_all("[.]", "_") |>
	    paste0(".html")

	  # Pass the vectors directly; piping `the_urls` in as well would hand it to
	  # the callback as a stray extra argument.
	  purrr::walk2(
	    the_urls,
	    the_filenames,
	    \(url, filename) {
	      get_if_needed(url, filename, destdir = "micusp/corpus_html")
	    }
	  )

	  the_df |>
	    dplyr::rowwise() |>
	    dplyr::mutate(text = parse_micusp_paper(paper_id))
	}