Skip to content

Instantly share code, notes, and snippets.

@TimTaylor
Last active May 23, 2022 11:22
Show Gist options
  • Save TimTaylor/3d8b8d4ac2056cbff25afe83fdd2d2bd to your computer and use it in GitHub Desktop.
Save TimTaylor/3d8b8d4ac2056cbff25afe83fdd2d2bd to your computer and use it in GitHub Desktop.
nerd-sniped on slack
# Note the following is not 100% foolproof as it assumes the table entries are all on one line.
# This will need refining to deal with when this is not the case but may be sufficient for the
# question.
# h/t to https://github.com/hrbrmstr/fish-stocking-pdf-data-wrangling/blob/main/main.R
# whose code I'd seen just the other day doing a similar kind of thing
library(rvest)
library(pdftools)
# Create the folder that stores the downloaded reports; the warning raised
# when the folder already exists is silenced.
dir.create("./reports", showWarnings = FALSE)

# Read the page that lists the reports.
pg <- read_html("https://www.ryedale.gov.uk/information/planning/planning-policy/ryedale/review-of-the-ryedale-plan/call-for-sites/submitted-sites/")

# Extract the PDF urls: the href of every link whose target contains "Site-".
report_urls <- pg |>
  html_nodes(xpath = ".//a[contains(@href, 'Site-')]") |>
  html_attr("href")

# Download each report into ./reports, naming the file after the last path
# component of its URL.
for (r in report_urls) {
  download.file(
    url = URLencode(r),
    destfile = file.path("./reports", basename(r)),
    method = "libcurl",
    # The reports are binary files; without "wb" a text-mode transfer
    # corrupts them on Windows.
    mode = "wb"
  )
  Sys.sleep(0.5) # to be kind
}
# Function to get meta data from a loaded report.
#
# Arguments:
#   dat      character vector of text lines to search.
#   pattern  regular expression (interpreted as PCRE) matching the field
#            label, e.g. "Parish:[[:space:]]+".
#
# Returns a character vector with one element per matching line: the line
# with the first occurrence of `pattern` removed and surrounding whitespace
# trimmed. When no line matches, NA_character_ is returned instead of a
# zero-length vector so the result can still be placed in a one-row data frame.
get_meta <- function(dat, pattern) {
  hits <- grep(pattern, dat, perl = TRUE, value = TRUE)
  if (length(hits) == 0L) {
    return(NA_character_)
  }
  # Use the same regex engine as grep() above so the pattern is interpreted
  # identically when the label is stripped.
  trimws(sub(pattern, "", hits, perl = TRUE))
}
# Extract the meta information: build one data frame per file found in
# ./reports, then stack them and write the result to meta.csv.
meta <- lapply(
  list.files("./reports", full.names = TRUE),
  function(x) {
    # Only the first element of the pdf_text() result is searched
    # (presumably the first page of the report -- confirm against pdftools).
    first_page <- pdftools::pdf_text(x)[[1]]
    # Split into lines on the literal newline character.
    txt <- strsplit(first_page, "\n", fixed = TRUE)[[1]]
    data.frame(
      parish = get_meta(txt, "Parish:[[:space:]]+"),
      site_number = get_meta(txt, "Site No:[[:space:]]+"),
      address = get_meta(txt, "Address:[[:space:]]+"),
      proposed_use = get_meta(txt, "Proposed Use:[[:space:]]+"),
      area = get_meta(txt, "Area:[[:space:]]+"),
      owner = get_meta(txt, "Owner/Agent:[[:space:]]+")
    )
  }
)

# Combine the per-report data frames row-wise and save them.
tbl <- do.call(rbind, meta)
write.csv(tbl, "meta.csv", row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment