Skip to content

Instantly share code, notes, and snippets.

@maurolepore
Created March 9, 2022 12:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maurolepore/5d9244aeb652aa6e648deb61826568f5 to your computer and use it in GitHub Desktop.
Save maurolepore/5d9244aeb652aa6e648deb61826568f5 to your computer and use it in GitHub Desktop.
library(dplyr)
library(rvest)
library(chromote) # remotes::install_github("rstudio/chromote")
library(countrycode)
library(readr)
# ------------------------------------------------------------------------------
em_url <- "https://www.europages.co.uk/DAZUN-GMBH/00000005340662-639906001.html"
# Start a headless browser and capture the DOM as HTML after JavaScript runs ----
b <- chromote::ChromoteSession$new()
html <- tryCatch(
  {
    # Register for the load event *before* navigating. Calling navigate() first
    # and loadEventFired() afterwards can miss the event if the page finishes
    # loading in between, which would leave the script waiting on an event that
    # has already happened.
    page_loaded <- b$Page$loadEventFired(wait_ = FALSE)
    b$Page$navigate(em_url, wait_ = FALSE)
    b$wait_for(page_loaded) # block until the page is loaded
    # Serialise the live DOM (i.e. after client-side scripts have run).
    b$Runtime$evaluate("document.documentElement.outerHTML")$result$value
  },
  # Always release the browser session, even when a step above fails.
  finally = b$close()
)
# Cache the rendered page on disk so the parsing below can be re-run without
# launching the browser again.
readr::write_file(html, "temp.html")
# -------------------------------------------------------------------------
# Load the cached page and parse it once; every query below reuses `page`
# instead of re-parsing the same HTML string.
html <- readr::read_file("temp.html")
page <- read_html(html)

# Text of every <li> found under an element with class `pl-0`.
keywords <- page %>%
  html_elements(".pl-0 li") %>%
  html_text2()

# Text of every <figcaption> inside the first element with class `pa-0`.
products <- page %>%
  html_element(".pa-0") %>%
  html_elements("figcaption") %>%
  html_text2()

# Text of the first `.text-body-1` node inside the headcount details element.
headcount <- page %>%
  html_element(".ep-epages-business-details-headcount .text-body-1") %>%
  html_text2()
# -------------------------------------------------------------------------
# Inspect how the scraped `headcount` string is encoded. Each bare expression
# is meant to be evaluated interactively so its value is printed.
# NOTE(review): presumably the scraped text contains non-ASCII characters that
# motivate these checks -- confirm against the actual page content.

# Declared encoding and raw bytes, as scraped and after forcing UTF-8.
headcount %>% Encoding()
headcount %>% charToRaw()
headcount %>% enc2utf8() %>% charToRaw()

# Two ways of getting to ASCII: stringi's ASCII conversion versus its
# "latin-ascii" transliteration.
headcount %>% stringi::stri_enc_toascii()
fixed <- headcount %>% stringi::stri_trans_general("latin-ascii")
fixed

# Raw bytes of the transliterated result.
fixed %>% charToRaw()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment