Skip to content

Instantly share code, notes, and snippets.

@mjhendrickson
Last active October 17, 2018 15:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mjhendrickson/f8c165497be6cbec0dc0bcd1667676f9 to your computer and use it in GitHub Desktop.
Save mjhendrickson/f8c165497be6cbec0dc0bcd1667676f9 to your computer and use it in GitHub Desktop.
# ----- Set up environment -----
library(tidyverse)
library(rvest)
# ===== Scrape Transcript =====
## URL of the episode 1 transcript page
ep01_html <- "https://www.datacamp.com/community/blog/data-science-past-present-future"
## Read the page at that URL into a parsed HTML document
ep01_webpage <- ep01_html %>% read_html()
## Select one node set per tag of interest from the parsed page
## <head>: document header
ep01_head <- ep01_webpage %>% html_nodes("head")
## <title>: page title
ep01_title <- ep01_webpage %>% html_nodes("title")
## <h1>: level 1 headings
ep01_h1 <- ep01_webpage %>% html_nodes("h1")
## <h2>: level 2 headings
ep01_h2 <- ep01_webpage %>% html_nodes("h2")
## <h3>: level 3 headings
ep01_h3 <- ep01_webpage %>% html_nodes("h3")
## <h4>: level 4 headings -- the author noted no records for this tag
ep01_h4 <- ep01_webpage %>% html_nodes("h4")
## <h5>: level 5 headings
ep01_h5 <- ep01_webpage %>% html_nodes("h5")
## <div>: generic containers
ep01_div <- ep01_webpage %>% html_nodes("div")
## <p>: paragraphs (the transcript text is taken from these below)
ep01_p <- ep01_webpage %>% html_nodes("p")
# ===== Clean Data =====
## Drops the first 2 intro paragraphs, which aren't part of the discussion,
## and keeps nothing past paragraph 116 (the original hard-coded cut-off).
## head() caps the range at the number of paragraphs actually scraped, so a
## page with fewer than 116 paragraphs cannot pad the node set with missing
## entries the way ep01_p[3:116] would.
ep01_p <- head(ep01_p, 116)[-(1:2)]
## Prints the remaining paragraph nodes for review
list(ep01_p)
## Speaker name = everything from the start of a paragraph's text up to its
## first ':' on the first line; NA when there is no such colon.
## Works on the rendered text (html_text) instead of the serialized node, so
## tags never leak into the name, and stops at the FIRST colon -- the previous
## greedy pattern ran on to the last colon in the paragraph and swallowed
## speech text along with the name.
## NOTE(review): a paragraph that has no speaker prefix but does contain a
## colon on its first line still yields a (bogus) value -- inspect the output.
ep01_speaker <- str_extract(
  html_text(ep01_p, trim = TRUE),
  "^[^:\\n]+(?=:)"
)
## Prints the extracted speaker names for review
list(ep01_speaker)
# ===== Next - Paragraph Nodes as a Data Frame =====
## Builds one row per <p> node: its text plus any HTML attributes it carries.
## The earlier attribute-only version failed for "p" while working for other
## elements such as "div", "h1", and "title" -- presumably because the <p>
## tags on this page carry no attributes, leaving nothing to bind into rows
## (TODO confirm against the live page). Always including the text column
## gives every paragraph a named, non-empty row; attributes that only some
## paragraphs have are filled with NA for the others.
## html_attrs() is rvest's own accessor, so this does not depend on xml2
## being attached.
ep01_nodes <-
  html_nodes(ep01_webpage, "p") %>%
  map_df(~ c(list(text = html_text(.x)), as.list(html_attrs(.x))))
## Prints the resulting data frame for review
list(ep01_nodes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment