Skip to content

Instantly share code, notes, and snippets.

@shilohfling
Created June 25, 2019 16:29
Show Gist options
  • Save shilohfling/053e7873fecc0a40326fd1723976d1fe to your computer and use it in GitHub Desktop.
Save shilohfling/053e7873fecc0a40326fd1723976d1fe to your computer and use it in GitHub Desktop.
An RScript for webscraping the names and associated programming language of each Python, R, and SQL course available on DataCamp.
##################################################
# An R script for web scraping the name,         #
# description, and associated programming        #
# language of each Python, R, and SQL course     #
# available on DataCamp.                         #
##################################################
# Last updated: March 21, 2019 #
##################################################
## Load packages -----
library(dplyr)
library(tidyr)
## Functions -----
#' Scrape course titles and descriptions from a course listing page.
#'
#' Reads the page line by line, locates every line containing an
#' `<h4 class="course-block__title">` tag, strips the markup from that line to
#' obtain the course title, and takes the line two below it as the description.
#'
#' @param my_url URL (or local file path) of the listing page; it is handed
#'   to `file()` unchanged.
#' @return A data frame with character columns `Course` and `Description`,
#'   one row per matched title line. When no title line is found the result
#'   has zero rows but still carries both columns.
scrape_datacamp_courses <- function(my_url) {
  my_con <- file(my_url, "r")
  # Register the close first so the connection is released even if reading
  # fails part-way through.
  on.exit(close(my_con), add = TRUE)
  my_html <- readLines(my_con, -1)

  h4_loc <- grep("[<]h4 class[=]\"course[-]block[_][_]title\"[>]", my_html)

  # gsub() and trimws() are vectorized, so the matched lines are cleaned in
  # one pass; with no matches they yield character(0) rather than NULL, which
  # keeps the column layout of the result intact.
  title_lines <- gsub("&nbsp;", "", my_html[h4_loc])
  courses <- trimws(gsub("<.*?>", "", title_lines), which = "both")

  # The description is taken from the line two below each title; an index past
  # the end of the page produces NA for that row.
  descr <- trimws(my_html[h4_loc + 2], which = "both")

  data.frame(Course = courses, Description = descr, stringsAsFactors = FALSE)
}
## Scrape each listing and tag it -----
# Every technology page is scraped on its own and labelled with its language
# in a `Type` column.
py <- scrape_datacamp_courses("https://www.datacamp.com/courses/tech:python")
py[["Type"]] <- "Python"

r <- scrape_datacamp_courses("https://www.datacamp.com/courses/tech:r")
r[["Type"]] <- "R"

sql <- scrape_datacamp_courses("https://www.datacamp.com/courses/tech:sql")
sql[["Type"]] <- "SQL"

## Combine the listings -----
# Set union of the three tables: Python with R first, then SQL.
df <- union(union(py, r), sql)

## Write the result -----
# The output file name embeds the current date from Sys.Date().
write.csv(df, file = paste0("data_camp_courses_", Sys.Date(), ".csv"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment