Skip to content

Instantly share code, notes, and snippets.

@erikgregorywebb
Created October 30, 2020 04:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erikgregorywebb/fd7726b269079aa5872cd7f1e5b39de9 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(rvest)
# Compile list of job listing URLs ----
# Walk the first 20 pages of Apple's US job search and collect the relative
# href of each listing row (CSS class .table-col-1).
listing_urls <- c()
for (i in 1:20) {
  search_url <- paste0(
    'https://jobs.apple.com/en-us/search?location=united-states-USA&page=', i
  )
  print(search_url)
  page <- read_html(search_url)
  listings <- page %>% html_nodes('.table-col-1')
  # seq_along() is safe when a page unexpectedly yields zero listings
  # (1:length(listings) would iterate over c(1, 0) and error).
  for (j in seq_along(listings)) {
    listing_url <- listings[j] %>% html_node('a') %>% html_attr('href')
    print(listing_url)
    listing_urls <- c(listing_urls, listing_url)
  }
}
# Prefix the site root to turn the relative hrefs into absolute URLs.
listing_urls <- paste0('https://jobs.apple.com', listing_urls)
# BUG FIX: the original `length(listing_urls) = length(unique(listing_urls))`
# only TRUNCATED the vector to the number of unique values — it kept
# duplicates near the front and dropped valid URLs from the tail. unique()
# actually de-duplicates while preserving first-seen order.
listing_urls <- unique(listing_urls)
# Scrape job details (title, location, team) from each listing URL ----
# Preallocate the result list — growing a list inside a loop copies it on
# every iteration.
datalist <- vector("list", length(listing_urls))
for (i in seq_along(listing_urls)) {
  Sys.sleep(1)  # polite delay between requests so we don't hammer the server
  page <- read_html(listing_urls[i])
  print(listing_urls[i])
  # html_node() returns NA-yielding results for missing nodes, so a listing
  # without one of these fields produces NA rather than an error.
  job_title <- page %>% html_node('#jdPostingTitle') %>% html_text()
  job_location <- page %>% html_node('#job-location-name') %>% html_text()
  job_team <- page %>% html_node('#job-team-name') %>% html_text()
  cat(paste(job_title, job_location, job_team, sep = '\n'))
  datalist[[i]] <- tibble(
    job_title = job_title,
    job_location = job_location,
    job_team = job_team,
    job_url = listing_urls[i]
  )
}
# bind_rows() is the idiomatic (and faster) way to combine a list of tibbles;
# do.call(rbind, ...) on tibbles can mangle attributes and is O(n^2)-ish.
raw <- bind_rows(datalist)
# Clean and summarize ----
# Stamp every row with the date the scrape ran so snapshots can be compared.
aapl_jobs <- raw %>%
  mutate(scrape_date = Sys.Date())

# count(col, sort = TRUE) replaces group_by() %>% count(), which left the
# result grouped (a common forget-to-ungroup footgun). Use TRUE, never the
# reassignable alias T.
aapl_jobs %>% count(job_location, sort = TRUE)  # top locations
aapl_jobs %>% count(job_team, sort = TRUE)      # top teams
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment