Skip to content

Instantly share code, notes, and snippets.

@khakieconomics
Created February 17, 2017 20:40
Show Gist options
  • Save khakieconomics/f54e430f3231ca9af338eced6a38fc3c to your computer and use it in GitHub Desktop.
Save khakieconomics/f54e430f3231ca9af338eced6a38fc3c to your computer and use it in GitHub Desktop.
Scrapes, saves and plots the salary rates for LCA applications for a few quantitative fields in NY state.
library(tidyr); library(ggplot2)
library(rvest); library(dplyr); library(ggthemes)
# Scrape 2016 LCA wage offers for NY state from visadoor.com, one search per
# job title. For each title the cleaned table is written to "<title>.csv" in
# the working directory and a cumulative-distribution plot of the wages is
# printed, split into "Senior" and "Regular" job titles.
session <- html_session("http://visadoor.com/")
# The search form is taken to be the first form on the landing page.
form <- html_form(session)[[1]]

# Add job titles to the character list below
jobs <- c(
  "data scientist", "economist", "actuary", "consultant",
  "management consultant", "statistician"
)

for (j in jobs) {
  form <- set_values(form, job = j, state = "NY", year = "2016")
  query <- submit_form(form = form, session = session)
  # Re-open the results URL and parse the first <table> on that page.
  session2 <- html_session(query$url)
  output <- session2 %>%
    html_node(css = "table") %>%
    html_table(header = TRUE) %>%
    # A "<low> to <high>" wage range becomes one row per endpoint.
    mutate(wage = strsplit(`Wage Offer`, " to ")) %>%
    unnest(wage) %>%
    mutate(wage = readr::parse_number(wage)) %>%
    mutate(
      # Case-sensitive match on the job title text.
      senior = grepl(`Job Title`, pattern = "SENIOR") |
        grepl(`Job Title`, pattern = "LEAD"),
      senior = ifelse(senior, "Senior", "Regular"),
      hourly = grepl(`Wage Offer`, pattern = "Hour")
    ) %>%
    # Drop offers whose wage text mentions "Hour" and non-positive wages.
    filter(!hourly & wage > 0)

  write.csv(output, file = paste0(j, ".csv"))

  # An empty table would make facet_grid() fail and abort the remaining job
  # titles, so skip the plot (the CSV is still written) and carry on.
  if (nrow(output) == 0) {
    message("No usable salary records for '", j, "'; skipping plot.")
    next
  }

  salary_plot <- output %>%
    ggplot(aes(x = wage / 1000)) +
    stat_ecdf() +
    facet_grid(senior ~ .) +
    ggtitle(
      paste0("Cumulative distribution of LCA salaries for\n", j, "s in NY state")
    ) +
    theme_fivethirtyeight() +
    xlab("Annual salary (k USD)") +
    ylab("Cumulative proportion")
  # Inside a for loop ggplot objects are not auto-printed, so print explicitly.
  print(salary_plot)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment