Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Created January 18, 2024 07:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save benmarwick/44959d24ef6e53c86d3bd8aed83dffd5 to your computer and use it in GitHub Desktop.
Save benmarwick/44959d24ef6e53c86d3bd8aed83dffd5 to your computer and use it in GitHub Desktop.
Archaeology job ads from Archaeology Jobs Wiki
library(tidyverse)
# Common prefix of the yearly wiki pages; an academic-year label such as
# "2010-2011" is appended to it to address one year's page
base_url <- "http://academicjobs.wikia.com/wiki/Archaeology_Jobs_"
# Year labels used here run from "2010-2011" to "2019-2020".
# The wiki itself seems to start earlier (2007-8), under a different page
# name: https://academicjobs.fandom.com/wiki/Archaeology_07-08
years <- paste0(2010:2019, "-", 2011:2020)
# One page address per academic year
urls_for_each_year <- str_glue("{base_url}{years}")
library(rvest)
# Exploration of single years: tenure but not non-tenure
#------------------------------------------
# 2010-2011 has no table, so only its section headlines are listed
urls_for_each_year[1] %>%
  read_html() %>%
  # html_nodes(".mw-content-text") %>%
  html_nodes(".mw-headline") %>%
  html_text()

#------------------------------------------
# 2011-2012 is the first year in which a table appears
urls_for_each_year[2] %>%
  read_html() %>%
  html_node("table , td") %>%
  html_table()

# ... but its headings are not systematic
urls_for_each_year[2] %>%
  read_html() %>%
  # html_nodes(".mw-content-text") %>%
  html_nodes(".mw-headline") %>%
  html_text()

#------------------------------------------
# 2012-2013: same two checks as for the previous year, the table first ...
urls_for_each_year[3] %>%
  read_html() %>%
  html_node("table , td") %>%
  html_table()

# ... and then the section headlines
urls_for_each_year[3] %>%
  read_html() %>%
  # html_nodes(".mw-content-text") %>%
  html_nodes(".mw-headline") %>%
  html_text()
#-------------------------------------
# All years: download every yearly page and keep the text of its
# ".mw-headline" elements, one list entry per URL
urls_for_each_year_headers <-
  map(
    urls_for_each_year,
    function(url) {
      page <- read_html(url)
      html_text(html_nodes(page, ".mw-headline"))
    }
  )
# Headings that delimit the sections of a yearly page. The wording varies
# from year to year, so every variant seen is listed.

# Headings under which tenure-track positions are listed
tt_sections <- c(
  "TENURE-TRACK POSITIONS",
  "TENURE-TRACK OR TENURED / FULL-TIME POSITIONS",
  "Tenure-Track or Tenured / Full-time Position",
  "ASSISTANT PROFESSOR OR OPEN RANK",
  "TENURE TRACK ASSISTANT PROFESSOR OR OPEN RANK",
  "TENURED ASSOCIATE OR FULL PROFESSOR",
  "ASSOCIATE OR FULL PROFESSOR"
)

# Headings under which non-tenure-track positions are listed
non_tt_sections <- c(
  "NON-TENURE-TRACK POSITIONS",
  "VISITING POSITIONS / Limited-Term Appointments / Postdocs",
  "Visiting Positions / Limited-Term Appointments / Postdocs",
  "VISITING POSITIONS / LIMITED TERM APPOINTMENTS / POSTDOCS",
  "VISITING POSITIONS / LIMITED-TERM APPOINTMENTS / POSTDOCS / PART-TIME POSITIONS",
  "VISITING POSITIONS"
)

# Headings taken as the end of the non-tenure-track listing
end_non_tt_sections <- c(
  "DISCUSSION, RUMORS AND SPECULATION",
  "DISCUSSION, RUMORS, SPECULATION",
  "General Discussion, Rumors, and Speculation"
)

# Open question: the "COMPLETED SEARCHES" heading is in none of the lists
# above -- what to do about it?
library(stringr)
# Count the headings that sit inside the tenure-track section and inside the
# non-tenure-track section of one page.
#
# Arguments:
#   x           character vector of page headings, in page order
#   tt          regular-expression fragments; a heading matching any of them
#               (and none of `non_tt`) opens the tenure-track section
#   non_tt      fragments for the heading that opens the non-tenure-track
#               section, which also closes the tenure-track one
#   end_non_tt  fragments for the heading that closes the non-tenure-track
#               section
#
# Returns a list with two integers, `n_tt_jobs` and `n_non_tt_jobs`: the
# number of headings strictly between the respective opening and closing
# heading (0 for an empty section). A count is NA when one of its two
# delimiting headings is not found or when they occur in the wrong order, so
# a page with an unexpected layout no longer aborts a whole map over pages.
get_counts_tt_non_tt <- function(x,
                                 tt = tt_sections,
                                 non_tt = non_tt_sections,
                                 end_non_tt = end_non_tt_sections) {
  # TRUE for every heading that contains at least one of the patterns
  matches_any <- function(patterns) {
    grepl(paste(patterns, collapse = "|"), x)
  }

  # Number of positions strictly between `start` and `end`. Computed by
  # subtraction because `(start + 1):(end - 1)` counts downwards, and so
  # reports 2 elements, when the two headings are adjacent.
  n_between <- function(start, end) {
    if (is.na(start) || is.na(end) || end <= start) {
      return(NA_integer_)
    }
    end - start - 1L
  }

  is_non_tt <- matches_any(non_tt)
  # "TENURE-TRACK POSITIONS" is a substring of "NON-TENURE-TRACK POSITIONS",
  # so a heading that opens the non-tenure-track section must not be taken
  # for the tenure-track one as well
  is_tt <- matches_any(tt) & !is_non_tt

  # First occurrence of each delimiter; NA when there is none
  tt_start <- which(is_tt)[1]
  non_tt_start <- which(is_non_tt)[1]
  non_tt_end <- which(matches_any(end_non_tt))[1]

  list(
    n_tt_jobs = n_between(tt_start, non_tt_start),
    n_non_tt_jobs = n_between(non_tt_start, non_tt_end)
  )
}
# Line plot of the ratio of tenure-track to non-tenure-track counts for each
# yearly page, with a red reference line where the two counts are equal
map_df(urls_for_each_year_headers, get_counts_tt_non_tt) %>%
  mutate(
    ratio = n_tt_jobs / n_non_tt_jobs,
    # the last nine characters of each URL are the year label; a line break
    # after the hyphen splits the label over two lines on the axis
    year = str_replace(str_sub(urls_for_each_year, -9), "-", "-\n")
  ) %>%
  ggplot(aes(x = year, y = ratio, group = 1)) +
  geom_line() +
  geom_hline(yintercept = 1, colour = "red") +
  labs(
    y = "Ratio of tenure-track to non-tenure track\njobs in Archaeology",
    x = ""
  ) +
  theme_bw(base_size = 16)
##---------------------------------------------------
library(googlesheets4)
library(tidyverse)
# Load the job data from Google Sheets
jobdata <- read_sheet(
  "https://docs.google.com/spreadsheets/d/1Jwe3UqJyedrV-QWlwR_44__t4xBVrCfxGyhXdi3E0sg/edit#gid=1686084773"
)
# Build a function that proposes whole-number break positions, for use as
# the `breaks` argument of a continuous ggplot2 scale.
#
# Arguments:
#   n    desired number of breaks, passed on to pretty()
#   ...  further arguments passed on to pretty()
#
# Returns a function of one numeric vector `x` (for example the axis limits)
# that returns the distinct whole numbers obtained by rounding down the
# positions pretty() suggests for `x`.
integer_breaks <- function(n = 5, ...) {
  function(x) {
    # Over a narrow range pretty() suggests fractional positions (0, 0.5, 1,
    # ...); rounding them down makes several land on the same integer, so
    # each integer is kept only once.
    unique(floor(pretty(x, n, ...)))
  }
}
# For every "Documents requested ..." column, a bar chart of how often each
# requested quantity (0 to 3) occurs, one facet per column
jobdata %>%
  select(starts_with("Documents requested")) %>%
  pivot_longer(everything()) %>%
  # count() returns an ungrouped table, so no grouping leaks into later steps
  count(name, value) %>%
  # Translate the answer text into a number.
  # NOTE(review): any answer not listed here, missing answers included, is
  # counted as 0 -- confirm that this is intended.
  mutate(value = case_when(
    value == "Not requested in the job ad" ~ 0,
    value == "One" ~ 1,
    value == "Two (e.g. two syllabi)" ~ 2,
    value == "Three" ~ 3,
    .default = 0
  )) %>%
  ggplot() +
  aes(value, n) +
  geom_col() +
  facet_wrap(~ name) +
  # integer_breaks() generates break positions, so it belongs in `breaks`.
  # The former `label =` partially matched `labels`, where the function was
  # used to relabel existing breaks and only worked while it happened to
  # return as many values as there were breaks.
  scale_x_continuous(
    breaks = integer_breaks(),
    limits = c(-1, 4)
  )
# Do the requirements differ for associate positions?
# Box plots of the requested quantity by position rank, one facet per
# "Documents requested ..." column
jobdata %>%
  # Rank from the title; "Associate" is tested first, and titles naming
  # neither rank get NA
  mutate(
    position_title = case_when(
      str_detect(`Title of position`, "Associate") ~ "Associate",
      str_detect(`Title of position`, "Assistant") ~ "Assistant"
    )
  ) %>%
  # keep only the ads whose rank could be identified
  filter(!is.na(position_title)) %>%
  select(position_title, starts_with("Documents requested")) %>%
  pivot_longer(-position_title) %>%
  # answer text to number; anything not listed becomes 0
  mutate(
    value = case_when(
      value == "Not requested in the job ad" ~ 0,
      value == "One" ~ 1,
      value == "Two (e.g. two syllabi)" ~ 2,
      value == "Three" ~ 3,
      .default = 0
    )
  ) %>%
  ggplot() +
  aes(x = position_title, y = value) +
  geom_boxplot() +
  facet_wrap(~ name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment