Created
January 18, 2024 07:57
-
-
Save benmarwick/44959d24ef6e53c86d3bd8aed83dffd5 to your computer and use it in GitHub Desktop.
Archaeology job ads from Archaeology Jobs Wiki
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse)
library(rvest)

# Common prefix of the per-year Archaeology Jobs wiki pages; the academic-year
# label (e.g. "2010-2011") is appended to it below.
base_url <- "http://academicjobs.wikia.com/wiki/Archaeology_Jobs_"

# starts at 2010-2011
# though it seems to start at 2007-8: https://academicjobs.fandom.com/wiki/Archaeology_07-08
years <- map_chr(2010:2019, ~ str_glue("{.x}-{.x + 1}"))

# One URL per academic year, in chronological order.
urls_for_each_year <- str_glue("{base_url}{years}")
# tenure but not non-tenure
#------------------------------------------
# 2010-2011 has no table, so look at the section headlines of the page
urls_for_each_year[1] %>%
  read_html() %>%
  # html_nodes('.mw-content-text') %>%
  html_nodes(".mw-headline") %>%
  html_text()
#------------------------------------------
# table first appears in 2011-2012
urls_for_each_year[2] %>%
  read_html() %>%
  html_node("table , td") %>%
  html_table()

# but headings are not systematic
urls_for_each_year[2] %>%
  read_html() %>%
  # html_nodes('.mw-content-text') %>%
  html_nodes(".mw-headline") %>%
  html_text()
#------------------------------------------
# 2012-2013: same two checks as for 2011-2012 (first table, then headlines)
urls_for_each_year[3] %>%
  read_html() %>%
  html_node("table , td") %>%
  html_table()

urls_for_each_year[3] %>%
  read_html() %>%
  # html_nodes('.mw-content-text') %>%
  html_nodes(".mw-headline") %>%
  html_text()
#-------------------------------------
# all years
# Download every year's page and keep the text of its `.mw-headline` nodes.
# The result is a list with one character vector per element of
# `urls_for_each_year`, in the same order.
urls_for_each_year_headers <-
  map(
    urls_for_each_year,
    function(url) {
      url %>%
        read_html() %>%
        html_nodes(".mw-headline") %>%
        html_text()
    }
  )
# what are the different sections?
# Each vector lists the spellings one section heading takes across years.
# They are pasted together with "|" and used as a regular expression in
# get_counts_tt_non_tt(), so a name also matches any headline that merely
# contains it (e.g. "TENURE-TRACK POSITIONS" is a substring of
# "NON-TENURE-TRACK POSITIONS").

# Heading that opens the tenure-track listings
tt_sections <- c(
  "TENURE-TRACK POSITIONS",
  "TENURE-TRACK OR TENURED / FULL-TIME POSITIONS",
  "Tenure-Track or Tenured / Full-time Position",
  "ASSISTANT PROFESSOR OR OPEN RANK",
  "TENURE TRACK ASSISTANT PROFESSOR OR OPEN RANK",
  "TENURED ASSOCIATE OR FULL PROFESSOR",
  "ASSOCIATE OR FULL PROFESSOR"
)

# Heading that opens the non-tenure-track listings
non_tt_sections <- c(
  "NON-TENURE-TRACK POSITIONS",
  "VISITING POSITIONS / Limited-Term Appointments / Postdocs",
  "Visiting Positions / Limited-Term Appointments / Postdocs",
  "VISITING POSITIONS / LIMITED TERM APPOINTMENTS / POSTDOCS",
  "VISITING POSITIONS / LIMITED-TERM APPOINTMENTS / POSTDOCS / PART-TIME POSITIONS",
  "VISITING POSITIONS"
)

# Heading that follows (and therefore closes) the non-tenure-track listings
end_non_tt_sections <- c(
  "DISCUSSION, RUMORS AND SPECULATION",
  "DISCUSSION, RUMORS, SPECULATION",
  "General Discussion, Rumors, and Speculation"
)

# what to do about this?
# "COMPLETED SEARCHES"

library(stringr)
#' Count tenure-track and non-tenure-track job headlines on one wiki page
#'
#' Relies on the file-level vectors `tt_sections`, `non_tt_sections` and
#' `end_non_tt_sections` to recognise the three section headings.
#'
#' @param x Character vector of headlines from one year's page, in page
#'   order (one element of `urls_for_each_year_headers`).
#' @return A list with two integer elements: `n_tt_jobs`, the number of
#'   headlines strictly between the tenure-track heading and the
#'   non-tenure-track heading, and `n_non_tt_jobs`, the number strictly
#'   between the non-tenure-track heading and the discussion heading.
#'   An element is `NA` when one of its bounding headings is not found or
#'   the headings are not in the expected order.
get_counts_tt_non_tt <- function(x) {
  # x <- urls_for_each_year_headers[[9]]

  # Index of the first headline matching any of the given section names.
  # The names are joined into a single regex alternation; the result is NA
  # when no headline matches.
  first_match <- function(section_names) {
    which(str_detect(x, paste(section_names, collapse = "|")))[1]
  }

  # Number of headlines strictly between two positions. Done with
  # arithmetic rather than `(from + 1):(to - 1)`, because that sequence runs
  # backwards when the headings are adjacent (reporting 2 instead of 0) and
  # raises an error when either bound is NA.
  n_between <- function(from, to) {
    if (is.na(from) || is.na(to) || to <= from) {
      return(NA_integer_)
    }
    to - from - 1L
  }

  tt_start <- first_match(tt_sections)
  non_tt_start <- first_match(non_tt_sections)
  non_tt_end <- first_match(end_non_tt_sections)

  list(
    n_tt_jobs = n_between(tt_start, non_tt_start),
    n_non_tt_jobs = n_between(non_tt_start, non_tt_end)
  )
}
# Plot the ratio of tenure-track to non-tenure-track job counts for each
# academic year; the red horizontal line marks a ratio of 1.
map_df(urls_for_each_year_headers, get_counts_tt_non_tt) %>%
  mutate(
    ratio = n_tt_jobs / n_non_tt_jobs,
    # The last 9 characters of each URL are the "YYYY-YYYY" label; a line
    # break is inserted after the hyphen so the axis labels wrap.
    year = str_replace(str_sub(urls_for_each_year, -9), "-", "-\n")
  ) %>%
  # `year` is a character column, so group = 1 is needed for geom_line() to
  # connect the points into a single line.
  ggplot(aes(x = year, y = ratio, group = 1)) +
  geom_line() +
  geom_hline(yintercept = 1, colour = "red") +
  labs(
    y = "Ratio of tenure-track to non-tenure track\njobs in Archaeology",
    x = ""
  ) +
  theme_bw(base_size = 16)
##---------------------------------------------------
library(googlesheets4)
library(tidyverse)

# Read the job-ad spreadsheet from Google Sheets into a data frame.
jobdata <- read_sheet("https://docs.google.com/spreadsheets/d/1Jwe3UqJyedrV-QWlwR_44__t4xBVrCfxGyhXdi3E0sg/edit#gid=1686084773")
#' Build a break function that yields whole-number axis breaks
#'
#' @param n Desired number of breaks, passed to `pretty()`.
#' @param ... Further arguments passed on to `pretty()`.
#' @return A function of one argument `x` (a numeric vector, e.g. the axis
#'   limits) that returns the distinct values of `floor(pretty(x, n, ...))`.
integer_breaks <- function(n = 5, ...) {
  function(x) {
    # pretty() may step in fractions (0, 0.2, 0.4, ...); flooring those
    # gives repeated values, so keep each whole number only once.
    unique(floor(pretty(x, n, ...)))
  }
}
# Bar chart, one facet per "Documents requested..." column, of how many rows
# of `jobdata` gave each answer (answers recoded to numbers).
jobdata %>%
  select(starts_with("Documents requested")) %>%
  pivot_longer(everything()) %>%
  # Recode before counting so each facet has a single row per numeric value.
  # Note that `.default = 0` sends every other answer, including missing
  # ones, to 0 alongside "Not requested in the job ad".
  mutate(value = case_when(
    value == "Not requested in the job ad" ~ 0,
    value == "One" ~ 1,
    value == "Two (e.g. two syllabi)" ~ 2,
    value == "Three" ~ 3,
    .default = 0
  )) %>%
  count(name, value) %>%
  ggplot(aes(x = value, y = n)) +
  geom_col() +
  facet_wrap(~ name) +
  # integer_breaks() computes break positions, so it belongs in `breaks`,
  # not `labels`.
  scale_x_continuous(
    breaks = integer_breaks(),
    limits = c(-1, 4)
  )
# do the requirements differ for associate positions
# Boxplots of the recoded answers, split by position title, one facet per
# "Documents requested..." column.
jobdata %>%
  # "Associate" is tested first, so a title containing both words is
  # labelled "Associate"; titles containing neither become NA and are
  # dropped by the filter() below.
  mutate(position_title = case_when(
    str_detect(`Title of position`, "Associate") ~ "Associate",
    str_detect(`Title of position`, "Assistant") ~ "Assistant"
  )) %>%
  select(
    position_title,
    starts_with("Documents requested")
  ) %>%
  pivot_longer(-position_title) %>%
  # `.default = 0` sends every other answer, including missing ones, to 0.
  mutate(value = case_when(
    value == "Not requested in the job ad" ~ 0,
    value == "One" ~ 1,
    value == "Two (e.g. two syllabi)" ~ 2,
    value == "Three" ~ 3,
    .default = 0
  )) %>%
  filter(!is.na(position_title)) %>%
  ggplot(aes(x = position_title, y = value)) +
  geom_boxplot() +
  facet_wrap(~ name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment