benmarwick/scraping-pnas-titles.R

## gistfile1.R

# JB says she searched for "archaeolog*" and "archeolog*", these return 257 results for me,
# much less than the 1002 we get from searching the archaeology 'keyword', e.g.
# https://www.pnas.org/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest

# in any case, let's start with "archaeolog*" and "archeolog*", I've copied the URL of the search
# results page and edited the URL to return 500 items on the first page, so we can get all results without
# having to scrape multiple pages of results, just to simplify the process

library(tidyverse)
library(rvest)

search_url <- "https://www.pnas.org/action/doSearch?field1=AllField&text1=archaeolog*+&field2=AllField&text2=archeolog*&field3=AllField&text3=&publication=&Ppub=&access=on&pageSize=500&startPage=0"

# let's get the URLs to each paper listed on that search results page so we can get abstracts, etc.
# this takes a few seconds

search_results_urls_df <-
search_url %>%
  read_html() %>%
  # extract content from elements on the page that contain the article title link
  # I used the Selector Gadget Chrome Extension to identify these elements
  html_nodes(".items-results .animation-underline") %>%
  # convert to human-readable text
  html_attr("href") %>%
  # construct full URL from these results
  str_c("http://pnas.org", .)

# and now for each link to a paper, we get the content type, title, abstract,
# subject and date of that paper.
# This takes several minutes, maybe more than 10?

search_results_abstract_title_date_subject <-
map(search_results_urls_df,
    # for each, url, do this:
    ~.x %>%
      read_html() %>%
      html_nodes(".core-date-published span , h1 , .core-container > div:nth-child(1)") %>%
      # convert to human-readable text
      html_text()
      )

# clean and tidy the titles, dates and abstracts into a nice tibble
search_results_abstract_title_date_subject_tidy_df <-
map_df(search_results_abstract_title_date_subject,
    ~tibble(article_type = .x[1],
            title = .x[2],
            date = .x[3],
            abstract = .x[4])) %>%
  mutate(article_type = str_squish(str_remove(article_type, " Share on"))) %>%
  # extract year from data and convert from character to numeric
  mutate(year = parse_number(str_sub(date,  -4))) %>%
  filter(!article_type %in% c("QnAs", "This Week in PNAS")) %>%
  filter(str_detect(article_type, "Research Article")) %>%
  separate(article_type,
           into = c("article_type", "pnas_subject"),
           sep = 16)


library(quanteda)

# explore species studied for domestication, e.g. inspired by this plot
# https://twitter.com/JBeckArch/status/1510346774283698177

# convert abstract text into document feature matrix and clean the text

# get only papers about domestication
papers_about_domestication <-
search_results_abstract_title_date_subject_tidy_df %>%
  filter(str_detect(abstract, "domestica"))

# do text analysis of the abstracts to count how many times certain words appear
# we can also track word frequency over time with this output

my_dfm <-
# convert free text of abstract to corpus
corpus(papers_about_domestication$abstract,
       docvars = papers_about_domestication$year) %>%
  # convert to tokens and clean
  tokens(remove_numbers = TRUE,
         remove_symbols = TRUE,
        remove_punct = TRUE) %>%
  # convert to document frame matrix
  dfm(tolower = TRUE,
      remove_padding = TRUE) %>%
  # remove stopwords
  dfm_select(pattern = stopwords("en"),
             selection = "remove") %>%
  # keep only words of interest here
  dfm_select(pattern = c("dog", "dogs",
                         "maize", "wheat",
                         "horse", "horses",
                         "rice", "squash",
                         "cattle", "cows", "cow"),
             selection = "keep")

# tidy up and plot
my_dfm %>%
  convert(to = "data.frame") %>%
  cbind(docvars(my_dfm)) %>%
  mutate(horse = horse + horses) %>%
  select(-horses,
         - doc_id,
         year = docvars) %>%
  pivot_longer(-year,
               names_to = "thing",
               values_to = "n") %>%
  group_by(thing) %>%
  summarise(n = sum(n)) %>%
  ggplot() +
  aes(reorder(thing, -n), n) +
  geom_col() +
  theme_minimal() +
  xlab("Domesticate")

## scraping-pnas-titles.R
library(tidyverse)
library(rvest)

# base URL for a single issue
url_pnas_toc <- "https://www.pnas.org/toc/pnas/"

# base URL for the list of issues
url_pnas_loi <- "https://www.pnas.org/loi/pnas/group"

this_year <- format(Sys.time(), "%Y")

# make a vector of year ranges to construct a URL to get all the issues pages
url_issues_pages <-
  map_chr(seq(1910, this_year, 10),
          ~paste0(url_pnas_loi, "/d", .x, ".", "y", ifelse(.x+9 < 2022, .x+9, 2022)))

url_issues_pages <-
  map_chr(1910:this_year,
          # first round the number down to the nearest decade and then
          # put each year at the end
          ~paste0(url_pnas_loi, "/d", .x%/%10*10, ".", "y", .x))

# get the month, vol and issue numbers, subset to get only items with Months in them,
# these are the issue details. This takes several minutes to complete
urls_for_specific_issues <-
  map(url_issues_pages,
      ~.x %>%
        read_html() %>%
        html_nodes(".flex-column") %>%
        html_text() %>%
        # subset to keep items that have a month name in them
        str_subset(paste0(month.name, collapse = "|")) %>%
        # extract only vol and issue numbers
        str_extract("Vol.*") %>%
        str_replace_all("\\|", "/") %>%
        str_remove_all("Vol. | No. | ") %>%
        str_c(url_pnas_toc, .))

# organise so we have the year with the vol and issue
# so we can analyse change over time

names(urls_for_specific_issues) <- str_sub(url_issues_pages,-4,-1)

# reshape into data frame
urls_for_specific_issues_df <-
  tibble(urls_for_specific_issues = urls_for_specific_issues,
         year = names(urls_for_specific_issues)) %>%
  unnest_longer(urls_for_specific_issues)

# ---------------------------------------------------------------
# This next step takes several hours to complete, if we've already
# done it recently, skip and read in the CSV we previously saved

# for each specific issue, get the text of the titles and
# sections in that issue.
titles_in_each_issue <-
  urls_for_specific_issues_df %>%
  mutate(titles_in_each_issue =
           map(urls_for_specific_issues,
               ~.x  %>%
                 read_html() %>%
                 html_nodes(".animation-underline , .mb-4") %>%
                 html_text()
           ))

# ---------------------------------------------------------------
#  Save the output so we don't have to do that again. Because this is
# data frame with nested tibbles, we cannot use CSV but need a binary
# format

saveRDS(titles_in_each_issue, "pnas_titles_in_each_issue.rds")

# read in the CSV we previously saved, if we have one
f <- "pnas_titles_in_each_issue.rds"
if (file.exists(f))
  titles_in_each_issue <- readRDS(f)

# unnest so we can see all the titles and remove some irrelevant materials
titles_in_each_issue_df <-
  titles_in_each_issue %>%
  unnest(titles_in_each_issue) %>%
  mutate(titles_in_each_issue = str_to_lower(titles_in_each_issue)) %>%
  filter(!titles_in_each_issue %in% str_to_lower( c("Enter text / DOI / keywords / authors / etc",
                                      "Enter words / phrases / DOI / ISBN / keywords / authors / etc",
                                      "Current Issue",
                                      "Journal_Article",
                                      "PNAS Nexus",
                                      "Special Features",
                                      "Colloquia" ,
                                      "List of Issues",
                                      "Collected Papers",
                                      "Front Matter",
                                      "Journal Club",
                                      "Podcasts",
                                      "About",
                                      "Diversity and Inclusion",
                                      "Authors",
                                      "Reviewers",
                                      "Subscribers",
                                      "Librarians",
                                      "Press",
                                      "Cozzarelli Prize",
                                      "PNAS Updates",
                                      "This Week in PNAS",
                                      "In This Issue",
                                      "Corrections",
                                      "QnAs",
                                      "Inner Workings",
                                      "Retrospective",
                                      "Opinion",
                                      "Profile",
                                      "Addendum",
                                      "Letters",
                                      "Letters (Online Only)",
                                      "Letter (Online Only)",
                                      "News Feature",
                                      "Errata",
                                      "Articles",
                                      "Article",
                                      "Awards of Medals",
                                      "Award of Medals",
                                      "PNAS Plus Significance Statements",
                                      "Research Article",
                                      "Research Articles",
                                      "Feature Article",
                                      "Introduction",
                                      "NATIONAL ACADEMY OF SCIENCES",
                                      "National Academy of Sciences",
                                      "National Research Council",
                                      "Brief Report",
                                      "Brief Reports",
                                      "Author Index",
                                      "Perspective",
                                      "Perspectives",
                                      "Retraction",
                                      "Retraction (Online Only)",
                                      "Commentaries",
                                      "Profiles",
                                      "Editorial",
                                      "Editorials",
                                      "Commentary",
                                      "Review",
                                      "Reviews",
                                      "nobel lecture",
                                      "n.a.s. symposium",
                                      "inaugural article"
  ))) %>%
  filter(!str_detect(titles_in_each_issue,
                     str_to_lower("Editorial|Editor|Report of the|Extracts from the Minutes|Minutes of|Corrections|Correction|Correction for|Colloquium|PNAS"))) %>%
  # drop blank titles
  filter(titles_in_each_issue != "") %>%
  mutate(titles_in_each_issue = str_to_lower( str_squish(titles_in_each_issue)) ) %>%
  distinct()

# get the subject section headings, which are still in the data frame
# from browsing these, I think ones that occur more than ten times are real
# section headers and not articles.
titles_in_each_issue_df_section_headers <-
  titles_in_each_issue_df %>%
  mutate(word_count = str_count(titles_in_each_issue, pattern = '\\w+')) %>%
  filter(word_count %in% 1:4) %>%
  count(titles_in_each_issue) %>%
  arrange(desc(n)) %>%
  filter(!str_detect(titles_in_each_issue, "a |profile|on ")) %>%
  filter(n >= 2)

# put section headings in their own column and delete from the article title column
# this has to be done by year, since some years they didn't use subject headings
titles_in_each_issue_with_section_headings_df <-
  titles_in_each_issue_df %>%
  mutate(year = parse_number(year)) %>%
  mutate(section_heading = ifelse(titles_in_each_issue %in% titles_in_each_issue_df_section_headers$titles_in_each_issue,
                                  titles_in_each_issue, NA)) %>%
  # do the fill down within each year only, so that we don't run over into a year with no
  # subject headers
  nest(data = -year) %>%
  mutate(section_heading_filled = map(data, ~.x %>% fill(section_heading))) %>%
  unnest(section_heading_filled) %>%
  filter(!titles_in_each_issue %in% titles_in_each_issue_df_section_headers$titles_in_each_issue) %>%
  select(-data)  %>%
  # for now, drop some mystery years that have unusually high amounts of certain topics
   mutate(section_heading = ifelse(year %in% c(1990, 1991, 1993, 1996),
                                          NA, section_heading))

# how many papers per year?
plot_papers_per_year <-
titles_in_each_issue_with_section_headings_df %>%
  count(year) %>%
  ggplot() +
  aes(year, n) +
  geom_col() +
  theme_minimal()

# I think the section headings are not straightforward to use for tallying papers per subject

# make it interactive
plotly::ggplotly(plot_papers_per_year)

# check to see how many of each, what else do we need to drop?
# we need to group a bunch of these together
section_headings_per_year_tally_df <-
  titles_in_each_issue_with_section_headings_df %>%
  mutate(section_heading = tolower(section_heading)) %>%
  mutate(section_heading =
           case_when(
             str_detect(section_heading, "anthropology") ~ "anthropology",
             str_detect(section_heading, "biochemistry") ~ "biochemistry",
             str_detect(section_heading, "biophysics") ~ "biophysics",
             str_detect(section_heading, "biolog") ~ "biology",
             str_detect(section_heading, "botany") ~ "botany",
             str_detect(section_heading, "math") ~ "math",
             str_detect(section_heading, "physical sciences") ~ "physical sciences",
             str_detect(section_heading, "cell biology") ~ "cell biology",
             str_detect(section_heading, "developmental biology") ~ "developmental biology",
             str_detect(section_heading, "geology") ~ "geology",
             str_detect(section_heading, "psycho") ~ "psychology",
             str_detect(section_heading, "immuno") ~ "immunology",
             str_detect(section_heading, "pathology") ~ "pathology",
             str_detect(section_heading, "physics") ~ "physics",
             str_detect(section_heading, "physio") ~ "physiology",
             str_detect(section_heading, "zoölogy") ~ "zoology",
             str_detect(section_heading, "agricult") ~ "agriculture",
             TRUE ~ section_heading
           )) %>%
  mutate(section_heading = str_remove(section_heading, "biological sciences:|physical sciences:")) %>%
  mutate(section_heading = str_squish(section_heading)) %>%
  group_by(year, section_heading) %>%
  summarise(n = n())   %>%
  drop_na()

# how many papers in each subject per year? Lets just focus on the major subject areas
p1 <-
  section_headings_per_year_tally_df %>%
  ggplot() +
  aes(year, n,
      colour = section_heading,
      fill = section_heading) +
  geom_col()

# make it interactive
plotly::ggplotly(p1)

# another way to get all anthro articles is to scrape the PNAS search page here
# https://www-pnas-org.offcampus.lib.washington.edu/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest
# it shows 1002 articles

# another PNAS keyword search shows 1474 articles
pnas_archaeology_search <- "https://www.pnas.org/action/doSearch?AllField=archaeology&stemming=yes&publication=pnas&pageSize=1500&startPage=0"
# save as webpage to my disk, then read in
search_results <-
"Search Result _ Proceedings of the National Academy of Sciences.html" %>%
  read_html() %>%
  html_nodes(".items-results .card__meta__date , .items-results .animation-underline") %>%
  html_text() %>%
  matrix(., ncol = 2, byrow = TRUE) %>%
  tibble() %>%
  mutate(year = parse_number(str_sub(.[,1],  -4)))

search_results %>%
  count(year) %>%
ggplot() +
  aes(year, n) +
  geom_line()


# or for 'archaeology' it shows 74 articles
# https://www-pnas-org.offcampus.lib.washington.edu/action/doSearch?Concept=500376&Concept=500375&sortBy=Earliest&pageSize=500&startPage=&rel=nofollow&KeywordRaw=archaeology
# another option is pubmed and the MeSH keywords (n = 123 articles)
# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D%29+AND+%28%22Proceedings+of+the+National+Academy+of+Sciences+of+the+United+States+of+America%22%5BJournal%5D%29&sort=
# same for Science (n = 219)
# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D%29+AND+%28%22Science+%28New+York%2C+N.Y.%29%22%5BJournal%5D%29&size=200
# and Nature (n = 126, but only one of my papers is in here)
# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D+%29+AND+%28%22Nature%22%5BJournal%5D%29&sort=


# JB says she searched for "archaeolog*" and "archeolog*", these return 257 results for me,
# much less than the 1002 we get from searching the archaeology 'keyword, e.g.
# https://www.pnas.org/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest

# in any case, let's start with "archaeolog*" and "archeolog*", I've copied the URL of the search
# results page and edited it to return 500 items on the first page, so we can get all results without
# having to scrape multiple pages of results, just to simplify the process
library(tidyverse)
library(rvest)

search_url <- "https://www.pnas.org/action/doSearch?field1=AllField&text1=archaeolog*+&field2=AllField&text2=archeolog*&field3=AllField&text3=&publication=&Ppub=&access=on&pageSize=500&startPage=0"

# this takes a few seconds:
search_results_titles_and_dates_df <-
search_url %>%
read_html() %>%
  # extract content from elements on the page that contain the article title and date
  html_nodes(".items-results .card__meta__date , .items-results .animation-underline") %>%
  # convert to human-readable text
  html_text() %>%
  # reshape into a data frame with title in one col and date in another col
  matrix(., ncol = 2, byrow = TRUE) %>%
  tibble() %>%
  # extract year from first column and convert from character to numeric
  mutate(year = parse_number(str_sub(.[,1],  -4)))

# how many papers per year?
search_results_titles_and_dates_df %>%
  count(year) %>%
  ggplot() +
  aes(year, n) +
  geom_col() +
  theme_minimal()

# let's get the URLs to each paper so we can get abstracts, etc.
# this takes a few seconds
search_results_urls_df <-
search_url %>%
  read_html() %>%
  # extract content from elements on the page that contain the article title link
  html_nodes(".items-results .animation-underline") %>%
  # convert to human-readable text
  html_attr("href") %>%
  # construct full URL from these results
  str_c("http://pnas.org", .)

# for each link to a paper, get the content type,title, abstract,
# subject and date of that paper.
# This takes quite a few minutes
search_results_abstract_title_date_subject <-
map(search_results_urls_df,
    ~.x %>%
      read_html() %>%
      html_nodes(".core-date-published span , h1 , .core-container > div:nth-child(1)") %>%
      # convert to human-readable text
      html_text()
      )

# clean and tidy the titles, dates and abstracts into a nice tibble
search_results_abstract_title_date_subject_tidy_df <-
map_df(search_results_abstract_title_date_subject,
    ~tibble(article_type = .x[1],
            title = .x[2],
            date = .x[3],
            abstract = .x[4])) %>%
  mutate(article_type = str_squish(str_remove(article_type, " Share on"))) %>%
  # extract year from data and convert from character to numeric
  mutate(year = parse_number(str_sub(date,  -4))) %>%
  filter(!article_type %in% c("QnAs", "This Week in PNAS")) %>%
  filter(str_detect(article_type, "Research Article")) %>%
  separate(article_type,
           into = c("article_type", "pnas_subject"),
           sep = 16)


library(quanteda)

# explore species studied for domestication, inspired by this plot
# https://twitter.com/JBeckArch/status/1510346774283698177

# convert abstract text into document feature matrix and clean the text
papers_about_domestication <-
search_results_abstract_title_date_subject_tidy_df %>%
  filter(str_detect(abstract, "domestica"))

my_dfm <-
corpus(papers_about_domestication$abstract,
       docvars = papers_about_domestication$year) %>%
  tokens(remove_numbers = TRUE,
         remove_symbols = TRUE,
        remove_punct = TRUE) %>%
  dfm(tolower = TRUE,
      remove_padding = TRUE) %>%
  dfm_select(pattern = stopwords("en"),
             selection = "remove") %>%
  dfm_select(pattern = c("dog", "dogs",
                         "maize", "wheat",
                         "horse", "horses",
                         "rice", "squash",
                         "cattle", "cows", "cow"),
             selection = "keep")

# tidy up and plot
my_dfm %>%
  convert(to = "data.frame") %>%
  cbind(docvars(my_dfm)) %>%
  mutate(horse = horse + horses) %>%
  select(-horses,
         - doc_id,
         year = docvars) %>%
  pivot_longer(-year,
               names_to = "thing",
               values_to = "n") %>%
  group_by(thing) %>%
  summarise(n = sum(n)) %>%
  ggplot() +
  aes(reorder(thing, -n), n) +
  geom_col() +
  theme_minimal() +
  xlab("Domesticate")

	# JB says she searched for "archaeolog" and "archeolog", these return 257 results for me,
	# much less than the 1002 we get from searching the archaeology 'keyword', e.g.
	# https://www.pnas.org/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest

	# in any case, let's start with "archaeolog" and "archeolog", I've copied the URL of the search
	# results page and edited the URL to return 500 items on the first page, so we can get all results without
	# having to scrape multiple pages of results, just to simplify the process

	library(tidyverse)
	library(rvest)

	search_url <- "https://www.pnas.org/action/doSearch?field1=AllField&text1=archaeolog+&field2=AllField&text2=archeolog&field3=AllField&text3=&publication=&Ppub=&access=on&pageSize=500&startPage=0"

	# let's get the URLs to each paper listed on that search results page so we can get abstracts, etc.
	# this takes a few seconds

	search_results_urls_df <-
	search_url %>%
	read_html() %>%
	# extract content from elements on the page that contain the article title link
	# I used the Selector Gadget Chrome Extension to identify these elements
	html_nodes(".items-results .animation-underline") %>%
	# convert to human-readable text
	html_attr("href") %>%
	# construct full URL from these results
	str_c("http://pnas.org", .)

	# and now for each link to a paper, we get the content type, title, abstract,
	# subject and date of that paper.
	# This takes several minutes, maybe more than 10?

	search_results_abstract_title_date_subject <-
	map(search_results_urls_df,
	# for each, url, do this:
	~.x %>%
	read_html() %>%
	html_nodes(".core-date-published span , h1 , .core-container > div:nth-child(1)") %>%
	# convert to human-readable text
	html_text()
	)

	# clean and tidy the titles, dates and abstracts into a nice tibble
	search_results_abstract_title_date_subject_tidy_df <-
	map_df(search_results_abstract_title_date_subject,
	~tibble(article_type = .x[1],
	title = .x[2],
	date = .x[3],
	abstract = .x[4])) %>%
	mutate(article_type = str_squish(str_remove(article_type, " Share on"))) %>%
	# extract year from data and convert from character to numeric
	mutate(year = parse_number(str_sub(date, -4))) %>%
	filter(!article_type %in% c("QnAs", "This Week in PNAS")) %>%
	filter(str_detect(article_type, "Research Article")) %>%
	separate(article_type,
	into = c("article_type", "pnas_subject"),
	sep = 16)


	library(quanteda)

	# explore species studied for domestication, e.g. inspired by this plot
	# https://twitter.com/JBeckArch/status/1510346774283698177

	# convert abstract text into document feature matrix and clean the text

	# get only papers about domestication
	papers_about_domestication <-
	search_results_abstract_title_date_subject_tidy_df %>%
	filter(str_detect(abstract, "domestica"))

	# do text analysis of the abstracts to count how many times certain words appear
	# we can also track word frequency over time with this output

	my_dfm <-
	# convert free text of abstract to corpus
	corpus(papers_about_domestication$abstract,
	docvars = papers_about_domestication$year) %>%
	# convert to tokens and clean
	tokens(remove_numbers = TRUE,
	remove_symbols = TRUE,
	remove_punct = TRUE) %>%
	# convert to document frame matrix
	dfm(tolower = TRUE,
	remove_padding = TRUE) %>%
	# remove stopwords
	dfm_select(pattern = stopwords("en"),
	selection = "remove") %>%
	# keep only words of interest here
	dfm_select(pattern = c("dog", "dogs",
	"maize", "wheat",
	"horse", "horses",
	"rice", "squash",
	"cattle", "cows", "cow"),
	selection = "keep")

	# tidy up and plot
	my_dfm %>%
	convert(to = "data.frame") %>%
	cbind(docvars(my_dfm)) %>%
	mutate(horse = horse + horses) %>%
	select(-horses,
	- doc_id,
	year = docvars) %>%
	pivot_longer(-year,
	names_to = "thing",
	values_to = "n") %>%
	group_by(thing) %>%
	summarise(n = sum(n)) %>%
	ggplot() +
	aes(reorder(thing, -n), n) +
	geom_col() +
	theme_minimal() +
	xlab("Domesticate")
	library(tidyverse)
	library(rvest)

	# base URL for a single issue
	url_pnas_toc <- "https://www.pnas.org/toc/pnas/"

	# base URL for the list of issues
	url_pnas_loi <- "https://www.pnas.org/loi/pnas/group"

	this_year <- format(Sys.time(), "%Y")

	# make a vector of year ranges to construct a URL to get all the issues pages
	url_issues_pages <-
	map_chr(seq(1910, this_year, 10),
	~paste0(url_pnas_loi, "/d", .x, ".", "y", ifelse(.x+9 < 2022, .x+9, 2022)))

	url_issues_pages <-
	map_chr(1910:this_year,
	# first round the number down to the nearest decade and then
	# put each year at the end
	~paste0(url_pnas_loi, "/d", .x%/%10*10, ".", "y", .x))

	# get the month, vol and issue numbers, subset to get only items with Months in them,
	# these are the issue details. This takes several minutes to complete
	urls_for_specific_issues <-
	map(url_issues_pages,
	~.x %>%
	read_html() %>%
	html_nodes(".flex-column") %>%
	html_text() %>%
	# subset to keep items that have a month name in them
	str_subset(paste0(month.name, collapse = "\|")) %>%
	# extract only vol and issue numbers
	str_extract("Vol.*") %>%
	str_replace_all("\\\|", "/") %>%
	str_remove_all("Vol. \| No. \| ") %>%
	str_c(url_pnas_toc, .))

	# organise so we have the year with the vol and issue
	# so we can analyse change over time

	names(urls_for_specific_issues) <- str_sub(url_issues_pages,-4,-1)

	# reshape into data frame
	urls_for_specific_issues_df <-
	tibble(urls_for_specific_issues = urls_for_specific_issues,
	year = names(urls_for_specific_issues)) %>%
	unnest_longer(urls_for_specific_issues)

	# ---------------------------------------------------------------
	# This next step takes several hours to complete, if we've already
	# done it recently, skip and read in the CSV we previously saved

	# for each specific issue, get the text of the titles and
	# sections in that issue.
	titles_in_each_issue <-
	urls_for_specific_issues_df %>%
	mutate(titles_in_each_issue =
	map(urls_for_specific_issues,
	~.x %>%
	read_html() %>%
	html_nodes(".animation-underline , .mb-4") %>%
	html_text()
	))

	# ---------------------------------------------------------------
	# Save the output so we don't have to do that again. Because this is
	# data frame with nested tibbles, we cannot use CSV but need a binary
	# format

	saveRDS(titles_in_each_issue, "pnas_titles_in_each_issue.rds")

	# read in the CSV we previously saved, if we have one
	f <- "pnas_titles_in_each_issue.rds"
	if (file.exists(f))
	titles_in_each_issue <- readRDS(f)

	# unnest so we can see all the titles and remove some irrelevant materials
	titles_in_each_issue_df <-
	titles_in_each_issue %>%
	unnest(titles_in_each_issue) %>%
	mutate(titles_in_each_issue = str_to_lower(titles_in_each_issue)) %>%
	filter(!titles_in_each_issue %in% str_to_lower( c("Enter text / DOI / keywords / authors / etc",
	"Enter words / phrases / DOI / ISBN / keywords / authors / etc",
	"Current Issue",
	"Journal_Article",
	"PNAS Nexus",
	"Special Features",
	"Colloquia" ,
	"List of Issues",
	"Collected Papers",
	"Front Matter",
	"Journal Club",
	"Podcasts",
	"About",
	"Diversity and Inclusion",
	"Authors",
	"Reviewers",
	"Subscribers",
	"Librarians",
	"Press",
	"Cozzarelli Prize",
	"PNAS Updates",
	"This Week in PNAS",
	"In This Issue",
	"Corrections",
	"QnAs",
	"Inner Workings",
	"Retrospective",
	"Opinion",
	"Profile",
	"Addendum",
	"Letters",
	"Letters (Online Only)",
	"Letter (Online Only)",
	"News Feature",
	"Errata",
	"Articles",
	"Article",
	"Awards of Medals",
	"Award of Medals",
	"PNAS Plus Significance Statements",
	"Research Article",
	"Research Articles",
	"Feature Article",
	"Introduction",
	"NATIONAL ACADEMY OF SCIENCES",
	"National Academy of Sciences",
	"National Research Council",
	"Brief Report",
	"Brief Reports",
	"Author Index",
	"Perspective",
	"Perspectives",
	"Retraction",
	"Retraction (Online Only)",
	"Commentaries",
	"Profiles",
	"Editorial",
	"Editorials",
	"Commentary",
	"Review",
	"Reviews",
	"nobel lecture",
	"n.a.s. symposium",
	"inaugural article"
	))) %>%
	filter(!str_detect(titles_in_each_issue,
	str_to_lower("Editorial\|Editor\|Report of the\|Extracts from the Minutes\|Minutes of\|Corrections\|Correction\|Correction for\|Colloquium\|PNAS"))) %>%
	# drop blank titles
	filter(titles_in_each_issue != "") %>%
	mutate(titles_in_each_issue = str_to_lower( str_squish(titles_in_each_issue)) ) %>%
	distinct()

	# get the subject section headings, which are still in the data frame
	# from browsing these, I think ones that occur more than ten times are real
	# section headers and not articles.
	titles_in_each_issue_df_section_headers <-
	titles_in_each_issue_df %>%
	mutate(word_count = str_count(titles_in_each_issue, pattern = '\\w+')) %>%
	filter(word_count %in% 1:4) %>%
	count(titles_in_each_issue) %>%
	arrange(desc(n)) %>%
	filter(!str_detect(titles_in_each_issue, "a \|profile\|on ")) %>%
	filter(n >= 2)

	# put section headings in their own column and delete from the article title column
	# this has to be done by year, since some years they didn't use subject headings
	titles_in_each_issue_with_section_headings_df <-
	titles_in_each_issue_df %>%
	mutate(year = parse_number(year)) %>%
	mutate(section_heading = ifelse(titles_in_each_issue %in% titles_in_each_issue_df_section_headers$titles_in_each_issue,
	titles_in_each_issue, NA)) %>%
	# do the fill down within each year only, so that we don't run over into a year with no
	# subject headers
	nest(data = -year) %>%
	mutate(section_heading_filled = map(data, ~.x %>% fill(section_heading))) %>%
	unnest(section_heading_filled) %>%
	filter(!titles_in_each_issue %in% titles_in_each_issue_df_section_headers$titles_in_each_issue) %>%
	select(-data) %>%
	# for now, drop some mystery years that have unusually high amounts of certain topics
	mutate(section_heading = ifelse(year %in% c(1990, 1991, 1993, 1996),
	NA, section_heading))

	# how many papers per year?
	plot_papers_per_year <-
	titles_in_each_issue_with_section_headings_df %>%
	count(year) %>%
	ggplot() +
	aes(year, n) +
	geom_col() +
	theme_minimal()

	# I think the section headings are not straightforward to use for tallying papers per subject

	# make it interactive
	plotly::ggplotly(plot_papers_per_year)

	# check to see how many of each, what else do we need to drop?
	# we need to group a bunch of these together
	section_headings_per_year_tally_df <-
	titles_in_each_issue_with_section_headings_df %>%
	mutate(section_heading = tolower(section_heading)) %>%
	mutate(section_heading =
	case_when(
	str_detect(section_heading, "anthropology") ~ "anthropology",
	str_detect(section_heading, "biochemistry") ~ "biochemistry",
	str_detect(section_heading, "biophysics") ~ "biophysics",
	str_detect(section_heading, "biolog") ~ "biology",
	str_detect(section_heading, "botany") ~ "botany",
	str_detect(section_heading, "math") ~ "math",
	str_detect(section_heading, "physical sciences") ~ "physical sciences",
	str_detect(section_heading, "cell biology") ~ "cell biology",
	str_detect(section_heading, "developmental biology") ~ "developmental biology",
	str_detect(section_heading, "geology") ~ "geology",
	str_detect(section_heading, "psycho") ~ "psychology",
	str_detect(section_heading, "immuno") ~ "immunology",
	str_detect(section_heading, "pathology") ~ "pathology",
	str_detect(section_heading, "physics") ~ "physics",
	str_detect(section_heading, "physio") ~ "physiology",
	str_detect(section_heading, "zoölogy") ~ "zoology",
	str_detect(section_heading, "agricult") ~ "agriculture",
	TRUE ~ section_heading
	)) %>%
	mutate(section_heading = str_remove(section_heading, "biological sciences:\|physical sciences:")) %>%
	mutate(section_heading = str_squish(section_heading)) %>%
	group_by(year, section_heading) %>%
	summarise(n = n()) %>%
	drop_na()

	# how many papers in each subject per year? Lets just focus on the major subject areas
	p1 <-
	section_headings_per_year_tally_df %>%
	ggplot() +
	aes(year, n,
	colour = section_heading,
	fill = section_heading) +
	geom_col()

	# make it interactive
	plotly::ggplotly(p1)

	# another way to get all anthro articles is to scrape the PNAS search page here
	# https://www-pnas-org.offcampus.lib.washington.edu/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest
	# it shows 1002 articles

	# another PNAS keyword search shows 1474 articles
	pnas_archaeology_search <- "https://www.pnas.org/action/doSearch?AllField=archaeology&stemming=yes&publication=pnas&pageSize=1500&startPage=0"
	# save as webpage to my disk, then read in
	search_results <-
	"Search Result _ Proceedings of the National Academy of Sciences.html" %>%
	read_html() %>%
	html_nodes(".items-results .card__meta__date , .items-results .animation-underline") %>%
	html_text() %>%
	matrix(., ncol = 2, byrow = TRUE) %>%
	tibble() %>%
	mutate(year = parse_number(str_sub(.[,1], -4)))

	search_results %>%
	count(year) %>%
	ggplot() +
	aes(year, n) +
	geom_line()


	# or for 'archaeology' it shows 74 articles
	# https://www-pnas-org.offcampus.lib.washington.edu/action/doSearch?Concept=500376&Concept=500375&sortBy=Earliest&pageSize=500&startPage=&rel=nofollow&KeywordRaw=archaeology
	# another option is pubmed and the MeSH keywords (n = 123 articles)
	# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D%29+AND+%28%22Proceedings+of+the+National+Academy+of+Sciences+of+the+United+States+of+America%22%5BJournal%5D%29&sort=
	# same for Science (n = 219)
	# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D%29+AND+%28%22Science+%28New+York%2C+N.Y.%29%22%5BJournal%5D%29&size=200
	# and Nature (n = 126, but only one of my papers is in here)
	# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D+%29+AND+%28%22Nature%22%5BJournal%5D%29&sort=


	# JB says she searched for "archaeolog" and "archeolog", these return 257 results for me,
	# much less than the 1002 we get from searching the archaeology 'keyword, e.g.
	# https://www.pnas.org/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest

	# in any case, let's start with "archaeolog" and "archeolog", I've copied the URL of the search
	# results page and edited it to return 500 items on the first page, so we can get all results without
	# having to scrape multiple pages of results, just to simplify the process
	library(tidyverse)
	library(rvest)

	search_url <- "https://www.pnas.org/action/doSearch?field1=AllField&text1=archaeolog+&field2=AllField&text2=archeolog&field3=AllField&text3=&publication=&Ppub=&access=on&pageSize=500&startPage=0"

	# this takes a few seconds:
	search_results_titles_and_dates_df <-
	search_url %>%
	read_html() %>%
	# extract content from elements on the page that contain the article title and date
	html_nodes(".items-results .card__meta__date , .items-results .animation-underline") %>%
	# convert to human-readable text
	html_text() %>%
	# reshape into a data frame with title in one col and date in another col
	matrix(., ncol = 2, byrow = TRUE) %>%
	tibble() %>%
	# extract year from first column and convert from character to numeric
	mutate(year = parse_number(str_sub(.[,1], -4)))

	# how many papers per year?
	search_results_titles_and_dates_df %>%
	count(year) %>%
	ggplot() +
	aes(year, n) +
	geom_col() +
	theme_minimal()

	# let's get the URLs to each paper so we can get abstracts, etc.
	# this takes a few seconds
	search_results_urls_df <-
	search_url %>%
	read_html() %>%
	# extract content from elements on the page that contain the article title link
	html_nodes(".items-results .animation-underline") %>%
	# convert to human-readable text
	html_attr("href") %>%
	# construct full URL from these results
	str_c("http://pnas.org", .)

	# for each link to a paper, get the content type,title, abstract,
	# subject and date of that paper.
	# This takes quite a few minutes
	search_results_abstract_title_date_subject <-
	map(search_results_urls_df,
	~.x %>%
	read_html() %>%
	html_nodes(".core-date-published span , h1 , .core-container > div:nth-child(1)") %>%
	# convert to human-readable text
	html_text()
	)

	# clean and tidy the titles, dates and abstracts into a nice tibble
	search_results_abstract_title_date_subject_tidy_df <-
	map_df(search_results_abstract_title_date_subject,
	~tibble(article_type = .x[1],
	title = .x[2],
	date = .x[3],
	abstract = .x[4])) %>%
	mutate(article_type = str_squish(str_remove(article_type, " Share on"))) %>%
	# extract year from data and convert from character to numeric
	mutate(year = parse_number(str_sub(date, -4))) %>%
	filter(!article_type %in% c("QnAs", "This Week in PNAS")) %>%
	filter(str_detect(article_type, "Research Article")) %>%
	separate(article_type,
	into = c("article_type", "pnas_subject"),
	sep = 16)


	library(quanteda)

	# explore species studied for domestication, inspired by this plot
	# https://twitter.com/JBeckArch/status/1510346774283698177

	# convert abstract text into document feature matrix and clean the text
	papers_about_domestication <-
	search_results_abstract_title_date_subject_tidy_df %>%
	filter(str_detect(abstract, "domestica"))

	my_dfm <-
	corpus(papers_about_domestication$abstract,
	docvars = papers_about_domestication$year) %>%
	tokens(remove_numbers = TRUE,
	remove_symbols = TRUE,
	remove_punct = TRUE) %>%
	dfm(tolower = TRUE,
	remove_padding = TRUE) %>%
	dfm_select(pattern = stopwords("en"),
	selection = "remove") %>%
	dfm_select(pattern = c("dog", "dogs",
	"maize", "wheat",
	"horse", "horses",
	"rice", "squash",
	"cattle", "cows", "cow"),
	selection = "keep")

	# tidy up and plot
	my_dfm %>%
	convert(to = "data.frame") %>%
	cbind(docvars(my_dfm)) %>%
	mutate(horse = horse + horses) %>%
	select(-horses,
	- doc_id,
	year = docvars) %>%
	pivot_longer(-year,
	names_to = "thing",
	values_to = "n") %>%
	group_by(thing) %>%
	summarise(n = sum(n)) %>%
	ggplot() +
	aes(reorder(thing, -n), n) +
	geom_col() +
	theme_minimal() +
	xlab("Domesticate")