@benmarwick
Last active October 6, 2022 06:42
scrape PNAS archaeology articles and basic text analysis of abstracts
# JB says she searched for "archaeolog*" and "archeolog*"; these return 257 results for me,
# far fewer than the 1002 we get from searching the archaeology 'keyword', e.g.
# https://www.pnas.org/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest
# In any case, let's start with "archaeolog*" and "archeolog*". I've copied the URL of the search
# results page and edited it to return 500 items on the first page, so we can get all results
# without having to scrape multiple pages of results, just to simplify the process
library(tidyverse)
library(rvest)
search_url <- "https://www.pnas.org/action/doSearch?field1=AllField&text1=archaeolog*+&field2=AllField&text2=archeolog*&field3=AllField&text3=&publication=&Ppub=&access=on&pageSize=500&startPage=0"
# let's get the URLs to each paper listed on that search results page so we can get abstracts, etc.
# this takes a few seconds
search_results_urls_df <-
search_url %>%
read_html() %>%
# extract content from elements on the page that contain the article title link
# I used the Selector Gadget Chrome Extension to identify these elements
html_nodes(".items-results .animation-underline") %>%
# extract the href attribute (the link to each paper)
html_attr("href") %>%
# construct full URL from these results
str_c("http://pnas.org", .)
# and now for each link to a paper, we get the content type, title, abstract,
# subject and date of that paper.
# This takes several minutes, maybe more than 10?
search_results_abstract_title_date_subject <-
map(search_results_urls_df,
# for each, url, do this:
~.x %>%
read_html() %>%
html_nodes(".core-date-published span , h1 , .core-container > div:nth-child(1)") %>%
# convert to human-readable text
html_text()
)
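# A more defensive variant of the step above (just a sketch, not what was run):
# wrapping the scrape in purrr::possibly() means one page that fails to load
# doesn't lose the whole run, and a short pause between requests is politer to the server
safe_scrape <- possibly(function(url) {
Sys.sleep(1)  # brief pause between requests
url %>%
read_html() %>%
html_nodes(".core-date-published span , h1 , .core-container > div:nth-child(1)") %>%
html_text()
}, otherwise = NA_character_)
# to use it, swap it in for the step above:
# search_results_abstract_title_date_subject <- map(search_results_urls_df, safe_scrape)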
# clean and tidy the titles, dates and abstracts into a nice tibble
search_results_abstract_title_date_subject_tidy_df <-
map_df(search_results_abstract_title_date_subject,
~tibble(article_type = .x[1],
title = .x[2],
date = .x[3],
abstract = .x[4])) %>%
mutate(article_type = str_squish(str_remove(article_type, " Share on"))) %>%
# extract year from the date and convert from character to numeric
mutate(year = parse_number(str_sub(date, -4))) %>%
filter(!article_type %in% c("QnAs", "This Week in PNAS")) %>%
filter(str_detect(article_type, "Research Article")) %>%
# split after the 16th character, i.e. after "Research Article", so the
# PNAS subject that follows goes into its own column
separate(article_type,
into = c("article_type", "pnas_subject"),
sep = 16)
library(quanteda)
# explore species studied for domestication, inspired by this plot:
# https://twitter.com/JBeckArch/status/1510346774283698177
# convert abstract text into document feature matrix and clean the text
# get only papers about domestication
papers_about_domestication <-
search_results_abstract_title_date_subject_tidy_df %>%
filter(str_detect(abstract, "domestica"))
# do text analysis of the abstracts to count how many times certain words appear
# we can also track word frequency over time with this output
my_dfm <-
# convert free text of abstract to corpus
corpus(papers_about_domestication$abstract,
docvars = papers_about_domestication$year) %>%
# convert to tokens and clean
tokens(remove_numbers = TRUE,
remove_symbols = TRUE,
remove_punct = TRUE) %>%
# convert to a document-feature matrix
dfm(tolower = TRUE,
remove_padding = TRUE) %>%
# remove stopwords
dfm_select(pattern = stopwords("en"),
selection = "remove") %>%
# keep only words of interest here
dfm_select(pattern = c("dog", "dogs",
"maize", "wheat",
"horse", "horses",
"rice", "squash",
"cattle", "cows", "cow"),
selection = "keep")
# tidy up and plot
my_dfm %>%
convert(to = "data.frame") %>%
cbind(docvars(my_dfm)) %>%
mutate(horse = horse + horses) %>%
select(-horses,
- doc_id,
year = docvars) %>%
pivot_longer(-year,
names_to = "thing",
values_to = "n") %>%
group_by(thing) %>%
summarise(n = sum(n)) %>%
ggplot() +
aes(reorder(thing, -n), n) +
geom_col() +
theme_minimal() +
xlab("Domesticate")
library(tidyverse)
library(rvest)
# base URL for a single issue
url_pnas_toc <- "https://www.pnas.org/toc/pnas/"
# base URL for the list of issues
url_pnas_loi <- "https://www.pnas.org/loi/pnas/group"
this_year <- as.numeric(format(Sys.time(), "%Y"))
# make a vector of year ranges to construct a URL to get all the issues pages
# (first attempt: one URL per decade; superseded by the per-year version below)
url_issues_pages <-
map_chr(seq(1910, this_year, 10),
~paste0(url_pnas_loi, "/d", .x, ".", "y", min(.x + 9, this_year)))
# one URL per year: round the year down to the nearest decade for the group
# part of the URL, then put the year itself at the end
url_issues_pages <-
map_chr(1910:this_year,
~paste0(url_pnas_loi, "/d", .x %/% 10 * 10, ".", "y", .x))
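# quick sanity check: each element built above should look like, e.g.,
# "https://www.pnas.org/loi/pnas/group/d1910.y1915"
head(url_issues_pages, 3)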
# get the month, vol and issue numbers, subset to get only items with Months in them,
# these are the issue details. This takes several minutes to complete
urls_for_specific_issues <-
map(url_issues_pages,
~.x %>%
read_html() %>%
html_nodes(".flex-column") %>%
html_text() %>%
# subset to keep items that have a month name in them
str_subset(paste0(month.name, collapse = "|")) %>%
# extract only vol and issue numbers
str_extract("Vol.*") %>%
str_replace_all("\\|", "/") %>%
str_remove_all("Vol. | No. | ") %>%
str_c(url_pnas_toc, .))
# organise so we have the year with the vol and issue
# so we can analyse change over time
names(urls_for_specific_issues) <- str_sub(url_issues_pages,-4,-1)
# reshape into data frame
urls_for_specific_issues_df <-
tibble(urls_for_specific_issues = urls_for_specific_issues,
year = names(urls_for_specific_issues)) %>%
unnest_longer(urls_for_specific_issues)
# ---------------------------------------------------------------
# This next step takes several hours to complete; if we've already
# done it recently, skip it and read in the RDS we previously saved
# for each specific issue, get the text of the titles and
# sections in that issue.
titles_in_each_issue <-
urls_for_specific_issues_df %>%
mutate(titles_in_each_issue =
map(urls_for_specific_issues,
~.x %>%
read_html() %>%
html_nodes(".animation-underline , .mb-4") %>%
html_text()
))
# ---------------------------------------------------------------
# Save the output so we don't have to do that again. Because this is a
# data frame with nested tibbles, we cannot use CSV but need a binary
# format
saveRDS(titles_in_each_issue, "pnas_titles_in_each_issue.rds")
# read in the RDS we previously saved, if we have one
f <- "pnas_titles_in_each_issue.rds"
if (file.exists(f))
titles_in_each_issue <- readRDS(f)
# unnest so we can see all the titles and remove some irrelevant materials
titles_in_each_issue_df <-
titles_in_each_issue %>%
unnest(titles_in_each_issue) %>%
mutate(titles_in_each_issue = str_to_lower(titles_in_each_issue)) %>%
filter(!titles_in_each_issue %in% str_to_lower( c("Enter text / DOI / keywords / authors / etc",
"Enter words / phrases / DOI / ISBN / keywords / authors / etc",
"Current Issue",
"Journal_Article",
"PNAS Nexus",
"Special Features",
"Colloquia" ,
"List of Issues",
"Collected Papers",
"Front Matter",
"Journal Club",
"Podcasts",
"About",
"Diversity and Inclusion",
"Authors",
"Reviewers",
"Subscribers",
"Librarians",
"Press",
"Cozzarelli Prize",
"PNAS Updates",
"This Week in PNAS",
"In This Issue",
"Corrections",
"QnAs",
"Inner Workings",
"Retrospective",
"Opinion",
"Profile",
"Addendum",
"Letters",
"Letters (Online Only)",
"Letter (Online Only)",
"News Feature",
"Errata",
"Articles",
"Article",
"Awards of Medals",
"Award of Medals",
"PNAS Plus Significance Statements",
"Research Article",
"Research Articles",
"Feature Article",
"Introduction",
"NATIONAL ACADEMY OF SCIENCES",
"National Academy of Sciences",
"National Research Council",
"Brief Report",
"Brief Reports",
"Author Index",
"Perspective",
"Perspectives",
"Retraction",
"Retraction (Online Only)",
"Commentaries",
"Profiles",
"Editorial",
"Editorials",
"Commentary",
"Review",
"Reviews",
"nobel lecture",
"n.a.s. symposium",
"inaugural article"
))) %>%
filter(!str_detect(titles_in_each_issue,
str_to_lower("Editorial|Editor|Report of the|Extracts from the Minutes|Minutes of|Corrections|Correction|Correction for|Colloquium|PNAS"))) %>%
# drop blank titles
filter(titles_in_each_issue != "") %>%
mutate(titles_in_each_issue = str_to_lower( str_squish(titles_in_each_issue)) ) %>%
distinct()
# get the subject section headings, which are still in the data frame.
# From browsing these, I think the short strings that recur (at least twice,
# after the word-count and pattern filters below) are real section headers
# and not article titles.
titles_in_each_issue_df_section_headers <-
titles_in_each_issue_df %>%
mutate(word_count = str_count(titles_in_each_issue, pattern = '\\w+')) %>%
filter(word_count %in% 1:4) %>%
count(titles_in_each_issue) %>%
arrange(desc(n)) %>%
filter(!str_detect(titles_in_each_issue, "a |profile|on ")) %>%
filter(n >= 2)
# put section headings in their own column and delete from the article title column
# this has to be done by year, since some years they didn't use subject headings
titles_in_each_issue_with_section_headings_df <-
titles_in_each_issue_df %>%
mutate(year = parse_number(year)) %>%
mutate(section_heading = ifelse(titles_in_each_issue %in% titles_in_each_issue_df_section_headers$titles_in_each_issue,
titles_in_each_issue, NA)) %>%
# do the fill down within each year only, so that we don't run over into a year with no
# subject headers
nest(data = -year) %>%
mutate(section_heading_filled = map(data, ~.x %>% fill(section_heading))) %>%
unnest(section_heading_filled) %>%
filter(!titles_in_each_issue %in% titles_in_each_issue_df_section_headers$titles_in_each_issue) %>%
select(-data) %>%
# for now, drop some mystery years that have unusually high amounts of certain topics
mutate(section_heading = ifelse(year %in% c(1990, 1991, 1993, 1996),
NA, section_heading))
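# to illustrate the fill-down idea on a toy example (not part of the analysis):
# a heading propagates down to the article rows that follow it
tibble(title = c("anthropology", "paper a", "paper b", "botany", "paper c"),
section_heading = c("anthropology", NA, NA, "botany", NA)) %>%
fill(section_heading)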
# how many papers per year?
plot_papers_per_year <-
titles_in_each_issue_with_section_headings_df %>%
count(year) %>%
ggplot() +
aes(year, n) +
geom_col() +
theme_minimal()
# I think the section headings are not straightforward to use for tallying papers per subject
# make it interactive
plotly::ggplotly(plot_papers_per_year)
# check to see how many of each, what else do we need to drop?
# we need to group a bunch of these together
section_headings_per_year_tally_df <-
titles_in_each_issue_with_section_headings_df %>%
mutate(section_heading = tolower(section_heading)) %>%
mutate(section_heading =
case_when(
str_detect(section_heading, "anthropology") ~ "anthropology",
str_detect(section_heading, "biochemistry") ~ "biochemistry",
str_detect(section_heading, "biophysics") ~ "biophysics",
str_detect(section_heading, "biolog") ~ "biology",
str_detect(section_heading, "botany") ~ "botany",
str_detect(section_heading, "math") ~ "math",
str_detect(section_heading, "physical sciences") ~ "physical sciences",
str_detect(section_heading, "cell biology") ~ "cell biology",
str_detect(section_heading, "developmental biology") ~ "developmental biology",
str_detect(section_heading, "geology") ~ "geology",
str_detect(section_heading, "psycho") ~ "psychology",
str_detect(section_heading, "immuno") ~ "immunology",
str_detect(section_heading, "pathology") ~ "pathology",
str_detect(section_heading, "physics") ~ "physics",
str_detect(section_heading, "physio") ~ "physiology",
str_detect(section_heading, "zoölogy") ~ "zoology",
str_detect(section_heading, "agricult") ~ "agriculture",
TRUE ~ section_heading
)) %>%
mutate(section_heading = str_remove(section_heading, "biological sciences:|physical sciences:")) %>%
mutate(section_heading = str_squish(section_heading)) %>%
group_by(year, section_heading) %>%
summarise(n = n()) %>%
drop_na()
# how many papers in each subject per year? Let's focus on the major subject
# areas (a sketch for lumping the minor headings together follows the
# interactive plot below)
p1 <-
section_headings_per_year_tally_df %>%
ggplot() +
aes(year, n,
colour = section_heading,
fill = section_heading) +
geom_col()
# make it interactive
plotly::ggplotly(p1)
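# one way to actually lump the minor headings together (a sketch; fct_lump_n()
# comes with forcats, which loads with the tidyverse):
p2 <-
section_headings_per_year_tally_df %>%
ungroup() %>%
mutate(section_heading = fct_lump_n(section_heading, n = 10, w = n)) %>%
group_by(year, section_heading) %>%
summarise(n = sum(n), .groups = "drop") %>%
ggplot() +
aes(year, n, fill = section_heading) +
geom_col()
plotly::ggplotly(p2)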
# another way to get all anthro articles is to scrape the PNAS search page here
# https://www-pnas-org.offcampus.lib.washington.edu/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest
# it shows 1002 articles
# another PNAS keyword search shows 1474 articles
pnas_archaeology_search <- "https://www.pnas.org/action/doSearch?AllField=archaeology&stemming=yes&publication=pnas&pageSize=1500&startPage=0"
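# an alternative to saving the page by hand (next step) would be to fetch it
# directly. This is only a sketch and may not work, since PNAS may refuse
# requests that don't look like they come from a browser; it assumes httr is installed
resp <- httr::GET(pnas_archaeology_search, httr::user_agent("Mozilla/5.0"))
search_page_direct <- read_html(httr::content(resp, as = "text", encoding = "UTF-8"))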
# save as webpage to my disk, then read in
search_results <-
"Search Result _ Proceedings of the National Academy of Sciences.html" %>%
read_html() %>%
html_nodes(".items-results .card__meta__date , .items-results .animation-underline") %>%
html_text() %>%
matrix(., ncol = 2, byrow = TRUE) %>%
as.data.frame() %>%
as_tibble() %>%
# extract the year from the last four characters of the first column
mutate(year = parse_number(str_sub(V1, -4)))
search_results %>%
count(year) %>%
ggplot() +
aes(year, n) +
geom_line()
# or for 'archaeology' it shows 74 articles
# https://www-pnas-org.offcampus.lib.washington.edu/action/doSearch?Concept=500376&Concept=500375&sortBy=Earliest&pageSize=500&startPage=&rel=nofollow&KeywordRaw=archaeology
# another option is pubmed and the MeSH keywords (n = 123 articles)
# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D%29+AND+%28%22Proceedings+of+the+National+Academy+of+Sciences+of+the+United+States+of+America%22%5BJournal%5D%29&sort=
# same for Science (n = 219)
# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D%29+AND+%28%22Science+%28New+York%2C+N.Y.%29%22%5BJournal%5D%29&size=200
# and Nature (n = 126, but only one of my papers is in here)
# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D+%29+AND+%28%22Nature%22%5BJournal%5D%29&sort=
# JB says she searched for "archaeolog*" and "archeolog*"; these return 257 results for me,
# far fewer than the 1002 we get from searching the archaeology 'keyword', e.g.
# https://www.pnas.org/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest
# in any case, let's start with "archaeolog*" and "archeolog*", I've copied the URL of the search
# results page and edited it to return 500 items on the first page, so we can get all results without
# having to scrape multiple pages of results, just to simplify the process
library(tidyverse)
library(rvest)
search_url <- "https://www.pnas.org/action/doSearch?field1=AllField&text1=archaeolog*+&field2=AllField&text2=archeolog*&field3=AllField&text3=&publication=&Ppub=&access=on&pageSize=500&startPage=0"
# this takes a few seconds:
search_results_titles_and_dates_df <-
search_url %>%
read_html() %>%
# extract content from elements on the page that contain the article title and date
html_nodes(".items-results .card__meta__date , .items-results .animation-underline") %>%
# convert to human-readable text
html_text() %>%
# reshape into a data frame with title in one col and date in another col
matrix(., ncol = 2, byrow = TRUE) %>%
as.data.frame() %>%
as_tibble() %>%
# extract year from the last four characters of the first column and convert to numeric
mutate(year = parse_number(str_sub(V1, -4)))
# how many papers per year?
search_results_titles_and_dates_df %>%
count(year) %>%
ggplot() +
aes(year, n) +
geom_col() +
theme_minimal()
# let's get the URLs to each paper so we can get abstracts, etc.
# this takes a few seconds
search_results_urls_df <-
search_url %>%
read_html() %>%
# extract content from elements on the page that contain the article title link
html_nodes(".items-results .animation-underline") %>%
# extract the href attribute (the link to each paper)
html_attr("href") %>%
# construct full URL from these results
str_c("http://pnas.org", .)
# for each link to a paper, get the content type, title, abstract,
# subject and date of that paper.
# This takes quite a few minutes
search_results_abstract_title_date_subject <-
map(search_results_urls_df,
~.x %>%
read_html() %>%
html_nodes(".core-date-published span , h1 , .core-container > div:nth-child(1)") %>%
# convert to human-readable text
html_text()
)
# clean and tidy the titles, dates and abstracts into a nice tibble
search_results_abstract_title_date_subject_tidy_df <-
map_df(search_results_abstract_title_date_subject,
~tibble(article_type = .x[1],
title = .x[2],
date = .x[3],
abstract = .x[4])) %>%
mutate(article_type = str_squish(str_remove(article_type, " Share on"))) %>%
# extract year from the date and convert from character to numeric
mutate(year = parse_number(str_sub(date, -4))) %>%
filter(!article_type %in% c("QnAs", "This Week in PNAS")) %>%
filter(str_detect(article_type, "Research Article")) %>%
separate(article_type,
into = c("article_type", "pnas_subject"),
sep = 16)
library(quanteda)
# explore species studied for domestication, inspired by this plot
# https://twitter.com/JBeckArch/status/1510346774283698177
# convert abstract text into document feature matrix and clean the text
papers_about_domestication <-
search_results_abstract_title_date_subject_tidy_df %>%
filter(str_detect(abstract, "domestica"))
my_dfm <-
corpus(papers_about_domestication$abstract,
docvars = papers_about_domestication$year) %>%
tokens(remove_numbers = TRUE,
remove_symbols = TRUE,
remove_punct = TRUE) %>%
dfm(tolower = TRUE,
remove_padding = TRUE) %>%
dfm_select(pattern = stopwords("en"),
selection = "remove") %>%
dfm_select(pattern = c("dog", "dogs",
"maize", "wheat",
"horse", "horses",
"rice", "squash",
"cattle", "cows", "cow"),
selection = "keep")
# tidy up and plot
my_dfm %>%
convert(to = "data.frame") %>%
cbind(docvars(my_dfm)) %>%
mutate(horse = horse + horses) %>%
select(-horses,
- doc_id,
year = docvars) %>%
pivot_longer(-year,
names_to = "thing",
values_to = "n") %>%
group_by(thing) %>%
summarise(n = sum(n)) %>%
ggplot() +
aes(reorder(thing, -n), n) +
geom_col() +
theme_minimal() +
xlab("Domesticate")