Last active
October 6, 2022 06:42
-
-
Save benmarwick/0e3bcf3a8f24981adc18ad4730f1efc3 to your computer and use it in GitHub Desktop.
scrape PNAS archaeology articles and basic text analysis of abstracts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# JB says she searched for "archaeolog*" and "archeolog*", these return 257 results for me, | |
# much less than the 1002 we get from searching the archaeology 'keyword', e.g. | |
# https://www.pnas.org/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest | |
# in any case, let's start with "archaeolog*" and "archeolog*", I've copied the URL of the search | |
# results page and edited the URL to return 500 items on the first page, so we can get all results without | |
# having to scrape multiple pages of results, just to simplify the process | |
library(tidyverse) | |
library(rvest) | |
search_url <- "https://www.pnas.org/action/doSearch?field1=AllField&text1=archaeolog*+&field2=AllField&text2=archeolog*&field3=AllField&text3=&publication=&Ppub=&access=on&pageSize=500&startPage=0" | |
# let's get the URLs to each paper listed on that search results page so we can get abstracts, etc. | |
# this takes a few seconds | |
search_results_urls_df <- | |
search_url %>% | |
read_html() %>% | |
# extract content from elements on the page that contain the article title link | |
# I used the Selector Gadget Chrome Extension to identify these elements | |
html_nodes(".items-results .animation-underline") %>% | |
# convert to human-readable text | |
html_attr("href") %>% | |
# construct full URL from these results | |
str_c("http://pnas.org", .) | |
# and now for each link to a paper, we get the content type, title, abstract, | |
# subject and date of that paper. | |
# This takes several minutes, maybe more than 10? | |
search_results_abstract_title_date_subject <- | |
map(search_results_urls_df, | |
# for each, url, do this: | |
~.x %>% | |
read_html() %>% | |
html_nodes(".core-date-published span , h1 , .core-container > div:nth-child(1)") %>% | |
# convert to human-readable text | |
html_text() | |
) | |
# clean and tidy the titles, dates and abstracts into a nice tibble | |
search_results_abstract_title_date_subject_tidy_df <- | |
map_df(search_results_abstract_title_date_subject, | |
~tibble(article_type = .x[1], | |
title = .x[2], | |
date = .x[3], | |
abstract = .x[4])) %>% | |
mutate(article_type = str_squish(str_remove(article_type, " Share on"))) %>% | |
# extract year from data and convert from character to numeric | |
mutate(year = parse_number(str_sub(date, -4))) %>% | |
filter(!article_type %in% c("QnAs", "This Week in PNAS")) %>% | |
filter(str_detect(article_type, "Research Article")) %>% | |
separate(article_type, | |
into = c("article_type", "pnas_subject"), | |
sep = 16) | |
library(quanteda) | |
# explore species studied for domestication, e.g. inspired by this plot | |
# https://twitter.com/JBeckArch/status/1510346774283698177 | |
# convert abstract text into document feature matrix and clean the text | |
# get only papers about domestication | |
papers_about_domestication <- | |
search_results_abstract_title_date_subject_tidy_df %>% | |
filter(str_detect(abstract, "domestica")) | |
# do text analysis of the abstracts to count how many times certain words appear | |
# we can also track word frequency over time with this output | |
my_dfm <- | |
# convert free text of abstract to corpus | |
corpus(papers_about_domestication$abstract, | |
docvars = papers_about_domestication$year) %>% | |
# convert to tokens and clean | |
tokens(remove_numbers = TRUE, | |
remove_symbols = TRUE, | |
remove_punct = TRUE) %>% | |
# convert to document frame matrix | |
dfm(tolower = TRUE, | |
remove_padding = TRUE) %>% | |
# remove stopwords | |
dfm_select(pattern = stopwords("en"), | |
selection = "remove") %>% | |
# keep only words of interest here | |
dfm_select(pattern = c("dog", "dogs", | |
"maize", "wheat", | |
"horse", "horses", | |
"rice", "squash", | |
"cattle", "cows", "cow"), | |
selection = "keep") | |
# tidy up and plot | |
my_dfm %>% | |
convert(to = "data.frame") %>% | |
cbind(docvars(my_dfm)) %>% | |
mutate(horse = horse + horses) %>% | |
select(-horses, | |
- doc_id, | |
year = docvars) %>% | |
pivot_longer(-year, | |
names_to = "thing", | |
values_to = "n") %>% | |
group_by(thing) %>% | |
summarise(n = sum(n)) %>% | |
ggplot() + | |
aes(reorder(thing, -n), n) + | |
geom_col() + | |
theme_minimal() + | |
xlab("Domesticate") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(rvest) | |
# base URL for a single issue | |
url_pnas_toc <- "https://www.pnas.org/toc/pnas/" | |
# base URL for the list of issues | |
url_pnas_loi <- "https://www.pnas.org/loi/pnas/group" | |
this_year <- format(Sys.time(), "%Y") | |
# make a vector of year ranges to construct a URL to get all the issues pages | |
url_issues_pages <- | |
map_chr(seq(1910, this_year, 10), | |
~paste0(url_pnas_loi, "/d", .x, ".", "y", ifelse(.x+9 < 2022, .x+9, 2022))) | |
url_issues_pages <- | |
map_chr(1910:this_year, | |
# first round the number down to the nearest decade and then | |
# put each year at the end | |
~paste0(url_pnas_loi, "/d", .x%/%10*10, ".", "y", .x)) | |
# get the month, vol and issue numbers, subset to get only items with Months in them, | |
# these are the issue details. This takes several minutes to complete | |
urls_for_specific_issues <- | |
map(url_issues_pages, | |
~.x %>% | |
read_html() %>% | |
html_nodes(".flex-column") %>% | |
html_text() %>% | |
# subset to keep items that have a month name in them | |
str_subset(paste0(month.name, collapse = "|")) %>% | |
# extract only vol and issue numbers | |
str_extract("Vol.*") %>% | |
str_replace_all("\\|", "/") %>% | |
str_remove_all("Vol. | No. | ") %>% | |
str_c(url_pnas_toc, .)) | |
# organise so we have the year with the vol and issue | |
# so we can analyse change over time | |
names(urls_for_specific_issues) <- str_sub(url_issues_pages,-4,-1) | |
# reshape into data frame | |
urls_for_specific_issues_df <- | |
tibble(urls_for_specific_issues = urls_for_specific_issues, | |
year = names(urls_for_specific_issues)) %>% | |
unnest_longer(urls_for_specific_issues) | |
# --------------------------------------------------------------- | |
# This next step takes several hours to complete, if we've already | |
# done it recently, skip and read in the CSV we previously saved | |
# for each specific issue, get the text of the titles and | |
# sections in that issue. | |
titles_in_each_issue <- | |
urls_for_specific_issues_df %>% | |
mutate(titles_in_each_issue = | |
map(urls_for_specific_issues, | |
~.x %>% | |
read_html() %>% | |
html_nodes(".animation-underline , .mb-4") %>% | |
html_text() | |
)) | |
# --------------------------------------------------------------- | |
# Save the output so we don't have to do that again. Because this is | |
# data frame with nested tibbles, we cannot use CSV but need a binary | |
# format | |
saveRDS(titles_in_each_issue, "pnas_titles_in_each_issue.rds") | |
# read in the CSV we previously saved, if we have one | |
f <- "pnas_titles_in_each_issue.rds" | |
if (file.exists(f)) | |
titles_in_each_issue <- readRDS(f) | |
# unnest so we can see all the titles and remove some irrelevant materials | |
titles_in_each_issue_df <- | |
titles_in_each_issue %>% | |
unnest(titles_in_each_issue) %>% | |
mutate(titles_in_each_issue = str_to_lower(titles_in_each_issue)) %>% | |
filter(!titles_in_each_issue %in% str_to_lower( c("Enter text / DOI / keywords / authors / etc", | |
"Enter words / phrases / DOI / ISBN / keywords / authors / etc", | |
"Current Issue", | |
"Journal_Article", | |
"PNAS Nexus", | |
"Special Features", | |
"Colloquia" , | |
"List of Issues", | |
"Collected Papers", | |
"Front Matter", | |
"Journal Club", | |
"Podcasts", | |
"About", | |
"Diversity and Inclusion", | |
"Authors", | |
"Reviewers", | |
"Subscribers", | |
"Librarians", | |
"Press", | |
"Cozzarelli Prize", | |
"PNAS Updates", | |
"This Week in PNAS", | |
"In This Issue", | |
"Corrections", | |
"QnAs", | |
"Inner Workings", | |
"Retrospective", | |
"Opinion", | |
"Profile", | |
"Addendum", | |
"Letters", | |
"Letters (Online Only)", | |
"Letter (Online Only)", | |
"News Feature", | |
"Errata", | |
"Articles", | |
"Article", | |
"Awards of Medals", | |
"Award of Medals", | |
"PNAS Plus Significance Statements", | |
"Research Article", | |
"Research Articles", | |
"Feature Article", | |
"Introduction", | |
"NATIONAL ACADEMY OF SCIENCES", | |
"National Academy of Sciences", | |
"National Research Council", | |
"Brief Report", | |
"Brief Reports", | |
"Author Index", | |
"Perspective", | |
"Perspectives", | |
"Retraction", | |
"Retraction (Online Only)", | |
"Commentaries", | |
"Profiles", | |
"Editorial", | |
"Editorials", | |
"Commentary", | |
"Review", | |
"Reviews", | |
"nobel lecture", | |
"n.a.s. symposium", | |
"inaugural article" | |
))) %>% | |
filter(!str_detect(titles_in_each_issue, | |
str_to_lower("Editorial|Editor|Report of the|Extracts from the Minutes|Minutes of|Corrections|Correction|Correction for|Colloquium|PNAS"))) %>% | |
# drop blank titles | |
filter(titles_in_each_issue != "") %>% | |
mutate(titles_in_each_issue = str_to_lower( str_squish(titles_in_each_issue)) ) %>% | |
distinct() | |
# get the subject section headings, which are still in the data frame | |
# from browsing these, I think ones that occur more than ten times are real | |
# section headers and not articles. | |
titles_in_each_issue_df_section_headers <- | |
titles_in_each_issue_df %>% | |
mutate(word_count = str_count(titles_in_each_issue, pattern = '\\w+')) %>% | |
filter(word_count %in% 1:4) %>% | |
count(titles_in_each_issue) %>% | |
arrange(desc(n)) %>% | |
filter(!str_detect(titles_in_each_issue, "a |profile|on ")) %>% | |
filter(n >= 2) | |
# put section headings in their own column and delete from the article title column | |
# this has to be done by year, since some years they didn't use subject headings | |
titles_in_each_issue_with_section_headings_df <- | |
titles_in_each_issue_df %>% | |
mutate(year = parse_number(year)) %>% | |
mutate(section_heading = ifelse(titles_in_each_issue %in% titles_in_each_issue_df_section_headers$titles_in_each_issue, | |
titles_in_each_issue, NA)) %>% | |
# do the fill down within each year only, so that we don't run over into a year with no | |
# subject headers | |
nest(data = -year) %>% | |
mutate(section_heading_filled = map(data, ~.x %>% fill(section_heading))) %>% | |
unnest(section_heading_filled) %>% | |
filter(!titles_in_each_issue %in% titles_in_each_issue_df_section_headers$titles_in_each_issue) %>% | |
select(-data) %>% | |
# for now, drop some mystery years that have unusually high amounts of certain topics | |
mutate(section_heading = ifelse(year %in% c(1990, 1991, 1993, 1996), | |
NA, section_heading)) | |
# how many papers per year? | |
plot_papers_per_year <- | |
titles_in_each_issue_with_section_headings_df %>% | |
count(year) %>% | |
ggplot() + | |
aes(year, n) + | |
geom_col() + | |
theme_minimal() | |
# I think the section headings are not straightforward to use for tallying papers per subject | |
# make it interactive | |
plotly::ggplotly(plot_papers_per_year) | |
# check to see how many of each, what else do we need to drop? | |
# we need to group a bunch of these together | |
section_headings_per_year_tally_df <- | |
titles_in_each_issue_with_section_headings_df %>% | |
mutate(section_heading = tolower(section_heading)) %>% | |
mutate(section_heading = | |
case_when( | |
str_detect(section_heading, "anthropology") ~ "anthropology", | |
str_detect(section_heading, "biochemistry") ~ "biochemistry", | |
str_detect(section_heading, "biophysics") ~ "biophysics", | |
str_detect(section_heading, "biolog") ~ "biology", | |
str_detect(section_heading, "botany") ~ "botany", | |
str_detect(section_heading, "math") ~ "math", | |
str_detect(section_heading, "physical sciences") ~ "physical sciences", | |
str_detect(section_heading, "cell biology") ~ "cell biology", | |
str_detect(section_heading, "developmental biology") ~ "developmental biology", | |
str_detect(section_heading, "geology") ~ "geology", | |
str_detect(section_heading, "psycho") ~ "psychology", | |
str_detect(section_heading, "immuno") ~ "immunology", | |
str_detect(section_heading, "pathology") ~ "pathology", | |
str_detect(section_heading, "physics") ~ "physics", | |
str_detect(section_heading, "physio") ~ "physiology", | |
str_detect(section_heading, "zoölogy") ~ "zoology", | |
str_detect(section_heading, "agricult") ~ "agriculture", | |
TRUE ~ section_heading | |
)) %>% | |
mutate(section_heading = str_remove(section_heading, "biological sciences:|physical sciences:")) %>% | |
mutate(section_heading = str_squish(section_heading)) %>% | |
group_by(year, section_heading) %>% | |
summarise(n = n()) %>% | |
drop_na() | |
# how many papers in each subject per year? Lets just focus on the major subject areas | |
p1 <- | |
section_headings_per_year_tally_df %>% | |
ggplot() + | |
aes(year, n, | |
colour = section_heading, | |
fill = section_heading) + | |
geom_col() | |
# make it interactive | |
plotly::ggplotly(p1) | |
# another way to get all anthro articles is to scrape the PNAS search page here | |
# https://www-pnas-org.offcampus.lib.washington.edu/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest | |
# it shows 1002 articles | |
# another PNAS keyword search shows 1474 articles | |
pnas_archaeology_search <- "https://www.pnas.org/action/doSearch?AllField=archaeology&stemming=yes&publication=pnas&pageSize=1500&startPage=0" | |
# save as webpage to my disk, then read in | |
search_results <- | |
"Search Result _ Proceedings of the National Academy of Sciences.html" %>% | |
read_html() %>% | |
html_nodes(".items-results .card__meta__date , .items-results .animation-underline") %>% | |
html_text() %>% | |
matrix(., ncol = 2, byrow = TRUE) %>% | |
tibble() %>% | |
mutate(year = parse_number(str_sub(.[,1], -4))) | |
search_results %>% | |
count(year) %>% | |
ggplot() + | |
aes(year, n) + | |
geom_line() | |
# or for 'archaeology' it shows 74 articles | |
# https://www-pnas-org.offcampus.lib.washington.edu/action/doSearch?Concept=500376&Concept=500375&sortBy=Earliest&pageSize=500&startPage=&rel=nofollow&KeywordRaw=archaeology | |
# another option is pubmed and the MeSH keywords (n = 123 articles) | |
# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D%29+AND+%28%22Proceedings+of+the+National+Academy+of+Sciences+of+the+United+States+of+America%22%5BJournal%5D%29&sort= | |
# same for Science (n = 219) | |
# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D%29+AND+%28%22Science+%28New+York%2C+N.Y.%29%22%5BJournal%5D%29&size=200 | |
# and Nature (n = 126, but only one of my papers is in here) | |
# https://pubmed-ncbi-nlm-nih-gov.offcampus.lib.washington.edu/?term=%28%22Archaeology%22%5BMAJR%5D+%29+AND+%28%22Nature%22%5BJournal%5D%29&sort= | |
# JB says she searched for "archaeolog*" and "archeolog*", these return 257 results for me, | |
# much less than the 1002 we get from searching the archaeology 'keyword, e.g. | |
# https://www.pnas.org/action/doSearch?Concept=500376&Concept=500375&startPage=0&sortBy=Earliest | |
# in any case, let's start with "archaeolog*" and "archeolog*", I've copied the URL of the search | |
# results page and edited it to return 500 items on the first page, so we can get all results without | |
# having to scrape multiple pages of results, just to simplify the process | |
library(tidyverse) | |
library(rvest) | |
search_url <- "https://www.pnas.org/action/doSearch?field1=AllField&text1=archaeolog*+&field2=AllField&text2=archeolog*&field3=AllField&text3=&publication=&Ppub=&access=on&pageSize=500&startPage=0" | |
# this takes a few seconds: | |
search_results_titles_and_dates_df <- | |
search_url %>% | |
read_html() %>% | |
# extract content from elements on the page that contain the article title and date | |
html_nodes(".items-results .card__meta__date , .items-results .animation-underline") %>% | |
# convert to human-readable text | |
html_text() %>% | |
# reshape into a data frame with title in one col and date in another col | |
matrix(., ncol = 2, byrow = TRUE) %>% | |
tibble() %>% | |
# extract year from first column and convert from character to numeric | |
mutate(year = parse_number(str_sub(.[,1], -4))) | |
# how many papers per year? | |
search_results_titles_and_dates_df %>% | |
count(year) %>% | |
ggplot() + | |
aes(year, n) + | |
geom_col() + | |
theme_minimal() | |
# let's get the URLs to each paper so we can get abstracts, etc. | |
# this takes a few seconds | |
search_results_urls_df <- | |
search_url %>% | |
read_html() %>% | |
# extract content from elements on the page that contain the article title link | |
html_nodes(".items-results .animation-underline") %>% | |
# convert to human-readable text | |
html_attr("href") %>% | |
# construct full URL from these results | |
str_c("http://pnas.org", .) | |
# for each link to a paper, get the content type,title, abstract, | |
# subject and date of that paper. | |
# This takes quite a few minutes | |
search_results_abstract_title_date_subject <- | |
map(search_results_urls_df, | |
~.x %>% | |
read_html() %>% | |
html_nodes(".core-date-published span , h1 , .core-container > div:nth-child(1)") %>% | |
# convert to human-readable text | |
html_text() | |
) | |
# clean and tidy the titles, dates and abstracts into a nice tibble | |
search_results_abstract_title_date_subject_tidy_df <- | |
map_df(search_results_abstract_title_date_subject, | |
~tibble(article_type = .x[1], | |
title = .x[2], | |
date = .x[3], | |
abstract = .x[4])) %>% | |
mutate(article_type = str_squish(str_remove(article_type, " Share on"))) %>% | |
# extract year from data and convert from character to numeric | |
mutate(year = parse_number(str_sub(date, -4))) %>% | |
filter(!article_type %in% c("QnAs", "This Week in PNAS")) %>% | |
filter(str_detect(article_type, "Research Article")) %>% | |
separate(article_type, | |
into = c("article_type", "pnas_subject"), | |
sep = 16) | |
library(quanteda) | |
# explore species studied for domestication, inspired by this plot | |
# https://twitter.com/JBeckArch/status/1510346774283698177 | |
# convert abstract text into document feature matrix and clean the text | |
papers_about_domestication <- | |
search_results_abstract_title_date_subject_tidy_df %>% | |
filter(str_detect(abstract, "domestica")) | |
my_dfm <- | |
corpus(papers_about_domestication$abstract, | |
docvars = papers_about_domestication$year) %>% | |
tokens(remove_numbers = TRUE, | |
remove_symbols = TRUE, | |
remove_punct = TRUE) %>% | |
dfm(tolower = TRUE, | |
remove_padding = TRUE) %>% | |
dfm_select(pattern = stopwords("en"), | |
selection = "remove") %>% | |
dfm_select(pattern = c("dog", "dogs", | |
"maize", "wheat", | |
"horse", "horses", | |
"rice", "squash", | |
"cattle", "cows", "cow"), | |
selection = "keep") | |
# tidy up and plot | |
my_dfm %>% | |
convert(to = "data.frame") %>% | |
cbind(docvars(my_dfm)) %>% | |
mutate(horse = horse + horses) %>% | |
select(-horses, | |
- doc_id, | |
year = docvars) %>% | |
pivot_longer(-year, | |
names_to = "thing", | |
values_to = "n") %>% | |
group_by(thing) %>% | |
summarise(n = sum(n)) %>% | |
ggplot() + | |
aes(reorder(thing, -n), n) + | |
geom_col() + | |
theme_minimal() + | |
xlab("Domesticate") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment