Created
September 8, 2013 23:37
-
-
Save jeremybmerrill/6489562 to your computer and use it in GitHub Desktop.
Scrape the Los Angeles Review of Books for contributors and the authors of reviewed books, then classify each by gender using the pronouns in their biographies (or, failing that, by statistical probability when the name is sufficiently unambiguous)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'upton' | |
require 'date' | |
require 'guess' | |
# Enable verbose logging on every Upton scraper created below.
GLOBAL_VERBOSE = true
# Pronoun-matching rules:
# - any lowercased pronoun is okay
# - capitalized pronouns are okay unless they're in a book title, which is a series of capitalized words;
#   that is, capitalized pronouns are okay if there are zero alphabetic characters between them and a sentence-final punct
FEMALE_REGEXES = [/ she[\.,\s!?\' ]/, / her[\.,\s!?\' ]/,
                  /[.?!][^A-Za-z]+She[\.,\s!?\' ]/, /[.?!][^A-Za-z]+Her[\.,\s!?\' ]/ ].freeze
MALE_REGEXES = [/ he[\.,\s!?\' ]/, / his[\.,\s!?\' ]/, / him[\.,\s!?\' ]/,
                /[.?!][^A-Za-z]+He[\.,\s!?\' ]/, /[.?!][^A-Za-z]+His[\.,\s!?\' ]/, /[.?!][^A-Za-z]+Him[\.,\s!?\' ]/
               ].freeze
# Predicates: does the given bio text match any female/male pronoun regex?
# `any?` short-circuits on the first match (the old map/include? form ran
# every regex over the whole bio even after one had already matched).
FEMALE = lambda{|s| FEMALE_REGEXES.any?{|r| s.match(r)} }
MALE = lambda{|s| MALE_REGEXES.any?{|r| s.match(r)} }
MANUAL_GENDERING = {
  # Names you've manually verified, e.g. because they couldn't be
  # classified automatically. These override every automatic method.
  "James P. Example" => "male",
  "Mary Q. Example" => "female"
}.freeze
# Helper method to get the text of a biography from the URL to the
# author/contributor page. Collapses every POSIX whitespace character
# (tabs, newlines, etc.) into a plain ASCII space so the pronoun
# regexes match reliably across line breaks.
def bio_text(url)
  raw = bio(url).text
  raw.gsub(/[[:space:]]/, " ")
end
# Fetch an author/contributor page and return the Nokogiri element that
# holds the biography, with block-quoted passages stripped out.
#
# url - String URL of a LARB contributor or author page.
#
# Returns the `div.article_body` Nokogiri element.
def bio(url)
  scraper = Upton::Scraper.new([url])
  scraper.index_debug = true
  scraper.verbose = GLOBAL_VERBOSE
  bio_elem = scraper.scrape{|html| Nokogiri::HTML(html).css("div.article_body")}[0]
  # Exclude blockquotes in bios (since pronouns there are rarely about the
  # author, and are often about a character). On LARB's website, blockquotes
  # are either in a blockquote element or an h4 with the `padding-left`
  # style attribute set.
  # NodeSet#each on an empty set is a no-op, so the original
  # `unless ...empty?` guards (which ran the h4 `select` twice) are dropped.
  bio_elem.css("blockquote").each(&:remove)
  bio_elem.css("h4").select{|header| header[:style] == "padding-left: 30px;"}.each(&:remove)
  bio_elem
end
# | |
# The "magic" of classifying an author or contributor by their bio. | |
# Gets the bio, then checks if it matches either the FEMALE_REGEXES or | |
# MALE_REGEXES. If it does, returns that gender (unless overridden manually). | |
# If not, and if we can be sufficiently confident about the | |
# author/contributor's name based on the statistics from the `guess` gem, | |
# returns that gender. Otherwise, "Unknown". You'll want to manually code everyone | |
# who is shown with "Unknown." | |
# | |
# Classify an author/contributor's gender. Manual codings in MANUAL_GENDERING
# always win and are checked first, so we can skip scraping the bio entirely
# for names already verified by hand (the original fetched the bio and ran
# every regex even when the result would be discarded). Otherwise the bio's
# pronouns decide; if they are absent or contradictory, fall back to a
# statistical guess from the first name ("Unknown" when not confident enough).
def code_for_gender(url, name)
  return MANUAL_GENDERING[name] if MANUAL_GENDERING.include?(name)

  bio = bio_text(url)
  # Evaluate each pronoun predicate once (the original called each lambda
  # twice, running every regex over the bio a second time).
  matches_female = FEMALE.call(bio)
  matches_male = MALE.call(bio)
  if matches_female && !matches_male
    "female"
  elsif matches_male && !matches_female
    "male"
  else
    guess_gender_from_name(name)
  end
end
# | |
# Guesses a contributor or author's gender from their first name. Adjust the | |
# GENDER_GUESS_CONFIDENCE_THRESHOLD constant to change how sure you have to be | |
# to code an author's gender automatically. | |
# | |
GENDER_GUESS_CONFIDENCE_THRESHOLD = 0.995
# Statistically guess a gender from a first name via the `guess` gem.
# Returns "male"/"female" (as a String) when the gem's confidence clears
# GENDER_GUESS_CONFIDENCE_THRESHOLD; otherwise "Unknown", which flags the
# row for manual coding.
def guess_gender_from_name(name)
  result = Guess.gender(name)
  confidence = result[:confidence]
  return "Unknown" unless confidence && confidence > GENDER_GUESS_CONFIDENCE_THRESHOLD
  result[:gender].to_s
end
# | |
# For the purpose of spot-checking, genders detected via pronouns in a | |
# biography are compared with the statistical guess based on a first name. | |
# `false` means the statistics disagree (but may be wrong.) | |
# `true` means that the statistics agree. | |
# `true (low confidence)` means that the statistics agree, but with low confidence. | |
# Confidence constants can be adjusted. | |
# If there was no gender detected via pronouns in the biography, this returns | |
# a statistical guess. If this guess is low confidence, that's noted too. | |
# | |
GENDER_CONFIRMATION_CONFIDENCE_THRESHOLD = 0.95
GENDER_CONFIRMATION_LOW_CONFIDENCE_THRESHOLD = 0.9
# Spot-check a pronoun-derived gender against the statistical name guess.
# Returns "true"/false/"true (low confidence)"/"Low confidence" when the
# pronoun gender is known, or the statistical guess itself (possibly
# prefixed "Low confidence: ") when it is not. Note the unknown branch
# reuses the stricter GENDER_GUESS_CONFIDENCE_THRESHOLD (0.995).
def does_gender_match?(gender_from_pronouns, name)
  guess = Guess.gender(name)
  # The guess gem appears to return the gender as a Symbol (see the .to_s
  # in guess_gender_from_name); normalize to a String once here. The
  # original compared the raw Symbol to a String, which was always false.
  guessed = guess[:gender].to_s
  confidence = guess[:confidence].to_f
  # BUG FIX: code_for_gender returns "Unknown", but the original tested
  # .include?("UNK") case-sensitively, which never matches "Unknown" --
  # so unknown genders never reached the statistical-fallback branch.
  # Compare case-insensitively instead.
  if gender_from_pronouns.downcase.include?("unk")
    if confidence > GENDER_GUESS_CONFIDENCE_THRESHOLD
      guessed
    else
      "Low confidence: #{guessed}"
    end
  else
    if confidence > GENDER_CONFIRMATION_CONFIDENCE_THRESHOLD
      (guessed == gender_from_pronouns.downcase).to_s
    elsif confidence > GENDER_CONFIRMATION_LOW_CONFIDENCE_THRESHOLD
      # Mixed return types (String vs false) preserved from the original,
      # since both render the same way in the CSV output.
      guessed == gender_from_pronouns.downcase ? "true (low confidence)" : false
    else
      "Low confidence"
    end
  end
end
# Scrape LARB's index page that lists all of its reviews.
# `Scraper#new`'s first argument is the first page of the (paginated) index.
# The second argument is the CSS (or XPath) selector that specifies each individual link.
r = Upton::Scraper.new("http://lareviewofbooks.org/reviews/?sort_by=date&pager=1", "article.hentry div.entry-summary a")
r.index_debug = true
r.verbose = GLOBAL_VERBOSE
# This index is paginated, with 24 per page. Upton will traverse the index for you.
r.paginated = true
r.pagination_param = "pager" #What query string variable is changed per page.
r.pagination_max_pages = 35 #How many pages of reviews there are.
# the scrape_to_csv method handles writing the records you return to a CSV with
# the given filename
r.scrape_to_csv "reviews.csv" do |instance_html, instance_url, instance_index|
  page = Nokogiri::HTML(instance_html)
  #for each page, find the date and titles using the given CSS selectors.
  review_title = page.css("div#book_review h3").text
  page_title = page.css("div#book_review h2").text.strip
  # Strip ordinal suffixes ("8th" -> "8") so DateTime.strptime can parse.
  # NOTE(review): assumes dates render like "September 8th, 2013" -- confirm on the site.
  clean_date = page.css("abbr.published").text.gsub(/(\d)(th|st|nd|rd)/, "\\1")
  date = DateTime.strptime(clean_date, "%B %d, %Y").strftime("%F")
  # use CSS to get the links to bios for each contributor (whose links all have the class `person`)
  # and each author. Reviewer links are kept only when their URL contains
  # `/contributor/`, filtering out other `a.person` links on the page.
  reviewer_links = page.css("a.person").to_a.select{|a| a.attr('href').match(/\/contributor\//)}
  book_author_links = page.css("a.author").to_a
  #create the records for the spreadsheet for authors...
  # (uniq by link text so an author credited twice on one page yields one row)
  book_author_records = book_author_links.uniq{|a| a.text}.map do |book_author_link_el|
    # `resolve_url` is a private Upton helper; `send` bypasses visibility to
    # absolutize the possibly-relative href against the page's URL.
    book_author_link = r.send(:resolve_url, book_author_link_el.attr("href") , instance_url)
    book_author_name = book_author_link_el.text
    gender_from_bio_pronouns = code_for_gender( book_author_link, book_author_name)
    # One CSV row: title, page title, date, name, gender, spot-check, role, bio, source URL.
    [review_title,
     page_title,
     date,
     book_author_name,
     gender_from_bio_pronouns,
     does_gender_match?(gender_from_bio_pronouns, book_author_name),
     "author",
     bio_text(book_author_link),
     instance_url
    ]
  end
  # and for the reviewers (same column layout, role "reviewer")
  reviewer_records = reviewer_links.uniq{|a| a.text}.map do | reviewer_link_el |
    reviewer_link = r.send(:resolve_url, reviewer_link_el .attr('href'), instance_url)
    reviewer_name = reviewer_link_el.text
    gender_from_bio_pronouns = code_for_gender( reviewer_link, reviewer_name )
    [review_title,
     page_title,
     date,
     reviewer_name,
     gender_from_bio_pronouns,
     does_gender_match?(gender_from_bio_pronouns, reviewer_name),
     "reviewer",
     bio_text(reviewer_link),
     instance_url
    ]
  end
  #pass them out of the block; Upton handles writing them to CSV.
  reviewer_records + book_author_records
end
# Scrape LARB's index page that lists all of its essays.
# `Scraper#new`'s first argument is the first page of the (paginated) index.
# The second argument is the CSS (or XPath) selector that specifies each individual link.
r = Upton::Scraper.new("http://lareviewofbooks.org/essays/?sort_by=date&pager=1", "h5.entry-title a")
r.index_debug = true
r.verbose = GLOBAL_VERBOSE
# This index is paginated, with 24 per page. Upton will traverse the index for you.
r.paginated = true
r.pagination_param = "pager"
r.pagination_max_pages = 35
# the scrape_to_csv method handles writing the records you return to a CSV with
# the given filename
r.scrape_to_csv "essays.csv" do |instance_html, instance_url, instance_index|
  page = Nokogiri::HTML(instance_html)
  #for each page, find the date and titles using the given CSS selectors.
  page_title = page.css("div#book_review .article_title")[0].text.strip
  # Strip ordinal suffixes ("8th" -> "8") so DateTime.strptime can parse.
  clean_date = page.css("abbr.published").text.gsub(/(\d)(th|st|nd|rd)/, "\\1")
  date = DateTime.strptime(clean_date, "%B %d, %Y").strftime("%F")
  #we only want contributor pages, not author pages. "authors" in this sense can't write essays.
  # use CSS to get the links to bios for each contributor. On some older pages, contributors have
  # the CSS class of `author` (but their URLs contain `contributor`)
  essayist_links = page.css("a.person", "a.author").to_a.select{|a| a.attr('href').match(/\/contributor\//)}
  # This assignment is also the block's final expression, so `essayist_records`
  # is the value scrape_to_csv writes to the CSV.
  essayist_records = essayist_links.uniq{|a| a.text}.map do | essayist_link_el |
    # `resolve_url` is a private Upton helper; `send` bypasses visibility to
    # absolutize the possibly-relative href against the page's URL.
    essayist_link = r.send(:resolve_url, essayist_link_el.attr('href'), instance_url)
    essayist_name = essayist_link_el.text
    gender_from_bio_pronouns = code_for_gender( essayist_link, essayist_name )
    # First column is blank: essays have no review title, which keeps the
    # columns aligned with reviews.csv.
    ["", page_title,
     date,
     essayist_name,
     gender_from_bio_pronouns,
     does_gender_match?(gender_from_bio_pronouns, essayist_name),
     "essayist",
     bio_text(essayist_link),
     instance_url]
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment