Created
September 8, 2013 23:37
-
-
Save jeremybmerrill/6489562 to your computer and use it in GitHub Desktop.
Scrape the Los Angeles Review of Books for contributors and the authors of reviewed books, then classify each by gender using the pronouns in their biographies (or, failing that, by statistical probability when the name is sufficiently unambiguous)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'upton' | |
require 'date' | |
require 'guess' | |
# Enable verbose logging on every Upton scraper created below.
GLOBAL_VERBOSE = true
# Pronoun-matching rules:
# - any lowercased pronoun is okay
# - capitalized pronouns are okay unless they're in a book title, which is a series of capitalized words;
#   that is, capitalized pronouns are okay if there are zero alphabetic characters between them and a sentence-final punct
FEMALE_REGEXES = [/ she[\.,\s!?\' ]/, / her[\.,\s!?\' ]/,
                  /[.?!][^A-Za-z]+She[\.,\s!?\' ]/, /[.?!][^A-Za-z]+Her[\.,\s!?\' ]/ ].freeze
MALE_REGEXES = [/ he[\.,\s!?\' ]/, / his[\.,\s!?\' ]/, / him[\.,\s!?\' ]/,
                /[.?!][^A-Za-z]+He[\.,\s!?\' ]/, /[.?!][^A-Za-z]+His[\.,\s!?\' ]/, /[.?!][^A-Za-z]+Him[\.,\s!?\' ]/
               ].freeze
# Predicates: does the given bio text match any female/male pronoun regex?
# `any?` short-circuits on the first match (the old map/include? form ran
# every regex over the whole bio even after one had already matched).
FEMALE = lambda{|s| FEMALE_REGEXES.any?{|r| s.match(r)} }
MALE = lambda{|s| MALE_REGEXES.any?{|r| s.match(r)} }
MANUAL_GENDERING = {
  # Names you've manually verified, e.g. because they couldn't be
  # classified automatically. These override every automatic method.
  "James P. Example" => "male",
  "Mary Q. Example" => "female"
}.freeze
# Helper method to get the text of a biography from the URL to the
# author/contributor page. Collapses every POSIX whitespace character
# (tabs, newlines, etc.) into a plain ASCII space so the pronoun
# regexes match reliably across line breaks.
def bio_text(url)
  raw = bio(url).text
  raw.gsub(/[[:space:]]/, " ")
end
# Fetch an author/contributor page and return the Nokogiri element that
# holds the biography, with block-quoted passages stripped out.
#
# url - String URL of a LARB contributor or author page.
#
# Returns the `div.article_body` Nokogiri element.
def bio(url)
  scraper = Upton::Scraper.new([url])
  scraper.index_debug = true
  scraper.verbose = GLOBAL_VERBOSE
  bio_elem = scraper.scrape{|html| Nokogiri::HTML(html).css("div.article_body")}[0]
  # Exclude blockquotes in bios (since pronouns there are rarely about the
  # author, and are often about a character). On LARB's website, blockquotes
  # are either in a blockquote element or an h4 with the `padding-left`
  # style attribute set.
  # NodeSet#each on an empty set is a no-op, so the original
  # `unless ...empty?` guards (which ran the h4 `select` twice) are dropped.
  bio_elem.css("blockquote").each(&:remove)
  bio_elem.css("h4").select{|header| header[:style] == "padding-left: 30px;"}.each(&:remove)
  bio_elem
end
# | |
# The "magic" of classifying an author or contributor by their bio. | |
# Gets the bio, then checks if it matches either the FEMALE_REGEXES or | |
# MALE_REGEXES. If it does, returns that gender (unless overridden manually). | |
# If not, and if we can be sufficiently confident about the | |
# author/contributor's name based on the statistics from the `guess` gem, | |
# returns that gender. Otherwise, "Unknown". You'll want to manually code everyone | |
# who is shown with "Unknown." | |
# | |
# Classify an author/contributor's gender. Manual codings in MANUAL_GENDERING
# always win and are checked first, so we can skip scraping the bio entirely
# for names already verified by hand (the original fetched the bio and ran
# every regex even when the result would be discarded). Otherwise the bio's
# pronouns decide; if they are absent or contradictory, fall back to a
# statistical guess from the first name ("Unknown" when not confident enough).
def code_for_gender(url, name)
  return MANUAL_GENDERING[name] if MANUAL_GENDERING.include?(name)

  bio = bio_text(url)
  # Evaluate each pronoun predicate once (the original called each lambda
  # twice, running every regex over the bio a second time).
  matches_female = FEMALE.call(bio)
  matches_male = MALE.call(bio)
  if matches_female && !matches_male
    "female"
  elsif matches_male && !matches_female
    "male"
  else
    guess_gender_from_name(name)
  end
end
# | |
# Guesses a contributor or author's gender from their first name. Adjust the | |
# GENDER_GUESS_CONFIDENCE_THRESHOLD constant to change how sure you have to be | |
# to code an author's gender automatically. | |
# | |
GENDER_GUESS_CONFIDENCE_THRESHOLD = 0.995
# Statistically guess a gender from a first name via the `guess` gem.
# Returns "male"/"female" (as a String) when the gem's confidence clears
# GENDER_GUESS_CONFIDENCE_THRESHOLD; otherwise "Unknown", which flags the
# row for manual coding.
def guess_gender_from_name(name)
  result = Guess.gender(name)
  confidence = result[:confidence]
  return "Unknown" unless confidence && confidence > GENDER_GUESS_CONFIDENCE_THRESHOLD
  result[:gender].to_s
end
# | |
# For the purpose of spot-checking, genders detected via pronouns in a | |
# biography are compared with the statistical guess based on a first name. | |
# `false` means the statistics disagree (but may be wrong.) | |
# `true` means that the statistics agree. | |
# `true (low confidence)` means that the statistics agree, but with low confidence. | |
# Confidence constants can be adjusted. | |
# If there was no gender detected via pronouns in the biography, this returns | |
# a statistical guess. If this guess is low confidence, that's noted too. | |
# | |
GENDER_CONFIRMATION_CONFIDENCE_THRESHOLD = 0.95
GENDER_CONFIRMATION_LOW_CONFIDENCE_THRESHOLD = 0.9
# Spot-check a pronoun-derived gender against the statistical name guess.
# Returns "true"/false/"true (low confidence)"/"Low confidence" when the
# pronoun gender is known, or the statistical guess itself (possibly
# prefixed "Low confidence: ") when it is not. Note the unknown branch
# reuses the stricter GENDER_GUESS_CONFIDENCE_THRESHOLD (0.995).
def does_gender_match?(gender_from_pronouns, name)
  guess = Guess.gender(name)
  # The guess gem appears to return the gender as a Symbol (see the .to_s
  # in guess_gender_from_name); normalize to a String once here. The
  # original compared the raw Symbol to a String, which was always false.
  guessed = guess[:gender].to_s
  confidence = guess[:confidence].to_f
  # BUG FIX: code_for_gender returns "Unknown", but the original tested
  # .include?("UNK") case-sensitively, which never matches "Unknown" --
  # so unknown genders never reached the statistical-fallback branch.
  # Compare case-insensitively instead.
  if gender_from_pronouns.downcase.include?("unk")
    if confidence > GENDER_GUESS_CONFIDENCE_THRESHOLD
      guessed
    else
      "Low confidence: #{guessed}"
    end
  else
    if confidence > GENDER_CONFIRMATION_CONFIDENCE_THRESHOLD
      (guessed == gender_from_pronouns.downcase).to_s
    elsif confidence > GENDER_CONFIRMATION_LOW_CONFIDENCE_THRESHOLD
      # Mixed return types (String vs false) preserved from the original,
      # since both render the same way in the CSV output.
      guessed == gender_from_pronouns.downcase ? "true (low confidence)" : false
    else
      "Low confidence"
    end
  end
end
# Scrape LARB's index page that lists all of its reviews.
# `Scraper#new`'s first argument is the first page of the (paginated) index.
# The second argument is the CSS (or XPath) selector that specifies each individual link.
r = Upton::Scraper.new("http://lareviewofbooks.org/reviews/?sort_by=date&pager=1", "article.hentry div.entry-summary a")
r.index_debug = true
r.verbose = GLOBAL_VERBOSE
# This index is paginated, with 24 per page. Upton will traverse the index for you.
r.paginated = true
r.pagination_param = "pager" #What query string variable is changed per page.
r.pagination_max_pages = 35 #How many pages of reviews there are.
# the scrape_to_csv method handles writing the records you return to a CSV with
# the given filename
r.scrape_to_csv "reviews.csv" do |instance_html, instance_url, instance_index|
  page = Nokogiri::HTML(instance_html)
  #for each page, find the date and titles using the given CSS selectors.
  review_title = page.css("div#book_review h3").text
  page_title = page.css("div#book_review h2").text.strip
  # Strip ordinal suffixes ("8th" -> "8") so DateTime.strptime can parse.
  # NOTE(review): assumes dates render like "September 8th, 2013" -- confirm on the site.
  clean_date = page.css("abbr.published").text.gsub(/(\d)(th|st|nd|rd)/, "\\1")
  date = DateTime.strptime(clean_date, "%B %d, %Y").strftime("%F")
  # use CSS to get the links to bios for each contributor (whose links all have the class `person`)
  # and each author. Reviewer links are kept only when their URL contains
  # `/contributor/`, filtering out other `a.person` links on the page.
  reviewer_links = page.css("a.person").to_a.select{|a| a.attr('href').match(/\/contributor\//)}
  book_author_links = page.css("a.author").to_a
  #create the records for the spreadsheet for authors...
  # (uniq by link text so an author credited twice on one page yields one row)
  book_author_records = book_author_links.uniq{|a| a.text}.map do |book_author_link_el|
    # `resolve_url` is a private Upton helper; `send` bypasses visibility to
    # absolutize the possibly-relative href against the page's URL.
    book_author_link = r.send(:resolve_url, book_author_link_el.attr("href") , instance_url)
    book_author_name = book_author_link_el.text
    gender_from_bio_pronouns = code_for_gender( book_author_link, book_author_name)
    # One CSV row: title, page title, date, name, gender, spot-check, role, bio, source URL.
    [review_title,
     page_title,
     date,
     book_author_name,
     gender_from_bio_pronouns,
     does_gender_match?(gender_from_bio_pronouns, book_author_name),
     "author",
     bio_text(book_author_link),
     instance_url
    ]
  end
  # and for the reviewers (same column layout, role "reviewer")
  reviewer_records = reviewer_links.uniq{|a| a.text}.map do | reviewer_link_el |
    reviewer_link = r.send(:resolve_url, reviewer_link_el .attr('href'), instance_url)
    reviewer_name = reviewer_link_el.text
    gender_from_bio_pronouns = code_for_gender( reviewer_link, reviewer_name )
    [review_title,
     page_title,
     date,
     reviewer_name,
     gender_from_bio_pronouns,
     does_gender_match?(gender_from_bio_pronouns, reviewer_name),
     "reviewer",
     bio_text(reviewer_link),
     instance_url
    ]
  end
  #pass them out of the block; Upton handles writing them to CSV.
  reviewer_records + book_author_records
end
# Scrape LARB's index page that lists all of its essays.
# `Scraper#new`'s first argument is the first page of the (paginated) index.
# The second argument is the CSS (or XPath) selector that specifies each individual link.
r = Upton::Scraper.new("http://lareviewofbooks.org/essays/?sort_by=date&pager=1", "h5.entry-title a")
r.index_debug = true
r.verbose = GLOBAL_VERBOSE
# This index is paginated, with 24 per page. Upton will traverse the index for you.
r.paginated = true
r.pagination_param = "pager"
r.pagination_max_pages = 35
# the scrape_to_csv method handles writing the records you return to a CSV with
# the given filename
r.scrape_to_csv "essays.csv" do |instance_html, instance_url, instance_index|
  page = Nokogiri::HTML(instance_html)
  #for each page, find the date and titles using the given CSS selectors.
  page_title = page.css("div#book_review .article_title")[0].text.strip
  # Strip ordinal suffixes ("8th" -> "8") so DateTime.strptime can parse.
  clean_date = page.css("abbr.published").text.gsub(/(\d)(th|st|nd|rd)/, "\\1")
  date = DateTime.strptime(clean_date, "%B %d, %Y").strftime("%F")
  #we only want contributor pages, not author pages. "authors" in this sense can't write essays.
  # use CSS to get the links to bios for each contributor. On some older pages, contributors have
  # the CSS class of `author` (but their URLs contain `contributor`)
  essayist_links = page.css("a.person", "a.author").to_a.select{|a| a.attr('href').match(/\/contributor\//)}
  # This assignment is also the block's final expression, so `essayist_records`
  # is the value scrape_to_csv writes to the CSV.
  essayist_records = essayist_links.uniq{|a| a.text}.map do | essayist_link_el |
    # `resolve_url` is a private Upton helper; `send` bypasses visibility to
    # absolutize the possibly-relative href against the page's URL.
    essayist_link = r.send(:resolve_url, essayist_link_el.attr('href'), instance_url)
    essayist_name = essayist_link_el.text
    gender_from_bio_pronouns = code_for_gender( essayist_link, essayist_name )
    # First column is blank: essays have no review title, which keeps the
    # columns aligned with reviews.csv.
    ["", page_title,
     date,
     essayist_name,
     gender_from_bio_pronouns,
     does_gender_match?(gender_from_bio_pronouns, essayist_name),
     "essayist",
     bio_text(essayist_link),
     instance_url]
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment