kueda/esslinger.rb

## esslinger.rb
#encoding: utf-8
#
# Script to parse Esslinger's A Cumulative Checklist for the Lichen-forming,
# Lichenicolous and Allied Fungi of the Continental United States and Canada
# into machine-readable CSV.
#
# Esslinger's checklist (e.g.
# http://www.ndsu.edu/pubweb/~esslinge/chcklst/chcklst7.htm) is considered
# authoritative for North American lichens, but it's authored with MS Word and
# has inconsistent formatting. This script attempts to smooth that out and
# makes CSV suitable for machine processing.
#
# Usage:
#
#   ruby esslinger.rb "https://www.ndsu.edu/pubweb/~esslinge/chcklst/chcklst7.htm"
#

require 'rubygems'
require 'open-uri'
require 'biodiversity'
require 'nokogiri'
require 'csv'

# Flattens an HTML node into one string, keeping only the markup the rest of
# this script looks for: <b>...</b> around bold runs and <br> as a line
# separator.
#
# node - any object responding to [], children, name and inner_text (the
#        script passes Nokogiri nodes).
#
# Returns a String.
def parse(node)
  # these sometimes contain space chars with weird encodings, so the whole
  # node collapses to one plain space
  return " " if node['style'] =~ /mso-spacerun/

  kids = node.children
  if kids.size > 0
    # Container: concatenate the children, wrap bold elements in <b>, and end
    # paragraphs with a <br>.
    inner = kids.map { |kid| parse(kid) }.join('')
    inner = "<b>#{inner}</b>" if %w(strong b).include?(node.name.downcase)
    return node.name == 'p' ? inner + "<br>" : inner
  end

  # Leaf: line breaks are kept, childless anchors vanish, and anything else
  # contributes its text with whitespace runs squeezed to a single space.
  case node.name
  when 'br' then "<br>"
  when 'a' then ""
  else node.inner_text.gsub(/\s+/, ' ')
  end
end

# --- Fetch the checklist and flatten it into one string ---------------------
start = Time.now
url = ARGV[0]
# Without a URL there is nothing to fetch; say so instead of failing inside
# the open call on nil.
abort "Usage: ruby esslinger.rb URL" unless url
# URI.open is open-uri's explicit entry point; plain Kernel#open no longer
# fetches URLs as of Ruby 3.0. The block form closes the handle as soon as
# Nokogiri has read the document.
html = URI.open(url) { |io| Nokogiri.HTML(io, nil, "UTF-8") }
paragraphs = html.search('p')
# Run every paragraph through parse, drop the blank ones, and glue the rest
# together; parse marks line boundaries with <br>.
text = paragraphs.map { |para| parse(para) }.reject { |parsed| parsed =~ /^\s*$/ }.join("")
parser = ScientificNameParser.new
# Output tables. The first row of each is the CSV header.
synonyms = [%w(synonym verbatim current current_verbatim)]
names = [%w(name canonical verbatim lichenization)]
# Lines that could not be turned into a name; listed at the end of the run.
failures = []
# Genus currently in effect; assigned when a genus heading line is seen.
genus = nil
# Phrases that begin one of Esslinger's free-text annotations. A name is cut
# off at the first of these (case-insensitively) before it is parsed.
stop_words = [
  "record",
  "report",
  "in eastern",
  "apparently",
  "many old",
  "this name",
  "treated",
  "probably",
  "excluded",
  "north america",
  "misidentification",
  "not known",
  "type not",
  "identity uncertain",
  "but not",
  "erroneously listed",
  "a european",
  "may not",
  "identity not"
]
# --- Walk the flattened text line by line ------------------------------------
# Set once a synonym ("=") line has been seen under the current genus.
previous_was_synonym = false
# Built once instead of on every line: names are split on this and only the
# part before the first stop word is kept.
stop_word_pattern = /#{stop_words.join('|')}/i
# Parses +str+ and returns the parser's :scientificName hash when the parser
# flags the name as parsed, otherwise nil. A NoMethodError raised while
# parsing is treated as a failed parse.
parse_name = lambda do |str|
  begin
    result = parser.parse(str)
    details = result && result[:scientificName]
    details if details && details[:parsed]
  rescue NoMethodError
    nil
  end
end

text.split("<br>").each do |line|
  next if line.strip.empty?
  # The following means we're done with names and on to the citations
  break if line =~ /appendix.*specimen citations/i
  # Drop a trailing "Syn.:" / "Syns.:" list and normalise whitespace.
  line = line.sub(/Syns?\.?\:.+$/, '').gsub(/\s+/, ' ').strip
  puts
  puts line.inspect
  # genus is all caps with at least 4 letters
  if (new_genus = line[/^(<b>)?([A-Z]{4,})/, 2])
    genus = new_genus.capitalize
    puts "\tnew genus: #{genus}"
    previous_was_synonym = false
    next
  end
  unless genus
    failures << line
    puts "\tNo genus, skipping..."
    next
  end

  # Try to determine if this was a current name based on bolding. Note that this
  # is imperfect. Esslinger's list contains maddening things like <strong><span
  # style="color:blue;font-weight:normal;mso-bidi-font-
  # weight:bold">pinguis</span></strong> which is intended to be a synonym
  is_current_name = !!(line =~ /<b>.+?<\/b>/)
  line = line.gsub(/<\/?b>/, "")

  # A leading *, + or # is stripped from the line and recorded as the
  # lichenization; unmarked lines are plain lichens.
  lichenization = if (new_line = line[/^\*(.+)/, 1])
    line = new_line
    "lichenicolous"
  elsif (new_line = line[/^\+(.+)/, 1])
    line = new_line
    "saprophyte"
  elsif (new_line = line[/^\#(.+)/, 1])
    line = new_line
    "uncertain"
  else
    "lichen"
  end

  # "old = current" lines are synonyms. The old epithet is prefixed with the
  # genus in effect; the right-hand side is kept verbatim as well as parsed.
  if line.include?('=')
    old_name, current_verbatim = line.split('=')
    current_verbatim ||= "UNPARSED"
    puts "\tSynonym: was #{genus} #{old_name}, now #{current_verbatim}"
    old_name = "#{genus} #{old_name}".split(stop_word_pattern)[0]
    synonym_details = parse_name.call(old_name)
    current_details = parse_name.call(current_verbatim.split(stop_word_pattern)[0])
    # Fall back to the unparsed text when the old name cannot be parsed.
    synonym = synonym_details ? synonym_details[:canonical] : old_name
    current = if current_details
      # Expand an abbreviated genus ("C. foo") using the genus in effect.
      current_details[:canonical].sub(/^#{genus[0]}\./, genus)
    else
      "UNPARSED"
    end
    synonyms << [synonym, old_name, current, current_verbatim]
    previous_was_synonym = true
    next
  end
  # NOTE(review): this flag is only cleared by a genus heading, so every
  # non-synonym line after the first synonym of a genus is skipped here and
  # the reset at the bottom of the loop never changes anything. Confirm that
  # this is intended before relying on the names output.
  next if previous_was_synonym
  next unless is_current_name
  # Esslinger often adds extra annotations after the authority that screw
  # things up, so this is a lame way to deal with them
  line = line.split(stop_word_pattern)[0]
  name = "#{genus} #{line}"
  puts "\tName: #{name}"
  details = parse_name.call(name)
  canonical = details && details[:canonical]
  if canonical.nil?
    # Record names the parser could not handle instead of dropping them
    # silently, so they show up in the failure report.
    failures << line
    puts "\tFailed to parse scientific name, skipping..."
  else
    puts "\tCanonical: #{canonical}"
    if canonical.strip == genus
      puts "\tSpecies was blank, skipping..."
      failures << line
      next
    end
    names << [details[:normalized], canonical, name, lichenization]
  end
  previous_was_synonym = false
end

# --- Write both tables to CSV and print a summary ----------------------------
# Each file is named after the last path segment of the source URL.
synonyms_filename = "esslinger.#{File.basename(url)}.synonyms.csv"
names_filename = "esslinger.#{File.basename(url)}.names.csv"
{ synonyms_filename => synonyms, names_filename => names }.each do |filename, rows|
  CSV.open(filename, 'w') do |csv|
    # to_s keeps a nil cell (e.g. a field the parser did not supply) from
    # raising on strip and aborting the write part-way through.
    rows.each { |row| csv << row.map { |cell| cell.to_s.strip } }
  end
end

puts
puts "#{failures.size} failed lines:"
failures.each { |line| puts "\t#{line}" }
puts
# Both tables start with a header row, which is not a parsed record.
puts "Parsed #{names.size - 1} names, #{synonyms.size - 1} synonyms, #{failures.size} failures in #{Time.now - start} s"
puts "Names written to #{names_filename}"
puts "Synonyms written to #{synonyms_filename}"
	#encoding: utf-8
	#
	# Script to parse Esslinger's A Cumulative Checklist for the Lichen-forming,
	# Lichenicolous and Allied Fungi of the Continental United States and Canada
	# into machine-readable CSV.
	#
	# Esslinger's checklist (e.g.
	# http://www.ndsu.edu/pubweb/~esslinge/chcklst/chcklst7.htm) is considered
	# authoritative for North American lichens, but it's authored with MS Word and
	# has inconsistent formatting. This script attempts to smooth that out and
	# makes CSV suitable for machine processing.
	#
	# Usage:
	#
	# ruby esslinger.rb "https://www.ndsu.edu/pubweb/~esslinge/chcklst/chcklst7.htm"
	#

	require 'rubygems'
	require 'open-uri'
	require 'biodiversity'
	require 'nokogiri'
	require 'csv'

	# Flattens an HTML node into one string, keeping only the markup the rest of
	# this script looks for: <b>...</b> around bold runs and <br> as a line
	# separator.
	#
	# node - any object responding to [], children, name and inner_text (the
	#        script passes Nokogiri nodes).
	#
	# Returns a String.
	def parse(node)
	  # these sometimes contain space chars with weird encodings, so the whole
	  # node collapses to one plain space
	  return " " if node['style'] =~ /mso-spacerun/

	  kids = node.children
	  if kids.size > 0
	    # Container: concatenate the children, wrap bold elements in <b>, and end
	    # paragraphs with a <br>.
	    inner = kids.map { |kid| parse(kid) }.join('')
	    inner = "<b>#{inner}</b>" if %w(strong b).include?(node.name.downcase)
	    return node.name == 'p' ? inner + "<br>" : inner
	  end

	  # Leaf: line breaks are kept, childless anchors vanish, and anything else
	  # contributes its text with whitespace runs squeezed to a single space.
	  case node.name
	  when 'br' then "<br>"
	  when 'a' then ""
	  else node.inner_text.gsub(/\s+/, ' ')
	  end
	end

	# --- Fetch the checklist and flatten it into one string ---------------------
	start = Time.now
	url = ARGV[0]
	# Without a URL there is nothing to fetch; say so instead of failing inside
	# the open call on nil.
	abort "Usage: ruby esslinger.rb URL" unless url
	# URI.open is open-uri's explicit entry point; plain Kernel#open no longer
	# fetches URLs as of Ruby 3.0. The block form closes the handle as soon as
	# Nokogiri has read the document.
	html = URI.open(url) { |io| Nokogiri.HTML(io, nil, "UTF-8") }
	paragraphs = html.search('p')
	# Run every paragraph through parse, drop the blank ones, and glue the rest
	# together; parse marks line boundaries with <br>.
	text = paragraphs.map { |para| parse(para) }.reject { |parsed| parsed =~ /^\s*$/ }.join("")
	parser = ScientificNameParser.new
	# Output tables. The first row of each is the CSV header.
	synonyms = [%w(synonym verbatim current current_verbatim)]
	names = [%w(name canonical verbatim lichenization)]
	# Lines that could not be turned into a name; listed at the end of the run.
	failures = []
	# Genus currently in effect; assigned when a genus heading line is seen.
	genus = nil
	# Phrases that begin one of Esslinger's free-text annotations. A name is cut
	# off at the first of these (case-insensitively) before it is parsed.
	stop_words = [
	  "record",
	  "report",
	  "in eastern",
	  "apparently",
	  "many old",
	  "this name",
	  "treated",
	  "probably",
	  "excluded",
	  "north america",
	  "misidentification",
	  "not known",
	  "type not",
	  "identity uncertain",
	  "but not",
	  "erroneously listed",
	  "a european",
	  "may not",
	  "identity not"
	]
	# --- Walk the flattened text line by line ------------------------------------
	# Set once a synonym ("=") line has been seen under the current genus.
	previous_was_synonym = false
	# Built once instead of on every line: names are split on this and only the
	# part before the first stop word is kept.
	stop_word_pattern = /#{stop_words.join('|')}/i
	# Parses +str+ and returns the parser's :scientificName hash when the parser
	# flags the name as parsed, otherwise nil. A NoMethodError raised while
	# parsing is treated as a failed parse.
	parse_name = lambda do |str|
	  begin
	    result = parser.parse(str)
	    details = result && result[:scientificName]
	    details if details && details[:parsed]
	  rescue NoMethodError
	    nil
	  end
	end

	text.split("<br>").each do |line|
	  next if line.strip.empty?
	  # The following means we're done with names and on to the citations
	  break if line =~ /appendix.*specimen citations/i
	  # Drop a trailing "Syn.:" / "Syns.:" list and normalise whitespace.
	  line = line.sub(/Syns?\.?\:.+$/, '').gsub(/\s+/, ' ').strip
	  puts
	  puts line.inspect
	  # genus is all caps with at least 4 letters
	  if (new_genus = line[/^(<b>)?([A-Z]{4,})/, 2])
	    genus = new_genus.capitalize
	    puts "\tnew genus: #{genus}"
	    previous_was_synonym = false
	    next
	  end
	  unless genus
	    failures << line
	    puts "\tNo genus, skipping..."
	    next
	  end

	  # Try to determine if this was a current name based on bolding. Note that this
	  # is imperfect. Esslinger's list contains maddening things like <strong><span
	  # style="color:blue;font-weight:normal;mso-bidi-font-
	  # weight:bold">pinguis</span></strong> which is intended to be a synonym
	  is_current_name = !!(line =~ /<b>.+?<\/b>/)
	  line = line.gsub(/<\/?b>/, "")

	  # A leading *, + or # is stripped from the line and recorded as the
	  # lichenization; unmarked lines are plain lichens.
	  lichenization = if (new_line = line[/^\*(.+)/, 1])
	    line = new_line
	    "lichenicolous"
	  elsif (new_line = line[/^\+(.+)/, 1])
	    line = new_line
	    "saprophyte"
	  elsif (new_line = line[/^\#(.+)/, 1])
	    line = new_line
	    "uncertain"
	  else
	    "lichen"
	  end

	  # "old = current" lines are synonyms. The old epithet is prefixed with the
	  # genus in effect; the right-hand side is kept verbatim as well as parsed.
	  if line.include?('=')
	    old_name, current_verbatim = line.split('=')
	    current_verbatim ||= "UNPARSED"
	    puts "\tSynonym: was #{genus} #{old_name}, now #{current_verbatim}"
	    old_name = "#{genus} #{old_name}".split(stop_word_pattern)[0]
	    synonym_details = parse_name.call(old_name)
	    current_details = parse_name.call(current_verbatim.split(stop_word_pattern)[0])
	    # Fall back to the unparsed text when the old name cannot be parsed.
	    synonym = synonym_details ? synonym_details[:canonical] : old_name
	    current = if current_details
	      # Expand an abbreviated genus ("C. foo") using the genus in effect.
	      current_details[:canonical].sub(/^#{genus[0]}\./, genus)
	    else
	      "UNPARSED"
	    end
	    synonyms << [synonym, old_name, current, current_verbatim]
	    previous_was_synonym = true
	    next
	  end
	  # NOTE(review): this flag is only cleared by a genus heading, so every
	  # non-synonym line after the first synonym of a genus is skipped here and
	  # the reset at the bottom of the loop never changes anything. Confirm that
	  # this is intended before relying on the names output.
	  next if previous_was_synonym
	  next unless is_current_name
	  # Esslinger often adds extra annotations after the authority that screw
	  # things up, so this is a lame way to deal with them
	  line = line.split(stop_word_pattern)[0]
	  name = "#{genus} #{line}"
	  puts "\tName: #{name}"
	  details = parse_name.call(name)
	  canonical = details && details[:canonical]
	  if canonical.nil?
	    # Record names the parser could not handle instead of dropping them
	    # silently, so they show up in the failure report.
	    failures << line
	    puts "\tFailed to parse scientific name, skipping..."
	  else
	    puts "\tCanonical: #{canonical}"
	    if canonical.strip == genus
	      puts "\tSpecies was blank, skipping..."
	      failures << line
	      next
	    end
	    names << [details[:normalized], canonical, name, lichenization]
	  end
	  previous_was_synonym = false
	end

	# --- Write both tables to CSV and print a summary ----------------------------
	# Each file is named after the last path segment of the source URL.
	synonyms_filename = "esslinger.#{File.basename(url)}.synonyms.csv"
	names_filename = "esslinger.#{File.basename(url)}.names.csv"
	{ synonyms_filename => synonyms, names_filename => names }.each do |filename, rows|
	  CSV.open(filename, 'w') do |csv|
	    # to_s keeps a nil cell (e.g. a field the parser did not supply) from
	    # raising on strip and aborting the write part-way through.
	    rows.each { |row| csv << row.map { |cell| cell.to_s.strip } }
	  end
	end

	puts
	puts "#{failures.size} failed lines:"
	failures.each { |line| puts "\t#{line}" }
	puts
	# Both tables start with a header row, which is not a parsed record.
	puts "Parsed #{names.size - 1} names, #{synonyms.size - 1} synonyms, #{failures.size} failures in #{Time.now - start} s"
	puts "Names written to #{names_filename}"
	puts "Synonyms written to #{synonyms_filename}"