Skip to content

Instantly share code, notes, and snippets.

@kueda
Last active July 31, 2022 08:32
Show Gist options
  • Save kueda/0d84165aba157563cade to your computer and use it in GitHub Desktop.
Save kueda/0d84165aba157563cade to your computer and use it in GitHub Desktop.
Script to parse Esslinger's A Cumulative Checklist for the Lichen-forming, Lichenicolous and Allied Fungi of the Continental United States and Canada into machine-readable CSV
#encoding: utf-8
#
# Script to parse Esslinger's A Cumulative Checklist for the Lichen-forming,
# Lichenicolous and Allied Fungi of the Continental United States and Canada
# into machine-readable CSV.
#
# Esslinger's checklist (e.g.
# http://www.ndsu.edu/pubweb/~esslinge/chcklst/chcklst7.htm) is considered
# authoritative for North American lichens, but it's authored with MS Word and
# has inconsistent formatting. This script attempts to smooth that out and
# makes CSV suitable for machine processing.
#
# Usage:
#
# ruby esslinger.rb "https://www.ndsu.edu/pubweb/~esslinge/chcklst/chcklst7.htm"
#
require 'rubygems'
require 'open-uri'
require 'biodiversity'
require 'nokogiri'
require 'csv'
# Recursively flattens an HTML node into a single string of lightly
# marked-up text.
#
# * a node whose style attribute mentions mso-spacerun becomes one space
# * a node with children becomes the concatenation of its parsed children,
#   wrapped in <b>...</b> for strong/b elements and followed by <br> for p
# * a childless br becomes "<br>", a childless a becomes ""
# * any other childless node becomes its inner text with runs of
#   whitespace collapsed to a single space
#
# node - an object responding to [], children, name and inner_text
#        (a Nokogiri node when called from this script).
#
# Returns a String.
def parse(node)
  # these sometimes contain space chars with weird encodings
  return " " if node['style'] =~ /mso-spacerun/

  kids = node.children
  if kids.size > 0
    inner = kids.map { |kid| parse(kid) }.join('')
    inner = "<b>#{inner}</b>" if %w(strong b).include?(node.name.downcase)
    inner += "<br>" if node.name == 'p'
    return inner
  end

  case node.name
  when 'br' then "<br>"
  when 'a' then ""
  else node.inner_text.gsub(/\s+/, ' ')
  end
end
# Wall-clock start, used for the elapsed-time summary at the end of the run.
start = Time.now

# The checklist location is the first command-line argument.
url = ARGV[0]
# Exit with a usage hint rather than an opaque error from open when no
# argument was given.
abort "Usage: ruby #{File.basename($PROGRAM_NAME)} <checklist-url>" unless url

# Prefer URI.open: fetching URLs through bare Kernel#open was deprecated in
# Ruby 2.7 and removed in 3.0. Rubies older than 2.5 have no public URI.open,
# so fall back to Kernel#open there.
page = URI.respond_to?(:open) ? URI.open(url) : open(url)
html = Nokogiri.HTML(page, nil, "UTF-8")

# Flatten every <p> element with parse, discard the ones that come out
# blank, and glue the rest into one string that is later split on "<br>".
paragraphs = html.search('p')
text = paragraphs.map { |paragraph|
  parsed = parse(paragraph)
  parsed =~ /^\s*$/ ? nil : parsed
}.compact.join("")
# Scientific name parser (presumably provided by the biodiversity gem
# required above); one instance is shared by every line.
parser = ScientificNameParser.new

# Output tables. Each starts with the header row of its CSV file.
synonyms = [["synonym", "verbatim", "current", "current_verbatim"]]
names = [["name", "canonical", "verbatim", "lichenization"]]

# Lines that could not be processed; listed at the end of the run.
failures = []

# Genus heading currently in effect; nil until the first heading is seen.
genus = nil

# Phrases that introduce free-text annotations. Further down they are joined
# into a case-insensitive regexp and each name string is cut at the first
# match. The order is kept as-is because it is the alternation order.
stop_words = [
  "record", "report", "in eastern", "apparently",
  "many old", "this name", "treated", "probably",
  "excluded", "north america", "misidentification", "not known",
  "type not", "identity uncertain", "but not", "erroneously listed",
  "a european", "may not", "identity not"
]

# Whether a synonym line has been seen since the last genus heading or name.
previous_was_synonym = false
# Compile the stop-word pattern once instead of rebuilding it several times
# for every line; stop_words does not change while the loop runs.
stop_word_pattern = /#{stop_words.join('|')}/i

# Walk the flattened checklist one "<br>"-delimited line at a time. Each line
# is classified as one of:
#   * a genus heading (starts with 4+ capital letters) -> updates genus
#   * a synonym (contains "=")                         -> row in synonyms
#   * a current name (contains bold markup)            -> row in names
# Anything else is skipped. Lines that cannot be handled go to failures.
text.split("<br>").each do |line|
  next if line.strip.size == 0
  # The following means we're done with names and on to the citations
  break if line =~ /appendix.*specimen citations/i
  # Drop a trailing "Syn.:" / "Syns.:" list and normalize whitespace.
  line = line.sub(/Syns?\.?\:.+$/, '').gsub(/\s+/, ' ').strip
  puts
  puts line.inspect

  # genus is all caps with at least 4 letters
  if (new_genus = line[/^(<b>)?([A-Z]{4,})/, 2])
    genus = new_genus.capitalize
    puts "\tnew genus: #{genus}"
    previous_was_synonym = false
    next
  end

  unless genus
    failures << line
    puts "\tNo genus, skipping..."
    next
  end

  # Try to determine if this was a current name based on bolding. Note that this
  # is imperfect. Esslinger's list contains maddening things like <strong><span
  # style="color:blue;font-weight:normal;mso-bidi-font-
  # weight:bold">pinguis</span></strong> which is intended to be a synonym
  is_current_name = line =~ /<b>.+?<\/b>/ ? true : false
  line = line.gsub(/<\/?b>/, "")

  # A leading marker character selects the lichenization value; the marker
  # itself is removed from the line. No marker means "lichen".
  lichenization = if (new_line = line[/^\*(.+)/, 1])
    line = new_line
    "lichenicolous"
  elsif (new_line = line[/^\+(.+)/, 1])
    line = new_line
    "saprophyte"
  elsif (new_line = line[/^\#(.+)/, 1])
    line = new_line
    "uncertain"
  else
    "lichen"
  end

  # Synonym line: "<old epithet ...> = <current name ...>"
  if line =~ /=/
    old_name, current_verbatim = line.split('=')
    # Nothing after the "=" leaves the current name unknown.
    current_verbatim ||= "UNPARSED"
    puts "\tSynonym: was #{genus} #{old_name}, now #{current_verbatim}"

    # Prefix the genus and cut both sides at the first annotation phrase.
    old_name = "#{genus} #{old_name}".split(stop_word_pattern)[0]
    parsed_synonym = begin
      parser.parse(old_name)
    rescue NoMethodError
      nil
    end
    current_name = current_verbatim.split(stop_word_pattern)[0]
    parsed_current = begin
      parser.parse(current_name)
    rescue NoMethodError
      nil
    end

    # Fall back to the raw old name when the parser could not handle it.
    synonym = if parsed_synonym && parsed_synonym[:scientificName] && parsed_synonym[:scientificName][:parsed]
      parsed_synonym[:scientificName][:canonical]
    else
      old_name
    end
    current = if parsed_current && parsed_current[:scientificName] && parsed_current[:scientificName][:parsed]
      canonical = parsed_current[:scientificName][:canonical]
      # Expand an abbreviated genus ("A.") using the current genus heading.
      canonical.sub(/^#{genus[0]}\./, genus)
    else
      "UNPARSED"
    end
    synonyms << [synonym, old_name, current, current_verbatim]
    previous_was_synonym = true
    next
  end

  # NOTE(review): once a synonym sets previous_was_synonym, only a new genus
  # heading clears it, so every non-synonym line after a synonym within the
  # same genus is skipped here (and the reset at the bottom of the loop is
  # only reached when the flag is already false). Behavior is kept as-is;
  # confirm whether skipping just the next line was intended.
  next if previous_was_synonym
  next unless is_current_name

  # Esslinger often adds extra annotations after the authority that screw
  # things up, so this is a lame way to deal with them
  line = line.split(stop_word_pattern)[0]
  name = "#{genus} #{line}"
  puts "\tName: #{name}"
  begin
    parsed_name = parser.parse(name)
    scientific_name = parsed_name && parsed_name[:scientificName]
    if scientific_name && scientific_name[:parsed]
      puts "\tCanonical: #{scientific_name[:canonical]}"
      if scientific_name[:canonical].strip == genus
        puts "\tSpecies was blank, skipping..."
        failures << line
        next
      end
      names << [scientific_name[:normalized], scientific_name[:canonical], name, lichenization]
    else
      # The parser returned a result but flagged it as not parsed. Record it
      # as a failure so the name is not dropped silently.
      failures << line
      puts "\tFailed to parse scientific name, skipping..."
    end
  rescue NoMethodError
    failures << line
    puts "\tFailed to parse scientific name, skipping..."
  end
  previous_was_synonym = false
end
# Output file names are derived from the last path segment of the source URL.
synonyms_filename = "esslinger.#{File.basename(url)}.synonyms.csv"
names_filename = "esslinger.#{File.basename(url)}.names.csv"

# Write each table (header row first) to its own CSV file. Cells are coerced
# with to_s before stripping so a stray nil cannot abort the write after all
# the parsing work is done.
[[synonyms_filename, synonyms], [names_filename, names]].each do |filename, rows|
  CSV.open(filename, 'w') do |csv|
    rows.each do |row|
      csv << row.map { |cell| cell.to_s.strip }
    end
  end
end

puts
puts "#{failures.size} failed lines:"
failures.each { |line| puts "\t#{line}" }
puts
# The first row of each table is its CSV header, so leave it out of the counts.
name_count = names.size - 1
synonym_count = synonyms.size - 1
puts "Parsed #{name_count} names, #{synonym_count} synonyms, #{failures.size} failures in #{Time.now - start} s"
puts "Names written to #{names_filename}"
puts "Synonyms written to #{synonyms_filename}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment