Skip to content

Instantly share code, notes, and snippets.

@teaforthecat
Last active December 13, 2015 17:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save teaforthecat/4945631 to your computer and use it in GitHub Desktop.
Save teaforthecat/4945631 to your computer and use it in GitHub Desktop.
Custom HTML parser. This was used to convert HTML files exported from an older Adobe InDesign file into Markdown. The HTML was inconsistent and varied, but this handled 90 percent of the files. The remaining cases were edited by hand in a form with a Markdown-enabled text area.
# Converts one exported HTML catalog entry into Markdown whose footnotes
# follow Pandoc's syntax ("[^1] " references, "[^1]: text" definitions).
# The class methods link_to_fn_replace and link_to_text_replace turn that
# footnote syntax into custom HTML.
#
# Parsing relies on Nokogiri, which must be loaded by the caller.
class CatalogEntry
  # Markdown footnote reference; filled with the footnote number.
  LINK_TO_FN = '[^%d] '.freeze
  # Markdown footnote definition; filled with [number, text].
  LINK_TO_TEXT = '[^%d]: %s'.freeze
  CITATION = "%{last}, %{first}%{other}. \"%{title}.\" In _Bits & Pieces Put Together to Present a Semblance of a Whole: Walker Art Center Collections_, edited by Joan Rothfuss and Elizabeth Carpenter. Minneapolis, MN: Walker Art Center, 2005.".freeze
  FILE_TEMPLATE = 'B&Pcat-%{last},%{first}_2005'.freeze
  # Captures (last, first) from a source file name shaped like FILE_TEMPLATE.
  FILE_NAME_REGEX = /B&Pcat-(.+),(.+)_2005.*/
  # Separator placed between rendered paragraphs and between footnotes.
  P = "\n \n".freeze

  # Maps the initials found in the source documents to full author names.
  AuthorMap = {
    'A.B.' => 'Andrew Blauvelt',
    'D.B.' => 'Doug Benidt',
    'D.C.' => 'Doryun Chong',
    'D.F.' => 'Douglas Fogle',
    'D.K.' => 'Diana Kim',
    'D.O.' => 'Dean Otto',
    'D.S.' => 'Daniel Smith',
    'E.C.' => 'Elizabeth Carpenter',
    'J.C.P.' => 'Jennifer Case Phelps',
    'J.R.' => 'Joan Rothfuss',
    'J.V.' => 'Jill Vetter',
    'L.D.' => 'Lynn Dierks',
    'M.A.' => 'Max Andrews',
    'O.I.' => 'Olukemi Ilesanmi',
    'P.B.' => 'Philip Bither',
    'P.V.' => 'Philippe Vergne',
    'R.F.' => 'Richard Flood',
    'R.Furtak' => 'Rosemary Furtak',
    'S.E.' => 'Siri Engberg',
    'S.M.' => 'Sheryl Mously'
  }.freeze

  plain_text = lambda { |el| el.text }

  # Renderers keyed by the CSS class of a paragraph's child node. Any class
  # without an entry (including a missing class attribute) falls back to the
  # node's plain text. The default must be the renderer itself: a lambda that
  # returns a lambda would leak a Proc into the joined output.
  PROCESSER = Hash.new(plain_text).update(
    'body-copy' => plain_text,
    'notes' => plain_text,
    'sidebar-normal' => plain_text,
    'body-copy-italic' => lambda { |el|
      text = el.text
      # Wrapping blank text would emit a bare "__"; pass it through instead.
      next text if text.strip.empty?

      # Keep leading/trailing whitespace outside the emphasis markers.
      text.sub(/\A(\s*)/) { "#{Regexp.last_match(1)}_" }
          .sub(/(\s*)\Z/) { "_#{Regexp.last_match(1)}" }
    },
    # Unfortunate hack: HTML inside the footnote text (HTML within HTML).
    'notes-italic' => lambda { |el| "<em>#{el.text}</em>" },
    'body-copy-footnotes' => lambda { |el| LINK_TO_FN % el.text },
    'author-s-name' => lambda { |_el| raise "BODY END" }
  ).freeze

  attr_accessor :source, :source_file_name, :sidebar, :author_string
  # The readers for these two are defined explicitly further down.
  attr_writer :artist_name, :title

  # Replaces Pandoc-style footnote references ("[^3] ") in +html+ with
  # superscript anchor links. Element id = "fnref<prefix>-<n>", link target =
  # "#fn<prefix>-<n>". +prefix+ may be any object; it is interpolated.
  def self.link_to_fn_replace(html, prefix)
    html.gsub(/\[\^(\d+)\](\s+)/) do
      number = Regexp.last_match(1)
      spacing = Regexp.last_match(2)
      href = "#fn#{prefix}-#{number}"
      id = "fnref#{prefix}-#{number}"
      "<sup><a href=\"#{href}\" id=\"#{id}\" class=\"footnoteRef\" rel=\"footnote\">#{number}</a></sup>#{spacing}"
    end
  end

  # Replaces Pandoc-style footnote definitions ("[^3]: text"), one per line,
  # with <li> items that link back to the reference. Element id =
  # "fn<prefix>-<n>", link target = "#fnref<prefix>-<n>". Lines are split on
  # "\n" and re-joined with "\n".
  def self.link_to_text_replace(html, prefix)
    converted = html.split("\n").map do |line|
      line.sub(/(\s)*\[\^(\d+)\]\: (.*)/) do
        number = Regexp.last_match(2)
        text = Regexp.last_match(3)
        href = "#fnref#{prefix}-#{number}"
        id = "fn#{prefix}-#{number}"
        "<li id=\"#{id}\"><p>#{text} <a href=\"#{href}\" class=\"jump\" rev=\"footnote\">&#8617;</a></p></li>"
      end
    end
    converted.join("\n")
  end

  # opts[:path]   - path of the exported file; without it nothing is set up.
  # opts[:format] - file extension including the dot; defaults to the
  #                 extension of opts[:path]. Only '.html' files are parsed.
  def initialize(opts = {})
    path = opts[:path]
    return unless path

    self.sidebar = false
    format = opts[:format] || File.extname(path)
    @source_file_name = File.basename(path).chomp(format)
    return unless format == '.html'

    # Block form closes the file handle once Nokogiri has parsed it.
    @source = File.open(path) { |file| Nokogiri::HTML(file) }
  end

  # Returns the rendered body paragraphs followed by the footnote definitions
  # wrapped in a <div class="footnotes"><ol> container.
  # Raises RuntimeError ('Invalid Content') when the source is not valid.
  def body
    [
      process_body,
      "\n<div class=\"footnotes\">\n<ol>\n",
      process_notes,
      "\n</ol>\n</div>\n"
    ].join
  end

  # Returns the author name(s), memoized in author_string. Uses the first
  # span.author-s-name when there is one; otherwise the last
  # span.sidebar-normal, accepted only if it is a key or value of AuthorMap.
  # Returns nil when neither yields a name.
  def author
    self.author_string ||= begin
      byline = node_text(@source.css('span.author-s-name').first)
      if byline
        process_author(byline)
      else
        initials = node_text(@source.css('span.sidebar-normal').last)
        if AuthorMap.key?(initials)
          AuthorMap[initials]
        elsif AuthorMap.value?(initials)
          initials
        end
      end
    end
  end

  # Splits a byline on " with " / " and ", expands initials through AuthorMap
  # and returns the names joined by ", ". Names missing from AuthorMap are
  # kept as written, minus surrounding whitespace.
  def process_author(text)
    names = text.split(/ with | and /).map do |raw|
      name = raw.strip
      AuthorMap.fetch(name, name)
    end
    names.join(', ')
  end

  # Returns the footnote definitions in Markdown, separated by P.
  def notes
    process_notes
  end

  # For a sidebar entry: the text of the first span.sidebar-normal, which may
  # or may not exist (nil if absent). Otherwise the artist name.
  def title
    return artist_name unless sidebar?

    node_text(@source.css('span.sidebar-normal').first)
  end

  # True when the document has a span.sidebar-normal or the source file name
  # contains "_ADD".
  def sidebar?
    !@source.css('span.sidebar-normal').empty? ||
      !(/_ADD/ =~ @source_file_name).nil?
  end

  # Returns the citation built from CITATION, or nil when there is no author.
  # The first author is written "Last, First"; every further author is
  # appended as written.
  def citation
    names = author
    return if names.nil? || names !~ /[^[:space:]]/

    lead, *others = names.split(',').map(&:strip)
    others = others.reject(&:empty?)
    words = lead.split(' ')
    last = words.pop
    # Drop a trailing "." of a middle initial to avoid ".." in the output.
    first_middle = words.join(' ').chomp('.')
    other = others.empty? ? '' : ", #{others.join(', ')}"
    CITATION % { last: last, first: first_middle, title: title, other: other }
  end

  # Returns "First Last" derived from the source file name (memoized).
  def artist_name
    @artist_name ||= process_artist_name
  end

  # Truthy when a source was parsed and its validate call returns a truthy
  # value.
  # NOTE(review): Nokogiri's Document#validate is documented as returning the
  # list of DTD errors (nil when there is no DTD), so a truthy result may not
  # mean "no errors" - confirm the intent before relying on this.
  def valid?
    @source && @source.validate
  end

  private

  # Text of +node+, or nil when there is no node.
  def node_text(node)
    node && node.text
  end

  # Renders every <p> that contains a .notes element, except the "Notes"
  # heading, as a Markdown footnote definition. Paragraphs whose text does not
  # look like "<number>. <text>" are skipped.
  def process_notes
    definitions = @source.css('p').map do |el|
      next if el.css('.notes').empty?
      next if el.text == "Notes"

      number, text = extract_footnote(process(el))
      format_footnote(number, text) if number
    end
    definitions.compact.join(P)
  end

  # Builds "First Last" from the captures of FILE_NAME_REGEX. When the file
  # name does not match, both captures are nil and the result is " ".
  def process_artist_name
    _match, last, first = *FILE_NAME_REGEX.match(source_file_name)
    [first, last].join(' ')
  end

  # Renders the '.story p.body-copy' paragraphs, skipping those that contain
  # an author byline or notes. For a sidebar entry the first paragraph is
  # removed from the body and stored through the title writer.
  # Raises RuntimeError ('Invalid Content') unless valid?.
  def process_body
    raise 'Invalid Content' unless valid?

    story = @source.css('.story p.body-copy')
    if sidebar?
      self.sidebar = true
      self.title = story.shift
    end
    paragraphs = story.map do |el|
      next unless el.css('.author-s-name').empty? && el.css('.notes').empty?

      process(el)
    end
    paragraphs.compact.join(P)
  end

  # Renders each child of +el+ with the PROCESSER entry for its class
  # attribute and concatenates the results.
  def process(el)
    el.children.map { |child| PROCESSER[child[:class]].call(child) }.join('')
  end

  # Splits "12. Some text" into ["12", "Some text"]; nil when +note_text+ has
  # no "<digits>. " part.
  def extract_footnote(note_text)
    match = /(\d+)\. (.*)/.match(note_text)
    [match[1], match[2]] if match
  end

  # Formats a footnote definition. The number is converted with to_i because
  # "%d" applied to a digit string treats a leading zero as an octal prefix
  # ("08" raises ArgumentError, "010" becomes 8).
  def format_footnote(number, text)
    LINK_TO_TEXT % [number.to_i, text]
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment