Last active
December 13, 2015 17:08
-
-
Save teaforthecat/4945631 to your computer and use it in GitHub Desktop.
Custom HTML parser. This was used to convert HTML files, which were exported from an older Adobe InDesign file, into Markdown. The HTML was inconsistent and varied, but this handled 90 percent of the files. The remaining cases were edited by hand in a form with a Markdown-enabled text area.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Renders markdown with footnotes matching Pandoc's syntax,
# but renders html with custom formatting in the class methods
# link_to_text_replace and link_to_fn_replace.
#
# Relies on Nokogiri for parsing and on ActiveSupport's `try` / `present?`;
# both are expected to be loaded by the surrounding application.
class CatalogEntry
  # Inline footnote marker; the trailing space is what link_to_fn_replace keys on.
  LINK_TO_FN = "[^%d] ".freeze
  # Footnote definition line: number, then the note text.
  LINK_TO_TEXT = "[^%d]: %s".freeze
  CITATION = "%{last}, %{first}%{other}. \"%{title}.\" In _Bits & Pieces Put Together to Present a Semblance of a Whole: Walker Art Center Collections_, edited by Joan Rothfuss and Elizabeth Carpenter. Minneapolis, MN: Walker Art Center, 2005.".freeze
  FILE_TEMPLATE = "B&Pcat-%{last},%{first}_2005".freeze
  # Captures (last, first) from a source file name built like FILE_TEMPLATE.
  FILE_NAME_REGEX = /B&Pcat-(.+),(.+)_2005.*/
  # Separator placed between rendered paragraphs and between footnotes.
  P = "\n \n".freeze

  # Maps the initials used in the source documents to full author names.
  AuthorMap = {
    'A.B.' => 'Andrew Blauvelt',
    'D.B.' => 'Doug Benidt',
    'D.C.' => 'Doryun Chong',
    'D.F.' => 'Douglas Fogle',
    'D.K.' => 'Diana Kim',
    'D.O.' => 'Dean Otto',
    'D.S.' => 'Daniel Smith',
    'E.C.' => 'Elizabeth Carpenter',
    'J.C.P.' => 'Jennifer Case Phelps',
    'J.R.' => 'Joan Rothfuss',
    'J.V.' => 'Jill Vetter',
    'L.D.' => 'Lynn Dierks',
    'M.A.' => 'Max Andrews',
    'O.I.' => 'Olukemi Ilesanmi',
    'P.B.' => 'Philip Bither',
    'P.V.' => 'Philippe Vergne',
    'R.F.' => 'Richard Flood',
    'R.Furtak' => 'Rosemary Furtak',
    'S.E.' => 'Siri Engberg',
    'S.M.' => 'Sheryl Mously',
  }

  # Renderers keyed by the `class` attribute of a paragraph's child node.
  # Each value is called with the child node and returns its markdown text.
  # Children whose class is missing or unlisted fall back to their plain text
  # (the default must itself be the renderer, not a factory returning one).
  PROCESSER = Hash.new(->(el) { el.text }).update(
    'body-copy' => ->(el) { el.text },
    'notes' => ->(el) { el.text },
    'sidebar-normal' => ->(el) { el.text },
    # Put the underscores inside any leading/trailing whitespace so the
    # spacing around the italic run is retained.
    'body-copy-italic' => lambda { |el|
      el.text
        .sub(/\A(\s*)/) { Regexp.last_match(1).to_s + '_' }
        .sub(/(\s*)\Z/) { '_' + Regexp.last_match(1).to_s }
    },
    # Unfortunate hack: html within html, because footnotes end up inside <li>.
    'notes-italic' => ->(el) { '<em>' + el.text + '</em>' },
    'body-copy-footnotes' => ->(el) { LINK_TO_FN % el.text },
    # Paragraphs holding the author's name are filtered out before rendering;
    # reaching this renderer means that filter was bypassed.
    'author-s-name' => ->(_el) { raise "BODY END" }
  )

  attr_accessor :source, :source_file_name, :sidebar, :author_string
  # The readers for these two are defined explicitly further down.
  attr_writer :artist_name, :title

  # opts[:path]   - path of the exported file; without it nothing is set up.
  # opts[:format] - extension including the dot; defaults to the path's own.
  #
  # Only '.html' input is parsed into @source.
  def initialize(opts = {})
    path = opts[:path]
    return unless path

    self.sidebar = false
    format = opts[:format] || File.extname(path)
    @source_file_name = File.basename(path).chomp(format)
    return unless format == '.html'

    # Block form closes the handle once parsing is done, and File.open never
    # interprets a leading "|" in the path as a command the way Kernel#open can.
    @source = File.open(path) { |io| Nokogiri::HTML(io) }
  end

  # Returns the rendered body followed by the footnotes wrapped in an ordered list.
  def body
    [
      process_body,
      "\n<div class=\"footnotes\">\n<ol>\n",
      process_notes,
      "\n</ol>\n</div>\n"
    ].join
  end

  # Returns the author name(s), memoized in author_string.
  #
  # Uses the first span.author-s-name when there is one. Otherwise the last
  # span.sidebar-normal is accepted only if it is a known set of initials or
  # a known full name; anything else yields nil.
  def author
    self.author_string ||= begin
      byline = @source.css('span.author-s-name').first.try(:text)
      if byline
        process_author(byline)
      else
        candidate = @source.css('span.sidebar-normal').last.try(:text)
        if AuthorMap.key?(candidate)
          AuthorMap[candidate]
        elsif AuthorMap.value?(candidate)
          candidate
        end
      end
    end
  end

  # Splits a byline on " with " / " and ", expands known initials through
  # AuthorMap and returns the names joined by ", ". Names are stripped of
  # surrounding whitespace whether or not they are found in the map.
  def process_author(text)
    names = text.split(/ with | and /).map do |raw|
      name = raw.strip
      AuthorMap.fetch(name, name)
    end
    names.join(', ')
  end

  # Returns the footnote definitions, one per note, separated by P.
  def notes
    process_notes
  end

  # may or may not exist
  def title
    return artist_name unless sidebar?

    @source.css('span.sidebar-normal').first.try(:text)
  end

  # True when the document has a span.sidebar-normal or the file name contains "_ADD".
  def sidebar?
    @source.css('span.sidebar-normal').present? || /_ADD/.match(@source_file_name).present?
  end

  # Returns the formatted citation, or nil when no author is known.
  #
  # The first author is inverted to "Last, First"; every remaining author is
  # appended in natural order.
  def citation
    return unless author.present?

    # Split only once so that a third (or later) author is kept, and swallow
    # the space after the comma so it is not doubled in the output.
    primary, others = author.split(/,\s*/, 2)
    name_parts = primary.to_s.split(' ')
    last = name_parts.pop
    first_middle = name_parts.join(' ').chomp('.') # remove . in middle abbrev to avoid ..
    other = ", #{others}" if others && !others.empty?
    CITATION % { last: last, first: first_middle, title: title, other: other }
  end

  # Returns "First Last" derived from the source file name, memoized.
  def artist_name
    @artist_name ||= process_artist_name
  end

  # Truthy when a document was parsed and its `validate` call returns a truthy value.
  # NOTE(review): Nokogiri's Document#validate appears to return nil without a
  # DTD and an array of errors otherwise, so this may be truthy even when
  # errors were found - confirm that is the intended check.
  def valid?
    @source && @source.validate
  end

  # id = ref
  #
  # Replaces every "[^N]" that is followed by whitespace with a superscript
  # link to footnote N. Footnote definitions ("[^N]: ...") are left alone
  # because a colon, not whitespace, follows their bracket. `prefix`
  # namespaces the generated ids and may be any object that interpolates.
  def self.link_to_fn_replace(html, prefix)
    html.gsub(/\[\^(\d+)\](\s+)/) do
      number = Regexp.last_match(1)
      spacing = Regexp.last_match(2)
      id = "fnref#{prefix}-#{number}"
      href = "#fn#{prefix}-#{number}"
      "<sup><a href=\"#{href}\" id=\"#{id}\" class=\"footnoteRef\" rel=\"footnote\">#{number}</a></sup>#{spacing}"
    end
  end

  # id = fn
  #
  # Replaces each "[^N]: text" line with a list item that links back to the
  # reference produced by link_to_fn_replace for the same prefix.
  def self.link_to_text_replace(html, prefix)
    converted = html.split("\n").map do |line|
      line.sub(/(\s)*\[\^(\d+)\]\: (.*)/) do
        number = Regexp.last_match(2)
        note = Regexp.last_match(3)
        href = "#fnref#{prefix}-#{number}"
        id = "fn#{prefix}-#{number}"
        "<li id=\"#{id}\"><p>#{note} <a href=\"#{href}\" class=\"jump\" rev=\"footnote\">↩</a></p></li>"
      end
    end
    converted.join("\n")
  end

  private

  # Renders every paragraph containing a .notes element, except the "Notes"
  # heading, as a footnote definition. Paragraphs without a leading number
  # are dropped.
  def process_notes
    definitions = @source.css('p').map do |el|
      next unless el.css('.notes').present?
      next if el.text == "Notes"

      number, text = extract_footnote(process(el))
      format_footnote(number, text) if number
    end
    definitions.compact.join(P)
  end

  # Builds "First Last" from the file name. Yields an empty string when the
  # name does not follow FILE_NAME_REGEX.
  def process_artist_name
    _matched, last, first = *FILE_NAME_REGEX.match(source_file_name)
    [first, last].compact.map(&:strip).join(' ')
  end

  # Renders the body paragraphs, skipping those that hold the author's name
  # or notes. Raises RuntimeError('Invalid Content') unless valid?.
  def process_body
    raise 'Invalid Content' unless valid?

    story = @source.css('.story p.body-copy')
    if sidebar?
      self.sidebar = true
      # The first paragraph of a sidebar is taken out of the body.
      self.title = story.shift
    end

    paragraphs = story.map do |el|
      next if el.css('.author-s-name').present? || el.css('.notes').present?

      process(el)
    end
    paragraphs.compact.join(P)
  end

  # Renders each child of el with the PROCESSER entry for its class and
  # concatenates the results.
  def process(el)
    el.children.map { |child| PROCESSER[child[:class]].call(child) }.join('')
  end

  # Returns [number, text] for a note that starts like "12. text", else nil.
  def extract_footnote(note_text)
    found = /(\d+)\. (.*)/.match(note_text)
    found && [found[1], found[2]]
  end

  def format_footnote(number, text)
    LINK_TO_TEXT % [number, text]
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment