Last active
August 29, 2015 14:01
-
-
Save sheepeeh/e7465e2e51d01f5db841 to your computer and use it in GitHub Desktop.
Format and sort Zotero HTML bibliographies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#encoding: UTF-8 | |
require 'nokogiri' | |
# NOTE(review): every line of this listing ends with ` | |`, which looks like
# table-cell residue from the web page it was copied from. It is not valid
# Ruby and presumably has to be stripped before the script can run -- confirm
# against the original gist.
#
# This script is intended to be used with the custom CSL at https://gist.github.com/sheepeeh/dbb7b02973644d397378 | |
# as it relies on sorting by call number to work. | |
# Takes a directory of HTML bibliographies exported by Zotero and makes it a little nicer | |
# for display on an Omeka Simple Page. | |
# Expects files to be named CollectionNumber-BoxNumber.html | |
#
# Overall flow, per input file:
#   1. Parse the HTML and write "<name>_folders.html" with a box heading, a
#      folder heading before the first entry of each folder, every csl-entry,
#      and every Z3988 span.
#   2. Re-read that output file and apply a table of literal string
#      replacements to it.
#   3. Re-read it again and apply a second table that adds <strong>/<em> markup.
#
# NOTE(review): the glob below matches every .html file in the current
# directory, and the output files are written to the same directory with an
# .html extension, so a second run would also pick up earlier
# "*_folders.html" outputs as inputs.
Dir.glob("./*.html").each do |page| | |
name = File.basename(page,".html") | |
puts "Processing #{name}" | |
# Box number comes from the file name; `coll` is never read before it is
# reassigned inside the entry loop below.
coll,box = name.split("-") | |
# NOTE(review): the handle returned by `open` is not closed explicitly.
doc = Nokogiri::HTML(open("#{page}")) | |
# Array to make sure folders are not printed twice. | |
folders = [] | |
# Open output file. | |
# Mode "a" appends: if the output file already exists, its content is kept
# and the new output is added after it.
f = File.open("./#{name}_folders.html","a") | |
# Adds an H3 with the box number and a link to a box search. | |
# (Requires an Item Type Metadata element for box.) | |
# CHANGE TO FIT YOUR LINK STRUCTURE | |
# Also starts csl-bib-body DIV. | |
f.puts "<h3><a href='/exhibits/merrigan/items/browse?search=&advanced[0][element_id]=53&advanced[0][type]=contains&advanced[0][terms]=#{name}' title='View other items in box #{box}'>Box #{box}</a></h3>\n<div style='line-height: 1.35; padding-left: 2em; text-indent:-2em;' class='csl-bib-body'>" | |
doc.xpath("//div[@class='csl-entry']").each do |entry| | |
text = entry.content | |
# This regex is based on our local identifier construction. | |
# CHANGE TO MATCH YOUR LOCAL IDENTIFIER STRUCTURE | |
# Matches "455" followed by three "-NNN" groups (three digits each).
id = text.match(/(455(-([0-9]{3})){3})/) | |
# When there is no match, `id` is nil and to_s yields "", so the split
# below returns an empty array and all four variables become nil.
id = id.to_s | |
# NOTE(review): `coll` and `box` here are the same variables assigned from
# the file name above, so this overwrites them. After an entry with no
# identifier match, `box` is nil and interpolates as an empty string in the
# folder link.
coll,box,folder,item = id.split("-") | |
folder = "not listed" if folder.nil? | |
# Adds an H4 with the folder number and a link to a folder search | |
# unless one has already been created. | |
# (Requires an Item Type Metadata element for folder.) | |
# CHANGE TO FIT YOUR LINK STRUCTURE | |
f.puts "\n<h4><a href='/exhibits/merrigan/items/browse?search=&advanced[0][element_id]=54&advanced[0][type]=contains&advanced[0][terms]=455-#{box}-#{folder}'>Folder #{folder}</a></h4>" unless folders.include?(folder) | |
# Writes the entry node itself; `puts` stringifies it.
f.puts entry | |
# `folder` can no longer be nil here (it was defaulted to "not listed"
# above), so this guard is always true.
folders << folder unless folder.nil? | |
end | |
# Outputs COinS spans | |
doc.xpath("//span[@class='Z3988']").each do |coins| | |
f.puts coins | |
end | |
# Close csl-bib-body DIV. | |
f.puts "</div>" | |
f.close | |
# Replace all of the weirdness introduced by Nokogiri | |
text = File.read("./#{name}_folders.html") | |
# Relabels the string as UTF-8 in place; the bytes are not transcoded.
text.force_encoding 'utf-8' | |
# Each row is [search, replacement]. Rows are applied in order as literal
# (non-regex) substitutions, so later rows see the result of earlier ones.
# NOTE(review): as displayed, the second element of the first two rows is
# `"""`, which is not a complete string literal, and the em-dash and
# curly-quote rows that follow map a string to itself. Presumably the
# original search strings were HTML entities or mis-decoded byte sequences
# that were altered when this listing was extracted -- restore them from the
# original before relying on this table.
replacements = [["\u201C","""], | |
["\u201D","""], | |
[' ', '' ], | |
[" :",":"], | |
["———","—"], | |
[ '—', '—' ], | |
[ '’', '’' ], | |
[ '‘', '‘' ], | |
[ '”', '”' ], | |
[ '“', '“' ], | |
[ ' ', ' ' ], | |
[ 'Â<A0>', ' '], | |
[ '—', "—" ], | |
[ '–', "—" ], | |
[ '…', '…'], | |
[ 'é', 'é'], | |
[ '“', '“'], | |
[ '‘', '‘'], | |
[ '’', '’'], | |
[ 'â€', '”'], | |
[ ' ', ' '], | |
["–",'-'], | |
["‘","'"], | |
["’","'"] | |
] | |
replacements.each { |replacement| text.gsub!(replacement[0], replacement[1]) } | |
# Mode "w" truncates, so the cleaned text replaces the file's content.
File.open("./#{name}_folders.html", "w") { |file| file.puts text } | |
# Add/modify some formatting. | |
# Second pass over the same output file: wraps the "NAL Notes:", "Abstract:"
# and "NAL identifier:" labels in <strong> and converts <i> to <em>.
# NOTE(review): the file is written and immediately re-read between the two
# passes; the text is already in memory at this point.
text = File.read("./#{name}_folders.html") | |
replacements = [['<div class="csl-block">NAL Notes:', '<div class="csl-block"><strong>NAL Notes:</strong>'], | |
['<div class="csl-block">Abstract:', '<div class="csl-block"><strong>Abstract:</strong>'], | |
["NAL identifier:","<strong>NAL identifier:</strong>"], | |
["<i>","<em>"], | |
["</i>","</em>"] | |
] | |
replacements.each { |replacement| text.gsub!(replacement[0], replacement[1]) } | |
# Same relative path as above, written here without the leading "./".
File.open("#{name}_folders.html", "w") { |file| file.puts text } | |
end | |
puts "Done." | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Within Omeka, I add the following CSS