Convert CSV files to PBO parallel bilingual books
#!/usr/bin/env ruby | |
# | |
# Usage: | |
# csv2pbo book.csv > book.pbo | |
require 'csv' | |
# Path of the CSV file to convert, taken from the first command-line argument.
path = ARGV[0]
# Stop before emitting any output when no input file was given; otherwise we
# would print a half-written document and then crash inside CSV.foreach.
abort 'Usage: csv2pbo book.csv > book.pbo' unless path

# PBO format _looks_ like XML, but the parser is very fragile. So we
# generate this by hand, starting with the root element whose metadata
# attributes are all left empty.
print '<ParallelBook lang1="" author1="" title1="" info1="" lang2="" author2="" title2="" info2="" info="">'
# Escape +s+ so it can be embedded in a double-quoted attribute value of the
# generated markup.
#
# The characters &, ", < and > are replaced by the XML entities &amp;, &quot;,
# &lt; and &gt;. The substitution is done in a single pass so that the
# ampersands introduced by one replacement are never escaped a second time.
#
# @param s [String] raw cell text
# @return [String] a new string with the four special characters escaped
def escape(s)
  s.gsub(/[&"<>]/, { '&' => '&amp;', '"' => '&quot;', '<' => '&lt;', '>' => '&gt;' })
end
# A cell that starts with digits, a period and a space (e.g. "12. ") is
# treated as a chapter heading. You may need to tweak this regex.
chapter_heading = /(\A\d+\.) /

# Emit one <p> element per CSV row: column 0 goes into the s attribute and
# column 1 into the t attribute; missing cells become empty strings.
CSV.foreach(path).with_index do |row, index|
  source = row[0] || ""
  target = row[1] || ""

  level_attr =
    if index.zero?
      # Assume the first line is the title, and mark it l="4". These codes
      # seem a bit weird...
      ' l="4"'
    elsif source.match?(chapter_heading) && target.match?(chapter_heading)
      # Try to mark chapter headings as l="5" so we get a table of contents.
      ' l="5"'
    else
      # If we still had paragraphs, we could mark them as l="3".
      ''
    end

  print "<p#{level_attr} s=\"#{escape(source)}\" t=\"#{escape(target)}\" />"
end

# Close the root element opened before the loop.
print '</ParallelBook>'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment