Skip to content

Instantly share code, notes, and snippets.

@oncomouse
Last active August 29, 2015 14:10
Show Gist options
  • Save oncomouse/2e3aa2c538c39e7bddca to your computer and use it in GitHub Desktop.
Save oncomouse/2e3aa2c538c39e7bddca to your computer and use it in GitHub Desktop.
Ruby script to process LaTeX files for conversion to .docx using Pandoc as a transport.
# Usage:
# => ruby pandoc.rb -i <file_name>(.tex) (options for pandoc)
# => minimum options for pandoc: --bibliography=<bibliography_file.bib> --csl=<mla or chicago>.csl
# Requires the BibTeX gem, installed by running 'gem install bibtex' (you may need to prefix the command with sudo)
require 'bibtex'
# --- Command-line handling and initial state -------------------------------
#
# Expected invocation:
#   ruby pandoc.rb -i <file_name>(.tex) [options passed through to pandoc]
#
# After this section runs, the following are defined for the rest of the script:
#   file                 - path of the input file, always ending in ".tex"
#   options              - Hash with "file_name_base" (path without ".tex") and,
#                          when given on the command line, "bibliography"
#   bibliography         - the opened BibTeX database
#   pandoc_options       - remaining command-line arguments joined with spaces
#   new_latex_content    - accumulator for the rewritten LaTeX source
#   $chapter / $labels   - global state shared with scan_in_file

# Fallback BibTeX database, used when no --bibliography option is passed.
default_bibliography = "./transhumanism.bib" # change this to whatever

# The input file is the argument that follows "-i".
input_flag = ARGV.find_index("-i")
file = input_flag && ARGV[input_flag + 1]
raise "No Input File Provided" if file.nil?

# Remove only the "-i <file>" pair; everything left in ARGV is meant for pandoc.
ARGV.slice!(input_flag, 2)

options = {}

# Honour pandoc's --bibliography option (both "--bibliography=FILE" and
# "--bibliography FILE") so the title lookup uses the same database pandoc does.
# The argument stays in ARGV because pandoc needs it as well.
ARGV.each_with_index do |argument, index|
  value =
    if argument.start_with?("--bibliography=")
      argument.split("=", 2).last
    elsif argument == "--bibliography"
      ARGV[index + 1]
    end
  options["bibliography"] = value unless value.nil? || value.empty?
end

bibliography = BibTeX.open(options.fetch("bibliography", default_bibliography))

# Accept the input name with or without its ".tex" extension.
options["file_name_base"] = file.sub(/\.tex\z/, "")
file = "#{options["file_name_base"]}.tex"

raise "File #{file} Not Found" unless File.exist?(file)

pandoc_options = ARGV.join(" ")
new_latex_content = ""

# Global state read and written by scan_in_file: the label table maps a label
# key to its {"number", "name"} pair; $chapter is the current chapter number.
$labels = {}
$chapter = 1
# Recursively read "./<file_name>.tex", inline any \input{...} files it
# references, and record every \label that directly follows a command such as
# \caption{...} or \section{...}.
#
# Recorded labels go into the global $labels hash as
#   $labels[key] = { "number" => "<chapter>.<n>" or "<chapter>", "name" => "<command argument>" }
# where <chapter> is the current value of the global $chapter. Figure and
# section counters restart at 1 for every file scanned (including nested ones).
# The \label{...} commands themselves are removed from the returned text.
#
# @param file_name [String] path of the file to read, without the ".tex" extension
# @param indent [String] whitespace inserted after every newline of the result
# @return [String] the processed file contents prefixed with a newline, or ""
#   (after printing a notice) when the file does not exist
def scan_in_file(file_name, indent)
  path = "./#{file_name}.tex"
  unless File.exist?(path)
    puts "Could not find #{file_name}.tex"
    return ""
  end

  # The label table may not have been created yet by the caller.
  $labels ||= {}

  include_file = File.read(path)
  figure = 1
  section = 1

  # Pull in other included files
  include_file.gsub!(/^(\t*)\\input\{([^}]+)\}/) do
    nested_indent, nested_name = Regexp.last_match.captures
    scan_in_file(nested_name, nested_indent)
  end

  # Collect label information: a command with a braced argument, followed on
  # the next line by \label{key}.
  include_file.gsub!(/\t*\\([^{]*)\{(.+)\}\n\s*\\label\{(.*)\}/) do |match|
    command, title, key = Regexp.last_match.captures
    number =
      case command
      when "caption"
        figure += 1
        "#{$chapter}.#{figure - 1}"
      when "section"
        section += 1
        "#{$chapter}.#{section - 1}"
      else
        "#{$chapter}"
      end
    $labels[key] = { "number" => number, "name" => title }
    # Drop the \label command but keep the command it was attached to.
    match.sub("\\label{#{key}}", "")
  end

  "\n#{indent}#{include_file}".gsub(/\n/, "\n#{indent}")
end
# --- Build the pandoc-friendly LaTeX source ---------------------------------

input_path = "#{options["file_name_base"]}.tex"
# Detect if the file we're processing is not in the same directory as the
# parser; included files are looked up relative to the main file.
input_dir = File.dirname(input_path)
file_base_dir = input_dir == "." ? "" : "#{input_dir}/"

# Copy the main file line by line, replacing each \ChapterInput{...} or
# \input{...} line with the (recursively expanded) contents of that file.
# Every such include counts as one chapter.
File.foreach(input_path) do |line|
  if line =~ /^(\t*)\\(?:ChapterInput|input)\{([^}]+)\}/
    line = scan_in_file(file_base_dir + Regexp.last_match(2), Regexp.last_match(1))
    $chapter += 1
  end
  new_latex_content << line
end

# Correctly insert label text: \ref{key} becomes the recorded number and
# \nameref{key} the recorded name. The table may be nil if nothing set it up.
$labels ||= {}
new_latex_content.gsub!(/\\(name){0,1}ref\{([^}]+)\}/) do
  kind, key = Regexp.last_match.captures
  label = $labels[key]
  if label.nil?
    "LABEL NOT FOUND"
  elsif kind == "name"
    label["name"]
  else
    label["number"]
  end
end
#puts new_latex_content

# enumerate* and itemize* make pandoc freak out.
new_latex_content.gsub!(/(enumerate|itemize)\*/, "\\1")

# Look up titles: books are emphasised, everything else is quoted.
new_latex_content.gsub!(/\\citetitle\{([^}]+)\}/) do |match|
  key = Regexp.last_match(1)
  entry = bibliography[key]
  if entry.nil?
    # Unknown key: warn and leave the command untouched instead of crashing.
    warn "Bibliography entry not found for \\citetitle{#{key}}"
    match
  elsif entry.type == :book
    "\\emph{#{entry.title}}"
  else
    "``#{entry.title}''"
  end
end

# Fix wonky citation format for pandoc-citeproc: prefix every page number in
# the optional argument with "p. " (e.g. [12, 15] -> [p. 12, p. 15]).
new_latex_content.gsub!(/(citep|autocite|cite)\[([0-9, ]+)\]\{(\S+)\}/) do
  command, pages, cite_key = Regexp.last_match.captures
  pages = pages.split(/, */).map { |page| "p. #{page}" }.join(", ")
  "#{command}[#{pages}]{#{cite_key}}"
end

# Give figures an explicit extension: use the first of png/eps/pdf/jpg for
# which a file exists on disk, otherwise leave the name unchanged.
new_latex_content.gsub!(/(\t+)\\includegraphics(.*)\{([^}]+)\}/) do
  tabs, graphics_options, file_name = Regexp.last_match.captures
  extension = %w[png eps pdf jpg].find { |ext| File.exist?("#{file_name}.#{ext}") }
  file_name = "#{file_name}.#{extension}" if extension
  "#{tabs}\\includegraphics#{graphics_options}{#{file_name}}"
end

# Fix epigraphs: render them as a centred, emphasised "text --- source" block.
new_latex_content.gsub!(/(\t+)\\epigraph\{(.+)\}\{(.+)\}/, "\\1\\begin{center}\n\\1\t\\emph{\\2 \\linebreak[4] --- \\3}\n\\1\\end{center}")

# --- Run pandoc on a temporary copy ------------------------------------------

pandoc_input = "#{options["file_name_base"]}-pandoc.tex"
File.open(pandoc_input, "w") do |pandoc|
  pandoc.puts new_latex_content
end

begin
  # system (not exec) so control returns here and the temporary file can be
  # removed. Arguments are passed as a list, bypassing the shell, so paths
  # containing spaces work. ARGV holds the pass-through pandoc options.
  # NOTE(review): --smart was removed in pandoc 2.0 — confirm the installed
  # pandoc version still accepts it.
  converted = system("pandoc", "-s", "--smart", pandoc_input,
                     "-o", "#{options["file_name_base"]}.docx", *ARGV)
  #exec "pandoc -s -f latex -t markdown+citations #{options["file_name_base"]}-pandoc.tex -o #{options["file_name_base"]}.md #{pandoc_options}"
ensure
  File.delete(pandoc_input) if File.exist?(pandoc_input)
end

abort "pandoc conversion failed" unless converted
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment