Created
August 13, 2021 03:33
-
-
Save SitanHuang/18def28623e1d1f2dab49c399e9cea8b to your computer and use it in GitHub Desktop.
Multithreaded, auto-skipping, recursive pandoc batch conversion script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# frozen_string_literal: true

# Batch-converts Markdown files to PDF with pandoc.
#
# This first part of the script sets up the output directory and log file,
# then walks the tree rooted one level above this script's directory and
# queues every `.md` file whose PDF under TARGET is missing or older than
# the source.

require 'fileutils'

# Output directory (relative to the working directory set below).
TARGET = 'pdfs/'
# File that receives pandoc's stdout and stderr.
LOG_FILE = "#{TARGET}pandoc.log"
# set to number of cpu cores/threads
THREADS = 6 * 2

# record starting time for elapsed time
starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)

# change working directory to the parent of this script's directory
Dir.chdir(File.join(__dir__, '..'))

# open log file for pandoc (truncated on every run)
FileUtils.mkdir_p(TARGET)
log_file = File.open(LOG_FILE, 'w')

# number of discovered source files
discovered = 0
# number of skipped files because the output is at least as new as the source
skipped = 0
# number of newly converted files
converted = 0
# number of pandoc non zero exits
failed = 0

# create thread safe queue of [source, output] pairs
source_files = Queue.new

# walk through all the Markdown sources and queue the ones needing conversion
Dir.glob('**/*.md').each do |src|
  # never treat anything inside the output directory as a source
  next if src.start_with?(TARGET)

  discovered += 1
  # \z anchors at the true end of the string ($ would also match before a newline)
  out = TARGET + src.sub(/\.md\z/, '.pdf')

  # make output directory (mirrors the source tree under TARGET)
  FileUtils.mkdir_p(File.dirname(out))

  # skip if output's modified time >= source
  if File.exist?(out) && File.mtime(out) >= File.mtime(src)
    skipped += 1
  else
    source_files.push([src, out])
  end
end
# All work has been queued, so close the queue: once it is drained, `pop`
# returns nil instead of blocking. This replaces an `empty?` check followed
# by a blocking `shift`, which could hang a worker forever when another
# worker took the last item between the two calls.
source_files.close

# Guards the shared counters, which are updated from several worker threads.
counter_lock = Mutex.new

# Worker pool: each thread pulls [source, output] pairs until the queue is drained.
threads = Array.new(THREADS) do
  Thread.new do
    while (job = source_files.pop)
      src, out = job

      # do the conversion; pandoc's stdout and stderr both go to the log file
      system('pandoc',
             '--pdf-engine=xelatex', '-f', 'markdown',
             src, '-o', out, out: log_file, err: log_file)
      # $? is per-thread; capture it once, right after the call
      status = $?

      if status.exitstatus != 0
        STDERR.puts "pandoc exited with code #{status.exitstatus} for #{out}"
        counter_lock.synchronize { failed += 1 }
      else
        counter_lock.synchronize { converted += 1 }
      end
    end
  end
end
threads.each(&:join)

# every pandoc process has finished, so the log can be closed
log_file.close

# calculate elapsed time
ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
elapsed = ending - starting

puts "Discovered: #{discovered}"
puts "Skipped: #{skipped}"
puts "Converted: #{converted}"
puts "Failed: #{failed}"
puts "Elapsed time: #{elapsed.round(2)}s"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment