Skip to content

Instantly share code, notes, and snippets.

@rjkat
Created November 30, 2019 02:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjkat/9df5d6deec686387563409fe59ecbbb3 to your computer and use it in GitHub Desktop.
Save rjkat/9df5d6deec686387563409fe59ecbbb3 to your computer and use it in GitHub Desktop.
remove duplicate pages from a PDF
# dedup.jl
# wrapper around Apache PDFBox (https://pdfbox.apache.org/)
# for removing duplicate pages from a PDF
# only tested on macOS
# Location of the PDFBox command-line application jar. This is a relative path,
# so `java` looks it up from the directory the script is run in.
JAR = "pdfbox-app-2.0.17.jar"

"""
    pdfbox(args...)

Run the PDFBox command-line application (`java -jar \$JAR`) with `args` appended
as individual command-line arguments, and return whatever `run` returns.

Every argument is interpolated into the command, so non-string values such as
integer page numbers are accepted.
"""
function pdfbox(args...)
    command = `java -jar $JAR $args`
    return run(command)
end
"""
    numpages(pdf)

Return the number of pages of the PDF at path `pdf` as an `Int`.

The count is read from the `kMDItemNumberOfPages` metadata attribute via the
`mdls` command-line tool, and the raw output is parsed as an integer.
"""
function numpages(pdf)
    query = `mdls -name kMDItemNumberOfPages -raw $pdf`
    output = read(query, String)
    return parse(Int, output)
end
"""
    extractpage(infile, page)

Extract the text of a single page of the PDF `infile` and return it as a vector
of lines.

`page` is passed to PDFBox's `ExtractText` tool as both `-startPage` and
`-endPage`, so exactly one page is extracted. The text is written to a temporary
file, read back, and the temporary file is removed before returning.
"""
function extractpage(infile, page)
    f = tempname()
    try
        pdfbox("ExtractText",
            "-startPage", page,
            "-endPage", page,
            infile, f
        )
        return readlines(f)
    finally
        # One temp file is created per page; remove it so that scanning a long
        # document does not litter the temp directory. `force=true` keeps this
        # quiet when PDFBox failed before creating the file.
        rm(f; force=true)
    end
end
"""
    AbstractDupTracker

Supertype for objects that carry the state used to decide whether a page is a
duplicate of the page before it.
"""
abstract type AbstractDupTracker end

"""
    SlideTracker(; curslide=1)
    SlideTracker(curslide)

Duplicate tracker that remembers the most recently seen slide number in the
mutable field `curslide` (defaults to `1`).
"""
mutable struct SlideTracker <: AbstractDupTracker
    curslide::Int
end

# Keyword constructor supplying the default starting slide number.
SlideTracker(; curslide=1) = SlideTracker(curslide)
"""
    prevdup(context::SlideTracker, prevlines, lines)

Return `true` when the page whose text is `lines` carries the same slide number
as the one currently recorded in `context`, i.e. the previous page is an earlier
copy of the same slide.

The slide number is read from the last line of `lines`, which must start with
`<current> / <total>` (for example `3 / 20`). When the number differs from
`context.curslide`, the tracker is updated to the new number and `false` is
returned. Pages with no text, or whose last line does not match the pattern,
are never reported as duplicates and leave `context` untouched.

`prevlines` is not used by this tracker; it is part of the signature shared by
`prevdup` methods.
"""
function prevdup(context::SlideTracker, prevlines, lines)
    # A page without any extractable text has no footer to inspect; indexing
    # `lines[end]` on it would throw a BoundsError.
    isempty(lines) && return false
    m = match(r"^(\d+)\s*/\s*(\d+)", lines[end])
    isnothing(m) && return false
    slide = parse(Int, m.captures[1])
    dup = slide == context.curslide
    if !dup
        context.curslide = slide
    end
    return dup
end
"""
    uniquepages(pdf, tracker::AbstractDupTracker)

Return the page numbers of `pdf` to keep, as a `Vector{Int}` in ascending order.

Pages are examined in consecutive pairs: page `i - 1` is kept whenever
`prevdup(tracker, ...)` reports that page `i` is not a duplicate of it, and the
last page is always kept. An empty vector is returned when the document has no
pages.
"""
function uniquepages(pdf, tracker::AbstractDupTracker)
    total = numpages(pdf)
    kept = Int[]
    total < 1 && return kept

    previous = extractpage(pdf, 1)
    for page in 2:total
        current = extractpage(pdf, page)
        # The earlier page of the pair survives only if the later one is not a
        # repeat of it.
        prevdup(tracker, previous, current) || push!(kept, page - 1)
        previous = current
    end

    # The final page has no successor, so it is always kept.
    push!(kept, total)
    return kept
end
"""
    defaultoutname(inpdf)

Return the default output path for `inpdf`: the same directory, with the file
extension replaced by the suffix `-unique.pdf`.
"""
function defaultoutname(inpdf)
    dir, file = splitdir(inpdf)
    stem, _ = splitext(file)
    return joinpath(dir, string(stem, "-unique.pdf"))
end
"""
    selectpages(inpdf, pages, outpdf=defaultoutname(inpdf))

Write a new PDF at `outpdf` containing only the given `pages` of `inpdf`, in the
order they appear in `pages`, and return `outpdf`.

Each page is split out into its own single-page PDF with PDFBox's `PDFSplit`
tool, and the pieces are then joined with `PDFMerger`. All intermediate files
live in a scratch directory that is deleted before the function returns.

Throws an `ArgumentError` when `pages` is empty, since there would be nothing to
merge.
"""
function selectpages(inpdf, pages, outpdf=defaultoutname(inpdf))
    isempty(pages) &&
        throw(ArgumentError("no pages selected from $inpdf; nothing to merge"))
    # The do-block form removes the scratch directory (input copy plus one PDF
    # per page) as soon as the merge is done, even if PDFBox fails.
    mktempdir() do dir
        tmppdf = joinpath(dir, basename(inpdf))
        cp(inpdf, tmppdf)
        for page in pages
            pdfbox("PDFSplit", tmppdf, "-startPage", page, "-endPage", page,
                "-outputPrefix", joinpath(dir, string(page)))
        end
        # This relies on PDFSplit writing each single-page result to
        # `<prefix>-1.pdf`.
        pdfs = [joinpath(dir, "$page-1.pdf") for page in pages]
        pdfbox("PDFMerger", pdfs..., outpdf)
    end
    return outpdf
end
"""
    dedup(inpdf, outpdf=defaultoutname(inpdf), tracker=SlideTracker())

Remove duplicate pages from `inpdf`: determine the pages to keep with
`uniquepages` using `tracker`, write them to `outpdf` with `selectpages`, and
return `outpdf`.
"""
function dedup(inpdf, outpdf=defaultoutname(inpdf), tracker=SlideTracker())
    keep = uniquepages(inpdf, tracker)
    return selectpages(inpdf, keep, outpdf)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment