# rjkat/dedup.jl

## dedup.jl
# dedup.jl
# wrapper around Apache PDFBox (https://pdfbox.apache.org/)
# for removing duplicate pages from a PDF
# only tested on macOS

# Path of the PDFBox application JAR that `pdfbox` hands to `java -jar`.
# NOTE(review): this is a relative path, so it is presumably resolved against
# the working directory of the Julia process — confirm before running elsewhere.
JAR = "pdfbox-app-2.0.17.jar"

# Run the PDFBox command-line app with the given arguments.
#
# Every element of `args` is interpolated as its own word after
# `java -jar $JAR`. Returns whatever `run` returns for the command.
function pdfbox(args...)
    cmd = `java -jar $JAR $args`
    return run(cmd)
end

# Return the page count of `pdf` as an `Int`.
#
# The count is whatever `mdls -name kMDItemNumberOfPages -raw` prints for the
# file; `parse` throws if that output is not an integer.
# NOTE(review): `mdls` is a macOS tool, so this presumably only works there.
function numpages(pdf)
    query = `mdls -name kMDItemNumberOfPages -raw $pdf`
    raw = read(query, String)
    return parse(Int, raw)
end

# Extract the text of a single page of `infile` and return it as a vector of
# lines.
#
# Arguments:
#   infile - path of the PDF to read
#   page   - page number, passed to PDFBox as both -startPage and -endPage
#
# PDFBox's ExtractText writes into a scratch file, which is read back with
# `readlines`. Errors from `pdfbox` or `readlines` propagate to the caller.
function extractpage(infile, page)
    f = tempname()
    try
        pdfbox("ExtractText",
            "-startPage", page,
            "-endPage", page,
            infile, f
        )
        return readlines(f)
    finally
        # One scratch file is produced per page; remove it right away rather
        # than letting them pile up. `force=true` covers the case where PDFBox
        # failed before creating the file.
        rm(f; force=true)
    end
end

# Supertype of the duplicate-detection strategies accepted by `uniquepages`,
# which calls `prevdup(tracker, prevlines, lines)` on it for each page.
abstract type AbstractDupTracker end

# Tracker that recognises duplicates by a slide counter printed on the page.
# `curslide` holds the slide number most recently recorded by `prevdup`
# (starts at 1).
Base.@kwdef mutable struct SlideTracker <: AbstractDupTracker
    curslide::Int = 1
end

# Decide whether the page whose text is `lines` carries the slide number that
# `context` is currently tracking.
#
# Arguments:
#   context   - tracker; `curslide` is updated when a new slide number is seen
#   prevlines - text of the preceding page (not used by this tracker)
#   lines     - text lines of the page being examined
#
# Returns `true` when the last line starts with "<n> / <total>" and <n> equals
# `context.curslide`. Returns `false` otherwise, including when the page has
# no text at all or its last line has no slide counter.
function prevdup(context::SlideTracker, prevlines, lines)
    # A page without any extracted text has no last line to inspect.
    isempty(lines) && return false
    m = match(r"^(\d+)\s*/\s*(\d+)", lines[end])
    m === nothing && return false
    slide = parse(Int, m.captures[1])
    if slide == context.curslide
        return true
    end
    # New slide number: remember it for the next comparison.
    context.curslide = slide
    return false
end

# Return the page numbers of `pdf` to keep, in ascending order.
#
# Each page from the second onward is handed to `prevdup` together with the
# text of the page before it; whenever `prevdup` answers `false`, the
# preceding page is kept. The last page is always kept. An empty vector is
# returned when `numpages` reports fewer than one page.
function uniquepages(pdf, tracker::AbstractDupTracker)
    total = numpages(pdf)
    kept = Int[]
    total < 1 && return kept
    previous = extractpage(pdf, 1)
    for page in 2:total
        current = extractpage(pdf, page)
        prevdup(tracker, previous, current) || push!(kept, page - 1)
        previous = current
    end
    push!(kept, total)
    return kept
end

# Build the default output path for `inpdf`: same directory, same file name
# with its extension replaced by "-unique.pdf".
function defaultoutname(inpdf)
    folder, file = splitdir(inpdf)
    stem, _ = splitext(file)
    return joinpath(folder, string(stem, "-unique.pdf"))
end

# Write the given pages of `inpdf`, in the order listed, into a new PDF.
#
# Arguments:
#   inpdf  - path of the source PDF
#   pages  - collection of page numbers to keep; each is passed to PDFBox's
#            PDFSplit as both -startPage and -endPage
#   outpdf - destination path; defaults to `defaultoutname(inpdf)`
#
# Returns `outpdf`. Errors from `cp` or from a PDFBox invocation propagate to
# the caller.
function selectpages(inpdf, pages, outpdf=defaultoutname(inpdf))
    # Work in a scratch directory. The do-block form removes it (the copy of
    # the input and every single-page PDF) as soon as the merge has finished,
    # even when a step throws.
    mktempdir() do dir
        tmppdf = joinpath(dir, basename(inpdf))
        cp(inpdf, tmppdf)
        for page in pages
            pdfbox("PDFSplit", tmppdf,
                "-startPage", page,
                "-endPage", page,
                "-outputPrefix", joinpath(dir, string(page))
            )
        end
        # The merge expects PDFSplit to have written "<prefix>-1.pdf" per page.
        pdfs = [joinpath(dir, "$page-1.pdf") for page in pages]
        pdfbox("PDFMerger", pdfs..., outpdf)
    end
    return outpdf
end

# Remove duplicate pages from `inpdf`.
#
# `tracker` decides which pages count as duplicates (see `uniquepages`); the
# surviving pages are written to `outpdf` by `selectpages`, whose return value
# (the output path) is returned.
function dedup(inpdf, outpdf=defaultoutname(inpdf), tracker=SlideTracker())
    return selectpages(inpdf, uniquepages(inpdf, tracker), outpdf)
end
	# dedup.jl
	# wrapper around Apache PDFBox (https://pdfbox.apache.org/)
	# for removing duplicate pages from a PDF
	# only tested on macOS

	# Path of the PDFBox application JAR that `pdfbox` hands to `java -jar`.
	# NOTE(review): this is a relative path, so it is presumably resolved against
	# the working directory of the Julia process — confirm before running elsewhere.
	JAR = "pdfbox-app-2.0.17.jar"

	# Run the PDFBox command-line app with the given arguments.
	#
	# Every element of `args` is interpolated as its own word after
	# `java -jar $JAR`. Returns whatever `run` returns for the command.
	function pdfbox(args...)
	    cmd = `java -jar $JAR $args`
	    return run(cmd)
	end

	# Return the page count of `pdf` as an `Int`.
	#
	# The count is whatever `mdls -name kMDItemNumberOfPages -raw` prints for the
	# file; `parse` throws if that output is not an integer.
	# NOTE(review): `mdls` is a macOS tool, so this presumably only works there.
	function numpages(pdf)
	    query = `mdls -name kMDItemNumberOfPages -raw $pdf`
	    raw = read(query, String)
	    return parse(Int, raw)
	end

	# Extract the text of a single page of `infile` and return it as a vector of
	# lines.
	#
	# Arguments:
	#   infile - path of the PDF to read
	#   page   - page number, passed to PDFBox as both -startPage and -endPage
	#
	# PDFBox's ExtractText writes into a scratch file, which is read back with
	# `readlines`. Errors from `pdfbox` or `readlines` propagate to the caller.
	function extractpage(infile, page)
	    f = tempname()
	    try
	        pdfbox("ExtractText",
	            "-startPage", page,
	            "-endPage", page,
	            infile, f
	        )
	        return readlines(f)
	    finally
	        # One scratch file is produced per page; remove it right away rather
	        # than letting them pile up. `force=true` covers the case where PDFBox
	        # failed before creating the file.
	        rm(f; force=true)
	    end
	end

	# Supertype of the duplicate-detection strategies accepted by `uniquepages`,
	# which calls `prevdup(tracker, prevlines, lines)` on it for each page.
	abstract type AbstractDupTracker end

	# Tracker that recognises duplicates by a slide counter printed on the page.
	# `curslide` holds the slide number most recently recorded by `prevdup`
	# (starts at 1).
	Base.@kwdef mutable struct SlideTracker <: AbstractDupTracker
	    curslide::Int = 1
	end

	# Decide whether the page whose text is `lines` carries the slide number that
	# `context` is currently tracking.
	#
	# Arguments:
	#   context   - tracker; `curslide` is updated when a new slide number is seen
	#   prevlines - text of the preceding page (not used by this tracker)
	#   lines     - text lines of the page being examined
	#
	# Returns `true` when the last line starts with "<n> / <total>" and <n> equals
	# `context.curslide`. Returns `false` otherwise, including when the page has
	# no text at all or its last line has no slide counter.
	function prevdup(context::SlideTracker, prevlines, lines)
	    # A page without any extracted text has no last line to inspect.
	    isempty(lines) && return false
	    # Whitespace around the slash is optional ("3/10", "3 / 10", "3  /  10").
	    m = match(r"^(\d+)\s*/\s*(\d+)", lines[end])
	    m === nothing && return false
	    slide = parse(Int, m.captures[1])
	    if slide == context.curslide
	        return true
	    end
	    # New slide number: remember it for the next comparison.
	    context.curslide = slide
	    return false
	end

	# Return the page numbers of `pdf` to keep, in ascending order.
	#
	# Each page from the second onward is handed to `prevdup` together with the
	# text of the page before it; whenever `prevdup` answers `false`, the
	# preceding page is kept. The last page is always kept. An empty vector is
	# returned when `numpages` reports fewer than one page.
	function uniquepages(pdf, tracker::AbstractDupTracker)
	    total = numpages(pdf)
	    kept = Int[]
	    total < 1 && return kept
	    previous = extractpage(pdf, 1)
	    for page in 2:total
	        current = extractpage(pdf, page)
	        prevdup(tracker, previous, current) || push!(kept, page - 1)
	        previous = current
	    end
	    push!(kept, total)
	    return kept
	end

	# Build the default output path for `inpdf`: same directory, same file name
	# with its extension replaced by "-unique.pdf".
	function defaultoutname(inpdf)
	    folder, file = splitdir(inpdf)
	    stem, _ = splitext(file)
	    return joinpath(folder, string(stem, "-unique.pdf"))
	end

	# Write the given pages of `inpdf`, in the order listed, into a new PDF.
	#
	# Arguments:
	#   inpdf  - path of the source PDF
	#   pages  - collection of page numbers to keep; each is passed to PDFBox's
	#            PDFSplit as both -startPage and -endPage
	#   outpdf - destination path; defaults to `defaultoutname(inpdf)`
	#
	# Returns `outpdf`. Errors from `cp` or from a PDFBox invocation propagate to
	# the caller.
	function selectpages(inpdf, pages, outpdf=defaultoutname(inpdf))
	    # Work in a scratch directory. The do-block form removes it (the copy of
	    # the input and every single-page PDF) as soon as the merge has finished,
	    # even when a step throws.
	    mktempdir() do dir
	        tmppdf = joinpath(dir, basename(inpdf))
	        cp(inpdf, tmppdf)
	        for page in pages
	            pdfbox("PDFSplit", tmppdf,
	                "-startPage", page,
	                "-endPage", page,
	                "-outputPrefix", joinpath(dir, string(page))
	            )
	        end
	        # The merge expects PDFSplit to have written "<prefix>-1.pdf" per page.
	        pdfs = [joinpath(dir, "$page-1.pdf") for page in pages]
	        pdfbox("PDFMerger", pdfs..., outpdf)
	    end
	    return outpdf
	end

	# Remove duplicate pages from `inpdf`.
	#
	# `tracker` decides which pages count as duplicates (see `uniquepages`); the
	# surviving pages are written to `outpdf` by `selectpages`, whose return value
	# (the output path) is returned.
	function dedup(inpdf, outpdf=defaultoutname(inpdf), tracker=SlideTracker())
	    return selectpages(inpdf, uniquepages(inpdf, tracker), outpdf)
	end