Skip to content

Instantly share code, notes, and snippets.

@paulsonkantony
Last active November 30, 2021 05:44
Show Gist options
  • Save paulsonkantony/c8689c69715e5477885821d4dffa2c03 to your computer and use it in GitHub Desktop.
Save paulsonkantony/c8689c69715e5477885821d4dffa2c03 to your computer and use it in GitHub Desktop.
Code snippet to remove duplicate pages from a PDF File
#!/usr/bin/env python
"""pdfClean.py: Remove duplicate pages from a PDF File."""
import os
import PyPDF2
import numpy as np
from tqdm import trange, tqdm
__author__ = 'Paulson K Antony'
__copyright__ = 'Copyright 2021, randomProjects'
__credits__ = ['Paulson K Antony']
__license__ = 'GPL'
__version__ = '3.0'
def extract_unique_page_numbers(array):
    """Map each distinct value in *array* to the index of its first occurrence.

    Args:
        array: One-dimensional NumPy array (or plain sequence) of mutually
            comparable values, e.g. the extracted text of every PDF page.

    Returns:
        dict: ``{value: first_index}``. Keys appear in the sorted order
        produced by ``np.unique``; the indices are NumPy integers.
    """
    # np.asarray lets plain lists through: an elementwise ``array == value``
    # comparison on a list yields a bare bool, which has no ``.nonzero()``.
    # return_index=True yields the first-occurrence positions from a single
    # sort instead of rescanning the whole array once per distinct value.
    values, first_indices = np.unique(np.asarray(array), return_index=True)
    return dict(zip(values, first_indices))
# Script body: read input.pdf, keep only the first page for each distinct page
# text, and write the surviving pages (in their original order) to output.pdf.
# NOTE(review): PdfFileReader/PdfFileWriter/getPage/extractText/addPage are the
# camelCase PyPDF2 names this script was written against; later PyPDF2/pypdf
# releases are believed to have renamed them - confirm the installed version.

# Replace input.pdf with file name of the input
pdf_reader = PyPDF2.PdfFileReader('input.pdf')
pdf_writer = PyPDF2.PdfFileWriter()
numPages = pdf_reader.getNumPages()

# Two pages count as duplicates when their extracted text is identical.
print('Extracting page number:')
page_texts = [pdf_reader.getPage(i).extractText() for i in trange(numPages)]
print('\n')

# dtype=object keeps every page's text as an ordinary Python string.
file = np.array(page_texts, dtype=object)

# Index of the first page carrying each distinct text.
pages_to_keep = {int(index) for index in extract_unique_page_numbers(file).values()}
# Pages with no extractable text (scans, figures, blanks) all yield the same
# empty string, so text comparison cannot tell them apart. Keep every one of
# them rather than silently dropping all but the first.
pages_to_keep.update(
    index for index, text in enumerate(page_texts) if not text.strip()
)
uniquePages = sorted(pages_to_keep)

print('Writing unique pages to output: ')
for page_number in tqdm(uniquePages):
    pdf_writer.addPage(pdf_reader.getPage(page_number))

# Replace output.pdf with file name of the output
pdf_output = "output.pdf"
with open(pdf_output, 'wb') as f:
    pdf_writer.write(f)
print('OUTPUT GENERATED SUCCESSFULLY!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment