Last active
November 30, 2021 05:44
-
-
Save paulsonkantony/c8689c69715e5477885821d4dffa2c03 to your computer and use it in GitHub Desktop.
Code snippet to remove duplicate pages from a PDF File
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""pdfClean.py: Remove duplicate pages from a PDF File.""" | |
import os | |
import PyPDF2 | |
import numpy as np | |
from tqdm import trange, tqdm | |
__author__ = 'Paulson K Antony' | |
__copyright__ = 'Copyright 2021, randomProjects' | |
__credits__ = ['Paulson K Antony'] | |
__license__ = 'GPL' | |
__version__ = '3.0' | |
def extract_unique_page_numbers(array):
    """Map each distinct value in *array* to the index of its first occurrence.

    Args:
        array: Array-like of mutually comparable values (the script passes a
            1-D object array holding the extracted text of every page).

    Returns:
        dict: ``{value: first_index}`` for every distinct value.  For inputs
        with more than one dimension, ``first_index`` is the index along the
        first axis of the first match in row-major order.
    """
    # asarray lets plain lists/tuples work too; an ndarray passes through as is.
    arr = np.asarray(array)
    # One sort-based pass yields the distinct values together with the
    # flattened index of each value's first occurrence, instead of
    # re-scanning the whole array once per distinct value.
    values, first = np.unique(arr, return_index=True)
    if arr.ndim > 1 and first.size:
        # Convert the flattened positions back to first-axis indices.
        first = np.unravel_index(first, arr.shape)[0]
    return dict(zip(values, first))
# Replace input.pdf with file name of the input
pdf_reader = PyPDF2.PdfFileReader('input.pdf')
pdf_writer = PyPDF2.PdfFileWriter()
numPages = pdf_reader.getNumPages()

# Collect the extracted text of every page; two pages are treated as
# duplicates of each other when their extracted text is identical.
file = []
print('Extracting page number:')
for i in trange(numPages):
    file.append(pdf_reader.getPage(i).extractText())
print('\n')
file = np.array(file, dtype=object)

# Keep the first occurrence of every distinct page text.
uniquePages = {int(p) for p in extract_unique_page_numbers(file).values()}
# Pages that yield no text (e.g. scanned or image-only pages) cannot be
# compared by text, so keep all of them rather than collapsing them into one.
uniquePages.update(i for i, text in enumerate(file) if not text.strip())
# Write the pages back in their original document order.
uniquePages = sorted(uniquePages)

print('Writing unique pages to output: ')
for page_index in tqdm(uniquePages):
    pdf_writer.addPage(pdf_reader.getPage(page_index))

# Replace output.pdf with file name of the output
pdf_output = "output.pdf"
with open(pdf_output, 'wb') as f:
    pdf_writer.write(f)
print('OUTPUT GENERATED SUCCESSFULLY!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment