Skip to content

Instantly share code, notes, and snippets.

@do-me
Created May 25, 2019 11:22
Show Gist options
  • Save do-me/3346c3ac74d6b475302300ec582f4a20 to your computer and use it in GitHub Desktop.
Save do-me/3346c3ac74d6b475302300ec582f4a20 to your computer and use it in GitHub Desktop.
# PDF Scanner, Shortener and Marker
import PyPDF2
from PyPDF2 import PdfFileWriter,PdfFileReader
import os
# use path directly or deal with windows \ or /:
# pa=r"C:\Users\Dome\Desktop\nu\Wahlprogramme 2017\afd.pdf".replace("\\", "/")
# Part 1: PDF Scanner and Shortener
# Copies every page of <party>.pdf whose extracted text contains one of the
# search stems into migr/<party>_short.pdf.
party = "linke1"  # and all the others: Linke, Grüne, SPD, FDP, CDU/CSU, AfD
pa = "C:/Users/Dome/Desktop/nu/Wahlprogramme 2017/"
os.chdir(pa)
pdoc = pa + party + ".pdf"
outp = pa + "migr/" + party + "_short.pdf"  # filename of the shortened PDF

search_values = ["migr", "Migr", "flücht", "Flücht", "auslä", "Auslä", "Asyl",
                 "immigr", "Immigr", "Refugee", "geflüchte", "Geflüchte",
                 "zuwander", "Zuwander", "zugewandert"]
# The page text is lower-cased before matching, so the stems have to be
# lower-cased as well; otherwise capitalised-only stems such as "Asyl" or
# "Refugee" could never match. search_values itself is left untouched because
# the marker part further down reuses it.
lowered_stems = sorted({stem.lower() for stem in search_values})

li = []  # zero-based indices of the pages that were kept
writer = PdfFileWriter()

# Make sure the output folder exists before trying to write into it.
os.makedirs(os.path.dirname(outp), exist_ok=True)

# The output is written while the input file is still open, mirroring the
# original order (input closed only after writer.write()); the writer holds
# page objects that come from the reader.
with open(pdoc, 'rb') as pdfFileObj:
    # The pdfReader variable is a readable object that will be parsed
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # discerning the number of pages will allow us to parse through all pages
    num_pages = pdfReader.numPages
    # A for loop (instead of while + manual counter) guarantees that skipping
    # a page can never stall the iteration.
    for count in range(num_pages):  # count holds current page
        pageObj = pdfReader.getPage(count)
        text = pageObj.extractText()
        if not text:  # no text (empty string or None): nothing to search
            continue
        page_text = text.lower()
        if any(stem in page_text for stem in lowered_stems):
            li.append(count)
            writer.addPage(pageObj)
    with open(outp, 'wb') as outfile:  # save
        writer.write(outfile)
# PDF Marker
# Highlights every occurrence of the search stems in the shortened PDF and
# saves the annotated copy as migr/<party>_mig.pdf.
import fitz

pfin = pa + "migr/" + party + "_mig.pdf"  # path name of the highlighted PDF
doc = fitz.open(outp)
try:
    for i in range(doc.pageCount):  # loop over all pages
        page = doc[i]
        for stemstring in search_values:  # marks all search values defined above
            # NOTE(review): if searchFor ignores case, stems that differ only
            # in capitalisation ("migr"/"Migr") highlight the same spot twice
            # -- confirm against the installed PyMuPDF version before deduping.
            for inst in page.searchFor(stemstring):
                page.addHighlightAnnot(inst)
    doc.save(pfin, garbage=4, deflate=True, clean=True)  # save
finally:
    # Release the document even if searching or saving fails.
    doc.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment