do-me/Pdf_scan_shortened_marked.py

## Pdf_scan_shortened_marked.py
# PDF Scanner, Shortener and Marker

import PyPDF2
from PyPDF2 import PdfFileWriter,PdfFileReader
import os

# Use a forward-slash path directly, or convert Windows backslashes to forward slashes:
# pa=r"C:\Users\Dome\Desktop\nu\Wahlprogramme 2017\afd.pdf".replace("\\", "/")

# Part 1: PDF Scanner and Shortener

# Part 1 keeps only those pages of <pa><party>.pdf whose extracted text
# contains one of the search stems and writes them to
# <pa>migr/<party>_short.pdf.

# Programme to process; other programmes: Linke, Grüne, SPD, FDP, CDU/CSU, AfD
party = "linke1"

pa = "C:/Users/Dome/Desktop/nu/Wahlprogramme 2017/"
os.chdir(pa)
pdoc = pa + party + ".pdf"                  # input PDF
outp = pa + "migr/" + party + "_short.pdf"  # shortened output PDF

# Word stems that mark a page as relevant.
search_values = ["migr","Migr","flücht","Flücht","auslä","Auslä","Asyl",
                 "immigr","Immigr","Refugee", "geflüchte","Geflüchte",
                 "zuwander","Zuwander", "zugewandert"]
# The page text is lower-cased before matching, so the stems have to be
# lower-cased as well; otherwise stems that only exist capitalised
# ("Asyl", "Refugee") could never match. The set also removes the
# duplicates that lower-casing produces.
search_stems_lower = sorted({stem.lower() for stem in search_values})

writer = PdfFileWriter()
li = []  # zero-based indices of the pages that were kept

# Keep the input open until the output has been written, as the original
# script did; the context manager also closes it if an error occurs.
with open(pdoc, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    num_pages = pdfReader.numPages

    # A for loop advances the page index even when a page is skipped.
    # The previous while loop hit `continue` before `count += 1` and
    # therefore never terminated on a page without extractable text.
    for count in range(num_pages):
        pageObj = pdfReader.getPage(count)
        text = pageObj.extractText()
        if not text:  # covers both "" and None
            continue
        lowered_text = text.lower()
        if any(stem in lowered_text for stem in search_stems_lower):
            li.append(count)
            writer.addPage(pageObj)

    # Create the output folder if it is missing so the write cannot fail
    # with FileNotFoundError.
    os.makedirs(pa + "migr/", exist_ok=True)
    with open(outp, 'wb') as outfile:
        writer.write(outfile)

# Part 2: PDF Marker

import fitz

# Part 2 highlights every occurrence of the search stems in the shortened
# PDF (outp) and saves the result as <pa>migr/<party>_mig.pdf.
doc = fitz.open(outp)
pfin = pa + "migr/" + party + "_mig.pdf"  # highlighted output PDF

# The page text search ignores upper/lower case, so stems listed in both
# spellings ("migr"/"Migr") would find the same spots and stack two
# highlight annotations on each hit. Keep only the first spelling of each.
unique_stems = []
seen_lower = set()
for stemstring in search_values:
    lowered_stem = stemstring.lower()
    if lowered_stem not in seen_lower:
        seen_lower.add(lowered_stem)
        unique_stems.append(stemstring)

try:
    for i in range(doc.pageCount):
        page = doc[i]
        for stemstring in unique_stems:
            # searchFor returns the rectangles of all hits on this page.
            for inst in page.searchFor(stemstring):
                page.addHighlightAnnot(inst)
    doc.save(pfin, garbage=4, deflate=True, clean=True)
finally:
    # Release the file handle even if searching or saving fails.
    doc.close()
	# PDF Scanner, Shortener and Marker

	import PyPDF2
	from PyPDF2 import PdfFileWriter,PdfFileReader
	import os

	# Use a forward-slash path directly, or convert Windows backslashes to forward slashes:
	# pa=r"C:\Users\Dome\Desktop\nu\Wahlprogramme 2017\afd.pdf".replace("\\", "/")

	# Part 1: PDF Scanner and Shortener

	# Part 1 keeps only those pages of <pa><party>.pdf whose extracted text
	# contains one of the search stems and writes them to
	# <pa>migr/<party>_short.pdf.

	# Programme to process; other programmes: Linke, Grüne, SPD, FDP, CDU/CSU, AfD
	party = "linke1"

	pa = "C:/Users/Dome/Desktop/nu/Wahlprogramme 2017/"
	os.chdir(pa)
	pdoc = pa + party + ".pdf"                  # input PDF
	outp = pa + "migr/" + party + "_short.pdf"  # shortened output PDF

	# Word stems that mark a page as relevant.
	search_values = ["migr","Migr","flücht","Flücht","auslä","Auslä","Asyl",
		"immigr","Immigr","Refugee", "geflüchte","Geflüchte",
		"zuwander","Zuwander", "zugewandert"]
	# The page text is lower-cased before matching, so the stems have to be
	# lower-cased as well; otherwise stems that only exist capitalised
	# ("Asyl", "Refugee") could never match. The set also removes the
	# duplicates that lower-casing produces.
	search_stems_lower = sorted({stem.lower() for stem in search_values})

	writer = PdfFileWriter()
	li = []  # zero-based indices of the pages that were kept

	# Keep the input open until the output has been written, as the original
	# script did; the context manager also closes it if an error occurs.
	with open(pdoc, 'rb') as pdfFileObj:
		pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
		num_pages = pdfReader.numPages

		# A for loop advances the page index even when a page is skipped.
		# The previous while loop hit `continue` before `count += 1` and
		# therefore never terminated on a page without extractable text.
		for count in range(num_pages):
			pageObj = pdfReader.getPage(count)
			text = pageObj.extractText()
			if not text:  # covers both "" and None
				continue
			lowered_text = text.lower()
			if any(stem in lowered_text for stem in search_stems_lower):
				li.append(count)
				writer.addPage(pageObj)

		# Create the output folder if it is missing so the write cannot fail
		# with FileNotFoundError.
		os.makedirs(pa + "migr/", exist_ok=True)
		with open(outp, 'wb') as outfile:
			writer.write(outfile)

	# Part 2: PDF Marker

	import fitz

	# Part 2 highlights every occurrence of the search stems in the shortened
	# PDF (outp) and saves the result as <pa>migr/<party>_mig.pdf.
	doc = fitz.open(outp)
	pfin = pa + "migr/" + party + "_mig.pdf"  # highlighted output PDF

	# The page text search ignores upper/lower case, so stems listed in both
	# spellings ("migr"/"Migr") would find the same spots and stack two
	# highlight annotations on each hit. Keep only the first spelling of each.
	unique_stems = []
	seen_lower = set()
	for stemstring in search_values:
		lowered_stem = stemstring.lower()
		if lowered_stem not in seen_lower:
			seen_lower.add(lowered_stem)
			unique_stems.append(stemstring)

	try:
		for i in range(doc.pageCount):
			page = doc[i]
			for stemstring in unique_stems:
				# searchFor returns the rectangles of all hits on this page.
				for inst in page.searchFor(stemstring):
					page.addHighlightAnnot(inst)
		doc.save(pfin, garbage=4, deflate=True, clean=True)
	finally:
		# Release the file handle even if searching or saving fails.
		doc.close()