# metadaddy/makepdfs.py

## makepdfs.py
# Backblaze wants developers and organization to copy and re-use our
# code examples, so we make the samples available by several different
# licenses.  One option is the MIT license (below).  Other options are
# available here:
#
# https://www.backblaze.com/using_b2_code.html
#
#
# The MIT License (MIT)
#
# Copyright (c) 2023 Backblaze
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# This script uses the PDF file at
# https://github.com/Snowflake-Labs/snowpark-python-demos/blob/main/pdf-analysis/prod_review10.pdf
# as a template for creating more PDFs with random names, dates, product
# selections and recommendations. This PDF file is used in the Snowflake demo
# "How To Analyze PDF Docs Using Snowpark" at https://www.youtube.com/watch?v=NqZzUACUzm8
#
# The code is somewhat tailored to that template PDF, but may be a useful guide
# to manipulating fields in other documents.
from PyPDF2 import PdfWriter, PdfReader
from PyPDF2.constants import FieldDictionaryAttributes as FD
from PyPDF2.constants import AnnotationDictionaryAttributes as AD
from PyPDF2.generic import NameObject, TextStringObject
from random import choice, random
from datetime import datetime
import censusname

# Value pools that main() draws from at random for each generated PDF.
# A gender is passed to censusname as the `given` argument when generating
# first and middle names.
genders = ["male", "female"]
products = ["Red Skateboard", "Blue Skateboard", "Tennis Shoes", "Basket Ball", "Boat"]
recommendations = ["Yes", "No", "MayBe"]

# Path to the template PDF - change this as necessary
template_file_path = "data/prod_review10.pdf"

# PDF field flags: bit masks that are tested against a field's '/Ff' entry
RADIO_BUTTON = 1 << 15
MULTI_SELECT = 1 << 21


# This is an extended version of the update_page_form_field_values method in
# PdfWriter that handles radio buttons, and removes outdated appearances
def update_field_values(page, fields):
    """Apply new form field values to the widget annotations of a page.

    Args:
        page: Page object whose '/Annots' entries are examined. A page that
            has no '/Annots' entry is left untouched.
        fields: Dict mapping a field name (the field's '/T' entry) to the new
            value. Entries with a falsy value are ignored.

    Widgets whose parent has the radio button flag set are handled by
    update_radio_button_field; any other widget whose own name is in
    `fields` is handled by update_text_field. The page's annotation and
    field dictionaries are modified in place; nothing is returned.
    """
    # A page without annotations has no '/Annots' key at all
    if '/Annots' not in page:
        return

    for annot_ref in page['/Annots']:
        annot = annot_ref.get_object()
        if annot.get('/Subtype') != '/Widget':
            continue

        # Radio button field is not itself an annotation, but contains an annotation
        # for each individual button. Hence, we have to find annotations that have
        # a parent that is a radio button field - i.e. it has the radio button bit
        # set in its field flags - and set the value of the parent
        parent = annot.get(FD.Parent)
        # Dereference the IndirectObject if there is one
        parent = parent and parent.get_object()
        # '/Ff' may be absent; treat that as "no flags set" rather than
        # attempting a bitwise AND on None
        parent_field_flag = parent.get(FD.Ff, 0) if parent else 0
        if parent and (parent_field_flag & RADIO_BUTTON):
            # The annotation is one of the individual buttons, parent is the radio button
            # field
            if field_value := fields.get(parent.get(FD.T)):
                update_radio_button_field(annot, parent, field_value)
        elif field_value := fields.get(annot.get(FD.T)):
            # We found the annotation for a text/choice field - they seem to work the same way
            update_text_field(annot, field_value)

        # Fix issue with single-select choice containing an "/I" array.
        # This should only be present for multi-select choice fields and
        # seems to confuse Adobe Acrobat.
        field_flag = annot.get(FD.Ff, 0)
        if annot.get(FD.FT) == '/Ch' and not (field_flag & MULTI_SELECT) and annot.get('/I'):
            del annot['/I']


def update_text_field(annot, field_value):
    """Store a new value on a text/choice widget and drop its old appearance.

    Args:
        annot: The widget annotation dictionary; modified in place.
        field_value: The string to store as the field's value entry.
    """
    annot[NameObject(FD.V)] = TextStringObject(field_value)

    # An appearance left over from the template would still be shown when the
    # PDF is viewed, so discard it whenever one is present.
    stale_appearance = annot.get(AD.AP)
    if stale_appearance:
        del annot[AD.AP]


def update_radio_button_field(annot, parent, field_value):
    """Select one option of a radio button group.

    Args:
        annot: Widget annotation for one individual button of the group;
            its appearance state entry is set in place.
        parent: The radio button field dictionary that owns `annot`; its
            value entry is set in place.
        field_value: Name of the option to select, without a leading '/'.
    """
    # The parent's name matched a key in the dict of new values.
    # Set the parent value - it must be preceded by a '/'
    state_name = "/" + field_value
    parent.update({
        NameObject(FD.V): NameObject(state_name)
    })

    # Collect the appearance states this button defines. Item access is used
    # instead of .get() so that a reference is resolved to the real dictionary
    # (NOTE(review): relies on PyPDF2's DictionaryObject item access
    # dereferencing - confirm). A button with no appearance dictionary is
    # treated as having no 'on' state instead of raising on None.
    appearance = annot[AD.AP] if AD.AP in annot else {}
    normal_states = appearance[FD.N] if FD.N in appearance else {}

    # Names elsewhere in this file are spelled with their leading '/'
    # ('/Widget', '/Off'), so the state keys are presumably slash-prefixed
    # too; the bare name is still accepted for backward compatibility.
    if state_name in normal_states or field_value in normal_states:
        # This is the 'on' button
        new_state = state_name
    else:
        # Section 12.7.4.2.3 of the PDF 1.7 spec defines "Off" as the name of the off state
        # appearance for check boxes, and this seems to also apply to radio buttons
        new_state = "/Off"

    # Set the appearance state so the button shows correctly when the PDF is viewed
    annot.update({
        NameObject(AD.AS): NameObject(new_state)
    })


def main():
    """Create numbered copies of the template PDF filled with random values.

    For each number in the range a random person, product, purchase date and
    recommendation are chosen, printed, written into a clone of the template
    and saved as data/prod_review<number>.pdf.
    """
    template = PdfReader(template_file_path)

    # Set range to suit your requirements
    for file_number in range(11, 31):
        # One gender per person keeps the first and middle names consistent
        person_gender = choice(genders)
        given_name = censusname.generate(nameformat='{given}', given=person_gender)
        second_name = censusname.generate(nameformat='{given}', given=person_gender)
        surname = censusname.generate(nameformat='{surname}')

        chosen_product = choice(products)
        recommendation = choice(recommendations)

        # Pick a random moment between 1 January of the current year and now
        now = datetime.today()
        year_start = datetime(now.year, 1, 1)
        elapsed = now - year_start
        purchased_on = year_start + elapsed * random()

        fields = {
            'FirstName': given_name,
            'Middle Name': second_name,
            'LastName': surname,
            'Product': chosen_product,
            # Date looks like 03/25/2023
            'Purchase Date': purchased_on.strftime("%m/%d/%Y"),
            'Recommend': recommendation,
        }
        print(fields)

        # Clone the whole document rather than just adding the page: the
        # fields on the document are needed as well as the annotations on
        # the page.
        writer = PdfWriter()
        writer.clone_document_from_reader(template)
        # This is *part* of what's needed for new field values to show up in
        # PDF viewers like Mac Preview and Adobe Acrobat
        writer.set_need_appearances_writer()
        first_page = writer.get_page(0)
        update_field_values(first_page, fields)

        writer.write(f"data/prod_review{file_number}.pdf")


# Generate the PDFs only when this file is run as a script, not when imported.
# NOTE(review): the tab-indented text after this guard repeats the whole
# module with its indentation flattened; it looks like a paste/merge
# artifact - confirm and remove.
if __name__ == "__main__":
    main()
	# Backblaze wants developers and organization to copy and re-use our
	# code examples, so we make the samples available by several different
	# licenses. One option is the MIT license (below). Other options are
	# available here:
	#
	# https://www.backblaze.com/using_b2_code.html
	#
	#
	# The MIT License (MIT)
	#
	# Copyright (c) 2023 Backblaze
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.

	# This script uses the PDF file at
	# https://github.com/Snowflake-Labs/snowpark-python-demos/blob/main/pdf-analysis/prod_review10.pdf
	# as a template for creating more PDFs with random names, dates, product
	# selections and recommendations. This PDF file is used in the Snowflake demo
	# "How To Analyze PDF Docs Using Snowpark" at https://www.youtube.com/watch?v=NqZzUACUzm8
	#
	# The code is somewhat tailored to that template PDF, but may be a useful guide
	# to manipulating fields in other documents.
	from PyPDF2 import PdfWriter, PdfReader
	from PyPDF2.constants import FieldDictionaryAttributes as FD
	from PyPDF2.constants import AnnotationDictionaryAttributes as AD
	from PyPDF2.generic import NameObject, TextStringObject
	from random import choice, random
	from datetime import datetime
	import censusname

	# Value pools that main() draws from at random for each generated PDF.
	# A gender is passed to censusname as the `given` argument when generating
	# first and middle names.
	genders = ["male", "female"]
	products = ["Red Skateboard", "Blue Skateboard", "Tennis Shoes", "Basket Ball", "Boat"]
	recommendations = ["Yes", "No", "MayBe"]

	# Path to the template PDF - change this as necessary
	template_file_path = "data/prod_review10.pdf"

	# PDF field flags: bit masks that are tested against a field's '/Ff' entry
	RADIO_BUTTON = 1 << 15
	MULTI_SELECT = 1 << 21


	# This is an extended version of the update_page_form_field_values method in
	# PdfWriter that handles radio buttons, and removes outdated appearances
	def update_field_values(page, fields):
	    """Apply new form field values to the widget annotations of a page.

	    Args:
	        page: Page object whose '/Annots' entries are examined. A page that
	            has no '/Annots' entry is left untouched.
	        fields: Dict mapping a field name (the field's '/T' entry) to the
	            new value. Entries with a falsy value are ignored.

	    Widgets whose parent has the radio button flag set are handled by
	    update_radio_button_field; any other widget whose own name is in
	    `fields` is handled by update_text_field. The page's annotation and
	    field dictionaries are modified in place; nothing is returned.
	    """
	    # A page without annotations has no '/Annots' key at all
	    if '/Annots' not in page:
	        return

	    for annot_ref in page['/Annots']:
	        annot = annot_ref.get_object()
	        if annot.get('/Subtype') != '/Widget':
	            continue

	        # Radio button field is not itself an annotation, but contains an annotation
	        # for each individual button. Hence, we have to find annotations that have
	        # a parent that is a radio button field - i.e. it has the radio button bit
	        # set in its field flags - and set the value of the parent
	        parent = annot.get(FD.Parent)
	        # Dereference the IndirectObject if there is one
	        parent = parent and parent.get_object()
	        # '/Ff' may be absent; treat that as "no flags set" rather than
	        # attempting a bitwise AND on None
	        parent_field_flag = parent.get(FD.Ff, 0) if parent else 0
	        if parent and (parent_field_flag & RADIO_BUTTON):
	            # The annotation is one of the individual buttons, parent is the radio button
	            # field
	            if field_value := fields.get(parent.get(FD.T)):
	                update_radio_button_field(annot, parent, field_value)
	        elif field_value := fields.get(annot.get(FD.T)):
	            # We found the annotation for a text/choice field - they seem to work the same way
	            update_text_field(annot, field_value)

	        # Fix issue with single-select choice containing an "/I" array.
	        # This should only be present for multi-select choice fields and
	        # seems to confuse Adobe Acrobat.
	        field_flag = annot.get(FD.Ff, 0)
	        if annot.get(FD.FT) == '/Ch' and not (field_flag & MULTI_SELECT) and annot.get('/I'):
	            del annot['/I']


	def update_text_field(annot, field_value):
	    """Store a new value on a text/choice widget and drop its old appearance.

	    Args:
	        annot: The widget annotation dictionary; modified in place.
	        field_value: The string to store as the field's value entry.
	    """
	    annot.update({
	        NameObject(FD.V): TextStringObject(field_value)
	    })
	    # Remove outdated appearance so it doesn't show when the PDF is viewed
	    if annot.get(AD.AP):
	        del annot[AD.AP]


	def update_radio_button_field(annot, parent, field_value):
	    """Select one option of a radio button group.

	    Args:
	        annot: Widget annotation for one individual button of the group;
	            its appearance state entry is set in place.
	        parent: The radio button field dictionary that owns `annot`; its
	            value entry is set in place.
	        field_value: Name of the option to select, without a leading '/'.
	    """
	    # The parent's name matched a key in the dict of new values.
	    # Set the parent value - it must be preceded by a '/'
	    state_name = "/" + field_value
	    parent.update({
	        NameObject(FD.V): NameObject(state_name)
	    })

	    # Collect the appearance states this button defines. Item access is used
	    # instead of .get() so that a reference is resolved to the real dictionary
	    # (NOTE(review): relies on PyPDF2's DictionaryObject item access
	    # dereferencing - confirm). A button with no appearance dictionary is
	    # treated as having no 'on' state instead of raising on None.
	    appearance = annot[AD.AP] if AD.AP in annot else {}
	    normal_states = appearance[FD.N] if FD.N in appearance else {}

	    # Names elsewhere in this file are spelled with their leading '/'
	    # ('/Widget', '/Off'), so the state keys are presumably slash-prefixed
	    # too; the bare name is still accepted for backward compatibility.
	    if state_name in normal_states or field_value in normal_states:
	        # This is the 'on' button
	        new_state = state_name
	    else:
	        # Section 12.7.4.2.3 of the PDF 1.7 spec defines "Off" as the name of the off state
	        # appearance for check boxes, and this seems to also apply to radio buttons
	        new_state = "/Off"

	    # Set the appearance state so the button shows correctly when the PDF is viewed
	    annot.update({
	        NameObject(AD.AS): NameObject(new_state)
	    })


	def main():
	    """Create numbered copies of the template PDF filled with random values.

	    For each number in the range a random person, product, purchase date and
	    recommendation are chosen, printed, written into a clone of the template
	    and saved as data/prod_review<number>.pdf.
	    """
	    reader = PdfReader(template_file_path)

	    # Set range to suit your requirements
	    for n in range(11, 31):
	        # Choose a gender so that first and middle names are consistent
	        gender = choice(genders)
	        first_name = censusname.generate(nameformat='{given}', given=gender)
	        middle_name = censusname.generate(nameformat='{given}', given=gender)
	        last_name = censusname.generate(nameformat='{surname}')

	        product = choice(products)
	        recommend = choice(recommendations)

	        # Generate a random date this year
	        end = datetime.today()
	        start = datetime(end.year, 1, 1)
	        random_date = start + (end - start) * random()

	        # Date looks like 03/25/2023
	        purchase_date = random_date.strftime("%m/%d/%Y")

	        fields = {
	            'FirstName': first_name,
	            'Middle Name': middle_name,
	            'LastName': last_name,
	            'Product': product,
	            'Purchase Date': purchase_date,
	            'Recommend': recommend
	        }

	        print(fields)

	        writer = PdfWriter()
	        # Need to clone the document rather than just add the page, since we
	        # need the fields on the document as well as the annotations on the
	        # page
	        writer.clone_document_from_reader(reader)
	        # This is part of what's needed for new field values to show up in
	        # PDF viewers like Mac Preview and Adobe Acrobat
	        writer.set_need_appearances_writer()
	        update_field_values(writer.get_page(0), fields)

	        output_file_path = f"data/prod_review{n}.pdf"
	        writer.write(output_file_path)


	# Generate the PDFs only when this file is run as a script, not when imported
	if __name__ == "__main__":
	    main()