Skip to content

Instantly share code, notes, and snippets.

@metadaddy
Created September 1, 2023 17:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save metadaddy/ceaf352d2a9c834bb8a11ae37734df92 to your computer and use it in GitHub Desktop.
Save metadaddy/ceaf352d2a9c834bb8a11ae37734df92 to your computer and use it in GitHub Desktop.
This script uses the PDF file at https://github.com/Snowflake-Labs/snowpark-python-demos/blob/main/pdf-analysis/prod_review10.pdf as a template for creating more PDFs with random names, dates, product selections and recommendations. This PDF file is used in the Snowflake demo "How To Analyze PDF Docs Using Snowpark" at https://www.youtube.com/watch?v=NqZzUACUzm8
# Backblaze wants developers and organizations to copy and re-use our
# code examples, so we make the samples available by several different
# licenses. One option is the MIT license (below). Other options are
# available here:
#
# https://www.backblaze.com/using_b2_code.html
#
#
# The MIT License (MIT)
#
# Copyright (c) 2023 Backblaze
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# This script uses the PDF file at
# https://github.com/Snowflake-Labs/snowpark-python-demos/blob/main/pdf-analysis/prod_review10.pdf
# as a template for creating more PDFs with random names, dates, product
# selections and recommendations. This PDF file is used in the Snowflake demo
# "How To Analyze PDF Docs Using Snowpark" at https://www.youtube.com/watch?v=NqZzUACUzm8
#
# The code is somewhat tailored to that template PDF, but may be a useful guide
# to manipulating fields in other documents.
from PyPDF2 import PdfWriter, PdfReader
from PyPDF2.constants import FieldDictionaryAttributes as FD
from PyPDF2.constants import AnnotationDictionaryAttributes as AD
from PyPDF2.generic import NameObject, TextStringObject
from random import choice, random
from datetime import datetime
import censusname
# Pools that the random form values are drawn from
genders = [
    "male",
    "female",
]
products = [
    "Red Skateboard",
    "Blue Skateboard",
    "Tennis Shoes",
    "Basket Ball",
    "Boat",
]
recommendations = [
    "Yes",
    "No",
    "MayBe",
]

# Location of the template PDF - adjust to match your checkout
template_file_path = "data/prod_review10.pdf"

# Bit masks tested against a field's flags value
RADIO_BUTTON = 1 << 15  # zero-based bit 15
MULTI_SELECT = 1 << 21  # zero-based bit 21
# This is an extended version of the update_page_form_field_values method in
# PdfWriter that handles radio buttons, and removes outdated appearances so
# that stale values do not show when the PDF is viewed
def update_field_values(page, fields):
    """Write new values into the form widgets found on a page.

    Args:
        page: Page object whose '/Annots' entries are scanned. A page with no
            '/Annots' entry is left untouched.
        fields: Dict mapping a field name (the value stored under FD.T) to the
            value to set. Fields that are absent from the dict, or whose value
            is empty, are skipped.
    """
    # A page without any annotations has nothing to update
    if '/Annots' not in page:
        return

    for annot_ref in page['/Annots']:
        annot = annot_ref.get_object()
        if annot.get('/Subtype') != '/Widget':
            continue

        # Radio button field is not itself an annotation, but contains an
        # annotation for each individual button. Hence, we have to find
        # annotations that have a parent that is a radio button field - i.e.
        # it has the radio button bit set in its field flags - and set the
        # value of the parent
        parent = annot.get(FD.Parent)
        # Dereference the IndirectObject if there is one
        parent = parent and parent.get_object()
        # The flags entry is optional; treat a missing one as "no bits set"
        # rather than letting None reach the bitwise test below
        parent_flags = parent.get(FD.Ff, 0) if parent else 0

        if parent_flags & RADIO_BUTTON:
            # The annotation is one of the individual buttons, parent is the
            # radio button field
            if field_value := fields.get(parent.get(FD.T)):
                update_radio_button_field(annot, parent, field_value)
        elif field_value := fields.get(annot.get(FD.T)):
            # We found the annotation for a text/choice field - they seem to
            # work the same way
            update_text_field(annot, field_value)

            # Fix issue with single-select choice containing an "/I" array.
            # This should only be present for multi-select choice fields and
            # seems to confuse Adobe Acrobat.
            field_flags = annot.get(FD.Ff, 0)
            is_choice = annot.get(FD.FT) == '/Ch'
            if is_choice and not (field_flags & MULTI_SELECT) and annot.get('/I'):
                del annot['/I']
def update_text_field(annot, field_value):
    """Store a new value on a text/choice field annotation.

    Args:
        annot: The field's annotation dictionary; modified in place.
        field_value: The text to store as the field's value.
    """
    annot[NameObject(FD.V)] = TextStringObject(field_value)

    # Nothing more to do unless the annotation carries an appearance entry
    if not annot.get(AD.AP):
        return

    # Drop the outdated appearance so the old value is not what gets shown
    # when the PDF is viewed
    del annot[AD.AP]
def update_radio_button_field(annot, parent, field_value):
    """Select one option of a radio button group.

    Called once per individual button: the parent field receives the chosen
    value, and the button's appearance state is switched on or off to match.

    Args:
        annot: Annotation dictionary of one individual button; modified in
            place.
        parent: The radio button field that owns the button; modified in
            place.
        field_value: Name of the option to select, without a leading '/'.
    """
    # PDF names carry a leading '/'. Build the name once: it is both the value
    # stored on the parent and the key to look for among this button's
    # appearance states.
    on_state = "/" + field_value

    parent.update({
        NameObject(FD.V): NameObject(on_state)
    })

    # Either entry may be missing or held as an indirect reference, so resolve
    # each one and fall back to "no states" instead of failing on None.
    # NOTE(review): FD.N is presumably the "/N" (normal appearance) key -
    # confirm it is defined by the installed PyPDF2 constants.
    appearance = annot.get(AD.AP)
    appearance = appearance.get_object() if appearance is not None else {}
    states = appearance.get(FD.N)
    states = states.get_object() if states is not None else {}

    if on_state in states:
        # This is the 'on' button
        new_state = on_state
    else:
        # Section 12.7.4.2.3 of the PDF 1.7 spec defines "Off" as the name of
        # the off state appearance for check boxes, and this seems to also
        # apply to radio buttons
        new_state = "/Off"

    # Set the appearance state so the button shows correctly when the PDF is
    # viewed
    annot.update({
        NameObject(AD.AS): NameObject(new_state)
    })
def main():
    """Create randomized copies of the template PDF.

    Reads the template once, then for each number in the configured range
    fills the form with a random name, product, purchase date and
    recommendation, prints the chosen values, and writes the result to
    data/prod_review<number>.pdf.
    """
    template = PdfReader(template_file_path)

    # Set range to suit your requirements
    for review_number in range(11, 31):
        # Draw both given names for the same gender so they are consistent
        name_gender = choice(genders)
        given_names = [
            censusname.generate(nameformat='{given}', given=name_gender)
            for _ in range(2)
        ]
        surname = censusname.generate(nameformat='{surname}')

        chosen_product = choice(products)
        chosen_recommendation = choice(recommendations)

        # Pick a random moment between January 1st of this year and now
        now = datetime.today()
        year_start = datetime(now.year, 1, 1)
        elapsed = now - year_start
        purchased_at = year_start + elapsed * random()

        fields = {
            'FirstName': given_names[0],
            'Middle Name': given_names[1],
            'LastName': surname,
            'Product': chosen_product,
            # Date looks like 03/25/2023
            'Purchase Date': purchased_at.strftime("%m/%d/%Y"),
            'Recommend': chosen_recommendation
        }
        print(fields)

        pdf = PdfWriter()
        # Clone the whole document rather than adding just the page: the
        # fields live on the document, the annotations on the page, and we
        # need both
        pdf.clone_document_from_reader(template)

        # This is *part* of what's needed for new field values to show up in
        # PDF viewers like Mac Preview and Adobe Acrobat
        pdf.set_need_appearances_writer()

        update_field_values(pdf.get_page(0), fields)

        pdf.write(f"data/prod_review{review_number}.pdf")
# Generate the PDFs only when run as a script, not when imported
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment