# blip2/bluebeam_masher.py

## bluebeam_masher.py
import glob
import re
import argparse
import csv

# The description below is not the first statement of the module, so Python
# does not treat it as the module docstring and ``__doc__`` would be None.
# Bind it explicitly so the ``epilog=__doc__`` below actually shows the text
# in ``--help``.
__doc__ = '''
Bluebeam Markup Masher
Takes all markup objects in a document with a subject that isn't "Text Box"
and exports the values to a CSV. A new value can be listed in the CSV

Doesn't work great on multi-line annotations...

Note: code is pretty hacky... needs more regex tlc

'''

# Command-line interface: -t exports markup to CSV, -f applies a CSV back.
# RawDescriptionHelpFormatter keeps the line breaks of the epilog as written.
parser = argparse.ArgumentParser(
    description='Parse markedup PDFs and convert data to CSV',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=__doc__)
parser.add_argument(
    '-t', '--tocsv', action='store_true', help='Convert PDF markup to CSV.')
parser.add_argument(
    '-f', '--fromcsv', action='store_true', help='Replace PDF markup content from CSV.')
# Parsed at module level; the dispatch at the bottom of the script reads it.
args = parser.parse_args()

def pdf_to_csv():
    """Export Bluebeam markup subjects and contents to one CSV per PDF.

    For every ``*.pdf`` in the current working directory a file named
    ``<pdf name>.csv`` is written with the header
    ``Subject, Current, Update To`` followed by one row per markup object
    whose subject is not ``Text Box``. ``Update To`` is pre-filled with the
    current content so that only the values to change need editing.

    Markup objects that lack a ``Subj(...)`` or ``Contents(...)`` entry are
    skipped instead of aborting the export.
    """
    # The PDF is read as bytes, so every pattern must be a bytes pattern
    # (a str pattern on bytes raises TypeError on Python 3).
    markup_re = re.compile(
        rb'obj(.*?BluebeamPDFRevu.*?)endobj.*?stream(.*?)endstream', re.S)
    subj_re = re.compile(rb'Subj\((.*?)\)')
    contents_re = re.compile(rb'Contents\((.*?)\)')

    for filename in glob.glob('*.pdf'):
        with open(filename, 'rb') as pdf_file:
            pdf = pdf_file.read()

        # csv.writer emits str, so the file must be opened in text mode;
        # newline='' lets the csv module control line endings itself.
        with open(filename + '.csv', 'w', newline='',
                  encoding='utf-8') as csv_file:
            output = csv.writer(csv_file)
            output.writerow(['Subject', 'Current', 'Update To'])

            for obj_body, _stream in markup_re.findall(pdf):
                subj_match = subj_re.search(obj_body)
                content_match = contents_re.search(obj_body)
                if subj_match is None or content_match is None:
                    continue

                # Decoded as UTF-8 to mirror csv_to_pdf, which encodes the
                # CSV cells as UTF-8 before substituting them back.
                subj = subj_match.group(1).decode('utf-8', 'replace')
                content = content_match.group(1).decode('utf-8', 'replace')
                if subj != 'Text Box':
                    output.writerow([subj, content, content])

def csv_to_pdf():
    """Apply the ``Update To`` values of each CSV back onto its PDF.

    For every ``*.pdf`` in the current working directory the companion
    ``<pdf name>.csv`` is read (header row skipped). Each row is
    ``Subject, Current, Update To``; for the markup with that subject:

    1. ``Current`` is replaced by ``Update To`` inside the annotation object.
    2. In the first following appearance stream, the first ``(...)`` text
       string is replaced by ``Update To`` and any further text strings are
       blanked to a single space.
    3. Any empty ``()`` string up to the end of that stream is filled with
       ``Update To``.

    The result is written to ``X<pdf name>``; the original PDF is untouched.
    PDFs without a companion CSV are skipped with a message, and CSV rows
    with fewer than three columns (e.g. blank lines) are ignored.
    """
    text_re = re.compile(rb'\((.*?)\)', re.S)

    for filename in glob.glob('*.pdf'):
        try:
            csv_file = open(filename + '.csv', 'r', newline='',
                            encoding='utf-8')
        except FileNotFoundError:
            # Typical for the 'X*.pdf' outputs of an earlier run.
            print('Skipping ' + filename + ': no matching CSV found')
            continue

        with csv_file:
            reader = csv.reader(csv_file)
            next(reader, None)  # skip the header row; tolerate an empty file
            rows = [[cell.encode('utf-8') for cell in row[:3]]
                    for row in reader if len(row) >= 3]

        with open(filename, 'rb') as pdf_file:
            pdf = pdf_file.read()

        for subject, current, updated in rows:
            # The subject is literal text, not a pattern: escape it so
            # characters such as '+', '?' or '[' cannot break the regex.
            prefix = rb'obj.*?Subj\(' + re.escape(subject) + rb'\)'

            # Step 1: swap the value inside the annotation object. Skipped
            # for an empty current value, because bytes.replace(b'', x)
            # would insert x between every byte of the match.
            if current:
                contents_re = re.compile(
                    prefix + rb'.*?Contents\((.*?)\).*?endobj', re.S)
                pdf = contents_re.sub(
                    lambda m: m.group().replace(current, updated), pdf)

            # Step 2: rewrite the text strings of the appearance stream.
            stream_re = re.compile(
                prefix + rb'.*?(stream.*?endstream)', re.S)
            found = stream_re.search(pdf)
            if found:
                stream = found.group(1)
                texts = text_re.findall(stream)
                if texts:
                    first, rest = texts[0], texts[1:]
                    # An empty first string is left for step 3 to fill.
                    new_stream = (stream.replace(first, updated)
                                  if first else stream)
                    for item in rest:
                        if item:
                            new_stream = new_stream.replace(item, b" ")

                    pdf = stream_re.sub(
                        lambda m: m.group().replace(stream, new_stream), pdf)

            # Step 3: fill empty '()' strings with the new value.
            empty_re = re.compile(
                prefix + rb'.*?stream.*?(\(.*?\)).*?endstream', re.S)
            pdf = empty_re.sub(
                lambda m: m.group().replace(b'()', b'(' + updated + b')'),
                pdf)

        with open('X' + filename, "wb") as out:
            out.write(pdf)

# Run the conversion(s) requested on the command line. Both flags may be
# given together, in which case the export runs before the import.
#
# NOTE: a whitespace-mangled duplicate paste of this whole script used to
# follow here; it made the module unparseable and its regexes were corrupted
# ('.*?' had become '.?'), so it was removed.
if args.tocsv:
    pdf_to_csv()

if args.fromcsv:
    csv_to_pdf()