# Phoenix-Effect/pdf_hasher.py

## pdf_hasher.py
# run using python pdf_hasher.py <pdf_file_path.pdf>
# add -v argument to print the portions of the file which
# are excluded in hash calculation.

import hashlib
import argparse
import sys

def hash_file_exclude_id(file_path, verbose=False):
    """Return the SHA-256 hex digest of a file, skipping the trailer's /ID array.

    The whole file is read into memory. Starting at the last occurrence of
    ``trailer``, the first ``/ID`` and the first ``]`` after it delimit the
    span (closing bracket included) that is left out of the hash. If any of
    those three markers is missing, the entire file is hashed instead.

    Args:
        file_path: Path of the file to hash.
        verbose: When true, print the excluded bytes (decoded as Latin-1),
            or a notice that nothing was excluded.

    Returns:
        The hexadecimal SHA-256 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.sha256()
    with open(file_path, 'rb') as file:
        content = file.read()

    # Search the raw bytes directly: the offsets are then byte offsets by
    # construction, with no decode round-trip needed.
    id_start_index = -1
    id_end_index = -1
    trailer_start_index = content.rfind(b'trailer')
    if trailer_start_index != -1:
        id_start_index = content.find(b'/ID', trailer_start_index)
    if id_start_index != -1:
        # NOTE(review): the first ']' is taken as the end of the /ID array;
        # an ID written as a literal string containing ']' would end the
        # span early -- confirm whether such files need to be supported.
        closing_bracket_index = content.find(b']', id_start_index)
        if closing_bracket_index != -1:
            # Keep "not found" distinct from a real offset: only add 1
            # (to include the bracket) once the bracket is known to exist.
            id_end_index = closing_bracket_index + 1

    if id_end_index != -1:
        if verbose:
            excluded_content = content[id_start_index:id_end_index]
            print(f"Excluded from hash in '{file_path}':")
            print(excluded_content.decode('latin-1', errors='ignore'))

        hasher.update(content[:id_start_index])
        hasher.update(content[id_end_index:])
    else:
        # No complete /ID entry after the last trailer: hash everything.
        hasher.update(content)
        if verbose:
            print(f"No /ID found to exclude in '{file_path}'. Hashing entire file.")

    return hasher.hexdigest()

def main():
    """Command-line entry point.

    Parses the PDF path and the optional ``-v``/``--verbose`` flag, hashes
    the file with ``hash_file_exclude_id`` and prints the digest followed
    by two spaces and the path.
    """
    cli = argparse.ArgumentParser(
        description="Hash a PDF file excluding its /ID entry.",
    )
    cli.add_argument("file_path", help="Path to the PDF file to be hashed.")
    cli.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Print the parts of the file that were excluded from the hash.",
    )
    options = cli.parse_args()

    digest = hash_file_exclude_id(options.file_path, verbose=options.verbose)
    print(f"{digest}  {options.file_path}")

if __name__ == "__main__":
    # Run the command-line interface only when executed as a script,
    # not when this module is imported.
    main()