Skip to content

Instantly share code, notes, and snippets.

@Phoenix-Effect
Last active February 10, 2024 20:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Phoenix-Effect/3116a708d2c4128ea4884fdc6a6b6827 to your computer and use it in GitHub Desktop.
Save Phoenix-Effect/3116a708d2c4128ea4884fdc6a6b6827 to your computer and use it in GitHub Desktop.
This code is an example of how a PDF hash can be calculated while ignoring the '/ID' identifier in the trailer of the file. The '/ID' entry is modified each time the file is saved, even if the contents have not been modified, so files with identical contents generate different hashes. The Python script opens a given PDF file in binary mode and hashes everything except the '/ID' entry.
# run using python pdf_hasher.py <pdf_file_path.pdf>
# add -v argument to print the portions of the file which
# are excluded in hash calculation.
import hashlib
import argparse
import sys
def hash_file_exclude_id(file_path, verbose=False):
    """Return the SHA-256 hex digest of a file, skipping the trailer's /ID entry.

    The file is read in binary mode. The last ``trailer`` keyword is located,
    then the first ``/ID`` after it; the bytes from ``/ID`` through the next
    ``]`` (inclusive) are left out of the hash. If any of these three markers
    is missing, nothing is excluded and the whole file is hashed.

    Args:
        file_path: Path to the file to hash.
        verbose: When true, print the excluded bytes, or a notice that the
            entire file was hashed.

    Returns:
        The hexadecimal SHA-256 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as file:
        content = file.read()

    # Search the raw bytes directly; decoding the whole file to text only to
    # compute offsets would double memory use for no benefit.
    id_start_index = -1
    id_end_index = -1
    trailer_start_index = content.rfind(b'trailer')
    if trailer_start_index != -1:
        id_start_index = content.find(b'/ID', trailer_start_index)
    if id_start_index != -1:
        closing_bracket_index = content.find(b']', id_start_index)
        # Only convert to an exclusive end offset once the bracket is known
        # to exist; adding 1 to a -1 "not found" result would yield 0 and
        # make the missing-bracket case indistinguishable from a real match.
        if closing_bracket_index != -1:
            id_end_index = closing_bracket_index + 1

    hasher = hashlib.sha256()
    if id_end_index != -1:
        if verbose:
            excluded_content = content[id_start_index:id_end_index]
            print(f"Excluded from hash in '{file_path}':")
            print(excluded_content.decode('latin-1', errors='ignore'))
        hasher.update(content[:id_start_index])
        hasher.update(content[id_end_index:])
    else:
        # Also reached when /ID exists but is never closed by ']'.
        hasher.update(content)
        if verbose:
            print(f"No /ID found to exclude in '{file_path}'. Hashing entire file.")
    return hasher.hexdigest()
def main():
    """Parse the command line, hash the requested file, and print the result.

    Accepts one positional ``file_path`` argument and an optional
    ``-v``/``--verbose`` flag, then prints the digest followed by the path.
    """
    cli = argparse.ArgumentParser(
        description="Hash a PDF file excluding its /ID entry.",
    )
    cli.add_argument(
        "file_path",
        help="Path to the PDF file to be hashed.",
    )
    cli.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Print the parts of the file that were excluded from the hash.",
    )
    args = cli.parse_args()

    hash_value = hash_file_exclude_id(
        args.file_path,
        verbose=args.verbose,
    )
    print(f"{hash_value} {args.file_path}")
# Run the command-line interface only when this file is executed as a script,
# so importing the module does not trigger argument parsing.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment