RhetTbull/noterescue.py

## noterescue.py
"""Rescue lost Apple Notes notes, reference: https://fosstodon.org/@Cyberneticist@hachyderm.io/109992039449199371

To run this:

1. Install python (I recommend the latest version of python 3 from python.org)
2. Install the dependencies: python3 -m pip install bpylist2
3. Run the script: python3 noterescue.py <path to note file(s)>

The extracted files will be saved in the parent directory of the note file.

The files will be named like:

    <note name> - <UUID>_media.<extension>
    <note name> - <UUID>_previewImage_0.<extension>

The file ending in _media is the original photo taken by the camera app and the
_previewImage_<N> file is the cropped image that is shown in the notes app. Only
the largest preview of each attachment is saved, so <N> is not always 0.

Recombining these into PDFs is left as an exercise for the reader.
"""

from __future__ import annotations

import argparse
import os
import pathlib
import re
import sys
from dataclasses import dataclass, field

from bpylist2 import archiver
from bpylist2.archive_types import DataclassArchiver

# Leading bytes used by save_attachments() to pick an extension for each attachment
JPEG_START = b"\xff\xd8"  # data starting with these two bytes is saved as .jpeg
PDF_START = b"\x25\x50\x44\x46"  # %PDF; data starting with this is saved as .pdf

# The files are binary plist (bplist) files containing an NSKeyedArchiver object
# which is used to serialize object data on iOS and macOS
# I've reverse engineered the serialized classes which will be created as
# the following classes


@dataclass
class ICDataPersister(DataclassArchiver):
    """Unarchived form of the ``ICDataPersister`` class stored in the note file.

    Field names and their order mirror the serialized class. None of the
    fields has a default, so all five are required by the generated
    ``__init__``.
    """

    # Attachment data keyed by identifier strings such as "<UUID>_media" and
    # "<UUID>_previewImage_<N>"; this is the mapping save_attachments() reads.
    # Annotated with the plain container type: writing field(default_factory=...)
    # in the annotation position does not create a default.
    identifierToDataDictionary: dict
    objectIdentifier: str
    cacheDirectoryURL: str
    accumulatedDataSize: int
    allURLs: list


@dataclass
class ICNotePasteboardData(DataclassArchiver):
    """Note data object; the declared return type of load_note_data_from_plist().

    save_attachments() reaches the attachment data through ``dataPersister``.
    """
    attributedStringData: bytes  # not used by save_attachments()
    dataPersister: ICDataPersister  # its identifierToDataDictionary holds the attachment data


@dataclass
class NSURL(DataclassArchiver):
    """Stand-in for archived ``NSURL`` objects, registered with the archiver's class map.

    Neither field is read by the code in this file.
    """
    # NOTE(review): both fields are annotated ``str``; the actual archived value
    # types are not visible here -- confirm against a sample note file.
    NSrelative: str
    NSbase: str


# Tell the archiver which Python class to build for each archived class name.
# One update_class_map() call per class, in declaration order.
for _archived_name, _python_class in (
    ("ICDataPersister", ICDataPersister),
    ("ICNotePasteboardData", ICNotePasteboardData),
    ("NSURL", NSURL),
):
    archiver.update_class_map({_archived_name: _python_class})
del _archived_name, _python_class  # do not leave loop variables in the module namespace


def main():
    """Parse the command line and extract the attachments of every note file given.

    With no file arguments the usage text is printed and the process exits
    with status 0. Otherwise each file is unarchived and passed to
    save_attachments() with the file's stem as the note name and the file's
    parent directory as the destination.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("files", nargs="*")
    note_files = arg_parser.parse_args().files

    # Nothing to process: show the help text and leave without an error status.
    if not note_files:
        usage()
        sys.exit(0)

    print(f"Processing {len(note_files)} files")
    for note_path in map(pathlib.Path, note_files):
        print(f"Processing {note_path}")
        note_data = load_note_data_from_plist(note_path)
        saved = save_attachments(note_path.stem, note_path.parent, note_data)
        print(f"Extracted ({len(saved)}) attachments")


def usage():
    """Print usage information.

    The second line tells the user where output goes: main() passes each note
    file's parent directory to save_attachments(), so attachments are written
    next to the note file rather than in the current working directory.
    """
    print("Usage: python3 noterescue.py <path to note file(s)>")
    print(
        "Extracted attachments will be saved in the same directory as each note file"
    )


def save_attachments(
    name: str, path: str | pathlib.Path, data: ICNotePasteboardData
) -> list[str]:
    """Save the attachments from the note data

    For every ``<identifier>_media`` entry the media itself is saved, plus the
    largest ``previewImage`` entry that shares the same identifier (if any).

    Args:
        name: The name of the note; used as the prefix of each output file name
        path: The directory to save the attachments in
        data: The unarchived note data

    Returns:
        A list of the saved files: media files first, followed by the selected
        previews
    """
    media_suffix = "_media"
    data_dict = data.dataPersister.identifierToDataDictionary

    # Strip only the trailing "_media" so that identifier + "_media" is always
    # a key that really exists in data_dict (a key such as "multimedia" is not
    # a media entry and must not be turned into a lookup for "multimedia_media").
    media_keys = [
        key[: -len(media_suffix)] for key in data_dict if key.endswith(media_suffix)
    ]
    print(f"Found {len(media_keys)} media attachments: {media_keys=}")

    # which attachments do we want to save?
    keys_to_save = [f"{key}{media_suffix}" for key in media_keys]

    # find largest preview
    # we want the full size image, not the thumbnail but the order
    # (previewImage_0, previewImage_1, ...) is not deterministic
    known_identifiers = set(media_keys)
    preview_keys = {}  # identifier -> (key of the largest preview so far, its size)
    for key, value in data_dict.items():
        if "previewImage" not in key:
            continue
        # the identifier is everything before the first underscore
        key_base = re.sub(r"_.*$", "", key)
        if key_base not in known_identifiers:
            continue
        size = len(value)
        # strict ">" keeps the first preview seen when two have the same size
        if key_base not in preview_keys or size > preview_keys[key_base][1]:
            preview_keys[key_base] = (key, size)
    keys_to_save.extend(key for key, _ in preview_keys.values())

    # save the attachments
    saved_files = []
    for key in keys_to_save:
        value = data_dict[key]

        if value.startswith(JPEG_START):
            # all the samples I've seen are JPEGs but also check for PDF
            ext = "jpeg"
        elif value.startswith(PDF_START):
            ext = "pdf"
        else:
            print(f"Unknown file type for {key} ({len(value)} bytes)", file=sys.stderr)
            ext = "bin"

        # never overwrite an existing file; pick the next free "name (N).ext"
        output_file = increment_filename(pathlib.Path(path) / f"{name} - {key}.{ext}")
        with open(output_file, "wb") as f:
            print(f"Writing {output_file} ({len(value)} bytes)")
            f.write(value)
        saved_files.append(output_file)
    return saved_files


def increment_filename(path: str | pathlib.Path) -> str:
    """If filename exists, increment until it doesn't

    Args:
        path: The desired file path

    Returns:
        ``path`` as a string if nothing exists there; otherwise the first free
        path of the form ``<base> (N)<ext>``. When the base already ends in
        " (N)", counting continues from N + 1 instead of appending a second
        counter.
    """
    candidate = str(path)
    if not os.path.exists(candidate):
        return candidate

    # Split off the extension. A path without one has an empty suffix, and
    # slicing with [:-0] would empty the base, so treat that case explicitly.
    ext = pathlib.Path(candidate).suffix
    if ext and candidate.endswith(ext):
        base = candidate[: -len(ext)]
    else:
        base, ext = candidate, ""

    if match := re.search(r"\s\((\d+)\)$", base):
        # continue from the existing counter
        counter = int(match[1]) + 1
        base = base[: match.start()]
    else:
        counter = 1

    # Loop instead of recursing so a long run of existing files cannot hit the
    # interpreter's recursion limit.
    while True:
        candidate = f"{base} ({counter}){ext}"
        if not os.path.exists(candidate):
            return candidate
        counter += 1


def load_note_data_from_plist(path: str) -> ICNotePasteboardData:
    """Read the note file at ``path`` and return the object the archiver decodes from it.

    Args:
        path: Location of the note file; it is opened in binary mode

    Returns:
        The result of ``archiver.unarchive`` on the file's full contents
    """
    with open(path, "rb") as plist_file:
        raw_plist = plist_file.read()
    return archiver.unarchive(raw_plist)


# Only run the command-line entry point when executed as a script, not on import
if __name__ == "__main__":
    main()
	"""Rescue lost Apple Notes notes, reference: https://fosstodon.org/@Cyberneticist@hachyderm.io/109992039449199371

	To run this:

	1. Install python (I recommend the latest version of python 3 from python.org)
	2. Install the dependencies: python3 -m pip install bpylist2
	3. Run the script: python3 noterescue.py <path to note file(s)>

	The extracted files will be saved in the parent directory of the note file.

	The files will be named like:

	<note name> - <UUID>_media.<extension>
	<note name> - <UUID>_previewImage_0.<extension>

	The file ending in _media is the original photo taken by the camera app and the
	_previewImage_<N> file is the cropped image that is shown in the notes app. Only
	the largest preview of each attachment is saved, so <N> is not always 0.

	Recombining these into PDFs is left as an exercise for the reader.
	"""

	from __future__ import annotations

	import argparse
	import os
	import pathlib
	import re
	import sys
	from dataclasses import dataclass, field

	from bpylist2 import archiver
	from bpylist2.archive_types import DataclassArchiver

	# Leading bytes used by save_attachments() to pick an extension for each attachment
	JPEG_START = b"\xff\xd8"  # data starting with these two bytes is saved as .jpeg
	PDF_START = b"\x25\x50\x44\x46" # %PDF; data starting with this is saved as .pdf

	# The files are binary plist (bplist) files containing an NSKeyedArchiver object
	# which is used to serialize object data on iOS and macOS
	# I've reverse engineered the serialized classes which will be created as
	# the following classes


	@dataclass
	class ICDataPersister(DataclassArchiver):
		"""Unarchived form of the ``ICDataPersister`` class stored in the note file.

		Field names and their order mirror the serialized class. None of the
		fields has a default, so all five are required by the generated
		``__init__``.
		"""

		# Attachment data keyed by identifier strings such as "<UUID>_media" and
		# "<UUID>_previewImage_<N>"; this is the mapping save_attachments() reads.
		# Annotated with the plain container type: writing field(default_factory=...)
		# in the annotation position does not create a default.
		identifierToDataDictionary: dict
		objectIdentifier: str
		cacheDirectoryURL: str
		accumulatedDataSize: int
		allURLs: list


	@dataclass
	class ICNotePasteboardData(DataclassArchiver):
		"""Note data object; the declared return type of load_note_data_from_plist().

		save_attachments() reaches the attachment data through ``dataPersister``.
		"""

		attributedStringData: bytes  # not used by save_attachments()
		dataPersister: ICDataPersister  # its identifierToDataDictionary holds the attachment data


	@dataclass
	class NSURL(DataclassArchiver):
		"""Stand-in for archived ``NSURL`` objects, registered with the archiver's class map.

		Neither field is read by the code in this file.
		"""

		# NOTE(review): both fields are annotated ``str``; the actual archived value
		# types are not visible here -- confirm against a sample note file.
		NSrelative: str
		NSbase: str


	# Tell the archiver which Python class to build for each archived class name.
	# One update_class_map() call per class, in declaration order.
	for _archived_name, _python_class in (
		("ICDataPersister", ICDataPersister),
		("ICNotePasteboardData", ICNotePasteboardData),
		("NSURL", NSURL),
	):
		archiver.update_class_map({_archived_name: _python_class})
	del _archived_name, _python_class  # do not leave loop variables in the module namespace


	def main():
		"""Parse the command line and extract the attachments of every note file given.

		With no file arguments the usage text is printed and the process exits
		with status 0. Otherwise each file is unarchived and passed to
		save_attachments() with the file's stem as the note name and the file's
		parent directory as the destination.
		"""
		parser = argparse.ArgumentParser()
		parser.add_argument("files", nargs="*")
		files = parser.parse_args().files

		# Nothing to process: show the help text and leave without an error status.
		if not files:
			usage()
			sys.exit(0)

		print(f"Processing {len(files)} files")
		for path in files:
			path = pathlib.Path(path)
			print(f"Processing {path}")
			data = load_note_data_from_plist(path)
			attachments = save_attachments(path.stem, path.parent, data)
			print(f"Extracted ({len(attachments)}) attachments")


	def usage():
		"""Print usage information.

		The second line tells the user where output goes: main() passes each note
		file's parent directory to save_attachments(), so attachments are written
		next to the note file rather than in the current working directory.
		"""
		print("Usage: python3 noterescue.py <path to note file(s)>")
		print(
			"Extracted attachments will be saved in the same directory as each note file"
		)


	def save_attachments(
		name: str, path: str | pathlib.Path, data: ICNotePasteboardData
	) -> list[str]:
		"""Save the attachments from the note data

		For every ``<identifier>_media`` entry the media itself is saved, plus the
		largest ``previewImage`` entry that shares the same identifier (if any).

		Args:
			name: The name of the note; used as the prefix of each output file name
			path: The directory to save the attachments in
			data: The unarchived note data

		Returns:
			A list of the saved files: media files first, followed by the selected
			previews
		"""
		media_suffix = "_media"
		data_dict = data.dataPersister.identifierToDataDictionary

		# Strip only the trailing "_media" so that identifier + "_media" is always
		# a key that really exists in data_dict (a key such as "multimedia" is not
		# a media entry and must not be turned into a lookup for "multimedia_media").
		media_keys = [
			key[: -len(media_suffix)] for key in data_dict if key.endswith(media_suffix)
		]
		print(f"Found {len(media_keys)} media attachments: {media_keys=}")

		# which attachments do we want to save?
		keys_to_save = [f"{key}{media_suffix}" for key in media_keys]

		# find largest preview
		# we want the full size image, not the thumbnail but the order
		# (previewImage_0, previewImage_1, ...) is not deterministic
		known_identifiers = set(media_keys)
		preview_keys = {}  # identifier -> (key of the largest preview so far, its size)
		for key, value in data_dict.items():
			if "previewImage" not in key:
				continue
			# the identifier is everything before the first underscore
			key_base = re.sub(r"_.*$", "", key)
			if key_base not in known_identifiers:
				continue
			size = len(value)
			# strict ">" keeps the first preview seen when two have the same size
			if key_base not in preview_keys or size > preview_keys[key_base][1]:
				preview_keys[key_base] = (key, size)
		keys_to_save.extend(key for key, _ in preview_keys.values())

		# save the attachments
		saved_files = []
		for key in keys_to_save:
			value = data_dict[key]

			if value.startswith(JPEG_START):
				# all the samples I've seen are JPEGs but also check for PDF
				ext = "jpeg"
			elif value.startswith(PDF_START):
				ext = "pdf"
			else:
				print(f"Unknown file type for {key} ({len(value)} bytes)", file=sys.stderr)
				ext = "bin"

			# never overwrite an existing file; pick the next free "name (N).ext"
			output_file = increment_filename(pathlib.Path(path) / f"{name} - {key}.{ext}")
			with open(output_file, "wb") as f:
				print(f"Writing {output_file} ({len(value)} bytes)")
				f.write(value)
			saved_files.append(output_file)
		return saved_files


	def increment_filename(path: str | pathlib.Path) -> str:
		"""If filename exists, increment until it doesn't

		Args:
			path: The desired file path

		Returns:
			``path`` as a string if nothing exists there; otherwise the first free
			path of the form ``<base> (N)<ext>``. When the base already ends in
			" (N)", counting continues from N + 1 instead of appending a second
			counter.
		"""
		candidate = str(path)
		if not os.path.exists(candidate):
			return candidate

		# Split off the extension. A path without one has an empty suffix, and
		# slicing with [:-0] would empty the base, so treat that case explicitly.
		ext = pathlib.Path(candidate).suffix
		if ext and candidate.endswith(ext):
			base = candidate[: -len(ext)]
		else:
			base, ext = candidate, ""

		if match := re.search(r"\s\((\d+)\)$", base):
			# continue from the existing counter
			counter = int(match[1]) + 1
			base = base[: match.start()]
		else:
			counter = 1

		# Loop instead of recursing so a long run of existing files cannot hit the
		# interpreter's recursion limit.
		while True:
			candidate = f"{base} ({counter}){ext}"
			if not os.path.exists(candidate):
				return candidate
			counter += 1


	def load_note_data_from_plist(path: str) -> ICNotePasteboardData:
		"""Read the note file at ``path`` and return the object the archiver decodes from it.

		Args:
			path: Location of the note file; it is opened in binary mode

		Returns:
			The result of ``archiver.unarchive`` on the file's full contents
		"""
		with open(path, "rb") as f:
			data = f.read()
			return archiver.unarchive(data)


	# Only run the command-line entry point when executed as a script, not on import
	if __name__ == "__main__":
		main()