Last active
March 15, 2023 23:52
-
-
Save RhetTbull/025615064d2b8d57cbd3fff5e211137b to your computer and use it in GitHub Desktop.
Extract embedded media in Apple Notes notes exported to a bplist file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Rescue lost Apple Notes notes, reference: https://fosstodon.org/@Cyberneticist@hachyderm.io/109992039449199371 | |
To run this: | |
1. Install python (I recommend the latest version of python 3 from python.org) | |
2. Install the dependencies: python3 -m pip install bpylist2 | |
3. Run the script: python3 noterescue.py <path to note file(s)> | |
The extracted files will be saved in the parent directory of the note file. | |
The files will be named like: | |
<note name> - <UUID>_media.<extension> | |
<note name> - <UUID>_previewImage_0.<extension> | |
The file ending in _media is the original photo taken by the camera app and the | |
_previewImage_0 is the cropped image that is shown in the notes app. | |
Recombining these into PDFs is left as an exercise for the reader. | |
""" | |
from __future__ import annotations | |
import argparse | |
import os | |
import pathlib | |
import re | |
import sys | |
from dataclasses import dataclass, field | |
from bpylist2 import archiver | |
from bpylist2.archive_types import DataclassArchiver | |
# File signatures ("magic numbers") used to guess the type of an extracted blob
JPEG_START = b"\xff\xd8"  # first two bytes of a JPEG stream
PDF_START = b"%PDF"  # ASCII header, i.e. bytes 0x25 0x50 0x44 0x46
# The note files are binary plist (bplist) files containing an NSKeyedArchiver
# archive, the format used to serialize object data on iOS and macOS.
# The dataclasses below mirror the serialized classes; their layout was
# reverse engineered.
@dataclass
class ICDataPersister(DataclassArchiver):
    """Python counterpart of the archived ``ICDataPersister`` object.

    ``identifierToDataDictionary`` is the field this script actually uses: it
    maps identifiers such as ``<UUID>_media`` or ``<UUID>_previewImage_0`` to
    the raw bytes of the embedded file.

    NOTE(review): the field names presumably have to match the keys used in
    the archive -- confirm against bpylist2 before renaming any of them.
    """

    # All fields are declared with plain types and no defaults, so the
    # generated __init__ requires all five of them, in this order.
    identifierToDataDictionary: dict
    objectIdentifier: str
    cacheDirectoryURL: str
    accumulatedDataSize: int
    allURLs: list
@dataclass
class ICNotePasteboardData(DataclassArchiver):
    """Python counterpart of the archived ``ICNotePasteboardData`` object.

    This is the type load_note_data_from_plist() is annotated to return and
    that save_attachments() receives as ``data``.
    """

    # Not read anywhere else in this script
    attributedStringData: bytes
    # Holds the embedded attachment data (see save_attachments)
    dataPersister: ICDataPersister
@dataclass
class NSURL(DataclassArchiver):
    """Python counterpart of archived ``NSURL`` objects.

    The class is registered with the archiver under the name "NSURL"; its
    fields are not read anywhere else in this script.
    """

    NSrelative: str
    NSbase: str
# Tell the archiver which Python class to build for each archived class name
archiver.update_class_map(
    {
        "ICDataPersister": ICDataPersister,
        "ICNotePasteboardData": ICNotePasteboardData,
        "NSURL": NSURL,
    }
)
def main():
    """Parse the command line and extract the attachments of every note file.

    When no file is given, the usage text is printed and the process exits
    with status 0. Otherwise each file is loaded, its attachments are saved
    into the directory containing that file, and a per-file count is printed.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("files", nargs="*")
    args = arg_parser.parse_args()

    if not args.files:
        usage()
        sys.exit(0)

    print(f"Processing {len(args.files)} files")
    for note_path in map(pathlib.Path, args.files):
        print(f"Processing {note_path}")
        note_data = load_note_data_from_plist(note_path)
        # The file stem is the note name; output goes next to the note file
        saved = save_attachments(note_path.stem, note_path.parent, note_data)
        print(f"Extracted ({len(saved)}) attachments")
def usage():
    """Print usage information

    main() passes each note file's parent directory as the output location,
    so the message points there rather than at the current working directory.
    """
    print("Usage: python3 noterescue.py <path to note file(s)>")
    print(
        "Extracted attachments will be saved in the same directory as each note file"
    )
def save_attachments(
    name: str, path: str | pathlib.Path, data: ICNotePasteboardData
) -> list[str]:
    """Save the attachments from the note data

    For every attachment identified by an ``<id>_media`` key, the media blob
    and the largest matching ``previewImage`` blob are written to ``path``.
    Files are named ``<name> - <key>.<ext>`` where the extension is guessed
    from the first bytes of the data (``jpeg``, ``pdf`` or ``bin``). Existing
    files are never overwritten; a numbered name is used instead.

    Args:
        name: The name of the note, used as the filename prefix
        path: The directory to save the attachments in
        data: The unarchived note data

    Returns:
        A list of the saved files
    """
    media_suffix = "_media"
    data_dict = data.dataPersister.identifierToDataDictionary

    # Detect and strip the very same suffix so that the "<id>_media" key
    # rebuilt below is guaranteed to exist in data_dict.
    media_keys = [
        key[: -len(media_suffix)] for key in data_dict if key.endswith(media_suffix)
    ]
    print(f"Found {len(media_keys)} media attachments: {media_keys=}")

    # which attachments do we want to save?
    keys_to_save = [f"{key}{media_suffix}" for key in media_keys]

    # find largest preview
    # we want the full size image, not the thumbnail but the order
    # (previewImage_0, previewImage_1, ...) is not deterministic
    known_ids = set(media_keys)
    largest_preview = {}  # attachment id -> (key, size in bytes)
    for key, value in data_dict.items():
        if "previewImage" not in key:
            continue
        # NOTE(review): the id is everything before the first underscore, so
        # an attachment id that itself contains "_" would never match here.
        key_base = re.sub(r"_.*$", "", key)
        if key_base not in known_ids:
            continue
        size = len(value)
        if key_base not in largest_preview or size > largest_preview[key_base][1]:
            largest_preview[key_base] = (key, size)
    keys_to_save.extend(key for key, _ in largest_preview.values())

    # save the attachments
    saved_files = []
    for key in keys_to_save:
        value = data_dict[key]
        if value.startswith(JPEG_START):
            # all the samples I've seen are JPEGs but also check for PDF
            ext = "jpeg"
        elif value.startswith(PDF_START):
            ext = "pdf"
        else:
            print(f"Unknown file type for {key} ({len(value)} bytes)", file=sys.stderr)
            ext = "bin"
        output_file = increment_filename(pathlib.Path(path) / f"{name} - {key}.{ext}")
        with open(output_file, "wb") as f:
            print(f"Writing {output_file} ({len(value)} bytes)")
            f.write(value)
        saved_files.append(output_file)
    return saved_files
def increment_filename(path: str | pathlib.Path) -> str:
    """If filename exists, increment until it doesn't

    A counter in the form `` (N)`` is inserted before the extension, or bumped
    if the name already ends with one, until a path is found that does not
    exist on disk.

    Args:
        path: The desired output path

    Returns:
        ``path`` as a string if nothing exists there, otherwise the first free
        numbered variant, e.g. ``photo (1).jpeg``
    """
    path = str(path)
    ext = pathlib.Path(path).suffix
    # Slice by explicit length: with no extension, path[:-len(ext)] would be
    # path[:-0], i.e. an empty string, and the directory and name would be lost.
    base = path[: len(path) - len(ext)]

    candidate = path
    # Loop instead of recursing so many existing files cannot exhaust the stack
    while os.path.exists(candidate):
        if match := re.search(r"\s\((\d+)\)$", base):
            # increment the number
            num = int(match[1]) + 1
            base = f"{base[:match.start()]} ({num})"
        else:
            base = f"{base} (1)"
        candidate = base + ext
    return candidate
def load_note_data_from_plist(path: str) -> ICNotePasteboardData:
    """Read the file at ``path`` as bytes and return what the archiver decodes.

    Args:
        path: Location of the exported note (bplist) file

    Returns:
        The object produced by ``archiver.unarchive`` for the file contents
    """
    raw_bytes = pathlib.Path(path).read_bytes()
    return archiver.unarchive(raw_bytes)
# Run main() only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment