Skip to content

Instantly share code, notes, and snippets.

@vzhd1701
Created October 18, 2023 14:24
Show Gist options
  • Save vzhd1701/20a81fc6294382b428a26a01c6cd6cda to your computer and use it in GitHub Desktop.
Save vzhd1701/20a81fc6294382b428a26a01c6cd6cda to your computer and use it in GitHub Desktop.
Parse an ENEX file and find images that don't have a matching resource
import base64
import hashlib
import sys
from collections import defaultdict
from pathlib import Path
from io import BytesIO
from lxml import etree
def iter_xml_string_elements_as_dict(xml_string, tag_name):
    """Yield a dict for each ``tag_name`` element found in an XML string.

    Args:
        xml_string: XML document held in a ``str``.
        tag_name: Tag of the elements to extract.

    Yields:
        Whatever ``iter_xml_stream_elements_as_dict`` yields for the
        in-memory stream built from ``xml_string``.
    """
    # The stream-based parser reads bytes, so encode the text (UTF-8, the
    # ``str.encode`` default) and wrap it in an in-memory binary stream.
    byte_stream = BytesIO(xml_string.encode())
    yield from iter_xml_stream_elements_as_dict(byte_stream, tag_name)
def iter_xml_file_elements_as_dict(xml_file, tag_name):
    """Yield a dict for each ``tag_name`` element found in an XML file.

    Args:
        xml_file: Path of the XML file; it is opened in binary mode.
        tag_name: Tag of the elements to extract.

    Yields:
        Whatever ``iter_xml_stream_elements_as_dict`` yields for the file.
    """
    # The file has to stay open for as long as the generator is being
    # consumed; the ``with`` block closes it once iteration finishes.
    with open(xml_file, mode="rb") as binary_file:
        yield from iter_xml_stream_elements_as_dict(binary_file, tag_name)
def iter_xml_stream_elements_as_dict(stream, tag_name):
    """Yield the dict form of each ``tag_name`` element read from a stream.

    Args:
        stream: Binary stream containing the XML document.
        tag_name: Tag of the elements to extract.

    Yields:
        For every matching element, the value stored under ``tag_name`` in
        the mapping produced by ``_etree_to_dict``.
    """

    def _unwrap_element(element):
        # ``_etree_to_dict`` returns ``{tag: content}``; only the content
        # is of interest to callers.
        return _etree_to_dict(element)[tag_name]

    yield from iter_process_xml_stream_elements(stream, tag_name, _unwrap_element)
def iter_process_xml_stream_elements(stream, tag_name, handler_func):
    """Incrementally parse a stream and yield ``handler_func`` results.

    Args:
        stream: Binary stream containing the XML document.
        tag_name: Tag of the elements to hand to ``handler_func``.
        handler_func: Callable invoked with each completed ``tag_name``
            element; its return value is what gets yielded.

    Yields:
        ``handler_func(element)`` for every element whose end tag has been
        parsed and whose tag equals ``tag_name``.
    """
    parse_events = etree.iterparse(stream, events=("start", "end"), recover=True)

    # The element of the very first event is kept as the root so that the
    # already-processed part of the tree can be discarded later.
    root = next(parse_events)[1]

    for kind, node in parse_events:
        if kind != "end" or node.tag != tag_name:
            continue

        yield handler_func(node)

        # Runs when the consumer asks for the next item: drop everything
        # accumulated under the root so the tree does not keep growing.
        root.clear()
# https://stackoverflow.com/a/10077069/13100286
def _etree_to_dict(t):
    """Convert an element tree node into a nested ``{tag: content}`` dict.

    Content rules:
      * A node with neither children nor attributes maps to its stripped
        text, or to ``None`` when it has no text.
      * Otherwise the node maps to a dict holding its children keyed by
        tag (tags seen more than once become lists), its attributes under
        ``"@name"`` keys and, when non-empty after stripping, its text
        under ``"#text"``.

    Args:
        t: Element exposing ``tag``, ``attrib``, ``text`` and iteration
            over its children.

    Returns:
        A single-key dict mapping ``t.tag`` to the converted content.
    """
    child_nodes = list(t)
    attributes = t.attrib
    raw_text = t.text

    # Plain leaf: its value is just the text (or None when there is none).
    if not child_nodes and not attributes:
        return {t.tag: raw_text.strip() if raw_text else None}

    content = {}

    if child_nodes:
        # Group converted children by tag so repeated tags can become lists.
        grouped = defaultdict(list)
        for child in child_nodes:
            for child_tag, child_value in _etree_to_dict(child).items():
                grouped[child_tag].append(child_value)
        for child_tag, values in grouped.items():
            content[child_tag] = values[0] if len(values) == 1 else values

    for attr_name, attr_value in attributes.items():
        content[f"@{attr_name}"] = attr_value

    # Whitespace-only text between child tags is formatting, not data.
    stripped_text = raw_text.strip() if raw_text else ""
    if stripped_text:
        content["#text"] = stripped_text

    return {t.tag: content}
def _parse_resources(note_raw):
    """Yield the MD5 hex digest of every resource attached to a note.

    Args:
        note_raw: Dict form of a ``<note>`` element. Its ``"resource"``
            entry may be missing or ``None`` (no resources), a single dict
            (one resource) or a list of dicts (several resources).

    Yields:
        The hexadecimal MD5 digest of each resource's base64-decoded
        ``"data"`` body, in document order.

    Raises:
        KeyError: If a resource has no ``"data"`` entry.
    """
    note_resources = note_raw.get("resource") or []

    # A lone <resource> is converted to a dict rather than a one-item list.
    if isinstance(note_resources, dict):
        note_resources = [note_resources]

    for resource in note_resources:
        data = resource["data"]

        # ``_etree_to_dict`` produces a dict with "#text" when <data> has
        # attributes (e.g. encoding="base64"), a plain string when it has
        # none, and omits the text entirely when the element is empty.
        if isinstance(data, dict):
            encoded = data.get("#text", "")
        else:
            encoded = data or ""

        yield hashlib.md5(base64.b64decode(encoded)).hexdigest()
def detect_bad_images(enex_file):
    """Check that every ``en-media`` reference in an ENEX file has a resource.

    For each note, the ``en-media`` elements of the note content are
    collected and their ``hash`` attributes compared with the MD5 digests
    of the note's resources. Progress is printed along the way.

    Args:
        enex_file: Path of the ENEX file to check.

    Raises:
        RuntimeError: On the first ``en-media`` element whose hash matches
            none of the note's resources.
    """
    for note in iter_xml_file_elements_as_dict(enex_file, "note"):
        media_refs = list(iter_xml_string_elements_as_dict(note["content"], "en-media"))

        print(f'Checking note "{note["title"]}"...')

        # Resources are only decoded and hashed when there is something
        # to compare them against.
        if not media_refs:
            continue

        known_hashes = set(_parse_resources(note))

        for media in media_refs:
            media_hash = media["@hash"]
            if media_hash in known_hashes:
                print(f'en-media with {media_hash} hash has a matching resource!')
                continue
            raise RuntimeError(f'en-media with {media_hash} hash DOES NOT HAVE a matching resource!')

    print("All images OK")
if __name__ == "__main__":
    # Without this check a missing argument surfaces as a bare IndexError
    # traceback; ``sys.exit`` with a string prints it to stderr and exits
    # with status 1. Extra arguments are still ignored, as before.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <file.enex>")

    detect_bad_images(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment