Created
October 18, 2023 14:24
-
-
Save vzhd1701/20a81fc6294382b428a26a01c6cd6cda to your computer and use it in GitHub Desktop.
Parse an ENEX file and find images that don't have a matching resource
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64 | |
import hashlib | |
import sys | |
from collections import defaultdict | |
from pathlib import Path | |
from io import BytesIO | |
from lxml import etree | |
def iter_xml_string_elements_as_dict(xml_string, tag_name):
    """Yield every ``tag_name`` element found in an XML string as a dict.

    The text is encoded to bytes and wrapped in an in-memory stream so that
    the stream-based parser can consume it.
    """
    yield from iter_xml_stream_elements_as_dict(
        BytesIO(xml_string.encode()), tag_name
    )
def iter_xml_file_elements_as_dict(xml_file, tag_name):
    """Yield every ``tag_name`` element of the XML file *xml_file* as a dict.

    The file is opened in binary mode and closed once the generator
    finishes or is closed.
    """
    with open(xml_file, "rb") as stream:
        for element in iter_xml_stream_elements_as_dict(stream, tag_name):
            yield element
def iter_xml_stream_elements_as_dict(stream, tag_name):
    """Yield every ``tag_name`` element read from *stream* as a dict.

    Each matching element is converted with ``_etree_to_dict`` and the
    outer ``{tag_name: ...}`` wrapper is removed, so only the element's own
    value is yielded.
    """

    def unwrap(element):
        return _etree_to_dict(element)[tag_name]

    yield from iter_process_xml_stream_elements(stream, tag_name, unwrap)
def iter_process_xml_stream_elements(stream, tag_name, handler_func):
    """Incrementally parse *stream* and yield ``handler_func(elem)`` per match.

    Args:
        stream: File-like object with the XML document (callers in this file
            pass binary streams).
        tag_name: Tag of the elements to hand to ``handler_func``.
        handler_func: Callable invoked with each completed matching element;
            whatever it returns is yielded.

    Yields nothing when the parser produces no events at all, instead of
    letting ``StopIteration`` escape from ``next()`` inside this generator,
    which Python would convert into ``RuntimeError`` (PEP 479).
    """
    context = etree.iterparse(stream, events=("start", "end"), recover=True)

    # The first event belongs to the root element; keep a handle on it so
    # that already-processed content can be dropped from the tree later.
    first_event = next(context, None)
    if first_event is None:
        return
    _, root = first_event

    for event, elem in context:
        if event == "end" and elem.tag == tag_name:
            yield handler_func(elem)

            # Discard what has been parsed so far once the consumer is done
            # with the element, so the tree does not keep growing.
            root.clear()
# https://stackoverflow.com/a/10077069/13100286
def _etree_to_dict(t):
    """Convert an element tree node into a nested ``{tag: value}`` dict.

    The value is:
      * ``None`` for an element with no children, attributes or text;
      * the stripped text for an element that has only text;
      * otherwise a dict mapping child tags to their converted values
        (a list when a tag repeats), attributes under ``"@name"`` keys and
        non-empty stripped text under ``"#text"``.
    """
    child_nodes = list(t)
    has_attrs = bool(t.attrib)

    if child_nodes:
        # Group converted children by tag so repeated tags become lists.
        grouped = defaultdict(list)
        for child in child_nodes:
            for key, value in _etree_to_dict(child).items():
                grouped[key].append(value)
        content = {
            key: values[0] if len(values) == 1 else values
            for key, values in grouped.items()
        }
    elif has_attrs:
        content = {}
    else:
        content = None

    if has_attrs:
        for name, value in t.attrib.items():
            content[f"@{name}"] = value

    if t.text:
        stripped = t.text.strip()
        if not (child_nodes or has_attrs):
            # Text-only element: the value is the text itself, even if empty.
            content = stripped
        elif stripped:
            content["#text"] = stripped

    return {t.tag: content}
def _parse_resources(note_raw):
    """Yield the MD5 hex digest of every resource attached to a note.

    Args:
        note_raw: Note dict as produced by ``_etree_to_dict``. Its optional
            ``"resource"`` entry is either a single dict or a list of dicts,
            each with a ``"data"`` entry holding base64 text.

    Yields:
        The hexadecimal MD5 digest of each resource's decoded data, in
        document order.
    """
    # A missing or empty <resource> entry means there is nothing to hash.
    note_resources = note_raw.get("resource") or []
    if isinstance(note_resources, dict):
        note_resources = [note_resources]

    for resource in note_resources:
        data = resource["data"]
        # <data> is a dict with the payload under "#text" when the element
        # has attributes, and a plain string (or None when empty) otherwise.
        if isinstance(data, dict):
            encoded = data.get("#text", "")
        else:
            encoded = data or ""
        yield hashlib.md5(base64.b64decode(encoded)).hexdigest()
def detect_bad_images(enex_file):
    """Check that every ``en-media`` tag in an ENEX file has a resource.

    For each note, the ``hash`` attribute of every ``en-media`` element in
    the note content is compared against the MD5 digests of the note's
    resources. Progress is printed to stdout.

    Args:
        enex_file: Path of the ENEX file to check.

    Raises:
        RuntimeError: On the first ``en-media`` whose hash matches none of
            the note's resources.
    """
    for note in iter_xml_file_elements_as_dict(enex_file, "note"):
        # Announce the note first so that a failure while parsing its
        # content can be attributed to it.
        print(f'Checking note "{note.get("title")}"...')

        # An empty <content> element is converted to None (and may be
        # absent altogether); such a note has no images to verify.
        content = note.get("content")
        if not content:
            continue

        note_images = list(iter_xml_string_elements_as_dict(content, "en-media"))
        if not note_images:
            continue

        resource_hashes = set(_parse_resources(note))

        for image in note_images:
            image_hash = image["@hash"]
            if image_hash not in resource_hashes:
                raise RuntimeError(
                    f"en-media with {image_hash} hash DOES NOT HAVE a matching resource!"
                )

            print(f"en-media with {image_hash} hash has a matching resource!")

    print("All images OK")
if __name__ == "__main__":
    # The first command-line argument is the path of the ENEX file to check;
    # exit with a usage message instead of an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <file.enex>")

    detect_bad_images(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment