# vzhd1701/enex_detect_bad_images.py

## enex_detect_bad_images.py
import base64
import hashlib
import sys
from collections import defaultdict
from pathlib import Path
from io import BytesIO

from lxml import etree


def iter_xml_string_elements_as_dict(xml_string, tag_name):
    """Yield every ``tag_name`` element found in an XML string, as a dict.

    The text is encoded to bytes with ``str.encode()`` (UTF-8 by default),
    wrapped in an in-memory binary stream and handed over to
    ``iter_xml_stream_elements_as_dict``.
    """
    raw_bytes = xml_string.encode()
    buffer = BytesIO(raw_bytes)
    yield from iter_xml_stream_elements_as_dict(buffer, tag_name)


def iter_xml_file_elements_as_dict(xml_file, tag_name):
    """Yield every ``tag_name`` element found in an XML file, as a dict.

    The file is opened in binary mode and stays open for as long as the
    generator is being consumed; it is closed when the generator finishes
    or is closed.
    """
    with open(xml_file, mode="rb") as source:
        elements = iter_xml_stream_elements_as_dict(source, tag_name)
        yield from elements


def iter_xml_stream_elements_as_dict(stream, tag_name):
    """Yield every ``tag_name`` element of a binary XML stream, as a dict.

    Each matching element is converted with ``_etree_to_dict``; only the
    value stored under ``tag_name`` is yielded, not the one-key wrapper.
    """

    def unwrap(element):
        # _etree_to_dict returns {tag: value}; callers only want the value.
        return _etree_to_dict(element)[tag_name]

    yield from iter_process_xml_stream_elements(stream, tag_name, unwrap)


def iter_process_xml_stream_elements(stream, tag_name, handler_func):
    """Incrementally parse ``stream`` and yield ``handler_func(element)``.

    ``handler_func`` is called once for every element whose tag equals
    ``tag_name``, at the moment its "end" event is reported (i.e. when the
    element has been fully parsed).  The parser is created with
    ``recover=True``.
    """
    events = etree.iterparse(stream, events=("start", "end"), recover=True)

    # The very first event is consumed here; its element is kept as the root.
    root = next(events)[1]

    for kind, node in events:
        is_finished_match = kind == "end" and node.tag == tag_name
        if is_finished_match:
            yield handler_func(node)

        # The root is cleared after every event, matched or not -- presumably
        # to keep memory usage bounded on large files.
        root.clear()


# https://stackoverflow.com/a/10077069/13100286
def _etree_to_dict(t):
    """Convert an element tree node into a nested ``{tag: value}`` dict.

    The value depends on what the element contains:

    * no children and no attributes: the stripped text, or ``None`` when the
      element has no text at all;
    * otherwise a dict holding the converted children (a tag that occurs
      several times becomes a list), the attributes under ``"@name"`` keys
      and, when the stripped text is non-empty, the text under ``"#text"``.
    """
    kids = list(t)
    attrs = t.attrib

    if not kids and not attrs:
        # Plain leaf: the value is just its (stripped) text.
        return {t.tag: t.text.strip() if t.text else None}

    # Group converted children by tag so repeated tags can become lists.
    grouped = defaultdict(list)
    for kid in kids:
        for name, converted in _etree_to_dict(kid).items():
            grouped[name].append(converted)

    node = {}
    for name, values in grouped.items():
        node[name] = values if len(values) != 1 else values[0]

    for name, value in attrs.items():
        node[f"@{name}"] = value

    stripped = t.text.strip() if t.text else ""
    if stripped:
        node["#text"] = stripped

    return {t.tag: node}


def _parse_resources(note_raw):
    """Yield the hex MD5 digest of every resource body in a note dict.

    ``note_raw["resource"]`` may be missing (nothing is yielded), a single
    dict, or a list of dicts.  Each resource's ``["data"]["#text"]`` value
    is base64-decoded before hashing.
    """
    resources = note_raw.get("resource", [])

    # A lone resource arrives as a bare dict rather than a one-item list.
    if isinstance(resources, dict):
        resources = (resources,)

    for resource in resources:
        payload = base64.b64decode(resource["data"]["#text"])
        yield hashlib.md5(payload).hexdigest()


def detect_bad_images(enex_file):
    """Check that every ``en-media`` tag in an ENEX file has a resource.

    For each ``note`` element, the ``en-media`` tags found in the note's
    content are compared, by their ``hash`` attribute, against the MD5
    digests of the note's resources.  Progress is printed to stdout.

    Raises:
        RuntimeError: at the first ``en-media`` hash that matches none of
            the note's resources.
    """
    notes = iter_xml_file_elements_as_dict(enex_file, "note")

    for note in notes:
        # The note content is itself an XML document and is parsed separately.
        media_tags = list(iter_xml_string_elements_as_dict(note["content"], "en-media"))

        print(f'Checking note "{note["title"]}"...')

        if not media_tags:
            continue

        known_hashes = set(_parse_resources(note))

        for media in media_tags:
            media_hash = media["@hash"]

            if media_hash not in known_hashes:
                raise RuntimeError(f'en-media with {media_hash} hash DOES NOT HAVE a matching resource!')

            print(f'en-media with {media_hash} hash has a matching resource!')

    print("All images OK")


if __name__ == "__main__":
    # The path of the ENEX file to check is the first command-line argument.
    enex_path = sys.argv[1]
    detect_bad_images(enex_path)
	import base64
	import hashlib
	import sys
	from collections import defaultdict
	from pathlib import Path
	from io import BytesIO

	from lxml import etree


	def iter_xml_string_elements_as_dict(xml_string, tag_name):
	    """Yield every ``tag_name`` element found in an XML string, as a dict.

	    The text is encoded to bytes with ``str.encode()`` (UTF-8 by default),
	    wrapped in an in-memory binary stream and handed over to
	    ``iter_xml_stream_elements_as_dict``.
	    """
	    xml_string = BytesIO(xml_string.encode())
	    yield from iter_xml_stream_elements_as_dict(xml_string, tag_name)


	def iter_xml_file_elements_as_dict(xml_file, tag_name):
	    """Yield every ``tag_name`` element found in an XML file, as a dict.

	    The file is opened in binary mode and stays open for as long as the
	    generator is being consumed.
	    """
	    with open(xml_file, "rb") as f:
	        yield from iter_xml_stream_elements_as_dict(f, tag_name)


	def iter_xml_stream_elements_as_dict(stream, tag_name):
	    """Yield every ``tag_name`` element of a binary XML stream, as a dict.

	    Each matching element is converted with ``_etree_to_dict``; only the
	    value stored under ``tag_name`` is yielded, not the one-key wrapper.
	    """
	    yield from iter_process_xml_stream_elements(stream, tag_name, lambda e: _etree_to_dict(e)[tag_name])


	def iter_process_xml_stream_elements(stream, tag_name, handler_func):
	    """Incrementally parse ``stream`` and yield ``handler_func(element)``.

	    ``handler_func`` is called once for every element whose tag equals
	    ``tag_name``, at the moment its "end" event is reported.  The parser
	    is created with ``recover=True``.
	    """
	    context = etree.iterparse(stream, events=("start", "end"), recover=True)

	    # The very first event is consumed here; its element is kept as the root.
	    _, root = next(context)

	    for event, elem in context:
	        if event == "end" and elem.tag == tag_name:
	            yield handler_func(elem)

	        # The root is cleared after every event, matched or not -- presumably
	        # to keep memory usage bounded on large files.
	        root.clear()


	# https://stackoverflow.com/a/10077069/13100286
	def _etree_to_dict(t):
	    """Convert an element tree node into a nested ``{tag: value}`` dict.

	    The value depends on what the element contains:

	    * no children and no attributes: the stripped text, or ``None`` when
	      the element has no text at all;
	    * otherwise a dict holding the converted children (a tag that occurs
	      several times becomes a list), the attributes under ``"@name"`` keys
	      and, when the stripped text is non-empty, the text under ``"#text"``.
	    """
	    d = {t.tag: {} if t.attrib else None}
	    children = list(t)
	    if children:
	        # Group converted children by tag so repeated tags become lists.
	        dd = defaultdict(list)
	        for dc in map(_etree_to_dict, children):
	            for k, v in dc.items():
	                dd[k].append(v)
	        d = {
	            t.tag: {
	                k: v[0] if len(v) == 1 else v
	                for k, v in dd.items()
	            }
	        }
	    if t.attrib:
	        d[t.tag].update(
	            (f"@{k}", v) for k, v in t.attrib.items()
	        )
	    if t.text:
	        text = t.text.strip()
	        if children or t.attrib:
	            if text:
	                d[t.tag]["#text"] = text
	        else:
	            d[t.tag] = text
	    return d


	def _parse_resources(note_raw):
	    """Yield the hex MD5 digest of every resource body in a note dict.

	    ``note_raw["resource"]`` may be missing (nothing is yielded), a single
	    dict, or a list of dicts.  Each resource's ``["data"]["#text"]`` value
	    is base64-decoded before hashing.
	    """
	    note_resources = note_raw.get("resource", [])

	    # A lone resource arrives as a bare dict rather than a one-item list.
	    if isinstance(note_resources, dict):
	        note_resources = [note_resources]

	    for r in note_resources:
	        r_data = base64.b64decode(r["data"]["#text"])
	        r_md5 = hashlib.md5(r_data).hexdigest()

	        yield r_md5


	def detect_bad_images(enex_file):
	    """Check that every ``en-media`` tag in an ENEX file has a resource.

	    For each ``note`` element, the ``en-media`` tags found in the note's
	    content are compared, by their ``hash`` attribute, against the MD5
	    digests of the note's resources.  Progress is printed to stdout.

	    Raises:
	        RuntimeError: at the first ``en-media`` hash that matches none of
	            the note's resources.
	    """
	    for note in iter_xml_file_elements_as_dict(enex_file, "note"):
	        # The note content is itself an XML document and is parsed separately.
	        note_images = list(iter_xml_string_elements_as_dict(note["content"], "en-media"))

	        print(f'Checking note "{note["title"]}"...')

	        if note_images:
	            resource_hashes = set(_parse_resources(note))

	            for image in note_images:
	                if image["@hash"] not in resource_hashes:
	                    raise RuntimeError(f'en-media with {image["@hash"]} hash DOES NOT HAVE a matching resource!')

	                print(f'en-media with {image["@hash"]} hash has a matching resource!')

	    print("All images OK")


	if __name__ == "__main__":
	    # The path of the ENEX file to check is the first command-line argument.
	    detect_bad_images(sys.argv[1])