alexpovel/main.py

## __main__.py
import logging
import subprocess as sp
import typing as t

import win32clipboard as wcb  # `pip install pywin32`

# Log everything, including the verbose output of the `srgn` subprocess.
logging.basicConfig(level=logging.DEBUG)

# MS Teams and MS Office both put rich text on the clipboard in the registered
# "HTML Format" (CF_HTML). IDs of *registered* formats are handed out by the system
# at runtime from the range 0xC000-0xFFFF, so they are not guaranteed to be identical
# across machines or sessions (49309 was the value observed manually via
# "InsideClipboard"). Look the ID up by name instead of hard-coding it; registering
# a name that already exists simply returns its existing ID.
MS_TEAMS_HTML_FORMAT_ID = wcb.RegisterClipboardFormat("HTML Format")
MS_OFFICE_HTML_FORMAT_ID = MS_TEAMS_HTML_FORMAT_ID


def collect_contents(formats: t.Iterable[int]) -> t.Iterable[tuple[int, bytes]]:
    """Collect clipboard contents for all available formats.

    The clipboard must already be open. `CF_UNICODETEXT` and `CF_TEXT` are always
    queried in addition to `formats`; duplicates are ignored, and formats that are
    not currently on the clipboard are skipped with a warning.

    Args:
        formats: Clipboard format IDs to read.

    Yields:
        `(format, contents)` pairs, where `contents` is always UTF-8 encoded.

    Raises:
        UnicodeDecodeError: If binary clipboard data cannot be decoded.
    """

    baseline = {wcb.CF_UNICODETEXT, wcb.CF_TEXT}

    for fmt in set(formats) | baseline:
        if not wcb.IsClipboardFormatAvailable(fmt):
            logging.warning("Clipboard format %s not available, skipping.", fmt)
            continue

        try:
            name = wcb.GetClipboardFormatName(fmt)
        except Exception:  # Predefined formats (`CF_TEXT` etc.) have no name 🤷
            name = "<unknown>"

        logging.info("Will use clipboard format %s ('%s')", fmt, name)

        # Original type annotation seems incorrect; this can be either
        raw_contents = t.cast(str | bytes, wcb.GetClipboardData(fmt))

        if isinstance(raw_contents, str):
            contents = raw_contents.encode("utf8")
        elif fmt == wcb.CF_TEXT:
            # `CF_TEXT` is encoded in the system's ANSI code page, not UTF-8. Guessing
            # would fail (or produce garbage) for any non-ASCII character, e.g. an
            # "ü" already present in the text.
            contents = raw_contents.decode("mbcs").encode("utf8")
        else:
            try:
                raw_contents.decode("utf8")
            except UnicodeDecodeError:
                try:
                    text = raw_contents.decode("utf16")
                except UnicodeDecodeError:
                    logging.error(
                        "Clipboard contents are neither UTF-8 nor UTF-16, "
                        + "cannot continue"
                    )
                    raise

                logging.info("Clipboard contents are UTF-16, converting to UTF-8")
                contents = text.encode("utf8")
            else:
                # Already valid UTF-8, pass through untouched.
                contents = raw_contents

        logging.info("Raw clipboard data (length of %d):", len(contents))
        logging.info(contents)

        yield fmt, contents


def set_contents(formats_contents: dict[int, bytes]) -> None:
    """Set clipboard contents for all passed formats.

    Each payload is piped through `srgn --german` and the result is placed on the
    clipboard under the same format. The clipboard must already be open (and
    emptied) by the caller.

    Args:
        formats_contents: Mapping of clipboard format ID to UTF-8 encoded contents.

    Raises:
        subprocess.CalledProcessError: If `srgn` exits with a non-zero status.
        ValueError: If `srgn` changed the byte length of the contents.
    """

    for fmt, contents in formats_contents.items():
        try:
            srgn = sp.run(
                ["srgn", "--german", "-vvv"],
                input=contents,
                check=True,
                capture_output=True,
            )
        except sp.CalledProcessError as e:
            logging.error(e.stderr)
            raise

        logging.debug("srgn stderr:")
        logging.debug(srgn.stderr)

        out = srgn.stdout
        logging.info("Raw srgn output:")
        logging.info(out)

        # An example output of copying from MS Teams, and its format with ID 49309
        # is (view with "InsideClipboard"):
        #
        # ```
        # Version:0.9
        # StartHTML:0000000105
        # EndHTML:0000000192
        # StartFragment:0000000141
        # EndFragment:0000000156
        # <html>
        # <body>
        # <!--StartFragment--><em>Hello </em><!--EndFragment-->
        # </body>
        # </html>
        # ```
        #
        # The assumption made here is that we do not need to edit the preamble byte
        # offset definitions, as we always only replace two bytes by two others:
        # "ue" with "ü" (0xC3BC), etc. So lengths don't change!
        #
        # This is checked explicitly rather than via `assert`, which would be
        # stripped when running with `-O` and then silently corrupt the offsets.
        if len(out) != len(contents):
            raise ValueError(
                "srgn output is not the same length as input, "
                + "required for HTML byte offsets to be correct"
            )

        data: str | bytes
        if fmt == wcb.CF_UNICODETEXT:
            # Hand over a `str` and let pywin32 do the conversion. Encoding with
            # "utf16" ourselves would prepend a BOM, which ends up as an invisible
            # U+FEFF character at the start of the pasted text.
            data = out.decode("utf8")
        elif fmt == wcb.CF_TEXT:
            # `CF_TEXT` consumers expect the ANSI code page, not UTF-8.
            data = out.decode("utf8").encode("mbcs", errors="replace")
        else:
            data = out

        wcb.SetClipboardData(fmt, data)


def main() -> None:
    """Read the clipboard, run its contents through `srgn`, and write them back.

    Raises:
        Whatever `collect_contents`, `set_contents` or the clipboard API raise; the
        clipboard is closed again in any case once it was opened successfully.
    """
    # Open *before* entering `try`: if opening fails there is nothing to close, and
    # a failing `CloseClipboard` in `finally` would mask the original error.
    wcb.OpenClipboard()

    try:
        formats_contents = dict(
            collect_contents([MS_TEAMS_HTML_FORMAT_ID, MS_OFFICE_HTML_FORMAT_ID])
        )

        # Doesn't work if not cleared properly 🤷 See also
        # https://learn.microsoft.com/en-us/windows/win32/dataxchg/using-the-clipboard#copying-information-to-the-clipboard
        wcb.EmptyClipboard()

        set_contents(formats_contents)
    finally:
        wcb.CloseClipboard()


# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()
	import logging
	import subprocess as sp
	import typing as t

	import win32clipboard as wcb # `pip install pywin32`

	# Log everything, including the verbose output of the `srgn` subprocess.
	logging.basicConfig(level=logging.DEBUG)

	# MS Teams and MS Office both put rich text on the clipboard in the registered
	# "HTML Format" (CF_HTML). IDs of *registered* formats are handed out by the system
	# at runtime from the range 0xC000-0xFFFF, so they are not guaranteed to be identical
	# across machines or sessions (49309 was the value observed manually via
	# "InsideClipboard"). Look the ID up by name instead of hard-coding it; registering
	# a name that already exists simply returns its existing ID.
	MS_TEAMS_HTML_FORMAT_ID = wcb.RegisterClipboardFormat("HTML Format")
	MS_OFFICE_HTML_FORMAT_ID = MS_TEAMS_HTML_FORMAT_ID


	def collect_contents(formats: t.Iterable[int]) -> t.Iterable[tuple[int, bytes]]:
	    """Collect clipboard contents for all available formats.

	    The clipboard must already be open. `CF_UNICODETEXT` and `CF_TEXT` are always
	    queried in addition to `formats`; duplicates are ignored, and formats that are
	    not currently on the clipboard are skipped with a warning.

	    Args:
	        formats: Clipboard format IDs to read.

	    Yields:
	        `(format, contents)` pairs, where `contents` is always UTF-8 encoded.

	    Raises:
	        UnicodeDecodeError: If binary clipboard data cannot be decoded.
	    """

	    baseline = {wcb.CF_UNICODETEXT, wcb.CF_TEXT}

	    for fmt in set(formats) | baseline:
	        if not wcb.IsClipboardFormatAvailable(fmt):
	            logging.warning("Clipboard format %s not available, skipping.", fmt)
	            continue

	        try:
	            name = wcb.GetClipboardFormatName(fmt)
	        except Exception:  # Predefined formats (`CF_TEXT` etc.) have no name 🤷
	            name = "<unknown>"

	        logging.info("Will use clipboard format %s ('%s')", fmt, name)

	        # Original type annotation seems incorrect; this can be either
	        raw_contents = t.cast(str | bytes, wcb.GetClipboardData(fmt))

	        if isinstance(raw_contents, str):
	            contents = raw_contents.encode("utf8")
	        elif fmt == wcb.CF_TEXT:
	            # `CF_TEXT` is encoded in the system's ANSI code page, not UTF-8. Guessing
	            # would fail (or produce garbage) for any non-ASCII character, e.g. an
	            # "ü" already present in the text.
	            contents = raw_contents.decode("mbcs").encode("utf8")
	        else:
	            try:
	                raw_contents.decode("utf8")
	            except UnicodeDecodeError:
	                try:
	                    text = raw_contents.decode("utf16")
	                except UnicodeDecodeError:
	                    logging.error(
	                        "Clipboard contents are neither UTF-8 nor UTF-16, "
	                        + "cannot continue"
	                    )
	                    raise

	                logging.info("Clipboard contents are UTF-16, converting to UTF-8")
	                contents = text.encode("utf8")
	            else:
	                # Already valid UTF-8, pass through untouched.
	                contents = raw_contents

	        logging.info("Raw clipboard data (length of %d):", len(contents))
	        logging.info(contents)

	        yield fmt, contents


	def set_contents(formats_contents: dict[int, bytes]) -> None:
	    """Set clipboard contents for all passed formats.

	    Each payload is piped through `srgn --german` and the result is placed on the
	    clipboard under the same format. The clipboard must already be open (and
	    emptied) by the caller.

	    Args:
	        formats_contents: Mapping of clipboard format ID to UTF-8 encoded contents.

	    Raises:
	        subprocess.CalledProcessError: If `srgn` exits with a non-zero status.
	        ValueError: If `srgn` changed the byte length of the contents.
	    """

	    for fmt, contents in formats_contents.items():
	        try:
	            srgn = sp.run(
	                ["srgn", "--german", "-vvv"],
	                input=contents,
	                check=True,
	                capture_output=True,
	            )
	        except sp.CalledProcessError as e:
	            logging.error(e.stderr)
	            raise

	        logging.debug("srgn stderr:")
	        logging.debug(srgn.stderr)

	        out = srgn.stdout
	        logging.info("Raw srgn output:")
	        logging.info(out)

	        # An example output of copying from MS Teams, and its format with ID 49309
	        # is (view with "InsideClipboard"):
	        #
	        # ```
	        # Version:0.9
	        # StartHTML:0000000105
	        # EndHTML:0000000192
	        # StartFragment:0000000141
	        # EndFragment:0000000156
	        # <html>
	        # <body>
	        # <!--StartFragment--><em>Hello </em><!--EndFragment-->
	        # </body>
	        # </html>
	        # ```
	        #
	        # The assumption made here is that we do not need to edit the preamble byte
	        # offset definitions, as we always only replace two bytes by two others:
	        # "ue" with "ü" (0xC3BC), etc. So lengths don't change!
	        #
	        # This is checked explicitly rather than via `assert`, which would be
	        # stripped when running with `-O` and then silently corrupt the offsets.
	        if len(out) != len(contents):
	            raise ValueError(
	                "srgn output is not the same length as input, "
	                + "required for HTML byte offsets to be correct"
	            )

	        data: str | bytes
	        if fmt == wcb.CF_UNICODETEXT:
	            # Hand over a `str` and let pywin32 do the conversion. Encoding with
	            # "utf16" ourselves would prepend a BOM, which ends up as an invisible
	            # U+FEFF character at the start of the pasted text.
	            data = out.decode("utf8")
	        elif fmt == wcb.CF_TEXT:
	            # `CF_TEXT` consumers expect the ANSI code page, not UTF-8.
	            data = out.decode("utf8").encode("mbcs", errors="replace")
	        else:
	            data = out

	        wcb.SetClipboardData(fmt, data)


	def main() -> None:
	    """Read the clipboard, run its contents through `srgn`, and write them back.

	    Raises:
	        Whatever `collect_contents`, `set_contents` or the clipboard API raise; the
	        clipboard is closed again in any case once it was opened successfully.
	    """
	    # Open *before* entering `try`: if opening fails there is nothing to close, and
	    # a failing `CloseClipboard` in `finally` would mask the original error.
	    wcb.OpenClipboard()

	    try:
	        formats_contents = dict(
	            collect_contents([MS_TEAMS_HTML_FORMAT_ID, MS_OFFICE_HTML_FORMAT_ID])
	        )

	        # Doesn't work if not cleared properly 🤷 See also
	        # https://learn.microsoft.com/en-us/windows/win32/dataxchg/using-the-clipboard#copying-information-to-the-clipboard
	        wcb.EmptyClipboard()

	        set_contents(formats_contents)
	    finally:
	        wcb.CloseClipboard()


	# Script entry point: only run when executed directly, not when imported.
	if __name__ == "__main__":
	    main()