Created
April 20, 2021 02:07
-
-
Save seanbreckenridge/e836ced6f2fdbf2087b4bce69f0b48dc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/my/discord.py b/my/discord.py | |
index 15730f6..6a7e1f6 100644 | |
--- a/my/discord.py | |
+++ b/my/discord.py | |
@@ -3,6 +3,8 @@ Discord Data: messages and events data | |
""" | |
REQUIRES = [ | |
"git+https://github.com/seanbreckenridge/discord_data", | |
+ "mistletoe", | |
+ "urlextract", | |
] | |
@@ -26,23 +28,109 @@ class config(user_config): | |
return Path(cls.export_path).expanduser().absolute() | |
-from typing import Iterator | |
+from typing import Iterator, Optional, Tuple | |
from my.core.common import LazyLogger, Stats, mcachew | |
+from datetime import datetime | |
-from discord_data import merge_messages, merge_activity, Message, Activity | |
+from discord_data import merge_messages, merge_activity, Activity, Message as DMessage | |
+from discord_data.model import Channel | |
+from urlextract import URLExtract | |
+from mistletoe import HTMLRenderer, Document | |
+ | |
+ | |
+renderer = HTMLRenderer() | |
logger = LazyLogger(__name__, level="warning") | |
+def _remove_supression(text: str, first_index: int, second_index: int) -> str: | |
+ # remove char at first index | |
+ text = text[:first_index] + text[first_index + 1 :] | |
+ # offset second index, since we removed a character | |
+ second_index -= 1 | |
+ # remove character at second index | |
+ return text[:second_index] + text[second_index + 1 :] | |
+ | |
+ | |
+def _remove_link_supression( | |
+ content: str, urls: Optional[List[Tuple[str, Tuple[int, int]]]] = None | |
+) -> str: | |
+ # fix content to remove discord link supression if any links had any | |
+ # e.g. this is a supressed link <https://github.com> | |
+ | |
+ if urls is None: | |
+ urls = URLExtract().find_urls(content, get_indices=True) | |
+ | |
+ # need to keep track to we can offset the index in the content to remove | |
+ removed_chars = 0 | |
+ for (url_text, (start_index, end_index)) in urls: | |
+ before_ind = (start_index - 1) - removed_chars | |
+ after_ind = (end_index) - removed_chars | |
+ try: | |
+ if content[before_ind] == "<" and content[after_ind] == ">": | |
+ content = _remove_supression(content, before_ind, after_ind) | |
+ removed_chars += 2 | |
+ except IndexError: # could happen if the url didn't have braces and we hit the end of a string | |
+ continue | |
+ return content | |
+ | |
+ | |
+def test_remove_link_supression() -> None: | |
+ assert _remove_supression("<test>", 0, 5) == "test" | |
+ | |
+ # shouldn't affect this at all | |
+ content = "https://urlextract.readthedocs.io" | |
+ assert _remove_link_supression(content) == content | |
+ | |
+ content = "<https://urlextract.readthedocs.io>" | |
+ expected = content.strip("<").strip(">") | |
+ assert _remove_link_supression(content) == expected | |
+ | |
+ content = "Here is some text <https://urlextract.readthedocs.io>" | |
+ expected = "Here is some text https://urlextract.readthedocs.io" | |
+ assert _remove_link_supression(content) == expected | |
+ | |
+ content = "text <https://urlextract.readthedocs.io> other text" | |
+ expected = "text https://urlextract.readthedocs.io other text" | |
+ assert _remove_link_supression(content) == expected | |
+ | |
+ content = "t <https://urlextract.readthedocs.io> other <github.com> f <sean.fish>" | |
+ expected = "t https://urlextract.readthedocs.io other github.com f sean.fish" | |
+ assert _remove_link_supression(content) == expected | |
+ | |
+ | |
def _cachew_depends_on() -> List[str]:
    # cache key: invalidate the cache whenever the set of export files changes
    return [str(p) for p in config._abs_export_path().iterdir()]
-# reduces time by half, after cache is created | |
# same as discord_data.models.Message, but has the 'html' field; rendered markdown as HTML
@dataclass
class Message:
    message_id: int
    timestamp: datetime
    channel: Channel
    content: str  # raw markdown message content
    # NOTE(review): presumably a serialized/joined representation rather than a
    # list -- mirrors discord_data's field type, confirm against upstream model
    attachments: str
    html: str  # content rendered to HTML (see messages() below)

    # hmm, cant subclass - set helper function manually
    # (borrows the unbound method from discord_data's Message so .link works here)
    link = DMessage.link
+ | |
# reduces time by multiple minutes, after the cache is created
# HTML rendering can take quite a long time for the thousands of messages
@mcachew(depends_on=_cachew_depends_on, logger=logger)
def messages() -> Iterator[Message]:
    """Yield every merged discord message, augmented with rendered HTML."""
    for m in merge_messages(export_dir=config._abs_export_path()):
        # un-suppress links first, then render the markdown to HTML
        rendered = renderer.render(Document(_remove_link_supression(m.content)))
        yield Message(
            message_id=m.message_id,
            timestamp=m.timestamp,
            channel=m.channel,
            content=m.content,
            attachments=m.attachments,
            html=rendered,
        )
@mcachew(depends_on=_cachew_depends_on, logger=logger) | |
diff --git a/tests/test_discord.py b/tests/test_discord.py | |
index b7f2668..1f84c8f 100644 | |
--- a/tests/test_discord.py | |
+++ b/tests/test_discord.py | |
@@ -1,6 +1,7 @@ | |
from more_itertools import ilen | |
from my.discord import messages, activity, Activity | |
+from my.discord import test_remove_link_supression # bring test into scope | |
def test_discord() -> None: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment