Skip to content

Instantly share code, notes, and snippets.

@seanbreckenridge
Created April 20, 2021 02:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seanbreckenridge/e836ced6f2fdbf2087b4bce69f0b48dc to your computer and use it in GitHub Desktop.
Save seanbreckenridge/e836ced6f2fdbf2087b4bce69f0b48dc to your computer and use it in GitHub Desktop.
diff --git a/my/discord.py b/my/discord.py
index 15730f6..6a7e1f6 100644
--- a/my/discord.py
+++ b/my/discord.py
@@ -3,6 +3,8 @@ Discord Data: messages and events data
"""
REQUIRES = [
"git+https://github.com/seanbreckenridge/discord_data",
+ "mistletoe",
+ "urlextract",
]
@@ -26,23 +28,109 @@ class config(user_config):
return Path(cls.export_path).expanduser().absolute()
-from typing import Iterator
+from typing import Iterator, Optional, Tuple
from my.core.common import LazyLogger, Stats, mcachew
+from datetime import datetime
-from discord_data import merge_messages, merge_activity, Message, Activity
+from discord_data import merge_messages, merge_activity, Activity, Message as DMessage
+from discord_data.model import Channel
+from urlextract import URLExtract
+from mistletoe import HTMLRenderer, Document
+
+
+renderer = HTMLRenderer()
logger = LazyLogger(__name__, level="warning")
+def _remove_supression(text: str, first_index: int, second_index: int) -> str:
+ # remove char at first index
+ text = text[:first_index] + text[first_index + 1 :]
+ # offset second index, since we removed a character
+ second_index -= 1
+ # remove character at second index
+ return text[:second_index] + text[second_index + 1 :]
+
+
+def _remove_link_supression(
+ content: str, urls: Optional[List[Tuple[str, Tuple[int, int]]]] = None
+) -> str:
+ # fix content to remove discord link supression if any links had any
+ # e.g. this is a supressed link <https://github.com>
+
+ if urls is None:
+ urls = URLExtract().find_urls(content, get_indices=True)
+
+ # need to keep track to we can offset the index in the content to remove
+ removed_chars = 0
+ for (url_text, (start_index, end_index)) in urls:
+ before_ind = (start_index - 1) - removed_chars
+ after_ind = (end_index) - removed_chars
+ try:
+ if content[before_ind] == "<" and content[after_ind] == ">":
+ content = _remove_supression(content, before_ind, after_ind)
+ removed_chars += 2
+ except IndexError: # could happen if the url didn't have braces and we hit the end of a string
+ continue
+ return content
+
+
+def test_remove_link_supression() -> None:
+ assert _remove_supression("<test>", 0, 5) == "test"
+
+ # shouldn't affect this at all
+ content = "https://urlextract.readthedocs.io"
+ assert _remove_link_supression(content) == content
+
+ content = "<https://urlextract.readthedocs.io>"
+ expected = content.strip("<").strip(">")
+ assert _remove_link_supression(content) == expected
+
+ content = "Here is some text <https://urlextract.readthedocs.io>"
+ expected = "Here is some text https://urlextract.readthedocs.io"
+ assert _remove_link_supression(content) == expected
+
+ content = "text <https://urlextract.readthedocs.io> other text"
+ expected = "text https://urlextract.readthedocs.io other text"
+ assert _remove_link_supression(content) == expected
+
+ content = "t <https://urlextract.readthedocs.io> other <github.com> f <sean.fish>"
+ expected = "t https://urlextract.readthedocs.io other github.com f sean.fish"
+ assert _remove_link_supression(content) == expected
+
+
def _cachew_depends_on() -> List[str]:
    """Return the paths inside the export directory, as sorted strings.

    Passed as ``depends_on`` to ``mcachew``. ``Path.iterdir()`` yields entries
    in arbitrary order, so the result is sorted to keep the value identical
    across runs when the directory contents have not changed.
    """
    return sorted(str(path) for path in config._abs_export_path().iterdir())
-# reduces time by half, after cache is created
+# same fields as discord_data's Message (imported here as DMessage), plus an 'html' field: the message's markdown content rendered as HTML
@dataclass
class Message:
    """A Discord message together with its content rendered as HTML.

    Carries the fields copied over from ``discord_data``'s ``Message``
    (imported in this module as ``DMessage``) and adds ``html``.
    """

    message_id: int
    timestamp: datetime
    channel: Channel
    # raw message text, as provided by discord_data
    content: str
    attachments: str
    # ``content`` rendered from markdown to HTML, after link suppression
    # (angle brackets around URLs) has been removed
    html: str

    # hmm, cant subclass - set helper function manually
    # (reuses DMessage's ``link`` attribute on this class)
    link = DMessage.link
+
+
+# reduces time by multiple minutes, after the cache is created
+# HTML rendering can take quite a long time for the thousands of messages
@mcachew(depends_on=_cachew_depends_on, logger=logger)
def messages() -> Iterator[Message]:
    """Yield every merged Discord message with its content rendered as HTML.

    Each message from ``merge_messages`` is copied into this module's
    ``Message``; ``html`` is the markdown content rendered by ``renderer``
    after link suppression has been removed.
    """
    export_dir = config._abs_export_path()
    for raw in merge_messages(export_dir=export_dir):
        # strip <...> around links first so they render as ordinary URLs
        unsuppressed = _remove_link_supression(raw.content)
        rendered = renderer.render(Document(unsuppressed))
        yield Message(
            message_id=raw.message_id,
            timestamp=raw.timestamp,
            channel=raw.channel,
            content=raw.content,
            attachments=raw.attachments,
            html=rendered,
        )
@mcachew(depends_on=_cachew_depends_on, logger=logger)
diff --git a/tests/test_discord.py b/tests/test_discord.py
index b7f2668..1f84c8f 100644
--- a/tests/test_discord.py
+++ b/tests/test_discord.py
@@ -1,6 +1,7 @@
from more_itertools import ilen
from my.discord import messages, activity, Activity
+from my.discord import test_remove_link_supression # bring test into scope
def test_discord() -> None:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment