Created
April 20, 2021 02:07
-
-
Save seanbreckenridge/e836ced6f2fdbf2087b4bce69f0b48dc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/my/discord.py b/my/discord.py | |
index 15730f6..6a7e1f6 100644 | |
--- a/my/discord.py | |
+++ b/my/discord.py | |
@@ -3,6 +3,8 @@ Discord Data: messages and events data | |
""" | |
REQUIRES = [ | |
"git+https://github.com/seanbreckenridge/discord_data", | |
+ "mistletoe", | |
+ "urlextract", | |
] | |
@@ -26,23 +28,109 @@ class config(user_config): | |
return Path(cls.export_path).expanduser().absolute() | |
-from typing import Iterator | |
+from typing import Iterator, Optional, Tuple | |
from my.core.common import LazyLogger, Stats, mcachew | |
+from datetime import datetime | |
-from discord_data import merge_messages, merge_activity, Message, Activity | |
+from discord_data import merge_messages, merge_activity, Activity, Message as DMessage | |
+from discord_data.model import Channel | |
+from urlextract import URLExtract | |
+from mistletoe import HTMLRenderer, Document | |
+ | |
+ | |
+renderer = HTMLRenderer() | |
logger = LazyLogger(__name__, level="warning") | |
+def _remove_supression(text: str, first_index: int, second_index: int) -> str: | |
+ # remove char at first index | |
+ text = text[:first_index] + text[first_index + 1 :] | |
+ # offset second index, since we removed a character | |
+ second_index -= 1 | |
+ # remove character at second index | |
+ return text[:second_index] + text[second_index + 1 :] | |
+ | |
+ | |
+def _remove_link_supression( | |
+ content: str, urls: Optional[List[Tuple[str, Tuple[int, int]]]] = None | |
+) -> str: | |
+ # fix content to remove discord link supression if any links had any | |
+ # e.g. this is a supressed link <https://github.com> | |
+ | |
+ if urls is None: | |
+ urls = URLExtract().find_urls(content, get_indices=True) | |
+ | |
+ # need to keep track to we can offset the index in the content to remove | |
+ removed_chars = 0 | |
+ for (url_text, (start_index, end_index)) in urls: | |
+ before_ind = (start_index - 1) - removed_chars | |
+ after_ind = (end_index) - removed_chars | |
+ try: | |
+ if content[before_ind] == "<" and content[after_ind] == ">": | |
+ content = _remove_supression(content, before_ind, after_ind) | |
+ removed_chars += 2 | |
+ except IndexError: # could happen if the url didn't have braces and we hit the end of a string | |
+ continue | |
+ return content | |
+ | |
+ | |
+def test_remove_link_supression() -> None: | |
+ assert _remove_supression("<test>", 0, 5) == "test" | |
+ | |
+ # shouldn't affect this at all | |
+ content = "https://urlextract.readthedocs.io" | |
+ assert _remove_link_supression(content) == content | |
+ | |
+ content = "<https://urlextract.readthedocs.io>" | |
+ expected = content.strip("<").strip(">") | |
+ assert _remove_link_supression(content) == expected | |
+ | |
+ content = "Here is some text <https://urlextract.readthedocs.io>" | |
+ expected = "Here is some text https://urlextract.readthedocs.io" | |
+ assert _remove_link_supression(content) == expected | |
+ | |
+ content = "text <https://urlextract.readthedocs.io> other text" | |
+ expected = "text https://urlextract.readthedocs.io other text" | |
+ assert _remove_link_supression(content) == expected | |
+ | |
+ content = "t <https://urlextract.readthedocs.io> other <github.com> f <sean.fish>" | |
+ expected = "t https://urlextract.readthedocs.io other github.com f sean.fish" | |
+ assert _remove_link_supression(content) == expected | |
+ | |
+ | |
def _cachew_depends_on() -> List[str]:
    # cache key: invalidate the cache whenever the set of export files changes
    return [str(p) for p in config._abs_export_path().iterdir()]
-# reduces time by half, after cache is created | |
# same as discord_data.models.Message, but has the 'html' field; rendered markdown as HTML
@dataclass
class Message:
    message_id: int
    timestamp: datetime
    channel: Channel
    content: str  # raw markdown message content
    # NOTE(review): presumably a serialized/joined representation rather than a
    # list -- mirrors discord_data's field type, confirm against upstream model
    attachments: str
    html: str  # content rendered to HTML (see messages() below)

    # hmm, cant subclass - set helper function manually
    # (borrows the unbound method from discord_data's Message so .link works here)
    link = DMessage.link
+ | |
# reduces time by multiple minutes, after the cache is created
# HTML rendering can take quite a long time for the thousands of messages
@mcachew(depends_on=_cachew_depends_on, logger=logger)
def messages() -> Iterator[Message]:
    """Yield every merged discord message, augmented with rendered HTML."""
    for m in merge_messages(export_dir=config._abs_export_path()):
        # un-suppress links first, then render the markdown to HTML
        rendered = renderer.render(Document(_remove_link_supression(m.content)))
        yield Message(
            message_id=m.message_id,
            timestamp=m.timestamp,
            channel=m.channel,
            content=m.content,
            attachments=m.attachments,
            html=rendered,
        )
@mcachew(depends_on=_cachew_depends_on, logger=logger) | |
diff --git a/tests/test_discord.py b/tests/test_discord.py | |
index b7f2668..1f84c8f 100644 | |
--- a/tests/test_discord.py | |
+++ b/tests/test_discord.py | |
@@ -1,6 +1,7 @@ | |
from more_itertools import ilen | |
from my.discord import messages, activity, Activity | |
+from my.discord import test_remove_link_supression # bring test into scope | |
def test_discord() -> None: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment