Created
February 13, 2022 11:06
-
-
Save emasru/6df614a9d12298f080fd8506d58fb973 to your computer and use it in GitHub Desktop.
Builds a dataset of all messages from all text channels in a Discord server. Requires access to channel message history. Part of a project on machine learning.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import discord | |
import json | |
import re | |
import platform | |
import asyncio | |
# Replace this placeholder with the ID of the guild to scrape.
guild_id = "YOUR GUILD ID AS AN INT GOES HERE, MUST HAVE ACCESS TO MESSAGE HISTORY"

# Scraped data accumulates here: one entry per channel under "channels".
guildDict = {"channels": {}}

# On Windows, use the selector event loop policy; this disables the annoying
# "Event loop is closed" error when logging out.
if platform.system() == 'Windows':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

client = discord.Client()
# Link-matching pattern, compiled once at import time rather than on every
# call to message_filter.
_LINK_PATTERN = re.compile(
    r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|"""
    r"""www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?"""
    r""":[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))"""
    r"""*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|"""
    r"""[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""
)


# Filters out links. If you want to disable, just remove the function call
def message_filter(message_string: str) -> str:
    """Return *message_string* with every link removed.

    Each substring matched by the link pattern (scheme-prefixed text such as
    ``http://...``, ``www.`` hosts, and ``domain.tld/`` paths) is replaced by
    the empty string. Surrounding text, including the whitespace around a
    removed link, is left untouched, so a message consisting only of a link
    becomes an empty string.

    Args:
        message_string: The raw message text.

    Returns:
        The message text without links.
    """
    return _LINK_PATTERN.sub("", message_string)
# Reads your token from the file token.txt
def get_token() -> str:
    """Return the bot token stored on the first line of ``token.txt``.

    The file is looked up relative to the current working directory. Only the
    first line is read, and surrounding whitespace (including the trailing
    newline) is stripped so the token can be passed on as-is.

    Returns:
        The token string; empty if the first line is blank.

    Raises:
        OSError: If ``token.txt`` cannot be opened (for example, it is missing).
    """
    # The context manager closes the file handle even if reading fails.
    with open("token.txt", encoding="utf-8") as token_file:
        return token_file.readline().strip()
@client.event
async def on_ready():
    """Scrape all text channels of the configured guild into ``message_data.json``.

    Registered as a client event handler. For every text channel of the guild
    identified by ``guild_id``, the full message history is read, links are
    stripped from each message with ``message_filter``, and every message that
    still has content is stored in ``guildDict["channels"][<channel name>]`` as
    ``[author id, author name, content]``. When all channels are done the
    collected data is written to ``message_data.json`` and the client logs out.

    If the guild cannot be found, a message is printed and the client logs out
    without writing a file. A channel whose history cannot be read for lack of
    permission is reported and skipped so the remaining channels are still
    scraped.
    """
    print("Logged in as {0.user}".format(client))

    guild = client.get_guild(guild_id)
    if guild is None:
        # Nothing can be scraped without a guild; fail with a clear message
        # instead of an AttributeError inside the event handler.
        print(f"Could not find guild {guild_id!r}; check that guild_id is the "
              f"integer ID of a server this account is a member of.")
        print("Logging out...")
        await client.close()
        return

    # Filters text channels
    all_channels = await guild.fetch_channels()
    text_channels = [
        channel for channel in all_channels
        if channel.type == discord.ChannelType.text
    ]

    for channel in text_channels:
        channel_name = str(channel)
        # setdefault (rather than overwriting) keeps messages already collected
        # when two channels share the same name.
        channel_messages = guildDict["channels"].setdefault(channel_name, [])
        counter = 0
        try:
            async for message in channel.history(limit=None):  # <- !!! Will take a long time!
                counter += 1
                content = message_filter(message.content)
                if content:  # Only if the string has content
                    # Format for how the message is scraped:
                    # user id, current username (non-guild), content
                    channel_messages.append(
                        [message.author.id, message.author.name, content]
                    )
                    print(f"Added message no. {counter} from channel #{channel_name}")
                else:
                    print(f"SKIPPED message no. {counter} from channel #{channel_name}")
        except discord.Forbidden:
            # One unreadable channel should not discard everything scraped so far.
            print(f"SKIPPED channel #{channel_name}: no permission to read its message history")

    with open('message_data.json', 'w') as outfile:
        json.dump(guildDict, outfile)
    print("Finished dumping json file")
    print("Logging out...")
    await client.close()
# Script entry point: load the token from token.txt and start the client.
if __name__ == "__main__":
    bot_token = get_token()
    client.run(bot_token)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment