Skip to content

Instantly share code, notes, and snippets.

@emasru
Created February 13, 2022 11:06
Show Gist options
  • Save emasru/6df614a9d12298f080fd8506d58fb973 to your computer and use it in GitHub Desktop.
Save emasru/6df614a9d12298f080fd8506d58fb973 to your computer and use it in GitHub Desktop.
Gets a dataset with all messages from all channels in a Discord server. Requires access to channel history. Part of a project on machine learning.
import discord
import json
import re
import platform
import asyncio
# Disables annoying "Event loop is closed" when logging out, at least on Windows
if platform.system() == 'Windows':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

client = discord.Client()

# Scrape result shared with on_ready: maps a channel name to the list of
# messages collected from that channel. Serialized to JSON when scraping ends.
guildDict = {
    "channels": {
    }
}

# Placeholder: replace the string below with the numeric guild ID (an int).
# The bot must be allowed to read message history in that guild.
guild_id = "YOUR GUILD ID AS AN INT GOES HERE, MUST HAVE ACCESS TO MESSAGE HISTORY"
# Filters out links. If you want to disable, just remove the function call
# The pattern is compiled once at import time rather than on every message.
_LINK_RE = re.compile(
    r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|"""
    r"""www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?"""
    r""":[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))"""
    r"""*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|"""
    r"""[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""
)


def message_filter(message_string: str) -> str:
    """Return *message_string* with every URL-like substring removed.

    Only the links themselves are deleted; surrounding whitespace is left
    untouched, so a link between two words leaves two adjacent spaces.
    A message consisting solely of a link becomes the empty string.
    """
    return _LINK_RE.sub("", message_string)
# Reads your token from the file token.txt
def get_token() -> str:
    """Return the bot token stored on the first line of ``token.txt``.

    The file is looked up relative to the current working directory.
    Surrounding whitespace, including the trailing newline, is stripped so
    the returned value is the bare token.

    Raises:
        OSError: if ``token.txt`` cannot be opened.
    """
    # The context manager closes the handle even if reading fails.
    with open("token.txt") as token_file:
        return token_file.readline().strip()
@client.event
async def on_ready():
    """Scrape all text channels of the configured guild into ``message_data.json``.

    For every text channel the full message history is walked and each
    message with non-empty content (after link filtering) is stored in
    ``guildDict["channels"]`` as ``[author id, author name, content]``.
    When scraping ends the result is written as JSON and the client logs out.

    The client is closed on every exit path, including errors, so the
    script terminates instead of idling as a connected bot.
    """
    print("Logged in as {0.user}".format(client))
    try:
        guild = client.get_guild(guild_id)
        if guild is None:
            # Happens when guild_id is still the placeholder string or the
            # bot is not a member of that guild.
            print(f"Could not find guild {guild_id!r}; "
                  "set guild_id to the numeric ID of a guild the bot has joined")
            return

        # Filters text channels
        text_channels = [
            channel
            for channel in await guild.fetch_channels()
            if channel.type == discord.ChannelType.text
        ]

        used_keys = set()
        for channel in text_channels:
            channel_name = str(channel)
            # Channel names are not unique within a guild; without this a
            # later channel would wipe the messages of an earlier namesake.
            channel_key = channel_name
            if channel_key in used_keys:
                channel_key = f"{channel_name} ({channel.id})"
            used_keys.add(channel_key)

            scraped = []
            guildDict["channels"][channel_key] = scraped
            counter = 0
            try:
                # limit=None walks the whole history <- !!! Will take a long time!
                async for message in channel.history(limit=None):
                    # Format for how the message is scraped
                    # user id, current username (non-guild), content
                    content = message_filter(message.content)
                    counter += 1
                    if content:  # Only if the string has content
                        scraped.append(
                            [message.author.id, message.author.name, content]
                        )
                        print(f"Added message no. {counter} from channel #{channel_name}")
                    else:
                        print(f"SKIPPED message no. {counter} from channel #{channel_name}")
            except discord.Forbidden:
                # One unreadable channel should not abort the whole scrape.
                print(f"SKIPPED channel #{channel_name}: no access to its message history")

        with open('message_data.json', 'w') as outfile:
            json.dump(guildDict, outfile)
        print("Finished dumping json file")
    finally:
        print("Logging out...")
        await client.close()
if __name__ == "__main__":
    # Blocking call: returns once the client has been closed.
    client.run(get_token())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment