# emasru/get_message_dataset.py

## get_message_dataset.py
import discord
import json
import re
import platform
import asyncio

# Switching to the selector event loop policy disables the annoying
# "Event loop is closed" error when logging out, at least on Windows.
if platform.system() == 'Windows':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# The client instance that the event handler below is registered on.
client = discord.Client()

# Scrape result: filled in by on_ready() and dumped to message_data.json.
guildDict = {"channels": {}}

# Placeholder: replace this string with your guild id as an int.
guild_id = "YOUR GUILD ID AS AN INT GOES HERE, MUST HAVE ACCESS TO MESSAGE HISTORY"


# Filters out links. If you want to disable, just remove the function call
# Compiled once at import time so every call reuses the same pattern object
# instead of going through re.compile() for each scraped message.
_LINK_RE = re.compile(r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|"""
                      r"""www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?"""
                      r""":[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))"""
                      r"""*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|"""
                      r"""[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""")


def message_filter(message_string: str) -> str:
    """Return *message_string* with every link removed.

    Each match of the link pattern is replaced by the empty string; the text
    around it (including whitespace) is left untouched, so a message that
    consisted only of a link becomes ``""``.

    Args:
        message_string: Raw message text.

    Returns:
        The text with all matched links cut out.
    """
    return _LINK_RE.sub("", message_string)


# Reads your token from the file token.txt
def get_token() -> str:
    """Return the token stored on the first line of ``token.txt``.

    The file is opened relative to the current working directory and is
    closed again before returning. Surrounding whitespace, including the
    line's trailing newline, is stripped so only the token itself is returned.

    Returns:
        The first line of the file without surrounding whitespace
        (``""`` when the file is empty).

    Raises:
        OSError: If ``token.txt`` cannot be opened.
    """
    with open("token.txt", encoding="utf-8") as token_file:
        return token_file.readline().strip()


@client.event
async def on_ready():
    """Scrape all text channels of the configured guild into message_data.json.

    Looks up the guild identified by ``guild_id``, walks the complete message
    history of each of its text channels and stores every message whose
    link-filtered content is non-empty in ``guildDict["channels"]`` as
    ``[author id, author name, content]`` under the channel's name. The
    collected data is then written to ``message_data.json`` and the client
    is closed.

    If the guild cannot be found, a message is printed and the client is
    closed without writing anything. A channel whose history may not be read
    is reported and skipped instead of aborting the whole run.
    """
    print("Logged in as {0.user}".format(client))
    guild = client.get_guild(guild_id)
    if guild is None:
        # get_guild() yields None for an unknown id (e.g. the untouched
        # placeholder string); bail out cleanly instead of crashing below.
        print(f"Guild {guild_id!r} not found - set guild_id to the int id of your guild")
        await client.close()
        return

    # Keep text channels only
    text_channels = [
        candidate
        for candidate in await guild.fetch_channels()
        if candidate.type == discord.ChannelType.text
    ]

    for channel in text_channels:
        channel_name = str(channel)
        # setdefault instead of overwriting: when two channels share a name,
        # the second one appends to the existing list rather than wiping the
        # messages already collected for the first.
        channel_messages = guildDict["channels"].setdefault(channel_name, [])

        counter = 0
        try:
            async for message in channel.history(limit=None):  # <- !!! Will take a long time!
                counter += 1
                content = message_filter(message.content)
                if content:  # Only if the string has content
                    # Stored format: user id, current username (non-guild), content
                    channel_messages.append([message.author.id, message.author.name, content])
                    print(f"Added message no. {counter} from channel #{channel_name}")
                else:
                    print(f"SKIPPED message no. {counter} from channel #{channel_name}")
        except discord.Forbidden:
            # Missing permission on one channel must not lose everything
            # that was scraped from the others.
            print(f"SKIPPED channel #{channel_name}: no permission to read its message history")

    with open('message_data.json', 'w') as outfile:
        json.dump(guildDict, outfile)
        print("Finished dumping json file")

    print("Logging out...")
    await client.close()


# Start the client only when this file is executed as a script.
if __name__ == "__main__":
    bot_token = get_token()
    client.run(bot_token)
	import discord
	import json
	import re
	import platform
	import asyncio

	# Disables annoying "Event loop is closed" when logging out, at least on Windows
	if platform.system() == 'Windows':
	    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

	# The client instance that the event handler below is registered on.
	client = discord.Client()

	# Scrape result: filled in by on_ready() and dumped to message_data.json.
	guildDict = {
	    "channels": {
	    }
	}

	# Placeholder: replace this string with your guild id as an int.
	guild_id = "YOUR GUILD ID AS AN INT GOES HERE, MUST HAVE ACCESS TO MESSAGE HISTORY"


	# Filters out links. If you want to disable, just remove the function call
	# Compiled once at import time so every call reuses the same pattern object.
	# The alternation bars are plain `|` and the group quantifiers `*` are
	# present; an escaped `\|` would only ever match a literal pipe character.
	_LINK_RE = re.compile(r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|"""
	                      r"""www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?"""
	                      r""":[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))"""
	                      r"""*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|"""
	                      r"""[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""")


	def message_filter(message_string: str) -> str:
	    """Return *message_string* with every link removed.

	    Each match of the link pattern is replaced by the empty string; the
	    text around it (including whitespace) is left untouched, so a message
	    that consisted only of a link becomes ``""``.

	    Args:
	        message_string: Raw message text.

	    Returns:
	        The text with all matched links cut out.
	    """
	    return _LINK_RE.sub("", message_string)


	# Reads your token from the file token.txt
	def get_token() -> str:
	    """Return the token stored on the first line of ``token.txt``.

	    The file is opened relative to the current working directory and is
	    closed again before returning. Surrounding whitespace, including the
	    line's trailing newline, is stripped so only the token is returned.

	    Returns:
	        The first line of the file without surrounding whitespace
	        (``""`` when the file is empty).

	    Raises:
	        OSError: If ``token.txt`` cannot be opened.
	    """
	    with open("token.txt", encoding="utf-8") as token_file:
	        return token_file.readline().strip()


	@client.event
	async def on_ready():
	    """Scrape all text channels of the configured guild into message_data.json.

	    Looks up the guild identified by ``guild_id``, walks the complete
	    message history of each of its text channels and stores every message
	    whose link-filtered content is non-empty in ``guildDict["channels"]``
	    as ``[author id, author name, content]`` under the channel's name. The
	    collected data is then written to ``message_data.json`` and the client
	    is closed.

	    If the guild cannot be found, a message is printed and the client is
	    closed without writing anything. A channel whose history may not be
	    read is reported and skipped instead of aborting the whole run.
	    """
	    print("Logged in as {0.user}".format(client))
	    guild = client.get_guild(guild_id)
	    if guild is None:
	        # get_guild() yields None for an unknown id (e.g. the untouched
	        # placeholder string); bail out cleanly instead of crashing below.
	        print(f"Guild {guild_id!r} not found - set guild_id to the int id of your guild")
	        await client.close()
	        return

	    # Keep text channels only
	    text_channels = [
	        candidate
	        for candidate in await guild.fetch_channels()
	        if candidate.type == discord.ChannelType.text
	    ]

	    for channel in text_channels:
	        channel_name = str(channel)
	        # setdefault instead of overwriting: when two channels share a
	        # name, the second one appends to the existing list rather than
	        # wiping the messages already collected for the first.
	        channel_messages = guildDict["channels"].setdefault(channel_name, [])

	        counter = 0
	        try:
	            async for message in channel.history(limit=None):  # <- !!! Will take a long time!
	                counter += 1
	                content = message_filter(message.content)
	                if content:  # Only if the string has content
	                    # Stored format: user id, current username (non-guild), content
	                    channel_messages.append([message.author.id, message.author.name, content])
	                    print(f"Added message no. {counter} from channel #{channel_name}")
	                else:
	                    print(f"SKIPPED message no. {counter} from channel #{channel_name}")
	        except discord.Forbidden:
	            # Missing permission on one channel must not lose everything
	            # that was scraped from the others.
	            print(f"SKIPPED channel #{channel_name}: no permission to read its message history")

	    with open('message_data.json', 'w') as outfile:
	        json.dump(guildDict, outfile)
	        print("Finished dumping json file")

	    print("Logging out...")
	    await client.close()


	# Start the client only when this file is executed as a script.
	if __name__ == "__main__":
	    client.run(get_token())