Last active
August 16, 2022 19:02
-
-
Save Sobsz/c29152df975fbb7f083c76425d66832b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this script merges multiple gdpr data requests for one channel into a format suitable for training gpt-whatever on | |
# | |
# 1. copy a message link from the channel/dm you want to make a dataset of and get the *second*-to-last number | |
# 2. in your data request, go to the `messages` folder, then `c<number>`, then copy out the messages.csv and name it however you want it named | |
# 3. get all other members of the channel to do the same | |
# 4. put all the csv files in the same place as this script and then run it | |
# 5. plop it in your training software of choice, e.g. https://colab.research.google.com/drive/15qBZx5y9rdaQSyWpsreMDnTiZ5IlN0zD | |
# | |
# this code is hereby released under 0bsd/cc0/unlicense/wtfpl/yougetthepoint | |
import bisect | |
import csv | |
from datetime import datetime | |
import glob | |
import heapq | |
def attachment_names(s):
    """Render the Attachments CSV column as bracketed file names.

    s -- the raw value of the Attachments column: zero or more URLs
         separated by single spaces, possibly the empty string.
    Returns a string like "[a.png] [b.jpg]", or "" when there are no
    attachments.
    """
    if not s:
        return ""
    # rsplit('/', 1)[-1] keeps just the final path segment (the file name).
    # Using [-1] instead of [1] also tolerates an entry with no '/' at all,
    # which would otherwise raise IndexError.
    return " ".join(f"[{url.rsplit('/', 1)[-1]}]" for url in s.split(" "))
# load the files in and turn them into lines
# apologies for the hundred-yard-long string formatting line
files = []
for path in glob.glob("*.csv"):
    with open(path, encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        rows = [(
            int(row[0]),  # message id (snowflake) — high bits encode the timestamp, so ids sort chronologically
            f"[{datetime.fromisoformat(row[1]).astimezone().strftime('%I:%M %p')}] {path[:-4]}: {' '.join(filter(None, (row[2], attachment_names(row[3]))))}")
            # rendered as: [04:20 PM] name: contents [attachment.png]
            # for 24-hour time replace '%I:%M %p' with '%H:%M'
            # (NOT '%H:%M %p' — %p would still append AM/PM to the 24-hour time)
            for row in reader][::-1]  # export is newest-first; reverse to go chronologically
        # a header-only export yields no rows; appending an empty list would
        # crash the cutoff computation below (it indexes [-1]), so skip it
        if rows:
            files.append(rows)
# if one person exports 2 days later than the other then we only have data of that person for those 2 days
# this takes care of that by removing the incomplete data from the end
#
# the earliest "last message id" across all exports is the point up to which
# every export is complete; drop anything newer than that from each file.
cutoff = min(per_file[-1][0] for per_file in files)
files = [
    per_file[:bisect.bisect_right([entry[0] for entry in per_file], cutoff)]
    for per_file in files
]
# and now we merge and save
# heapq.merge interleaves the already-sorted per-file streams by message id,
# giving one chronological sequence without re-sorting everything
merged = heapq.merge(*files, key=lambda entry: entry[0])
output_lines = (entry[1] for entry in merged)
with open("out.txt", "w", encoding="utf-8") as out:
    out.write("\n".join(output_lines))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment