Last active
August 16, 2022 19:02
-
-
Save Sobsz/c29152df975fbb7f083c76425d66832b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this script merges multiple gdpr data requests for one channel into a format suitable for training gpt-whatever on | |
# | |
# 1. copy a message link from the channel/dm you want to make a dataset of and get the *second*-to-last number | |
# 2. in your data request, go to the `messages` folder, then `c<number>`, then copy out the messages.csv and name it however you want it named | |
# 3. get all other members of the channel to do the same | |
# 4. put all the csv files in the same place as this script and then run it | |
# 5. plop it in your training software of choice, e.g. https://colab.research.google.com/drive/15qBZx5y9rdaQSyWpsreMDnTiZ5IlN0zD | |
# | |
# this code is hereby released under 0bsd/cc0/unlicense/wtfpl/yougetthepoint | |
import bisect | |
import csv | |
from datetime import datetime | |
import glob | |
import heapq | |
def attachment_names(s):
    """Render the Attachments CSV column as bracketed file names.

    s -- the raw value of the Attachments column: zero or more URLs
         separated by single spaces, possibly the empty string.
    Returns a string like "[a.png] [b.jpg]", or "" when there are no
    attachments.
    """
    if not s:
        return ""
    # rsplit('/', 1)[-1] keeps just the final path segment (the file name).
    # Using [-1] instead of [1] also tolerates an entry with no '/' at all,
    # which would otherwise raise IndexError.
    return " ".join(f"[{url.rsplit('/', 1)[-1]}]" for url in s.split(" "))
# load the files in and turn them into lines
# apologies for the hundred-yard-long string formatting line
files = []
for path in glob.glob("*.csv"):
    with open(path, encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        rows = [(
            int(row[0]),  # message id (snowflake) — high bits encode the timestamp, so ids sort chronologically
            f"[{datetime.fromisoformat(row[1]).astimezone().strftime('%I:%M %p')}] {path[:-4]}: {' '.join(filter(None, (row[2], attachment_names(row[3]))))}")
            # rendered as: [04:20 PM] name: contents [attachment.png]
            # for 24-hour time replace '%I:%M %p' with '%H:%M'
            # (NOT '%H:%M %p' — %p would still append AM/PM to the 24-hour time)
            for row in reader][::-1]  # export is newest-first; reverse to go chronologically
        # a header-only export yields no rows; appending an empty list would
        # crash the cutoff computation below (it indexes [-1]), so skip it
        if rows:
            files.append(rows)
# if one person exports 2 days later than the other then we only have data of that person for those 2 days
# this takes care of that by removing the incomplete data from the end
#
# the earliest "last message id" across all exports is the point up to which
# every export is complete; drop anything newer than that from each file.
cutoff = min(per_file[-1][0] for per_file in files)
files = [
    per_file[:bisect.bisect_right([entry[0] for entry in per_file], cutoff)]
    for per_file in files
]
# and now we merge and save
# heapq.merge interleaves the already-sorted per-file streams by message id,
# giving one chronological sequence without re-sorting everything
merged = heapq.merge(*files, key=lambda entry: entry[0])
output_lines = (entry[1] for entry in merged)
with open("out.txt", "w", encoding="utf-8") as out:
    out.write("\n".join(output_lines))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment