# stucka/combineuniquecsvs.py

## combineuniquecsvs.py
# import csv
from glob import glob
import os
from sys import exit
import datetime

# Up-front caveats for whoever runs this, one per output line.
print(
    "This will NOT work with CSVs that have multiline entries.",
    "This will completely screw with the order of your CSVs.",
    "This will risk making the Cubs win another World Series, splitting us into another alternative universe.",
    sep="\n",
)

# Terminator appended to every line written to the combined file.
sep = "\r\n"

# Header line shared by the combined files; None until a source file supplies it.
headers = None

# The output name carries a to-the-second timestamp.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S")
targetfile = f"!combinedunique-{timestamp}.csv"

# Every *.csv in the current working directory, in sorted name order.
sourcefiles = sorted(glob("*.csv"))

# Never overwrite an existing combined file: report it and stop.
if os.path.exists(targetfile):
    print(f"Destination file {targetfile} already exists. Delete it, if you want to combine new stuff.")
    exit(0)
print(f"Will write to {targetfile}")

def clean_row(row):
    """Return *row* with every carriage return and line feed removed.

    Characters are dropped wherever they occur in the string, not only at
    the end; all other characters are kept unchanged.
    """
    drop_cr_lf = {ord("\r"): None, ord("\n"): None}
    return row.translate(drop_cr_lf)

# Unique data rows bucketed by hash(line): {hash: [line, ...]}.  The output
# stage iterates these buckets, so the dict-of-lists shape is kept as is.
masterdict = {}
# Name of the file whose first line became the reference header.
headersource = None
for filecount, sourcefile in enumerate(sourcefiles):
    newrows = 0
    with open(sourcefile, "r") as sourcefilehandle:
        print(f"{filecount + 1}/{len(sourcefiles)}: {sourcefile}")
        reader = sourcefilehandle.readlines()

    if not reader:
        # A zero-length file has no header line; indexing reader[0] would
        # raise IndexError and abort the whole run.
        print(f"\t{sourcefile} is empty, skipping.")
        continue

    fileheaders = clean_row(reader[0])
    # Compare against None explicitly: an empty-string header is still a
    # header and must not be replaced by the next file's first line.
    if headers is None:
        headers = fileheaders
        headersource = sourcefile

    if fileheaders != headers:
        print(f"\tHeaders mismatch with {sourcefile}, not combining with files matching {headersource}.")
        continue

    for row in reader[1:]:   # Skip header row
        line = clean_row(row)
        bucket = masterdict.setdefault(hash(line), [])
        # Only lines sharing a hash are compared, so this check stays cheap.
        if line not in bucket:
            newrows += 1
            bucket.append(line)
    print(f"\t{newrows} added")

if headers is None:
    # No source file supplied a header line (for example, no *.csv files
    # were found).  `headers + sep` would raise TypeError, and only after an
    # empty target file had already been created, so bail out first.
    print("No CSV data found to combine; nothing written.")
else:
    # newline="" turns off newline translation so `sep` is written verbatim.
    with open(targetfile, "w", newline="") as outfile:
        outfile.write(headers + sep)
        # masterdict maps hash -> list of unique lines; emit every bucket.
        for bucket in masterdict.values():
            outfile.writelines(row + sep for row in bucket)
	# import csv
	from glob import glob
	import os
	from sys import exit
	import datetime

	# Up-front caveats for whoever runs this.
	print("This will NOT work with CSVs that have multiline entries.")
	print("This will completely screw with the order of your CSVs.")
	print("This will risk making the Cubs win another World Series, splitting us into another alternative universe.")

	# The output name carries a to-the-second timestamp.
	timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S")

	targetfile = f"!combinedunique-{timestamp}.csv"
	# Every *.csv in the current working directory, in sorted name order.
	sourcefiles = sorted(glob("*.csv"))

	# Terminator appended to every line written to the combined file.
	sep = "\r\n"

	# Never overwrite an existing combined file: report it and stop.
	if os.path.exists(targetfile):
		print(f"Destination file {targetfile} already exists. Delete it, if you want to combine new stuff.")
		exit(0)
	else:
		print(f"Will write to {targetfile}")

	# Header line shared by the combined files; None until a source file supplies it.
	headers = None

	def clean_row(row):
		"""Return *row* with every carriage return and line feed removed.

		Characters are dropped wherever they occur in the string, not only
		at the end; all other characters are kept unchanged.
		"""
		return row.replace("\r", "").replace("\n", "")

	# Unique data rows bucketed by hash(line): {hash: [line, ...]}.  The output
	# stage iterates these buckets, so the dict-of-lists shape is kept as is.
	masterdict = {}
	# Name of the file whose first line became the reference header.
	headersource = None
	for filecount, sourcefile in enumerate(sourcefiles):
		newrows = 0
		with open(sourcefile, "r") as sourcefilehandle:
			print(f"{filecount + 1}/{len(sourcefiles)}: {sourcefile}")
			reader = sourcefilehandle.readlines()

		if not reader:
			# A zero-length file has no header line; indexing reader[0] would
			# raise IndexError and abort the whole run.
			print(f"\t{sourcefile} is empty, skipping.")
			continue

		fileheaders = clean_row(reader[0])
		# Compare against None explicitly: an empty-string header is still a
		# header and must not be replaced by the next file's first line.
		if headers is None:
			headers = fileheaders
			headersource = sourcefile

		if fileheaders != headers:
			print(f"\tHeaders mismatch with {sourcefile}, not combining with files matching {headersource}.")
			continue

		for row in reader[1:]: # Skip header row
			line = clean_row(row)
			bucket = masterdict.setdefault(hash(line), [])
			# Only lines sharing a hash are compared, so this check stays cheap.
			if line not in bucket:
				newrows += 1
				bucket.append(line)
		print(f"\t{newrows} added")

	if headers is None:
		# No source file supplied a header line (for example, no *.csv files
		# were found).  `headers + sep` would raise TypeError, and only after
		# an empty target file had already been created, so bail out first.
		print("No CSV data found to combine; nothing written.")
	else:
		# newline="" turns off newline translation so `sep` is written verbatim.
		with open(targetfile, "w", newline="") as outfile:
			outfile.write(headers + sep)
			# masterdict maps hash -> list of unique lines; emit every bucket.
			for bucket in masterdict.values():
				outfile.writelines(row + sep for row in bucket)