Skip to content

Instantly share code, notes, and snippets.

@davidrleonard
Last active November 1, 2016 21:03
Show Gist options
  • Save davidrleonard/4dbeebf749248a956e44 to your computer and use it in GitHub Desktop.
Save davidrleonard/4dbeebf749248a956e44 to your computer and use it in GitHub Desktop.
Compare two CSVs, output the difference between them
# Compares two CSVs and outputs a new CSV that excludes the intersection
# i.e.:
# $ python csv-difference.py -d fulldataset.csv -s subset.csv -o output.csv -u id -k id email name
import argparse
from csv import DictReader
from csv import DictWriter
def main():
parser = argparse.ArgumentParser(description='Output difference in CSVs.')
parser.add_argument('-d', '--dataset', help='A CSV file of the full dataset', required=True)
parser.add_argument('-s', '--subset', help='A CSV file that is a subset of the full dataset', required=True)
parser.add_argument('-o', '--output', help='The CSV file we should write to (will be overwritten if it exists', required=True)
parser.add_argument('-u', '--unique', help='The unique key to compare between sets', required=True)
parser.add_argument('-k', '--keys', help='An array of all the keys', nargs='+', required=True)
args = parser.parse_args()
dataset_file = args.dataset
subset_file = args.subset
output_file = args.output
unique_key = args.unique
all_keys = args.keys
with open(dataset_file) as datafile, open(subset_file) as subsetfile, open(output_file, 'w') as outputfile:
data = { row[unique_key]: row for row in DictReader(datafile) }
subset = { row[unique_key]: row for row in DictReader(subsetfile) }
data_keys = set(data.keys())
subset_keys = set(subset.keys())
output_keys = data_keys - subset_keys
output = [ data[key] for key in output_keys ]
output_header = ','.join(all_keys)
outputfile.write(output_header + '\n')
output_csv = DictWriter(outputfile, all_keys)
for row in output:
output_csv.writerow(row)
if __name__=="__main__":
main()
@DJviolin
Copy link

DJviolin commented Nov 1, 2016

I slightly edited this code. Now accepting column number for comparison. This one only good when you want to work without headers in input and output files:

#!/usr/bin/env python3

"""
Source: https://gist.github.com/davidrleonard/4dbeebf749248a956e44
Usage: $ python3 csv-difference.py -d new.csv -s old.csv -o out.csv -c 1
"""

import sys
import argparse
import csv

def main():
    parser = argparse.ArgumentParser(description='Output difference in CSVs.')
    parser.add_argument('-d', '--dataset', help='A CSV file of the full dataset', required=True)
    parser.add_argument('-s', '--subset', help='A CSV file that is a subset of the full dataset', required=True)
    parser.add_argument('-o', '--output', help='The CSV file we should write to (will be overwritten if it exists', required=True)
    parser.add_argument('-c', '--column', help='A number of the column to be compared (0 is column 1, 1 is column 2, etc.)', required=True, type=int)

    args = parser.parse_args()
    dataset_file = args.dataset
    subset_file = args.subset
    output_file = args.output
    column_num = args.column

    with open(dataset_file, 'r') as datafile, open(subset_file, 'r') as subsetfile, open(output_file, 'w') as outputfile:
        data          = { row[column_num]: row for row in csv.reader(datafile, delimiter='|', quotechar='"') }
        subset        = { row[column_num]: row for row in csv.reader(subsetfile, delimiter='|', quotechar='"') }

        data_keys     = set(data.keys())
        subset_keys   = set(subset.keys())
        output_keys   = data_keys - subset_keys

        output        = [ data[key] for key in output_keys ]
        output_csv    = csv.writer(outputfile, delimiter='|', quotechar='"', quoting=csv.QUOTE_ALL)
        for row in output:
            output_csv.writerow(row)

if __name__ == '__main__':
    main()

sys.stdout.flush()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment