Last active
November 1, 2016 21:03
-
-
Save davidrleonard/4dbeebf749248a956e44 to your computer and use it in GitHub Desktop.
Compare two CSVs, output the difference between them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compares two CSVs and outputs a new CSV that excludes the intersection | |
# i.e.: | |
# $ python csv-difference.py -d fulldataset.csv -s subset.csv -o output.csv -u id -k id email name | |
import argparse | |
from csv import DictReader | |
from csv import DictWriter | |
def main(): | |
parser = argparse.ArgumentParser(description='Output difference in CSVs.') | |
parser.add_argument('-d', '--dataset', help='A CSV file of the full dataset', required=True) | |
parser.add_argument('-s', '--subset', help='A CSV file that is a subset of the full dataset', required=True) | |
parser.add_argument('-o', '--output', help='The CSV file we should write to (will be overwritten if it exists', required=True) | |
parser.add_argument('-u', '--unique', help='The unique key to compare between sets', required=True) | |
parser.add_argument('-k', '--keys', help='An array of all the keys', nargs='+', required=True) | |
args = parser.parse_args() | |
dataset_file = args.dataset | |
subset_file = args.subset | |
output_file = args.output | |
unique_key = args.unique | |
all_keys = args.keys | |
with open(dataset_file) as datafile, open(subset_file) as subsetfile, open(output_file, 'w') as outputfile: | |
data = { row[unique_key]: row for row in DictReader(datafile) } | |
subset = { row[unique_key]: row for row in DictReader(subsetfile) } | |
data_keys = set(data.keys()) | |
subset_keys = set(subset.keys()) | |
output_keys = data_keys - subset_keys | |
output = [ data[key] for key in output_keys ] | |
output_header = ','.join(all_keys) | |
outputfile.write(output_header + '\n') | |
output_csv = DictWriter(outputfile, all_keys) | |
for row in output: | |
output_csv.writerow(row) | |
if __name__=="__main__": | |
main() |
I slightly edited this code. Now accepting column number for comparison. This one only good when you want to work without headers in input and output files:
#!/usr/bin/env python3
"""
Source: https://gist.github.com/davidrleonard/4dbeebf749248a956e44
Usage: $ python3 csv-difference.py -d new.csv -s old.csv -o out.csv -c 1
"""
import sys
import argparse
import csv
def main():
parser = argparse.ArgumentParser(description='Output difference in CSVs.')
parser.add_argument('-d', '--dataset', help='A CSV file of the full dataset', required=True)
parser.add_argument('-s', '--subset', help='A CSV file that is a subset of the full dataset', required=True)
parser.add_argument('-o', '--output', help='The CSV file we should write to (will be overwritten if it exists', required=True)
parser.add_argument('-c', '--column', help='A number of the column to be compared (0 is column 1, 1 is column 2, etc.)', required=True, type=int)
args = parser.parse_args()
dataset_file = args.dataset
subset_file = args.subset
output_file = args.output
column_num = args.column
with open(dataset_file, 'r') as datafile, open(subset_file, 'r') as subsetfile, open(output_file, 'w') as outputfile:
data = { row[column_num]: row for row in csv.reader(datafile, delimiter='|', quotechar='"') }
subset = { row[column_num]: row for row in csv.reader(subsetfile, delimiter='|', quotechar='"') }
data_keys = set(data.keys())
subset_keys = set(subset.keys())
output_keys = data_keys - subset_keys
output = [ data[key] for key in output_keys ]
output_csv = csv.writer(outputfile, delimiter='|', quotechar='"', quoting=csv.QUOTE_ALL)
for row in output:
output_csv.writerow(row)
if __name__ == '__main__':
main()
sys.stdout.flush()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi! This program is great!
Is it possible to append
all_keys
to the top ofdataset_file
andsubset_file
, when the source CSV files doesn't have a header? Or the other option, comparing the 2nd column (row[:1]
)?