Last active
November 1, 2016 21:03
-
-
Save davidrleonard/4dbeebf749248a956e44 to your computer and use it in GitHub Desktop.
Compare two CSVs, output the difference between them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compares two CSVs and outputs a new CSV that excludes the intersection | |
# i.e.: | |
# $ python csv-difference.py -d fulldataset.csv -s subset.csv -o output.csv -u id -k id email name | |
import argparse | |
from csv import DictReader | |
from csv import DictWriter | |
def main(): | |
parser = argparse.ArgumentParser(description='Output difference in CSVs.') | |
parser.add_argument('-d', '--dataset', help='A CSV file of the full dataset', required=True) | |
parser.add_argument('-s', '--subset', help='A CSV file that is a subset of the full dataset', required=True) | |
parser.add_argument('-o', '--output', help='The CSV file we should write to (will be overwritten if it exists', required=True) | |
parser.add_argument('-u', '--unique', help='The unique key to compare between sets', required=True) | |
parser.add_argument('-k', '--keys', help='An array of all the keys', nargs='+', required=True) | |
args = parser.parse_args() | |
dataset_file = args.dataset | |
subset_file = args.subset | |
output_file = args.output | |
unique_key = args.unique | |
all_keys = args.keys | |
with open(dataset_file) as datafile, open(subset_file) as subsetfile, open(output_file, 'w') as outputfile: | |
data = { row[unique_key]: row for row in DictReader(datafile) } | |
subset = { row[unique_key]: row for row in DictReader(subsetfile) } | |
data_keys = set(data.keys()) | |
subset_keys = set(subset.keys()) | |
output_keys = data_keys - subset_keys | |
output = [ data[key] for key in output_keys ] | |
output_header = ','.join(all_keys) | |
outputfile.write(output_header + '\n') | |
output_csv = DictWriter(outputfile, all_keys) | |
for row in output: | |
output_csv.writerow(row) | |
if __name__=="__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I slightly edited this code. Now accepting column number for comparison. This one only good when you want to work without headers in input and output files: