Created
August 27, 2018 15:43
-
-
Save JakeTRogers/7f364d64451427657b021d440d71f106 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
"""join csv data in much the way an excel vlookup would, but smarter""" | |
import argparse # argument parsing | |
import csv # parse CSV files | |
import logging # logging for file/email | |
import sys # used to exit script | |
from collections import OrderedDict # preserve dictionary order | |
from os.path import basename # get file name | |
from textwrap import dedent # used to wrap help text | |
__version__ = '0.6.3'

# module-level logging; main() raises the level according to -v flags
logging.basicConfig(level=logging.WARNING, format='%(levelname)s\t%(message)s')
log = logging.getLogger(__name__)


class JoinCSV:
    """A csv file keyed on its first column, supporting vlookup-style merges.

    Rows are stored as ``{key: OrderedDict(column -> value)}`` where ``key``
    is the downcased first column of each line.  Instances combine with::

        a + b   merge b's columns/rows into a (respecting update_only)
        a - b   drop a's rows whose key also appears in b
        a ^ b   (keys only in a, keys only in b), each newline-joined

    Counters ``del_rows``, ``new_cols``, ``new_rows`` and ``upd_rows`` record
    what the operators changed; ``setAB``/``setBA`` hold the xor result after
    ``__add__`` runs.
    """

    def __init__(self, path_to_file, quotes=False, sep=',', truncate=False, update_only=False):
        """Read *path_to_file* into ``self.data``.

        Args:
            path_to_file: csv file to load; first row is the header.
            quotes:       strip single/double quotes from every cell.
            sep:          field delimiter passed to csv.reader.
            truncate:     keep only the key text before the first '.'
                          (used to strip domain names from hostnames).
            update_only:  when later added to, update existing rows but
                          never append rows that only exist in the other file.
        """
        self.data = dict()
        self.fieldnames = []
        self.primary_key = ''       # set from the header; '' keeps __str__ safe on empty files
        self.filename = basename(path_to_file)
        self.sep = sep
        # two distinct empty values (the old `a = b = set()` aliased a single
        # object); __add__ replaces both with newline-joined key strings
        self.setAB = ''
        self.setBA = ''
        self.del_rows = self.new_cols = self.new_rows = self.upd_rows = 0
        self.update_only = update_only
        with open(path_to_file, 'r') as csvfile:
            for line_num, row in enumerate(csv.reader(csvfile, delimiter=self.sep)):
                if not row:                 # skip completely empty lines
                    continue
                if quotes:                  # remove all quotes from every cell
                    row = [item.strip('"\'') for item in row]
                key = row.pop(0).lower()    # keys compare case-insensitively
                if truncate:                # e.g. host.example.com -> host
                    key = key.split('.')[0]
                    log.debug("truncated key to '{}'".format(key))
                if line_num == 0:           # header row: primary key + column names
                    self.primary_key = key.upper()
                    self.fieldnames = [item.upper() for item in row]
                    log.debug("fieldnames: '{}, {}'".format(self.primary_key, ', '.join(self.fieldnames)))
                    continue
                if key.strip():             # skip rows with an empty/whitespace key
                    self.data[key] = OrderedDict(zip(self.fieldnames, row))
        log.debug("file: '{}' read into dictionary".format(path_to_file))

    def __add__(self, csv2):
        """Merge *csv2* into self; mutates and returns self.

        For every column of csv2: add it to our fieldnames if new, update
        rows whose value differs or is missing, and (unless update_only)
        append rows whose key exists only in csv2.
        """
        self.setAB, self.setBA = self ^ csv2    # record keys unique to each side
        for field in csv2.fieldnames:
            if field not in self.fieldnames:    # new column for this file
                self.fieldnames.append(field)
                log.debug('added column: {}'.format(field))
                self.new_cols += 1
            for key in csv2.data.keys():
                if key in self.data and csv2.data[key].get(field):
                    # capture the old value BEFORE overwriting, so the debug
                    # message shows the real transition (the previous code
                    # logged the already-overwritten value twice)
                    old = self.data[key].get(field)
                    new = csv2.data[key][field]
                    if old and old != new:          # changed value
                        self.data[key][field] = new
                        log.debug("changed value of column '{}' for row '{}' from '{}' to '{}'".format(
                            field, key, old, new)
                        )
                        self.upd_rows += 1
                    elif not old:                   # column missing/empty in this row
                        self.data[key][field] = new
                        log.debug("added column '{}' with value: '{}' for row '{}'".format(
                            field, new, key)
                        )
                        self.upd_rows += 1
                elif key not in self.data and not self.update_only:
                    self.data[key] = csv2.data[key]     # append the whole row
                    log.debug('added row for {} with column, value pair(s): {}'.format(key, csv2.data[key].items()))
                    self.new_rows += 1
        return self

    def __len__(self):
        """Number of data rows (header excluded)."""
        return len(self.data)

    def __str__(self):
        """Render as csv text: header line, then rows sorted by key."""
        header = self.primary_key + "," + ",".join(self.fieldnames)
        body = [key + ',' + self._dict_to_list(rowdict) for key, rowdict in sorted(self.data.items())]
        return header + "\n" + "\n".join(body)

    def __sub__(self, csv2):
        """Remove rows whose key also appears in *csv2*; mutates and returns self."""
        for key in csv2.data.keys():
            if key in self.data:
                log.debug('removed row: {} with data: {}'.format(key, self.data.pop(key).items()))
                self.del_rows += 1
        return self

    def __xor__(self, csv2):
        """Return (keys only in self, keys only in csv2), each newline-joined."""
        setAB = '\n'.join(set(self.data.keys()) - set(csv2.data.keys()))
        setBA = '\n'.join(set(csv2.data.keys()) - set(self.data.keys()))
        return setAB, setBA

    def _dict_to_list(self, rowdict):
        """Join one row's values in fieldname order; '' for missing columns."""
        return ','.join(rowdict.get(key, '') for key in self.fieldnames)
# command-line interface definition
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=dedent("""
        %(prog)s will perform updates, addition, or subtraction on data from 2
        csv files, based on a common 1st column. Output is sorted and the first
        column is downcased to allow pairing of primary keys between files.
        Think CLI-based automatic excel V-Lookups."""),
    epilog=dedent("""
        examples:\n \
        add fileA to fileB:\n \
          %(prog)s fileA.csv fileB.csv\n\n \
        subtract fileB from fileA:\n \
          %(prog)s fileA.csv fileB.csv -s\n"""),
)
# -a and -s are mutually exclusive: one run either adds or subtracts
group = parser.add_mutually_exclusive_group()
parser.add_argument(
    'fileA',
    help='primary csv file')
parser.add_argument(
    'fileB',
    help='secondary csv file')
group.add_argument(
    '-a', '--add',
    action='store_true',
    # NOTE(review): previous help said "from fileA into fileB", but the code
    # performs csv1 + csv2, which merges fileB's data into fileA
    help='add new rows and columns from fileB into fileA (default)')
parser.add_argument(
    '-d', '--delimiter',
    default=',',
    help='specify a delimiter other than the default comma')
parser.add_argument(
    '-q', '--remove_quotes',
    action='store_true',
    dest='quotes',
    help='remove all quotes. Required when one file is quoted and the other is not')
group.add_argument(
    '-s', '--subtract',
    action='store_true',
    help='remove lines in fileA that contain matching primary keys for fileB')
parser.add_argument(
    '-t', '--truncate',
    action='store_true',
    help='truncate everything after first "." from primary key(1st column). Used to remove domain names.')
parser.add_argument(
    '-u', '--update-only',
    action='store_true',
    help='only update existing rows in csv1, do NOT add new rows')
parser.add_argument(
    '-v', '--verbose',
    action='count',
    default=0,
    help='use one or more -v options to increase logging level. Maximum is 2.')
parser.add_argument(
    '-V', '--version',
    action='version',
    version='%(prog)s {}'.format(__version__))
def main():
    """Parse arguments, join or subtract the two csv files, print the result.

    Prints usage and exits with status 1 when called with no arguments;
    otherwise returns None (exit status 0).
    """
    if len(sys.argv) <= 1:          # no arguments at all: show usage and fail
        parser.print_help()
        sys.exit(1)                 # sys.exit: explicit, works without the site module
    args = parser.parse_args()
    # map the -v count onto a log level, clamping at DEBUG for two or more -v
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(len(log_levels) - 1, args.verbose)]
    log.setLevel(log_level)
    log.debug('{}'.format(args))
    # read both csv files; all parsing options apply to each file equally
    csv1 = JoinCSV(args.fileA, truncate=args.truncate, sep=args.delimiter, quotes=args.quotes, update_only=args.update_only)
    csv2 = JoinCSV(args.fileB, truncate=args.truncate, sep=args.delimiter, quotes=args.quotes, update_only=args.update_only)
    if args.subtract:
        csv3 = csv1 - csv2          # drop fileB's keys from fileA
    else:
        csv3 = csv1 + csv2          # merge fileB's rows/columns into fileA
    # __add__/__sub__ mutate csv1 in place, so fileA's original row count is
    # reconstructed from the recorded add/delete counters
    log.info("--- STATS ---")
    log.info("{} row(s) in {}".format(len(csv1) + csv1.del_rows - csv1.new_rows, csv1.filename))
    log.info("{} row(s) in {}".format(len(csv2), csv2.filename))
    log.info("{} row(s) in new file".format(len(csv1)))
    log.info("{} row(s) removed".format(csv1.del_rows))
    log.info("{} row(s) added".format(csv1.new_rows))
    log.info("{} row(s) updated".format(csv1.upd_rows))
    log.info("{} column(s) added\n".format(csv1.new_cols))
    if csv1.setAB:                  # keys present in fileA but not fileB
        log.warning("row(s) found in '{}' but NOT in '{}':\n{}\n".format(csv1.filename, csv2.filename, csv1.setAB))
    if csv1.setBA and not args.update_only:     # keys present in fileB but not fileA
        log.warning("row(s) found in '{}' but NOT in '{}':\n{}\n".format(csv2.filename, csv1.filename, csv1.setBA))
    print(csv3)                     # resulting csv file goes to stdout
# run only when executed as a script, not when imported as a module;
# main() returns None on success, which sys.exit treats as status 0
if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment