Skip to content

Instantly share code, notes, and snippets.

@JakeTRogers
Created August 27, 2018 15:43
Show Gist options
  • Save JakeTRogers/7f364d64451427657b021d440d71f106 to your computer and use it in GitHub Desktop.
Save JakeTRogers/7f364d64451427657b021d440d71f106 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""join csv data in much the way an excel vlookup would, but smarter"""
import argparse # argument parsing
import csv # parse CSV files
import logging # logging for file/email
import sys # used to exit script
from collections import OrderedDict # preserve dictionary order
from os.path import basename # get file name
from textwrap import dedent # used to wrap help text
__version__ = '0.6.3'
logging.basicConfig(level=logging.WARNING, format='%(levelname)s\t%(message)s')
log = logging.getLogger(__name__) # configure logging
class JoinCSV:
def __init__(self, path_to_file, quotes=False, sep=',', truncate=False, update_only=False):
self.data = dict()
self.fieldnames = []
self.filename = basename(path_to_file)
self.sep = sep
self.setAB = self.setBA = set()
self.del_rows = self.new_cols = self.new_rows = self.upd_rows = 0
self.update_only = update_only
with open(path_to_file, 'r') as csvfile: # read csv file
for line_num, row in enumerate(csv.reader(csvfile, delimiter=self.sep)):
if row:
if quotes: # remove all quotes from line
row = [ item.strip('"\'') for item in row ]
key = row.pop(0).lower()
if truncate: # truncate private key
key = key.split('.')[0].lower()
log.debug("truncated key to '{}'".format(key))
if line_num == 0: # parse csv header
self.primary_key = key.upper() # private key column name
self.fieldnames = [ item.upper() for item in row ]
log.debug("fieldnames: '{}, {}'".format(self.primary_key, ', '.join(self.fieldnames)))
continue
if not key.isspace(): # do not add blank lines
self.data[key] = OrderedDict(zip(self.fieldnames, row))
log.debug("file: '{}' read into dictionary".format(path_to_file))
def __add__(self, csv2):
self.setAB, self.setBA = self ^ csv2 # csv1 XOR csv2
for field in csv2.fieldnames: # loop over fields in csv2
if field not in self.fieldnames: # if field not in csv1
self.fieldnames.append(field) # add field to csv1
log.debug('added column: {}'.format(field))
self.new_cols += 1
for key in csv2.data.keys(): # update existing row w/ new column values
if key in self.data and csv2.data[key].get(field):
if self.data[key].get(field) and self.data[key][field] != csv2.data[key][field]:
self.data[key][field] = csv2.data[key][field]
log.debug("changed value of column '{}' for row '{}' from '{}' to '{}'".format(
field, key, self.data[key][field], csv2.data[key][field])
)
self.upd_rows += 1
elif not self.data[key].get(field): # add new column to existing row
self.data[key][field] = csv2.data[key][field]
log.debug("added column '{}' with value: '{}' for row '{}'".format(
field, csv2.data[key][field], key)
)
self.upd_rows += 1
elif key not in self.data and not self.update_only:
self.data[key] = csv2.data[key] # add new rows
log.debug('added row for {} with column, value pair(s): {}'.format(key, csv2.data[key].items()))
self.new_rows += 1
return self
def __len__(self):
return len(self.data)
def __str__(self):
return self.primary_key + "," + ",".join(self.fieldnames) + "\n" \
+ "\n".join([ ( key + ',' + self._dict_to_list(rowdict) ) for key, rowdict in sorted(self.data.items()) ])
def __sub__(self, csv2):
for key in csv2.data.keys():
if key in self.data.keys(): # remove keys in csv2 from csv1
log.debug('removed row: {} with data: {}'.format(key, self.data.pop(key).items()))
self.del_rows += 1
return self
def __xor__(self, csv2):
setAB = '\n'.join(set(self.data.keys()) - set(csv2.data.keys()))
setBA = '\n'.join(set(csv2.data.keys()) - set(self.data.keys()))
return setAB, setBA
def _dict_to_list(self, rowdict):
return ','.join((rowdict.get(key, '')) for key in self.fieldnames)
parser = argparse.ArgumentParser( # define cmd line arguments
formatter_class=argparse.RawDescriptionHelpFormatter,
description=dedent("""
%(prog)s will perform updates, addition, or subtraction on data from 2
csv files, based on a common 1st column. Output is sorted and the first
column is downcased to allow pairing of primary keys between files.
Think CLI-based automatic excel V-Lookups."""),
epilog=dedent("""
examples:\n \
add fileA to fileB:\n \
%(prog)s fileA.csv fileB.csv\n\n \
subtract fileB from fileA:\n \
%(prog)s fileA.csv fileB.csv -s\n"""),
)
group = parser.add_mutually_exclusive_group()
parser.add_argument(
'fileA',
help='primary csv file')
parser.add_argument(
'fileB',
help='secondary csv file')
group.add_argument(
'-a', '--add',
action='store_true',
help='add new rows and columns from fileA into fileB(default)')
parser.add_argument(
'-d', '--delimiter',
default=',',
help='specify a delimiter other than the default comma')
parser.add_argument(
'-q', '--remove_quotes',
action='store_true',
dest='quotes',
help='remove all quotes. Required when one file is quoted and the other is not')
group.add_argument(
'-s', '--subtract',
action='store_true',
help='remove lines in fileA that contain matching primary keys for fileB')
parser.add_argument(
'-t', '--truncate',
action='store_true',
help='truncate everything after first "." from primary key(1st column). Used to remove domain names.')
parser.add_argument(
'-u', '--update-only',
action='store_true',
help='only update existing rows in csv1, do NOT add new rows')
parser.add_argument(
'-v', '--verbose',
action='count',
default=0,
help='use one or more -v options to increase logging level. Maximum is 2.')
parser.add_argument(
'-V', '--version',
action='version',
version='%(prog)s {}'.format(__version__))
def main():
if len(sys.argv) <= 1: # if no args passed
parser.print_help() # print help
exit(1)
args = parser.parse_args() # create arguments object
log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
log_level = log_levels[min(len(log_levels)-1,args.verbose)]
log.setLevel(log_level) # set log level
log.debug('{}'.format(args)) # log arguments
# create JoinCSV objects
csv1 = JoinCSV(args.fileA, truncate=args.truncate, sep=args.delimiter, quotes=args.quotes, update_only=args.update_only)
csv2 = JoinCSV(args.fileB, truncate=args.truncate, sep=args.delimiter, quotes=args.quotes, update_only=args.update_only)
if args.subtract:
csv3 = csv1 - csv2 # subtract csv2 from csv1
else:
csv3 = csv1 + csv2 # add csv1 and csv2
log.info("--- STATS ---")
log.info("{} row(s) in {}".format(len(csv1) + csv1.del_rows - csv1.new_rows, csv1.filename))
log.info("{} row(s) in {}".format(len(csv2), csv2.filename))
log.info("{} row(s) in new file".format(len(csv1)))
log.info("{} row(s) removed".format(csv1.del_rows))
log.info("{} row(s) added".format(csv1.new_rows))
log.info("{} row(s) updated".format(csv1.upd_rows))
log.info("{} column(s) added\n".format(csv1.new_cols))
if csv1.setAB: # show files in csv1 but NOT in csv2
log.warning("row(s) found in '{}' but NOT in '{}':\n{}\n".format(csv1.filename, csv2.filename, csv1.setAB))
if csv1.setBA and not args.update_only: # show files in csv2 but NOT in csv1
log.warning("row(s) found in '{}' but NOT in '{}':\n{}\n".format(csv2.filename, csv1.filename, csv1.setBA))
print(csv3) # print resulting csv file
if __name__ == '__main__':
sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment