JakeTRogers/joincsvs.py

## joincsvs.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""join csv data in much the way an excel vlookup would, but smarter"""

import argparse                                            # argument parsing
import csv                                                 # parse CSV files
import logging                                             # logging for file/email
import sys                                                 # used to exit script
from collections import OrderedDict                        # preserve dictionary order
from os.path import basename                               # get file name
from textwrap import dedent                                # used to wrap help text

__version__ = '0.6.3'

logging.basicConfig(level=logging.WARNING, format='%(levelname)s\t%(message)s')
log = logging.getLogger(__name__)                          # configure logging

class JoinCSV:
    def __init__(self, path_to_file, quotes=False, sep=',', truncate=False, update_only=False):
        self.data = dict()
        self.fieldnames = []
        self.filename = basename(path_to_file)
        self.sep = sep
        self.setAB = self.setBA = set()
        self.del_rows = self.new_cols = self.new_rows = self.upd_rows = 0
        self.update_only = update_only
        with open(path_to_file, 'r') as csvfile:           # read csv file
            for line_num, row in enumerate(csv.reader(csvfile, delimiter=self.sep)):
                if row:
                    if quotes:                             # remove all quotes from line
                        row = [ item.strip('"\'') for item in row ]

                    key = row.pop(0).lower()
                    if truncate:                           # truncate private key
                        key = key.split('.')[0].lower()
                        log.debug("truncated key to '{}'".format(key))

                    if line_num == 0:                      # parse csv header
                        self.primary_key = key.upper()     # private key column name
                        self.fieldnames = [ item.upper() for item in row ]
                        log.debug("fieldnames: '{}, {}'".format(self.primary_key, ', '.join(self.fieldnames)))
                        continue

                    if not key.isspace():                  # do not add blank lines
                        self.data[key] = OrderedDict(zip(self.fieldnames, row))
            log.debug("file: '{}' read into dictionary".format(path_to_file))

    def __add__(self, csv2):
        self.setAB, self.setBA = self ^ csv2               # csv1 XOR csv2
        for field in csv2.fieldnames:                      # loop over fields in csv2
            if field not in self.fieldnames:               # if field not in csv1
                self.fieldnames.append(field)              # add field to csv1
                log.debug('added column: {}'.format(field))
                self.new_cols += 1

            for key in csv2.data.keys():                   # update existing row w/ new column values
                if key in self.data and csv2.data[key].get(field):
                    if self.data[key].get(field) and self.data[key][field] != csv2.data[key][field]:
                        self.data[key][field] = csv2.data[key][field]
                        log.debug("changed value of column '{}' for row '{}' from '{}' to '{}'".format(
                            field, key, self.data[key][field], csv2.data[key][field])
                        )
                        self.upd_rows += 1

                    elif not self.data[key].get(field):    # add new column to existing row
                        self.data[key][field] = csv2.data[key][field]
                        log.debug("added column '{}' with value: '{}' for row '{}'".format(
                            field, csv2.data[key][field], key)
                        )
                        self.upd_rows += 1

                elif key not in self.data and not self.update_only:
                    self.data[key] = csv2.data[key]        # add new rows
                    log.debug('added row for {} with column, value pair(s): {}'.format(key, csv2.data[key].items()))
                    self.new_rows += 1
        return self

    def __len__(self):
        return len(self.data)

    def __str__(self):
        return self.primary_key + "," + ",".join(self.fieldnames) + "\n" \
               + "\n".join([ ( key + ',' + self._dict_to_list(rowdict) ) for key, rowdict in sorted(self.data.items()) ])

    def __sub__(self, csv2):
        for key in csv2.data.keys():
            if key in self.data.keys():                    # remove keys in csv2 from csv1
                log.debug('removed row: {} with data: {}'.format(key, self.data.pop(key).items()))
                self.del_rows += 1
        return self

    def __xor__(self, csv2):
        setAB = '\n'.join(set(self.data.keys()) - set(csv2.data.keys()))
        setBA = '\n'.join(set(csv2.data.keys()) - set(self.data.keys()))
        return setAB, setBA

    def _dict_to_list(self, rowdict):
        return ','.join((rowdict.get(key, '')) for key in self.fieldnames)


parser = argparse.ArgumentParser(                          # define cmd line arguments
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=dedent("""
        %(prog)s will perform updates, addition, or subtraction on data from 2
        csv files, based on a common 1st column. Output is sorted and the first
        column is downcased to allow pairing of primary keys between files.
        Think CLI-based automatic excel V-Lookups."""),
    epilog=dedent("""
        examples:\n  \
        add fileA to fileB:\n    \
        %(prog)s fileA.csv fileB.csv\n\n  \
        subtract fileB from fileA:\n    \
        %(prog)s fileA.csv fileB.csv -s\n"""),
    )
group = parser.add_mutually_exclusive_group()
parser.add_argument(
    'fileA',
    help='primary csv file')
parser.add_argument(
    'fileB',
    help='secondary csv file')
group.add_argument(
    '-a', '--add',
    action='store_true',
    help='add new rows and columns from fileA into fileB(default)')
parser.add_argument(
    '-d', '--delimiter',
    default=',',
    help='specify a delimiter other than the default comma')
parser.add_argument(
    '-q', '--remove_quotes',
    action='store_true',
    dest='quotes',
    help='remove all quotes. Required when one file is quoted and the other is not')
group.add_argument(
    '-s', '--subtract',
    action='store_true',
    help='remove lines in fileA that contain matching primary keys for fileB')
parser.add_argument(
    '-t', '--truncate',
    action='store_true',
    help='truncate everything after first "." from primary key(1st column). Used to remove domain names.')
parser.add_argument(
    '-u', '--update-only',
    action='store_true',
    help='only update existing rows in csv1, do NOT add new rows')
parser.add_argument(
    '-v', '--verbose',
    action='count',
    default=0,
    help='use one or more -v options to increase logging level. Maximum is 2.')
parser.add_argument(
    '-V', '--version',
    action='version',
    version='%(prog)s {}'.format(__version__))


def main():
    if len(sys.argv) <= 1:                                 # if no args passed
        parser.print_help()                                # print help
        exit(1)

    args = parser.parse_args()                             # create arguments object

    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(len(log_levels)-1,args.verbose)]
    log.setLevel(log_level)                                # set log level

    log.debug('{}'.format(args))                           # log arguments
                                                           # create JoinCSV objects
    csv1 = JoinCSV(args.fileA, truncate=args.truncate, sep=args.delimiter, quotes=args.quotes, update_only=args.update_only)
    csv2 = JoinCSV(args.fileB, truncate=args.truncate, sep=args.delimiter, quotes=args.quotes, update_only=args.update_only)

    if args.subtract:
        csv3 = csv1 - csv2                                 # subtract csv2 from csv1
    else:
        csv3 = csv1 + csv2                                 # add csv1 and csv2

    log.info("--- STATS ---")
    log.info("{} row(s) in {}".format(len(csv1) + csv1.del_rows - csv1.new_rows, csv1.filename))
    log.info("{} row(s) in {}".format(len(csv2), csv2.filename))
    log.info("{} row(s) in new file".format(len(csv1)))
    log.info("{} row(s) removed".format(csv1.del_rows))
    log.info("{} row(s) added".format(csv1.new_rows))
    log.info("{} row(s) updated".format(csv1.upd_rows))
    log.info("{} column(s) added\n".format(csv1.new_cols))

    if csv1.setAB:                                         # show files in csv1 but NOT in csv2
        log.warning("row(s) found in '{}' but NOT in '{}':\n{}\n".format(csv1.filename, csv2.filename, csv1.setAB))
    if csv1.setBA and not args.update_only:                # show files in csv2 but NOT in csv1
        log.warning("row(s) found in '{}' but NOT in '{}':\n{}\n".format(csv2.filename, csv1.filename, csv1.setBA))

    print(csv3)                                            # print resulting csv file

if __name__ == '__main__':
    sys.exit(main())
	#!/usr/bin/env python
	# -- coding: utf-8 --

	"""join csv data in much the way an excel vlookup would, but smarter"""

	import argparse # argument parsing
	import csv # parse CSV files
	import logging # logging for file/email
	import sys # used to exit script
	from collections import OrderedDict # preserve dictionary order
	from os.path import basename # get file name
	from textwrap import dedent # used to wrap help text

	__version__ = '0.6.3'

	logging.basicConfig(level=logging.WARNING, format='%(levelname)s\t%(message)s')
	log = logging.getLogger(__name__) # configure logging

	class JoinCSV:
	def __init__(self, path_to_file, quotes=False, sep=',', truncate=False, update_only=False):
	self.data = dict()
	self.fieldnames = []
	self.filename = basename(path_to_file)
	self.sep = sep
	self.setAB = self.setBA = set()
	self.del_rows = self.new_cols = self.new_rows = self.upd_rows = 0
	self.update_only = update_only
	with open(path_to_file, 'r') as csvfile: # read csv file
	for line_num, row in enumerate(csv.reader(csvfile, delimiter=self.sep)):
	if row:
	if quotes: # remove all quotes from line
	row = [ item.strip('"\'') for item in row ]

	key = row.pop(0).lower()
	if truncate: # truncate private key
	key = key.split('.')[0].lower()
	log.debug("truncated key to '{}'".format(key))

	if line_num == 0: # parse csv header
	self.primary_key = key.upper() # private key column name
	self.fieldnames = [ item.upper() for item in row ]
	log.debug("fieldnames: '{}, {}'".format(self.primary_key, ', '.join(self.fieldnames)))
	continue

	if not key.isspace(): # do not add blank lines
	self.data[key] = OrderedDict(zip(self.fieldnames, row))
	log.debug("file: '{}' read into dictionary".format(path_to_file))

	def __add__(self, csv2):
	self.setAB, self.setBA = self ^ csv2 # csv1 XOR csv2
	for field in csv2.fieldnames: # loop over fields in csv2
	if field not in self.fieldnames: # if field not in csv1
	self.fieldnames.append(field) # add field to csv1
	log.debug('added column: {}'.format(field))
	self.new_cols += 1

	for key in csv2.data.keys(): # update existing row w/ new column values
	if key in self.data and csv2.data[key].get(field):
	if self.data[key].get(field) and self.data[key][field] != csv2.data[key][field]:
	self.data[key][field] = csv2.data[key][field]
	log.debug("changed value of column '{}' for row '{}' from '{}' to '{}'".format(
	field, key, self.data[key][field], csv2.data[key][field])
	)
	self.upd_rows += 1

	elif not self.data[key].get(field): # add new column to existing row
	self.data[key][field] = csv2.data[key][field]
	log.debug("added column '{}' with value: '{}' for row '{}'".format(
	field, csv2.data[key][field], key)
	)
	self.upd_rows += 1

	elif key not in self.data and not self.update_only:
	self.data[key] = csv2.data[key] # add new rows
	log.debug('added row for {} with column, value pair(s): {}'.format(key, csv2.data[key].items()))
	self.new_rows += 1
	return self

	def __len__(self):
	return len(self.data)

	def __str__(self):
	return self.primary_key + "," + ",".join(self.fieldnames) + "\n" \
	+ "\n".join([ ( key + ',' + self._dict_to_list(rowdict) ) for key, rowdict in sorted(self.data.items()) ])

	def __sub__(self, csv2):
	for key in csv2.data.keys():
	if key in self.data.keys(): # remove keys in csv2 from csv1
	log.debug('removed row: {} with data: {}'.format(key, self.data.pop(key).items()))
	self.del_rows += 1
	return self

	def __xor__(self, csv2):
	setAB = '\n'.join(set(self.data.keys()) - set(csv2.data.keys()))
	setBA = '\n'.join(set(csv2.data.keys()) - set(self.data.keys()))
	return setAB, setBA

	def _dict_to_list(self, rowdict):
	return ','.join((rowdict.get(key, '')) for key in self.fieldnames)


	parser = argparse.ArgumentParser( # define cmd line arguments
	formatter_class=argparse.RawDescriptionHelpFormatter,
	description=dedent("""
	%(prog)s will perform updates, addition, or subtraction on data from 2
	csv files, based on a common 1st column. Output is sorted and the first
	column is downcased to allow pairing of primary keys between files.
	Think CLI-based automatic excel V-Lookups."""),
	epilog=dedent("""
	examples:\n \
	add fileA to fileB:\n \
	%(prog)s fileA.csv fileB.csv\n\n \
	subtract fileB from fileA:\n \
	%(prog)s fileA.csv fileB.csv -s\n"""),
	)
	group = parser.add_mutually_exclusive_group()
	parser.add_argument(
	'fileA',
	help='primary csv file')
	parser.add_argument(
	'fileB',
	help='secondary csv file')
	group.add_argument(
	'-a', '--add',
	action='store_true',
	help='add new rows and columns from fileA into fileB(default)')
	parser.add_argument(
	'-d', '--delimiter',
	default=',',
	help='specify a delimiter other than the default comma')
	parser.add_argument(
	'-q', '--remove_quotes',
	action='store_true',
	dest='quotes',
	help='remove all quotes. Required when one file is quoted and the other is not')
	group.add_argument(
	'-s', '--subtract',
	action='store_true',
	help='remove lines in fileA that contain matching primary keys for fileB')
	parser.add_argument(
	'-t', '--truncate',
	action='store_true',
	help='truncate everything after first "." from primary key(1st column). Used to remove domain names.')
	parser.add_argument(
	'-u', '--update-only',
	action='store_true',
	help='only update existing rows in csv1, do NOT add new rows')
	parser.add_argument(
	'-v', '--verbose',
	action='count',
	default=0,
	help='use one or more -v options to increase logging level. Maximum is 2.')
	parser.add_argument(
	'-V', '--version',
	action='version',
	version='%(prog)s {}'.format(__version__))


	def main():
	if len(sys.argv) <= 1: # if no args passed
	parser.print_help() # print help
	exit(1)

	args = parser.parse_args() # create arguments object

	log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
	log_level = log_levels[min(len(log_levels)-1,args.verbose)]
	log.setLevel(log_level) # set log level

	log.debug('{}'.format(args)) # log arguments
	# create JoinCSV objects
	csv1 = JoinCSV(args.fileA, truncate=args.truncate, sep=args.delimiter, quotes=args.quotes, update_only=args.update_only)
	csv2 = JoinCSV(args.fileB, truncate=args.truncate, sep=args.delimiter, quotes=args.quotes, update_only=args.update_only)

	if args.subtract:
	csv3 = csv1 - csv2 # subtract csv2 from csv1
	else:
	csv3 = csv1 + csv2 # add csv1 and csv2

	log.info("--- STATS ---")
	log.info("{} row(s) in {}".format(len(csv1) + csv1.del_rows - csv1.new_rows, csv1.filename))
	log.info("{} row(s) in {}".format(len(csv2), csv2.filename))
	log.info("{} row(s) in new file".format(len(csv1)))
	log.info("{} row(s) removed".format(csv1.del_rows))
	log.info("{} row(s) added".format(csv1.new_rows))
	log.info("{} row(s) updated".format(csv1.upd_rows))
	log.info("{} column(s) added\n".format(csv1.new_cols))

	if csv1.setAB: # show files in csv1 but NOT in csv2
	log.warning("row(s) found in '{}' but NOT in '{}':\n{}\n".format(csv1.filename, csv2.filename, csv1.setAB))
	if csv1.setBA and not args.update_only: # show files in csv2 but NOT in csv1
	log.warning("row(s) found in '{}' but NOT in '{}':\n{}\n".format(csv2.filename, csv1.filename, csv1.setBA))

	print(csv3) # print resulting csv file

	if __name__ == '__main__':
	sys.exit(main())