rwev/csvp.py

## csvp.py
'''
CSVP.PY: Comma-Separated Value Preview

Python script for previewing large CSV datasets without loading into memory.

Author: rwev (https://github.com/rwev)

Built-in dependencies only.

Provides a preview of CSV data, including header fields and a list of unique values for each, from the first N lines of the target file.

USAGE:
    > python csvp.py /<relative>/<filepath>/<filename>.csv <N-number-lines-to-preview>

Solution statement:
    Large datasets are slow to open (load into memory) for viewing and manipulation.
    When trying to complete an analysis project, it is tedious to repeatedly open and reload
    the dataset to extract small pieces of information, such as the column labels and the format of corresponding entries.

    Consider a generic CSV dataset approximately 1 GB in size.
    Opening the file in Microsoft Excel takes 2 minutes and 55 seconds.
    Notepad is marginally faster, but presents information in a disorderly format,
    where one row is not aligned with the next, making it very difficult to observe the entries in each column.
    And this is with a computer of relatively powerful specifications (Intel Core i7; 16.0 GB).

    CSVP.PY attempts to resolve this by quickly providing a preview of header and value information.

    Getting a preview of the data, including the headers and unique entries in the first N lines (where N is a command-line parameter given by user that defaults to 10)
    is completed by CSVP.PY in a matter of seconds.

    In the case that the default preview of column values isn't enough, the number of lines to parse can be increased to raise the likelihood of discovering new unique values.
'''

import sys
import os
import csv
from time import time
from io import StringIO

SHOW_COUNT = 5  # maximum number of unique values displayed per column
DEFAULT_PREVIEW_COUNT = 10  # rows previewed when no valid count is supplied


def _open_csv(path):
    '''Open *path* in the mode the csv module expects on this interpreter.'''
    if sys.version_info[0] >= 3:
        # Python 3: text mode with newline='' so the csv module handles line
        # endings (including newlines inside quoted fields) itself.
        return open(path, mode='r', newline='')
    # Python 2: the csv module expects a file opened in binary mode.
    return open(path, mode='rb')


def _unique_in_order(values):
    '''Return the distinct items of *values*, keeping first-seen order.'''
    seen = set()
    uniques = []
    for value in values:
        if value not in seen:
            seen.add(value)
            uniques.append(value)
    return uniques


def read_preview(path, line_count):
    '''
    Read the header and at most *line_count* data records from a CSV file.

    The header is parsed by the csv module as well, so quoted header names
    are handled and the last field carries no trailing newline. Only the
    requested number of records is read; the rest of the file is untouched.

    Returns a tuple (fields, rows): the header as a list of strings and the
    data records as a list of lists. Blank lines are skipped. An empty file
    yields ([], []).
    '''
    from itertools import islice  # local import keeps this block self-contained

    with _open_csv(path) as handle:
        reader = csv.reader(handle)
        fields = next(reader, [])
        rows = [row for row in islice(reader, line_count) if row]
    return fields, rows


def summarize_columns(fields, rows, show_count=SHOW_COUNT):
    '''
    Build the per-column preview text.

    Returns a list of (field, text) tuples where text holds up to
    *show_count* unique values of that column, in the order they first
    appear, joined by ', ' and followed by '...' when more unique values
    exist than are shown.

    Raises ValueError when the number of header fields differs from the
    number of data columns.
    '''
    if rows:
        # zip() stops at the shortest row, so a short row reduces the column
        # count and is reported as a header/data mismatch below.
        columns = [list(column) for column in zip(*rows)]
    else:
        # Header-only input: every field simply has no values to show.
        columns = [[] for _ in fields]

    if len(fields) != len(columns):
        raise ValueError('Number of headers does not match columns of data.')

    summary = []
    for field, column in zip(fields, columns):
        uniques = _unique_in_order(column)
        text = ', '.join(uniques[:show_count])
        if len(uniques) > show_count:
            text += '...'
        summary.append((field, text))
    return summary


def main(argv=None):
    '''
    Command-line entry point: csvp.py <filepath> [<N-lines-to-preview>].

    *argv* defaults to sys.argv. A missing or non-integer count falls back
    to DEFAULT_PREVIEW_COUNT; a negative count is made positive.

    Returns the process exit status: 0 on success, 1 when the arguments or
    the file contents are unusable.
    '''
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        print('USAGE: python csvp.py <filepath>.csv [<N-number-lines-to-preview>]')
        return 1

    fn = argv[1]
    if not os.path.isfile(fn):
        print('Invalid filepath %s' % fn)
        return 1

    try:
        prevcount = abs(int(argv[2]))
    except (IndexError, ValueError):
        prevcount = DEFAULT_PREVIEW_COUNT

    starttime = time()

    # No newline here: the status ('read N lines.' or the error) completes
    # this line. Message spacing below is kept identical to the historical
    # output of the script.
    sys.stdout.write('Opening %s ... ' % fn)
    fields, rows = read_preview(fn, prevcount)

    try:
        summary = summarize_columns(fields, rows)
    except ValueError:
        print('ERROR: Number of headers does not match columns of data. %s' % fn)
        return 1

    # Report what was actually read; the file may be shorter than requested.
    print('read %d lines.' % len(rows))

    print('CSV Preview: %s' % fn)
    print('Showing first %d unique values detected in the first  %d lines.'
          % (SHOW_COUNT, len(rows)))

    for field, valsstr in summary:
        print('\t%s :\t%s' % (field, valsstr))

    endtime = time()
    print('DONE in %s  seconds.' % str(round(endtime - starttime, 2)))
    return 0


if __name__ == '__main__':
    sys.exit(main())
	'''
	CSVP.PY: Comma-Separated Value Preview

	Python script for previewing large CSV datasets without loading into memory.

	Author: rwev (https://github.com/rwev)

	Built-in dependencies only.

	Provides a preview of CSV data, including header fields and a list of unique values for each, from the first N lines of the target file.

	USAGE:
	> python csvp.py /<relative>/<filepath>/<filename>.csv <N-number-lines-to-preview>

	Solution statement:
	Large datasets are slow to open (load into memory) for viewing and manipulation.
	When trying to complete an analysis project, it is tedious to repeatedly open and reload
	the dataset to extract small pieces of information, such as the column labels and the format of corresponding entries.

	Consider a generic CSV dataset approximately 1 GB in size.
	Opening the file in Microsoft Excel takes 2 minutes and 55 seconds.
	Notepad is marginally faster, but presents information in a disorderly format,
	where one row is not aligned with the next, making it very difficult to observe the entries in each column.
	And this is with a computer of relatively powerful specifications (Intel Core i7; 16.0 GB).

	CSVP.PY attempts to resolve this by quickly providing a preview of header and value information.

	Getting a preview of the data, including the headers and unique entries in the first N lines (where N is a command-line parameter given by user that defaults to 10)
	is completed by CSVP.PY in a matter of seconds.

	In the case that the default preview of column values isn't enough, the number of lines to parse can be increased to raise the likelihood of discovering new unique values.
	'''

	import sys
	import os
	import csv
	from time import time
	from io import StringIO

	# NOTE(review): this second copy of the script had lost its indentation; the structure is restored below.
SHOW_COUNT = 5  # maximum number of unique values displayed per column
DEFAULT_PREVIEW_COUNT = 10  # rows previewed when no valid count is supplied


def _open_csv(path):
    '''Open *path* in the mode the csv module expects on this interpreter.'''
    if sys.version_info[0] >= 3:
        # Python 3: text mode with newline='' so the csv module handles line
        # endings (including newlines inside quoted fields) itself.
        return open(path, mode='r', newline='')
    # Python 2: the csv module expects a file opened in binary mode.
    return open(path, mode='rb')


def _unique_in_order(values):
    '''Return the distinct items of *values*, keeping first-seen order.'''
    seen = set()
    uniques = []
    for value in values:
        if value not in seen:
            seen.add(value)
            uniques.append(value)
    return uniques


def read_preview(path, line_count):
    '''
    Read the header and at most *line_count* data records from a CSV file.

    The header is parsed by the csv module as well, so quoted header names
    are handled and the last field carries no trailing newline. Only the
    requested number of records is read; the rest of the file is untouched.

    Returns a tuple (fields, rows): the header as a list of strings and the
    data records as a list of lists. Blank lines are skipped. An empty file
    yields ([], []).
    '''
    from itertools import islice  # local import keeps this block self-contained

    with _open_csv(path) as handle:
        reader = csv.reader(handle)
        fields = next(reader, [])
        rows = [row for row in islice(reader, line_count) if row]
    return fields, rows


def summarize_columns(fields, rows, show_count=SHOW_COUNT):
    '''
    Build the per-column preview text.

    Returns a list of (field, text) tuples where text holds up to
    *show_count* unique values of that column, in the order they first
    appear, joined by ', ' and followed by '...' when more unique values
    exist than are shown.

    Raises ValueError when the number of header fields differs from the
    number of data columns.
    '''
    if rows:
        # zip() stops at the shortest row, so a short row reduces the column
        # count and is reported as a header/data mismatch below.
        columns = [list(column) for column in zip(*rows)]
    else:
        # Header-only input: every field simply has no values to show.
        columns = [[] for _ in fields]

    if len(fields) != len(columns):
        raise ValueError('Number of headers does not match columns of data.')

    summary = []
    for field, column in zip(fields, columns):
        uniques = _unique_in_order(column)
        text = ', '.join(uniques[:show_count])
        if len(uniques) > show_count:
            text += '...'
        summary.append((field, text))
    return summary


def main(argv=None):
    '''
    Command-line entry point: csvp.py <filepath> [<N-lines-to-preview>].

    *argv* defaults to sys.argv. A missing or non-integer count falls back
    to DEFAULT_PREVIEW_COUNT; a negative count is made positive.

    Returns the process exit status: 0 on success, 1 when the arguments or
    the file contents are unusable.
    '''
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        print('USAGE: python csvp.py <filepath>.csv [<N-number-lines-to-preview>]')
        return 1

    fn = argv[1]
    if not os.path.isfile(fn):
        print('Invalid filepath %s' % fn)
        return 1

    try:
        prevcount = abs(int(argv[2]))
    except (IndexError, ValueError):
        prevcount = DEFAULT_PREVIEW_COUNT

    starttime = time()

    # No newline here: the status ('read N lines.' or the error) completes
    # this line. Message spacing below is kept identical to the historical
    # output of the script.
    sys.stdout.write('Opening %s ... ' % fn)
    fields, rows = read_preview(fn, prevcount)

    try:
        summary = summarize_columns(fields, rows)
    except ValueError:
        print('ERROR: Number of headers does not match columns of data. %s' % fn)
        return 1

    # Report what was actually read; the file may be shorter than requested.
    print('read %d lines.' % len(rows))

    print('CSV Preview: %s' % fn)
    print('Showing first %d unique values detected in the first  %d lines.'
          % (SHOW_COUNT, len(rows)))

    for field, valsstr in summary:
        print('\t%s :\t%s' % (field, valsstr))

    endtime = time()
    print('DONE in %s  seconds.' % str(round(endtime - starttime, 2)))
    return 0


if __name__ == '__main__':
    sys.exit(main())