Skip to content

Instantly share code, notes, and snippets.

@JimHaughwout
Last active August 29, 2015 13:57
Show Gist options
  • Save JimHaughwout/9713523 to your computer and use it in GitHub Desktop.
Save JimHaughwout/9713523 to your computer and use it in GitHub Desktop.
Describe details of a CSV file, providing a preview. Useful as a first step in cleaning really large CSV files
#! /usr/bin/env python
"""
Describes a CSV file. Recommended for files saved in Windows CSV format.
Useful for situations where you need to get some basic info on a huge CSV file
(logs, sensor data, etc.)
: param source : csv_file you want to describe. Must end in .csv
: optional param preview_size : number of rows to print raw data as preview
Result printed to screen:
First N lines if preview specified
Then: file name, size, row and column count, along with
a numbered list of header items, each with sample data
"""
import sys
import csv
import os
from sys import argv
# Usage line shown for bad argument counts and for the help flags.
USAGE = "Usage: %s source-file.csv optional-num-rows-to-preview"


def _open_csv(path):
    """Open *path* in the mode the csv module expects on this interpreter.

    Python 3's csv module wants a text file opened with newline='' so that
    newlines embedded in quoted fields are preserved; Python 2's csv module
    wants the file opened in binary mode.

    Raises IOError/OSError if the file cannot be opened.
    """
    if sys.version_info[0] >= 3:
        return open(path, 'r', newline='')
    return open(path, 'rb')


def scan_csv(source_filename, preview_size=0, out=None):
    """Read a CSV file once, counting rows and optionally echoing a preview.

    : param source_filename : path of the CSV file to read
    : param preview_size : number of leading rows (header included) to write
      to `out`; values <= 0 write no preview at all
    : param out : object with a write() method; defaults to sys.stdout
    : returns : (header, sample_data, row_count) where header is the first
      row, sample_data is the second row, and row_count includes the header.
      header and/or sample_data are [] when the file has no such row.
    : raises : IOError/OSError if the file cannot be opened
    """
    if out is None:
        out = sys.stdout
    header = []
    sample_data = []
    row_count = 0
    # The context manager guarantees the file is closed even if parsing fails.
    with _open_csv(source_filename) as source:
        if preview_size > 0:
            out.write(
                "\nPREVIEW: UP TO FIRST %d ROW(S) OF DATA, IF THEY EXIST\n"
                % preview_size)
        # Counting rows requires walking the whole file, so the header,
        # the sample row and the preview are all collected in this one pass.
        for row in csv.reader(source):
            row_count += 1
            if row_count == 1:
                header = row
            elif row_count == 2:
                sample_data = row
            if row_count <= preview_size:
                out.write("%s\n" % (row,))
    return header, sample_data, row_count


def format_report(source_filename, file_size, header, sample_data, row_count):
    """Build the human-readable description as a list of output lines.

    One "Column N" line is produced per header cell. When the sample row has
    fewer cells than the header (or there is no sample row), the sample for
    the missing cells is reported as None instead of raising IndexError.
    """
    lines = [
        "\nFILENAME: %r" % source_filename,
        "SIZE: %d bytes" % file_size,
        "ROW COUNT: %d (including header)" % row_count,
        "COLUMN COUNT: %d" % len(header),
        "\nWHAT THE DATA IS LIKE:",
    ]
    for index, name in enumerate(header):
        sample = sample_data[index] if index < len(sample_data) else None
        lines.append("Column %d: %-24s\tSample: %r" % (index + 1, name, sample))
    lines.append("")  # trailing blank line after the column list
    return lines


def main(args=None):
    """Command-line entry point.

    : param args : argument list shaped like sys.argv (program name first);
      defaults to sys.argv
    Exits via sys.exit() with a message on bad usage or an unreadable file.
    """
    if args is None:
        args = argv
    prog = args[0] if args else "describe_csv"

    # Check usage and provide help
    if not (1 < len(args) < 4):
        # Do not count the program name as a user-supplied argument.
        error = "You passed %d argument(s)." % max(len(args) - 1, 0)
        sys.exit("%s -- %s" % (USAGE % prog, error))
    if args[1] in ('-h', '-help', '--help'):
        print(USAGE % prog)
        sys.exit()

    source_filename = args[1]
    # The documented contract is that the name must END in .csv, so test the
    # suffix rather than a substring match anywhere in the path.
    if not source_filename.lower().endswith('.csv'):
        usage = "Usage: %s source-file.csv" % prog
        error = "You passed %r for source-file.csv" % source_filename
        sys.exit("%s -- %s" % (usage, error))

    preview_size = 0  # No preview unless a valid size is given
    if len(args) == 3:
        try:
            preview_size = int(args[2])
        except ValueError:
            print("Preview size %r is not an integer, skipping preview."
                  % args[2])

    # Get basic file info; report the actual error message on failure.
    try:
        file_size = os.path.getsize(source_filename)
        header, sample_data, row_count = scan_csv(source_filename,
                                                  preview_size)
    except (IOError, OSError) as e:
        sys.exit("Error - Could not open input file %r: %s"
                 % (source_filename, e))

    # Print in nice human readable format
    for line in format_report(source_filename, file_size, header,
                              sample_data, row_count):
        print(line)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment