Last active
August 29, 2015 13:57
-
-
Save JimHaughwout/9713523 to your computer and use it in GitHub Desktop.
Describe details of a CSV file, providing a preview. Useful as a first step in cleaning really large CSV files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
"""
Describes a CSV file. Recommended for files saved in Windows CSV format.

Useful for situations where you need to get some basic info on a huge CSV file
(logs, sensor data, etc.)

: param source : csv_file you want to describe. Must end in .csv
: optional param preview_size : number of rows to print raw data as preview

Result printed to screen:
    First N lines if preview specified
    Then: file name, size, row and column count along with
    numbered list of header items along with sample data

The code sticks to the common subset of Python 2.6+ and Python 3 (single
argument print(...) calls, next(), enumerate) so it runs under either.
"""
import sys
import csv
import os
from sys import argv

# Flags that print the usage line and exit successfully.
HELP_FLAGS = ('-h', '-help', '--help')


def open_csv(path):
    """Open *path* in the mode the csv module expects on this interpreter.

    Python 3's csv module wants a text file opened with newline='' so that
    it can handle embedded line breaks itself; Python 2's wants binary mode.
    Raises IOError/OSError if the file cannot be opened.
    """
    if sys.version_info[0] >= 3:
        return open(path, 'r', newline='')
    return open(path, 'rb')


def parse_preview_size(raw):
    """Return *raw* as an int, or 0 (no preview) if it is not an integer."""
    try:
        return int(raw)
    except ValueError:
        print("Preview size %r is not an integer, skipping preview." % raw)
        return 0


def scan_csv(reader, preview_size=0):
    """Walk *reader* once, printing the preview and collecting basic facts.

    The only way to get a row count is to read every row, so the same pass
    is used to print the first *preview_size* rows (header included) and to
    remember the header and the first data row.

    Returns a (header, sample_data, row_count) tuple. header and sample_data
    are empty lists when the file has fewer than one / two rows; row_count
    includes the header row.
    """
    header = []
    sample_data = []
    row_count = 0
    if preview_size > 0:
        print("\nPREVIEW: UP TO FIRST %d ROW(S) OF DATA, IF THEY EXIST" %
              preview_size)
    for row in reader:
        row_count += 1
        if row_count == 1:
            header = row
        elif row_count == 2:
            sample_data = row
        if row_count <= preview_size:
            print(row)
    return header, sample_data, row_count


def print_report(source_filename, file_size, row_count, header, sample_data):
    """Print the human readable summary of the scanned file."""
    print("\nFILENAME: %r" % source_filename)
    print("SIZE: %d bytes" % file_size)
    print("ROW COUNT: %d (including header)" % row_count)
    print("COLUMN COUNT: %d" % len(header))
    print("\nWHAT THE DATA IS LIKE:")
    for number, name in enumerate(header, 1):
        # The sample row may be missing or shorter than the header; show
        # None for those columns instead of failing with an IndexError.
        if number <= len(sample_data):
            sample = sample_data[number - 1]
        else:
            sample = None
        print("Column %d: %-24s\tSample: %r" % (number, name, sample))
    print("")


def main(args):
    """Run the tool on an argv-style list: [program, source.csv, [preview]].

    Exits through sys.exit() with a message on bad usage or when the source
    file cannot be opened; returns None after printing the report.
    """
    # Check usage and provide help
    if not (1 < len(args) < 4):
        usage = "Usage: %s source-file.csv optional-num-rows-to-preview" % \
            args[0]
        # args[0] is the program name, so it is not counted as an argument.
        error = "You passed %d argument(s)." % (len(args) - 1)
        sys.exit("%s -- %s" % (usage, error))

    if args[1] in HELP_FLAGS:
        print("Usage: %s source-file.csv optional-num-rows-to-preview" %
              args[0])
        sys.exit()

    source_filename = args[1]
    # The documented contract is that the name ends in .csv (any case).
    if not source_filename.lower().endswith('.csv'):
        usage = "Usage: %s source-file.csv" % args[0]
        error = "You passed %r for source-file.csv" % source_filename
        sys.exit("%s -- %s" % (usage, error))

    if len(args) == 3:
        preview_size = parse_preview_size(args[2])
    else:
        preview_size = 0  # No preview

    # Open the file
    try:
        source = open_csv(source_filename)
    except (IOError, OSError) as err:
        sys.exit("Error - Could not open input file %r: %s" %
                 (source_filename, err))

    # The with-block guarantees the file is closed even if parsing fails.
    with source:
        file_size = os.path.getsize(source_filename)
        header, sample_data, row_count = scan_csv(csv.reader(source),
                                                  preview_size)

    # Print in nice human readable format
    print_report(source_filename, file_size, row_count, header, sample_data)


if __name__ == "__main__":
    main(argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment