Skip to content

Instantly share code, notes, and snippets.

@hoffmanc
Created June 20, 2011 13:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hoffmanc/1035593 to your computer and use it in GitHub Desktop.
Save hoffmanc/1035593 to your computer and use it in GitHub Desktop.
WGA-7
# Chris Hoffman
# 2011-6-17
# Description: takes a directory of '|'-delimited text files and bulk-loads each one into a SQL Server table.
import pyodbc
import sys
import csv
import argparse
import glob
import os
import time
# Connection settings: fill in SERVER/DB before running (see --tname help).
SERVER = ""
DB = ""
# Default glob pattern for input files (overridable with -f/--file_ptn).
FILE_PTN = "*.asc"
# ODBC connection string for SQL Server using Windows integrated auth.
# FIX: the original used "Integrated Security=true;Trusted Connection=true",
# which are not valid SQL Server ODBC keywords ("Integrated Security" is
# ADO.NET syntax and "Trusted Connection" is missing its underscore);
# the correct ODBC keyword is "Trusted_Connection=yes".
CONN_STR = "DRIVER={SQL Server};SERVER=%s;DATABASE=%s;Trusted_Connection=yes" % (SERVER, DB)
def get_args():
    """Parse command-line arguments for the CSV-to-DB loader.

    Returns:
        argparse.Namespace with attributes: directory, tname, d (delimiter),
        maxcols, file_ptn.
    """
    parser = argparse.ArgumentParser(description='CSV => DB Table.')
    parser.add_argument('directory', metavar='dir', type=str,
                        help="directory to search for files of the form %s" % FILE_PTN)
    parser.add_argument('-t', '--tname', type=str,
                        help="output table name (edit source for server/db)")
    # FIX: the help text promised a default of '|' but no default was set,
    # leaving args.d as None; make the documented default real.
    parser.add_argument('-d', metavar='delimiter', type=str, default='|',
                        help="field delimiter (default: '|')")
    parser.add_argument('-m', '--maxcols', metavar='max_cols', type=int, default=None,
                        help="maximum columns in files (default: derived)")
    # FIX: "patern" -> "pattern" typo in help text.
    parser.add_argument('-f', '--file_ptn', type=str, default=None,
                        help="file name pattern (default: *.asc)")
    args = parser.parse_args()
    return args
def file_list(srchdir, fileptn):
    """Return every path in *srchdir* matching glob pattern *fileptn*.

    Raises:
        Exception: when no file matches the pattern.
    """
    matches = glob.glob(os.path.join(srchdir, fileptn))
    if matches:
        return matches
    raise Exception("no files found matching pattern '%s' in %s" % (fileptn, srchdir))
def get_max_cols(files):
    """Scan every file in *files* and return the largest column count seen.

    Column count per line is determined by num_cols(). Prints a message each
    time a new maximum is found.

    FIXES vs original: num_cols() was evaluated twice per line (hoisted to a
    local); the file handle leaked if an exception escaped mid-read (now a
    context manager); the readline() loop is replaced with direct iteration.
    """
    maxcols = 0
    for path in files:
        with open(path, "rb") as fh:
            for line in fh:
                n = num_cols(line)
                if n > maxcols:
                    maxcols = n
                    # Single-argument print behaves identically under the
                    # original's Python 2 print statement.
                    print("Max Column Count Found: %s" % maxcols)
    return maxcols
def num_cols(line):
    """Return the number of '|'-delimited fields in *line*, plus one.

    NOTE(review): the +1 yields one more than the actual field count —
    presumably deliberate headroom for an extra column; confirm intent.
    """
    fields = line.split('|')
    return 1 + len(fields)
def create_table(cursor, tname, max_cols):
    """Create table *tname* if it does not already exist.

    Schema: rownum int, fname nvarchar(max), and value columns
    v0..v{max_cols-1}, all nvarchar(max).
    """
    value_cols = ",".join("v%s nvarchar(max)" % i for i in range(max_cols))
    sql = """
if object_id('%s') is null
begin
create table %s (
rownum int not null,
fname nvarchar(max) not null,
%s)
end""" % (tname, tname, value_cols)
    cursor.execute(sql)
def get_total_lines(fname):
    """Return the number of lines in file *fname*.

    FIX vs original: the file handle was opened and never closed (leak);
    a context manager now guarantees closure, and the manual readline()
    counting loop is replaced with a sum over the file iterator.
    """
    with open(fname, "rb") as fh:
        return sum(1 for _ in fh)
def insert_rows(conn, cursor, tname, max_cols, fname):
    """Load every line of *fname* into table *tname*, one row per line.

    Each line is split on '|'; missing or empty fields are stored as NULL.
    Rows are numbered from 0 in file order and tagged with the cleaned file
    name. Progress is printed and a commit issued roughly every 4% of the
    file (25 chunks).

    Args:
        conn: open pyodbc connection (used for periodic commits).
        cursor: cursor on *conn*.
        tname: destination table name (must already exist).
        max_cols: number of v* value columns in the table.
        fname: path of the '|'-delimited input file.

    FIXES vs original: field values were string-interpolated into the SQL
    (broke on embedded quotes; injection hazard) — now a parameterized
    statement; total_lines // 25 was 0 for files under 25 lines, crashing
    the progress modulo — now clamped to 1; the file handle leaked — now a
    context manager; the dead `cols[i] != None` check is dropped
    (str.split never yields None).
    """
    total_lines = get_total_lines(fname)
    all_cols = ["v%s" % i for i in range(max_cols)]
    # One parameterized statement reused for every row: rownum, fname,
    # then one placeholder per value column.
    sql = "insert into %s (rownum, fname, %s) values (%s)" % (
        tname,
        ",".join(all_cols),
        ",".join(["?"] * (max_cols + 2)),
    )
    # Clamp the progress/commit interval so short files don't divide by zero.
    chunk = max(1, total_lines // 25)
    cleaned = clean_name(fname)
    start = time.time()
    with open(fname, "rb") as fh:
        for counter, line in enumerate(fh):
            cols = line.split('|')
            params = [counter, cleaned]
            for i in range(max_cols):
                # Missing or empty fields become SQL NULL, as before.
                if i < len(cols) and cols[i] != "":
                    params.append(cols[i])
                else:
                    params.append(None)
            cursor.execute(sql, params)
            if counter % chunk == 0:
                print("%s/%s (%.2f sec)" % (
                    counter // chunk,
                    25,
                    time.time() - start,
                ))
                conn.commit()
                start = time.time()
def clean_name(fname):
    """Basename of *fname* minus its extension, with '.' replaced by '_'."""
    base = os.path.basename(fname)
    stem, _ext = os.path.splitext(base)
    return stem.replace(".", "_")
def main():
    """Entry point: load every matching file in the directory into the DB.

    With -t/--tname, all files go into that single table; otherwise each
    file gets its own table named after the file (via clean_name).
    """
    # FIX: parse arguments before connecting, so a usage error (or -h)
    # doesn't require a live database connection; the original connected
    # first.
    args = get_args()
    conn = pyodbc.connect(CONN_STR)
    cursor = conn.cursor()
    files = file_list(args.directory, args.file_ptn or FILE_PTN)
    # Derive the widest row across all files unless the caller pinned it.
    max_cols = args.maxcols or get_max_cols(files)
    if args.tname:
        # One shared destination table for every input file.
        create_table(cursor, args.tname, max_cols)
        conn.commit()
    for f in files:
        tname = args.tname or clean_name(f)
        if not args.tname:
            # Per-file table named after the file.
            create_table(cursor, tname, max_cols)
            conn.commit()
        insert_rows(conn, cursor, tname, max_cols, f)
        conn.commit()
# Run the loader only when executed as a script (not on import).
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment