Skip to content

Instantly share code, notes, and snippets.

@hoffmanc
Created June 20, 2011 13:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hoffmanc/1035593 to your computer and use it in GitHub Desktop.
Save hoffmanc/1035593 to your computer and use it in GitHub Desktop.
WGA-7
# Chris Hoffman
# 2011-6-17
# Description: takes a directory of '|'-delimited text files and bulk-loads each one into a SQL Server table.
import pyodbc
import sys
import csv
import argparse
import glob
import os
import time
# Connection settings: fill in SERVER/DB before running (see --tname help).
SERVER = ""
DB = ""
# Default glob pattern for input files (overridable with -f/--file_ptn).
FILE_PTN = "*.asc"
# ODBC connection string for SQL Server using Windows integrated auth.
# FIX: the original used "Integrated Security=true;Trusted Connection=true",
# which are not valid SQL Server ODBC keywords ("Integrated Security" is
# ADO.NET syntax and "Trusted Connection" is missing its underscore);
# the correct ODBC keyword is "Trusted_Connection=yes".
CONN_STR = "DRIVER={SQL Server};SERVER=%s;DATABASE=%s;Trusted_Connection=yes" % (SERVER, DB)
def get_args():
    """Parse command-line arguments for the CSV-to-DB loader.

    Returns:
        argparse.Namespace with attributes: directory, tname, d (delimiter),
        maxcols, file_ptn.
    """
    parser = argparse.ArgumentParser(description='CSV => DB Table.')
    parser.add_argument('directory', metavar='dir', type=str,
                        help="directory to search for files of the form %s" % FILE_PTN)
    parser.add_argument('-t', '--tname', type=str,
                        help="output table name (edit source for server/db)")
    # FIX: the help text promised a default of '|' but no default was set,
    # leaving args.d as None; make the documented default real.
    parser.add_argument('-d', metavar='delimiter', type=str, default='|',
                        help="field delimiter (default: '|')")
    parser.add_argument('-m', '--maxcols', metavar='max_cols', type=int, default=None,
                        help="maximum columns in files (default: derived)")
    # FIX: "patern" -> "pattern" typo in help text.
    parser.add_argument('-f', '--file_ptn', type=str, default=None,
                        help="file name pattern (default: *.asc)")
    args = parser.parse_args()
    return args
def file_list(srchdir, fileptn):
    """Return every path in *srchdir* matching glob pattern *fileptn*.

    Raises:
        Exception: when no file matches the pattern.
    """
    matches = glob.glob(os.path.join(srchdir, fileptn))
    if matches:
        return matches
    raise Exception("no files found matching pattern '%s' in %s" % (fileptn, srchdir))
def get_max_cols(files):
    """Scan every file in *files* and return the largest column count seen.

    Column count per line is determined by num_cols(). Prints a message each
    time a new maximum is found.

    FIXES vs original: num_cols() was evaluated twice per line (hoisted to a
    local); the file handle leaked if an exception escaped mid-read (now a
    context manager); the readline() loop is replaced with direct iteration.
    """
    maxcols = 0
    for path in files:
        with open(path, "rb") as fh:
            for line in fh:
                n = num_cols(line)
                if n > maxcols:
                    maxcols = n
                    # Single-argument print behaves identically under the
                    # original's Python 2 print statement.
                    print("Max Column Count Found: %s" % maxcols)
    return maxcols
def num_cols(line):
    """Return the number of '|'-delimited fields in *line*, plus one.

    NOTE(review): the +1 yields one more than the actual field count —
    presumably deliberate headroom for an extra column; confirm intent.
    """
    fields = line.split('|')
    return 1 + len(fields)
def create_table(cursor, tname, max_cols):
    """Create table *tname* if it does not already exist.

    Schema: rownum int, fname nvarchar(max), and value columns
    v0..v{max_cols-1}, all nvarchar(max).
    """
    value_cols = ",".join("v%s nvarchar(max)" % i for i in range(max_cols))
    sql = """
if object_id('%s') is null
begin
create table %s (
rownum int not null,
fname nvarchar(max) not null,
%s)
end""" % (tname, tname, value_cols)
    cursor.execute(sql)
def get_total_lines(fname):
    """Return the number of lines in file *fname*.

    FIX vs original: the file handle was opened and never closed (leak);
    a context manager now guarantees closure, and the manual readline()
    counting loop is replaced with a sum over the file iterator.
    """
    with open(fname, "rb") as fh:
        return sum(1 for _ in fh)
def insert_rows(conn, cursor, tname, max_cols, fname):
    """Load every line of *fname* into table *tname*, one row per line.

    Each line is split on '|'; missing or empty fields are stored as NULL.
    Rows are numbered from 0 in file order and tagged with the cleaned file
    name. Progress is printed and a commit issued roughly every 4% of the
    file (25 chunks).

    Args:
        conn: open pyodbc connection (used for periodic commits).
        cursor: cursor on *conn*.
        tname: destination table name (must already exist).
        max_cols: number of v* value columns in the table.
        fname: path of the '|'-delimited input file.

    FIXES vs original: field values were string-interpolated into the SQL
    (broke on embedded quotes; injection hazard) — now a parameterized
    statement; total_lines // 25 was 0 for files under 25 lines, crashing
    the progress modulo — now clamped to 1; the file handle leaked — now a
    context manager; the dead `cols[i] != None` check is dropped
    (str.split never yields None).
    """
    total_lines = get_total_lines(fname)
    all_cols = ["v%s" % i for i in range(max_cols)]
    # One parameterized statement reused for every row: rownum, fname,
    # then one placeholder per value column.
    sql = "insert into %s (rownum, fname, %s) values (%s)" % (
        tname,
        ",".join(all_cols),
        ",".join(["?"] * (max_cols + 2)),
    )
    # Clamp the progress/commit interval so short files don't divide by zero.
    chunk = max(1, total_lines // 25)
    cleaned = clean_name(fname)
    start = time.time()
    with open(fname, "rb") as fh:
        for counter, line in enumerate(fh):
            cols = line.split('|')
            params = [counter, cleaned]
            for i in range(max_cols):
                # Missing or empty fields become SQL NULL, as before.
                if i < len(cols) and cols[i] != "":
                    params.append(cols[i])
                else:
                    params.append(None)
            cursor.execute(sql, params)
            if counter % chunk == 0:
                print("%s/%s (%.2f sec)" % (
                    counter // chunk,
                    25,
                    time.time() - start,
                ))
                conn.commit()
                start = time.time()
def clean_name(fname):
    """Basename of *fname* minus its extension, with '.' replaced by '_'."""
    base = os.path.basename(fname)
    stem, _ext = os.path.splitext(base)
    return stem.replace(".", "_")
def main():
    """Entry point: load every matching file in the directory into the DB.

    With -t/--tname, all files go into that single table; otherwise each
    file gets its own table named after the file (via clean_name).
    """
    # FIX: parse arguments before connecting, so a usage error (or -h)
    # doesn't require a live database connection; the original connected
    # first.
    args = get_args()
    conn = pyodbc.connect(CONN_STR)
    cursor = conn.cursor()
    files = file_list(args.directory, args.file_ptn or FILE_PTN)
    # Derive the widest row across all files unless the caller pinned it.
    max_cols = args.maxcols or get_max_cols(files)
    if args.tname:
        # One shared destination table for every input file.
        create_table(cursor, args.tname, max_cols)
        conn.commit()
    for f in files:
        tname = args.tname or clean_name(f)
        if not args.tname:
            # Per-file table named after the file.
            create_table(cursor, tname, max_cols)
            conn.commit()
        insert_rows(conn, cursor, tname, max_cols, f)
        conn.commit()
# Run the loader only when executed as a script (not on import).
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment