Skip to content

Instantly share code, notes, and snippets.

@petrblahos
Created February 19, 2014 09:07
Show Gist options
  • Save petrblahos/9088495 to your computer and use it in GitHub Desktop.
Save petrblahos/9088495 to your computer and use it in GitHub Desktop.
import csv
import sqlite3
import sys
db_con = sqlite3.connect(":memory:")
db_con.text_factory = str
def safe_column_name(name):
"""
Modifies the column name so that it only contains alphanumeric
characters. The rest is replaced by underscores.
"""
out = ""
for i in name:
if i.isalnum():
out+= i
else:
out+= "_"
if out[0].isdigit():
out = "_" + out
return out
def insert_row(table, columns, data):
"""
Insert a single row of data into the table.
#FIXME: make the cursor a parameter so that it does not have to be
recreated every time
"""
cur = db_con.cursor()
if len(data) == len(columns):
cur.execute(
"""insert into %s (%s) values (%s)""" % (
table,
", ".join(columns),
", ".join( [ "?" ] * len(columns)),
),
[ i.strip() for i in data ])
else:
raise Exception("Wrong number of columns in the row.")
cur.close()
def get_unused_column_name(name, used_names):
"""
If the name has not been used, returns it. Otherwise returns the
suffix _<number> - the number is the lowest number that makes the
new name unique.
Params:
name The proposed name.
used_names A set of used names.
Returns:
The unique name.
"""
if not name in used_names:
return name
suffix = 1
out = "%s_%s" % (name, suffix, )
while out in columns:
suffix += 1
out = "%s_%s" % (name, suffix, )
return out
def mk_table_from_header(row):
"""
Create a table named t1 with columns named by the values in row.
If any values in the row are duplicate, adds a _<number> suffix
to them.
Returns:
A tuple (tablename, column names), where table name is always "t1".
"""
columns = set()
column_names = []
for i in row:
name = get_unused_column_name(safe_column_name(i), columns)
columns.add(name)
column_names.append(name)
cur = db_con.cursor()
cur.execute("""create table t1(%s)""" % ", ".join(column_names))
cur.close()
return ("t1", column_names)
def mk_table_dummy_columns(row):
"""
Create a table named t1 with len(row) columns named col_1,
col_2, ...
Returns:
A tuple (tablename, column names), where table name is always "t1".
"""
column_names = [ "col_%d" % (i+1) for i in range(len(row)) ]
cur = db_con.cursor()
cur.execute("""create table t1(%s)""" % ", ".join(column_names))
cur.close()
return ("t1", column_names)
table = None
with open(sys.argv[1], 'rb') as csvfile:
sniff_data = csvfile.read(8192)
csvfile.seek(0)
sniffer = csv.Sniffer()
has_header = sniffer.has_header(sniff_data)
dialect = sniffer.sniff(sniff_data)
reader = csv.reader(csvfile, dialect)
for row in reader:
if table is None:
if has_header:
table = mk_table_from_header(row)
continue
else:
table = mk_table_dummy_columns(row)
insert_row(table[0], table[1], row)
cur = db_con.cursor()
cur.execute("""select * from %s limit 200""" % table[0])
for i in cur:
print i
cur.execute("""select count(*) from %s""" % table[0])
for i in cur:
print i
cur.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment