Created
February 19, 2014 09:07
-
-
Save petrblahos/9088495 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sqlite3 | |
import sys | |
db_con = sqlite3.connect(":memory:") | |
db_con.text_factory = str | |
def safe_column_name(name): | |
""" | |
Modifies the column name so that it only contains alphanumeric | |
characters. The rest is replaced by underscores. | |
""" | |
out = "" | |
for i in name: | |
if i.isalnum(): | |
out+= i | |
else: | |
out+= "_" | |
if out[0].isdigit(): | |
out = "_" + out | |
return out | |
def insert_row(table, columns, data): | |
""" | |
Insert a single row of data into the table. | |
#FIXME: make the cursor a parameter so that it does not have to be | |
recreated every time | |
""" | |
cur = db_con.cursor() | |
if len(data) == len(columns): | |
cur.execute( | |
"""insert into %s (%s) values (%s)""" % ( | |
table, | |
", ".join(columns), | |
", ".join( [ "?" ] * len(columns)), | |
), | |
[ i.strip() for i in data ]) | |
else: | |
raise Exception("Wrong number of columns in the row.") | |
cur.close() | |
def get_unused_column_name(name, used_names): | |
""" | |
If the name has not been used, returns it. Otherwise returns the | |
suffix _<number> - the number is the lowest number that makes the | |
new name unique. | |
Params: | |
name The proposed name. | |
used_names A set of used names. | |
Returns: | |
The unique name. | |
""" | |
if not name in used_names: | |
return name | |
suffix = 1 | |
out = "%s_%s" % (name, suffix, ) | |
while out in columns: | |
suffix += 1 | |
out = "%s_%s" % (name, suffix, ) | |
return out | |
def mk_table_from_header(row): | |
""" | |
Create a table named t1 with columns named by the values in row. | |
If any values in the row are duplicate, adds a _<number> suffix | |
to them. | |
Returns: | |
A tuple (tablename, column names), where table name is always "t1". | |
""" | |
columns = set() | |
column_names = [] | |
for i in row: | |
name = get_unused_column_name(safe_column_name(i), columns) | |
columns.add(name) | |
column_names.append(name) | |
cur = db_con.cursor() | |
cur.execute("""create table t1(%s)""" % ", ".join(column_names)) | |
cur.close() | |
return ("t1", column_names) | |
def mk_table_dummy_columns(row): | |
""" | |
Create a table named t1 with len(row) columns named col_1, | |
col_2, ... | |
Returns: | |
A tuple (tablename, column names), where table name is always "t1". | |
""" | |
column_names = [ "col_%d" % (i+1) for i in range(len(row)) ] | |
cur = db_con.cursor() | |
cur.execute("""create table t1(%s)""" % ", ".join(column_names)) | |
cur.close() | |
return ("t1", column_names) | |
table = None | |
with open(sys.argv[1], 'rb') as csvfile: | |
sniff_data = csvfile.read(8192) | |
csvfile.seek(0) | |
sniffer = csv.Sniffer() | |
has_header = sniffer.has_header(sniff_data) | |
dialect = sniffer.sniff(sniff_data) | |
reader = csv.reader(csvfile, dialect) | |
for row in reader: | |
if table is None: | |
if has_header: | |
table = mk_table_from_header(row) | |
continue | |
else: | |
table = mk_table_dummy_columns(row) | |
insert_row(table[0], table[1], row) | |
cur = db_con.cursor() | |
cur.execute("""select * from %s limit 200""" % table[0]) | |
for i in cur: | |
print i | |
cur.execute("""select count(*) from %s""" % table[0]) | |
for i in cur: | |
print i | |
cur.close() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment