Created
May 30, 2012 21:01
Revisions
-
AWinterman created this gist
May 30, 2012 .There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,101 @@ import csv import sqlite3 as lite import subprocess from itertools import chain from time import time import sys #import logging #logging.basicConfig(level=logging.DEBUG) # #I want to create a database with a table for the meta data, #and then one table for every time increment. def convert(value): '''takes a string, tries to convert it to an int, then a float, then defaults to string if both give an error. Returns a tuple of the SQLite type and function for conversion''' tests = [("INTEGER", int), ("REAL", float)] for lite_type, test in tests: try: test(value) return lite_type, test except ValueError: continue return "STRING", str # All other heuristics failed it is a string def make_table(header, types, cursor, tablename): '''Makes a new table with `cursors`, with column names as specified in `header`, data types as specified by `types`, and name specified by `tablename`''' table_fields = [" ".join(v) for v in zip(header, types)] #table_fields sets up the fields and their types for this table. #Of course, since we name the tables by time, we don't need to keep that variable. creation_string = "CREATE TABLE %(tablename)s (%(args)s)" % { 'args': ", ".join(table_fields), 'tablename': tablename} cursor.execute(creation_string) def put_in(dictionary, c, conn): '''Inserts values of dictionary into tables with names equal to the keys of the dictionary. The database has `conn` as the connection, and `c` as a cursor, ''' for name in dictionary.keys(): query_command = 'INSERT INTO '+ name +' VALUES (%s)' % ', '.join(['?']*len(dictionary[name][0])) c.executemany(query_command, iter(dictionary[name])) conn.commit() def make_DB(data_file, data_base_name, block_size): '''This function runs through a data_file, assumed to be csv, and makes a sqlite database out of it Currently it is tailored to the ''' t0 = time() #time the operation. with open(data_file) as f, lite.connect(data_base_name) as conn: #connecting: lines = csv.reader(f) c = conn.cursor() #Initializing header = lines.next() #Column names c.execute("SELECT name FROM sqlite_master WHERE type='table';") last_tables = set([ '['+ x[0] +']' for x in c.fetchall()]) #Keeping track of tables currently in db. #Finding types: first = lines.next() #The first element in the whole csv. sample = first[:] #this is going to be used for type matching. sample.pop(2) #don't match third element, it's going to be table name. types, conv = zip(*[convert(entry) for entry in sample]) #adding a `*` actually unzips. see above for `convert`. toPut = {} #Initialize variables line_counter = 1 renew_lines = chain([first], lines) #Put the first line back on first. for line in renew_lines: date_time = line.pop(2) #the second element is the date table_name = "".join(["[", date_time.replace(" ", "_"), "]"]) #convert to valid table name if not table_name in last_tables: #This keeps track of tables. #If it's not in the db, add it, add it to list keeping track of this. make_table(header, types, c, table_name) last_tables.add(table_name) if not table_name in toPut.keys(): toPut[table_name] = [] #Initialize the array #No matter what, the line goes into the table withe corresponding name toPut[table_name].append( tuple([g(x) for (g, x) in zip(conv, line)] )) #The data just gets converted to the right type before input. #setting correct types for everybody if not line_counter % block_size: #Check if 0 modulo block_size, if so, report current location, call put_in, sys.stdout.write("writing to tables " + toPut.keys() +". \r") sys.stdout.flush() put_in(toPut, c, conn) #put_in writes dictionary to table. toPut = {} line_counter += 1 sys.stdout.write("now on line number " + str(line_counter) + ". \r") sys.stdout.flush() put_in(toPut, c, conn) #Eating the leftovers, so to speak. The number of lines probably isn't a multiple of the block size, #so this puts any leftovers into the database. print "\n", time() - t0 #make_DB("../metaStatus-3-7.csv", "VAST2.db", 500000) #This is the call I make on my system.