Pushing a dataframe to MS SQL Server
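Note: push_dataframe is written as a method of a small SQL Server helper class, so it is not runnable on its own. From the code, the class must provide self.cnxn (an open pyodbc connection), a self.drop(table) helper, a self.query string used as a running SQL log, and self.database / self.server names; a sketch of such a wrapper follows the function.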
from datetime import datetime  # required for the timing/progress code below


def push_dataframe(self, data, table="raw_data", batchsize=500,
                   overwrite=False):
    """Upload a Pandas DataFrame (data) to SQL Server.

    Keyword arguments:
    data -- the dataframe to be uploaded
    table -- the name of the new table in SQL (default "raw_data")
    batchsize -- the number of rows to upload to the new table within each
        execution, recommended no more than 1000 (default 500)
    overwrite -- safety measure to ensure the user does not accidentally
        delete any tables (default False)
    """
    # if overwrite is True we automatically remove any table with the same name
    if overwrite:
        # check for a pre-existing table and delete it if present
        self.drop(table)
    # create execution cursor
    cursor = self.cnxn.cursor()
    # activate pyodbc's fast_executemany for bulk inserts
    cursor.fast_executemany = True
    # build the CREATE TABLE statement
    query = "CREATE TABLE [" + table + "] (\n"
    # iterate through each column to be included in the statement
    for i, column in enumerate(data.columns):
        # add the column (everything is varchar for now)
        query += "\t[{}] varchar(8000)".format(column)
        # append a separator, or close the statement after the last column
        if i != len(data.columns) - 1:
            query += ",\n"
        else:
            query += "\n);"
    cursor.execute(query)  # execute the create table statement
    self.cnxn.commit()  # commit changes
    # append the query to our SQL code logger
    self.query += "\n\n-- create table\n" + query
    start_time = datetime.now()  # start timer
    # tell the user we have started the insert operation
    print("Insert started at {}.".format(start_time.strftime("%H:%M:%S")))
    # initialise the time of the previous progress update
    last_check = start_time
    # build the parameterised INSERT statement used to insert the data in batches
    query = ("INSERT INTO [{}] ([{}])\n".format(table,
                                                "], [".join(data.columns)) +
             "VALUES\n(?{})".format(", ?" * (len(data.columns) - 1)))
    # cursor docs: https://github.com/mkleehammer/pyodbc/wiki/Cursor
    # insert the data into the target table in batches of 'batchsize' rows
    for i in range(0, len(data), batchsize):
        # pandas slicing is safe past the end of the frame, so the final
        # (possibly short) batch needs no special case
        batch = data[i:i + batchsize].values.tolist()
        # execute the batch insert
        cursor.executemany(query, batch)
        # commit the insert to SQL Server
        self.cnxn.commit()
        # print progress roughly every ten seconds so the user can see it
        if (datetime.now() - last_check).seconds > 10:
            last_check = datetime.now()  # update the 'last print' time
            delta = datetime.now() - start_time  # time taken so far
            # fraction complete, capped at 1.0 for the final batch
            percentage = min((i + batchsize) / len(data), 1.0)
            eta = start_time + delta / percentage  # estimated finish time
            # print the above in a readable format for the user
            print("{:.1f}% complete\n".format(percentage * 100) +
                  "Current Time: {}\n".format(datetime.now()
                                              .strftime("%H:%M:%S")) +
                  "ETA: {}\n".format(eta.strftime("%H:%M:%S")))
    # let's see how long it took
    time_taken = str((datetime.now() - start_time).seconds)
    print("Insert completed in {} seconds.".format(time_taken))
    # append the upload details to the SQL code logger
    self.query += ("\n\n-- User uploaded " + table + " table\n"
                   "-- upload time: " + time_taken + " secs")
    # updating the user
    print("DataFrame uploaded as " + table +
          " to " + self.database + " on " + self.server + " server.")