Last active
February 22, 2020 19:14
-
-
Save jamescalam/74e18f935e223c4aa486dc517e689d84 to your computer and use it in GitHub Desktop.
Pushing a dataframe to MS SQL Server
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def push_dataframe(self, data, table="raw_data", batchsize=500,
                   overwrite=False):
    """Upload a Pandas DataFrame (data) to SQL Server.

    Creates a new table (every column typed varchar(8000)) and inserts the
    dataframe rows in batches using pyodbc's fast_executemany, printing
    progress roughly every 10 seconds. Executed SQL is appended to
    ``self.query`` as an audit log.

    Keyword arguments:
    data -- the dataframe to be uploaded
    table -- the name of the new table in SQL (default "raw_data")
    batchsize -- the number of rows to upload to the new table within each
                 execution, recommend no more than 1000 (default 500)
    overwrite -- safety measure used to ensure user does not accidentally
                 delete any tables (default False)
    """
    # BUG FIX: the original tested `if not overwrite`, dropping the table
    # exactly when the caller had NOT opted in to overwriting. Only drop
    # a pre-existing table when overwrite is explicitly True.
    if overwrite:
        # check for pre-existing table and delete if present
        self.drop(table)
    # create execution cursor
    cursor = self.cnxn.cursor()
    # activate fast execute (rows are sent in bulk round-trips)
    cursor.fast_executemany = True
    # hoist the column list; list(data) was re-evaluated on every loop pass
    columns = list(data)
    # NOTE(review): table/column names are concatenated into the SQL text.
    # Bracket quoting does not guard against ']' in identifiers — only use
    # with trusted dataframe/table names.
    query = ("CREATE TABLE [" + table + "] (\n" +
             ",\n".join("\t[{}] varchar(8000)".format(col)
                        for col in columns) +
             "\n);")
    cursor.execute(query)  # execute the create table statement
    self.cnxn.commit()  # commit changes
    # append query to our SQL code logger
    self.query += ("\n\n-- create table\n" + query)
    start_time = datetime.now()  # start timer
    # tell user we have started the insert operation
    print("Insert started at {}.".format(start_time.strftime("%H:%M:%S")))
    # initialise the previous user update time
    last_check = start_time
    # parameterised insert statement: one '?' placeholder per column
    query = ("INSERT INTO [{}] ({})\n".format(
                 table, '[' + '], ['.join(columns) + ']') +
             "VALUES\n(?{})".format(", ?" * (len(columns) - 1)))
    # cursor docs: https://github.com/mkleehammer/pyodbc/wiki/Cursor
    # insert data into target table in batches of 'batchsize';
    # a pandas slice past the end is truncated automatically, so the final
    # partial batch needs no special case
    for i in range(0, len(data), batchsize):
        batch = data[i: i + batchsize].values.tolist()
        # execute batch insert
        cursor.executemany(query, batch)
        # commit insert to SQL Server
        self.cnxn.commit()
        # progress report for the user, at most every ~10 seconds
        if (datetime.now() - last_check).seconds > 10:
            last_check = datetime.now()  # updating 'last print' time
            delta = datetime.now() - start_time  # time taken so far
            percentage = (i + batchsize) / len(data)  # fraction complete
            eta = start_time + (delta / percentage)  # projected finish time
            # print above in readable format for user
            print("{:.1f}% complete\n".format(percentage * 100) +
                  "Current Time: {}\n".format(datetime.now()
                                              .strftime("%H:%M:%S")) +
                  "ETA: {}\n".format(eta.strftime("%H:%M:%S")))
    # let's see how long it took
    time_taken = str((datetime.now() - start_time).seconds)
    print("Insert completed in {} seconds.".format(time_taken))
    # append upload details to SQL code logger
    self.query += ("\n\n-- User uploaded " + table + " table\n"
                   "-- upload time: " + time_taken + " secs")
    # updating the user
    print("DataFrame uploaded as " + table +
          " to " + self.database + " on " + self.server + " server.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment