Skip to content

Instantly share code, notes, and snippets.

@sierra073
Last active February 28, 2020 02:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sierra073/49403d5c3c893631492bc16c828c8e9f to your computer and use it in GitHub Desktop.
Save sierra073/49403d5c3c893631492bc16c828c8e9f to your computer and use it in GitHub Desktop.
import os
def generate_create_table_statement_from_df(df, schema, tablename, output_path, **kwargs):
'''Function to create a PostgreSQL "create table" statement automatically given a pandas dataframe df.
Other mandatory inputs: schema, tablename, output_path = absolute path to where you want the "create table" sql file to be placed
Optional: primary_key = [list of strings of column name(s) (in order) that define the primary key]'''
headers, type_list = generate_headers(df)
statement = create_table_sql(schema, tablename, headers, type_list, kwargs.get('primary_key', None))
os.chdir(output_path)
outputile = open('create_' + tablename + '.sql', 'w')
outputile.write(statement)
def generate_headers(df):
'''Function to create list of headers with their respective data type. Input a pandas dataframe'''
headers = df.columns.tolist()
type_list = []
for col in headers: # for loop where we are defining a string representation of the postgres syntax for that column's dataframe datatype and appending it to type_list
df_type = str(df[col].dtype)
if 'int' in df_type:
type_list.append('int')
elif 'float' in df_type:
type_list.append('decimal')
elif 'bool' in df_type:
type_list.append('boolean')
else:
type_list.append('varchar')
return headers, type_list
def create_table_sql(schema, tablename, headers, type_list, primary_key):
'''Function to generate string of a PostgreSQL "create table" statement.
Takes in schema, table name, list of columns, and list of datatypes of each column.
primary_key = [list of columns that define the primary key], or simply None if no primary key'''
statement = '''drop table if exists {schema}.{tablename} cascade;
create table {schema}.{tablename} ('''.format(schema=schema, tablename=tablename)
for i in range(len(headers)):
if (type_list[i] == 'varchar')&(i == len(headers)-1):
statement = (statement + '\n{} varchar').format(headers[i].lower())
elif (type_list[i] == 'varchar'):
statement = (statement + '\n{} varchar ,').format(headers[i].lower())
elif i == len(headers)-1:
statement = (statement + '\n' + '{} {}').format(headers[i].lower(), type_list[i])
else:
statement = (statement + '\n' + '{} {}' + ' ,').format(headers[i].lower(), type_list[i])
if primary_key is not None:
statement = statement + ', primary key (' + ','.join(primary_key) + '));'
else:
statement = statement + ');'
return statement
def insert_large_table(df, schema, tablename, cursor):
'''Function to insert into PostgresSQL tables with lots of columns. Warning: doesn't guarantee speed if there are > ~25K rows and 200+ columns.
Inputs: pandas dataframe df, table schema, table name, and a psycopg2 cursor object'''
columns_list = df.columns.values.tolist()
columns = ', '.join(map(str, columns_list))
query = '(' + ','.join(["%s"] * len(columns.split(','))) + ')'
final_query = ','.join(cursor.mogrify(query, row.values.tolist()) for index, row in df.iterrows())
cursor.execute("INSERT INTO " + schema + "." + tablename + " VALUES " + final_query)
print(tablename + " inserted")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment