Last active
February 28, 2020 02:34
-
-
Save sierra073/49403d5c3c893631492bc16c828c8e9f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
def generate_create_table_statement_from_df(df, schema, tablename, output_path, **kwargs): | |
'''Function to create a PostgreSQL "create table" statement automatically given a pandas dataframe df. | |
Other mandatory inputs: schema, tablename, output_path = absolute path to where you want the "create table" sql file to be placed | |
Optional: primary_key = [list of strings of column name(s) (in order) that define the primary key]''' | |
headers, type_list = generate_headers(df) | |
statement = create_table_sql(schema, tablename, headers, type_list, kwargs.get('primary_key', None)) | |
os.chdir(output_path) | |
outputile = open('create_' + tablename + '.sql', 'w') | |
outputile.write(statement) | |
def generate_headers(df): | |
'''Function to create list of headers with their respective data type. Input a pandas dataframe''' | |
headers = df.columns.tolist() | |
type_list = [] | |
for col in headers: # for loop where we are defining a string representation of the postgres syntax for that column's dataframe datatype and appending it to type_list | |
df_type = str(df[col].dtype) | |
if 'int' in df_type: | |
type_list.append('int') | |
elif 'float' in df_type: | |
type_list.append('decimal') | |
elif 'bool' in df_type: | |
type_list.append('boolean') | |
else: | |
type_list.append('varchar') | |
return headers, type_list | |
def create_table_sql(schema, tablename, headers, type_list, primary_key): | |
'''Function to generate string of a PostgreSQL "create table" statement. | |
Takes in schema, table name, list of columns, and list of datatypes of each column. | |
primary_key = [list of columns that define the primary key], or simply None if no primary key''' | |
statement = '''drop table if exists {schema}.{tablename} cascade; | |
create table {schema}.{tablename} ('''.format(schema=schema, tablename=tablename) | |
for i in range(len(headers)): | |
if (type_list[i] == 'varchar')&(i == len(headers)-1): | |
statement = (statement + '\n{} varchar').format(headers[i].lower()) | |
elif (type_list[i] == 'varchar'): | |
statement = (statement + '\n{} varchar ,').format(headers[i].lower()) | |
elif i == len(headers)-1: | |
statement = (statement + '\n' + '{} {}').format(headers[i].lower(), type_list[i]) | |
else: | |
statement = (statement + '\n' + '{} {}' + ' ,').format(headers[i].lower(), type_list[i]) | |
if primary_key is not None: | |
statement = statement + ', primary key (' + ','.join(primary_key) + '));' | |
else: | |
statement = statement + ');' | |
return statement | |
def insert_large_table(df, schema, tablename, cursor): | |
'''Function to insert into PostgresSQL tables with lots of columns. Warning: doesn't guarantee speed if there are > ~25K rows and 200+ columns. | |
Inputs: pandas dataframe df, table schema, table name, and a psycopg2 cursor object''' | |
columns_list = df.columns.values.tolist() | |
columns = ', '.join(map(str, columns_list)) | |
query = '(' + ','.join(["%s"] * len(columns.split(','))) + ')' | |
final_query = ','.join(cursor.mogrify(query, row.values.tolist()) for index, row in df.iterrows()) | |
cursor.execute("INSERT INTO " + schema + "." + tablename + " VALUES " + final_query) | |
print(tablename + " inserted") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment