sierra073/pandas_postgres_helpers.py

## pandas_postgres_helpers.py
import os


def generate_create_table_statement_from_df(df, schema, tablename, output_path, **kwargs):
    '''Function to create a PostgreSQL "create table" statement automatically given a pandas dataframe df.
    Other mandatory inputs: schema, tablename, output_path = absolute path to where you want the "create table" sql file to be placed
    Optional: primary_key = [list of strings of column name(s) (in order) that define the primary key]'''
    headers, type_list = generate_headers(df)
    statement = create_table_sql(schema, tablename, headers, type_list, kwargs.get('primary_key', None))
    os.chdir(output_path)
    outputile = open('create_' + tablename + '.sql', 'w')
    outputile.write(statement)


def generate_headers(df):
    '''Function to create list of headers with their respective data type. Input a pandas dataframe'''
    headers = df.columns.tolist()
    type_list = []
    for col in headers:  # for loop where we are defining a string representation of the postgres syntax for that column's dataframe datatype and appending it to type_list
        df_type = str(df[col].dtype)
        if 'int' in df_type:
            type_list.append('int')
        elif 'float' in df_type:
            type_list.append('decimal')
        elif 'bool' in df_type:
            type_list.append('boolean')
        else:
            type_list.append('varchar')
    return headers, type_list


def create_table_sql(schema, tablename, headers, type_list, primary_key):
    '''Function to generate string of a PostgreSQL "create table" statement.
    Takes in schema, table name, list of columns, and list of datatypes of each column.
    primary_key = [list of columns that define the primary key], or simply None if no primary key'''

    statement = '''drop table if exists {schema}.{tablename} cascade;
     create table {schema}.{tablename} ('''.format(schema=schema, tablename=tablename)

    for i in range(len(headers)):
        if (type_list[i] == 'varchar')&(i == len(headers)-1):
            statement = (statement + '\n{} varchar').format(headers[i].lower())
        elif (type_list[i] == 'varchar'):
            statement = (statement + '\n{} varchar ,').format(headers[i].lower())
        elif i == len(headers)-1:
            statement = (statement + '\n' + '{} {}').format(headers[i].lower(), type_list[i])
        else:
            statement = (statement + '\n' + '{} {}' + ' ,').format(headers[i].lower(), type_list[i])

    if primary_key is not None:
        statement = statement + ', primary key (' + ','.join(primary_key) + '));'
    else:
        statement = statement + ');'
    return statement


def insert_large_table(df, schema, tablename, cursor):
    '''Function to insert into PostgresSQL tables with lots of columns. Warning: doesn't guarantee speed if there are > ~25K rows and 200+ columns.
    Inputs: pandas dataframe df, table schema, table name, and a psycopg2 cursor object'''

    columns_list = df.columns.values.tolist()
    columns = ', '.join(map(str, columns_list))
    query = '(' + ','.join(["%s"] * len(columns.split(','))) + ')'
    final_query = ','.join(cursor.mogrify(query, row.values.tolist()) for index, row in df.iterrows())

    cursor.execute("INSERT INTO " + schema + "." + tablename + " VALUES " + final_query)

    print(tablename + " inserted")
	import os


	def generate_create_table_statement_from_df(df, schema, tablename, output_path, **kwargs):
	'''Function to create a PostgreSQL "create table" statement automatically given a pandas dataframe df.
	Other mandatory inputs: schema, tablename, output_path = absolute path to where you want the "create table" sql file to be placed
	Optional: primary_key = [list of strings of column name(s) (in order) that define the primary key]'''
	headers, type_list = generate_headers(df)
	statement = create_table_sql(schema, tablename, headers, type_list, kwargs.get('primary_key', None))
	os.chdir(output_path)
	outputile = open('create_' + tablename + '.sql', 'w')
	outputile.write(statement)


	def generate_headers(df):
	'''Function to create list of headers with their respective data type. Input a pandas dataframe'''
	headers = df.columns.tolist()
	type_list = []
	for col in headers: # for loop where we are defining a string representation of the postgres syntax for that column's dataframe datatype and appending it to type_list
	df_type = str(df[col].dtype)
	if 'int' in df_type:
	type_list.append('int')
	elif 'float' in df_type:
	type_list.append('decimal')
	elif 'bool' in df_type:
	type_list.append('boolean')
	else:
	type_list.append('varchar')
	return headers, type_list


	def create_table_sql(schema, tablename, headers, type_list, primary_key):
	'''Function to generate string of a PostgreSQL "create table" statement.
	Takes in schema, table name, list of columns, and list of datatypes of each column.
	primary_key = [list of columns that define the primary key], or simply None if no primary key'''

	statement = '''drop table if exists {schema}.{tablename} cascade;
	create table {schema}.{tablename} ('''.format(schema=schema, tablename=tablename)

	for i in range(len(headers)):
	if (type_list[i] == 'varchar')&(i == len(headers)-1):
	statement = (statement + '\n{} varchar').format(headers[i].lower())
	elif (type_list[i] == 'varchar'):
	statement = (statement + '\n{} varchar ,').format(headers[i].lower())
	elif i == len(headers)-1:
	statement = (statement + '\n' + '{} {}').format(headers[i].lower(), type_list[i])
	else:
	statement = (statement + '\n' + '{} {}' + ' ,').format(headers[i].lower(), type_list[i])

	if primary_key is not None:
	statement = statement + ', primary key (' + ','.join(primary_key) + '));'
	else:
	statement = statement + ');'
	return statement


	def insert_large_table(df, schema, tablename, cursor):
	'''Function to insert into PostgresSQL tables with lots of columns. Warning: doesn't guarantee speed if there are > ~25K rows and 200+ columns.
	Inputs: pandas dataframe df, table schema, table name, and a psycopg2 cursor object'''

	columns_list = df.columns.values.tolist()
	columns = ', '.join(map(str, columns_list))
	query = '(' + ','.join(["%s"] * len(columns.split(','))) + ')'
	final_query = ','.join(cursor.mogrify(query, row.values.tolist()) for index, row in df.iterrows())

	cursor.execute("INSERT INTO " + schema + "." + tablename + " VALUES " + final_query)

	print(tablename + " inserted")