Shawn-Armstrong/create_table_script.py

## create_table_script.py
# data.CSV must be in the same directory as this script.
# Numpy, Pandas and mysql-connector-python are required.

import pandas as pd
import mysql.connector
from mysql.connector import Error
import re

def sanitize_column_name(column):
    sanitized_column = re.sub(r'\W+', '_', column).strip('_')
    while sanitized_column[0].isdigit():
        sanitized_column = "_" + sanitized_column
    return sanitized_column

# Replace with your MySQL container's details
mysql_host = 'localhost'
mysql_port = 3306
mysql_user = 'root'
mysql_password = 'password'
database_name = 'my_db'

# Read the CSV file
csv_file = 'data.csv'
df = pd.read_csv(csv_file)
df.columns = [sanitize_column_name(col) for col in df.columns]

# Example of column groups - modify this according to your needs
group_1_columns = df.columns[:len(df.columns)//2]
group_2_columns = df.columns[len(df.columns)//2:]

# Connect to the MySQL container
try:
    connection = mysql.connector.connect(
        host=mysql_host,
        port=mysql_port,
        user=mysql_user,
        password=mysql_password,
        database=database_name
    )

    if connection.is_connected():
        cursor = connection.cursor()

        # Main table
        main_table_name = 'MLS_data_main'
        create_main_table_query = f"CREATE TABLE {main_table_name} (id INT AUTO_INCREMENT PRIMARY KEY)"
        cursor.execute(create_main_table_query)
        connection.commit()

        # Function to create and insert data into a related table
        def create_related_table(group_name, group_columns, main_table_name):
            table_name = f"{main_table_name}_{group_name}"
            create_table_query = f"CREATE TABLE {table_name} (id INT AUTO_INCREMENT PRIMARY KEY, main_id INT, FOREIGN KEY (main_id) REFERENCES {main_table_name}(id)"
            column_definitions = []

            for column in group_columns:
                dtype = df[column].dtype
                sanitized_column = sanitize_column_name(column)
                mysql_dtype = ''
                if "int" in str(dtype):
                    mysql_dtype = 'INT'
                elif "float" in str(dtype):
                    mysql_dtype = 'FLOAT'
                elif "object" in str(dtype):
                    mysql_dtype = 'TEXT'
                elif "bool" in str(dtype):
                    mysql_dtype = 'TINYINT(1)'
                column_definitions.append(f"{sanitized_column} {mysql_dtype}")

            create_table_query += ', ' + ', '.join(column_definitions) + ")"
            cursor.execute(create_table_query)
            connection.commit()

            # Insert data into the related table
            for _, row in df[group_columns].iterrows():
                # Replace NaN values with None
                row_values = tuple(row.values)
                row_values = tuple(None if pd.isna(value) else value for value in row_values)
                insert_query = f"INSERT INTO {table_name} (main_id, {', '.join(group_columns)}) VALUES (%s, {', '.join(['%s'] * len(row))})"
                cursor.execute(insert_query, (row.name + 1, *row_values))
            connection.commit()

        # Insert data into the main table and related tables
        for _, row in df.iterrows():
            # Insert a row into the main table
            insert_main_query = f"INSERT INTO {main_table_name} (id) VALUES (%s)"
            cursor.execute(insert_main_query, (row.name + 1,))
            connection.commit()

        # Create related tables and insert data
        create_related_table('group_1', group_1_columns, main_table_name)
        create_related_table('group_2', group_2_columns, main_table_name)

        print(f"Tables created and data inserted successfully.")

except Error as e:
    print(f"Error: {e}")

finally:
    if connection.is_connected():
        cursor.close()
        connection.close()
	# data.CSV must be in the same directory as this script.
	# Numpy, Pandas and mysql-connector-python are required.

	import pandas as pd
	import mysql.connector
	from mysql.connector import Error
	import re

	def sanitize_column_name(column):
	sanitized_column = re.sub(r'\W+', '_', column).strip('_')
	while sanitized_column[0].isdigit():
	sanitized_column = "_" + sanitized_column
	return sanitized_column

	# Replace with your MySQL container's details
	mysql_host = 'localhost'
	mysql_port = 3306
	mysql_user = 'root'
	mysql_password = 'password'
	database_name = 'my_db'

	# Read the CSV file
	csv_file = 'data.csv'
	df = pd.read_csv(csv_file)
	df.columns = [sanitize_column_name(col) for col in df.columns]

	# Example of column groups - modify this according to your needs
	group_1_columns = df.columns[:len(df.columns)//2]
	group_2_columns = df.columns[len(df.columns)//2:]

	# Connect to the MySQL container
	try:
	connection = mysql.connector.connect(
	host=mysql_host,
	port=mysql_port,
	user=mysql_user,
	password=mysql_password,
	database=database_name
	)

	if connection.is_connected():
	cursor = connection.cursor()

	# Main table
	main_table_name = 'MLS_data_main'
	create_main_table_query = f"CREATE TABLE {main_table_name} (id INT AUTO_INCREMENT PRIMARY KEY)"
	cursor.execute(create_main_table_query)
	connection.commit()

	# Function to create and insert data into a related table
	def create_related_table(group_name, group_columns, main_table_name):
	table_name = f"{main_table_name}_{group_name}"
	create_table_query = f"CREATE TABLE {table_name} (id INT AUTO_INCREMENT PRIMARY KEY, main_id INT, FOREIGN KEY (main_id) REFERENCES {main_table_name}(id)"
	column_definitions = []

	for column in group_columns:
	dtype = df[column].dtype
	sanitized_column = sanitize_column_name(column)
	mysql_dtype = ''
	if "int" in str(dtype):
	mysql_dtype = 'INT'
	elif "float" in str(dtype):
	mysql_dtype = 'FLOAT'
	elif "object" in str(dtype):
	mysql_dtype = 'TEXT'
	elif "bool" in str(dtype):
	mysql_dtype = 'TINYINT(1)'
	column_definitions.append(f"{sanitized_column} {mysql_dtype}")

	create_table_query += ', ' + ', '.join(column_definitions) + ")"
	cursor.execute(create_table_query)
	connection.commit()

	# Insert data into the related table
	for _, row in df[group_columns].iterrows():
	# Replace NaN values with None
	row_values = tuple(row.values)
	row_values = tuple(None if pd.isna(value) else value for value in row_values)
	insert_query = f"INSERT INTO {table_name} (main_id, {', '.join(group_columns)}) VALUES (%s, {', '.join(['%s'] * len(row))})"
	cursor.execute(insert_query, (row.name + 1, *row_values))
	connection.commit()

	# Insert data into the main table and related tables
	for _, row in df.iterrows():
	# Insert a row into the main table
	insert_main_query = f"INSERT INTO {main_table_name} (id) VALUES (%s)"
	cursor.execute(insert_main_query, (row.name + 1,))
	connection.commit()

	# Create related tables and insert data
	create_related_table('group_1', group_1_columns, main_table_name)
	create_related_table('group_2', group_2_columns, main_table_name)

	print(f"Tables created and data inserted successfully.")

	except Error as e:
	print(f"Error: {e}")

	finally:
	if connection.is_connected():
	cursor.close()
	connection.close()