Skip to content

Instantly share code, notes, and snippets.

@Shawn-Armstrong
Created May 8, 2023 07:49
Show Gist options
  • Save Shawn-Armstrong/bbf81db0f39c4e5bbb92f0dd73883505 to your computer and use it in GitHub Desktop.
Save Shawn-Armstrong/bbf81db0f39c4e5bbb92f0dd73883505 to your computer and use it in GitHub Desktop.
# data.CSV must be in the same directory as this script.
# Numpy, Pandas and mysql-connector-python are required.
import pandas as pd
import mysql.connector
from mysql.connector import Error
import re
def sanitize_column_name(column):
sanitized_column = re.sub(r'\W+', '_', column).strip('_')
while sanitized_column[0].isdigit():
sanitized_column = "_" + sanitized_column
return sanitized_column
# Replace with your MySQL container's details
mysql_host = 'localhost'
mysql_port = 3306
mysql_user = 'root'
mysql_password = 'password'
database_name = 'my_db'
# Read the CSV file
csv_file = 'data.csv'
df = pd.read_csv(csv_file)
df.columns = [sanitize_column_name(col) for col in df.columns]
# Example of column groups - modify this according to your needs
group_1_columns = df.columns[:len(df.columns)//2]
group_2_columns = df.columns[len(df.columns)//2:]
# Connect to the MySQL container
try:
connection = mysql.connector.connect(
host=mysql_host,
port=mysql_port,
user=mysql_user,
password=mysql_password,
database=database_name
)
if connection.is_connected():
cursor = connection.cursor()
# Main table
main_table_name = 'MLS_data_main'
create_main_table_query = f"CREATE TABLE {main_table_name} (id INT AUTO_INCREMENT PRIMARY KEY)"
cursor.execute(create_main_table_query)
connection.commit()
# Function to create and insert data into a related table
def create_related_table(group_name, group_columns, main_table_name):
table_name = f"{main_table_name}_{group_name}"
create_table_query = f"CREATE TABLE {table_name} (id INT AUTO_INCREMENT PRIMARY KEY, main_id INT, FOREIGN KEY (main_id) REFERENCES {main_table_name}(id)"
column_definitions = []
for column in group_columns:
dtype = df[column].dtype
sanitized_column = sanitize_column_name(column)
mysql_dtype = ''
if "int" in str(dtype):
mysql_dtype = 'INT'
elif "float" in str(dtype):
mysql_dtype = 'FLOAT'
elif "object" in str(dtype):
mysql_dtype = 'TEXT'
elif "bool" in str(dtype):
mysql_dtype = 'TINYINT(1)'
column_definitions.append(f"{sanitized_column} {mysql_dtype}")
create_table_query += ', ' + ', '.join(column_definitions) + ")"
cursor.execute(create_table_query)
connection.commit()
# Insert data into the related table
for _, row in df[group_columns].iterrows():
# Replace NaN values with None
row_values = tuple(row.values)
row_values = tuple(None if pd.isna(value) else value for value in row_values)
insert_query = f"INSERT INTO {table_name} (main_id, {', '.join(group_columns)}) VALUES (%s, {', '.join(['%s'] * len(row))})"
cursor.execute(insert_query, (row.name + 1, *row_values))
connection.commit()
# Insert data into the main table and related tables
for _, row in df.iterrows():
# Insert a row into the main table
insert_main_query = f"INSERT INTO {main_table_name} (id) VALUES (%s)"
cursor.execute(insert_main_query, (row.name + 1,))
connection.commit()
# Create related tables and insert data
create_related_table('group_1', group_1_columns, main_table_name)
create_related_table('group_2', group_2_columns, main_table_name)
print(f"Tables created and data inserted successfully.")
except Error as e:
print(f"Error: {e}")
finally:
if connection.is_connected():
cursor.close()
connection.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment