Created
May 8, 2023 07:49
-
-
Save Shawn-Armstrong/bbf81db0f39c4e5bbb92f0dd73883505 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# data.CSV must be in the same directory as this script. | |
# Numpy, Pandas and mysql-connector-python are required. | |
import pandas as pd | |
import mysql.connector | |
from mysql.connector import Error | |
import re | |
def sanitize_column_name(column): | |
sanitized_column = re.sub(r'\W+', '_', column).strip('_') | |
while sanitized_column[0].isdigit(): | |
sanitized_column = "_" + sanitized_column | |
return sanitized_column | |
# Replace with your MySQL container's details | |
mysql_host = 'localhost' | |
mysql_port = 3306 | |
mysql_user = 'root' | |
mysql_password = 'password' | |
database_name = 'my_db' | |
# Read the CSV file | |
csv_file = 'data.csv' | |
df = pd.read_csv(csv_file) | |
df.columns = [sanitize_column_name(col) for col in df.columns] | |
# Example of column groups - modify this according to your needs | |
group_1_columns = df.columns[:len(df.columns)//2] | |
group_2_columns = df.columns[len(df.columns)//2:] | |
# Connect to the MySQL container | |
try: | |
connection = mysql.connector.connect( | |
host=mysql_host, | |
port=mysql_port, | |
user=mysql_user, | |
password=mysql_password, | |
database=database_name | |
) | |
if connection.is_connected(): | |
cursor = connection.cursor() | |
# Main table | |
main_table_name = 'MLS_data_main' | |
create_main_table_query = f"CREATE TABLE {main_table_name} (id INT AUTO_INCREMENT PRIMARY KEY)" | |
cursor.execute(create_main_table_query) | |
connection.commit() | |
# Function to create and insert data into a related table | |
def create_related_table(group_name, group_columns, main_table_name): | |
table_name = f"{main_table_name}_{group_name}" | |
create_table_query = f"CREATE TABLE {table_name} (id INT AUTO_INCREMENT PRIMARY KEY, main_id INT, FOREIGN KEY (main_id) REFERENCES {main_table_name}(id)" | |
column_definitions = [] | |
for column in group_columns: | |
dtype = df[column].dtype | |
sanitized_column = sanitize_column_name(column) | |
mysql_dtype = '' | |
if "int" in str(dtype): | |
mysql_dtype = 'INT' | |
elif "float" in str(dtype): | |
mysql_dtype = 'FLOAT' | |
elif "object" in str(dtype): | |
mysql_dtype = 'TEXT' | |
elif "bool" in str(dtype): | |
mysql_dtype = 'TINYINT(1)' | |
column_definitions.append(f"{sanitized_column} {mysql_dtype}") | |
create_table_query += ', ' + ', '.join(column_definitions) + ")" | |
cursor.execute(create_table_query) | |
connection.commit() | |
# Insert data into the related table | |
for _, row in df[group_columns].iterrows(): | |
# Replace NaN values with None | |
row_values = tuple(row.values) | |
row_values = tuple(None if pd.isna(value) else value for value in row_values) | |
insert_query = f"INSERT INTO {table_name} (main_id, {', '.join(group_columns)}) VALUES (%s, {', '.join(['%s'] * len(row))})" | |
cursor.execute(insert_query, (row.name + 1, *row_values)) | |
connection.commit() | |
# Insert data into the main table and related tables | |
for _, row in df.iterrows(): | |
# Insert a row into the main table | |
insert_main_query = f"INSERT INTO {main_table_name} (id) VALUES (%s)" | |
cursor.execute(insert_main_query, (row.name + 1,)) | |
connection.commit() | |
# Create related tables and insert data | |
create_related_table('group_1', group_1_columns, main_table_name) | |
create_related_table('group_2', group_2_columns, main_table_name) | |
print(f"Tables created and data inserted successfully.") | |
except Error as e: | |
print(f"Error: {e}") | |
finally: | |
if connection.is_connected(): | |
cursor.close() | |
connection.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment