@stephanGarland
Last active September 30, 2023 23:31
Benchmarking MySQL INSERTs into InnoDB and MyISAM

import csv
import json
import os
import statistics
import time

import pymysql


def prepare_values(row):
    # JSON-encode any list cells so they can be bound as plain strings
    new_row = []
    for cell in row:
        if isinstance(cell, list):
            new_row.append(json.dumps(cell))
        else:
            new_row.append(cell)
    return new_row


def batch_insert(tbl_name, rows, cursor, is_full):
    query = f"INSERT INTO `{tbl_name}` (user_id,user_email,created_at,tags,shared_with,is_completed,is_overdue,description,title,user_name) VALUES "
    values_str_list = []
    for row in rows:
        row = prepare_values(row)
        value_str = "(" + ",".join(["%s" for _ in row]) + ")"
        # mogrify() renders the placeholders client-side, so the whole
        # multi-row INSERT goes to the server as a single statement
        values_str_list.append(cursor.mogrify(value_str, row))
    query += ",".join(values_str_list) + ";"
    start_time = time.time()
    cursor.execute(query)
    end_time = time.time()
    if is_full:
        timings[tbl_name]["full"].append(end_time - start_time)
    else:
        timings[tbl_name]["empty"].append(end_time - start_time)


def process_csv_file(
    cursor, csv_file_path, tbl_name, chunk_size, num_rows, is_full: bool
):
    values = []
    with open(csv_file_path, "r") as csvfile:
        csvreader = csv.reader(csvfile, quotechar="'")
        for i, row in enumerate(csvreader, 1):
            values.append(row)
            if i % chunk_size == 0:
                print(
                    f"inserting batch {i // chunk_size} / {num_rows // chunk_size} into {tbl_name} ({'full' if is_full else 'empty'})"
                )
                batch_insert(tbl_name, values, cursor, is_full)
                values = []
    # flush any remainder that didn't fill a full chunk
    if values:
        batch_insert(tbl_name, values, cursor, is_full)


db_params = {
    "host": "YOUR_HOST",
    "password": "YOUR_PASS",
    "user": "YOUR_USER",
    "db": "YOUR_DB",
    "autocommit": True,
    "ssl": {"fake_flag_to_enable_tls": True},
}
csv_file_path1 = "/mnt/ramdisk/xaa.csv"
csv_file_path2 = "/mnt/ramdisk/xab.csv"
csv_file_size = os.path.getsize(csv_file_path1)
table_name1 = "test_innodb"
table_name2 = "test_myisam"
num_rows = 12_500_000
chunk_size = 10_000
timings = {
    table_name1: {"empty": [], "full": []},
    table_name2: {"empty": [], "full": []},
}

connection = pymysql.connect(**db_params)
cursor = connection.cursor()
for tbl in (table_name1, table_name2):
    # file 1 loads into an empty table; file 2 lands on top of those rows,
    # which is the empty/full split recorded in timings
    for j, f in enumerate([csv_file_path1, csv_file_path2]):
        print(f"\nloading file {j + 1} / 2 into {tbl} in batches of {chunk_size} rows\n")
        process_csv_file(cursor, f, tbl, chunk_size, num_rows, bool(j))
cursor.close()
connection.close()

for table, table_timings in timings.items():
    for category, category_timings in table_timings.items():
        min_time = min(category_timings)
        max_time = max(category_timings)
        avg_time = statistics.mean(category_timings)
        stdev_time = statistics.stdev(category_timings)
        total_time = sum(category_timings)
        print(f"\ntimings for {table} ({category}):")
        print(f"\tMin: {min_time:.3f} seconds")
        print(f"\tMax: {max_time:.3f} seconds")
        print(f"\tAvg: {avg_time:.3f} seconds")
        print(f"\tStd Dev: {stdev_time:.3f} seconds")
        print(
            f"\tLoaded {num_rows} rows in {total_time:.2f} seconds at {((csv_file_size / total_time) / 2**20):.2f} MiB/s"
        )
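
The script assumes the two target tables already exist. A minimal DDL sketch that matches the INSERT column list and the data sample in the comments below; the gist doesn't include the actual DDL, so apart from the CHAR(32) primary key (confirmed in the comments) every column type here is an assumption. It reuses the cursor from the script:

# Sketch only: all non-key column types are inferred from the data sample.
DDL = """
CREATE TABLE IF NOT EXISTS `{name}` (
    user_id      CHAR(32)     NOT NULL,  -- UUIDv7 as 32 hex chars (confirmed below)
    user_email   VARCHAR(255) NOT NULL,
    created_at   DATETIME     NOT NULL,
    tags         TEXT,                   -- script inserts JSON-encoded strings
    shared_with  TEXT,                   -- script inserts JSON-encoded strings
    is_completed TINYINT(1)   NOT NULL,
    is_overdue   TINYINT(1)   NOT NULL,
    description  TEXT,
    title        VARCHAR(255),
    user_name    VARCHAR(255),
    PRIMARY KEY (user_id)
) ENGINE={engine}
"""

cursor.execute(DDL.format(name="test_innodb", engine="InnoDB"))
cursor.execute(DDL.format(name="test_myisam", engine="MyISAM"))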
stephanGarland commented Sep 30, 2023

timings for test_innodb (empty):
        Min: 0.437 seconds
        Max: 1.631 seconds
        Avg: 1.034 seconds
        Std Dev: 0.180 seconds
        Loaded 12500000 rows in 1292.94 seconds at 2.03 MiB/s

timings for test_innodb (full):
        Min: 0.458 seconds
        Max: 1.688 seconds
        Avg: 1.069 seconds
        Std Dev: 0.169 seconds
        Loaded 12500000 rows in 1336.56 seconds at 1.96 MiB/s

timings for test_myisam (empty):
        Min: 0.486 seconds
        Max: 2.424 seconds
        Avg: 1.017 seconds
        Std Dev: 0.230 seconds
        Loaded 12500000 rows in 1271.32 seconds at 2.06 MiB/s

timings for test_myisam (full):
        Min: 0.537 seconds
        Max: 2.427 seconds
        Avg: 1.033 seconds
        Std Dev: 0.219 seconds
        Loaded 12500000 rows in 1291.43 seconds at 2.03 MiB/s

stephanGarland commented Sep 30, 2023

A sample of the data used in this benchmark and in the bulk load. The PK (the only key) on the tables is a CHAR(32) holding a UUIDv7, as shown here. While not as fast as an AUTO_INCREMENT INT, a UUIDv7 is K-sortable, so it won't cause nearly as many page splits as a UUIDv4 would (a generator sketch follows the sample).

'user_id','user_email','created_at','tags','shared_with','is_completed','is_overdue','description','title','user_name'
'018aae66103a7e7aa19e35f1b4330c22','killie.kironde@moonstone.com','2014-12-26 12:30:05','[]','[]','1','0','refute skittle unshipped thinly stark icing bribe subsystem ecard prudishly','polar','Killie.Kironde'
'018aae66103b752a81967307d5898b67','augusto.schluter@critter.com','2023-03-20 16:38:05','[]','[]','1','0','video badass surprise gander bloating jalapeno overpower conjure underpay oboe','starboard','Augusto.Schluter'
'018aae66103c7f4399a5aedd96521b89','orton.wong@widen.com','2017-08-07 21:17:02','[]','[]','0','0','basin proximity bonfire fantasy camcorder broadness waged willed strike moaner','chevy','Orton.Wong'
'018aae66103d7db488774840e2c2387d','bucky.shaffert@roulette.com','2014-06-26 22:37:13','[]','[]','1','0','shock happier tiara galleria sharply expire coeditor baritone zen dangling','cardinal','Bucky.Shaffert'
'018aae66103e7aa1a5a0b5717a876eb5','vasily.colbye@stifling.com','1997-07-23 11:14:30','[]','[]','1','0','tinderbox darkening dangle expand rash marine caliber hardness foyer wound','resort','Vasily.Colbye'
'018aae66103f7df78a68aaaaa6668501','maddy.childs@conducive.com','2016-10-13 00:52:01','[]','[]','1','0','circus saffron draw supreme audience stipulate acrobat context womankind impure','armless','Maddy.Childs'
'018aae6610407e4dbdba2d01093860b4','lucine.kunz@ambiguity.com','2001-07-03 16:29:45','[]','[]','1','0','omen skillful purposely drove pond uptown clumsily hardly womanlike obsession','backspin','Lucine.Kunz'
'018aae6610417848b92c5b8d19efe100','roderick.linson@stress.com','2000-04-14 23:49:16','[]','[]','1','0','passing puzzling until condiment getting spud coconut quiver pronounce casually','flick','Roderick.Linson'
'018aae6610427200b0e03f984f352ed4','jorrie.atkinson@pronounce.com','2010-09-25 08:27:01','[]','[]','1','0','smartness banknote thursday kindly pesticide selection dude snub shrink drone','geek','Jorrie.Atkinson'
'018aae66104372ad91c0fbcd9aae46b9','artur.sikata@frying.com','2014-12-12 16:44:49','[]','["018aae050842771f9eaa5b9ab581cfe1","018aae4c6c967046a0548438d1b7b9dc","018aadaf5172779f84b5e508557dbd9b"]','0','0','approval squeamish cornstalk spectator perfectly residue crushed resubmit swipe student','capillary','Artur.Sikata'
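
For reference, a minimal sketch of a UUIDv7 generator that produces 32-character hex strings like the user_id values above, following the RFC 9562 bit layout; it is illustrative, not the tool that generated this sample:

import os
import time

def uuid7_hex() -> str:
    # 48-bit Unix timestamp in milliseconds: the K-sortable prefix
    ts_ms = int(time.time() * 1000) & ((1 << 48) - 1)
    rand_a = int.from_bytes(os.urandom(2), "big") & 0x0FFF           # 12 bits
    rand_b = int.from_bytes(os.urandom(8), "big") & ((1 << 62) - 1)  # 62 bits
    value = (
        (ts_ms << 80)      # unix_ts_ms: bits 127..80
        | (0x7 << 76)      # version 7:  bits 79..76
        | (rand_a << 64)   # rand_a:     bits 75..64
        | (0x2 << 62)      # variant:    bits 63..62
        | rand_b           # rand_b:     bits 61..0
    )
    return f"{value:032x}"

print(uuid7_hex())  # e.g. an '018aae66...'-style value, time-ordered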
