Skip to content

Instantly share code, notes, and snippets.

@BenMorel
Last active December 16, 2021 22:03
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save BenMorel/78f742356391d41c91d1d733f47dcb13 to your computer and use it in GitHub Desktop.
Save BenMorel/78f742356391d41c91d1d733f47dcb13 to your computer and use it in GitHub Desktop.
(Medium) BulkInserter benchmark
<?php
/**
* Benchmarks BulkInserter at various batch sizes.
*
* Before running this script, make sure to:
*
* - set your PDO parameters in this file
* - run composer install
* - import setup.sql into your database
* - uncompress z-employees.csv.xz and rename z-employees.csv to employees.csv
* (the file has been named this way to appear last in the gist)
*
* This script will output results in real time, and append results to a results.tsv file.
* Results can be aggregated using the results.php script.
*
* Test data is a fabricated 1.2M employees list extracted from this repo:
* https://github.com/datacharmer/test_db
*
* The original employees table contains 300k entries, it has been duplicated 4 times.
*
* The test data is fully loaded into memory to ensure that loading it is not a bottleneck.
* Therefore this script requires a lot of memory to run, be sure that you have at least 1 GB free RAM.
*/
use Brick\Db\Bulk\BulkInserter;
require 'vendor/autoload.php';
ini_set('memory_limit', '1G');
set_time_limit(0);
// Configure the batch sizes to benchmark
$batchSizes = [
1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 20, 30, 40, 50, 60, 70, 80, 90,
100, 200, 300, 400, 500, 600, 700, 800, 900,
1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000,
10000
];
$iterations = 10;
// Configure PDO
$pdo = new PDO('mysql:host=localhost;dbname=bench', 'root', '');
$pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
$pdo->setAttribute(PDO::ATTR_EMULATE_PREPARES, false);
// Load the employees data
$employees = [];
$fp = fopen('employees.csv', 'rb');
if ($fp === false) {
die("Could not open employees.csv\n");
}
echo 'Loading test data ';
$n = 0;
while (! feof($fp)) {
$employee = fgetcsv($fp);
if (count($employee) === 1) {
continue;
}
$employee[0] = (int) $employee[0];
$employees[] = $employee;
if (++$n % 100000 === 0) {
echo '.';
}
}
echo "\n";
fclose($fp);
$employeeCount = count($employees);
$fields = ['emp_no', 'birth_date', 'first_name', 'last_name', 'gender', 'hire_date'];
// Run the benchmark
for ($n = 1; $n <= $iterations; $n++) {
foreach ($batchSizes as $batchSize) {
echo "[Iteration $n] Batch size = $batchSize ... ";
$pdo->exec('TRUNCATE TABLE employees');
$inserter = new BulkInserter($pdo, 'employees', $fields, $batchSize);
$start = microtime(true);
$pdo->beginTransaction();
foreach ($employees as $employee) {
$inserter->queue(...$employee);
}
$inserter->flush();
$pdo->commit();
$end = microtime(true);
$count = $pdo->query('SELECT COUNT(*) FROM employees')->fetchColumn();
if ($count != $employeeCount) {
die("Expected $employeeCount records in employees table, found $count\n");
}
$rps = (int) ($employeeCount / ($end - $start));
echo "RPS = $rps\n";
file_put_contents('results.tsv', "$batchSize\t$rps\n", FILE_APPEND);
}
}
{
"require": {
"brick/db": "dev-master"
}
}
<?php
/**
* Extracts average & maximum RPS for each batch size from the results.tsv file.
*/
$data = file_get_contents('results.tsv');
$data = explode("\n", $data);
$values = [];
foreach ($data as $line) {
if ($line === '') {
continue;
}
$parts = explode("\t", $line);
if (count($parts) !== 2 || ! ctype_digit($parts[0]) || ! ctype_digit($parts[1])) {
echo "Bad line: $line\n";
continue;
}
$size = (int) $parts[0];
$rps = (int) $parts[1];
$values[$size][] = $rps;
}
$average = [];
$max = [];
foreach ($values as $size => $benchs) {
$average[$size] = round(array_sum($benchs) / count($benchs));
$max[$size] = max($benchs);
}
ksort($values);
foreach (array_keys($values) as $size) {
echo $size . "\t";
echo $average[$size] . "\t";
echo $max[$size] . "\n";
}
CREATE TABLE employees (
emp_no INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
birth_date DATE NOT NULL,
first_name VARCHAR(14) NOT NULL,
last_name VARCHAR(16) NOT NULL,
gender ENUM('M','F') NOT NULL,
hire_date DATE NOT NULL,
PRIMARY KEY (emp_no)
) ENGINE=InnoDB CHARSET=latin1;
This file has been truncated, but you can view the full file.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment