Skip to content

Instantly share code, notes, and snippets.

@norberttech
Last active November 7, 2023 17:37
Show Gist options
  • Save norberttech/fa00afa021520be34299f3a2b8f164b3 to your computer and use it in GitHub Desktop.
Save norberttech/fa00afa021520be34299f3a2b8f164b3 to your computer and use it in GitHub Desktop.
Parquet vs Json - PHP Benchmark
<?php
use Flow\Parquet\Reader;
use JsonMachine\Items;
use JsonMachine\JsonDecoder\ExtJsonDecoder;
include __DIR__ . '/vendor/autoload.php';
// Benchmark: time one full sequential pass over the same dataset stored as
// JSON (streamed through JsonMachine) and as Parquet (read through Flow's
// Reader). Only the iteration is timed; the JSON stream and the Parquet file
// are opened before each timer starts.

$jsonPath = __DIR__ . '/dataset.json';
$jsonStream = \fopen($jsonPath, 'r');

// fopen() returns false on failure; stop here rather than handing a
// non-resource to the JSON parser.
if ($jsonStream === false) {
    throw new \RuntimeException("Unable to open {$jsonPath} for reading");
}

// hrtime() is monotonic, so the measurement cannot be skewed by system
// clock adjustments the way microtime() can. With `true` it returns
// nanoseconds as a number.
$start = \hrtime(true);
$rows = 0;

foreach (Items::fromStream($jsonStream, ['decoder' => new ExtJsonDecoder(true)])->getIterator() as $rowData) {
    $rows++;
}

$elapsedSeconds = (\hrtime(true) - $start) / 1e9;
\fclose($jsonStream);

echo "JSON: " . \number_format($rows) . " rows in " . \number_format($elapsedSeconds, 2) . " seconds\n";

$parquet = (new Reader())->read(__DIR__ . '/dataset.parquet');

$start = \hrtime(true);
$rows = 0;

foreach ($parquet->values() as $row) {
    $rows++;
}

$elapsedSeconds = (\hrtime(true) - $start) / 1e9;

echo "Parquet: " . \number_format($rows) . " rows in " . \number_format($elapsedSeconds, 2) . " seconds\n";
{
"name": "norbert/parquet-json-benchmark",
"description": "Parquet JSON benchmark",
"require": {
"fakerphp/faker": "2.0.x-dev",
"flow-php/etl": "1.x-dev",
"flow-php/etl-adapter-parquet": "1.x-dev",
"flow-php/etl-adapter-json": "1.x-dev"
},
"minimum-stability": "dev"
}
<?php
use Faker\Factory;
use Flow\ETL\DSL\From;
use Flow\ETL\DSL\Json;
use Flow\ETL\DSL\Parquet;
use Flow\ETL\Filesystem\SaveMode;
use Flow\ETL\Flow;
use Flow\Parquet\Consts;
include __DIR__ . '/vendor/autoload.php';
// Dataset generator: builds 1,000 fake template rows, repeats them
// $multiplier times, and writes the result to dataset.json and
// dataset.parquet next to this script, then prints both file sizes.

$faker = Factory::create();

echo "Generating fake data...\n";

// One template row per integer in 1..1000; $i is used for the struct id/name.
$template = \array_map(
    static function (int $i) use ($faker): array {
        return [
            'boolean' => $faker->boolean,
            'int32' => $faker->numberBetween(0, Consts::PHP_INT32_MAX),
            'int64' => $faker->numberBetween(0, PHP_INT_MAX),
            'float' => 10.25,
            'double' => $faker->randomFloat(),
            'decimal' => \round($faker->randomFloat(5), 2),
            'string' => $faker->text(50),
            // Date-only value: same source as 'datetime' but with the time reset to midnight.
            'date' => \DateTimeImmutable::createFromMutable($faker->dateTimeThisYear)->setTime(0, 0, 0, 0),
            'datetime' => \DateTimeImmutable::createFromMutable($faker->dateTimeThisYear),
            'list_of_datetimes' => [
                \DateTimeImmutable::createFromMutable($faker->dateTimeThisYear),
                \DateTimeImmutable::createFromMutable($faker->dateTimeThisYear),
                \DateTimeImmutable::createFromMutable($faker->dateTimeThisYear),
            ],
            'map_of_ints' => [
                'a' => $faker->numberBetween(0, Consts::PHP_INT32_MAX),
                'b' => $faker->numberBetween(0, Consts::PHP_INT32_MAX),
                'c' => $faker->numberBetween(0, Consts::PHP_INT32_MAX),
            ],
            // range() is inclusive on both ends, so each list holds 2 to 11 strings.
            'list_of_strings' => \array_map(
                static fn (int $i) => $faker->text(50),
                \range(0, \random_int(1, 10))
            ),
            'struct_flat' => [
                'id' => $i,
                'name' => 'name_' . \str_pad((string) $i, 5, '0', STR_PAD_LEFT),
            ],
        ];
    },
    \range(1, 1_000)
);

echo "Template data generated, size: " . \number_format(\count($template)) . " records\n";

$multiplier = 1_000;

echo "Multiplying data by " .\number_format($multiplier) . "...\n";

// Merge all copies in a single call. Appending with array_merge() inside a
// loop re-copies the ever-growing result on every iteration (quadratic work);
// one variadic merge yields the same re-indexed list in a single pass.
$data = \array_merge(...\array_fill(0, $multiplier, $template));

echo "Dataset size: " . \number_format(\count($data)) . " records\n";

// Write the same in-memory rows to both formats, replacing any existing files.
(new Flow())
    ->read(From::array($data))
    ->mode(SaveMode::Overwrite)
    ->write(Json::to(__DIR__ . '/dataset.json'))
    ->write(Parquet::to(__DIR__ . '/dataset.parquet'))
    ->run();

echo "Json: " . \number_format(\filesize(__DIR__ . '/dataset.json') / Consts::MB_SIZE, 2) . " Mb\n";
echo "Parquet: " . \number_format(\filesize(__DIR__ . '/dataset.parquet') / Consts::MB_SIZE, 2) . " Mb\n";
$ php generate.php
Generating fake data...
Template data generated, size: 1,000 records
Multiplying data by 1,000...
Dataset size: 1,000,000 records
Json: 910.73 Mb
Parquet: 17.51 Mb
$ php benchmark.php 
JSON: 1,000,000 rows in 76.48 seconds
Parquet: 1,000,000 rows in 34.34 seconds
@norberttech
Copy link
Author

Libraries used in this benchmark:

Parquet: Flow\Parquet\Reader (flow-php)
Json: JsonMachine\Items (JSON Machine)

PHP 8.1
CPU: Apple M1
RAM: 64 GB

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment