|
<?php |
|
|
|
use Faker\Factory; |
|
use Flow\ETL\DSL\From; |
|
use Flow\ETL\DSL\Json; |
|
use Flow\ETL\DSL\Parquet; |
|
use Flow\ETL\Filesystem\SaveMode; |
|
use Flow\ETL\Flow; |
|
use Flow\Parquet\Consts; |
|
|
|
// Load Composer's autoloader. `require` is used instead of `include` so that a
// missing vendor directory aborts the script right here with a fatal error,
// rather than emitting a warning and failing later on the first Faker/Flow class.
require __DIR__ . '/vendor/autoload.php';

// Faker generator that supplies the fake values for the dataset.
$faker = Factory::create();
|
|
|
echo "Generating fake data...\n";

// Small factories for the values that appear several times in every record.
$randomInt32 = static fn () => $faker->numberBetween(0, Consts::PHP_INT32_MAX);
$randomDateTime = static fn () => \DateTimeImmutable::createFromMutable($faker->dateTimeThisYear);

// Build 1,000 template records (ids 1..1000), each holding scalar, temporal,
// list, map and struct shaped values.
$template = [];

foreach (\range(1, 1_000) as $id) {
    $template[] = [
        'boolean' => $faker->boolean,
        'int32' => $randomInt32(),
        'int64' => $faker->numberBetween(0, PHP_INT_MAX),
        'float' => 10.25,
        'double' => $faker->randomFloat(),
        'decimal' => \round($faker->randomFloat(5), 2),
        'string' => $faker->text(50),
        // Date-only value: the time part is reset to midnight.
        'date' => $randomDateTime()->setTime(0, 0, 0, 0),
        'datetime' => $randomDateTime(),
        'list_of_datetimes' => [
            $randomDateTime(),
            $randomDateTime(),
            $randomDateTime(),
        ],
        'map_of_ints' => [
            'a' => $randomInt32(),
            'b' => $randomInt32(),
            'c' => $randomInt32(),
        ],
        // range(0, n) is inclusive, so each record gets between 2 and 11 strings.
        'list_of_strings' => \array_map(
            static fn (int $index) => $faker->text(50),
            \range(0, \random_int(1, 10)),
        ),
        'struct_flat' => [
            'id' => $id,
            // Zero-padded to five digits, e.g. name_00042.
            'name' => \sprintf('name_%05d', $id),
        ],
    ];
}

echo "Template data generated, size: " . \number_format(\count($template)) . " records\n";
|
// Number of times the template is repeated to build the full benchmark dataset.
$multiplier = 1_000;

echo "Multiplying data by " . \number_format($multiplier) . "...\n";

// Merge every copy in a single call. Calling array_merge() inside a loop
// re-copies the ever-growing $data array on each iteration (quadratic work).
// array_fill() does not duplicate the template's contents (PHP arrays are
// copy-on-write), so the one variadic merge builds the final list in one pass.
// With a multiplier of 0 this yields an empty array, same as the loop did.
$data = \array_merge(...\array_fill(0, $multiplier, $template));

echo "Dataset size: " . \number_format(\count($data)) . " records\n";
|
|
|
// Output locations, next to this script.
$jsonTarget = __DIR__ . '/dataset.json';
$parquetTarget = __DIR__ . '/dataset.parquet';

// Read the in-memory dataset, set the save mode, then attach one writer per
// format before running the pipeline. The call order matches a single fluent chain.
$pipeline = (new Flow())->read(From::array($data));
$pipeline = $pipeline->mode(SaveMode::Overwrite);
$pipeline = $pipeline->write(Json::to($jsonTarget));
$pipeline = $pipeline->write(Parquet::to($parquetTarget));
$pipeline->run();
|
|
|
// Formats a file's size, divided by Consts::MB_SIZE, with two decimal places.
$formatSize = static fn (string $path): string => \number_format(\filesize($path) / Consts::MB_SIZE, 2);

// Report the size of each generated file.
echo "Json: " . $formatSize(__DIR__ . '/dataset.json') . " Mb\n";
echo "Parquet: " . $formatSize(__DIR__ . '/dataset.parquet') . " Mb\n";
Libraries and environment used in this benchmark:
Parquet
JSON
PHP 8.1
CPU: Apple M1
RAM: 64 GB