Skip to content

Instantly share code, notes, and snippets.

@maximal
Created November 3, 2020 16:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maximal/c9b29b4367c9b60beca5687702329a17 to your computer and use it in GitHub Desktop.
Save maximal/c9b29b4367c9b60beca5687702329a17 to your computer and use it in GitHub Desktop.
Excel writing example
<?php
/**
* Генератор текстовых диффов из XLSX-файлов
* с использованием редакционного расстояния Левенштейна
*
* @author MaximAL
* @since 2020-08-01
* @date 2020-08-01
* @time 4:51
*
* @link https://maximals.ru
* @link https://sijeko.ru
*/
namespace Maximal\TextDiff;
use Box\Spout\Common\Entity\Row;
use Box\Spout\Reader\Common\Creator\ReaderEntityFactory;
use Box\Spout\Reader\XLSX\Sheet;
use Box\Spout\Writer\Common\Creator\Style\StyleBuilder;
use Box\Spout\Writer\Common\Creator\WriterEntityFactory;
/**
* Class App
* @package Maximal\TextDiff
*/
class App
{
const DEFAULT_NUMBER = 2;
private $config;
private $argv;
public function __construct(array $argv, array $config = [])
{
$this->argv = $argv;
$this->config = $config;
}
public function run()
{
echo 'XLSX text diff top generator (c) MaximAL 2020', PHP_EOL;
$argc = count($this->argv);
if ($argc < 3 || $argc > 4) {
$this->help();
return;
}
$timeStart = microtime(true);
// Валидация количества
$numberParam = $argc === 4 ? $this->argv[3] : '';
if ($argc === 4 && !preg_match('/^[1-9]\d*$/', $numberParam)) {
echo 'Third parameter (got: ', $numberParam, ') should be a positive integer.', PHP_EOL;
return;
}
$number = $argc === 4 ? intval($numberParam) : self::DEFAULT_NUMBER;
// Валидация входного файла
$inputFile = $this->argv[1];
if (!preg_match('/\.xlsx$/ui', $inputFile)) {
echo 'File must be in XLSX format only: ', $inputFile, PHP_EOL;
return;
}
if (!is_file($inputFile)) {
echo 'Input file cannot be read or does not exist: ', $inputFile, PHP_EOL;
return;
}
// Валидация выходного файла
$outputFile = $this->argv[2];
if (is_file($outputFile)) {
echo 'Output file already exists: ', $outputFile, PHP_EOL;
return;
}
if (@file_put_contents($outputFile, 'test') === false) {
echo 'Could not open file for writing (read-only?): ', $outputFile, PHP_EOL;
return;
}
unlink($outputFile);
// Поехали!
$reader = ReaderEntityFactory::createXLSXReader();
$reader->open($inputFile);
$sheetNames = [];
foreach ($reader->getSheetIterator() as $sheet) {
/** @var Sheet $sheet */
$sheetNames []= $sheet->getName();
}
if (count($sheetNames) < 2) {
echo 'XLSX file must contain at least 2 sheets: ', $inputFile, PHP_EOL;
return;
}
$array1 = [];
$array2 = [];
foreach ($reader->getSheetIterator() as $index => $sheet) {
/** @var Sheet $sheet */
foreach ($sheet->getRowIterator() as $row) {
/** @var Row $row */
if ($index === 1) {
$array1 []= $row->toArray();
} else {
$array2 []= $row->toArray();
}
}
}
$reader->close();
array_shift($array1);
array_shift($array2);
$count1 = count($array1);
$count2 = count($array2);
echo 'Average calculations: ', $count1, ' × ', $count2, ' ≈ ';
echo self::averageNumber($count1 * $count2), PHP_EOL;
echo 'Average output rows: ', $count1, ' × ', $number, ' ≈ ';
echo self::averageNumber($count1 * $number), PHP_EOL;
$defaultStyle = (new StyleBuilder())->setShouldWrapText()->build();
$writer = WriterEntityFactory::createXLSXWriter();
$writer->openToFile($outputFile)->setDefaultRowStyle($defaultStyle);
$writer->addRow(WriterEntityFactory::createRowFromArray([
'Массив 1',
null,
'Массив 2',
null,
'Близость',
]));
$writer->addRow(WriterEntityFactory::createRowFromArray([
'ИД',
'Значение',
'ИД',
'Значение',
'Расстояние Дамерау—Левенштейна',
]));
$index = 0;
$rows = 0;
foreach ($array1 as $row1) {
$index++;
$percent = 100 * $index / $count1;
echo "\r", $index , '/', $count1, sprintf(' %.1f%%', $percent);
$id1 = isset($row1[0]) ? intval($row1[0]) : 0;
if ($id1 === 0) {
continue;
}
$text1 = isset($row1[1]) ? trim($row1[1]) : '';
if ($text1 === '') {
continue;
}
$distance = [];
foreach ($array2 as $row2) {
$id2 = isset($row2[0]) ? intval($row2[0]) : 0;
if ($id2 === 0) {
continue;
}
$text2 = isset($row2[1]) ? trim($row2[1]) : '';
if ($text2 === '') {
continue;
}
$distance []= [
'id' => $id2,
'text' => $text2,
'dist' => levenshtein($text1, $text2),
];
}
usort($distance, [self::class, 'compareDistances']);
$top = array_slice($distance, 0, $number);
foreach ($top as $topIndex => $item) {
$writer->addRow(WriterEntityFactory::createRowFromArray([
$id1,
$text1,
$item['id'],
$item['text'],
$item['dist'],
]));
$rows++;
}
unset($distance);
unset($top);
}
echo PHP_EOL, 'Writing file. Please wait...';
$writer->close();
$timeDiff = microtime(true) - $timeStart;
echo "\r", 'Done with ', $rows, ' rows in ', round($timeDiff), ' seconds.', PHP_EOL;
}
private function help()
{
echo 'Usage: php ', $this->argv[0], ' {input XLSX file} ';
echo '{output XLSX file} [n, default ', self::DEFAULT_NUMBER, ']', PHP_EOL;
}
private static function compareDistances($a, $b): int
{
if ($a['dist'] === $b['dist']) {
return 0;
}
return $a['dist'] < $b['dist'] ? -1 : +1;
}
private static function averageNumber($number): string
{
if ($number > 999999999) {
return round($number / 1000000000, 1) . 'G';
}
if ($number > 999999) {
return round($number / 1000000, 1) . 'M';
}
if ($number > 999) {
return round($number / 1000, 1) . 'k';
}
return $number;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment