Last active
November 20, 2019 09:54
-
-
Save fox34/15a2ffc7f594338e5a2bb2cd3c8d799a to your computer and use it in GitHub Desktop.
Get last chunk of concatenated gzip file (PHP CLI)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env php
<?php
// The gz content is assumed to be a plain text file, stored in a concatenated gzip file.

// Whatever way the script terminates, finish the output with a line break.
register_shutdown_function(function () {
    echo PHP_EOL;
});

// This tool is only meant to be run from the command line.
if (PHP_SAPI !== 'cli') {
    exit('This is a console application.');
}

// The path of the input file is a mandatory first argument.
if ($_SERVER['argc'] < 2) {
    exit('Invalid number of arguments. Please provide input file.');
}
/**
 * https://gist.github.com/fox34/2503bc11cfc1f1001a919ac655e48706
 *
 * Extract and decode the last member of a concatenated gzip file.
 *
 * Multiple gzip files can simply be concatenated. The tail of the file (the last
 * $readLimit + 1 bytes) is read once and searched backwards for the gzip magic
 * number 1f 8b. Everything from a hit up to the end of the file is handed to
 * gzdecode(); the hit closest to the end of the file that decodes successfully
 * wins. Hits that do not decode are treated as magic bytes that merely occur
 * inside compressed content and are skipped.
 *
 * Use only for smaller chunks of locally available data: the tail is held in memory.
 *
 * @param string $file      Path of the (concatenated) gzip file.
 * @param int    $readLimit Number of candidate start offsets to examine, counted backwards
 *                          from the end of the file. Capped at 1000000 (about 1MB of
 *                          compressed data; the uncompressed data may be larger).
 *
 * @return string Decoded content of the last gzip member.
 *
 * @throws \Exception If the file cannot be opened or read, or if no decodable
 *                    member starts within the limit.
 */
function gzfile_get_last_chunk_of_concatenated_file(string $file, int $readLimit = 1000000) : string
{
    $magic = "\x1f\x8b";

    // Limit to 1MB (integer literal, so the limit stays an int)
    $readLimit = min($readLimit, 1000000);

    $fp = fopen($file, 'rb');
    if ($fp === false) {
        throw new \Exception('Could not read file.');
    }

    // A member starting at one of the last $readLimit candidate offsets lies
    // completely within the final $readLimit + 1 bytes of the file.
    $window = $readLimit + 1;
    $tail = '';

    try {
        if ($readLimit > 0) {
            $size = fseek($fp, 0, SEEK_END) === 0 ? ftell($fp) : false;

            if ($size !== false && fseek($fp, max(0, $size - $window)) === 0) {
                $tail = stream_get_contents($fp);
            } else {
                // Stream is not seekable: read everything and keep only the tail
                $tail = stream_get_contents($fp);
                if ($tail !== false && strlen($tail) > $window) {
                    $tail = substr($tail, -$window);
                }
            }

            if ($tail === false) {
                throw new \Exception('Could not read file.');
            }
        }
    } finally {
        fclose($fp);
    }

    $length = strlen($tail);
    $offset = strrpos($tail, $magic);

    while ($offset !== false) {
        // gzdecode() raises a warning on invalid data; failure is an expected outcome here
        $data = @gzdecode(substr($tail, $offset));
        if ($data !== false) {
            return $data;
        }

        // Could not decode data. Maybe magic number appeared inside compressed content?
        if ($offset === 0) {
            break;
        }

        // A negative offset makes strrpos() only report matches that start at or
        // before $length + offset, i.e. strictly before the hit that just failed.
        $offset = strrpos($tail, $magic, $offset - 1 - $length);
    }

    throw new \Exception('Valid chunk not found within provided size limit.');
}
// Command line arguments: the gzip file to inspect and, optionally, the number
// of trailing lines to print (defaults to 10; negative values are treated as 0).
$inputFile = $_SERVER['argv'][1];
$requestedLines = (int)($_SERVER['argv'][2] ?? 10);
$numLines = $requestedLines > 0 ? $requestedLines : 0;

if (!is_readable($inputFile)) {
    exit('Provided input file is not readable.');
}

try {
    $lastChunk = gzfile_get_last_chunk_of_concatenated_file($inputFile);
} catch (\Exception $e) {
    exit('Could not read chunked gzip file: ' . $e->getMessage());
}

if ($lastChunk === '') {
    exit('Last chunk is empty.');
}

// A line count of 0 prints the chunk untouched; otherwise keep only the last n lines.
if ($numLines > 0) {
    $lines = explode(PHP_EOL, $lastChunk);

    // A trailing line break yields an empty last element, which is not counted as a line.
    if ($lines[count($lines) - 1] === '') {
        array_pop($lines);
    }

    $surplus = count($lines) - $numLines;
    if ($surplus > 0) {
        $lines = array_slice($lines, $surplus);
    }

    $lastChunk = implode(PHP_EOL, $lines);
}

echo $lastChunk;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment