Skip to content

Instantly share code, notes, and snippets.

@erikyo
Last active March 28, 2022 16:03
Show Gist options
  • Save erikyo/c9365b0bc49d599f1be14b2f7feeed91 to your computer and use it in GitHub Desktop.
Save erikyo/c9365b0bc49d599f1be14b2f7feeed91 to your computer and use it in GitHub Desktop.
<?php
// Debug script: report and display every error, warning and notice.
error_reporting(E_ALL);
ini_set('display_startup_errors', '1');
ini_set('display_errors', '1');

// Number of leading chunk bytes (fourCC + size + start of payload) that is
// read for VP8 / VP8L / VP8X chunks before they are decoded.
const MINIMUM_CHUNK_HEADER_LENGTH = 18;

// Bit masks tested against the flags byte of a VP8X chunk, in ascending bit order.
const VP8X_ANIM = 2;
const VP8X_XMP = 4;
const VP8X_EXIF = 8;
const VP8X_ALPHA = 16;
const VP8X_ICC = 32;
// RIFF
// credits https://doc.wikimedia.org/mediawiki-core/1.27.3/php/classRiffExtractor.html
/**
 * Decode the first bytes of a lossy ('VP8 ') chunk.
 *
 * Expected layout of $header:
 *   bytes 0-3   'VP8 '
 *   bytes 4-7   VP8 stream size
 *   bytes 8-10  frame tag
 *   bytes 11-13 sync code 0x9D 0x01 0x2A
 *   bytes 14-17 width and height (two little-endian 16-bit values)
 *
 * @param string $header At least the first 18 bytes of the chunk.
 * @return array Empty array when the header is truncated or the sync code is
 *               wrong, otherwise the keys 'compression', 'width' and 'height'.
 */
function decodeLossyChunkHeader( $header ) {
    // The size fields end at byte 17, so anything shorter cannot be decoded.
    $requiredLength = 18;
    if ( !is_string( $header ) || strlen( $header ) < $requiredLength ) {
        print_r( 'WebP decodeLossyChunkHeader Header too short: ' . strlen( (string) $header ) . " bytes\n" );
        return [];
    }

    $syncCode = substr( $header, 11, 3 );
    if ( $syncCode !== "\x9D\x01\x2A" ) {
        print_r( 'WebP decodeLossyChunkHeader Invalid sync code: '. bin2hex( $syncCode ) . "\n" );
        return [];
    }

    $imageSize = unpack( 'v2', substr( $header, 14, 4 ) );

    // Sizes are 14 bit; the 2 most significant bits are scaling parameters
    // which are ignored here.
    return [
        'compression' => 'lossy',
        'width' => $imageSize[1] & 0x3FFF,
        'height' => $imageSize[2] & 0x3FFF
    ];
}
/**
 * Decode the first bytes of a lossless ('VP8L') chunk.
 *
 * Expected layout of $header:
 *   bytes 0-3  'VP8L'
 *   bytes 4-7  chunk stream size
 *   byte  8    signature 0x2F
 *   bytes 9-12 little-endian 32-bit field: bits 0-13 are width-1,
 *              bits 14-27 are height-1 (the remaining bits are not read here)
 *
 * @param string $header At least the first 13 bytes of the chunk.
 * @return array Empty array when the header is truncated or the signature is
 *               wrong, otherwise the keys 'compression', 'width' and 'height'.
 */
function decodeLosslessChunkHeader( $header ) {
    // The size field ends at byte 12.
    if ( !is_string( $header ) || strlen( $header ) < 13 ) {
        print_r( 'Invalid lossless header length: ' . strlen( (string) $header ) . "\n" );
        return [];
    }

    if ( $header[8] !== "\x2F" ) {
        print_r( 'Invalid signature: ' . bin2hex( $header[8] ) . "\n" );
        return [];
    }

    // Read the four size bytes as one little-endian integer so both 14-bit
    // fields can be extracted with a shift and a mask. The previous byte-wise
    // code masked the last byte with 0x03 and therefore dropped the two top
    // bits of the height (any height above 4096 came out wrong).
    $sizeBits = unpack( 'V', substr( $header, 9, 4 ) )[1];

    return [
        'compression' => 'lossless',
        'width' => ( $sizeBits & 0x3FFF ) + 1,
        'height' => ( ( $sizeBits >> 14 ) & 0x3FFF ) + 1
    ];
}
/**
 * Decode the first bytes of an extended ('VP8X') chunk.
 *
 * Expected layout of $header:
 *   bytes 0-3   'VP8X'
 *   bytes 4-7   chunk length
 *   byte  8     feature flags (tested against the VP8X_* masks);
 *               bytes 9-11 are not read by this function
 *   bytes 12-14 canvas width - 1  (24-bit little-endian)
 *   bytes 15-17 canvas height - 1 (24-bit little-endian)
 *
 * @param string $header At least the first 18 bytes of the chunk.
 * @return array Empty array when the header is truncated, otherwise the keys
 *               'compression', 'animated', 'transparency', 'EXIF', 'ICC',
 *               'XMP', 'width' and 'height'.
 */
function decodeExtendedChunkHeader( $header ) {
    // The height field ends at byte 17.
    if ( !is_string( $header ) || strlen( $header ) < 18 ) {
        print_r( 'WebP decodeExtendedChunkHeader Header too short: ' . strlen( (string) $header ) . " bytes\n" );
        return [];
    }

    // Unsigned byte: the value is only ever used as a bit field.
    $flags = unpack( 'C', $header[8] )[1];

    // Pad each 24-bit value with a zero high byte so it can be read as a
    // 32-bit little-endian integer.
    $width = unpack( 'V', substr( $header, 12, 3 ) . "\x00" )[1];
    $height = unpack( 'V', substr( $header, 15, 3 ) . "\x00" )[1];

    return [
        'compression' => 'unknown',
        'animated' => ( $flags & VP8X_ANIM ) === VP8X_ANIM,
        'transparency' => ( $flags & VP8X_ALPHA ) === VP8X_ALPHA,
        'EXIF' => ( $flags & VP8X_EXIF ) === VP8X_EXIF,
        'ICC' => ( $flags & VP8X_ICC ) === VP8X_ICC,
        'XMP' => ( $flags & VP8X_XMP ) === VP8X_XMP,
        'width' => ( $width & 0xFFFFFF ) + 1,
        'height' => ( $height & 0xFFFFFF ) + 1
    ];
}
/**
 * Decode an 'EXIF' RIFF chunk: the chunk header, the TIFF header and the
 * entries of the first image file directory (IFD).
 *
 * The TIFF header is located by searching the start of the payload for the
 * byte-order mark followed by the magic number 42 ("II\x2A\x00" or
 * "MM\x00\x2A"), so payloads that carry a prefix before the TIFF header are
 * handled too. All multi-byte fields are then read using that byte order.
 *
 * Hex-string fields ('fixed42', 'offset', 'orientation', dataset 'tag' and
 * 'type') are written most-significant digit first regardless of the byte
 * order of the file, e.g. the orientation tag is always '0112'.
 *
 * @param string $img_metadata The whole chunk: fourCC, size and payload.
 * @return array Keys 'type', 'size', 'byte_order', 'fixed42', 'offset',
 *               'idf_count', 'orientation' and, when entries were read,
 *               'dataset' (list of 'tag', 'type', 'value', 'raw_value_data').
 *               Fields that could not be read keep their empty defaults.
 */
function decodeExifChunkHeader($img_metadata) {
    // Size in bytes of a single value of each TIFF field type:
    // 1 BYTE, 2 ASCII, 3 SHORT, 4 LONG, 5 RATIONAL, 6 SBYTE, 7 UNDEFINED,
    // 8 SSHORT, 9 SLONG, 10 SRATIONAL, 11 FLOAT, 12 DOUBLE.
    $typeSizes = [
        1 => 1, 2 => 1, 3 => 2, 4 => 4, 5 => 8, 6 => 1,
        7 => 1, 8 => 2, 9 => 4, 10 => 8, 11 => 4, 12 => 8,
    ];

    $metadata = [
        'type' => '',
        'size' => 0,
        'byte_order' => '',
        'fixed42' => '',
        'offset' => '',
        'idf_count' => 0,
        'orientation' => '',
    ];

    $img_metadata = (string) $img_metadata;
    if (strlen($img_metadata) < 8) {
        return $metadata;
    }

    // RIFF chunk header: fourCC followed by a little-endian 32-bit size.
    $metadata = array_merge($metadata, unpack('A4type/Vsize', substr($img_metadata, 0, 8)));

    // Locate the TIFF header near the start of the payload.
    $probe = (string) substr($img_metadata, 8, 24);
    $tiffStart = false;
    foreach (["II\x2A\x00", "MM\x00\x2A"] as $signature) {
        $found = strpos($probe, $signature);
        if ($found !== false && ($tiffStart === false || $found < $tiffStart)) {
            $tiffStart = $found;
        }
    }
    if ($tiffStart === false) {
        return $metadata;
    }
    $tiffStart += 8; // make the position relative to $img_metadata
    if (strlen($img_metadata) < $tiffStart + 8) {
        return $metadata;
    }

    // "II" = little endian, "MM" = big endian.
    $littleEndian = $img_metadata[$tiffStart] === 'I';
    $u16 = $littleEndian ? 'v' : 'n';
    $u32 = $littleEndian ? 'V' : 'N';

    $magic = unpack($u16, substr($img_metadata, $tiffStart + 2, 2))[1];
    $ifdOffset = unpack($u32, substr($img_metadata, $tiffStart + 4, 4))[1];

    $metadata['byte_order'] = substr($img_metadata, $tiffStart, 2);
    $metadata['fixed42'] = sprintf('%04x', $magic);
    $metadata['offset'] = sprintf('%08x', $ifdOffset);

    // The IFD starts with a 16-bit entry count; offsets are relative to the
    // start of the TIFF header, not to the start of the chunk.
    $ifdStart = $tiffStart + $ifdOffset;
    $countBytes = (string) substr($img_metadata, $ifdStart, 2);
    if (strlen($countBytes) < 2) {
        return $metadata;
    }
    $metadata['idf_count'] = unpack($u16, $countBytes)[1];

    for ($i = 0; $i < $metadata['idf_count']; $i++) {
        // Every IFD entry is 12 bytes: tag(2) type(2) count(4) value/offset(4).
        $entry = (string) substr($img_metadata, $ifdStart + 2 + (12 * $i), 12);
        if (strlen($entry) < 12) {
            break; // directory is truncated
        }

        $fields = unpack($u16 . 'tag/' . $u16 . 'type/' . $u32 . 'count', substr($entry, 0, 8));
        $valueField = substr($entry, 8, 4);
        $valueOffset = unpack($u32, $valueField)[1];

        // 'count' is the number of values, not the number of bytes.
        $unitSize = isset($typeSizes[$fields['type']]) ? $typeSizes[$fields['type']] : 1;
        $byteLength = $unitSize * $fields['count'];

        if ($byteLength <= 4) {
            // Small values are stored inline, left-aligned in the value field.
            $value = (string) substr($valueField, 0, $byteLength);
        } else {
            $value = (string) substr($img_metadata, $tiffStart + $valueOffset, $byteLength);
        }

        if ($fields['tag'] === 0x0112) {
            // Orientation is a single SHORT stored inline.
            $metadata['orientation'] = sprintf('%04x', unpack($u16, substr($valueField, 0, 2))[1]);
        }

        $metadata['dataset'][$i] = array(
            'tag' => sprintf('%04x', $fields['tag']),
            'type' => sprintf('%04x', $fields['type']),
            'value' => $value,
            'raw_value_data' => array(
                'hex' => bin2hex($entry),
                'offset' => $valueOffset,
                'count' => $fields['count'],
            ),
        );
    }

    return $metadata;
}
/**
 * Decode an 'ICCP' RIFF chunk holding an ICC colour profile.
 *
 * References:
 *   https://www.color.org/icc_specs2.xalter
 *   https://www.color.org/specification/ICCSpecRevision_25-02-10_dictType.pdf
 *   https://www.color.org/icc32.pdf (definitions near page 80)
 *
 * Layout used here (offsets relative to the start of the profile, i.e. after
 * the 8-byte RIFF chunk header):
 *   0-127    profile header ('acsp' signature at 36, colour space at 16,
 *            connection space at 20)
 *   128-131  big-endian tag count
 *   132-...  tag table, 12 bytes per tag: signature, offset, length
 *
 * @param string $img_metadata The whole chunk: fourCC, size and profile.
 * @return array 'parsed' (chunk header, raw profile header, raw profile body),
 *               one "body-N" entry per tag actually present in the tag table,
 *               'icc' status text and, for valid profiles, 'icc-input' and
 *               'icc-output'.
 */
function decodeIccpChunkHeader($img_metadata) {
    $img_metadata = (string) $img_metadata;
    $metadata = array();

    // RIFF chunk header: fourCC followed by a little-endian 32-bit size.
    $chunk_header = (string) substr( $img_metadata, 0, 8 );
    $metadata['parsed']['header'] = strlen( $chunk_header ) === 8
        ? unpack( 'A4type/Vsize', $chunk_header )
        : array();

    $profile = (string) substr( $img_metadata, 8 );
    $raw_header = (string) substr( $profile, 0, 128 );
    $metadata['parsed']['raw-header'] = $raw_header;
    // Everything after the 128-byte profile header (tag count, tag table, tag data).
    $metadata['parsed']['raw-body'] = (string) substr( $profile, 128 );

    // Read as many tags as the profile declares instead of a fixed number.
    $tag_count = 0;
    if ( strlen( $profile ) >= 132 ) {
        $tag_count = unpack( 'N', substr( $profile, 128, 4 ) )[1];
    }

    for ( $i = 1; $i <= $tag_count; $i++ ) {
        $entry = (string) substr( $profile, 132 + 12 * ( $i - 1 ), 12 );
        if ( strlen( $entry ) < 12 ) {
            break; // tag table is truncated
        }
        $tag = unpack( 'Z4tag/Noffset/Nlength', $entry );

        // Tag offsets are relative to the start of the profile.
        $data = (string) substr( $profile, $tag['offset'], $tag['length'] );

        $metadata["body-$i"] = array(
            "tag" => $tag['tag'],
            "type" => (string) substr( $data, 0, 4 ),
            "data" => (string) substr( $data, 4 )
        );
    }

    $metadata['icc'] = "ICC profile present";

    $signature = (string) substr( $raw_header, 36, 4 );
    if ( $signature !== 'acsp' ) {
        // Not an ICC profile: report the bytes found where 'acsp' was expected.
        $metadata['icc'] = "ICC profile INVALID (no acsp flag) " . $signature;
    } else {
        $input = substr( $raw_header, 16, 4 );
        $output = substr( $raw_header, 20, 4 );
        $metadata['icc-input'] = 'ICC profile Input: ' . $input;
        $metadata['icc-output'] = 'ICC profile Output: ' . $output;
        // Ignore colour profiles for conversion to other colour spaces, e.g. CMYK/Lab.
        if ( $input !== 'RGB ' || $output !== 'XYZ ' ) {
            $metadata['icc'] = 'ICC profile ignored';
        }
    }

    return $metadata;
}
/**
 * Decode an 'XMP ' RIFF chunk.
 *
 * References:
 *   https://en.wikipedia.org/wiki/Extensible_Metadata_Platform
 *   https://web.archive.org/web/20180919181934/http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
 *   https://github.com/jeroendesloovere/xmp-metadata-extractor/blob/master/src/XmpMetadataExtractor.php
 *
 * @param string $img_metadata The whole chunk: fourCC, size and XMP packet.
 * @return array 'XMP-parsed' (chunk type and little-endian 32-bit size) and
 *               'XMP-raw' (the packet, HTML-escaped for display).
 */
function decodeXmpChunkHeader($img_metadata) {
    $metadata = array();
    $metadata['XMP-parsed'] = unpack( 'A4type/Vsize/', substr( $img_metadata, 0, 8 ) );

    // The packet is treated as UTF-8 already, so it is escaped as-is.
    // Re-encoding it with utf8_encode() (ISO-8859-1 -> UTF-8) mangled every
    // non-ASCII character. ENT_SUBSTITUTE keeps htmlspecialchars() from
    // returning an empty string when the packet contains an invalid sequence.
    $metadata["XMP-raw"] = htmlspecialchars(
        (string) substr( $img_metadata, 8 ),
        ENT_QUOTES | ENT_SUBSTITUTE,
        'UTF-8'
    );

    return $metadata;
}
/**
 * Walk a RIFF stream from the current position and list the chunks found.
 *
 * Only the 8-byte chunk headers are read; payloads are skipped with fseek().
 * Scanning stops at end of file, at the first incomplete chunk header, or
 * once $maxChunks chunks have been collected.
 *
 * @param resource   $file      Open, seekable stream positioned at the first chunk.
 * @param int|string $fileSize  RIFF size field, either already decoded (int) or
 *                              as the raw 4-byte little-endian string.
 * @param string     $fourCC    Form type of the container (stored as given).
 * @param int        $maxChunks Maximum number of chunks to return; a negative
 *                              value means no limit.
 * @return array 'fileSize', 'fourCC' and 'chunks' (list of 'fourCC', 'start', 'size').
 */
function find_chunks( $file, $fileSize, $fourCC, $maxChunks = -1 ) {
    // Decode only a raw 4-byte field. An integer size must be kept as is:
    // unpacking it would interpret its decimal digits as bytes.
    if ( is_string( $fileSize ) && strlen( $fileSize ) === 4 ) {
        $decodedSize = unpack( 'V', $fileSize )[1];
    } else {
        $decodedSize = (int) $fileSize;
    }

    $info = [
        'fileSize' => $decodedSize,
        'fourCC' => $fourCC,
        'chunks' => [],
    ];

    // The limit is checked against the number of chunks collected so far.
    while ( ! feof( $file ) && ( $maxChunks < 0 || count( $info['chunks'] ) < $maxChunks ) ) {
        $chunkStart = ftell( $file );

        $chunkFourCC = fread( $file, 4 );
        if ( $chunkFourCC === false || strlen( $chunkFourCC ) !== 4 ) {
            return $info;
        }

        $chunkSize = fread( $file, 4 );
        if ( $chunkSize === false || strlen( $chunkSize ) !== 4 ) {
            return $info;
        }
        // Chunk sizes are little-endian unsigned 32-bit integers.
        $intChunkSize = unpack( 'V', $chunkSize )[1];

        $info['chunks'][] = [
            'fourCC' => $chunkFourCC,
            'start' => $chunkStart,
            'size' => $intChunkSize
        ];

        // Odd-sized chunks are followed by one padding byte.
        $padding = $intChunkSize % 2;
        fseek( $file, $intChunkSize + $padding, SEEK_CUR );
    }

    return $info;
}
/**
 * Read $length bytes from a stream and return them as a string.
 *
 * @param resource $handle Open stream to read from.
 * @param int      $length Number of bytes to read.
 * @return string The bytes read, unpacked with the 'a*' format.
 */
function readXChar($handle, $length){
    $format = 'a*';
    $text = readUnpack($handle, $format, $length);
    return $text;
}
/**
 * Read one byte from a stream and return a single hex digit of it.
 *
 * The 'h' format without a repeater yields one nibble, low nibble first, so
 * only the low-order hex digit of the byte is returned.
 *
 * @param resource $handle Open stream to read from.
 * @return string One lowercase hex digit.
 */
function readHex($handle){
    $format = 'h';
    $byteCount = 1;
    return readUnpack($handle, $format, $byteCount);
}
/**
 * Read a little-endian unsigned 32-bit integer from a stream.
 *
 * @param resource $handle Open stream to read from.
 * @return int The decoded value.
 */
function read32($handle){
    $format = 'V';
    $byteCount = 4;
    return readUnpack($handle, $format, $byteCount);
}
/**
 * Read a little-endian unsigned 24-bit integer from a stream.
 *
 * The three bytes read are padded with a zero high byte BEFORE unpacking, so
 * they can be decoded with the 32-bit 'V' format. (Appending the pad byte to
 * the unpacked result, as was done previously, left unpack() with only three
 * bytes and it always failed.)
 *
 * @param resource $handle Open stream to read from.
 * @return int|null The decoded value, or null when fewer than 3 bytes were available.
 */
function read24($handle){
    $bytes = fread($handle, 3);
    if ($bytes === false || strlen($bytes) !== 3) {
        return null;
    }
    return unpack('V', $bytes . "\x00")[1];
}
/**
 * Read a little-endian unsigned 16-bit integer from a stream.
 *
 * Uses the 2-byte 'v' format; the 4-byte 'V' format used previously could
 * never be satisfied by the two bytes that are read.
 *
 * @param resource $handle Open stream to read from.
 * @return int The decoded value.
 */
function read16($handle){
    return readUnpack($handle, 'v', 2);
}
/**
 * Decode a little-endian unsigned 32-bit integer from a binary string.
 *
 * Despite its name, $handle is the raw byte string itself, not a stream:
 * it is passed straight to unpack().
 *
 * @param string $handle Binary string holding at least 4 bytes.
 * @return int The decoded value.
 */
function readUInt32( $handle ) {
    $decoded = unpack( 'V', $handle );
    return $decoded[1];
}
/**
 * Read a little-endian unsigned 32-bit integer from a stream.
 *
 * RIFF stores its sizes little-endian, so the explicit 'V' format is used
 * rather than the machine-dependent 'I' format.
 *
 * @param resource $handle Open stream to read from.
 * @return int The decoded value.
 */
function readInt($handle){
    $data = unpack('V', fread($handle, 4));
    return intval($data[1]);
}
/**
 * Read $length bytes from a stream, unpack them with the given format and
 * return the last unpacked element.
 *
 * @param resource $handle Open stream to read from.
 * @param string   $type   Format string for unpack().
 * @param int      $length Number of bytes to read.
 * @return mixed The last element of the array produced by unpack().
 */
function readUnpack($handle, $type, $length){
    $bytes = fread($handle, $length);
    $fields = unpack($type, $bytes);
    return array_pop($fields);
}
/**
 * Combine the first four bytes of a string into an integer, most
 * significant byte first.
 *
 * @param string $s Binary string; bytes 0-3 are read.
 * @return int The combined value.
 */
function _fourbytes2int($s) {
    $value = 0;
    // Byte 0 is shifted the furthest, byte 3 not at all.
    foreach ([24, 16, 8, 0] as $index => $shift) {
        $value += ord($s[$index]) << $shift;
    }
    return $value;
}
/**
 * Combine the first two bytes of a string into an integer, most significant
 * byte first (equivalent to _get_ushort).
 *
 * @param string $s Binary string; bytes 0-1 are read.
 * @return int The combined value.
 */
function _twobytes2int($s) {
    $high = ord(substr($s, 0, 1));
    $low = ord(substr($s, 1, 1));
    return ($high << 8) + $low;
}
/**
 * Extract the bits [$start, $end) of an integer.
 *
 * @param int $value Source integer.
 * @param int $start Index of the lowest bit to keep (0 = least significant).
 * @param int $end   Index one past the highest bit to keep.
 * @return int The selected bits, shifted down so that bit $start becomes bit 0.
 */
function readBits($value, $start, $end) {
    $width = $end - $start;
    $mask = ( 1 << $width ) - 1;
    $shifted = $value >> $start;
    return $shifted & $mask;
}
// Util function to humanize filesize
/**
 * Format a byte count as a short human-readable string using 1024-based
 * units ('', K, M, G, T).
 *
 * Sizes of zero or less are reported as '0' (their logarithm is undefined),
 * and sizes beyond the largest suffix are expressed in that largest unit
 * instead of indexing past the end of the suffix table.
 *
 * @param int|float $size      Number of bytes.
 * @param int       $precision Number of decimals to round to.
 * @return string The rounded value followed by its unit suffix.
 */
function formatBytes($size, $precision = 2) {
    $suffixes = array('', 'K', 'M', 'G', 'T');

    if ($size <= 0) {
        return '0';
    }

    // Which power of 1024 the size falls into, clamped to the known suffixes.
    $power = (int) floor(log($size, 1024));
    $power = max(0, min($power, count($suffixes) - 1));

    return round($size / pow(1024, $power), $precision) . $suffixes[$power];
}
/**
 * Read a file as a RIFF/WebP container and collect its metadata.
 *
 * The first four bytes are checked for the 'RIFF' signature. Files with a
 * different signature are not parsed; a message naming the detected format
 * (JPEG/JFIF, GIF, PNG or unknown) is returned instead. For RIFF files every
 * chunk is listed and the known ones (VP8, VP8L, VP8X, EXIF, ICCP, XMP) are
 * decoded by their dedicated functions.
 *
 * The file handle is closed on every return path.
 *
 * @param string $filename Path of the file to inspect.
 * @return array|string Metadata array for RIFF files, otherwise a message string.
 */
function getImageRiff($filename) {
    $fh = fopen($filename, 'rb');
    if ($fh === false) {
        // Same convention as the other failure paths: return a message string.
        return htmlspecialchars($filename) . " cannot be opened for reading";
    }

    try {
        $meta = array();
        $meta['RIFF'] = readXChar($fh, 4); // read 32 bits - 4 char | riff header

        if ($meta['RIFF'] !== 'RIFF') {
            // Not a RIFF container: look at the next bytes to name the real format.
            $meta['header'] = readXChar($fh, 32);
            $meta['jfif'] = substr( $meta['header'], 2, 4 );

            if ($meta['jfif'] === 'JFIF') {
                return htmlentities($meta['jfif']) . " isn't a webp riff format... check the header probably it's a jpg since it contains the jfif signature (TLDR HH - https://dev.exiv2.org/projects/exiv2/wiki/The_Metadata_in_JPEG_files) -> " . $meta['RIFF'].$meta['header'];
            }

            if ($meta['RIFF'] === 'GIF8' ) {
                return $meta['RIFF'] . " isn't a webp riff format... check the header probably it's a gif since it contains the GIF file signature -> " . $meta['RIFF'].$meta['header'];
            }

            if (substr( $meta['header'], 8, 4 ) === 'IHDR') {
                $meta['jfif'] = substr( $meta['header'], 8, 4 );
                return $meta['jfif'] . " isn't a webp riff format... check the header probably it's a png since it contains the png file signature (TLDR IHDR - https://www.w3.org/TR/PNG-Structure.html) The first eight bytes of a PNG file always contain the following (decimal) values:".
                    "<br/>(137 80 78 71 13 10 26 10)".
                    "<br/>This signature indicates that the remainder of the file contains a single PNG image, consisting of a series of chunks beginning with an IHDR chunk and ending with an IEND chunk. -> " . $meta['RIFF'].$meta['header'];
            }

            return htmlspecialchars($meta['RIFF']).substr( $meta['header'], 0, 4 ) . " unknown signature -> " . $meta['header'];
        }

        $meta['Filesize'] = readInt($fh); // read 32 bits (uint32) | size field of the RIFF header
        $meta['Webp'] = readXChar($fh, 4); // read 32 bits - 4 char | webp extension

        $raw = find_chunks( $fh, $meta['Filesize'], $meta['Webp'], -1 );

        $meta['readeable-filesize'] = formatBytes($meta['Filesize']);
        $meta['chunks-flat'] = array_column($raw['chunks'], 'fourCC');

        foreach ( $raw['chunks'] as $k => $chunk ) {
            $chunkKey = $k . '_' . $chunk['fourCC'];

            if ( in_array( $chunk['fourCC'], array( 'VP8 ', 'VP8L', 'VP8X'), true ) ) {
                // Image data chunks: only the first bytes are needed to decode the header.
                $chunkHeader = file_get_contents( $filename, false, null, $chunk['start'], MINIMUM_CHUNK_HEADER_LENGTH );
                if ( $chunk['fourCC'] === 'VP8 ' ) {
                    $meta[ $chunkKey ] = decodeLossyChunkHeader( $chunkHeader );
                } else if ( $chunk['fourCC'] === 'VP8L' ) {
                    $meta[ $chunkKey ] = decodeLosslessChunkHeader( $chunkHeader );
                } else if ( $chunk['fourCC'] === 'VP8X' ) {
                    $meta[ $chunkKey ] = decodeExtendedChunkHeader( $chunkHeader );
                }
            } else if ( in_array( $chunk['fourCC'], array( 'ICCP', 'ANIM', 'XMP ', 'EXIF' ), true ) ) { //'ANIF'
                // Metadata chunks: read the whole chunk including its 8-byte header.
                $meta[$chunkKey] = array( $chunk['fourCC'] => 'start: '. $chunk['start'] . ' length:' . $chunk['size'] );
                $img_metadata = file_get_contents( $filename, false, null, $chunk['start'], $chunk['size'] + 8 );
                if ($chunk['fourCC'] === 'EXIF') {
                    $meta[ $chunkKey ] = decodeExifChunkHeader($img_metadata);
                } else if ($chunk['fourCC'] === 'ICCP') {
                    $meta[ $chunkKey ] = decodeIccpChunkHeader($img_metadata);
                } else if ( $chunk['fourCC'] === 'XMP ' ) {
                    $meta[ $chunkKey ] = decodeXmpChunkHeader($img_metadata);
                }
            } else if ( $chunk['fourCC'] === 'ANMF' ) {
                // Animation frames are only counted, not decoded.
                $meta['frames'][ $chunkKey ] = "frame" . $k;
            } else {
                $meta[ $chunkKey ]['missed'][] = '"'. $chunk['fourCC'] . '"';
            }
        }

        return $meta;
    } finally {
        fclose($fh);
    }
}
// Demo driver: dump the metadata of a fixed list of sample images that are
// expected to sit next to this script.
$upload_dir = __DIR__;
$images = array(
    "icc-lossy.webp",
    "icc-lossless.webp",
    "example.webp",
    "arrow.webp",
    "nyan.webp",
    "WebP-lossless.webp",
    "WebP-lossy.webp"
);

echo '<pre>';
foreach ($images as $id => $image) {
    $image_path = $upload_dir .'/'.$image;
    $image_url = "./" . $image;

    // A missing sample must not abort the whole listing.
    if ( !is_file( $image_path ) || !is_readable( $image_path ) ) {
        printf( '<div><p> %s (%s) </p> file not found or not readable </div>', $image, $image_url );
        continue;
    }

    $image_data = getImageRiff( $image_path );
    printf( '<div><img loading="lazy" id="%s" src="%s" /><p> %s (%s) </p> %s </div>', $id, $image_url, $image, $image_url, print_r($image_data, true) );
}
echo '</pre>';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment