Last active
September 20, 2023 15:24
-
-
Save rasteiner/943ff831bce1c886b3d8b32a050efbbd to your computer and use it in GitHub Desktop.
JCR bundle reader in PHP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Table of well-known names that a bundle stores as a single index byte.
 *
 * BundleReader::readName() resolves a name byte whose high bit is clear via
 * BundleNames::from() and returns the matching case name, so both the numeric
 * values and the case identifiers are part of the decoded output and must not
 * be changed or reordered.
 */
enum BundleNames: int {
    case NT_UNSTRUCTURED = 0;
    case NT_RESOURCE = 1;
    case NT_FILE = 2;
    case NT_FOLDER = 3;
    case NT_HIERARCHYNODE = 4;
    case MIX_REFERENCEABLE = 5;
    case JCR_CREATED = 6;
    case JCR_CREATEDBY = 7;
    case JCR_LASTMODIFIED = 8;
    case JCR_LASTMODIFIEDBY = 9;
    case JCR_CONTENT = 10;
    case JCR_MIMETYPE = 11;
    case JCR_DATA = 12;
    case JCR_TITLE = 13;
    case JCR_LANGUAGE = 14;
    case JCR_ENCODING = 15;
    case JCR_SYSTEM = 16;
    case REP_ROOT = 17;
    case REP_SYSTEM = 18;

    // ---- Access control (19-41) ----
    case JCR_ADD_CHILD_NODES = 19;
    case JCR_LIFECYCLE_MANAGEMENT = 20;
    case JCR_LOCK_MANAGEMENT = 21;
    case JCR_MODIFY_ACCESS_CONTROL = 22;
    case JCR_MODIFY_PROPERTIES = 23;
    case JCR_NODE_TYPE_MANAGEMENT = 24;
    case JCR_READ = 25;
    case JCR_READ_ACCESS_CONTROL = 26;
    case JCR_REMOVE_CHILD_NODES = 27;
    case JCR_REMOVE_NODE = 28;
    case JCR_VERSION_MANAGEMENT = 29;
    case REP_ACCESSCONTROL = 30;
    case REP_ACCESS_CONTROL = 31;
    case REP_ACCESS_CONTROLLABLE = 32;
    case REP_ACE = 33;
    case REP_ACL = 34;
    case REP_DENY_ACE = 35;
    case REP_GLOB = 36;
    case REP_GRANT_ACE = 37;
    case REP_POLICY = 38;
    case REP_PRINCIPAL_ACCESS_CONTROL = 39;
    case REP_PRINCIPAL_NAME = 40;
    case REP_PRIVILEGES = 41;

    // ---- Locking (42-44) ----
    case MIX_LOCKABLE = 42;
    case JCR_LOCKISDEEP = 43;
    case JCR_LOCKOWNER = 44;

    // ---- Versioning (45-64) ----
    case MIX_VERSIONABLE = 45;
    case NT_FROZENNODE = 46;
    case NT_VERSION = 47;
    case NT_VERSIONEDCHILD = 48;
    case NT_VERSIONHISTORY = 49;
    case NT_VERSIONLABELS = 50;
    case JCR_VERSIONSTORAGE = 51;
    case JCR_FROZENPRIMARYTYPE = 52;
    case JCR_FROZENUUID = 53;
    case JCR_FROZENNODE = 54;
    case JCR_PREDECESSORS = 55;
    case JCR_SUCCESSORS = 56;
    case JCR_VERSIONLABELS = 57;
    case JCR_VERSIONHISTORY = 58;
    case JCR_VERSIONABLEUUID = 59;
    case JCR_ROOTVERSION = 60;
    case JCR_ISCHECKEDOUT = 61;
    case JCR_BASEVERSION = 62;
    case JCR_MERGEFAILED = 63;
    case REP_NODETYPES = 64;

    // ---- Node types (65-94) ----
    case NT_NODETYPE = 65;
    case NT_PROPERTYDEFINITION = 66;
    case NT_CHILDNODEDEFINITION = 67;
    case NT_BASE = 68;
    case JCR_NODETYPES = 69;
    case JCR_PROTECTED = 70;
    case JCR_ONPARENTVERSION = 71;
    case JCR_MANDATORY = 72;
    case JCR_AUTOCREATED = 73;
    case JCR_FROZENMIXINTYPES = 74;
    case JCR_NAME = 75;
    case JCR_VALUECONSTRAINTS = 76;
    case JCR_REQUIREDTYPE = 77;
    case JCR_PROPERTYDEFINITION = 78;
    case JCR_MULTIPLE = 79;
    case JCR_DEFAULTVALUES = 80;
    case JCR_SUPERTYPES = 81;
    case JCR_NODETYPENAME = 82;
    case JCR_ISMIXIN = 83;
    case JCR_HASORDERABLECHILDNODES = 84;
    case JCR_SAMENAMESIBLINGS = 85;
    case JCR_REQUIREDPRIMARYTYPES = 86;
    case JCR_CHILDNODEDEFINITION = 87;
    case JCR_DEFAULTPRIMARYTYPE = 88;
    case JCR_PRIMARYITEMNAME = 89;
    case JCR_CHILDVERSIONHISTORY = 90;
    case REP_VERSIONS = 91;
    case REP_VERSIONSTORAGE = 92;
    case REP_VERSION_REFERENCE = 93;
    case REP_BASEVERSIONS = 94;

    // ---- Miscellaneous node types (95-108) ----
    case MIX_CREATED = 95;
    case MIX_ETAG = 96;
    case MIX_LANGUAGE = 97;
    case MIX_LASTMODIFIED = 98;
    case MIX_LIFECYCLE = 99;
    case MIX_MIMETYPE = 100;
    case MIX_SHAREABLE = 101;
    case MIX_SIMPLE_VERSIONABLE = 102;
    case MIX_TITLE = 103;
    case NT_ACTIVITY = 104;
    case NT_ADDRESS = 105;
    case NT_CONFIGURATION = 106;
    case NT_QUERY = 107;
    case NT_SHARE = 108;

    // ---- Miscellaneous names (109-125) ----
    case REP_ACTIVITIES = 109;
    case JCR_ACTIVITIES = 110;
    case JCR_ACTIVITY = 111;
    case JCR_ACTIVITY_TITLE = 112;
    case JCR_XMLCHARACTERS = 113;
    case JCR_XMLTEXT = 114;
    case REP_CONFIGURATIONS = 115;
    case JCR_CONFIGURATION = 116;
    case JCR_CONFIGURATIONS = 117;
    case JCR_COPIEDFROM = 118;
    case JCR_CURRENT_LIFECYCLE_STATE = 119;
    case JCR_ETAG = 120;
    case JCR_HOST = 121;
    case JCR_ID = 122;
    case JCR_LIFECYCLE_POLICY = 123;
    case JCR_PATH = 124;
    case JCR_STATEMENT = 125;
}
// Sentinel values that BundleReader::readBinary() compares against the 32-bit
// size field of a binary value: when the size equals one of them, a string id
// follows instead of inline bytes and is reported as 'blob_store' (-1) or
// 'data_store' (-2).
const BINARY_IN_BLOB_STORE = -1,
      BINARY_IN_DATA_STORE = -2;
/**
 * Unsigned (logical) right shift for native integers: vacated high bits are
 * filled with zeros instead of copies of the sign bit, unlike PHP's `>>`.
 *
 * The previous string-based implementation went through bindec(), which
 * returns a float once bit 63 is set; a negative value shifted by 0 therefore
 * failed the `int` return type. This version uses integer operations only.
 *
 * @param int $base10 Left hand side of the operation (treated as an unsigned bit pattern)
 * @param int $shift  Right hand side: the number of bits to shift. Values <= 0
 *                    return $base10 unchanged; values >= the integer width return 0.
 * @return int The shifted bit pattern
 */
function urs(int $base10, int $shift): int {
    if ($shift <= 0) {
        return $base10;
    }
    if ($shift >= PHP_INT_SIZE * 8) {
        return 0;
    }
    // Arithmetic shift first, then mask away the sign bits it dragged in:
    // PHP_INT_MAX >> ($shift - 1) has exactly (width - $shift) low bits set.
    return ($base10 >> $shift) & (PHP_INT_MAX >> ($shift - 1));
}
/**
 * Immutable pair identifying one property: the id of the bundle it was read
 * from and the property's name.
 */
class PropertyId {
    public readonly string $bundleId;
    public readonly string $name;

    public function __construct(string $bundleId, string $name) {
        $this->bundleId = $bundleId;
        $this->name = $name;
    }
}
/**
 * Numeric property type codes.
 *
 * BundleReader::readPropertyEntry() obtains the code from the low four bits
 * of a property entry's header byte and maps it through PropertyType::from().
 */
enum PropertyType: int {
    case UNDEFINED     = 0;
    case STRING        = 1;
    case BINARY        = 2;
    case LONG          = 3;
    case DOUBLE        = 4;
    case DATE          = 5;
    case BOOLEAN       = 6;
    case NAME          = 7;
    case PATH          = 8;
    case REFERENCE     = 9;
    case WEAKREFERENCE = 10;
    case URI           = 11;
    case DECIMAL       = 12;
}
/**
 * Returns the timezone stored at the given position of the fixed 32-entry
 * table of whole-hour GMT offsets (index 0 is GMT, 1-15 are GMT+01..+15,
 * 16-31 are GMT-16..-01).
 *
 * @param int $index Table position, 0-31
 * @return DateTimeZone
 * @throws OutOfRangeException When $index is not a valid table position
 */
function getCommonTimezone(int $index): DateTimeZone {
    static $commonTimezones = [
        'GMT',
        'GMT+01:00',
        'GMT+02:00',
        'GMT+03:00',
        'GMT+04:00',
        'GMT+05:00',
        'GMT+06:00',
        'GMT+07:00',
        'GMT+08:00',
        'GMT+09:00',
        'GMT+10:00',
        'GMT+11:00',
        'GMT+12:00',
        'GMT+13:00',
        'GMT+14:00',
        'GMT+15:00',
        'GMT-16:00',
        'GMT-15:00',
        'GMT-14:00',
        'GMT-13:00',
        'GMT-12:00',
        'GMT-11:00',
        'GMT-10:00',
        'GMT-09:00',
        'GMT-08:00',
        'GMT-07:00',
        'GMT-06:00',
        'GMT-05:00',
        'GMT-04:00',
        'GMT-03:00',
        'GMT-02:00',
        'GMT-01:00',
    ];

    // Fail with a clear message instead of an undefined-key warning followed
    // by DateTimeZone being handed null.
    if (!isset($commonTimezones[$index])) {
        throw new OutOfRangeException("No common timezone at index {$index}");
    }

    return new DateTimeZone($commonTimezones[$index]);
}
/**
 * Inserts dashes into a hex string using the 8-4-4-4-12 UUID grouping.
 * The input is not validated; characters beyond position 32 are dropped.
 */
function hexToUUID(string $hex) {
    $groups = [];
    $position = 0;
    foreach ([8, 4, 4, 4, 12] as $width) {
        $groups[] = substr($hex, $position, $width);
        $position += $width;
    }
    return implode('-', $groups);
}
/**
 * Decodes one serialized node bundle (format version 3 or later) into a plain
 * PHP array available through getParsed().
 *
 * All multi-byte primitives are read big-endian, matching readInt().
 * NOTE(review): the layout appears to follow Apache Jackrabbit's bundle
 * serialization — confirm against the upstream reader before extending it.
 */
class BundleReader {
    /** Read cursor into $data, in bytes. */
    private int $offset = 0;
    /** Format version taken from the first byte of the bundle. */
    private int $version = 0;
    /** Decoded bundle, or an empty array when decoding the body failed. */
    private array $parsed = [];
    /**
     * The default namespace and the first six other namespaces used in this
     * bundle. Used by the readName() method to keep track of
     * already seen namespaces.
     *
     * @var array<int, string|null>
     */
    private array $namespaces = [
        // NOTE: The length of this array must be seven
        '', null, null, null, null, null, null
    ];

    /**
     * Parses $data immediately.
     *
     * Failures while reading the header (version, node type, parent id, mod
     * count) propagate to the caller. Failures while reading the body are
     * written to STDERR and leave getParsed() returning an empty array.
     *
     * @param string $bundleId Bundle id; a 32-character hex string is converted to dashed UUID form
     * @param string $data     Raw bundle bytes
     * @throws Exception On a version below 3 or a truncated/invalid header
     */
    public function __construct(private string $bundleId, private string $data) {
        if (strlen($bundleId) === 32) {
            $this->bundleId = hexToUUID($bundleId);
        }

        $this->version = $this->readUint8();
        if ($this->version < 3) {
            throw new Exception('Invalid bundle version');
        }

        $bundle = [
            'version' => $this->version,
            'nodeTypeName' => $this->readName(),
            'parentId' => $this->readNodeId(),
            'modCount' => $this->readVarInt(),
        ];

        try {
            // One flag byte packs the referenceable bit plus small counts;
            // a count field holding its maximum means a var-int follows.
            $flags = $this->readUint8();
            $bundle['referenceable'] = ($flags & 1) !== 0;

            // mixin types (bit 7)
            $mixinCount = $this->readVarInt(($flags >> 7) & 1, 1);
            $bundle['mixinTypeNames'] = [];
            for ($i = 0; $i < $mixinCount; $i++) {
                $bundle['mixinTypeNames'][] = $this->readName();
            }

            // properties (bits 4-6)
            $bundle['properties'] = [];
            $propertyCount = $this->readVarInt(($flags >> 4) & 7, 7);
            for ($i = 0; $i < $propertyCount; $i++) {
                $id = new PropertyId($this->bundleId, $this->readName());
                $bundle['properties'][] = $this->readPropertyEntry($id);
            }

            // child nodes (list of name/uuid pairs, bits 2-3)
            $bundle['childNodeEntries'] = [];
            $childCount = $this->readVarInt(($flags >> 2) & 3, 3);
            for ($i = 0; $i < $childCount; $i++) {
                $name = $this->readName();
                $id = $this->readNodeId();
                $bundle['childNodeEntries'][] = [
                    'name' => $name,
                    'id' => $id,
                ];
            }

            // shared set (bit 1)
            $sharedCount = $this->readVarInt(($flags >> 1) & 1, 1);
            $bundle['sharedSet'] = [];
            for ($i = 0; $i < $sharedCount; $i++) {
                $bundle['sharedSet'][] = $this->readNodeId();
            }

            $this->parsed = $bundle;
        } catch (Exception $e) {
            $msg = $e->getMessage() . "\nFailed before offset: {$this->offset}\nData:\n" . bin2hex($this->data) . "\n\n";
            fwrite(STDERR, $msg);
            $this->parsed = [];
        }
    }

    /** Returns the bundle id (dashed UUID form when a 32-char hex id was given). */
    public function getBundleId(): string {
        return $this->bundleId;
    }

    /** Returns the decoded bundle, or an empty array if the body could not be decoded. */
    public function getParsed(): array {
        return $this->parsed;
    }

    /**
     * Reads one property entry: header byte (type in the low nibble, value
     * count encoding in the high nibble), mod count, then the values.
     *
     * @return array{id: PropertyId, type: PropertyType, multivalued: bool, modCount: int, values: array}
     * @throws Exception On an unknown or unsupported property type, or truncated data
     */
    protected function readPropertyEntry(PropertyId $id) {
        $entry = [
            'id' => $id,
        ];

        $header = $this->readUint8();
        $typeCode = $header & 0x0f;
        // tryFrom + Exception: enum from() raises ValueError (an Error), which
        // the constructor's catch (Exception) would not intercept.
        $type = PropertyType::tryFrom($typeCode)
            ?? throw new UnexpectedValueException("Unknown property type code: {$typeCode}");
        $entry['type'] = $type;

        // High nibble: 0 = single value; 1..14 = (count + 1); 15 = var-int follows.
        $count = 1;
        $len = $header >> 4;
        if ($len !== 0) {
            $entry['multivalued'] = true;
            $count = $len === 0x0f
                ? $this->readVarInt() + 0x0f - 1
                : $len - 1;
        } else {
            $entry['multivalued'] = false;
        }

        $entry['modCount'] = $this->readVarInt();

        $values = [];
        for ($i = 0; $i < $count; $i++) {
            $values[] = match ($type) {
                // URI values use the same string encoding as STRING and PATH.
                PropertyType::STRING,
                PropertyType::PATH,
                PropertyType::URI => $this->readString(),
                PropertyType::NAME => $this->readName(),
                PropertyType::REFERENCE,
                PropertyType::WEAKREFERENCE => $this->readNodeId(),
                PropertyType::DATE => $this->readDate(),
                PropertyType::LONG => $this->readVarLong(),
                PropertyType::BOOLEAN => $this->readUint8() !== 0,
                PropertyType::DOUBLE => $this->readDouble(),
                PropertyType::BINARY => $this->readBinary(),
                default => throw new Exception('Unsupported property type: ' . $type->name),
            };
        }
        $entry['values'] = $values;

        return $entry;
    }

    /**
     * Reads a big-endian 32-bit signed integer, independent of host byte order.
     */
    protected function readInt(): int {
        $value = unpack('N', $this->getBytes(4))[1];
        // 'N' is unsigned; fold the upper half back into the negative range.
        // (On 32-bit builds unpack already yields the wrapped signed value.)
        if (PHP_INT_SIZE > 4 && $value >= 0x80000000) {
            $value -= 0x100000000;
        }
        return $value;
    }

    /**
     * Reads a binary value: either a reference into an external store
     * (returned as ['type' => ..., 'id' => ...]) or the inline bytes.
     *
     * @return array{type: string, id: string}|string
     */
    protected function readBinary(): array|string {
        $size = $this->readInt();

        if ($size === BINARY_IN_DATA_STORE) {
            return [
                'type' => 'data_store',
                'id' => $this->readString(),
            ];
        }
        if ($size === BINARY_IN_BLOB_STORE) {
            return [
                'type' => 'blob_store',
                'id' => $this->readString(),
            ];
        }

        // Any other negative size is rejected by getBytes().
        return $this->getBytes($size);
    }

    /**
     * Reads a big-endian 64-bit integer. Values with bit 63 set come back
     * negative because PHP integers are signed.
     *
     * @throws Exception On PHP builds without 64-bit integers
     */
    protected function readLong(): int {
        if (PHP_INT_SIZE < 8) {
            throw new Exception('64-bit integers are not supported');
        }
        return unpack('J', $this->getBytes(8))[1];
    }

    /** Reads a big-endian IEEE 754 double. */
    protected function readDouble(): float {
        return unpack('E', $this->getBytes(8))[1];
    }

    /**
     * Reads a variable-length signed 64-bit integer: 7 payload bits per byte
     * (least significant group first, high bit = "more bytes follow"), with
     * the sign carried in the lowest bit of the assembled value.
     */
    protected function readVarLong(): int {
        $value = 0;
        $bits = 0;
        do {
            $b = $this->readUint8();
            if ($bits < 57) {
                // New bits enter at the top; earlier groups slide down.
                $value = ($b & 0x7f) << 57 | urs($value, 7);
                $bits += 7;
            } else {
                $value = ($b & 0x01) << 63 | urs($value, 1);
                $bits = 64;
            }
        } while (($b & 0x80) !== 0);

        // Right-align the collected bits. Skipped when all 64 bits are used:
        // a zero-bit shift is a no-op and must not depend on how urs() treats it.
        $unused = 64 - $bits;
        if ($unused > 0) {
            $value = urs($value, $unused);
        }

        if (($value & 1) !== 0) {
            return ~urs($value, 1);
        }
        return urs($value, 1);
    }

    /**
     * Reads a packed date. From the least significant bits upwards the
     * var-long holds: timezone (1, 7 or 13 bits), a 2-bit time precision
     * selector, the time of day at that precision, a 9-bit day of year and
     * the year as an offset from 2010.
     */
    protected function readDate(): DateTime {
        $ts = $this->readVarLong();

        if (($ts & 1) === 0) {
            $tz = getCommonTimezone(0);
            $ts >>= 1;
        } elseif (($ts & 2) === 0) {
            $tz = getCommonTimezone(($ts >> 2) & 0x1f); // 5 bits
            $ts >>= 7;
        } else {
            // 11-bit signed offset in minutes; sign-extend explicitly because
            // PHP integers are wider than the 32 bits a shift trick would need.
            $offset = ($ts >> 2) & 0x07ff;
            if (($offset & 0x0400) !== 0) {
                $offset -= 0x0800;
            }
            $tz = new DateTimeZone(sprintf(
                'GMT%s%02d:%02d',
                $offset < 0 ? '-' : '+',
                intdiv(abs($offset), 60),
                abs($offset) % 60
            ));
            $ts >>= 13;
        }

        $hours = 0;
        $minutes = 0;
        $seconds = 0;
        $millis = 0;

        $precision = $ts & 3;
        $ts >>= 2;
        switch ($precision) {
            case 3: // milliseconds since midnight, 30 bits
                $millis = $ts & 0x3fffffff;
                $seconds = intdiv($millis, 1000);
                $millis %= 1000;
                $minutes = intdiv($seconds, 60);
                $seconds %= 60;
                $hours = intdiv($minutes, 60);
                $minutes %= 60;
                $ts >>= 30;
                break;
            case 2: // minutes since midnight, 11 bits
                $minutes = $ts & 0x07ff;
                $hours = intdiv($minutes, 60);
                $minutes %= 60;
                $ts >>= 11;
                break;
            case 1: // hour of day, 5 bits
                $hours = $ts & 0x1f;
                $ts >>= 5;
                break;
        }

        $dayOfYear = $ts & 0x01ff; // 9 bits
        $ts >>= 9;
        $year = $ts + 2010;

        // Set the fields numerically instead of parsing a formatted string:
        // an unpadded millisecond count such as ".5" would be read as 500 ms.
        // Day N of January rolls over to the N-th day of the year.
        $date = new DateTime('2000-01-01 00:00:00', $tz);
        $date->setDate($year, 1, $dayOfYear);
        $date->setTime($hours, $minutes, $seconds, $millis * 1000);

        return $date;
    }

    /**
     * Returns the next $length bytes and advances the cursor.
     *
     * @throws OutOfBoundsException When $length is negative or exceeds the remaining data
     */
    protected function getBytes(int $length): string {
        $available = strlen($this->data) - $this->offset;
        if ($length < 0 || $length > $available) {
            throw new OutOfBoundsException(
                "Cannot read {$length} byte(s) at offset {$this->offset}: {$available} byte(s) left"
            );
        }
        $result = substr($this->data, $this->offset, $length);
        $this->offset += $length;
        return $result;
    }

    /** Reads one unsigned byte (0-255). */
    protected function readUint8(): int {
        return ord($this->getBytes(1));
    }

    /** Reads a 16-byte node id and returns it in dashed UUID form. */
    protected function readNodeId(): string {
        return hexToUUID(bin2hex($this->getBytes(16)));
    }

    /**
     * Reads a name. A first byte with the high bit clear is an index into
     * BundleNames; otherwise bits 4-6 select a namespace slot (its URI is
     * read from the stream the first time a slot is seen) and the low nibble
     * encodes the local name length.
     *
     * @return string The BundleNames case name, or "<namespace uri>::<local name>"
     * @throws UnexpectedValueException When the index is not a known BundleNames value
     */
    protected function readName(): string {
        $b = $this->readUint8();
        if (($b & 0x80) === 0) {
            $known = BundleNames::tryFrom($b)
                ?? throw new UnexpectedValueException("Unknown well-known name index: {$b}");
            return $known->name;
        }

        $ns = ($b >> 4) & 0x07;
        // Slot 7 lies outside the table: its URI is always read and never cached.
        $uri = $this->namespaces[$ns] ?? null;
        if ($uri === null) {
            $uri = $this->readString();
            if ($ns < count($this->namespaces)) {
                $this->namespaces[$ns] = $uri;
            }
        }

        $local = $this->readBytes(($b & 0x0f) + 1, 0x10);

        return "{$uri}::{$local}";
    }

    /** Reads a var-int length followed by that many bytes. */
    protected function readString(): string {
        return $this->readBytes(0, 0);
    }

    /**
     * Reads a byte string whose length is $len when $len < $base, and
     * otherwise a var-int from the stream plus $base.
     */
    protected function readBytes(int $len, int $base): string {
        $len = $this->readVarInt($len, $base);
        return $this->getBytes($len);
    }

    /**
     * Without arguments: reads a var-int (7 payload bits per byte, least
     * significant group first, high bit = continuation).
     * With arguments: returns $value if it is below $base, otherwise a
     * var-int read from the stream plus $base.
     */
    protected function readVarInt(?int $value = null, ?int $base = null): int {
        if ($value === null) {
            $b = $this->readUint8();
            if (($b & 0x80) === 0) {
                return $b;
            }
            return $this->readVarInt() << 7 | ($b & 0x7f);
        }

        if ($value < $base) {
            return $value;
        }
        return $this->readVarInt() + $base;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Extract bundles from a Magnolia database and print them as JSON on stdout.
 * Diagnostics go to stderr so they never end up inside the redirected JSON.
 *
 * Use it like `php extractTable.php users > data.json`
 */
require_once __DIR__ . '/BundleReader.php';

// The workspace name is mandatory; it selects the PM_<NAME>_BUNDLE table.
if (!isset($argv[1]) || $argv[1] === '') {
    fwrite(STDERR, "Usage: php extractTable.php <workspace> > data.json\n");
    exit(1);
}
$table = 'PM_' . strtoupper($argv[1]) . '_BUNDLE';

// Connect to the database. mysqli throws on failure by default since
// PHP 8.1 and sets connect_errno before that, so handle both.
try {
    $mysqli = new mysqli('<host>', '<user>', '<password>', '<database>');
} catch (mysqli_sql_exception $e) {
    fwrite(STDERR, sprintf("Connect failed: %s\n", $e->getMessage()));
    exit(1);
}
if ($mysqli->connect_errno) {
    fwrite(STDERR, sprintf("Connect failed: %s\n", $mysqli->connect_error));
    exit(1);
}

// Check that the table exists by listing all tables. SHOW TABLES names its
// single column after the schema, so read it by position rather than by key.
$result = $mysqli->query('SHOW TABLES');
if ($result === false) {
    fwrite(STDERR, "Could not list tables: {$mysqli->error}\n");
    exit(1);
}
$tables = [];
while ($row = $result->fetch_row()) {
    $tables[] = $row[0];
}
$result->free();

if (!in_array($table, $tables, true)) {
    fwrite(STDERR, "Table $table does not exist\n");
    exit(1);
}

// All rows from the table. $table was matched against the server's own list
// above; it is still quoted as an identifier before being interpolated.
$quotedTable = '`' . str_replace('`', '``', $table) . '`';
$result = $mysqli->query("SELECT NODE_ID as `id`, BUNDLE_DATA as `data` FROM $quotedTable");
if ($result === false) {
    fwrite(STDERR, "Could not read $table: {$mysqli->error}\n");
    exit(1);
}

$structure = [];
$root = null;

// Decode every bundle and index the result by node id.
while ($row = $result->fetch_assoc()) {
    $reader = new BundleReader(bin2hex($row['id']), $row['data']);
    $id = $reader->getBundleId();
    $parsed = $reader->getParsed();

    // An empty result means the reader failed and already reported it.
    if (!$parsed) {
        continue;
    }

    // Flatten the property list into name => value(s), keyed by local name.
    $props = [];
    foreach ($parsed['properties'] as $prop) {
        $exploded = explode('::', $prop['id']->name, 2);
        $name = $exploded[1] ?? $exploded[0];
        $props[$name] = $prop['multivalued'] ? $prop['values'] : $prop['values'][0];
    }
    $parsed['properties'] = $props;

    if ($parsed['nodeTypeName'] === 'REP_ROOT') {
        $root = $id;
    }

    $structure[$id] = $parsed;
}
$result->free();

// close the connection
$mysqli->close();

// Report every node (other than the root) whose parent was not extracted.
foreach ($structure as $id => $node) {
    if (!isset($structure[$node['parentId']]) && $id !== $root) {
        fwrite(STDERR, "Node $id has non existing parent: {$node['parentId']}\n");
        fwrite(STDERR, "Root is: $root\n");
    }
}

// Inline binary values are raw bytes and frequently not valid UTF-8; without
// these flags json_encode() would return false and nothing would be printed.
$json = json_encode(
    $structure,
    JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT | JSON_INVALID_UTF8_SUBSTITUTE | JSON_PARTIAL_OUTPUT_ON_ERROR
);
if ($json === false) {
    fwrite(STDERR, 'JSON encoding failed: ' . json_last_error_msg() . "\n");
    exit(1);
}
if (json_last_error() !== JSON_ERROR_NONE) {
    fwrite(STDERR, 'Warning: JSON output is partial: ' . json_last_error_msg() . "\n");
}
echo $json;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment