Skip to content

Instantly share code, notes, and snippets.

@rasteiner
Last active September 20, 2023 15:24
Show Gist options
  • Save rasteiner/943ff831bce1c886b3d8b32a050efbbd to your computer and use it in GitHub Desktop.
Save rasteiner/943ff831bce1c886b3d8b32a050efbbd to your computer and use it in GitHub Desktop.
JCR bundle reader in php
<?php
/**
 * Index table of well-known names.
 *
 * BundleReader::readName() maps a single byte whose high bit is clear to the
 * case with that backing value and returns the case's identifier (for example
 * "NT_UNSTRUCTURED") as the name. The backing values are therefore part of the
 * serialized format: the order and numbering must not be changed.
 *
 * NOTE(review): the list presumably mirrors the common-name table of the
 * Jackrabbit bundle format — confirm against the upstream source before
 * adding or renumbering cases.
 */
enum BundleNames: int {
case NT_UNSTRUCTURED = 0;
case NT_RESOURCE = 1;
case NT_FILE = 2;
case NT_FOLDER = 3;
case NT_HIERARCHYNODE = 4;
case MIX_REFERENCEABLE = 5;
case JCR_CREATED = 6;
case JCR_CREATEDBY = 7;
case JCR_LASTMODIFIED = 8;
case JCR_LASTMODIFIEDBY = 9;
case JCR_CONTENT = 10;
case JCR_MIMETYPE = 11;
case JCR_DATA = 12;
case JCR_TITLE = 13;
case JCR_LANGUAGE = 14;
case JCR_ENCODING = 15;
case JCR_SYSTEM = 16;
case REP_ROOT = 17;
case REP_SYSTEM = 18;
// Access control
case JCR_ADD_CHILD_NODES = 19;
case JCR_LIFECYCLE_MANAGEMENT = 20;
case JCR_LOCK_MANAGEMENT = 21;
case JCR_MODIFY_ACCESS_CONTROL = 22;
case JCR_MODIFY_PROPERTIES = 23;
case JCR_NODE_TYPE_MANAGEMENT = 24;
case JCR_READ = 25;
case JCR_READ_ACCESS_CONTROL = 26;
case JCR_REMOVE_CHILD_NODES = 27;
case JCR_REMOVE_NODE = 28;
case JCR_VERSION_MANAGEMENT = 29;
case REP_ACCESSCONTROL = 30;
case REP_ACCESS_CONTROL = 31;
case REP_ACCESS_CONTROLLABLE = 32;
case REP_ACE = 33;
case REP_ACL = 34;
case REP_DENY_ACE = 35;
case REP_GLOB = 36;
case REP_GRANT_ACE = 37;
case REP_POLICY = 38;
case REP_PRINCIPAL_ACCESS_CONTROL = 39;
case REP_PRINCIPAL_NAME = 40;
case REP_PRIVILEGES = 41;
// Locking
case MIX_LOCKABLE = 42;
case JCR_LOCKISDEEP = 43;
case JCR_LOCKOWNER = 44;
// Versioning
case MIX_VERSIONABLE = 45;
case NT_FROZENNODE = 46;
case NT_VERSION = 47;
case NT_VERSIONEDCHILD = 48;
case NT_VERSIONHISTORY = 49;
case NT_VERSIONLABELS = 50;
case JCR_VERSIONSTORAGE = 51;
case JCR_FROZENPRIMARYTYPE = 52;
case JCR_FROZENUUID = 53;
case JCR_FROZENNODE = 54;
case JCR_PREDECESSORS = 55;
case JCR_SUCCESSORS = 56;
case JCR_VERSIONLABELS = 57;
case JCR_VERSIONHISTORY = 58;
case JCR_VERSIONABLEUUID = 59;
case JCR_ROOTVERSION = 60;
case JCR_ISCHECKEDOUT = 61;
case JCR_BASEVERSION = 62;
case JCR_MERGEFAILED = 63;
case REP_NODETYPES = 64;
// Node types
case NT_NODETYPE = 65;
case NT_PROPERTYDEFINITION = 66;
case NT_CHILDNODEDEFINITION = 67;
case NT_BASE = 68;
case JCR_NODETYPES = 69;
case JCR_PROTECTED = 70;
case JCR_ONPARENTVERSION = 71;
case JCR_MANDATORY = 72;
case JCR_AUTOCREATED = 73;
case JCR_FROZENMIXINTYPES = 74;
case JCR_NAME = 75;
case JCR_VALUECONSTRAINTS = 76;
case JCR_REQUIREDTYPE = 77;
case JCR_PROPERTYDEFINITION = 78;
case JCR_MULTIPLE = 79;
case JCR_DEFAULTVALUES = 80;
case JCR_SUPERTYPES = 81;
case JCR_NODETYPENAME = 82;
case JCR_ISMIXIN = 83;
case JCR_HASORDERABLECHILDNODES = 84;
case JCR_SAMENAMESIBLINGS = 85;
case JCR_REQUIREDPRIMARYTYPES = 86;
case JCR_CHILDNODEDEFINITION = 87;
case JCR_DEFAULTPRIMARYTYPE = 88;
case JCR_PRIMARYITEMNAME = 89;
case JCR_CHILDVERSIONHISTORY = 90;
case REP_VERSIONS = 91;
case REP_VERSIONSTORAGE = 92;
case REP_VERSION_REFERENCE = 93;
case REP_BASEVERSIONS = 94;
// Miscellaneous node types
case MIX_CREATED = 95;
case MIX_ETAG = 96;
case MIX_LANGUAGE = 97;
case MIX_LASTMODIFIED = 98;
case MIX_LIFECYCLE = 99;
case MIX_MIMETYPE = 100;
case MIX_SHAREABLE = 101;
case MIX_SIMPLE_VERSIONABLE = 102;
case MIX_TITLE = 103;
case NT_ACTIVITY = 104;
case NT_ADDRESS = 105;
case NT_CONFIGURATION = 106;
case NT_QUERY = 107;
case NT_SHARE = 108;
// Miscellaneous names
case REP_ACTIVITIES = 109;
case JCR_ACTIVITIES = 110;
case JCR_ACTIVITY = 111;
case JCR_ACTIVITY_TITLE = 112;
case JCR_XMLCHARACTERS = 113;
case JCR_XMLTEXT = 114;
case REP_CONFIGURATIONS = 115;
case JCR_CONFIGURATION = 116;
case JCR_CONFIGURATIONS = 117;
case JCR_COPIEDFROM = 118;
case JCR_CURRENT_LIFECYCLE_STATE = 119;
case JCR_ETAG = 120;
case JCR_HOST = 121;
case JCR_ID = 122;
case JCR_LIFECYCLE_POLICY = 123;
case JCR_PATH = 124;
case JCR_STATEMENT = 125;
}
// Sentinel values that can appear where BundleReader::readBinary() reads the
// 32-bit size of a binary value. When the size equals one of these, no inline
// bytes follow; a string id is read instead and the value is reported as
// living in the blob store (-1) or the data store (-2).
const BINARY_IN_BLOB_STORE = -1;
const BINARY_IN_DATA_STORE = -2;
/**
 * Unsigned (logical) right shift for 64-bit integers, the equivalent of
 * Java's `>>>` operator on a long.
 *
 * PHP's own `>>` is an arithmetic shift that replicates the sign bit, so the
 * vacated high bits are masked off here. Assumes a 64-bit PHP build.
 *
 * @param int $base10 Left hand side of the operation: the value to shift
 * @param int $shift Right hand side: the number of bits to shift. Values of
 *                   zero or less return $base10 unchanged, values of 64 or
 *                   more return 0.
 * @return int The shifted value with zero-filled high bits
 */
function urs(int $base10, int $shift): int {
    if ($shift <= 0) {
        // Nothing to shift; also keeps negative inputs intact as integers.
        return $base10;
    }
    if ($shift >= 64) {
        // Every bit has been shifted out.
        return 0;
    }
    // Arithmetic shift first, then clear the $shift sign-extended top bits.
    return ($base10 >> $shift) & (PHP_INT_MAX >> ($shift - 1));
}
/**
 * Immutable pair of a bundle id and a property name.
 *
 * Both values are exposed as public read-only properties.
 */
class PropertyId {
    /**
     * @param string $bundleId Id of the bundle the property was read from
     * @param string $name Name of the property
     */
    public function __construct(
        public readonly string $bundleId,
        public readonly string $name,
    ) {
    }
}
/**
 * Type tag of a serialized property.
 *
 * BundleReader::readPropertyEntry() takes the tag from the low four bits of
 * the property header byte and uses it to decide how each value is decoded.
 *
 * NOTE(review): the numbering presumably mirrors the constants of
 * javax.jcr.PropertyType — confirm before changing any value.
 */
enum PropertyType: int {
case UNDEFINED = 0;
case STRING = 1;
case BINARY = 2;
case LONG = 3;
case DOUBLE = 4;
case DATE = 5;
case BOOLEAN = 6;
case NAME = 7;
case PATH = 8;
case REFERENCE = 9;
case WEAKREFERENCE = 10;
case URI = 11;
case DECIMAL = 12;
}
/**
 * Resolves a 5-bit "common timezone" index to a fixed-offset timezone.
 *
 * Indexes 0-15 are GMT+00:00 through GMT+15:00; indexes 16-31 are GMT-16:00
 * through GMT-01:00 (the index read as a 5-bit two's-complement hour offset).
 *
 * @param int $index Table index, 0 to 31
 * @return DateTimeZone The timezone for that whole-hour offset
 * @throws OutOfRangeException If $index is outside the table
 */
function getCommonTimezone(int $index): DateTimeZone {
    static $commonTimezones = [
        'GMT',
        'GMT+01:00',
        'GMT+02:00',
        'GMT+03:00',
        'GMT+04:00',
        'GMT+05:00',
        'GMT+06:00',
        'GMT+07:00',
        'GMT+08:00',
        'GMT+09:00',
        'GMT+10:00',
        'GMT+11:00',
        'GMT+12:00',
        'GMT+13:00',
        'GMT+14:00',
        'GMT+15:00',
        'GMT-16:00',
        'GMT-15:00',
        'GMT-14:00',
        'GMT-13:00',
        'GMT-12:00',
        'GMT-11:00',
        'GMT-10:00',
        'GMT-09:00',
        'GMT-08:00',
        'GMT-07:00',
        'GMT-06:00',
        'GMT-05:00',
        'GMT-04:00',
        'GMT-03:00',
        'GMT-02:00',
        'GMT-01:00',
    ];
    // Fail with a clear message instead of an undefined-key warning followed
    // by an obscure "bad timezone" error.
    if (!isset($commonTimezones[$index])) {
        throw new OutOfRangeException("Invalid common timezone index: $index");
    }
    return new DateTimeZone($commonTimezones[$index]);
}
/**
 * Formats a hexadecimal string as a dash-separated UUID (8-4-4-4-12).
 *
 * Only the first 32 characters are used. Shorter input yields empty trailing
 * groups; no validation is performed.
 *
 * @param string $hex Hexadecimal digits without separators
 * @return string The five groups joined with "-"
 */
function hexToUUID(string $hex) {
    $groups = [];
    $start = 0;
    // Widths of the five UUID groups, consumed left to right.
    foreach ([8, 4, 4, 4, 12] as $width) {
        $groups[] = substr($hex, $start, $width);
        $start += $width;
    }
    return implode('-', $groups);
}
/**
 * Decodes one serialized JCR node bundle (format version 3 or later) into a
 * plain PHP array.
 *
 * The whole bundle is parsed in the constructor. Parsing is best-effort: when
 * the body cannot be decoded, the error is written to STDERR and getParsed()
 * returns an empty array. Multi-byte numbers in the stream are big-endian
 * (the byte order written by Java's DataOutputStream).
 */
class BundleReader {
    /** Read position inside $data, in bytes. */
    private int $offset = 0;
    /** Format version taken from the first byte of the bundle. */
    private int $version = 0;
    /** Decoded bundle, or an empty array when decoding failed. */
    private array $parsed = [];
    /**
     * The default namespace and the first six other namespaces used in this
     * bundle. Used by the readName() method to keep track of
     * already seen namespaces.
     *
     * @var array<int, ?string>
     */
    private array $namespaces = [
        // NOTE: The length of this array must be seven
        '', null, null, null, null, null, null
    ];

    /**
     * @param string $bundleId Bundle id; a 32 character hex string is
     *                         converted to its dashed UUID form
     * @param string $data Raw bundle bytes
     * @throws Exception If the data is empty or the version is below 3
     */
    public function __construct(private string $bundleId, private string $data) {
        if (strlen($bundleId) === 32) {
            $this->bundleId = hexToUUID($bundleId);
        }
        $this->version = $this->readUint8();
        if ($this->version < 3) {
            throw new Exception('Invalid bundle version');
        }
        try {
            $this->parsed = $this->readBundle();
        } catch (Exception $e) {
            // Best effort: report the broken bundle and carry on with an
            // empty result so one bad row does not abort a bulk extraction.
            $msg = $e->getMessage() . "\nFailed before offset: {$this->offset}\nData:\n" . bin2hex($this->data) . "\n\n";
            fwrite(STDERR, $msg);
            $this->parsed = [];
        }
    }

    public function getBundleId(): string {
        return $this->bundleId;
    }

    /**
     * @return array The decoded bundle with the keys version, nodeTypeName,
     *               parentId, modCount, referenceable, mixinTypeNames,
     *               properties, childNodeEntries and sharedSet, or an empty
     *               array when decoding failed
     */
    public function getParsed(): array {
        return $this->parsed;
    }

    /**
     * Reads everything that follows the version byte.
     *
     * @throws Exception If the data is truncated or inconsistent
     */
    private function readBundle(): array {
        $bundle = [
            'version' => $this->version,
            'nodeTypeName' => $this->readName(),
            'parentId' => $this->readNodeId(),
            'modCount' => $this->readVarInt(),
        ];

        // One byte packs the referenceable flag (bit 0) and small counts for
        // the shared set (bit 1), child nodes (bits 2-3), properties
        // (bits 4-6) and mixins (bit 7). A count field at its maximum means
        // that the real count follows as a variable-length integer.
        $flags = $this->readUint8();
        $bundle['referenceable'] = ($flags & 1) !== 0;

        $bundle['mixinTypeNames'] = [];
        $mixinCount = $this->readVarInt(($flags >> 7) & 1, 1);
        for ($i = 0; $i < $mixinCount; $i++) {
            $bundle['mixinTypeNames'][] = $this->readName();
        }

        $bundle['properties'] = [];
        $propertyCount = $this->readVarInt(($flags >> 4) & 7, 7);
        for ($i = 0; $i < $propertyCount; $i++) {
            $id = new PropertyId($this->bundleId, $this->readName());
            $bundle['properties'][] = $this->readPropertyEntry($id);
        }

        // child nodes (list of name/uuid pairs)
        $bundle['childNodeEntries'] = [];
        $childCount = $this->readVarInt(($flags >> 2) & 3, 3);
        for ($i = 0; $i < $childCount; $i++) {
            $name = $this->readName();
            $id = $this->readNodeId();
            $bundle['childNodeEntries'][] = [
                'name' => $name,
                'id' => $id,
            ];
        }

        // shared set
        $bundle['sharedSet'] = [];
        $sharedCount = $this->readVarInt(($flags >> 1) & 1, 1);
        for ($i = 0; $i < $sharedCount; $i++) {
            $bundle['sharedSet'][] = $this->readNodeId();
        }

        return $bundle;
    }

    /**
     * Reads one property: header byte, modification count and values.
     *
     * @param PropertyId $id Id stored with the entry
     * @return array Entry with the keys id, type (PropertyType), multivalued,
     *               modCount and values
     * @throws Exception If the type tag is unknown or has no decoder
     */
    protected function readPropertyEntry(PropertyId $id) {
        $header = $this->readUint8();
        // tryFrom + Exception (instead of from + ValueError) so that the
        // constructor's best-effort handling also covers corrupt type tags.
        $type = PropertyType::tryFrom($header & 0x0f)
            ?? throw new Exception('Unknown property type: ' . ($header & 0x0f));

        // High nibble: 0 = single value, otherwise value count + 1, where
        // 0x0f means the remainder of the count follows as a varint.
        $len = $header >> 4;
        $count = 1;
        if ($len !== 0) {
            $count = $len === 0x0f ? $this->readVarInt() + 0x0f - 1 : $len - 1;
        }

        $entry = [
            'id' => $id,
            'type' => $type,
            'multivalued' => $len !== 0,
            'modCount' => $this->readVarInt(),
        ];

        $values = [];
        for ($i = 0; $i < $count; $i++) {
            $values[] = match ($type) {
                PropertyType::STRING,
                PropertyType::PATH,
                PropertyType::URI => $this->readString(),
                PropertyType::NAME => $this->readName(),
                PropertyType::REFERENCE,
                PropertyType::WEAKREFERENCE => $this->readNodeId(),
                PropertyType::DATE => $this->readDate(),
                PropertyType::LONG => $this->readVarLong(),
                PropertyType::BOOLEAN => $this->readUint8() !== 0,
                PropertyType::DOUBLE => $this->readDouble(),
                PropertyType::DECIMAL => $this->readDecimal(),
                PropertyType::BINARY => $this->readBinary(),
                default => throw new Exception('Unsupported property type: ' . $type->name),
            };
        }
        $entry['values'] = $values;
        return $entry;
    }

    /**
     * Reads a big-endian 32 bit signed integer.
     */
    protected function readInt(): int {
        // 'N' is unsigned big-endian on every platform; fold into signed.
        $value = unpack('N', $this->getBytes(4))[1];
        return $value >= 0x80000000 ? $value - 0x100000000 : $value;
    }

    /**
     * Reads a binary value.
     *
     * @return array|string The raw bytes for inline values, or an array with
     *                      the keys type ('data_store' or 'blob_store') and
     *                      id for externally stored values
     */
    protected function readBinary(): array|string {
        $size = $this->readInt();
        if ($size === BINARY_IN_DATA_STORE) {
            return [
                'type' => 'data_store',
                'id' => $this->readString(),
            ];
        }
        if ($size === BINARY_IN_BLOB_STORE) {
            return [
                'type' => 'blob_store',
                'id' => $this->readString(),
            ];
        }
        return $this->getBytes($size);
    }

    /**
     * Reads a big-endian 64 bit signed integer.
     *
     * @throws Exception On PHP builds without 64 bit integers
     */
    protected function readLong(): int {
        if (PHP_INT_SIZE < 8) {
            throw new Exception('64-bit integers are not supported');
        }
        // 'J' is big-endian; values with the top bit set come back negative
        // because PHP integers are signed, which is the wanted result.
        return unpack('J', $this->getBytes(8))[1];
    }

    /**
     * Reads a big-endian IEEE 754 double.
     */
    protected function readDouble(): float {
        return unpack('E', $this->getBytes(8))[1];
    }

    /**
     * Reads a decimal value: a presence byte followed, when non-zero, by the
     * number in string form.
     *
     * @return ?string The decimal as a numeric string, or null when absent
     */
    protected function readDecimal(): ?string {
        if ($this->readUint8() === 0) {
            return null;
        }
        return $this->readString();
    }

    /**
     * Reads a variable-length, zig-zag encoded 64 bit signed integer.
     */
    protected function readVarLong(): int {
        $value = 0;
        $bits = 0;
        do {
            $b = $this->readUint8();
            // New bits enter at the top and earlier ones move down, so the
            // first byte ends up least significant.
            if ($bits < 57) {
                $value = ($b & 0x7f) << 57 | urs($value, 7);
                $bits += 7;
            } else {
                $value = ($b & 0x01) << 63 | urs($value, 1);
                $bits = 64;
            }
        } while (($b & 0x80) !== 0);
        // Right-align what was read. Skipped when all 64 bits are in use:
        // a zero-bit shift is a no-op and must not go through urs().
        if ($bits < 64) {
            $value = urs($value, 64 - $bits);
        }
        // Zig-zag: the lowest bit carries the sign.
        if (($value & 1) !== 0) {
            return ~urs($value, 1);
        }
        return urs($value, 1);
    }

    /**
     * Reads a compactly encoded date.
     *
     * Layout, from the least significant bits up: a timezone (1, 7 or 13
     * bits), a 2 bit precision tag, the time of day at that precision, a
     * 9 bit day of the year (1 = January 1st) and the year relative to 2010.
     */
    protected function readDate(): DateTime {
        $ts = $this->readVarLong();

        if (($ts & 1) === 0) {
            $tz = getCommonTimezone(0);
            $ts >>= 1;
        } elseif (($ts & 2) === 0) {
            $tz = getCommonTimezone(($ts >> 2) & 0x1f);
            $ts >>= 7;
        } else {
            // 11 bit signed offset in minutes; sign-extend it by hand since
            // PHP integers are 64 bits wide.
            $offset = ($ts >> 2) & 0x07ff;
            if (($offset & 0x0400) !== 0) {
                $offset -= 0x0800;
            }
            $minutes = abs($offset);
            $tz = new DateTimeZone(sprintf(
                '%s%02d:%02d',
                $offset < 0 ? '-' : '+',
                intdiv($minutes, 60),
                $minutes % 60
            ));
            $ts >>= 13;
        }

        $millis = 0;
        $seconds = 0;
        $minutes = 0;
        $hours = 0;
        $precision = $ts & 3;
        $ts >>= 2;
        switch ($precision) {
            case 3:
                // milliseconds since midnight, 30 bits
                $ofDay = $ts & 0x3fffffff;
                $hours = intdiv($ofDay, 3600000);
                $minutes = intdiv($ofDay, 60000) % 60;
                $seconds = intdiv($ofDay, 1000) % 60;
                $millis = $ofDay % 1000;
                $ts >>= 30;
                break;
            case 2:
                // minutes since midnight, 11 bits
                $ofDay = $ts & 0x07ff;
                $hours = intdiv($ofDay, 60);
                $minutes = $ofDay % 60;
                $ts >>= 11;
                break;
            case 1:
                // hour of day, 5 bits
                $hours = $ts & 0x1f;
                $ts >>= 5;
                break;
        }

        $dayOfYear = $ts & 0x01ff; // 9 bits
        $ts >>= 9;
        $year = $ts + 2010;

        // Set the fields numerically instead of parsing a formatted string:
        // "January <dayOfYear>" overflows into the right month, milliseconds
        // keep their scale and any year works.
        $date = new DateTime('2010-01-01 00:00:00', $tz);
        $date->setDate($year, 1, $dayOfYear);
        $date->setTime($hours, $minutes, $seconds, $millis * 1000);
        return $date;
    }

    /**
     * Consumes the next $length bytes.
     *
     * @throws Exception If $length is negative or runs past the end of the data
     */
    protected function getBytes(int $length): string {
        if ($length < 0 || $this->offset + $length > strlen($this->data)) {
            throw new Exception("Unexpected end of bundle data: cannot read {$length} byte(s) at offset {$this->offset}");
        }
        $chunk = substr($this->data, $this->offset, $length);
        $this->offset += $length;
        return $chunk;
    }

    protected function readUint8(): int {
        return ord($this->getBytes(1));
    }

    /**
     * Reads a 16 byte node id and returns it as a dashed UUID string.
     */
    protected function readNodeId(): string {
        return hexToUUID(bin2hex($this->getBytes(16)));
    }

    /**
     * Reads a name.
     *
     * A byte with the high bit clear is an index into BundleNames and yields
     * that case's identifier. Otherwise bits 4-6 select a namespace slot (the
     * URI is read from the stream the first time a slot is seen) and the low
     * nibble carries the length of the local name.
     *
     * @return string A BundleNames identifier or "<namespace uri>::<local name>"
     * @throws Exception If the index does not match a BundleNames case
     */
    protected function readName(): string {
        $b = $this->readUint8();
        if (($b & 0x80) === 0) {
            $known = BundleNames::tryFrom($b)
                ?? throw new Exception("Unknown common name index: {$b}");
            return $known->name;
        }

        $slot = ($b >> 4) & 0x07;
        $cacheable = $slot < count($this->namespaces);
        if ($cacheable && $this->namespaces[$slot] !== null) {
            $uri = $this->namespaces[$slot];
        } else {
            $uri = $this->readString();
            if ($cacheable) {
                $this->namespaces[$slot] = $uri;
            }
        }

        $local = $this->readBytes(($b & 0x0f) + 1, 0x10);
        return "{$uri}::{$local}";
    }

    /**
     * Reads a length-prefixed string (the bytes are returned unchanged).
     */
    protected function readString(): string {
        return $this->readBytes(0, 0);
    }

    /**
     * Reads a byte string whose length is $len when $len is below $base, or
     * $base plus a following variable-length integer otherwise.
     */
    protected function readBytes(int $len, int $base): string {
        return $this->getBytes($this->readVarInt($len, $base));
    }

    /**
     * Reads a variable-length integer.
     *
     * Without arguments: 7 bits per byte, least significant group first, the
     * high bit of each byte flagging a continuation. With arguments: $value
     * is returned when it is below $base, otherwise $base is added to a
     * variable-length integer read from the stream.
     *
     * @param ?int $value Small value taken from a packed header field
     * @param ?int $base Smallest value that no longer fits the header field
     */
    protected function readVarInt(?int $value = null, ?int $base = null): int {
        if ($value !== null) {
            $base ??= 0;
            if ($value < $base) {
                return $value;
            }
            return $this->readVarInt() + $base;
        }

        $result = 0;
        $shift = 0;
        do {
            $b = $this->readUint8();
            $result |= ($b & 0x7f) << $shift;
            $shift += 7;
        } while (($b & 0x80) !== 0);
        return $result;
    }
}
<?php
/**
Extract bundles from magnolia db.
Use it like `php extractTable.php users > data.json`
*/
include 'BundleReader.php';
// The workspace name is required: it selects the PM_<WORKSPACE>_BUNDLE table.
$workspace = $argv[1] ?? '';
if ($workspace === '') {
    fwrite(STDERR, "Usage: php extractTable.php <workspace> > data.json\n");
    exit(1);
}

// connect to the database
$mysqli = new mysqli('<host>', '<user>', '<password>', '<database>');
// check connection; diagnostics go to STDERR because STDOUT carries the JSON
if ($mysqli->connect_errno) {
    fwrite(STDERR, sprintf("Connect failed: %s\n", $mysqli->connect_error));
    exit(1);
}

$table = 'PM_' . strtoupper($workspace) . '_BUNDLE';

// Check that the table exists by listing all tables. The name is later
// interpolated into SQL, so only a name the server itself reported is used.
$result = $mysqli->query('SHOW TABLES');
if ($result === false) {
    fwrite(STDERR, "Could not list tables: {$mysqli->error}\n");
    exit(1);
}
$tables = [];
// fetch_row(): the column is named after the database ("Tables_in_<db>"),
// so read it by position instead of by a hard-coded name.
while ($row = $result->fetch_row()) {
    $tables[] = $row[0];
}
$result->free();
if (!in_array($table, $tables, true)) {
    fwrite(STDERR, "Table $table does not exist\n");
    exit(1);
}

// all rows from table
$result = $mysqli->query("SELECT NODE_ID as `id`, BUNDLE_DATA as `data` FROM `$table`");
if ($result === false) {
    fwrite(STDERR, "Could not read $table: {$mysqli->error}\n");
    exit(1);
}

$structure = [];
$root = null;
// iterate over the results
while ($row = $result->fetch_assoc()) {
    $reader = new BundleReader(bin2hex($row['id']), $row['data']);
    $id = $reader->getBundleId();
    $parsed = $reader->getParsed();
    if (!$parsed) {
        // the reader has already reported the broken bundle on STDERR
        continue;
    }

    // Flatten the property entries into a "local name => value(s)" map.
    $props = [];
    foreach ($parsed['properties'] as $prop) {
        // Names look like "<namespace>::<local>"; well-known names have no
        // namespace part and are used as they are.
        $exploded = explode('::', $prop['id']->name);
        $name = $exploded[1] ?? $exploded[0];
        $props[$name] = $prop['multivalued'] ? $prop['values'] : $prop['values'][0];
    }
    $parsed['properties'] = $props;

    if ($parsed['nodeTypeName'] === 'REP_ROOT') {
        $root = $id;
    }
    $structure[$id] = $parsed;
}
$result->free();

// report every node (other than the root) whose parent is not in the table
foreach ($structure as $id => $node) {
    if (!isset($structure[$node['parentId']]) && $id !== $root) {
        fwrite(STDERR, "Node $id has non existing parent: {$node['parentId']}\n");
        fwrite(STDERR, "Root is: $root\n");
    }
}

// close the connection
$mysqli->close();

$json = json_encode($structure, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
if ($json === false) {
    // e.g. inline binary values that are not valid UTF-8
    fwrite(STDERR, 'Could not encode the result as JSON: ' . json_last_error_msg() . "\n");
    exit(1);
}
echo $json;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment