Skip to content

Instantly share code, notes, and snippets.

@nikic
Last active August 3, 2023 05:48
Show Gist options
  • Save nikic/98357b71fd67756b0f064c9517b62a34 to your computer and use it in GitHub Desktop.
Save nikic/98357b71fd67756b0f064c9517b62a34 to your computer and use it in GitHub Desktop.
<?php declare(strict_types=1);
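/**
 * Usage: php opaquify.php [-f|--force] [--keep-bitcasts] file|dir [file|dir...]
 *
 * Options:
 * - -f, --force: also process files that do not contain a
 *   -no-opaque-pointers / -opaque-pointers=0 flag.
 * - --keep-bitcasts: skip the final pass that removes bitcast instructions.
 *
 * Tests that fail afterwards usually fall into two categories:
 * - opaquify.php produced malformed IR. This usually happens because bitcasts
 *   referenced by PHI nodes were dropped. This requires manually updating the
 *   PHI argument.
 * - There is a difference in IR output. These are usually benign differences,
 *   but those are the cases that require human verification.
 */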
// Refuse to rewrite any file unless the built-in self-test passes.
if (!selfTest()) {
    exit(-1);
}

// Separate the option flags from the paths to process.
$force = false;
$keepBitcasts = false;
$args = [];
foreach (array_slice($argv, 1) as $arg) {
    if ($arg === '-f' || $arg === '--force') {
        $force = true;
    } elseif ($arg === '--keep-bitcasts') {
        $keepBitcasts = true;
    } else {
        $args[] = $arg;
    }
}

// Checked after option parsing so that an invocation consisting only of
// flags is reported instead of silently doing nothing.
if ($args === []) {
    echo "Missing path argument.\n";
    echo "Usage: php opaquify.php file|dir [file|dir...]\n";
    exit(-1);
}

// Increase backtracking limit for some very large tests.
ini_set('pcre.backtrack_limit', '10000000');
ini_set('pcre.recursion_limit', '500000');

$failures = 0;
foreach (files($args) as $f) {
    // Only IR-like files and source files carrying CHECK lines are handled;
    // everything else is skipped based on the file extension.
    $isIR = (bool) preg_match('/\.(?:ll|mir)$/', $f);
    $isC = (bool) preg_match('/\.(?:c|cpp|cl|clcpp|cu|m|mm|mlir|fir)$/', $f);
    if (!$isIR && !$isC) {
        continue;
    }

    $orig = file_get_contents($f);
    if ($orig === false) {
        // An unreadable file must not abort the whole run (passing false on
        // to str_replace() would be a TypeError under strict_types).
        fwrite(STDERR, "Failed to read $f, skipping.\n");
        $failures++;
        continue;
    }

    $new = $orig;
    if ($isC) {
        $new = str_replace('-Xclang -no-opaque-pointers ', '', $new);
        $new = str_replace('-no-opaque-pointers ', '', $new);
        if ($new === $orig && !$force) {
            // Only opaquify if there was a -no-opaque-pointers flag.
            continue;
        }
    } else {
        $new = str_replace('-opaque-pointers=0 ', '', $new);
        if ($new === $orig && !$force) {
            // Only opaquify if there was an -opaque-pointers=0 flag.
            continue;
        }
    }

    $new = opaquify($new, $isIR, $keepBitcasts);
    if ($new !== $orig && file_put_contents($f, $new) === false) {
        fwrite(STDERR, "Failed to write $f.\n");
        $failures++;
    }
}

// Signal partial failure to the caller once all files have been attempted.
if ($failures > 0) {
    exit(1);
}
/**
 * Runs the complete conversion pipeline over the contents of one file.
 *
 * The passes run in a fixed order: typed pointers are rewritten to `ptr`
 * (repeated until nothing changes), intrinsic names are remangled,
 * declarations that became duplicates are dropped, constant expressions are
 * stripped (again repeated until nothing changes) and, unless $keepBitcasts
 * is set, bitcast instructions are removed last.
 *
 * @param string $c            File contents to convert.
 * @param bool   $isIR         Forwarded to the pointer and bitcast passes.
 * @param bool   $keepBitcasts When true, the bitcast removal pass is skipped.
 * @return string The converted contents.
 */
function opaquify(string $c, bool $isIR, bool $keepBitcasts = false): string {
    $c = replaceWithPtrFixpoint($c, $isIR);
    $c = removeDuplicateDeclarations(remangleIntrinsics($c));
    $c = replaceToFixpoint($c, static function ($text) {
        return removeConstantExprs($text);
    });
    return $keepBitcasts ? $c : replaceBitcasts($c, $isIR);
}
/**
 * Returns the smaller of two optional offsets.
 *
 * A null argument means "no offset"; it is ignored in favour of the other
 * argument. The result is null only when both arguments are null.
 */
function minOffset(?int $off1, ?int $off2): ?int {
    if ($off1 === null) {
        return $off2;
    }
    if ($off2 === null) {
        return $off1;
    }
    return min($off1, $off2);
}
/**
 * Finds the offset at which replacing uses of $var has to stop.
 *
 * Starting at $startOffset, this looks for the next assignment to $var
 * ("$var" followed by optional whitespace and "="), the next "define " and
 * the next "...\n---" sequence (presumably a document separator in .mir
 * files - confirm), and returns the smallest of those offsets.
 *
 * @param string $c           Text to search.
 * @param string $var         Variable text as it appears on the left-hand side.
 * @param int    $startOffset Byte offset at which the search begins.
 * @return int|null Byte offset of the nearest boundary, or null if none exists.
 */
function findNextAssign(string $c, string $var, int $startOffset): ?int {
    $candidates = [];

    $regex = '/' . preg_quote($var, '/') . '\s*=/';
    if (preg_match($regex, $c, $matches, PREG_OFFSET_CAPTURE, $startOffset)) {
        $candidates[] = $matches[0][1];
    }

    foreach (['define ', "...\n---"] as $marker) {
        $pos = strpos($c, $marker, $startOffset);
        // strpos() reports "not found" as false; offset 0 is a valid hit and
        // must not be confused with it.
        if ($pos !== false) {
            $candidates[] = $pos;
        }
    }

    return $candidates === [] ? null : min($candidates);
}
/**
 * Converts a FileCheck variable definition into the form used to reference it.
 *
 * A name shaped like `[[NAME:regex]]` (optionally prefixed with `%`) becomes
 * `[[NAME]]` with the same prefix. Any other name is returned unchanged.
 */
function getUseName(string $name): string {
    $isDefinition = preg_match('/^(%?)\[\[([^:]+):.*\]\]$/', $name, $parts);
    return $isDefinition ? sprintf('%s[[%s]]', $parts[1], $parts[2]) : $name;
}
/**
 * Applies a callback-based regex replacement to one byte range of a string.
 *
 * Only the bytes in [$startOffset, $endOffset) are passed to
 * preg_replace_callback(); everything before and after is copied verbatim.
 *
 * @param string   $regex       PCRE pattern including delimiters.
 * @param string   $input       Full text.
 * @param int      $startOffset First byte of the range.
 * @param int|null $endOffset   One past the last byte of the range, or null
 *                              to extend the range to the end of $input.
 * @param Closure  $fn          Callback receiving the match array.
 * @return string $input with the replacements applied inside the range.
 * @throws RuntimeException If PCRE reports an error. Without this check the
 *                          null result would be concatenated as an empty
 *                          string, silently deleting the whole range.
 */
function replaceInRange(
    string $regex, string $input, int $startOffset, ?int $endOffset,
    Closure $fn
): string {
    $prefix = substr($input, 0, $startOffset);
    if ($endOffset === null) {
        $middle = substr($input, $startOffset);
        $suffix = '';
    } else {
        $middle = substr($input, $startOffset, $endOffset - $startOffset);
        $suffix = substr($input, $endOffset);
    }

    $replaced = preg_replace_callback($regex, $fn, $middle);
    if ($replaced === null) {
        throw new RuntimeException(
            'PCRE replace failed with: ' . preg_last_error_msg());
    }
    return $prefix . $replaced . $suffix;
}
/**
 * Closes the numbering gap left behind by a removed numbered instruction.
 *
 * Every `%N` with N greater than $removedNum is rewritten to `%(N-1)`. The
 * rewrite starts at $startOffset and stops at the next occurrence of
 * "define" or "...\n---", or at the end of the text if neither occurs.
 *
 * @param string $c           Text to update.
 * @param int    $removedNum  Number of the value that was removed.
 * @param int    $startOffset Byte offset at which renumbering begins.
 * @return string The text with the affected value numbers decremented.
 */
function renumberInstructions(
    string $c, int $removedNum, int $startOffset
): string {
    $endOffset = null;
    foreach (['define', "...\n---"] as $marker) {
        $pos = strpos($c, $marker, $startOffset);
        // A boundary at offset 0 is a real boundary; only false means that
        // the marker does not occur.
        if ($pos !== false) {
            $endOffset = minOffset($endOffset, $pos);
        }
    }

    return replaceInRange(
        '/%(\d+)/', $c, $startOffset, $endOffset,
        function($matches) use ($removedNum) {
            $num = (int) $matches[1];
            if ($num > $removedNum) {
                return '%' . ($num - 1);
            }
            return $matches[0];
        });
}
/**
 * Removes pointer bitcast instructions and forwards their operand to the uses.
 *
 * A candidate is a whole line of the form `<res> = bitcast ptr <arg> to ...`,
 * optionally followed by metadata. When $isIR is true, a line of the form
 * `<res> = getelementptr ..., ptr <arg>, iN 0, ...` whose indices are all
 * zero is treated the same way.
 *
 * For every candidate, the uses of <res> between the end of the line and the
 * offset reported by findNextAssign() are replaced by <arg>, the line itself
 * is deleted, a directly following `X-NEXT:` check is relaxed to `X:` where
 * X is the check name found on the deleted line, and numbered values are
 * renumbered if <res> was a numbered value.
 *
 * When $isIR is false, check lines that consist only of `bitcast` or
 * `bitcast ptr ...` are deleted at the end.
 *
 * @param string $c    Text to process.
 * @param bool   $isIR Whether zero-index GEPs are removed as well; when
 *                     false, the final check-line cleanup runs instead.
 * @return string The text without the removed instructions.
 */
function replaceBitcasts(string $c, bool $isIR): string {
// Don't replace zero-index GEPs in clang tests. These are still getting
// generated by the frontend.
$gep = $isIR ? '| getelementptr(?:\s+inbounds)?\s+[^,]+,\s*(?&ptr_ty)\s+(?<arg>[^,]+)(?:\s*,\s*i\d+\s+0)+' : '';
// Interpolating heredoc: $gep is spliced in as an extra alternative.
$regex = <<<REGEX
/
(?(DEFINE)
(?<ptr_ty> ptr(?:\s+addrspace\s*\(\s*\d+\s*\))?)
)
^(?<prefix>.*?) (?<res>\S+|%?\[\[.+\]\]) \h+ = \h+
(?|
bitcast\h+(?&ptr_ty)\h+(?<arg>\S+)\h+to\h+.*
$gep
)
(?:\s*,\s*!.*)? # Ignore metadata
$
/mx
REGEX;
$offset = 0;
while (preg_match($regex, $c, $matches, PREG_OFFSET_CAPTURE, $offset)) {
$startOffset = $matches[0][1];
$len = strlen($matches[0][0]);
$res = $matches['res'][0];
$arg = $matches['arg'][0];
// For FileCheck definitions such as [[A:%.*]] the use name is [[A]].
$resUseName = getUseName($res);
$argUseName = getUseName($arg);
// Uses are only rewritten up to this offset (null means end of text).
$nextAssign = findNextAssign($c, $res, $startOffset + $len);
if ($argUseName === $arg) {
// Replace uses of $res with $arg
// The lookahead prevents matching a longer name that merely starts with
// the use name.
$useRegex = '/' . preg_quote($resUseName, '/') . '(?![\w.])/';
$c = replaceInRange(
$useRegex, $c, $startOffset + $len, $nextAssign,
function($matches) use($argUseName) {
return $argUseName;
});
} else {
// If the bitcast used to define the argument [[A::%.*]], we need to
// shift this definition to the first later occurrence of [[A]] (which
// might be a newly introduced one).
$useRegex = '/(?:' . preg_quote($resUseName, '/') . '|'
. preg_quote($argUseName, '/') . ')(?![\w.])/';
$placedDef = false;
$c = replaceInRange(
$useRegex, $c, $startOffset + $len, $nextAssign,
function($matches) use($arg, $argUseName, &$placedDef) {
// The first occurrence receives the full definition, later ones the use.
if (!$placedDef) {
$placedDef = true;
return $arg;
}
return $argUseName;
});
}
// Remove the bitcast itself
// ($len + 1 also drops the character following the matched line.)
$c = substr_replace($c, '', $startOffset, $len + 1);
// Resume scanning at the text that moved into the removed line's place.
$offset = $startOffset;
// If CHECK: is followed by CHECK-NEXT:, convert it to CHECK:
if (preg_match('/(\w+):/', $matches['prefix'][0], $matches1)) {
$checkName = $matches1[1];
// \G anchors at $startOffset and (.*) cannot cross a newline, so only the
// line that now starts at $startOffset is inspected.
if (preg_match('/\G(.*)' . preg_quote($checkName, '/') . '-NEXT:/', $c, $matches, 0, $startOffset)) {
$c = substr($c, 0, $startOffset + strlen($matches[1])) .
$checkName . ':' .
substr($c, $startOffset + strlen($matches[0]));
}
}
// Dropping a numbered instruction requires renumbering
if (preg_match('/%(\d+)/', $res, $matches))
$c = renumberInstructions($c, (int) $matches[1], $startOffset);
}
if (!$isIR) {
// Delete check lines whose whole content is `bitcast` or `bitcast ptr ...`.
$c = preg_replace('/^.*?[A-Z0-9](?:-NEXT)?:\h+bitcast(?:|\h+ptr.*)\n/m', '', $c);
}
return $c;
}
/**
 * Performs one pass of constant-expression stripping.
 *
 * `bitcast (ptr X to ptr)` and `getelementptr (..., ptr X, iN 0, ...)` with
 * only zero indices are each replaced by their operand X. Afterwards `ptr`
 * is inserted in front of an alias target written without a type.
 *
 * Replaced text is not rescanned within the same call, so nested expressions
 * need repeated calls until the result stops changing.
 */
function removeConstantExprs(string $c): string {
    // Matches bitcast and zero-offset GEP constant expressions; the operand
    // is captured as "arg" in either alternative.
    $pattern = <<<'REGEX'
/
(?(DEFINE)
(?<ptr_ty> ptr(?:\s+addrspace\s*\(\s*\d+\s*\))?|\{\{\.\*\}\})
)
(?|
bitcast\s*\(\s*(?&ptr_ty)\s+(?<arg>.+?)\s+to\s+(?&ptr_ty)\s*\)
| getelementptr(?:\s+inbounds)?\s*\((?:(?:.+?,\s*)?(?&ptr_ty)|\{\{\.\*\}\})\s+(?<arg>[^,]+)(?:\s*,\s*i\d+\s+0)+\)
)/x
REGEX;
    $stripped = preg_replace_callback(
        $pattern,
        static function (array $m) {
            return $m['arg'];
        },
        $c
    );

    // Aliases allow an inconsistent syntax missing the "ptr".
    return preg_replace('/alias([^,]+), @/', 'alias$1, ptr @', $stripped);
}
/**
 * Remangles intrinsic names, e.g. llvm.xyz.p0i8 becomes llvm.xyz.p0.
 *
 * Within every `@llvm.*` name, each `.pN` component (optionally preceded by
 * a `vN` or `nxvN` vector prefix) loses the pointee suffix that follows the
 * address space number.
 */
function remangleIntrinsics(string $c): string {
    $stripPointee = static fn(array $m) => preg_replace(
        '/(\.(?:(?:nx)?v\d+)?p\d+)(?:\w*s.+?s(?=\.|$)|[^.]*)/', '$1', $m[0]);
    return preg_replace_callback('/@llvm\.[\w.]+/', $stripPointee, $c);
}
/**
 * Drops repeated declarations of the same intrinsic.
 *
 * After remangling, several `declare` lines may name the same `@llvm.*`
 * intrinsic. The first line for each name is kept and later ones are
 * removed. A leading comment-style check prefix (`; NAME:` or `// NAME:`)
 * is part of the identity, so lines with different prefixes are tracked
 * separately.
 */
function removeDuplicateDeclarations(string $c): string {
    $seen = [];
    $dropRepeats = function (array $m) use (&$seen) {
        $key = $m[1] . $m[2];
        if (array_key_exists($key, $seen)) {
            return '';
        }
        $seen[$key] = true;
        return $m[0];
    };
    return preg_replace_callback(
        '/^(\h*(?:;|\/\/)\h+\w+:\h*)?declare.*@(llvm\.[\w.]+).*\n/m',
        $dropRepeats,
        $c);
}
/**
 * Applies $fn to every `//` comment and leaves all other text untouched.
 *
 * A comment runs from `//` to the end of its line; $fn receives the comment
 * text including the `//` and its return value replaces that text.
 */
function replaceOnlyInComments(string $c, callable $fn): string {
    $rewriteComment = static function (array $m) use ($fn) {
        return $fn($m[0]);
    };
    return preg_replace_callback('(//.*$)m', $rewriteComment, $c);
}
/**
 * Applies $replaceFn repeatedly until its output equals its input.
 *
 * @param string  $c         Initial text.
 * @param Closure $replaceFn Transformation applied on every round.
 * @return string The first result that the transformation leaves unchanged.
 */
function replaceToFixpoint(string $c, Closure $replaceFn): string {
    while (true) {
        $next = $replaceFn($c);
        if ($next === $c) {
            return $c;
        }
        $c = $next;
    }
}
/**
 * Rewrites typed pointers to `ptr` until no further change occurs.
 *
 * Each round applies replaceWithPtr() to the whole text when $isIR is true,
 * or only to `//` comments otherwise, and then runs
 * replacePlaceholderWithPtr() on the whole text.
 */
function replaceWithPtrFixpoint(string $c, bool $isIR): string {
    $onePass = function ($text) use ($isIR) {
        if ($isIR) {
            $text = replaceWithPtr($text);
        } else {
            $text = replaceOnlyInComments($text, 'replaceWithPtr');
        }
        return replacePlaceholderWithPtr($text);
    };
    return replaceToFixpoint($c, $onePass);
}
/**
 * Performs one pass of replacing typed pointer types with `ptr`.
 *
 * A pointee type followed by `*` (with an optional address space) becomes
 * `ptr` plus the address space of the outermost pointer. FileCheck
 * placeholders and `;` comment lines that do not carry a check prefix are
 * skipped. A space is inserted when the character following the `*` is a
 * letter, `%` or `@`, so that the result does not fuse with the next token.
 *
 * @throws Exception If PCRE reports an error.
 */
function replaceWithPtr(string $c): string {
    $pattern = <<<'REGEX'
/
\[\[ .*? \]\](*SKIP)(*F)| # Skip FileCheck placeholders
\{\{ .*? \}\}(*SKIP)(*F)| # Skip FileCheck placeholders
\n;(?!\h*[\w-]+:).*(*SKIP)(*F)| # Skip code comments
(?:
void | half | bfloat | float | double | x86_fp80 | fp128 | ppc_fp128
| label | metadata | x86_mmx | x86_amx | type | label | opaque | token | ptr
| i\d+ # Integer type
| %[\w.-]+ # Named struct type
| %"[^"]+" # Named struct type
| (\[ \s* \d+ \s* x [^[\]]* (?: (?-1) [^[\]]* )* \]) # Array type
| < (?:vscale\s+x\s+)? \d+ \s* x [^<>]* > # Vector type
| (\{ [^{}]* (?: (?-1) [^{}]* )* \}) # Struct type
| < (?-1) > # Packed struct type
)
(?: \s* (\( [^()]* (?: (?-1) [^()]* )* \)) )? # Function type
(?: # Ignore intermediate pointer types
(?: \s* addrspace \s* \( \s* \d+ \s* \) )?
\s* \*
)*
(?<addrspace> \s* addrspace \s* \( \s* \d+ \s* \) )?
\s* \* # Pointer type
(?<trailing>.?)
/x
REGEX;

    $toPtr = static function (array $m) {
        $next = $m['trailing'];
        $result = 'ptr' . $m['addrspace'];
        if (ctype_alpha($next) || $next === '%' || $next === '@') {
            // Keep "ptr" separated from an identifier that directly follows.
            $result .= ' ';
        }
        return $result . $next;
    };

    $rewritten = preg_replace_callback($pattern, $toPtr, $c);
    if ($rewritten === null) {
        throw new Exception('PCRE replace failed with: ' . preg_last_error_msg());
    }
    return $rewritten;
}
/**
 * Replaces pointers whose pointee type is a FileCheck placeholder with `ptr`.
 *
 * This is handled separately from replaceWithPtr(), whose pattern skips
 * placeholders. Matched forms are `[[NAME]]`, `[[NAME:regex]]`, `[[#EXPR]]`
 * and `{{regex}}` (each with optional `%` / `i` / `%name` prefixes as
 * allowed by the pattern), followed by an optional address space and `*`.
 *
 * When the replaced placeholder defined a variable (`[[NAME:regex]]`), the
 * definition would be lost, so the next `[[NAME]]` use after the
 * replacement is turned into the definition.
 *
 * @throws Exception If PCRE reports an error while searching.
 */
function replacePlaceholderWithPtr(string $c): string {
    $regex = <<<'REGEX'
/
%?
(?: i?\[\[(?:\#\w+|(?<name>\w+)(?::(?<regex>[^]]+))?)\]\]
| i?(?:%\w+)?\{\{[^}]+\}\}
)
(?<addrspace>(?:\s+ addrspace \s* \( \s* \d+ \s* \) )?)
\*
/x
REGEX;
    $offset = 0;
    while (true) {
        $found = preg_match($regex, $c, $matches, PREG_OFFSET_CAPTURE, $offset);
        if ($found === false) {
            throw new Exception('PCRE match failed with: ' . preg_last_error_msg());
        }
        if ($found === 0) {
            break;
        }

        $startOffset = $matches[0][1];
        $len = strlen($matches[0][0]);
        $replacement = 'ptr' . $matches['addrspace'][0];
        $c = substr_replace($c, $replacement, $startOffset, $len);
        $offset = $startOffset + strlen($replacement);

        if (isset($matches['regex']) && $matches['regex'][0] !== "") {
            // Move var definition to next use.
            $name = $matches['name'][0];
            $varRegex = $matches['regex'][0];
            // Plain string splicing is used on purpose: the captured regex
            // may contain "\" or "$", which preg_replace() would interpret
            // as escapes or backreferences in a replacement string.
            $use = '[[' . $name . ']]';
            $usePos = strpos($c, $use, $offset);
            if ($usePos !== false) {
                $c = substr_replace(
                    $c, '[[' . $name . ':' . $varRegex . ']]',
                    $usePos, strlen($use));
            }
        }
    }
    return $c;
}
/**
 * Lazily yields the path of every file named by, or located below, $paths.
 *
 * A path that is a regular file is yielded as is. Any other path is handed
 * to a recursive directory iterator and the path name of each leaf entry is
 * yielded.
 *
 * @param array $paths File and directory paths.
 * @return iterable Generator of path strings.
 */
function files(array $paths): iterable {
    foreach ($paths as $path) {
        if (is_file($path)) {
            yield $path;
            continue;
        }

        $walker = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($path),
            RecursiveIteratorIterator::LEAVES_ONLY
        );
        foreach ($walker as $entry) {
            yield $entry->getPathname();
        }
    }
}
/**
 * Runs opaquify() on two built-in fixtures and compares against the expected
 * output.
 *
 * The first fixture is processed with isIR: true, the second with
 * isIR: false. On the first mismatch the actual result is printed and false
 * is returned; true is returned when both fixtures produce exactly the
 * expected text.
 */
function selfTest() {
// Fixture 1: processed as IR.
$input = <<<'INPUT'
i8*
i8 addrspace(1)*
i8 addrspace(1)* addrspace(2)*
i8* (i8*)*
i8 (i8 addrspace(1)*)*
{ i8, { i8 } }*
<vscale x 2 x i32>*
<{ i32, i32 }>*
[[T]]*
[[T:.*]]*
[[T]] addrspace(1)*
i[[#N]]*
i[[N:.*]]*
i[[N]]*
{{.+}}*
i{{32|64}}*
%foo{{.+}}*
i8 *%x
i8 *@x
i8*{{[a-z_ ]*}}
; CHECK: declare void @llvm.foo.p0i8.p1i16() attrs
; CHECK: declare void @llvm.foo.p0i8.p1i32() attrs
// FOO: declare void @llvm.foo.p0i8.p1i32() attrs
declare void @llvm.foo.p0i8.p1i16() attrs
declare void @llvm.foo.p0i8.p1i32() attrs
declare void @llvm.bar.v8p0i16()
declare void @llvm.bar.nxv8p0i16()
declare void @llvm.bar.p256()
declare void @llvm.bar.p0s_struct.s1s.p0s_struct.s1s()
declare void @llvm.bar.p0s_struct.s1s.p0a3s_struct.s1s()
; CHECK-NEXT: [[TMP:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ getelementptr inbounds ([[MYSTRUCT:%.*]], ptr @var, i64 0, i32 0), [[SELECT]] ]
%call = call i8 addrspace(200)* @strcpy(i8 addrspace(200)* %dst, i8 addrspace(200)* getelementptr inbounds ([17 x i8], [17 x i8] addrspace(200)* @str, i64 0, i64 0))
store i16 1, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 0) to i16*), align 2
bitcast ({{.*}} %x to i8*)
getelementptr ({{.*}} %x, i32 0, i32 0)
getelementptr ({{.*}}* %x, i32 0, i32 0)
getelementptr (<{ i32, i32 }>, <{ i32, i32 }>* @_MergedGlobals.1, i32 0, i32 0)
%res = bitcast ptr %arg to ptr, !dbg !0
use(%res)
use(%res_suffix)
%res = something
use(%res)
%1 = bitcast ptr %arg to ptr
%2 = foo
use(ptr %0, ptr %1, ptr %2)
; CHECK: [[PF:%.*]] = bitcast ptr [[P]] to ptr
; CHECK-NEXT: [[V2F:%.*]] = load float, ptr [[PF]], align 4
; CHECK-NEXT: %[[X:%.*]] = bitcast ptr %p to ptr
; CHECK-NEXT: %[[Y:%.*]] = load float, ptr %[[X]], align 4
; CHECK-NEXT: [[X2:%.*]] = bitcast ptr [[X1:%.*]] to ptr
; CHECK-NEXT: load float, ptr [[X1]], align 4
; CHECK-NEXT: load float, ptr [[X2]], align 4
; CHECK-NEXT: load float, ptr [[X2]], align 4
; CHECK-NEXT: [[V:%[^ ]+]] = bitcast ptr %p to ptr
; CHECK-NEXT: [[W:%[^ ]+]] = load float, ptr [[V]], align 4
; CHECK-NEXT: %[[Q:[^ ]+]] = bitcast ptr %p to ptr
; CHECK-NEXT: %[[R:[^ ]+]] = load float, ptr %[[Q]], align 4
%q = bitcast i8* %p to [[AGG:{ i32 }]]*
getelementptr [[AGG]], [[AGG]]* %q, i64 1
[[B:%[a-z]*]]
INPUT;
$expected = <<<'EXPECTED'
ptr
ptr addrspace(1)
ptr addrspace(2)
ptr
ptr
ptr
ptr
ptr
ptr
ptr
ptr addrspace(1)
ptr
ptr
ptr
ptr
ptr
ptr
ptr %x
ptr @x
ptr{{[a-z_ ]*}}
; CHECK: declare void @llvm.foo.p0.p1() attrs
// FOO: declare void @llvm.foo.p0.p1() attrs
declare void @llvm.foo.p0.p1() attrs
declare void @llvm.bar.v8p0()
declare void @llvm.bar.nxv8p0()
declare void @llvm.bar.p256()
declare void @llvm.bar.p0.p0()
; CHECK-NEXT: [[TMP:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ @var, [[SELECT]] ]
%call = call ptr addrspace(200) @strcpy(ptr addrspace(200) %dst, ptr addrspace(200) @str)
store i16 1, ptr @b, align 2
%x
%x
%x
@_MergedGlobals.1
use(%arg)
use(%res_suffix)
%res = something
use(%res)
%1 = foo
use(ptr %0, ptr %arg, ptr %1)
; CHECK: [[V2F:%.*]] = load float, ptr [[P]], align 4
; CHECK-NEXT: %[[Y:%.*]] = load float, ptr %p, align 4
; CHECK-NEXT: load float, ptr [[X1:%.*]], align 4
; CHECK-NEXT: load float, ptr [[X1]], align 4
; CHECK-NEXT: load float, ptr [[X1]], align 4
; CHECK-NEXT: [[W:%[^ ]+]] = load float, ptr %p, align 4
; CHECK-NEXT: %[[R:[^ ]+]] = load float, ptr %p, align 4
getelementptr [[AGG:{ i32 }]], ptr %p, i64 1
[[B:%[a-z]*]]
EXPECTED;
$result = opaquify($input, isIR: true);
if ($result !== $expected) {
echo "Self-test (IR) failed:\n$result\n";
return false;
}
// Fixture 2: processed as non-IR, so only `//` comments are rewritten.
$input = <<<'INPUT'
void *foo(void *);
// CHECK: i8*
// CHECK-NEXT: bitcast ptr {{.*}} to ptr
// CHECK-NEXT: bitcast ptr
// CHECK-NEXT: bitcast
// CHECK-NEXT: bitcast <2 x i32> %x to <4 x i16>
// FOO: bitcast ptr
INPUT;
$expected = <<<'EXPECTED'
void *foo(void *);
// CHECK: ptr
// CHECK-NEXT: bitcast <2 x i32> %x to <4 x i16>
EXPECTED;
$result = opaquify($input, isIR: false);
if ($result !== $expected) {
echo "Self-test (C) failed:\n$result\n";
return false;
}
return true;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment