Skip to content

Instantly share code, notes, and snippets.

@pounard
Last active June 26, 2018 11:24
Show Gist options
  • Save pounard/4c242bf94e36d4e8f8f1ae68021ec45c to your computer and use it in GitHub Desktop.
Save pounard/4c242bf94e36d4e8f8f1ae68021ec45c to your computer and use it in GitHub Desktop.
Path normalization, equivalent of Python's os.path.normpath() and os.path.abspath()
<?php
// Build 100 pseudo-random path strings that feed the benchmark loops.
// The vocabulary deliberately repeats '..', '/' and '' so that the generated
// paths contain parent references and runs of consecutive slashes.
$words = [
    '..', 'test', 'pouet', '/', '..', '', 'usr', 'bin', 'firefox',
    'do not', 'panic', 'secret', 'perso', 'private', 'foo', '/', '..', '',
    'hyper long word', 'word with space', 'some56457', '@ert', '.ezrzer',
    'cassoulet', '/', '..', '',
];
$lastWord = count($words) - 1;
$testData = [];
while (count($testData) < 100) {
    $picked = [];
    // One draw in 0..20, minus one: this yields between 0 and 19 segments.
    for ($remaining = rand(0, 20) - 1; $remaining > 0; --$remaining) {
        $picked[] = $words[rand(0, $lastWord)];
    }
    $path = implode('/', $picked);
    // Roughly one path in five gets a leading slash...
    $path = (rand(0, 4) < 1 ? '/' : '').$path;
    // ...and, independently, roughly one in five gets a trailing slash.
    $path .= (rand(0, 4) < 1 ? '/' : '');
    $testData[] = $path;
}
print_r($testData);
// Expectation table: input path => normalized form both implementations
// are checked against.
$test = [
    // Parent directory references ('..')
    'a/b/..'                        => 'a',
    'https://a/b/../'               => 'https://a',
    '/a/b/c/d/../e/f'               => '/a/b/c/e/f',
    'a/b/c/../../e/f'               => 'a/e/f',
    'ftp://a/../b/../c/../e/f'      => 'ftp://e/f',
    'a../b/c../d..e/'               => 'a../b/c../d..e',
    '../c/d'                        => '../c/d',

    // Runs of consecutive slashes
    '/a/b/////c/d/../e/f'           => '/a/b/c/e/f',
    'file:////a/b/c//../..//e/f'    => 'file:///a/e/f',
    '////a/../b/../c//../e/f'       => '/e/f',
    'a../b//c../d..e/'              => 'a../b/c../d..e',
    '../c////d'                     => '../c/d',

    // Current directory references ('.') and dot-prefixed names
    'a/b/./././..'                  => 'a',
    'a/.b/./../'                    => 'a',
    '/a/b/.c/d/../e/f'              => '/a/b/.c/e/f',
    '.a/./b/c/.././../e./f'         => '.a/e./f',

    // Degenerate inputs
    '/'                             => '/',
    '.'                             => '.',
    '..'                            => '..',
    '/..'                           => '..', // Invalid: climbs above the root
    './'                            => '.',
    '../'                           => '..',
    '/.'                            => '/',
];
/**
 * Lexically normalize a path using preg_replace().
 *
 * The path is never checked against the file system. Processing steps:
 *  - on platforms whose DIRECTORY_SEPARATOR is not '/', separators are
 *    converted to '/';
 *  - a leading "scheme://" prefix (when '://' is not at offset 0) is set
 *    aside and re-attached untouched at the end;
 *  - runs of slashes are collapsed and trailing slashes removed;
 *  - '.' segments are dropped;
 *  - every "name/.." pair is removed, where "name" is any segment other
 *    than '..' itself, so leading '..' segments of a relative path survive.
 *
 * Edge cases:
 *  - a string without any '/' is returned unchanged;
 *  - a string made only of several slashes yields '' (a single '/' is kept);
 *  - an absolute path that collapses entirely yields '/';
 *  - a relative path without scheme that collapses entirely yields '.';
 *  - a '..' directly under the root is left in place ('/..' stays '/..').
 *
 * @param string $string Path, optionally prefixed with "scheme://".
 *
 * @return string The normalized path.
 */
function normalizePath($string)
{
    // Handle windows gracefully
    if (\DIRECTORY_SEPARATOR !== '/') {
        $string = \str_replace(\DIRECTORY_SEPARATOR, '/', $string);
    }

    // Without a separator there is nothing to resolve (this also covers
    // '.' and '..'); a lone '/' is already in normal form.
    if (false === \strpos($string, '/') || '/' === $string) {
        return $string;
    }

    // Input made only of slashes collapses to an empty string.
    if ('' === ($string = \rtrim($string, '/'))) {
        return '';
    }

    // Detach "scheme://". The separator is kept inside $scheme so a scheme
    // such as "0" is not lost to a truthiness test later on.
    $scheme = '';
    $position = \strpos($string, '://');
    if (false !== $position && 0 < $position) {
        $scheme = \substr($string, 0, $position + 3);
        $string = \substr($string, $position + 3);
    }

    // $string cannot be empty here: it does not end with '/', so at least
    // one character follows the scheme separator.
    $absolute = '/' === $string[0];

    // Collapse slash runs first so the patterns below only ever meet
    // single separators.
    $string = \preg_replace('@//+@', '/', $string);

    // Drop '.' segments. The lookbehind anchors the match at the start of a
    // segment, so names like '.a' or 'e.' are left alone.
    $string = \preg_replace('@(?<![^/])\.(?:/|$)@', '', $string);

    // Remove "name/.." pairs until none is left. The lookahead refuses to
    // treat '..' as a removable name, so '../..' is preserved.
    $count = 0;
    do {
        $string = \preg_replace('@(?<![^/])(?!\.\./)[^/]+/\.\.(?:/|$)@', '', $string, -1, $count);
    } while ($count);

    // The replacements above can leave a trailing '/'.
    $string = \rtrim($string, '/');

    if ('' === $string) {
        if ($absolute) {
            // Everything collapsed: what remains is the root itself.
            $string = '/';
        } elseif ('' === $scheme) {
            // A fully collapsed relative path designates the current directory.
            $string = '.';
        }
    }

    return $scheme.$string;
}
/**
 * Lexically normalize a path by walking its segments with a stack.
 *
 * Same contract as the preg_replace() based variant, implemented the way
 * Python's os.path.normpath() works: the path is split on '/', each
 * segment is visited once, and '..' pops the previously kept segment.
 * The path is never checked against the file system.
 *
 * Rules:
 *  - on platforms whose DIRECTORY_SEPARATOR is not '/', separators are
 *    converted to '/';
 *  - a leading "scheme://" prefix (when '://' is not at offset 0) is set
 *    aside and re-attached untouched, including in front of an absolute
 *    remainder ("file:///a");
 *  - empty segments (slash runs) and '.' segments are skipped;
 *  - '..' removes the previously kept segment unless there is none or it
 *    is itself '..', in which case the '..' is kept.
 *
 * Edge cases:
 *  - a string without any '/' is returned unchanged;
 *  - a string made only of several slashes yields '' (a single '/' is kept);
 *  - an absolute path that collapses entirely yields '/';
 *  - a relative path without scheme that collapses entirely yields '.';
 *  - a '..' directly under the root is left in place ('/..' stays '/..').
 *
 * @param string $string Path, optionally prefixed with "scheme://".
 *
 * @return string The normalized path.
 */
function normalizePath2($string)
{
    // Handle windows gracefully
    if (\DIRECTORY_SEPARATOR !== '/') {
        $string = \str_replace(\DIRECTORY_SEPARATOR, '/', $string);
    }

    // Without a separator there is nothing to resolve (this also covers
    // '.' and '..'); a lone '/' is already in normal form.
    if (false === \strpos($string, '/') || '/' === $string) {
        return $string;
    }

    // Input made only of slashes collapses to an empty string.
    if ('' === ($string = \rtrim($string, '/'))) {
        return '';
    }

    // Detach "scheme://". The separator is kept inside $scheme so a scheme
    // such as "0" is not lost to a truthiness test later on.
    $scheme = '';
    $position = \strpos($string, '://');
    if (false !== $position && 0 < $position) {
        $scheme = \substr($string, 0, $position + 3);
        $string = \substr($string, $position + 3);
    }

    // $string cannot be empty here: it does not end with '/', so at least
    // one character follows the scheme separator.
    $absolute = '/' === $string[0];

    $kept = [];
    foreach (\explode('/', $string) as $segment) {
        if ('' === $segment || '.' === $segment) {
            // No-op values
            continue;
        }
        if ('..' === $segment && [] !== $kept && '..' !== \end($kept)) {
            // Back in hierarchy: drop the previous real segment
            \array_pop($kept);
            continue;
        }
        // Regular name, or a '..' that has nothing left to cancel
        $kept[] = $segment;
    }

    $path = ($absolute ? '/' : '').\implode('/', $kept);

    // A fully collapsed relative path designates the current directory.
    if ('' === $path && '' === $scheme) {
        return '.';
    }

    return $scheme.$path;
}
// Correctness pass for the preg_replace() implementation: print one
// OK/FAIL line per entry of the expectation table.
echo "\n\nnormalizePath()\n";
foreach ($test as $string => $expected) {
    $ret = normalizePath($string);
    echo ($ret === $expected)
        ? "OK $string -> $ret\n"
        : "FAIL $string -> $ret (expected: $expected)\n";
}

// Same pass for the segment-walking implementation.
echo "\n\nnormalizePath2()\n";
foreach ($test as $string => $expected) {
    $ret = normalizePath2($string);
    echo ($ret === $expected)
        ? "OK $string -> $ret\n"
        : "FAIL $string -> $ret (expected: $expected)\n";
}

// Benchmark: 1000 rounds over the generated paths, wall-clock timed.
echo "\n\nnormalizePath() performance... ";
$time = microtime(true);
for ($i = 0; $i < 1000; ++$i) {
    foreach ($testData as $string) {
        $ret1 = normalizePath($string);
    }
}
$elapsed = microtime(true) - $time;
echo $elapsed," sec for 100,000 calls\n";

echo "\n\nnormalizePath2() performance... ";
$time = microtime(true);
for ($i = 0; $i < 1000; ++$i) {
    foreach ($testData as $string) {
        $ret1 = normalizePath2($string);
    }
}
$elapsed = microtime(true) - $time;
echo $elapsed," sec for 100,000 calls\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment