Created
January 5, 2022 07:09
-
-
Save nascheme/553576e857f89083423858f174e23d10 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* In-place path normalisation. Returns the start of the normalized | |
path, which will be within the original buffer. Guaranteed to not | |
make the path longer, and will not fail. 'size' is the length of | |
the path, if known. If -1, the first null character will be assumed | |
to be the end of the path. */ | |
wchar_t * | |
_Py_normpath(wchar_t *path, Py_ssize_t size) | |
{ | |
if (!path[0] || size == 0) { | |
return path; | |
} | |
if (size < 0) { | |
size = wcslen(path); | |
} | |
#define CHAR(idx) ((idx >= 0 && idx < size) ? path[idx] : 0) | |
#ifdef ALTSEP | |
#define IS_SEP(idx) (CHAR(idx) == SEP || CHAR(idx) == ALTSEP) | |
#else | |
#define IS_SEP(idx) (CHAR(idx) == SEP) | |
#endif | |
#define SEP_OR_END(idx) (IS_SEP(idx) || CHAR(idx) == 0) | |
Py_ssize_t i = 0; // path read index | |
Py_ssize_t j = 0; // path write index | |
// POSIX allows one or two initial slashes, but treats three or more as | |
// single slash. | |
//( see http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13) | |
bool initial_slashes = IS_SEP(i); | |
if (initial_slashes) { | |
path[j++] = path[i++]; | |
if (IS_SEP(i) && !IS_SEP(i+1)) { | |
// double slash case, keep both of them | |
path[j++] = path[i++]; | |
} | |
} | |
int num_comps = 0; // num of components in output path | |
// loop through each component of path | |
while (i < size) { | |
if (IS_SEP(i)) { | |
if (IS_SEP(j-1)) { | |
// slash after another slash, drop it | |
i++; | |
} | |
else { | |
// slash after path component, add canonicalized slash | |
if (j > 0) { | |
path[j++] = SEP; | |
num_comps++; | |
} | |
i++; | |
} | |
continue; | |
} | |
// check if component is "." | |
bool dot = CHAR(i) == L'.'; | |
if (dot) { | |
if (SEP_OR_END(i+1)) { | |
// discard single dot path component | |
i += 1; | |
continue; | |
} | |
} | |
// check if component is ".." | |
bool remove_last = false; | |
bool last_dotdot = false; | |
bool dotdot = (dot && CHAR(i+1) == L'.' && SEP_OR_END(i+2)); | |
if (dotdot) { | |
// complex case, we can't remove parent component if it is .. | |
last_dotdot = CHAR(j-2) == L'.' && CHAR(j-3) == L'.' && SEP_OR_END(j-4); | |
remove_last = (initial_slashes || num_comps) && (!num_comps || !last_dotdot); | |
} | |
if (remove_last) { | |
// remove last component of path | |
if (num_comps > 0) { | |
j--; // remove trailing slash | |
while (j > 0 && !IS_SEP(j-1)) { | |
// remove characters of last component | |
j--; | |
} | |
num_comps--; | |
} | |
// skip the .. | |
i += 2; | |
continue; | |
} | |
else { | |
// component is not .. or we can't remove parent, add component | |
while (true) { | |
path[j++] = path[i++]; | |
if (SEP_OR_END(i)) { | |
break; | |
} | |
} | |
continue; | |
} | |
} | |
if (num_comps > 0) { | |
// remove trailing slashes | |
while (IS_SEP(j-1)) { | |
--j; | |
} | |
} | |
if (j < size) { | |
path[j] = L'\0'; | |
} | |
#undef CHAR | |
#undef SEP_OR_END | |
#undef IS_SEP | |
return path; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment