Skip to content

Instantly share code, notes, and snippets.

@nascheme
Created January 5, 2022 07:09
Show Gist options
  • Save nascheme/553576e857f89083423858f174e23d10 to your computer and use it in GitHub Desktop.
Save nascheme/553576e857f89083423858f174e23d10 to your computer and use it in GitHub Desktop.
/* In-place path normalisation. Returns the start of the normalized
path, which will be within the original buffer. Guaranteed to not
make the path longer, and will not fail. 'size' is the length of
the path, if known. If -1, the first null character will be assumed
to be the end of the path. */
wchar_t *
_Py_normpath(wchar_t *path, Py_ssize_t size)
{
if (!path[0] || size == 0) {
return path;
}
if (size < 0) {
size = wcslen(path);
}
#define CHAR(idx) ((idx >= 0 && idx < size) ? path[idx] : 0)
#ifdef ALTSEP
#define IS_SEP(idx) (CHAR(idx) == SEP || CHAR(idx) == ALTSEP)
#else
#define IS_SEP(idx) (CHAR(idx) == SEP)
#endif
#define SEP_OR_END(idx) (IS_SEP(idx) || CHAR(idx) == 0)
Py_ssize_t i = 0; // path read index
Py_ssize_t j = 0; // path write index
// POSIX allows one or two initial slashes, but treats three or more as
// single slash.
//( see http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13)
bool initial_slashes = IS_SEP(i);
if (initial_slashes) {
path[j++] = path[i++];
if (IS_SEP(i) && !IS_SEP(i+1)) {
// double slash case, keep both of them
path[j++] = path[i++];
}
}
int num_comps = 0; // num of components in output path
// loop through each component of path
while (i < size) {
if (IS_SEP(i)) {
if (IS_SEP(j-1)) {
// slash after another slash, drop it
i++;
}
else {
// slash after path component, add canonicalized slash
if (j > 0) {
path[j++] = SEP;
num_comps++;
}
i++;
}
continue;
}
// check if component is "."
bool dot = CHAR(i) == L'.';
if (dot) {
if (SEP_OR_END(i+1)) {
// discard single dot path component
i += 1;
continue;
}
}
// check if component is ".."
bool remove_last = false;
bool last_dotdot = false;
bool dotdot = (dot && CHAR(i+1) == L'.' && SEP_OR_END(i+2));
if (dotdot) {
// complex case, we can't remove parent component if it is ..
last_dotdot = CHAR(j-2) == L'.' && CHAR(j-3) == L'.' && SEP_OR_END(j-4);
remove_last = (initial_slashes || num_comps) && (!num_comps || !last_dotdot);
}
if (remove_last) {
// remove last component of path
if (num_comps > 0) {
j--; // remove trailing slash
while (j > 0 && !IS_SEP(j-1)) {
// remove characters of last component
j--;
}
num_comps--;
}
// skip the ..
i += 2;
continue;
}
else {
// component is not .. or we can't remove parent, add component
while (true) {
path[j++] = path[i++];
if (SEP_OR_END(i)) {
break;
}
}
continue;
}
}
if (num_comps > 0) {
// remove trailing slashes
while (IS_SEP(j-1)) {
--j;
}
}
if (j < size) {
path[j] = L'\0';
}
#undef CHAR
#undef SEP_OR_END
#undef IS_SEP
return path;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment