Skip to content

Instantly share code, notes, and snippets.

@abravalheri
Last active April 20, 2022 15:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abravalheri/7a9f184b6034710684be1da803675140 to your computer and use it in GitHub Desktop.
Save abravalheri/7a9f184b6034710684be1da803675140 to your computer and use it in GitHub Desktop.
How to implement Namespace packages via `MetaPathFinder`?

Please note that the implementations here are simple PoCs and are subject to implementation errors, bugs, oversimplifications, etc.

Preliminary Findings

It seems that the underlying importlib machinery will automatically create an adequate namespace loader if the spec is created with is_package=True and loader=None.

>>> import sys
>>> import importlib
>>> from importlib.machinery import ModuleSpec
>>>
>>>
>>> class _MyMetaPathFinder:
...     @classmethod
...     def find_spec(cls, fullname, path, target=None):
...         if fullname == "namespacepkg":
...             spec = ModuleSpec(fullname, None, is_package=True)
...             spec.submodule_search_locations = ["/tmp"]
...             return spec
...
>>>
>>> sys.meta_path.append(_MyMetaPathFinder)
>>> namespacepkg = importlib.import_module("namespacepkg")
>>> namespacepkg
<module 'namespacepkg' (namespace)>
>>> namespacepkg.__loader__
<_frozen_importlib_external._NamespaceLoader object at 0x7f231a0e1cd0>
>>> namespacepkg.__spec__
ModuleSpec(name='namespacepkg', loader=<_frozen_importlib_external._NamespaceLoader object at 0x7f231a0e1cd0>, submodule_search_locations=['/tmp'])
>>> namespacepkg.__path__
['/tmp']

However, namespacepkg.__path__ and namespacepkg.__spec__.submodule_search_locations differ from what would be obtained when a namespace package actually exists in the file system:

>>> import sys
>>> from pathlib import Path
>>>
>>> Path("/tmp/workdir/experiment1/namespacepkg").mkdir(exist_ok=True, parents=True)
>>> sys.path.append("/tmp/workdir/experiment1")
>>> import namespacepkg
>>> namespacepkg.__spec__
ModuleSpec(name='namespacepkg', loader=<_frozen_importlib_external._NamespaceLoader object at 0x7ffd1dfed580>, submodule_search_locations=_NamespacePath(['/tmp/workdir/experiment1/namespacepkg']))
>>> namespacepkg.__path__
_NamespacePath(['/tmp/workdir/experiment1/namespacepkg'])

Here we can see the underlying importlib machinery using _NamespacePath.

Approach A: Custom __path__ emulating the dynamic behaviour of _NamespacePath

Based on the preliminary findings, it seems that the key to dynamic path computation is the _NamespacePath class; therefore, we can try to emulate its behaviour with a custom class.

The language reference at https://docs.python.org/3/reference/import.html#namespace-packages describes this behavior as:

Namespace packages do not use an ordinary list for their __path__ attribute. They instead use a custom iterable type which will automatically perform a new search for package portions on the next import attempt within that package if the path of their parent package (or sys.path for a top level package) changes.

We can implement this roughly as follows:

>>> import importlib, pkgutil, sys
>>> from collections.abc import Sequence
>>> from importlib.machinery import ModuleSpec
>>>
>>>
>>> NAMESPACES = {
...     "parent": ["/tmp/workdir/project1"],
...     "parent.child": ["/tmp/workdir/project1/parent"]
... }
>>>
>>>
>>> class _MyMetaPathFinder:
...     @classmethod
...     def find_spec(cls, fullname, path, target=None):
...         if fullname in NAMESPACES:
...             spec = ModuleSpec(fullname, None, is_package=True)
...             spec.submodule_search_locations = _MyNamespacePath(fullname, NAMESPACES[fullname])
...             return spec
...     @classmethod
...     def invalidate_caches(cls):
...         _MyNamespacePath._epoch += 1
...
>>>
>>> class _MyNamespacePath(Sequence):
...     _epoch = 0
...     def __init__(self, name, paths):
...         self._name = name
...         self._orig_paths = _ordered_set(paths)
...         self._cache = _ordered_set(self._orig_paths)
...         self._last_epoch = self._epoch
...     def _refresh(self):
...         if self._epoch != self._last_epoch:
...             self._cache = self._orig_paths.copy()
...             self._last_epoch = self._epoch
...         fresh = pkgutil.extend_path([], self._name)
...         self._cache.update(_ordered_set(fresh))
...         return self._cache
...     def __len__(self):
...         return len(self._refresh())
...     def __getitem__(self, index):
...         return list(self._refresh())[index]
...     def __repr__(self):
...         return f"{self.__class__.__name__}({list(self._cache)!r})"
...
>>>
>>> _ordered_set = dict.fromkeys  # poor man's ordered set implementation
>>> sys.meta_path.insert(0, _MyMetaPathFinder)
>>> import parent.child
>>> parent.__spec__
ModuleSpec(name='parent', loader=<_frozen_importlib_external._NamespaceLoader object at 0x7f85790a2c70>, submodule_search_locations=_MyNamespacePath(['/tmp/workdir/project1']))
>>> parent.child.__spec__
ModuleSpec(name='parent.child', loader=<_frozen_importlib_external._NamespaceLoader object at 0x7f85790a2b50>, submodule_search_locations=_MyNamespacePath(['/tmp/workdir/project1/parent']))
>>> parent.__path__
_MyNamespacePath(['/tmp/workdir/project1'])
>>> parent.child.__path__
_MyNamespacePath(['/tmp/workdir/project1/parent'])

This implementation, despite not being completely optimised, does pass the tests for dynamic path computation as defined in PEP 420:

>>> # Re-using the same Python REPL session as previous code snippet:
>>> from pathlib import Path
>>> Path("/tmp/workdir/project2/parent/child").mkdir(exist_ok=True, parents=True)
>>> Path("/tmp/workdir/project2/parent/child/two.py").touch()
>>> sys.path.append("/tmp/workdir/project2")
>>> from parent.child import two
>>> two.__spec__
ModuleSpec(name='parent.child.two', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7f85790a2b20>, origin='/tmp/workdir/project2/parent/child/two.py')
>>> parent.__path__
_MyNamespacePath(['/tmp/workdir/project1', '/tmp/workdir/project2/parent'])
>>>
>>> Path("/tmp/workdir/project3/parent/child").mkdir(exist_ok=True, parents=True)
>>> Path("/tmp/workdir/project2/parent/child/three.py").touch()
>>> sys.path.append("/tmp/workdir/project3")
>>> from parent.child import three
>>> three.__spec__
ModuleSpec(name='parent.child.three', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7f8578e92370>, origin='/tmp/workdir/project2/parent/child/three.py')
>>> parent.__path__
_MyNamespacePath(['/tmp/workdir/project1', '/tmp/workdir/project2/parent', '/tmp/workdir/project3/parent'])
>>> parent.child.__path__
_MyNamespacePath(['/tmp/workdir/project1/parent', '/tmp/workdir/project2/parent/child', '/tmp/workdir/project3/parent/child'])

Note that this implementation requires the finder to appear first in sys.meta_path.

Approach B: Use a PathEntryFinder instead of MetaPathFinder

By inspecting the code in CPython's importlib._bootstrap_external, it seems that PathFinder will automatically handle the creation of _NamespacePath if we use a PathEntryFinder instead of a MetaPathFinder.

A candidate implementation would be:

>>> import importlib, pkgutil, sys
>>> from collections.abc import Sequence
>>> from importlib.machinery import ModuleSpec
>>> from pathlib import Path
>>>
>>> PATH_PLACEHOLDER = "__my_namespace__.__path_hook__"
>>> # ^ Arbitrary entry without any correspondence to the file system
>>> #   that will be added to `sys.path` just to trigger the path entry hook
>>>
>>> NAMESPACES = {
...     "parent": ["/tmp/workdir/project1/parent"],
...     "parent.child": ["/tmp/workdir/project1/parent/child"]
... }
>>>
>>>
>>> class _MyPathEntryFinder:
...     @classmethod
...     def find_spec(cls, fullname, path, target=None):
...         if fullname in NAMESPACES:
...             spec = ModuleSpec(fullname, None, is_package=True)
...             spec.submodule_search_locations = NAMESPACES[fullname]
...             return spec
...     @classmethod
...     def _path_hook(cls, path):
...         if path == PATH_PLACEHOLDER:
...             return cls
...         raise ImportError
...
>>>
>>> Path("/tmp/workdir/project1/parent/child").mkdir(exist_ok=True, parents=True)
>>> sys.path.append(PATH_PLACEHOLDER)
>>> sys.path_hooks.append(_MyPathEntryFinder._path_hook)
>>> import parent.child
>>> parent.__spec__
ModuleSpec(name='parent', loader=<_frozen_importlib_external._NamespaceLoader object at 0x7f05c7cf00a0>, submodule_search_locations=_NamespacePath(['/tmp/workdir/project1/parent']))
>>> parent.child.__spec__
ModuleSpec(name='parent.child', loader=<_frozen_importlib_external._NamespaceLoader object at 0x7f05c7cf0400>, submodule_search_locations=_NamespacePath(['/tmp/workdir/project1/parent/child']))
>>> parent.__path__
_NamespacePath(['/tmp/workdir/project1/parent'])
>>> parent.child.__path__
_NamespacePath(['/tmp/workdir/project1/parent/child'])
>>>
>>>
>>> Path("/tmp/workdir/project1/parent/child/one.py").touch()
>>> from parent.child import one
>>> one.__spec__
ModuleSpec(name='parent.child.one', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7f05c7cfb9d0>, origin='/tmp/workdir/project1/parent/child/one.py')
>>>

This implementation, despite not being completely optimised, does pass the tests for dynamic path computation as defined in PEP 420:

>>> # Re-using the same Python REPL session as previous code snippet:
>>> Path("/tmp/workdir/project2/parent/child").mkdir(exist_ok=True, parents=True)
>>> Path("/tmp/workdir/project2/parent/child/two.py").touch()
>>> sys.path.append("/tmp/workdir/project2")
>>> from parent.child import two
>>> two.__spec__
ModuleSpec(name='parent.child.two', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7f05c7c94160>, origin='/tmp/workdir/project2/parent/child/two.py')
>>> parent.__path__
_NamespacePath(['/tmp/workdir/project1/parent', '/tmp/workdir/project2/parent'])
>>>
>>> Path("/tmp/workdir/project3/parent/child").mkdir(exist_ok=True, parents=True)
>>> Path("/tmp/workdir/project2/parent/child/three.py").touch()
>>> sys.path.append("/tmp/workdir/project3")
>>> from parent.child import three
>>> three.__spec__
ModuleSpec(name='parent.child.three', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7f05c7c942b0>, origin='/tmp/workdir/project2/parent/child/three.py')
>>> parent.__path__
_NamespacePath(['/tmp/workdir/project1/parent', '/tmp/workdir/project2/parent', '/tmp/workdir/project3/parent'])

Note that this implementation requires an arbitrary entry to be added to sys.path. Also note that submodule_search_locations has to be set to a non-empty list [1].

Footnotes

  1. This is not easily spotted by inspecting CPython's importlib._bootstrap_external, but can be checked empirically.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment