Skip to content

Instantly share code, notes, and snippets.

@spezold
Last active April 14, 2022 09:00
Show Gist options
  • Save spezold/3b00ea130aa4db0540eb9a82719ef8d5 to your computer and use it in GitHub Desktop.
Save spezold/3b00ea130aa4db0540eb9a82719ef8d5 to your computer and use it in GitHub Desktop.
Find all modules that are imported by the given project, list the code files (*.py, *.ipynb) that use them, and try to distinguish between STL and non-STL modules.
"""
CAUTION: Make sure that
1. this file is placed in the root directory of the project of interest
(or otherwise, adjust `BASE_DIR` accordingly);
2. the file is run in the same Python environment (conda environment, poetry environment, ...)
as the project of interest (so activate the corresponding environment first, if necessary).
"""
from collections import defaultdict
from importlib.util import find_spec
import json
from pathlib import Path
import re
import subprocess
import sys
from typing import Dict, List, Optional
# Regular expressions for spotting import statements, applied to one source line at a time.
# Both capture only the first run of word characters after the keyword, i.e. the top-level
# package name (`os` for `import os.path` or `from os.path import join`).
# Relative imports (`from . import x`, `from .pkg import x`) do not match QUERY_FROM, because
# the leading dot is not a word character.
QUERY_IMPORT = r"^\s*import\s+(\w+)" # import ...
QUERY_FROM = r"^\s*from\s+(\w+).*import" # from ... import ...
def all_py_files_for(base_dir: Path) -> List[Path]:
    """Return all ``*.py`` files below `base_dir` (recursively) in sorted order.

    The file containing this function is left out, so that the script does not
    report its own imports.
    """
    this_script = Path(__file__).resolve()
    candidates = base_dir.glob("**/*.py")
    return sorted(py_file for py_file in candidates if py_file.resolve() != this_script)
def all_ipynb_files_for(base_dir: Path) -> List[Path]:
    """Return all ``*.ipynb`` files below `base_dir` (recursively) in sorted order."""
    notebooks = list(base_dir.glob("**/*.ipynb"))
    notebooks.sort()
    return notebooks
def all_code_lines_for_py_file_at(p: Path) -> List[str]:
    """Read the UTF-8 encoded file at `p` and return its content split at newlines.

    The returned lines carry no line terminator; a file ending in a newline
    therefore yields a trailing empty string.
    """
    with p.open(encoding="utf-8") as source_file:
        content = source_file.read()
    return content.split("\n")
def all_code_lines_for_ipynb_file_at(p: Path) -> List[str]:
    """Return the source lines of all code cells of the Jupyter notebook at `p`.

    Cells whose ``cell_type`` is not ``"code"`` (markdown, raw) are ignored.
    Lines are returned as stored in the notebook, i.e. they usually still carry
    their trailing newline.

    :param p: path to a UTF-8 encoded ``*.ipynb`` (JSON) file
    :return: flat list of code lines, in cell order
    :raises KeyError: if the notebook has no top-level ``"cells"`` entry or a cell
        lacks ``"cell_type"``/``"source"``
    """
    notebook = json.loads(p.read_text(encoding="utf-8"))
    code_lines: List[str] = []
    for cell in notebook["cells"]:
        if cell["cell_type"] != "code":
            continue
        source = cell["source"]
        if isinstance(source, str):
            # The notebook format allows `source` to be one multi-line string instead of a
            # list of lines; iterating over it directly would yield single characters.
            source = source.splitlines(keepends=True)
        code_lines.extend(source)
    return code_lines
def all_modules_in(code_lines: List[str]) -> List[str]:
    """Return the top-level names of all modules imported in the given code lines.

    Each line is checked against `QUERY_IMPORT` (``import ...``) and `QUERY_FROM`
    (``from ... import ...``). For plain ``import`` statements, every module of a
    comma-separated list is reported (``import os, sys as system`` gives ``os`` and
    ``sys``), not just the first one.

    This is still a line-based heuristic: imports continued over several lines via
    a backslash are only seen up to the line break, and text that merely looks like
    an import (e.g. inside a docstring) is reported as well.

    :param code_lines: source lines, with or without trailing newline
    :return: module names in order of appearance; duplicates are kept
    """
    modules: List[str] = []
    for ln in code_lines:
        import_match = re.search(QUERY_IMPORT, ln)
        if import_match:
            # Take the text from the first module name onwards and cut it at a trailing
            # comment or a following statement, then look at each comma-separated clause.
            remainder = re.split(r"[#;]", ln[import_match.start(1):], maxsplit=1)[0]
            for clause in remainder.split(","):
                # First word of the clause is the top-level package; this drops both
                # submodule paths (`os.path`) and aliases (`numpy as np`).
                name_match = re.match(r"\s*(\w+)", clause)
                if name_match:
                    modules.append(name_match.group(1))
        from_match = re.search(QUERY_FROM, ln)
        if from_match:
            modules.append(from_match.group(1))
    return modules
def module_by_file_in(base_dir: Path) -> Dict[str, List[Path]]:
    """Map each imported module name to the code files below `base_dir` that import it.

    Both ``*.py`` files and the code cells of ``*.ipynb`` notebooks are scanned.

    :return: regular dict with keys (module names) in sorted order; each value is the
        sorted, duplicate-free list of importing file paths
    """
    importing_files = defaultdict(list)
    # Pair each file finder with the matching line reader; Python files are handled first
    scanners = (
        (all_py_files_for, all_code_lines_for_py_file_at),
        (all_ipynb_files_for, all_code_lines_for_ipynb_file_at),
    )
    for find_files, read_lines in scanners:
        for code_file in find_files(base_dir):
            for name in all_modules_in(read_lines(code_file)):
                importing_files[name].append(code_file)
    return {name: sorted(set(importing_files[name])) for name in sorted(importing_files)}
def is_part_of_stl(module_name: str, base_dir: Optional[Path]) -> bool:
    """Tell whether the module called `module_name` belongs to the Python standard library.

    On Python >= 3.10 the answer comes from ``sys.stdlib_module_names`` and is reliable.
    CAUTION: on older versions it is a heuristic based on where the module is found in
    the *current* environment: built-in and frozen modules count as STL; modules that
    cannot be found, that live below `base_dir`, or that live in a ``site-packages`` /
    ``dist-packages`` directory do not; everything else is assumed to be STL.

    :param module_name: top-level module name (no dots)
    :param base_dir: root directory of the project being analyzed, used to recognize the
        project's own modules; ``None`` disables that check
    :return: ``True`` if the module is (believed to be) part of the standard library
    """
    stdlib_module_names = getattr(sys, "stdlib_module_names", None)  # Present in Python >= 3.10 only
    if stdlib_module_names is not None:
        return module_name in stdlib_module_names

    # --- Python < 3.10: heuristic ---
    if module_name in sys.builtin_module_names:  # For these we can still be sure, everything else is guessing
        return True
    try:
        module_spec = find_spec(module_name)
    except (ImportError, ValueError):
        # `find_spec` raises ValueError for an already imported module whose `__spec__` is None;
        # without a spec there is nothing to classify, so guess "not STL" rather than crash
        return False
    if not module_spec:  # Should only happen for modules that are imported but not installed (so cannot be STL)
        return False
    origin = module_spec.origin
    if not origin:  # Seems like modules from the current code base can have origin None
        return False
    if origin in ("built-in", "builtin", "frozen"):
        # "frozen" is the origin of the interpreter's own frozen bootstrap modules; treating it as
        # a file path (as done for all other origins below) would be wrong
        return True
    # From now on, `origin` is treated as a file path. It is deliberately not required to exist as
    # a regular file (modules inside a zip archive do not), since only its components matter below.
    origin_path = Path(origin).resolve()
    # Exclude modules from the current code base. Checking the parents of the *file* (rather than
    # of its directory) also catches modules that sit directly in `base_dir`.
    if base_dir is not None and base_dir.resolve() in origin_path.parents:
        return False
    # Exclude installed (third-party) modules
    return all(p not in origin_path.parent.parts for p in ("site-packages", "dist-packages"))
if __name__ == "__main__":
    # Report all modules imported by the code files below `BASE_DIR`, the files that import them,
    # and a split into standard library (STL) and other modules.
    BASE_DIR = Path(__file__).parent  # TODO: This could be a command line argument
    print(f"Using {BASE_DIR} as base directory.")
    try:
        # Pass the command as an argument list: a plain string without `shell=True` is taken as the
        # name of the executable on POSIX, which fails with FileNotFoundError and would silently
        # skip the git state there.
        git_command = ["git", "describe", "--all"]
        git_command_text = " ".join(git_command)
        # Discard git's stderr; failures are signalled via the exit code and handled below
        git_output = subprocess.check_output(git_command, cwd=BASE_DIR, stderr=subprocess.DEVNULL)
        git_state = git_output.strip().decode()
        print(f"Git state ('{git_command_text}'): {git_state}")
    except (FileNotFoundError, subprocess.CalledProcessError):
        pass  # Either git is not installed, or not in a git repo
    print()
    m_by_f = module_by_file_in(BASE_DIR)
    for key, val in m_by_f.items():
        print(f"Module '{key}' used in:")
        print("\n".join(f"- {p}" for p in val))
    # Classify every module once and keep the (sorted) key order within both groups
    stl_modules = []
    non_stl_modules = []
    for m in m_by_f:
        (stl_modules if is_part_of_stl(m, base_dir=BASE_DIR) else non_stl_modules).append(m)
    print("\nPart of STL:")
    print("\n".join(stl_modules))
    print("\nNot part of STL:")
    print("\n".join(non_stl_modules))
    if "importlib" in m_by_f:
        # Imports made programmatically via `importlib` are invisible to the line-based scan
        print(f"\nCAUTION: The following source files use 'importlib' and thus should be manually checked for "
              "additional imports:")
        print("\n".join(f"- {p}" for p in m_by_f["importlib"]))
@spezold
Copy link
Author

spezold commented Apr 14, 2022

Known issues

  • The regexes for the import statements are not perfect: for example, multiple imports on the same line and relative imports are not handled. If you want to fix this yourself, you should parse the Python code using the built-in ast module (i.e. Python's own tooling) instead. The answer to this stackoverflow question can serve as the basis.
  • Distinguishing between STL and non-STL modules is fully reliable only from Python 3.10 onward, where sys.stdlib_module_names is available. For versions prior to 3.10, I implemented a heuristic that works reliably in my projects, but I may have overlooked some special cases. If something looks wrong in this respect, have a look at the is_part_of_stl() function and adapt it.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment