Skip to content

Instantly share code, notes, and snippets.

@alexeagle
Created July 14, 2020 21:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexeagle/9fd6684e9306cf741f246dd3518a48ec to your computer and use it in GitHub Desktop.
Save alexeagle/9fd6684e9306cf741f246dd3518a48ec to your computer and use it in GitHub Desktop.
"""
Read a poetry lock file and convert to a bazel dependency graph,
then write BUILD.bazel files for each installed dependency.
"""
import textwrap
import os
import re
import sys
def parse_direct_deps(buf):
    """
    Read the pyproject.toml file, which lists direct dependencies of the project.

    Note: avoid a dependency on a toml parser as it's hard to bootstrap deps.

    Args:
        buf: a text file-like object providing readline(), positioned at the
            start of the pyproject.toml content.

    Returns:
        A list of dependency names, in file order, collected from the
        [tool.poetry.dependencies] and [tool.poetry.dev-dependencies] tables.
        The "python" entry is a version constraint rather than a package and
        is left out.
    """
    dep_tables = (
        "[tool.poetry.dependencies]",
        "[tool.poetry.dev-dependencies]",
    )
    # A dependency entry starts with a bare or quoted key followed by "=".
    # Continuation lines of a multi-line value (such as '{version = "1"},'
    # or a closing "]") do not match, so they are not mistaken for packages.
    entry = re.compile(r"""["']?([A-Za-z0-9_.\-]+)["']?\s*=""")

    deps = []
    in_dep_table = False
    for raw_line in iter(buf.readline, ""):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            # Blank lines and comments (indented or not) carry no data
            continue
        if line.startswith("["):
            # Drop a trailing comment before comparing the table name
            header = line.split("#", 1)[0].strip()
            if header.endswith("]"):
                # Any table header closes the previous table
                in_dep_table = header in dep_tables
                continue
        if not in_dep_table:
            continue
        found = entry.match(line)
        if found and found.group(1) != "python":
            deps.append(found.group(1))
    return deps
def parse_dep_graph(buf):
    """
    Read the poetry lock file, which lists the dependencies of each package.

    Note: avoid a dependency on a toml parser as it's hard to bootstrap deps.

    Dependencies are recognised in both layouts a lock file may use:
    "name = constraint" entries inside [package.dependencies], and
    [package.dependencies.<name>] sub-tables whose keys (python, version, ...)
    describe the constraint rather than further packages.

    Args:
        buf: a text file-like object providing readline(), positioned at the
            start of the poetry.lock content.

    Returns:
        A dict mapping each package name to the list of names it depends on.
        Every name that appears as a dependency also gets a key of its own.
    """
    name_line = re.compile(r"\s*name\s=\s\"(.*)\"")
    # An entry starts with a bare or quoted key followed by "="; continuation
    # lines of multi-line values do not match and are ignored.
    dep_line = re.compile(r"""["']?([A-Za-z0-9_.\-]+)["']?\s*=""")
    dep_subtable_prefix = "[package.dependencies."

    dep_graph = {}

    def record(owner, dep):
        dep_graph[owner].append(dep)
        dep_graph.setdefault(dep, [])

    pkg = None
    state = "scan_package"
    for raw_line in iter(buf.readline, ""):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if line == "[[package]]":
            # Forget the previous package until this entry's name is seen
            pkg = None
            state = "scan_name"
        elif line == "[package.dependencies]":
            state = "add_deps" if pkg is not None else "scan_package"
        elif line.startswith(dep_subtable_prefix) and line.endswith("]"):
            # The table name itself is the dependency; its keys are skipped
            if pkg is not None:
                dep = line[len(dep_subtable_prefix):-1].strip().strip("\"'")
                if dep:
                    record(pkg, dep)
            state = "scan_package"
        elif line.startswith("["):
            state = "scan_package"
        elif state == "scan_name":
            name_match = name_line.match(line)
            if name_match:
                pkg = name_match.group(1)
                # setdefault keeps deps gathered from an earlier entry of the
                # same package instead of discarding them
                dep_graph.setdefault(pkg, [])
                state = "scan_deps"
        elif state == "add_deps":
            dep_match = dep_line.match(line)
            if dep_match:
                record(pkg, dep_match.group(1))
    return dep_graph
def to_label(pkg):
    """Return `pkg` rendered as a double-quoted root label, i.e. `"//<pkg>"`."""
    return '"//%s"' % pkg
_HEADER = """\
# Generated by bazel/python/poetry/generate_build_files.py
# as part of poetry_install
load("@rules_python//python:defs.bzl", "py_library")
package(default_visibility = ["//visibility:public"])
"""
def generate_top_level_build(direct_deps):
    """
    This BUILD file appears at the root of the @my_deps workspace

    Args:
        direct_deps: an iterable of lists of top-level import names, one list
            per direct dependency.

    Returns:
        The BUILD file content: the shared header followed by a single
        py_library named "all" whose deps are the sorted, de-duplicated names
        rendered with to_label().
    """
    # Flatten and de-duplicate dependencies
    names = sorted({name for group in direct_deps for name in group})
    # Dedent the template before filling it in, so substituted text can never
    # influence how much indentation is stripped
    template = textwrap.dedent(
        """\
        # This re-exports all of the direct dependencies listed in the pyproject.toml file
        py_library(name = "all", deps = [{}])
        """
    )
    return _HEADER + template.format(", ".join(to_label(name) for name in names))
def generate_pkg_target(
    name, dependencies, pkg_content_kind, comment="Generated target"
):
    """
    This BUILD file appears at the top of the installed packages at
    @my_deps/__sitepkgs__

    Args:
        name: top-level import name; used as the py_library target name and to
            build the srcs/data expressions.
        dependencies: an iterable of lists of target names this one depends on.
            The lists are flattened, de-duplicated and sorted; `name` itself
            is dropped so the target never depends on itself.
        pkg_content_kind: one of "dir", "py_file", "so_file" or "empty".
        comment: text for the comment line above the target. It is replaced by
            a fixed notice when pkg_content_kind is "empty".

    Returns:
        The text of one py_library target, ending with a newline.

    Raises:
        Exception: if pkg_content_kind is not one of the known kinds.
    """
    # (srcs, data) expressions for each kind; "{name}" is filled in below
    attr_templates = {
        "dir": (
            'glob(["{name}/**/*.py"], allow_empty = True)',
            # Workaround bazelbuild/bazel#4327 Runfiles: support paths with spaces
            'glob(["{name}/**/*"], exclude=["{name}/**/*.py", "{name}/**/* *"])',
        ),
        "py_file": ('["{name}.py"]', "[]"),
        "so_file": ("[]", 'glob(["{name}.*.so"])'),
        "empty": ("[]", "[]"),
    }
    if pkg_content_kind not in attr_templates:
        raise Exception("unknown package content kind", pkg_content_kind)
    if pkg_content_kind == "empty":
        comment = (
            "No files were found for this package. It might indicate a bug."
        )
    srcs_template, data_template = attr_templates[pkg_content_kind]

    # Flatten and de-duplicate dependencies
    deps = sorted(
        {dep for group in dependencies for dep in group if dep != name}
    )

    # Each value is substituted exactly once and never re-parsed as a format
    # string, so braces inside `comment` or `name` are emitted literally.
    lines = [
        "# {}".format(comment),
        "py_library(",
        'name = "{}",'.format(name),
        "srcs = {},".format(srcs_template.format(name=name)),
        "data = {},".format(data_template.format(name=name)),
        'imports = ["."],',
        "deps = [{}],".format(", ".join('"{}"'.format(dep) for dep in deps)),
        ")",
    ]
    return "\n".join(lines) + "\n"
def generate_alias_build(top_level, alias_target):
    """
    This BUILD file appears in the package-specific vanity location
    @my_deps/pkg
    so that users can dep on '@my_deps//pkg' rather than '@my_deps//__sitepkgs__/pkg'

    Args:
        top_level: the name given to the alias target.
        alias_target: the label the alias points at.

    Returns:
        The BUILD file content: the shared header followed by one alias rule.
    """
    # Dedent the template before filling it in, so substituted text can never
    # influence how much indentation is stripped
    template = textwrap.dedent(
        """\
        # Convenience alias so you can dep on @my_deps//pkg
        alias(name = "{name}", actual = "{actual}")
        """
    )
    return _HEADER + template.format(name=top_level, actual=alias_target)
def top_level_names(pkg, site_packages):
    """
    Find out what top-level names the package can be imported with
    For example, .venv/lib/python3.8/site-packages/python_dateutil-2.8.1.dist-info/top_level.txt
    tells us that the package is imported as "dateutil"

    Args:
        pkg: the distribution name as written in pyproject.toml / poetry.lock.
        site_packages: path of the directory holding the installed packages.

    Returns:
        The import names listed in the first matching top_level.txt, without
        blank entries, deep imports (containing "/") or names starting with an
        underscore. If no matching metadata directory has a top_level.txt, a
        warning is printed and [pkg] is returned.
    """

    def normalize(dist_name):
        # Runs of "-", "_" and "." are interchangeable in distribution names,
        # and case does not matter (PEP 503)
        return re.sub(r"[-_.]+", "-", dist_name).lower()

    wanted = normalize(pkg)
    for entry in sorted(os.listdir(site_packages)):
        for suffix in (".dist-info", ".egg-info"):
            if entry.endswith(suffix):
                stem = entry[: -len(suffix)]
                break
        else:
            continue
        # Metadata directories are named "<name>-<version>..."; the name part
        # has its own hyphens escaped, so it ends at the first "-"
        if normalize(stem.split("-", 1)[0]) != wanted:
            continue
        top_level_file = os.path.join(site_packages, entry, "top_level.txt")
        if not os.path.exists(top_level_file):
            continue
        with open(top_level_file) as listing:
            candidates = [line.strip() for line in listing]
        # Filter out deep imports like googleapiclient/discovery_cache
        # Also filter out names prefixed with underscore, and blank lines,
        # which would otherwise become empty target names
        return [
            candidate
            for candidate in candidates
            if candidate and "/" not in candidate and not candidate.startswith("_")
        ]
    print(
        "WARNING:",
        pkg,
        "has no top_level.txt in its distribution, assuming it is imported as",
        pkg,
    )
    return [pkg]
def _content_kind(top_level, top_importable):
    """Classify how the import name `top_level` is laid out in site-packages."""
    # Some packages get installed as single file instead of directory
    # We just have to look on disk to see what poetry decided to do
    if top_level in top_importable:
        return "dir"
    if top_level + ".py" in top_importable:
        return "py_file"
    so_prefix = top_level + "."
    if any(
        item.startswith(so_prefix) and item.endswith(".so")
        for item in top_importable
    ):
        # for example pvectorc.cpython-38-darwin.so
        return "so_file"
    return "empty"


def main(argv):
    """
    Generate the BUILD.bazel files for an installed poetry environment.

    Writes three kinds of files, relative to the current working directory:
    ./BUILD.bazel re-exporting the direct dependencies,
    <site_packages>/BUILD.bazel with one py_library per top-level import name,
    and <top_level>/BUILD.bazel aliases pointing into site_packages.

    Args:
        argv: exactly three paths: pyproject.toml, poetry.lock and the
            installed site-packages directory.

    Returns:
        0 on success, or 1 after printing usage when the argument count is wrong.
    """
    if len(argv) != 3:
        print(
            "Usage: generate_build_files.py path/to/pyproject.toml path/to/poetry.lock path/to/installed/site_packages",
            file=sys.stderr,
        )
        return 1
    toml_file, lock_file, site_packages = argv

    with open(toml_file) as toml:
        direct_deps = parse_direct_deps(toml)
    with open(lock_file) as lock:
        dep_graph = parse_dep_graph(lock)

    # The same distribution is looked up once per edge of the graph; remember
    # the answer so site_packages is scanned (and any warning printed) once
    names_cache = {}

    def names_for(dist):
        if dist not in names_cache:
            names_cache[dist] = top_level_names(dist, site_packages)
        return names_cache[dist]

    with open("BUILD.bazel", "w") as top_build:
        top_build.write(
            generate_top_level_build([names_for(dep) for dep in direct_deps])
        )

    # What top-level imports are possible from this site_packages?
    top_importable = os.listdir(site_packages)

    with open(os.path.join(site_packages, "BUILD.bazel"), "w") as pkgs_build:
        pkgs_build.write(_HEADER)
        for pkg, deps in dep_graph.items():
            for top_level in names_for(pkg):
                # Multiple packages can amend the same top-level import;
                # only the first one to claim it generates the targets
                if os.path.exists(top_level):
                    continue
                os.mkdir(top_level)
                with open("%s/BUILD.bazel" % top_level, "w") as alias_build:
                    alias_target = "//%s:%s" % (site_packages, top_level)
                    alias_build.write(
                        generate_alias_build(top_level, alias_target)
                    )

                pkgs_build.write(
                    generate_pkg_target(
                        top_level,
                        # TODO: we only look at the deps for the first package that
                        # contributed to top_level, but it's possible that other packages
                        # with additional deps also contribute to the same top_level
                        [names_for(dep) for dep in deps],
                        _content_kind(top_level, top_importable),
                        "Generated from " + pkg,
                    )
                )
    return 0
# Script entry point: the exit status is whatever main() returns
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
"Repository rule to run poetry to install dependencies during WORKSPACE loading"
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
# Attributes accepted by the poetry_install repository rule
_ATTRS = {
    "pyproject": attr.label(
        mandatory = True,
        doc = "The pyproject.toml file to install",
    ),
    "poetry_lock": attr.label(
        doc = "The poetry.lock file",
    ),
    "quiet": attr.bool(
        doc = "Whether to print the output from poetry",
    ),
    "timeout": attr.int(
        doc = "Maximum duration of the package manager execution in seconds.",
        default = 3600,
    ),
    "python_interpreter": attr.label(
        default = "@python_interpreter//:python_bin",
        doc = "A python interpreter to run poetry under",
    ),
}
# When you run the poetry installer, it creates this BIN entry for your $PATH
# From https://github.com/python-poetry/poetry/blob/1.0.9/get-poetry.py#L200-L218
# And slightly adapted for the lib path which differs in the poetry distribution
#
# This is a str.format template: the single "{}" receives the path of poetry's
# __main__.py, while "{{}}" survives formatting as a literal "{}" for the
# generated script's own .format() call. The body of the "if __name__" block
# must stay indented, otherwise the generated script is not valid Python.
_POETRY_BIN = """
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import glob
import sys
import os
lib = os.path.normpath(os.path.join(os.path.realpath("{}"), "../.."))
vendors = os.path.join(lib, "_vendor")
current_vendors = os.path.join(
    vendors, "py{{}}".format(".".join(str(v) for v in sys.version_info[:2]))
)
sys.path.insert(0, lib)
sys.path.insert(0, current_vendors)
if __name__ == "__main__":
    from poetry.console import main
    main()
"""
def _find_site_packages(repository_ctx):
    """Locates the site-packages directory of the in-project virtualenv.

    The directory is .venv/lib/python<major>.<minor>/site-packages, where the
    version depends on the interpreter the virtualenv was created with, so it
    is discovered on disk instead of being hard-coded.

    Args:
        repository_ctx: the repository rule context.

    Returns:
        The path of the first .venv/lib/python*/site-packages that exists.
    """
    lib_dir = repository_ctx.path(".venv/lib")
    if lib_dir.exists:
        for candidate in lib_dir.readdir():
            if not candidate.basename.startswith("python"):
                continue
            site_packages = candidate.get_child("site-packages")
            if site_packages.exists:
                return site_packages
    fail("poetry_install: poetry did not create .venv/lib/python*/site-packages")

def _impl(repository_ctx):
    """Runs poetry install, then generates BUILD files for what it installed.

    Args:
        repository_ctx: the repository rule context.
    """
    py_interpreter = repository_ctx.path(repository_ctx.attr.python_interpreter)
    poetry_main = repository_ctx.path(Label("@poetry//:__main__.py"))

    # Lay out the working directory (output_base/external/users_deps) so that poetry
    # runs in a project environment it expects.
    repository_ctx.file("_poetry_bin.py", content = _POETRY_BIN.format(poetry_main))
    repository_ctx.symlink(repository_ctx.attr.pyproject, "pyproject.toml")

    # poetry_lock is an optional attribute. Without a lock file, poetry resolves
    # the dependencies itself and writes poetry.lock into this directory.
    if repository_ctx.attr.poetry_lock:
        repository_ctx.symlink(repository_ctx.attr.poetry_lock, "poetry.lock")
    repository_ctx.symlink(Label("//bazel/python/poetry:generate_build_files.py"), "generate_build_files.py")

    poetry_args = ["install", "--no-interaction", "--no-ansi"]

    # OPT: we could expose environment as an attribute
    poetry_env = {
        # Bazel will keep its own copy of the venv so we can write BUILD files there
        # https://python-poetry.org/docs/configuration/#virtualenvsin-project-boolean
        "POETRY_VIRTUALENVS_IN_PROJECT": "true",
        # TODO: maybe we don't want to create virtualenvs at all? need to understand how they are used
        # but if we do this, where does poetry put the downloaded packages?
        # "POETRY_VIRTUALENVS_CREATE": "false",
    }

    repository_ctx.report_progress("Running poetry install on %s" % repository_ctx.attr.pyproject)
    result = repository_ctx.execute(
        [py_interpreter, "_poetry_bin.py"] + poetry_args,
        timeout = repository_ctx.attr.timeout,
        quiet = repository_ctx.attr.quiet,
        environment = poetry_env,
    )
    if result.return_code:
        fail("poetry_install failed:\nSTDOUT:\n%s\nSTDERR:\n%s" % (result.stdout, result.stderr))

    # rules_python doesn't allow hyphens anywhere in the path when referencing files.
    # So we symlink the site-packages to sitepkgs for bazel labels to be unaware of the hyphen
    # See https://github.com/bazelbuild/bazel/issues/9171
    repository_ctx.symlink(_find_site_packages(repository_ctx), "__sitepkgs__")

    repository_ctx.report_progress("Processing site-packages: generating BUILD files")
    result = repository_ctx.execute([
        py_interpreter,
        "generate_build_files.py",
        repository_ctx.path("pyproject.toml"),
        repository_ctx.path("poetry.lock"),
        "__sitepkgs__",
    ])
    if result.return_code:
        fail("generate_build_files.py failed:\nSTDOUT:\n%s\nSTDERR:\n%s" % (result.stdout, result.stderr))
# Repository rule backed by _impl and configured through _ATTRS
poetry_install_rule = repository_rule(
    attrs = _ATTRS,
    implementation = _impl,
)
def poetry_install(**kwargs):
    """Wrapper macro around the repository rule

    Declares the @poetry http_archive if no repository of that name exists
    yet, then instantiates poetry_install_rule.

    Args:
        **kwargs: forwarded unchanged to poetry_install_rule.
    """

    # Use a maybe so this only runs the first time poetry_install is called
    _maybe(
        http_archive,
        name = "poetry",
        sha256 = "073b2e557f4a53605da6009a8b3585de00ffef4bfece4dfc3d974b8e5f00d481",
        strip_prefix = "poetry",
        # Add a build file here, just to produce a label that we can reference from the repository rule
        # so we can determine the path to poetry library
        build_file_content = """exports_files(["__main__.py"])""",
        # FIXME: need linux URL also, see rules_nodejs node_repositories#_download_node for ideas.
        # Also it needs to be a toolchain so a docker container gets a linux python interpreter
        # TODO: also understand whether this becomes a cache key for something
        # and whether it needs a toolchain for cross-compile??
        urls = ["https://github.com/python-poetry/poetry/releases/download/1.0.9/poetry-1.0.9-darwin.tar.gz"],
    )
    poetry_install_rule(**kwargs)
def _maybe(repo_rule, name, **kwargs):
    """Calls repo_rule(name = name, **kwargs) unless `name` is already declared.

    Args:
        repo_rule: the repository rule or macro to invoke.
        name: the repository name to check for and to declare.
        **kwargs: remaining arguments forwarded to repo_rule.
    """
    if name not in native.existing_rules():
        repo_rule(name = name, **kwargs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment