Bazel TSC Genrule

Approach:

  • Run yarn install (happens via rules_nodejs yarn_install)
  • Create two tars per node_modules package -- one for the entire package dir, and one containing only the *.ts and package.json files needed for TSC
  • tsc.bzl genrule
    • Unpack tars for package and tsc dependencies
    • Run TS compilation
    • Pack the input files used during the compilation and the emitted output files into a tsc.tar.gz

Key insights:

  • Reducing input size and count improves performance when running Bazel with Remote Build Execution (RBE)
  • Tarring up the files actually used during compilation (the files reported by --listFiles) lets us include only the tsc.tar.gz of direct dependencies, rather than all transitive dependencies. ts_project includes transitive node_modules and d.ts files in the sandbox because they can technically be needed by subsequent project compilations. For example, suppose project foo depends on bar, and types from bar are exposed in foo's emit; then baz, which depends on foo, would also need bar's outputs in the sandbox. In the tar version, the relevant files from bar are already included in foo's tar. Note: --listFiles may still include files that aren't relevant to foo's emit, but at least it's a narrower set; these could be pruned as a further optimization.
  • Tarring up only the *.ts files of each node_modules package makes the inputs smaller
  • Using tars instead of raw files reduces the number of inputs

Note: Gists don't support folders, so this won't run as-is; it is for illustration purposes only.
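
For orientation, here is a hypothetical consuming BUILD.bazel (the package paths and target names are invented for illustration) showing how a project might wire the tsc rule defined below to its direct first-party and third-party dependencies:

# frontend/foo/BUILD.bazel (hypothetical)
load("//bazel/tsc:tsc.bzl", "tsc")

tsc(
    name = "tsc",
    srcs = glob(["src/**/*.ts"]),
    tsconfig = "tsconfig.json",
    deps = [
        # Direct first-party deps only -- each tsc.tar.gz already contains
        # the files (per --listFiles) that downstream compilations need.
        "//frontend/bar:tsc",
        # Direct third-party deps: *.ts + package.json tars from @pkg.
        "@pkg//react:types_tar",
        "@pkg//tslib:types_tar",
    ],
)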

# WORKSPACE.bazel

...
load("//bazel/pkg:pkg_setup.bzl", "pkg_setup")

pkg_setup(
    name = "pkg",
    package_json = "//:package.json",
    yarn_lock = "//:yarn.lock",
)
...
// bazel/pkg/gen-yarn-metadata.js
const path = require('path');
const fs = require('fs');
const glob = require('globby');
const memoize = require('lodash/memoize');

/**
 * @typedef {Object} Metadatum
 * @property {string} version
 * @property {StringifiableSet} deps
 */

/**
 * @typedef {Record<string, Metadatum>} Metadata
 */

class StringifiableSet extends Set {
  toJSON() {
    return Array.from(this);
  }
}

/**
 * @param {string} fileName
 * @returns {boolean}
 */
function isTopLevelPackage(fileName) {
  const firstNodeModules = fileName.indexOf('node_modules/');
  return firstNodeModules !== -1 && fileName.indexOf('node_modules/', firstNodeModules + 1) === -1;
}

/**
 * Determines the package name for the given package.json file.
 *
 * We need to determine the package name from the directory on the filesystem
 * instead of using the name field in package.json because we use `npm:` as
 * the version in package.json for some packages (e.g. `"react-redux-v7":
 * "npm:react-redux"`).
 *
 * @param {string} fileName
 * @returns {string}
 */
function getPackageName(fileName) {
  return fileName.replace(/.*node_modules\/(@[^/]+\/[^/]+|[^/]+)\/package.json/, '$1');
}

/**
 * Determines the top-level package name from the given package.json file.
 *
 * @param {string} fileName
 * @returns {string}
 */
function getTopLevelPackageName(fileName) {
  return fileName.replace(/node_modules\/(@[^/]+\/[^/]+|[^/]+).*/, '$1');
}

/**
 * @param {string} packageName
 * @param {string} resolvedFromDir
 * @param {Set<string>} allPackageJsons
 * @returns {boolean}
 */
function hasNestedPackage(packageName, resolvedFromDir, allPackageJsons) {
  const dirs = resolvedFromDir.split(path.sep);
  // Iterate over directories backwards, stopping at 2 since the first
  // directory is always node_modules and we never expect
  // node_modules/node_modules to exist.
  for (let i = dirs.length; i > 1; i--) {
    const dir = dirs.slice(0, i).join(path.sep);
    const pathToCheck = path.join(dir, 'node_modules', packageName, 'package.json');
    if (allPackageJsons.has(pathToCheck)) {
      return true;
    }
  }
  return false;
}
const getAllTransitiveDeps = memoize(
  /**
   * @param {string} name
   * @param {Metadata} metadata
   * @param {Set<string>} [visited]
   */
  (name, metadata, visited = new Set()) => {
    // Keep track of visited packages to avoid infinite recursion when there
    // are dependency cycles.
    visited.add(name);
    const allTransitiveDeps = new StringifiableSet();
    metadata[name].deps.forEach(
      /**
       * @param {string} depToFind
       */
      (depToFind) => {
        allTransitiveDeps.add(depToFind);
        if (visited.has(depToFind)) {
          // We've already visited this package, which means that there is a
          // circular dependency. We need to handle this more carefully to
          // avoid infinite loops.
          metadata[depToFind].deps.forEach((depToAdd) => {
            allTransitiveDeps.add(depToAdd);
            if (!visited.has(depToAdd)) {
              getAllTransitiveDeps(depToAdd, metadata, visited).forEach((anotherDepToAdd) => {
                allTransitiveDeps.add(anotherDepToAdd);
              });
            }
          });
        } else {
          getAllTransitiveDeps(depToFind, metadata, visited).forEach((depToAdd) => {
            allTransitiveDeps.add(depToAdd);
          });
        }
      },
    );
    return allTransitiveDeps;
  },
);
async function getMetadata() {
  /**
   * @type {Map<string, Record<string, any>>}
   */
  const packageData = new Map();
  /** @type {Set<string>} */
  const topLevelPackageNames = new Set();
  /** @type {Set<string>} */
  const allPackageJsons = new Set();

  const globPatterns = [
    // Top-level packages
    'node_modules/*/package.json',
    'node_modules/@*/*/package.json',
    // Nested packages
    'node_modules/**/node_modules/*/package.json',
    'node_modules/**/node_modules/@*/*/package.json',
  ];
  // TODO change `stream` to `globbyStream` when updating to v12
  for await (const fileName of glob.stream(globPatterns)) {
    if (typeof fileName !== 'string') {
      throw new Error(`Unexpected file name: ${fileName}`);
    }
    allPackageJsons.add(fileName);
    const name = getPackageName(fileName);
    const data = JSON.parse(fs.readFileSync(fileName, 'utf8'));
    packageData.set(fileName, data);
    // If this is a top-level package, add it to a list so we can iterate over
    // them next
    if (isTopLevelPackage(fileName)) {
      topLevelPackageNames.add(name);
    }
  }

  /** @type {Metadata} */
  const metadata = {};
  packageData.forEach((data, fileName) => {
    const { version, dependencies, peerDependencies } = data;
    const topLevelPackageName = getTopLevelPackageName(fileName);
    if (!metadata[topLevelPackageName]) {
      metadata[topLevelPackageName] = { version: '', deps: new StringifiableSet() };
    }
    if (isTopLevelPackage(fileName)) {
      metadata[topLevelPackageName].version = version;
    }
    [...Object.keys(dependencies || {}), ...Object.keys(peerDependencies || {})].forEach((dep) => {
      if (
        topLevelPackageNames.has(dep) &&
        !hasNestedPackage(dep, path.dirname(fileName), allPackageJsons)
      ) {
        // This dependency is a top-level package and is not a package that is
        // nested as a child of the current package.json. This means that it is
        // depending on the hoisted version and not the nested one.
        metadata[topLevelPackageName].deps.add(dep);
      }
    });
  });

  // Flatten all top-level transitive dependencies down
  /** @type {Metadata} */
  const flattenedMetadata = {};
  Object.entries(metadata).forEach(([name, { version }]) => {
    flattenedMetadata[name] = { version, deps: getAllTransitiveDeps(name, metadata) };
  });
  return flattenedMetadata;
}
(async () => {
  const metadata = await getMetadata();
  console.log(JSON.stringify(metadata, null, 2));
})();
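
The script prints a JSON object keyed by top-level package name; StringifiableSet serializes each deps set as an array. A hypothetical output for three invented packages, where foo depends on bar and bar depends on baz (note that foo's deps are already flattened to include baz):

{
  "foo": { "version": "1.2.3", "deps": ["bar", "baz"] },
  "bar": { "version": "4.5.6", "deps": ["baz"] },
  "baz": { "version": "7.8.9", "deps": [] }
}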
# bazel/pkg/package.bzl
def _impl(ctx):
    files = depset(
        [ctx.file.src],
        transitive = [dep[DefaultInfo].files for dep in ctx.attr.deps],
    )
    runfiles = ctx.runfiles(
        files = [ctx.file.src],
        transitive_files = depset([ctx.file.src]),
    )
    for dep in ctx.attr.deps:
        runfiles = runfiles.merge(dep[DefaultInfo].data_runfiles)
    return DefaultInfo(files = files, runfiles = runfiles)

package = rule(
    implementation = _impl,
    attrs = {
        "package_name": attr.string(mandatory = True),
        "version": attr.string(),
        "src": attr.label(allow_single_file = True),
        "deps": attr.label_list(),
    },
    provides = [DefaultInfo],
)
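
The package rule re-exports its src tar plus every dep's tar through DefaultInfo, so depending on a single @pkg target pulls in the tars of its flattened dependency closure. A hypothetical instance of what pkg_setup generates for a package foo that depends on bar (see _define_packages below):

# @pkg//foo/BUILD.bazel (hypothetical; generated by pkg_setup.bzl)
load("@pineapple//bazel/pkg:package.bzl", "package")

package(
    name = "types_tar",
    package_name = "foo",
    version = "1.2.3",
    src = "//foo_tar:types_tar",
    deps = ["//bar_tar:types_tar"],
    visibility = ["//visibility:public"],
)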
load("@build_bazel_rules_nodejs//internal/node:node_labels.bzl", "get_node_label")
def _get_pkg_meta(rctx):
rctx.file("pkg_meta.json", "")
rctx.file(
"_pkg_meta.sh",
content = """#!/usr/bin/env bash
set -e pipefail
(cd "{root}"; "{node}" bazel/pkg/gen-yarn-metadata.js > "{pkg_meta_out}")
""".format(
root = rctx.path(rctx.attr.package_json).dirname,
node = rctx.path(get_node_label(rctx)),
pkg_meta_out = rctx.path("pkg_meta.json"),
),
executable = True,
)
result = rctx.execute([rctx.path("_pkg_meta.sh")])
if result.return_code:
fail("pkg_meta failed: %s (%s)" % (result.stdout, result.stderr))
return json.decode(rctx.read(rctx.path("pkg_meta.json")))
def _write_build(rctx, pkg_meta):
    exports = []
    for name in pkg_meta.keys():
        exports.append("exports_files([\"node_modules/{name}\"])".format(name = name))
    exports.append("")
    rctx.file("BUILD.bazel", "\n".join(exports))
def _define_packages(rctx, pkg_meta):
    for (name, meta) in pkg_meta.items():
        rctx.file(
            name + "_tar/BUILD.bazel",
            """# @generated by pkg_setup.bzl
load("@rules_pkg//:pkg.bzl", "pkg_tar")
load("@pineapple//bazel/pkg:pkg_types_tar.bzl", "pkg_types_tar")

pkg_tar(
    name = "tar",
    extension = "tar.gz",
    srcs = ["//:node_modules/{package_name}"],
    strip_prefix = "/external/pkg/node_modules/",
    mode = "0777",
    visibility = ["//:__subpackages__"],
)

pkg_types_tar(
    name = "types_tar",
    package_name = "{package_name}",
    src = "//:node_modules/{package_name}",
    visibility = ["//:__subpackages__"],
)
""".format(package_name = name),
        )
        rctx.file(
            name + "/BUILD.bazel",
            """# @generated by pkg_setup.bzl
load("@pineapple//bazel/pkg:package.bzl", "package")

package(
    name = "tar",
    package_name = "{package_name}",
    version = "{version}",
    src = "{tar_src}",
    deps = {tar_deps},
    visibility = ["//visibility:public"],
)

package(
    name = "types_tar",
    package_name = "{package_name}",
    version = "{version}",
    src = "{types_tar_src}",
    deps = {types_tar_deps},
    visibility = ["//visibility:public"],
)
""".format(
                name = name.split("/")[-1],
                package_name = name,
                version = meta["version"],
                tar_src = "//" + name + "_tar:tar",
                tar_deps = ["//" + dep + "_tar:tar" for dep in meta["deps"]],
                types_tar_src = "//" + name + "_tar:types_tar",
                types_tar_deps = ["//" + dep + "_tar:types_tar" for dep in meta["deps"]],
            ),
        )
def _impl(rctx):
    # Assumes yarn install has already been run in the package.json dir via rules_nodejs
    rctx.symlink(
        rctx.path(str(rctx.path(rctx.attr.package_json).dirname) + "/node_modules"),
        rctx.path("node_modules"),
    )
    pkg_meta = _get_pkg_meta(rctx)
    _write_build(rctx, pkg_meta)
    _define_packages(rctx, pkg_meta)

pkg_setup = repository_rule(
    implementation = _impl,
    attrs = {
        "package_json": attr.label(mandatory = True, allow_single_file = True),
        "yarn_lock": attr.label(mandatory = True, allow_single_file = True),
        "node_repository": attr.string(default = "nodejs"),
    },
    # Indicate that this rule fetches everything from the local system and
    # should be reevaluated at every fetch.
    local = True,
    # Indicate that the repository inspects the system for configuration
    # purposes. Tells Bazel to re-fetch this repository when running
    # `bazel sync --configure`.
    configure = True,
)
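
Putting the pieces together, for a node_modules containing just the invented packages foo and bar, the generated @pkg repository would look roughly like this:

external/pkg/
├── BUILD.bazel          # exports_files for each node_modules/<name>
├── pkg_meta.json        # output of gen-yarn-metadata.js
├── node_modules         # symlink into the workspace's node_modules
├── foo/BUILD.bazel      # package() targets :tar and :types_tar
├── foo_tar/BUILD.bazel  # pkg_tar :tar and pkg_types_tar :types_tar
├── bar/BUILD.bazel
└── bar_tar/BUILD.bazel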
# bazel/pkg/pkg_types_tar.bzl
def _impl(ctx):
    output = ctx.actions.declare_file(str(ctx.attr.name) + ".tar.gz")
    ctx.actions.run_shell(
        mnemonic = "PkgTypesTar",
        outputs = [output],
        inputs = ctx.files.src + [ctx.file._script],
        command = "{script} {src} {dst}".format(
            script = ctx.file._script.path,
            src = ctx.file.src.path,
            dst = output.path,
        ),
        execution_requirements = {
            "no-remote-exec": "1",
        },
        progress_message = "Tarring types @pkg//{package_name}".format(package_name = ctx.attr.package_name),
    )
    return DefaultInfo(
        files = depset([output]),
        runfiles = ctx.runfiles(files = [output]),
    )

pkg_types_tar = rule(
    implementation = _impl,
    attrs = {
        "src": attr.label(allow_single_file = True),
        "package_name": attr.string(mandatory = True),
        "_script": attr.label(
            allow_single_file = True,
            default = "//bazel/pkg:pkg_types_tar.sh",
        ),
    },
    provides = [DefaultInfo],
)
#!/bin/bash
# bazel/pkg/pkg_types_tar.sh
set -euo pipefail

SRC="$1"
DST="$2"

# Split the src path into the path to the root and the path from the root to
# the package. We do this so that all of the tars can be untarred from the root.
ROOT=${SRC%/node_modules/*}
PKG=node_modules/${SRC##*/node_modules/}

# Find .ts and package.json files within the current package directory to be
# included in the tar archive. Some packages contain .ts files in addition to
# .d.ts files, so we include both. The -name tests are parenthesized so that
# -type f applies to both of them.
OUTPUT_FILES=$(dirname "$DST")/output_files.txt
(cd "$ROOT" && find "$PKG" -type f \( -name "*.ts" -o -name "package.json" \) | sort) > "$OUTPUT_FILES"

# Create a deterministic tar.gz archive
# Inspired by https://reproducible-builds.org/docs/archives/#full-example
# gzip with the -n option to not include the timestamp https://serverfault.com/a/110244
tar \
  --mtime="2000-01-01 00:00Z" \
  --owner=0 --group=0 --numeric-owner \
  --format=gnu \
  --dereference \
  --create \
  --directory="$ROOT" \
  --files-from="$OUTPUT_FILES" \
  | gzip --fast --no-name > "$DST"
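
To sanity-check an archive, list its contents with tar; the path and file names below are hypothetical and depend on your bazel-bin layout:

$ tar -tzf bazel-bin/external/pkg/foo_tar/types_tar.tar.gz
node_modules/foo/package.json
node_modules/foo/index.d.ts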
// bazel/tsc/tsc-tsconfig.js
// @ts-nocheck
const fs = require('fs');
const ts = require('typescript');

const tsconfigFile = process.argv[2];
const tsconfigBzlFile = process.argv[3];

const { config } = ts.parseConfigFileTextToJson(
  tsconfigFile,
  fs.readFileSync(tsconfigFile, 'utf8'),
);

// For '--project' compat. rules_nodejs also uses '--project'.
delete config.references;
config.compilerOptions = config.compilerOptions || {};
config.compilerOptions.declarationMap = false;
config.compilerOptions.rootDir = '.';

fs.writeFileSync(tsconfigBzlFile, JSON.stringify(config), { encoding: 'utf8' });
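
For illustration, a hypothetical input tsconfig.json of

{ "compilerOptions": { "declaration": true, "declarationMap": true }, "references": [{ "path": "../bar" }] }

would be rewritten to a tsconfig.bzl.json of

{ "compilerOptions": { "declaration": true, "declarationMap": false, "rootDir": "." } }

so that tsc --project compiles only this project, with dependency outputs coming from the unpacked tars instead of project references.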
# bazel/tsc/tsc.bzl
def _impl(ctx):
    output = ctx.actions.declare_file("tsc.tar.gz")

    args = ctx.actions.args()
    args.add_all([
        ctx.file._tsconfig_script,
        ctx.file.tsconfig,
        output,
        ctx.expand_make_variables("env", "$(BINDIR)", {}),
        ctx.file._node_bin.path,
    ])

    ctx.actions.run_shell(
        outputs = [output],
        inputs = depset(ctx.files.srcs + ctx.files.deps + [ctx.file.tsconfig]),
        command = "{script} $@".format(script = ctx.file._script.path),
        arguments = [args],
        tools = [ctx.file._node_bin, ctx.file._script, ctx.file._tsconfig_script],
        progress_message = "Compiling TypeScript %s" % output.short_path,
    )

    return DefaultInfo(
        files = depset([output]),
        runfiles = ctx.runfiles(files = [output]),
    )

tsc = rule(
    implementation = _impl,
    attrs = {
        "srcs": attr.label_list(allow_files = True, mandatory = True),
        "deps": attr.label_list(allow_files = True, providers = [DefaultInfo]),
        "tsconfig": attr.label(allow_single_file = True),
        "_script": attr.label(
            default = Label("//bazel/tsc:tsc.sh"),
            executable = True,
            cfg = "exec",
            allow_single_file = True,
        ),
        "_tsconfig_script": attr.label(
            default = Label("//bazel/tsc:tsc-tsconfig.js"),
            executable = True,
            cfg = "exec",
            allow_single_file = True,
        ),
        "_node_bin": attr.label(
            default = Label("@build_bazel_rules_nodejs//toolchains/node:node_bin"),
            executable = True,
            cfg = "exec",
            allow_single_file = True,
        ),
    },
    provides = [DefaultInfo],
)
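
The positional arguments line up with the variables read at the top of tsc.sh below; conceptually, the action invokes something like the following (all paths hypothetical):

# $1 TSCONFIG_SCRIPT, $2 TSCONFIG, $3 OUT_TAR, $4 BINDIR, $5 NODE_BIN
bazel/tsc/tsc.sh \
  bazel/tsc/tsc-tsconfig.js \
  frontend/foo/tsconfig.json \
  bazel-out/k8-fastbuild/bin/frontend/foo/tsc.tar.gz \
  bazel-out/k8-fastbuild/bin \
  external/nodejs_linux_amd64/bin/node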
#!/bin/bash
# bazel/tsc/tsc.sh
set -euo pipefail

TSCONFIG_SCRIPT=$1
TSCONFIG=$2
OUT_TAR=$3
BINDIR=$4
NODE_BIN=$5

TS_LIB="external/npm/node_modules/typescript"
TSCONFIG_DIR=$(dirname "$TSCONFIG")
OUT_DIR=$(dirname "$OUT_TAR")
TSCONFIG_BZL="$TSCONFIG_DIR/tsconfig.bzl.json"
PWD="$(pwd)"

export NODE_PATH="$PWD/$BINDIR/external/npm/node_modules:$PWD/external/npm/node_modules:$PWD/node_modules"

mkdir node_modules
# Symlink the full typescript module to node_modules. Safe as it's not included in any tars.
# The tsc rule depends on "@npm//typescript".
ln -s "$PWD/$TS_LIB" node_modules/typescript

# If this project has no first-party deps, then there will be no tsc.tar.gz
# files to extract, which would cause tar to fail. To avoid this, make sure
# such files exist before attempting to extract them.
if test -n "$(find "$BINDIR/frontend" -name 'tsc.tar.gz' -print -quit)"
then
  find "$BINDIR/frontend" -name "tsc.tar.gz" -exec cat {} + | tar -ixmzf - --skip-old-files
fi

# TODO this could potentially be combined with the above so we untar
# first-party and third-party files at the same time
if test -n "$(find "$BINDIR/external/pkg" -name '*.tar.gz' -print -quit)"
then
  find "$BINDIR/external/pkg" -name "*.tar.gz" -exec cat {} + | tar -ixmzf - --skip-old-files
fi

$NODE_BIN "$TSCONFIG_SCRIPT" "$TSCONFIG" "$TSCONFIG_BZL"

set +e
$NODE_BIN \
  "$TS_LIB/lib/tsc.js" \
  --project "$TSCONFIG_BZL" \
  --outDir "$OUT_DIR" \
  --listFiles \
  > output.txt
result="$?"
set -e

if [ "$result" -ne 0 ]; then
  # Print output, excluding the "--listFiles" output where each line is a path prefixed with "/"
  grep -v "^/" output.txt
  exit "$result"
fi

# We include tslib and typescript as direct deps in every project, so we can exclude their
# .d.ts files from this list, which makes the output smaller.
# TODO maybe we can exclude @types/node here too?
grep "^$(pwd).*\.d\.ts$" output.txt \
  | grep -v "/external/npm/" \
  | sed "s|^$(pwd)/||" \
  | grep -v "^node_modules/tslib/" \
  | grep -v "^node_modules/typescript/" \
  > output_files_tmp.txt || true

# Must include package.json for included node_modules types
grep "^node_modules" output_files_tmp.txt \
  | sed -E "s:^(node_modules/(@[^/]+/)?[^/]+)/.*$:\1/package.json:" \
  | sort -u \
  >> output_files_tmp.txt || true

# Collect emitted declarations, excluding the .tsbuildinfo file because it
# changes between builds even when the outputs are the same
find "$OUT_DIR" -type f -name "*.d.ts" >> output_files_tmp.txt

# Include .d.ts sources
find "$TSCONFIG_DIR" -type f -name "*.d.ts" >> output_files_tmp.txt

sort -u output_files_tmp.txt > output_files.txt

# Create a deterministic tar.gz archive
# Inspired by https://reproducible-builds.org/docs/archives/#full-example
# gzip with the -n option to not include the timestamp https://serverfault.com/a/110244
tar \
  --mtime="2000-01-01 00:00Z" \
  --owner=0 --group=0 --numeric-owner \
  --format=gnu \
  --dereference \
  --create \
  -T output_files.txt \
  | gzip --fast --no-name > "$OUT_TAR"