ConnorBaker/changes.md Secret

## changes.md

      
    Raw
  

              changes.md
            
          
    changes


Don't build PyTorch's custom (vendored) protobuf: BUILD_CUSTOM_PROTOBUF needs to be set to false.


Use the system pybind11. The environment variable name was missing the "PY": it should be USE_SYSTEM_PYBIND11, not USE_SYSTEM_BIND11.


NCCL can be used without cudaSupport being true: on ROCm, PyTorch uses it via RCCL. The code needs to be updated to allow for this.


Can we use static NCCL instead of the shared lib? Wouldn't that let me avoid needing nccl.out in addition to nccl.dev? Should be able to use just nccl.dev then since it contains the includes we need and the library we need.


Added a more descriptive comment explaining why `++ lists.optionals stdenv.isLinux [ linuxHeaders_5_19 ]` is needed:
# FIXME: This is a hacky way to get around an error Gloo throws with newer Linux headers:
#   error: flexible array member ‘ethtool_link_settings::link_mode_masks’ not at end of ‘struct
#   gloo::getInterfaceSpeedGLinkSettings(int, ifreq*)::<unnamed>’
++ lists.optionals stdenv.isLinux [ linuxHeaders_5_19 ]


removed BUILD_NAMEDTENSOR due to it being unused:
python3.10-torch> CMake Warning:
python3.10-torch>   Manually-specified variables were not used by the project:
python3.10-torch>     BUILD_NAMEDTENSOR


Unable to find a way to silence the Could NOT find ZLIB (missing: ZLIB_LIBRARY ZLIB_INCLUDE_DIR) warning


Unable to find a way to silence the Set PYBIND11_PYTHON_VERSION to search for a specific version warning


CMake complains about mpfr, gmp, and fftw3 not being found
python3.10-torch> -- Using option `-Wall -Wno-unused -Wno-attributes -Wno-unused-result -Wno-psabi -ffp-contract=off -fno-math-errno -fno-trapping-math` to compile libsleef
python3.10-torch> -- Building shared libs : OFF
python3.10-torch> -- Building static test bins: OFF
python3.10-torch> -- MPFR : LIB_MPFR-NOTFOUND
python3.10-torch> -- GMP : LIBGMP-NOTFOUND
python3.10-torch> -- RT : /nix/store/8bmp6r3a0xfha3wj36phlc47clh9w81l-glibc-2.35-224/lib/librt.so
python3.10-torch> -- FFTW3 : LIBFFTW3-NOTFOUND
python3.10-torch> -- OPENSSL :
python3.10-torch> -- SDE : SDE_COMMAND-NOTFOUND
python3.10-torch> -- RUNNING_ON_TRAVIS :
python3.10-torch> -- COMPILER_SUPPORTS_OPENMP : 1

I guess they're optional dependencies, because we can build without them -- but what do we lose by not having them?
This can be fixed by adding mpfr, gmp, and fftw to buildInputs. I have no idea why they are not found when they are in nativeBuildInputs, which is gross because they should really only be needed at build time. (Possibly CMake only searches buildInputs for libraries to link against -- to be confirmed.)


CUPTI is somehow always reported as missing. Is that because it is expected to be in /lib/extras/CUPTI or some other odd, non-root location?
python3.10-torch> -- Using Kineto with CUPTI support
python3.10-torch> -- Configuring Kineto dependency:
python3.10-torch> --   KINETO_SOURCE_DIR = /build/source/third_party/kineto/libkineto
python3.10-torch> --   KINETO_BUILD_TESTS = OFF
python3.10-torch> --   KINETO_LIBRARY_TYPE = static
python3.10-torch> --   CUDA_SOURCE_DIR = /nix/store/7s7wi0mv4s335hs7r8vqwq74cxdnbgd5-cuda-redist-native-11.7
python3.10-torch> --   CUDA_INCLUDE_DIRS = /nix/store/7s7wi0mv4s335hs7r8vqwq74cxdnbgd5-cuda-redist-native-11.7/include
python3.10-torch> -- Could not find CUPTI library, using CPU-only Kineto build
python3.10-torch> -- Found PythonInterp: /nix/store/0n4y44dnaxafqs7cg625aldrb152x7bx-python3-3.10.10/bin/python3.10 (found version "3.10.10")
python3.10-torch> INFO ROCM_SOURCE_DIR = 
python3.10-torch> INFO CUPTI unavailable or disabled - not building GPU profilers
python3.10-torch> -- Kineto: FMT_SOURCE_DIR = /build/source/third_party/fmt
python3.10-torch> -- Kineto: FMT_INCLUDE_DIR = /build/source/third_party/fmt/include
python3.10-torch> INFO CUPTI_INCLUDE_DIR = /nix/store/7s7wi0mv4s335hs7r8vqwq74cxdnbgd5-cuda-redist-native-11.7/include


CMake reports "Could NOT find CUB (missing: CUB_INCLUDE_DIR)". Is this a problem? I don't see any other errors related to CUB.
python3.10-torch> -- Could NOT find CUB (missing: CUB_INCLUDE_DIR)


## default.nix
# Arguments of the torch derivation. Feature toggles (buildDocs, MPISupport, mklDnnSupport,
# cudaSupport, cudnnSupport, useSystemNccl, rocmSupport, gpuTargets) carry defaults; everything
# else is a dependency that the caller must supply.
{ buildDocs ? false
, buildPythonPackage
, fetchFromGitHub
, fetchpatch
, lib
, magma
, mpi
, MPISupport ? false
, python
, stdenv

, # Native build inputs
  cmake
, linkFarm
, pybind11
, removeReferencesTo
, symlinkJoin
, util-linux
, which

  # TODO: Necessary?
, mpfr
, gmp
, fftw

, # Build inputs
  Accelerate
, CoreServices
, libobjc
, numactl

, # Propagated build inputs
  cffi
, click
, numpy
, pyyaml
, typing-extensions

, # Unit tests
  hypothesis
, psutil

, # Disable MKLDNN on aarch64-darwin, it negatively impacts performance,
  # this is also what official pytorch build does
  mklDnnSupport ? !(stdenv.isDarwin && stdenv.isAarch64)

, # virtual pkg that consistently instantiates blas across nixpkgs
  # See https://github.com/NixOS/nixpkgs/pull/83888
  blas

, # ninja (https://ninja-build.org) must be available to run C++ extensions tests,
  ninja
, linuxHeaders_5_19

, # dependencies for torch.utils.tensorboard
  future
, isPy3k
, pillow
, protobuf
, pythonOlder
, six
, tensorboard

, # CUDA dependencies
  cudaPackages ? { }
, cudaSupport ? false
  # cuDNN is only usable together with CUDA (see `assert cudnnSupport -> cudaSupport`), so the
  # default follows cudaSupport. A hard-coded `true` made the default arguments fail that assert.
, cudnnSupport ? cudaSupport
, useSystemNccl ? true

, # ROCm dependencies
  gpuTargets ? [ ]
, hip
, hipblas
, hipcub
, hipfft
, hipify
, hipsolver
, hipsparse
, miopen
, miopengemm
, openmp
, rccl
, rocblas
, rocfft
, rocm-comgr
, rocm-core
, rocm-device-libs
, rocm-opencl-runtime
, rocm-runtime
, rocm-thunk
, rocminfo
, rocmSupport ? false
, rocprim
, rocrand
, rocsolver
, rocsparse
, rocthrust
, roctracer
}:

let
  inherit (lib)
    attrsets
    lists
    strings
    trivial
    versions
    ;

  inherit (cudaPackages)
    backendStdenv
    cudaFlags
    cudatoolkit
    cudaVersion
    nccl
    ;

  # setBool :: Bool -> String
  # Renders a Nix boolean as the "1"/"0" value used for the exported build variables.
  setBool = enabled: if !enabled then "0" else "1";

  # CUDA compute capabilities known to this PyTorch release, each in a plain and a "+PTX" form.
  # Source of the list:
  # https://github.com/pytorch/pytorch/blob/v1.13.1/torch/utils/cpp_extension.py#L1751
  supportedTorchCudaCapabilities =
    let
      withPtx = capability: capability + "+PTX";
      archs = [ "3.5" "3.7" "5.0" "5.2" "5.3" "6.0" "6.1" "6.2" "7.0" "7.2" "7.5" "8.0" "8.6" ];
    in
    archs ++ lists.map withPtx archs;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  #   of the first list *from* the second list. That means:
  #   lists.subtractLists a b = b - a

  # For CUDA
  # supportedCudaCapabilities: the capabilities common to cudaFlags.cudaCapabilities and
  #   supportedTorchCudaCapabilities.
  # unsupportedCudaCapabilities: cudaFlags.cudaCapabilities minus the supported ones, i.e. the
  #   requested capabilities that this PyTorch release cannot target.
  supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;

  # gpuArchWarner :: [String] -> [String] -> [String]
  # Returns `supported` unchanged, unless it is empty; in that case evaluation is aborted with
  # an error listing the `unsupported` targets that were requested.
  # NOTE: despite its name this only ever throws; it never emits a warning.
  gpuArchWarner = supported: unsupported:
    if supported == [ ] then
      throw (
        "No supported GPU targets specified. Requested GPU targets: "
        + strings.concatStringsSep ", " unsupported
      )
    else
      supported;

  # Semicolon-separated list of GPU architectures to build for.
  # Precedence: an explicit gpuTargets argument, then the CUDA capabilities (when cudaSupport),
  # then hip.gpuTargets (when rocmSupport). With none of these, evaluating the string throws.
  gpuTargetString =
    let
      selectedTargets =
        if gpuTargets != [ ] then
          gpuTargets
        else if cudaSupport then
          gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
        else if rocmSupport then
          hip.gpuTargets
        else
          throw "No GPU targets specified";
    in
    strings.concatStringsSep ";" selectedTargets;

  # CUDA dependencies shared by the build-time and runtime joins: currently only cuDNN, and only
  # when cudnnSupport is enabled.
  # NOTE: NCCL is not included here because it may also be used by ROCm (PyTorch's RCCL backend).
  cuda-redist-common = lists.optionals cudnnSupport [ cudaPackages.cudnn ];

  # For CUDA dependencies needed only at build time
  # All listed redistributable packages (plus cuda-redist-common) are merged into one tree with
  # symlinkJoin; preConfigure exports that tree as CUDA_HOME.
  cuda-redist-native = symlinkJoin {
    name = "cuda-redist-native-${cudaVersion}";
    paths = with cudaPackages; [
      cuda_cccl # <thrust/*>
      cuda_cudart # cuda_runtime.h
      # TODO: Does enabling cuda_cupti give me CUDA_CUPTI support for Kineto?
      cuda_cupti # for Kineto GPU profiling
      cuda_nvcc
      cuda_nvml_dev # <nvml.h>
      cuda_nvprof # <cuda_profiler_api.h>
      cuda_nvrtc
      cuda_nvtx # -llibNVToolsExt
      libcublas
      libcufft
      libcurand
      libcusolver
      libcusparse
    ] ++ cuda-redist-common;
  };

  # For CUDA dependencies needed only at runtime
  # There are currently no runtime-only packages, so this is just cuda-redist-common.
  cuda-redist = symlinkJoin {
    name = "cuda-redist-${cudaVersion}";
    paths = cuda-redist-common;
  };

  # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
  # LD_LIBRARY_PATH=/run/opengl-driver/lib.  We only use the stub
  # libcuda.so from cudatoolkit for running tests, so that we don’t have
  # to recompile pytorch on every update to nvidia-x11 or the kernel.

  # TODO: No more cudatoolkit!

  # cudaStub = linkFarm "cuda-stub" [{
  #   name = "libcuda.so.1";
  #   path = "${cudatoolkit}/lib/stubs/libcuda.so";
  # }];
  # cudaStubEnv = lib.optionalString cudaSupport
  #   "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";

  # Single merged tree of all ROCm packages. postPatch substitutes it for the hard-coded
  # /opt/rocm paths and preConfigure exports it as ROCM_PATH / ROCM_SOURCE_DIR.
  rocmtoolkit_joined = symlinkJoin {
    name = "rocm-merged";

    # Each package is listed exactly once (rocfft used to appear twice).
    paths = [
      rocm-core
      hip
      rccl
      miopen
      miopengemm
      rocrand
      rocblas
      rocfft
      rocsparse
      hipsparse
      rocthrust
      rocprim
      hipcub
      roctracer
      rocsolver
      hipfft
      hipsolver
      hipblas
      rocminfo
      rocm-thunk
      rocm-comgr
      rocm-device-libs
      rocm-runtime
      rocm-opencl-runtime
      hipify
    ];
  };

  version = "1.13.1";
in

# cuDNN may only be enabled together with CUDA.
assert !cudnnSupport || cudaSupport;

# With CUDA enabled, only CUDA toolkit major versions 9, 10 and 11 are accepted.
assert cudaSupport -> lists.elem (versions.major cudatoolkit.version) [ "9" "10" "11" ];

# Dependencies built against CUDA must use the same toolkit / package set as torch itself.
assert cudaSupport && MPISupport -> mpi.cudatoolkit == cudatoolkit;
assert cudaSupport -> magma.cudaPackages == cudaPackages;


buildPythonPackage {
  pname = "torch";
  # Don't forget to update torch-bin to the same version.
  inherit version;
  format = "setuptools";

  disabled = pythonOlder "3.7.0";

  outputs = [
    "out" # output standard python package
    "dev" # output libtorch headers
    "lib" # output libtorch libraries
  ];

  src = fetchFromGitHub {
    owner = "pytorch";
    repo = "pytorch";
    rev = "refs/tags/v${version}";
    fetchSubmodules = true;
    hash = "sha256-yQz+xHPw9ODRBkV9hv1th38ZmUr/fXa+K+d+cvmX3Z8=";
  };

  patches = lists.optionals (stdenv.isDarwin && stdenv.isx86_64) [
    # pthreadpool added support for Grand Central Dispatch in April
    # 2020. However, this relies on functionality (DISPATCH_APPLY_AUTO)
    # that is available starting with macOS 10.13. However, our current
    # base is 10.12. Until we upgrade, we can fall back on the older
    # pthread support.
    ./pthreadpool-disable-gcd.diff
  ] ++ [
    # PyTorch fails to build on gcc 12 due to gloo
    # https://github.com/pytorch/pytorch/issues/77614
    (fetchpatch {
      url = "https://github.com/facebookincubator/gloo/commit/4a5e339b764261d20fc409071dc7a8b8989aa195.patch";
      stripLen = 1;
      extraPrefix = "third_party/gloo/";
      hash = "sha256-UxR1r7F6g76BWj3GBIrSy5t+YZDCWy6mMddwx+hon5w=";
    })
  ];

  # ROCm-only source fixes (empty string otherwise):
  #   1. run gloo's hipify command through `python`;
  #   2. point hard-coded /opt/rocm paths (caffe2, kineto) at rocmtoolkit_joined and adjust the
  #      include sub-directories;
  #   3. inject a `set(ROCM_VERSION ...)` line into LoadHIP.cmake. The value is hip.version with
  #      its dot-separated components joined by "0" (e.g. "5.4.3" -> "50403").
  #      NOTE(review): that encoding presumably assumes single-digit minor/patch components --
  #      confirm against what LoadHIP.cmake expects.
  postPatch = lib.optionalString rocmSupport ''
    # https://github.com/facebookincubator/gloo/pull/297
    substituteInPlace third_party/gloo/cmake/Hipify.cmake \
      --replace "\''${HIPIFY_COMMAND}" "python \''${HIPIFY_COMMAND}"

    # Replace hard-coded rocm paths
    substituteInPlace caffe2/CMakeLists.txt \
      --replace "/opt/rocm" "${rocmtoolkit_joined}" \
      --replace "hcc/include" "hip/include" \
      --replace "rocblas/include" "include/rocblas" \
      --replace "hipsparse/include" "include/hipsparse"

    # Doesn't pick up the environment variable?
    substituteInPlace third_party/kineto/libkineto/CMakeLists.txt \
      --replace "\''$ENV{ROCM_SOURCE_DIR}" "${rocmtoolkit_joined}" \
      --replace "/opt/rocm" "${rocmtoolkit_joined}"

    # Strangely, this is never set in cmake
    substituteInPlace cmake/public/LoadHIP.cmake \
      --replace "set(ROCM_PATH \$ENV{ROCM_PATH})" \
        "set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${
          strings.concatStrings (strings.intersperse "0" (strings.splitString "." hip.version))
        })"
  '';

  # Exports the environment variables that PyTorch's setup.py / CMake read to configure the
  # build. The script is assembled from fragments; the NCCL, CUDA, cuDNN and ROCm fragments are
  # only included when the corresponding feature flag is set.
  preConfigure = ''
    export BUILD_DOCS=${setBool buildDocs}
  ''
  # We only do an imports check, so do not build tests either.
  + ''
    export BUILD_TEST=${setBool false}
  ''
  # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for
  # it by default. PyTorch currently uses its own vendored version
  # of oneDNN through Intel iDeep.
  + ''
    export USE_MKLDNN=${setBool mklDnnSupport}
    export USE_MKLDNN_CBLAS=${setBool mklDnnSupport}
  ''
  # Avoid using pybind11 from git submodule
  # Also avoids pytorch exporting the headers of pybind11
  # NOTE: We cannot silence the CMake warning asking us to set PYBIND11_PYTHON_VERSION because it
  #   will not be passed along by PyTorch.
  + ''
    export USE_SYSTEM_PYBIND11=${setBool true}
  ''
  # Avoid using protobuf from git submodule
  + ''
    export BUILD_CUSTOM_PROTOBUF=${setBool false}
  ''
  # Override the (weirdly) wrong version set by default. See
  # https://github.com/NixOS/nixpkgs/pull/52437#issuecomment-449718038
  # https://github.com/pytorch/pytorch/blob/v1.0.0/setup.py#L267
  + ''
    export PYTORCH_BUILD_VERSION=${version}
    export PYTORCH_BUILD_NUMBER=0
  ''
  # don't build pytorch's third_party NCCL
  # NOTE(review): `nccl` comes from cudaPackages, so this fragment requires a cudaPackages set
  #   that provides nccl even when cudaSupport is false -- confirm this is intended.
  + strings.optionalString useSystemNccl ''
    export USE_NCCL=${setBool true}
    export USE_STATIC_NCCL=${setBool true}
    export USE_SYSTEM_NCCL=${setBool true}
    export NCCL_LIB_DIR="${nccl.dev}/lib"
    export NCCL_INCLUDE_DIR="${nccl.dev}/include"
  ''
  # enable CUDA support
  # NOTE: For some reason, if we do not explicitly set LIBRARY_PATH, the build will fail with
  #   python3.10-torch>   /nix/store/nydwzhllkq0a21dny69zdjczh6v275lb-binutils-2.40/bin/ld: cannot
  #   python3.10-torch>   find -lcudadevrt: No such file or directory
  #   python3.10-torch>   /nix/store/nydwzhllkq0a21dny69zdjczh6v275lb-binutils-2.40/bin/ld: cannot
  #   python3.10-torch>   find -lcudart_static: No such file or directory
  # NOTE: The ":" separator is emitted only when LIBRARY_PATH is already non-empty. The previous
  #   form always left an empty path component ("lib::..." or a trailing "lib:"), which the
  #   linker treats as the current directory.
  + strings.optionalString cudaSupport ''
    export USE_CUDA=${setBool true}
    export USE_FLASH_ATTENTION=${setBool true}
    export CUDA_HOME="${cuda-redist-native}"
    export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
    export CC="${backendStdenv.cc}/bin/cc"
    export CXX="${backendStdenv.cc}/bin/c++"
    export LIBRARY_PATH="$CUDA_HOME/lib''${LIBRARY_PATH:+:$LIBRARY_PATH}"
  ''
  + strings.optionalString cudnnSupport ''
    export USE_CUDNN=${setBool true}
    export CUDNN_LIB_DIR="${cuda-redist-native}/lib"
    export CUDNN_INCLUDE_DIR="${cuda-redist-native}/include"
  '' + strings.optionalString rocmSupport ''
    export ROCM_PATH=${rocmtoolkit_joined}
    export ROCM_SOURCE_DIR=${rocmtoolkit_joined}
    export PYTORCH_ROCM_ARCH="${gpuTargetString}"
    export CMAKE_CXX_FLAGS="-I${rocmtoolkit_joined}/include -I${rocmtoolkit_joined}/include/rocblas"
    python tools/amd_build/build_amd.py
  '';

  # Use pytorch's custom configurations
  dontUseCmakeConfigure = true;

  # Caps PyTorch's build parallelism at the cores Nix allotted (MAX_JOBS), then runs setup.py in
  # `--cmake-only` mode followed by `cmake build` on the resulting `build` directory.
  # NOTE(review): the second cmake invocation presumably re-generates the already configured
  #   build tree before the regular build phase runs -- confirm why it is needed.
  preBuild = ''
    export MAX_JOBS=$NIX_BUILD_CORES
    ${python.pythonForBuild.interpreter} setup.py build --cmake-only
    ${cmake}/bin/cmake build
  '';

  # For every libcaffe2*.so under $out, drop the first two RPATH entries and put $ORIGIN in
  # front of the remaining ones.
  # Unlike the previous version, strip2 keeps its IFS change scoped to the `read` command, so
  # later hooks in the same shell no longer inherit IFS=' '. Expansions are also quoted.
  preFixup = ''
    function join_by { local IFS="$1"; shift; echo "$*"; }
    function strip2 {
      local -a rpath_entries
      local new_rpath
      # Split the current RPATH on ":" without touching the global IFS.
      IFS=':' read -ra rpath_entries <<< "$(patchelf --print-rpath "$1")"
      # Everything after the first two entries is kept.
      new_rpath=$(join_by : "''${rpath_entries[@]:2}")
      patchelf --set-rpath "\$ORIGIN:$new_rpath" "$1"
    }
    for f in $(find "''${out}" -name 'libcaffe2*.so')
    do
      strip2 "$f"
    done
  '';

  # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
  # (upstream seems to have fixed this in the wrong place?)
  # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc
  # https://github.com/pytorch/pytorch/issues/22346
  #
  # Also of interest: pytorch ignores CXXFLAGS uses CFLAGS for both C and C++:
  # https://github.com/pytorch/pytorch/blob/v1.11.0/setup.py#L17
  env.NIX_CFLAGS_COMPILE = toString (
    lists.optionals (blas.implementation == "mkl") [ "-Wno-error=array-bounds" ]
    # Suppress gcc regression: avx512 math function raises uninitialized variable warning
    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
    # See also: Fails to compile with GCC 12.1.0 https://github.com/pytorch/pytorch/issues/77939
    ++ lists.optionals stdenv.cc.isGNU [ "-Wno-error=maybe-uninitialized" "-Wno-error=uninitialized" ]
  );

  nativeBuildInputs = [
    cmake
    ninja
    pybind11
    removeReferencesTo
    util-linux
    which
  ]
  ++ lists.optionals cudaSupport [ cuda-redist-native ]
  ++ lists.optionals rocmSupport [ rocmtoolkit_joined ];

  buildInputs = [
    blas
    blas.provider
    pybind11

    # TODO: Why do these need to be in buildInputs?
    mpfr
    gmp
    fftw
  ]
  # FIXME: This is a hacky way to get around an error Gloo throws with newer Linux headers:
  #   error: flexible array member ‘ethtool_link_settings::link_mode_masks’ not at end of ‘struct
  #   gloo::getInterfaceSpeedGLinkSettings(int, ifreq*)::<unnamed>’
  ++ lists.optionals stdenv.isLinux [ linuxHeaders_5_19 ]
  ++ lists.optionals cudaSupport [ cuda-redist ]
  ++ lists.optionals rocmSupport [ openmp ]
  ++ lists.optionals (cudaSupport || rocmSupport) [ magma ]
  ++ lists.optionals stdenv.isLinux [ numactl ]
  ++ lists.optionals stdenv.isDarwin [ Accelerate CoreServices libobjc ];

  propagatedBuildInputs = [
    cffi
    click
    numpy
    pyyaml
    typing-extensions
    # the following are required for tensorboard support
    future
    pillow
    protobuf
    six
    tensorboard
  ] ++ lists.optionals MPISupport [ mpi ]
  ++ lists.optionals rocmSupport [ rocmtoolkit_joined ];

  # Tests take a long time and may be flaky, so just sanity-check imports
  doCheck = false;

  pythonImportsCheck = [
    "torch"
  ];

  nativeCheckInputs = [ hypothesis ninja psutil ];

  # Runs PyTorch's test driver with the excluded test modules listed below.
  # The phase is written as a multi-line script. Joining the commands with spaces put
  # everything on one shell line, so `runHook preCheck` received the test command as extra
  # arguments and the tests (and `runHook postCheck`) never ran.
  checkPhase =
    let
      excludedTests = strings.concatStringsSep " " (
        [
          "utils" # utils requires git, which is not allowed in the check phase

          # "dataloader" # psutils correctly finds and triggers multiprocessing, but is too sandboxed to run -- resulting in numerous errors
          # ^^^^^^^^^^^^ NOTE: while test_dataloader does return errors, these are acceptable errors and do not interfere with the build
        ]
        # tensorboard has acceptable failures for pytorch 1.3.x due to dependencies on tensorboard-plugins
        ++ lists.optional (versions.majorMinor version == "1.3") "tensorboard"
      );
    in
    # cudaStubEnv would go in front of the interpreter once the CUDA stub is restored.
    ''
      runHook preCheck
      ${python.interpreter} test/run_test.py --exclude ${excludedTests}
      runHook postCheck
    '';

  # Splits the installed package across the three outputs:
  #   - strips references to stdenv.cc from the installed headers and libraries;
  #   - copies torch/include and torch/share to $dev and rewrites the CMake config files there
  #     so library paths point at $lib/lib;
  #   - moves torch/lib to $lib/lib and leaves a symlink behind in $out.
  # With ROCm, two further CMake files in $dev are rewritten (lib64 -> $lib/lib, and the
  # build-directory include path -> $dev/include).
  postInstall = ''
    find "$out/${python.sitePackages}/torch/include" "$out/${python.sitePackages}/torch/lib" -type f -exec remove-references-to -t ${stdenv.cc} '{}' +

    mkdir $dev
    cp -r $out/${python.sitePackages}/torch/include $dev/include
    cp -r $out/${python.sitePackages}/torch/share $dev/share

    # Fix up library paths for split outputs
    substituteInPlace \
      $dev/share/cmake/Torch/TorchConfig.cmake \
      --replace \''${TORCH_INSTALL_PREFIX}/lib "$lib/lib"

    substituteInPlace \
      $dev/share/cmake/Caffe2/Caffe2Targets-release.cmake \
      --replace \''${_IMPORT_PREFIX}/lib "$lib/lib"

    mkdir $lib
    mv $out/${python.sitePackages}/torch/lib $lib/lib
    ln -s $lib/lib $out/${python.sitePackages}/torch/lib
  '' + strings.optionalString rocmSupport ''
    substituteInPlace $dev/share/cmake/Tensorpipe/TensorpipeTargets-release.cmake \
      --replace "\''${_IMPORT_PREFIX}/lib64" "$lib/lib"

    substituteInPlace $dev/share/cmake/ATen/ATenConfig.cmake \
      --replace "/build/source/torch/include" "$dev/include"
  '';

  # Darwin only: give every dylib in $lib/lib an absolute install name (failures are ignored via
  # `|| true`), then rewrite the @rpath references between libtorch_python, libtorch, libshm
  # and libc10 to absolute $lib/lib paths.
  postFixup = strings.optionalString stdenv.isDarwin ''
    for f in $(ls $lib/lib/*.dylib); do
        install_name_tool -id $lib/lib/$(basename $f) $f || true
    done

    install_name_tool -change @rpath/libshm.dylib $lib/lib/libshm.dylib $lib/lib/libtorch_python.dylib
    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libtorch_python.dylib
    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch_python.dylib

    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch.dylib

    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libshm.dylib
    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libshm.dylib
  '';

  # Builds in 2+h with 2 cores, and ~15m with a big-parallel builder.
  requiredSystemFeatures = [ "big-parallel" ];

  passthru = {
    inherit cudaSupport cudaPackages;
    # At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This attribute allows for easy detection of its availability.
    blasProvider = blas.provider;
  } // attrsets.optionalAttrs cudaSupport {
    # NOTE: supportedCudaCapabilities isn't computed unless cudaSupport is true, so we can't use
    #   it in the passthru set above because a downstream package might try to access it even
    #   when cudaSupport is false. Better to have it missing than null or an empty list by default.
    cudaCapabilities = supportedCudaCapabilities;
  };

  # Package metadata.
  meta = with lib; {
    changelog = "https://github.com/pytorch/pytorch/releases/tag/v${version}";
    # keep PyTorch in the description so the package can be found under that name on search.nixos.org
    description = "PyTorch: Tensors and Dynamic neural networks in Python with strong GPU acceleration";
    homepage = "https://pytorch.org/";
    license = licenses.bsd3;
    maintainers = with maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
    # Darwin is only a valid platform for builds with neither CUDA nor ROCm. The previous
    # condition (!cudaSupport || !rocmSupport) was true whenever at most one of the two was
    # enabled, so Darwin was still advertised for CUDA-only and ROCm-only builds.
    platforms = with platforms; linux ++ lists.optionals (!cudaSupport && !rocmSupport) darwin;
    broken = rocmSupport && cudaSupport; # CUDA and ROCm are mutually exclusive
  };
}
	{ buildDocs ? false
	, buildPythonPackage
	, fetchFromGitHub
	, fetchpatch
	, lib
	, magma
	, mpi
	, MPISupport ? false
	, python
	, stdenv

	, # Native build inputs
	cmake
	, linkFarm
	, pybind11
	, removeReferencesTo
	, symlinkJoin
	, util-linux
	, which

	# TODO: Necessary?
	, mpfr
	, gmp
	, fftw

	, # Build inputs
	Accelerate
	, CoreServices
	, libobjc
	, numactl

	, # Propagated build inputs
	cffi
	, click
	, numpy
	, pyyaml
	, typing-extensions

	, # Unit tests
	hypothesis
	, psutil

	, # Disable MKLDNN on aarch64-darwin, it negatively impacts performance,
	# this is also what official pytorch build does
	mklDnnSupport ? !(stdenv.isDarwin && stdenv.isAarch64)

	, # virtual pkg that consistently instantiates blas across nixpkgs
	# See https://github.com/NixOS/nixpkgs/pull/83888
	blas

	, # ninja (https://ninja-build.org) must be available to run C++ extensions tests,
	ninja
	, linuxHeaders_5_19

	, # dependencies for torch.utils.tensorboard
	future
	, isPy3k
	, pillow
	, protobuf
	, pythonOlder
	, six
	, tensorboard

	, # CUDA dependencies
	cudaPackages ? { }
	, cudaSupport ? false
	, cudnnSupport ? true
	, useSystemNccl ? true

	, # ROCm dependencies
	gpuTargets ? [ ]
	, hip
	, hipblas
	, hipcub
	, hipfft
	, hipify
	, hipsolver
	, hipsparse
	, miopen
	, miopengemm
	, openmp
	, rccl
	, rocblas
	, rocfft
	, rocm-comgr
	, rocm-core
	, rocm-device-libs
	, rocm-opencl-runtime
	, rocm-runtime
	, rocm-thunk
	, rocminfo
	, rocmSupport ? false
	, rocprim
	, rocrand
	, rocsolver
	, rocsparse
	, rocthrust
	, roctracer
	}:

	let
	inherit (lib)
	attrsets
	lists
	strings
	trivial
	versions
	;

	inherit (cudaPackages)
	backendStdenv
	cudaFlags
	cudatoolkit
	cudaVersion
	nccl
	;

	# setBool :: Bool -> String
	setBool = v: if v then "1" else "0";

	# https://github.com/pytorch/pytorch/blob/v1.13.1/torch/utils/cpp_extension.py#L1751
	supportedTorchCudaCapabilities =
	let
	real = [ "3.5" "3.7" "5.0" "5.2" "5.3" "6.0" "6.1" "6.2" "7.0" "7.2" "7.5" "8.0" "8.6" ];
	ptx = lists.map (x: "${x}+PTX") real;
	in
	real ++ ptx;

	# NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
	# of the first list from the second list. That means:
	# lists.subtractLists a b = b - a

	# For CUDA
	supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
	unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;

	# Use trivial.warnIf to print a warning if any unsupported GPU targets are specified.
	gpuArchWarner = supported: unsupported:
	trivial.throwIf (supported == [ ])
	(
	"No supported GPU targets specified. Requested GPU targets: "
	+ strings.concatStringsSep ", " unsupported
	)
	supported;

	# Create the gpuTargetString.
	gpuTargetString = strings.concatStringsSep ";" (
	if gpuTargets != [ ] then
	# If gpuTargets is specified, it always takes priority.
	gpuTargets
	else if cudaSupport then
	gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
	else if rocmSupport then
	hip.gpuTargets
	else
	throw "No GPU targets specified"
	);

	# For common CUDA dependencies
	# NOTE: NCCL is not included here because it may also be used by ROCm (PyTorch's RCCL backend).
	cuda-redist-common = with cudaPackages; [

	] ++ lists.optionals cudnnSupport [ cudnn ];

	# For CUDA dependencies needed only at build time
	cuda-redist-native = symlinkJoin {
	name = "cuda-redist-native-${cudaVersion}";
	paths = with cudaPackages; [
	# TODO: Does enabling this give me CUDA_CUPTI support for Kineto?
	cuda_cccl # <thrust/*>
	cuda_cudart # cuda_runtime.h
	cuda_cupti # for Kineto GPU profiling
	cuda_nvcc
	cuda_nvml_dev # <nvml.h>
	cuda_nvprof # <cuda_profiler_api.h>
	cuda_nvrtc
	cuda_nvtx # -llibNVToolsExt
	libcublas
	libcufft
	libcurand
	libcusolver
	libcusparse
	] ++ cuda-redist-common;
	};

	# For CUDA dependencies needed only at runtime
	cuda-redist = symlinkJoin {
	name = "cuda-redist-${cudaVersion}";
	paths = with cudaPackages; [
	] ++ cuda-redist-common;
	};

	# Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
	# LD_LIBRARY_PATH=/run/opengl-driver/lib. We only use the stub
	# libcuda.so from cudatoolkit for running tests, so that we don’t have
	# to recompile pytorch on every update to nvidia-x11 or the kernel.

	# TODO: No more cudatoolkit!

	# cudaStub = linkFarm "cuda-stub" [{
	# name = "libcuda.so.1";
	# path = "${cudatoolkit}/lib/stubs/libcuda.so";
	# }];
	# cudaStubEnv = lib.optionalString cudaSupport
	# "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";

	rocmtoolkit_joined = symlinkJoin {
	name = "rocm-merged";

	paths = [
	rocm-core
	hip
	rccl
	miopen
	miopengemm
	rocrand
	rocblas
	rocfft
	rocsparse
	hipsparse
	rocthrust
	rocprim
	hipcub
	roctracer
	rocfft
	rocsolver
	hipfft
	hipsolver
	hipblas
	rocminfo
	rocm-thunk
	rocm-comgr
	rocm-device-libs
	rocm-runtime
	rocm-opencl-runtime
	hipify
	];
	};

	version = "1.13.1";
	in

	# assert that cudnnSupport is only enabled if cudaSupport is enabled
	assert cudnnSupport -> cudaSupport;

	# assert that everything needed for cuda is present and that the correct cuda versions are used
	assert cudaSupport -> (builtins.elem (versions.major cudatoolkit.version) [ "9" "10" "11" ]);

	# confirm that cudatoolkits are sync'd across dependencies
	assert (MPISupport && cudaSupport) -> mpi.cudatoolkit == cudatoolkit;
	assert cudaSupport -> magma.cudaPackages == cudaPackages;


	buildPythonPackage {
	pname = "torch";
	# Don't forget to update torch-bin to the same version.
	inherit version;
	format = "setuptools";

	disabled = pythonOlder "3.7.0";

	outputs = [
	"out" # output standard python package
	"dev" # output libtorch headers
	"lib" # output libtorch libraries
	];

	src = fetchFromGitHub {
	owner = "pytorch";
	repo = "pytorch";
	rev = "refs/tags/v${version}";
	fetchSubmodules = true;
	hash = "sha256-yQz+xHPw9ODRBkV9hv1th38ZmUr/fXa+K+d+cvmX3Z8=";
	};

	patches = lists.optionals (stdenv.isDarwin && stdenv.isx86_64) [
	# pthreadpool added support for Grand Central Dispatch in April
	# 2020. However, this relies on functionality (DISPATCH_APPLY_AUTO)
	# that is available starting with macOS 10.13. However, our current
	# base is 10.12. Until we upgrade, we can fall back on the older
	# pthread support.
	./pthreadpool-disable-gcd.diff
	] ++ [
	# PyTorch fails to build on gcc 12 due to gloo
	# https://github.com/pytorch/pytorch/issues/77614
	(fetchpatch {
	url = "https://github.com/facebookincubator/gloo/commit/4a5e339b764261d20fc409071dc7a8b8989aa195.patch";
	stripLen = 1;
	extraPrefix = "third_party/gloo/";
	hash = "sha256-UxR1r7F6g76BWj3GBIrSy5t+YZDCWy6mMddwx+hon5w=";
	})
	];

	postPatch = lib.optionalString rocmSupport ''
	# https://github.com/facebookincubator/gloo/pull/297
	substituteInPlace third_party/gloo/cmake/Hipify.cmake \
	--replace "\''${HIPIFY_COMMAND}" "python \''${HIPIFY_COMMAND}"

	# Replace hard-coded rocm paths
	substituteInPlace caffe2/CMakeLists.txt \
	--replace "/opt/rocm" "${rocmtoolkit_joined}" \
	--replace "hcc/include" "hip/include" \
	--replace "rocblas/include" "include/rocblas" \
	--replace "hipsparse/include" "include/hipsparse"

	# Doesn't pick up the environment variable?
	substituteInPlace third_party/kineto/libkineto/CMakeLists.txt \
	--replace "\''$ENV{ROCM_SOURCE_DIR}" "${rocmtoolkit_joined}" \
	--replace "/opt/rocm" "${rocmtoolkit_joined}"

	# Strangely, this is never set in cmake
	substituteInPlace cmake/public/LoadHIP.cmake \
	--replace "set(ROCM_PATH \$ENV{ROCM_PATH})" \
	"set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${
	strings.concatStrings (strings.intersperse "0" (strings.splitString "." hip.version))
	})"
	'';

	preConfigure = ''
	export BUILD_DOCS=${setBool buildDocs}
	''
	# We only do an imports check, so do not build tests either.
	+ ''
	export BUILD_TEST=${setBool false}
	''
	# Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for
	# it by default. PyTorch currently uses its own vendored version
	# of oneDNN through Intel iDeep.
	+ ''
	export USE_MKLDNN=${setBool mklDnnSupport}
	export USE_MKLDNN_CBLAS=${setBool mklDnnSupport}
	''
	# Avoid using pybind11 from git submodule
	# Also avoids pytorch exporting the headers of pybind11
	# NOTE: We cannot silence the CMake warning asking us to set PYBIND11_PYTHON_VERSION because it
	# will not be passed along by PyTorch.
	+ ''
	export USE_SYSTEM_PYBIND11=${setBool true}
	''
	# Avoid using protobuf from git submodule
	+ ''
	export BUILD_CUSTOM_PROTOBUF=${setBool false}
	''
	# Override the (weirdly) wrong version set by default. See
	# https://github.com/NixOS/nixpkgs/pull/52437#issuecomment-449718038
	# https://github.com/pytorch/pytorch/blob/v1.0.0/setup.py#L267
	+ ''
	export PYTORCH_BUILD_VERSION=${version}
	export PYTORCH_BUILD_NUMBER=0
	''
	# don't build pytorch's third_party NCCL
	+ strings.optionalString useSystemNccl ''
	export USE_NCCL=${setBool true}
	export USE_STATIC_NCCL=${setBool true}
	export USE_SYSTEM_NCCL=${setBool true}
	export NCCL_LIB_DIR="${nccl.dev}/lib"
	export NCCL_INCLUDE_DIR="${nccl.dev}/include"
	''
	# enable CUDA support
	# NOTE: For some reason, if we do not explicitly set LIBRARY_PATH, the build will fail with
	# python3.10-torch> /nix/store/nydwzhllkq0a21dny69zdjczh6v275lb-binutils-2.40/bin/ld: cannot
	# python3.10-torch> find -lcudadevrt: No such file or directory
	# python3.10-torch> /nix/store/nydwzhllkq0a21dny69zdjczh6v275lb-binutils-2.40/bin/ld: cannot
	# python3.10-torch> find -lcudart_static: No such file or directory
	+ strings.optionalString cudaSupport ''
	export USE_CUDA=${setBool true}
	export USE_FLASH_ATTENTION=${setBool true}
	export CUDA_HOME="${cuda-redist-native}"
	export TORCH_CUDA_ARCH_LIST="${gpuTargetString}"
	export CC="${backendStdenv.cc}/bin/cc"
	export CXX="${backendStdenv.cc}/bin/c++"
	export LIBRARY_PATH="$CUDA_HOME/lib:''${LIBRARY_PATH:+:}$LIBRARY_PATH"
	''
	+ strings.optionalString cudnnSupport ''
	export USE_CUDNN=${setBool true}
	export CUDNN_LIB_DIR="${cuda-redist-native}/lib"
	export CUDNN_INCLUDE_DIR="${cuda-redist-native}/include"
	'' + strings.optionalString rocmSupport ''
	export ROCM_PATH=${rocmtoolkit_joined}
	export ROCM_SOURCE_DIR=${rocmtoolkit_joined}
	export PYTORCH_ROCM_ARCH="${gpuTargetString}"
	export CMAKE_CXX_FLAGS="-I${rocmtoolkit_joined}/include -I${rocmtoolkit_joined}/include/rocblas"
	python tools/amd_build/build_amd.py
	'';

	# PyTorch drives CMake itself through setup.py (see preBuild, which runs
	# `setup.py build --cmake-only`), so the generic CMake configure step is switched off here.
	dontUseCmakeConfigure = true;

	# Configuration is performed by PyTorch's own build script before the build proper:
	#   1. export MAX_JOBS from NIX_BUILD_CORES (presumably read by setup.py to bound the number of
	#      parallel jobs -- confirm against upstream's setup.py);
	#   2. run `setup.py build --cmake-only` with the build-platform Python interpreter;
	#   3. invoke `cmake build` on the resulting `build` directory.
	# NOTE(review): step 3 looks like a re-run of CMake over the already generated build tree --
	# confirm whether it is still required.
	preBuild = ''
	export MAX_JOBS=$NIX_BUILD_CORES
	${python.pythonForBuild.interpreter} setup.py build --cmake-only
	${cmake}/bin/cmake build
	'';

	# Rewrite the RPATH of every libcaffe2*.so under $out: drop its first two entries and put
	# $ORIGIN in front of whatever remains.
	# NOTE(review): the two dropped entries are presumably build-time paths -- confirm.
	#
	#   join_by SEP ITEM...  prints the ITEMs joined by the single character SEP.
	#   strip2 FILE          performs the RPATH rewrite described above on FILE.
	#
	# IFS is only overridden for the duration of the individual `read` calls (and locally inside
	# join_by), so the remainder of the fixup phase keeps the shell's default word splitting.
	# All expansions are quoted so that unusual path names cannot be split or globbed.
	preFixup = ''
	function join_by { local IFS="$1"; shift; echo "$*"; }
	function strip2 {
	  local -a rpathEntries
	  IFS=':' read -ra rpathEntries <<< "$(patchelf --print-rpath "$1")"
	  local trimmed
	  trimmed=$(join_by : "''${rpathEntries[@]:2}")
	  patchelf --set-rpath "\$ORIGIN:$trimmed" "$1"
	}
	while IFS= read -r -d "" f; do
	  strip2 "$f"
	done < <(find "$out" -name 'libcaffe2*.so' -print0)
	'';

	# Extra warning suppressions, joined with spaces into NIX_CFLAGS_COMPILE.
	#
	# Also of interest: PyTorch ignores CXXFLAGS and uses CFLAGS for both C and C++:
	# https://github.com/pytorch/pytorch/blob/v1.11.0/setup.py#L17
	env.NIX_CFLAGS_COMPILE =
	  let
	    # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
	    # (upstream seems to have fixed this in the wrong place?)
	    # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc
	    # https://github.com/pytorch/pytorch/issues/22346
	    mklFlags = lists.optionals (blas.implementation == "mkl") [
	      "-Wno-error=array-bounds"
	    ];

	    # Suppress gcc regression: avx512 math function raises uninitialized variable warning
	    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
	    # See also: Fails to compile with GCC 12.1.0 https://github.com/pytorch/pytorch/issues/77939
	    gnuFlags = lists.optionals stdenv.cc.isGNU [
	      "-Wno-error=maybe-uninitialized"
	      "-Wno-error=uninitialized"
	    ];
	  in
	  strings.concatStringsSep " " (mklFlags ++ gnuFlags);

	# Build-time tools. The GPU toolchain is appended only for the backend that is enabled;
	# the relative order of the entries is the same as before.
	nativeBuildInputs =
	  let
	    commonTools = [ cmake ninja pybind11 removeReferencesTo util-linux which ];
	    cudaTools = lists.optional cudaSupport cuda-redist-native;
	    rocmTools = lists.optional rocmSupport rocmtoolkit_joined;
	  in
	  commonTools ++ cudaTools ++ rocmTools;

	# Libraries and headers the build links against. Platform- and backend-specific entries are
	# appended conditionally below.
	buildInputs = [
	  blas
	  blas.provider
	  pybind11

	  # TODO: Why do these need to be in buildInputs?
	  # (Without them the libsleef part of the CMake configuration reports MPFR, GMP and FFTW3
	  # as not found.)
	  mpfr
	  gmp
	  fftw
	]
	# FIXME: This is a hacky way to get around an error Gloo throws with newer Linux headers:
	# error: flexible array member ‘ethtool_link_settings::link_mode_masks’ not at end of ‘struct
	# gloo::getInterfaceSpeedGLinkSettings(int, ifreq*)::<unnamed>’
	++ lists.optionals stdenv.isLinux [ linuxHeaders_5_19 ]
	++ lists.optionals cudaSupport [ cuda-redist ]
	++ lists.optionals rocmSupport [ openmp ]
	# magma is pulled in for either GPU backend.
	++ lists.optionals (cudaSupport || rocmSupport) [ magma ]
	++ lists.optionals stdenv.isLinux [ numactl ]
	++ lists.optionals stdenv.isDarwin [ Accelerate CoreServices libobjc ];

	# Dependencies propagated to consumers of the package, grouped by the reason they are
	# needed. The concatenation order matches the original single list.
	propagatedBuildInputs =
	  let
	    pythonDeps = [ cffi click numpy pyyaml typing-extensions ];
	    # the following are required for tensorboard support
	    tensorboardDeps = [ future pillow protobuf six tensorboard ];
	    mpiDeps = lists.optional MPISupport mpi;
	    rocmDeps = lists.optional rocmSupport rocmtoolkit_joined;
	  in
	  pythonDeps ++ tensorboardDeps ++ mpiDeps ++ rocmDeps;

	# The upstream test suite is slow and may be flaky, so the check phase stays disabled and
	# only an import of the module is used as a sanity check.
	doCheck = false;
	pythonImportsCheck = [ "torch" ];

	# Inputs for the (currently disabled) check phase.
	nativeCheckInputs = [
	  hypothesis
	  ninja
	  psutil
	];

	# Runs PyTorch's own test driver between the standard pre/post check hooks.
	#
	# The three commands are joined with newlines. Joining every fragment with spaces would
	# collapse the phase into a single `runHook preCheck <args...>` command, so neither the test
	# driver nor `runHook postCheck` would ever be executed as commands.
	checkPhase =
	  let
	    # Test modules passed to `--exclude`.
	    excludedTests =
	      [
	        "utils" # utils requires git, which is not allowed in the check phase

	        # "dataloader" # psutils correctly finds and triggers multiprocessing, but is too sandboxed to run -- resulting in numerous errors
	        # ^^^^^^^^^^^^ NOTE: while test_dataloader does return errors, these are acceptable errors and do not interfere with the build
	      ]
	      # tensorboard has acceptable failures for pytorch 1.3.x due to dependencies on tensorboard-plugins
	      ++ lists.optional (versions.majorMinor version == "1.3") "tensorboard";

	    # The single test-driver command line.
	    runTests = strings.concatStringsSep " " (
	      [
	        # cudaStubEnv
	        "${python.interpreter} test/run_test.py"
	        "--exclude"
	      ]
	      ++ excludedTests
	    );
	  in
	  strings.concatStringsSep "\n" [
	    "runHook preCheck"
	    runTests
	    "runHook postCheck"
	  ];

	# Split the installed tree across the `out`, `dev` and `lib` outputs:
	#   1. strip references to the compiler (stdenv.cc) from every file under torch/include and
	#      torch/lib;
	#   2. create $dev and copy torch/include and torch/share into it;
	#   3. rewrite TorchConfig.cmake and Caffe2Targets-release.cmake so that the
	#      `${TORCH_INSTALL_PREFIX}/lib` and `${_IMPORT_PREFIX}/lib` prefixes point at $lib/lib;
	#   4. create $lib, move torch/lib to $lib/lib, and leave a symlink at the old location.
	# With ROCm enabled, two more CMake files are patched: Tensorpipe's `${_IMPORT_PREFIX}/lib64`
	# is redirected to $lib/lib, and the hard-coded /build/source/torch/include path in
	# ATenConfig.cmake is replaced by $dev/include.
	# NOTE: the `\''${...}` sequences produce a literal `${...}` for the shell, i.e. the text being
	# searched for inside the CMake files; they are not Nix or shell interpolations.
	postInstall = ''
	find "$out/${python.sitePackages}/torch/include" "$out/${python.sitePackages}/torch/lib" -type f -exec remove-references-to -t ${stdenv.cc} '{}' +

	mkdir $dev
	cp -r $out/${python.sitePackages}/torch/include $dev/include
	cp -r $out/${python.sitePackages}/torch/share $dev/share

	# Fix up library paths for split outputs
	substituteInPlace \
	$dev/share/cmake/Torch/TorchConfig.cmake \
	--replace \''${TORCH_INSTALL_PREFIX}/lib "$lib/lib"

	substituteInPlace \
	$dev/share/cmake/Caffe2/Caffe2Targets-release.cmake \
	--replace \''${_IMPORT_PREFIX}/lib "$lib/lib"

	mkdir $lib
	mv $out/${python.sitePackages}/torch/lib $lib/lib
	ln -s $lib/lib $out/${python.sitePackages}/torch/lib
	'' + strings.optionalString rocmSupport ''
	substituteInPlace $dev/share/cmake/Tensorpipe/TensorpipeTargets-release.cmake \
	--replace "\''${_IMPORT_PREFIX}/lib64" "$lib/lib"

	substituteInPlace $dev/share/cmake/ATen/ATenConfig.cmake \
	--replace "/build/source/torch/include" "$dev/include"
	'';

	# Darwin only: make the dylibs refer to each other by absolute path inside $lib/lib.
	#   * every *.dylib gets its own absolute path as install name (failures are tolerated via
	#     `|| true`);
	#   * the @rpath-relative references between libtorch_python, libtorch, libshm and libc10
	#     are rewritten to absolute $lib/lib paths.
	# The loop iterates over a quoted glob instead of parsing `ls` output.
	postFixup = strings.optionalString stdenv.isDarwin ''
	for f in "$lib"/lib/*.dylib; do
	  install_name_tool -id "$lib/lib/$(basename "$f")" "$f" || true
	done

	install_name_tool -change @rpath/libshm.dylib $lib/lib/libshm.dylib $lib/lib/libtorch_python.dylib
	install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libtorch_python.dylib
	install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch_python.dylib

	install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch.dylib

	install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libshm.dylib
	install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libshm.dylib
	'';

	# Builds in 2+h with 2 cores, and ~15m with a big-parallel builder, so the build is
	# restricted to builders that advertise the "big-parallel" feature.
	requiredSystemFeatures = [ "big-parallel" ];

	# Attributes exposed to downstream packages.
	passthru =
	  let
	    alwaysExposed = {
	      inherit cudaSupport cudaPackages;
	      # At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This
	      # attribute allows for easy detection of its availability.
	      blasProvider = blas.provider;
	    };

	    # NOTE: supportedCudaCapabilities isn't computed unless cudaSupport is true, so it cannot
	    # live in the unconditional set: a downstream package might try to access it even when
	    # cudaSupport is false. Better to have it missing than null or an empty list by default.
	    cudaOnly = attrsets.optionalAttrs cudaSupport {
	      cudaCapabilities = supportedCudaCapabilities;
	    };
	  in
	  alwaysExposed // cudaOnly;

	# Package metadata.
	meta = with lib; {
	  changelog = "https://github.com/pytorch/pytorch/releases/tag/v${version}";
	  # keep PyTorch in the description so the package can be found under that name on search.nixos.org
	  description = "PyTorch: Tensors and Dynamic neural networks in Python with strong GPU acceleration";
	  homepage = "https://pytorch.org/";
	  license = licenses.bsd3;
	  maintainers = with maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
	  # Darwin is only advertised for builds with neither GPU backend enabled. The condition must
	  # be a conjunction: `!cudaSupport || !rocmSupport` holds for every non-broken
	  # configuration, which would list Darwin even for CUDA-only or ROCm-only builds.
	  platforms = with platforms; linux ++ lists.optionals (!cudaSupport && !rocmSupport) darwin;
	  broken = rocmSupport && cudaSupport; # CUDA and ROCm are mutually exclusive
	};
	}