# Singularity and CUDA

I want PyTorch, JAX and TensorFlow.

Check: nvidia-smi still gives 'NVIDIA-SMI 510.85.02 Driver Version: 510.85.02 CUDA Version: 11.6 '

conda activate base
unset LD_LIBRARY_PATH
unset CUDA_HOME
export NEW_ENV='cuda118redux'
export CONDA_ALWAYS_YES=yes
export CONDA_CHANNELS="anaconda,conda-forge,nvidia"
rm -rf $CONDA_PREFIX/envs/$NEW_ENV
export PYTHONUSERBASE=$CONDA_PREFIX/envs/$NEW_ENV
conda create -y -n $NEW_ENV python==3.10 &&
conda activate $NEW_ENV &&
conda env config vars set PYTHONUSERBASE=$CONDA_PREFIX
conda env config vars set CONDA_OVERRIDE_CUDA=11.8
conda env config vars set LD_LIBRARY_PATH="$CONDA_PREFIX:/usr/local/cuda/compat:/.singularity.d/libs"
conda env config vars set CUDA_HOME=$CONDA_PREFIX
# reset
conda deactivate &&
conda activate $NEW_ENV &&

conda install -y ipykernel &&

export CONDA_CHANNEL_PRIORITY='strict'
export CONDA_CHANNELS="nvidia/label/cuda-11.8.0"

conda install -y nvidia/label/cuda-11.8.0::cuda
conda install -y nvidia/label/cuda-11.8.0::cuda-toolkit
conda install -y nvidia/label/cuda-11.8.0::cuda-nvrtc
conda install -y nvidia/label/cuda-11.8.0::libcufile
conda install -y nvidia/label/cuda-11.8.0::cuda-tools
conda install -y nvidia/label/cuda-11.8.0::cuda-cudart
conda install -y nvidia/label/cuda-11.8.0::cuda-cudart-dev

pip install -q tensorrt
conda install tensorflow-gpu
    
pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install -q --upgrade "jax[cuda11_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

conda install -y conda-forge::openmm
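
Once that finishes, a quick sanity check from Python is worth running. This is a minimal sketch (not part of the original recipe) to confirm each framework sees a GPU and reports the CUDA 11.8 toolchain rather than the host's 11.6:

```python
# Minimal sanity-check sketch: confirm each framework sees a GPU
# and was built against the CUDA 11.8 toolchain.
import torch
import jax
import tensorflow as tf

print('torch compiled for CUDA:', torch.version.cuda)       # expect '11.8'
print('torch sees a GPU:', torch.cuda.is_available())
print('jax backend:', jax.default_backend())                 # expect 'gpu'
print('TF built with CUDA:', tf.sysconfig.get_build_info()['cuda_version'])
print('TF GPUs:', tf.config.list_physical_devices('GPU'))
```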

## Fixes

TF works but not JAX and Torch

PyTorch was compiled  against (8, 7, 0) but found runtime version (8, 4, 0).
Found CUDA version 11070, but JAX was built against version 11080, which is newer.
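
To compare what each framework was actually compiled against with what conda and pip installed, a small check like the following helps (a sketch, assuming both imports succeed):

```python
# Hedged sketch: report the cuDNN build each framework was compiled against,
# to compare with the versions that conda list cudnn shows below.
import torch
import tensorflow as tf

print('torch cuDNN:', torch.backends.cudnn.version())                 # e.g. 8700 for 8.7.0
print('TF cuDNN:', tf.sysconfig.get_build_info()['cudnn_version'])    # e.g. '8'
```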

Checking... conda list cudnn

# packages in environment at /data/xchem-fragalysis/mferla/waconda/envs/cuda118redux:
#
# Name                    Version                   Build  Channel
cudnn                     8.9.2.26               cuda11_0    anaconda
nvidia-cudnn-cu11         2022.5.19                pypi_0    pypi
nvidia-cudnn-cu116        8.4.0.27                 pypi_0    pypi
nvidia-cudnn-cu12         8.9.7.29                 pypi_0    pypi

Fixing... pip uninstall nvidia-cudnn-cu116

Torch is fine now

conda list cuda* | grep '11.7'

nvidia-cuda-cupti-cu117   11.7.50                  pypi_0    pypi
nvidia-cuda-nvcc-cu117    11.7.64                  pypi_0    pypi
nvidia-cuda-runtime-cu117 11.7.60                  pypi_0    pypi

pip uninstall nvidia-cuda-runtime-cu117 nvidia-cuda-nvcc-cu117 nvidia-cuda-cupti-cu117

Found cuFFT version 10702, but JAX was built against version 10900

conda list cuFFT

# packages in environment at /data/xchem-fragalysis/mferla/waconda/envs/cuda118redux:
#
# Name                    Version                   Build  Channel
libcufft                  11.0.12.1                     0    nvidia
libcufft-dev              11.0.12.1                     0    nvidia
libcufft-static           11.0.12.1                     0    nvidia
nvidia-cufft-cu11         2022.4.8                 pypi_0    pypi
nvidia-cufft-cu117        10.7.2.50                pypi_0    pypi

Now I get Unable to load cuSOLVER

conda list cusolver

# packages in environment at /data/xchem-fragalysis/mferla/waconda/envs/cuda118redux:
#
# Name                    Version                   Build  Channel
libcusolver               11.5.4.101                    0    nvidia
libcusolver-dev           11.5.4.101                    0    nvidia
libcusolver-static        11.5.4.101                    0    nvidia
nvidia-cusolver-cu11      2022.4.8                 pypi_0    pypi
nvidia-cusolver-cu117     11.3.5.50                pypi_0    pypi

Fixing with conda uninstall libcublas... but no!

The following packages will be SUPERSEDED by a higher-priority channel:

cuda-libraries-dev   conda-forge::cuda-libraries-dev-12.3.~  -->  nvidia::cuda-libraries-dev-11.6.1-0
cuda-tools           nvidia/label/cuda-11.8.0::cuda-tools-~  -->  nvidia::cuda-tools-11.6.1-0
cuda-visual-tools    conda-forge::cuda-visual-tools-12.3.2~  -->  nvidia::cuda-visual-tools-11.6.1-0

Cancel.

export CONDA_CHANNEL_PRIORITY='strict'
export CONDA_CHANNELS="nvidia/label/cuda-11.8.0"

conda uninstall libcublas

unset CONDA_CHANNEL_PRIORITY
unset CONDA_CHANNELS

Ehrm... Why did I deal with cublas?

conda install nvidia/label/cuda-11.8.0::libcublas

double tap:

conda install -y nvidia/label/cuda-11.8.0::libcusolver
conda install -y nvidia/label/cuda-11.8.0::libcusolver-dev
conda install -y nvidia/label/cuda-11.8.0::libcusolver-static
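
To verify the conda-provided cuSOLVER is actually loadable before retrying JAX, something like this can be used (a sketch; the soname libcusolver.so.11 is an assumption for the CUDA 11.x series):

```python
# Hedged sketch: check that the env's cuSOLVER can be dlopen'ed.
# The soname 'libcusolver.so.11' is an assumption for CUDA 11.x.
import ctypes, os

path = os.path.join(os.environ['CONDA_PREFIX'], 'lib', 'libcusolver.so.11')
ctypes.CDLL(path)
print('loaded', path)
```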


export CONDA_CHANNEL_PRIORITY='strict'
export CONDA_CHANNELS="nvidia/label/cuda-11.8.0,nvidia,conda-forge,anaconda"

conda install -y nvidia/label/cuda-11.8.0::cuda
conda install -y nvidia/label/cuda-11.8.0::cuda-toolkit
conda install -y nvidia/label/cuda-11.8.0::cuda-nvrtc
conda install -y nvidia/label/cuda-11.8.0::libcufile
conda install -y nvidia/label/cuda-11.8.0::cuda-tools
conda install -y nvidia/label/cuda-11.8.0::cuda-cudart
conda install -y nvidia/label/cuda-11.8.0::cuda-cudart-dev
conda install -y nvidia/label/cuda-11.8.0::cuda-cupti

JAX gives: XlaRuntimeError INTERNAL: libdevice not found at ./libdevice.10.bc

find $CONDA_PREFIX -name "libdevice.*"

/data/xchem-fragalysis/mferla/waconda/envs/cuda118redux/lib/python3.10/site-packages/triton/third_party/cuda/lib/libdevice.10.bc
/data/xchem-fragalysis/mferla/waconda/envs/cuda118redux/lib/python3.10/site-packages/jaxlib/cuda/nvvm/libdevice/libdevice.10.bc
/data/xchem-fragalysis/mferla/waconda/envs/cuda118redux/lib/libdevice.10.bc
/data/xchem-fragalysis/mferla/waconda/envs/cuda118redux/nvvm/libdevice/libdevice.10.bc

Whereas Jax says it is looking in:

/data/xchem-fragalysis/mferla/waconda/envs/cuda118redux/lib/python3.10/site-packages/nvidia/cuda_nvcc
/usr/local/cuda-11.8
/usr/local/cuda
/data/xchem-fragalysis/mferla/waconda/envs/cuda118redux/lib/python3.10/site-packages/nvidia/cuda_nvcc

None of these do anything:

#export CUDA_HOME=$CONDA_PREFIX/lib/python3.10/site-packages/jaxlib/cuda
export CUDA_HOME=$CONDA_PREFIX
export CUDA_DIR=$CUDA_HOME
export XLA_FLAGS="--xla_gpu_cuda_data_dir=$CUDA_HOME"
export LD_LIBRARY_PATH=$CONDA_PREFIX/lib/python3.10/site-packages/jaxlib/cuda:$CONDA_PREFIX:/usr/local/cuda/compat:/.singularity.d/libs

cp -r $CONDA_PREFIX/nvvm $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/cuda_nvcc/nvvm

Nothing worked.
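
One caveat worth noting (speculative, not something verified here): if the Python kernel was started before those exports, it never sees them. Setting the flag inside the process, before jax is imported, sidesteps that:

```python
# Speculative sketch: set XLA's CUDA data dir inside the process itself,
# before jax is imported, in case the shell exports never reached the kernel.
import os
os.environ['XLA_FLAGS'] = f"--xla_gpu_cuda_data_dir={os.environ['CONDA_PREFIX']}"

import jax
print(jax.devices())
```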

Swapped the order of the TF and JAX tests. Now TF works, but with a warning:

successful NUMA node read from SysFS had negative value (-1)

And Jax says:

Found cuBLAS version 111001, but JAX was built against version 111103, which is newer.

rm -rf $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/cuda_nvcc/nvvm -> no change

No change. Reverting the env vars:

unset XLA_FLAGS
export LD_LIBRARY_PATH=$CONDA_PREFIX:/usr/local/cuda/compat:/.singularity.d/libs

conda list cublas

# packages in environment at /data/xchem-fragalysis/mferla/waconda/envs/cuda118redux:
#
# Name                    Version                   Build  Channel
libcublas                 11.11.3.6                     0    nvidia/label/cuda-11.8.0
libcublas-dev             11.11.3.6                     0    nvidia/label/cuda-11.8.0
nvidia-cublas-cu11        2022.4.8                 pypi_0    pypi
nvidia-cublas-cu117       11.10.1.25               pypi_0    pypi
nvidia-cublas-cu12        12.3.4.1                 pypi_0    pypi

pip uninstall nvidia-cublas-cu117 nvidia-cublas-cu11 nvidia-cublas-cu12

This reverted to the missing-path error, now for TF as well as JAX.

Doing it the sledgehammer way: rm -rf $CONDA_PREFIX/lib/python3.10/site-packages/nvidia

It works.
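
A less destructive variant of the same idea is to enumerate the pip-installed nvidia-* wheels (which carry their own CUDA libraries under site-packages/nvidia and shadow the conda ones) and uninstall them explicitly. A sketch, assuming pip is on the PATH:

```python
# Hedged sketch: list (and optionally remove) pip-installed nvidia-* wheels
# that shadow the conda-installed CUDA 11.8 libraries.
import json, subprocess

pkgs = json.loads(subprocess.check_output(['pip', 'list', '--format', 'json']))
strays = [p['name'] for p in pkgs if p['name'].startswith('nvidia-')]
print('pip-installed NVIDIA wheels:', strays)
# to remove them:
# subprocess.run(['pip', 'uninstall', '-y', *strays], check=True)
```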

## I have a very specific problem

In the cluster I use (not the university's one, but the HTCondor nodes, which I will call by a pseudonym: the Emerald City) there are several A100s, but they come with a stack of issues. As a result I currently install CUDA Toolkit 11.8 via conda in a Singularity container, in a world that has moved on to CUDA 12. Most likely, if you do not have root access, you can get away with cuda-compat instead, provided your system's driver is a long-term support release and not a new-feature-branch release.
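
To judge whether cuda-compat is even an option, the driver version and the highest CUDA it supports can be read via NVML. A sketch, assuming the nvidia-ml-py (pynvml) package is installed:

```python
# Hedged sketch (assumes the nvidia-ml-py package): report the kernel driver
# version and the highest CUDA driver API version it supports.
import pynvml

pynvml.nvmlInit()
print('driver version:', pynvml.nvmlSystemGetDriverVersion())
print('max CUDA supported:', pynvml.nvmlSystemGetCudaDriverVersion())  # e.g. 11060 = 11.6
pynvml.nvmlShutdown()
```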

## CentOS

The nodes run containers based on CentOS 7. Unfortunately, in 2023+ most programs require a GNU C library (glibc) newer than the one it ships, and glibc depends on the kernel headers, so it cannot simply be conda-installed. The solution is to use a Singularity container with Rocky Linux 9. (A check of the host's glibc is sketched below.)
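
Checking which glibc a node actually offers is straightforward from Python (a sketch; the ctypes call assumes a glibc-based system):

```python
# Sketch: report the running system's glibc version, e.g. 2.17 on CentOS 7,
# to see whether a given wheel or binary can run at all.
import ctypes, platform

print('platform.libc_ver():', platform.libc_ver())
libc = ctypes.CDLL('libc.so.6')
libc.gnu_get_libc_version.restype = ctypes.c_char_p
print('gnu_get_libc_version():', libc.gnu_get_libc_version().decode())
```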

A minor problem is that I do not have an x86_64 Linux machine with root access, so all my Singularity build commands have to run without root (hence --fakeroot).

# singularity build --notest --fakeroot rockyplus.sif rockyplus.def

Bootstrap: library
Library: https://library.sylabs.io
From: library/default/rockylinux:9

%post
    # Update and upgrade system packages
    dnf update  -y
    dnf upgrade -y
    dnf makecache

    # Development tools and kernel development packages
    #dnf group install -y "Development Tools"
    dnf install -y kernel-devel
    dnf install -y epel-release 'dnf-command(config-manager)'
    dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
    dnf install -y dkms
    dnf makecache
    dnf install -y expat

    # X11 and graphical interface tools
    dnf install -y libXrender 
    dnf config-manager --set-enabled crb
    dnf config-manager --enable crb
    dnf install -y xterm xorg-x11-server-Xorg xorg-x11-server-utils xorg-x11-xauth

    # Basic utilities
    dnf install -y git postfix nano cpio
    dnf install -y make autoconf automake libtool
    dnf install -y lsof zlib wget curl libvdpau
    dnf install -y yum-utils glibc-common pciutils

    # Science-related packages
    dnf install -y openbabel

    # Infiniband
    dnf install -y libibverbs-devel
  
    # Fortran and C++ compilers
    dnf install -y gcc-gfortran gcc-c++

    # Additional configuration tools
    dnf install -y kernel-devel-$(uname -r) || dnf install -y kernel-devel
    dnf install -y kernel-headers-$(uname -r) || dnf install -y kernel-headers

    # NVIDIA Container Toolkit
    curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | tee /etc/yum.repos.d/nvidia-container-toolkit.repo
    dnf install -y nvidia-container-toolkit

    # Additional configuration
    echo 'export PS1="[\u@\h \W]\$"' >> /etc/bashrc
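
Driving the build and a first GPU smoke test from Python looks roughly like this (a sketch; --fakeroot assumes subuid/subgid mappings are configured for your user):

```python
# Hedged sketch: build the image without root and smoke-test GPU visibility.
# File names match the def file above; --fakeroot needs subuid mappings.
import subprocess

subprocess.run(['singularity', 'build', '--notest', '--fakeroot',
                'rockyplus.sif', 'rockyplus.def'], check=True)
subprocess.run(['singularity', 'exec', '--nv', 'rockyplus.sif', 'nvidia-smi'],
               check=True)
```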

But things get weird with Nvidia... whereas I can install the toolkit this way:

%post
   dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
   dnf clean all   
   dnf install -y nvidia-container-toolkit
   # same as curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | tee /etc/yum.repos.d/nvidia-container-toolkit.repo
 
   dnf -y module install nvidia-driver:latest-dkms
   dnf install -y cuda

This fails:

%post
   # update but not update release
   dnf update -y --security --bugfix

   # make repo EPEL available (for dkms)
   dnf install -y epel-release 'dnf-command(config-manager)'
   
   # https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#rhel-9-rocky-9
   dnf install -y dkms libvdpau
   dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
   rpm --erase gpg-pubkey-7fa2af80* || echo 'No package anyway'
   #curl -sSL https://developer.download.nvidia.com/compute/cuda/repos/fedora32/x86_64/D42D0685.pub | gpg --import
   dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
   dnf clean expire-cache
   echo 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
   #dnf module list nvidia-driver
   #dnf module install -y nvidia-driver:535
   #dnf module install -y nvidia-headless-535 nvidia-utils-535
   # https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#additional-package-manager-capabilities
   dnf install      -y cuda-12-2 cuda-tools-12-2 cuda-toolkit-12-2 cuda-compiler-12-2 cuda-libraries-12-2
   # removed: nvidia-gds cuda-libraries-dev-12-2 

   # make CUDA libraries available
   # files are placed in /etc/ld.so.conf.d/ by dnf, just need to put it to cache
   ldconfig
   echo 'export PATH=/usr/local/cuda-12.2/bin:$PATH' >> $APPTAINER_ENVIRONMENT
   echo "export CUDA_HOME=/usr/local/cuda-12.2" >> $APPTAINER_ENVIRONMENT
   echo "export CUDA_INSTALL_PATH=/usr/local/cuda-12.2" >> $APPTAINER_ENVIRONMENT
   # https://stackoverflow.com/a/45319156/4625475
   # ??
 
    # third-party libraries as recommended by Nvidia
    #dnf install -y freeglut-devel libX11-devel libXi-devel libXmu-devel mesa-libGL mesa-libGLU-devel freeimage-devel glfw-devel
   
    # --- clean up
    dnf clean all
    rm -rf /var/cache/dnf

Insert tests with precompiled driver containers

## Driver

The next problem is that the Nvidia driver is R510, a new-feature-branch release, not a long-term support one, so cuda-compat does not work with it.

The next problem along is that the CUDA in /usr/local/cuda-11.6 does not match the driver:

kernel version 510.85.2 does not match DSO version 510.108.3 -- cannot find working devices in this configuration

Cf. ls /usr/local/cuda-11.6/targets/x86_64-linux/lib -lath
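
The mismatch can be confirmed from inside the container by comparing the kernel module's version with whichever libcuda DSO actually gets loaded (a sketch; paths and sonames assume a standard driver install):

```python
# Hedged sketch: compare the kernel module's driver version with the version
# reported by whichever libcuda.so.1 the process loads.
import ctypes, pathlib

print(pathlib.Path('/proc/driver/nvidia/version').read_text().splitlines()[0])

libcuda = ctypes.CDLL('libcuda.so.1')
version = ctypes.c_int()
libcuda.cuInit(0)                                   # may fail on a mismatch
libcuda.cuDriverGetVersion(ctypes.byref(version))
print('CUDA driver API version:', version.value)    # e.g. 11060 for 11.6
```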

Test script:

import warnings, sys, os
print('sys.version_info', sys.version_info)

def test_tf():
    import tensorflow as tf
    print(tf.__version__)
    assert tf.test.is_built_with_cuda(), 'TF not CUDA build'
    print("CUDA version:", tf.sysconfig.get_build_info()["cuda_version"])
    print("cuDNN version:", tf.sysconfig.get_build_info()["cudnn_version"])
    print("CUDA library paths:", tf.sysconfig.get_lib())
    assert tf.config.list_physical_devices('GPU'), 'TF: no CUDA devices'
    print("tf.config.list_physical_devices('GPU')", tf.config.list_physical_devices('GPU'))
    a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
    b = tf.constant([[1.0, 1.0], [0.0, 1.0]])
    c = tf.matmul(a, b)
    print(c)

def test_jax():
    from jax.lib import xla_bridge
    assert xla_bridge.get_backend().platform != 'cpu', 'Jax XLA: not CUDA'
    import jax.numpy as jnp
    from jax import random
    key = random.PRNGKey(0)
    x = random.normal(key, (5000, 5000), dtype=jnp.float32)
    print(jnp.dot(x, x.T))

def test_torch():
    import torch
    assert torch.cuda.is_available(), 'Torch no CUDA available'
    assert torch.cuda.device_count(), 'Torch no CUDA devices counted'
    print(torch.cuda.device_count(), torch.cuda.get_device_name(0))
    print(f'Using CuDNN: {torch.backends.cudnn.enabled} ({torch.backends.cudnn.version()})')
    device = torch.device("cuda")
    # Create a random tensor and transfer it to the GPU
    x = torch.rand(5, 3).to(device)
    print("A random tensor:", x)
    y = x * x
    print("After calculation:", y)
    print("Calculated on:", y.device)

def test_openmm():
    import openmm as mm
    print(mm.Platform.getPluginLoadFailures())
    print(mm.Platform.getPlatformByName('CUDA').supportsKernels('CUDA'))
    print(mm.Platform.findPlatform('CUDA'))

if __name__ == '__main__':
    for fun in (test_jax, test_tf, test_torch, test_openmm):
        print(f'\n\n{fun.__name__}')
        try:
            fun()
        except Exception as error:
            print(error.__class__.__name__, error)