Skip to content

Instantly share code, notes, and snippets.

@datduong
Last active July 2, 2018 06:34
Show Gist options
  • Save datduong/62eed65a7daba4f4140579ae3d5352e5 to your computer and use it in GitHub Desktop.
Save datduong/62eed65a7daba4f4140579ae3d5352e5 to your computer and use it in GitHub Desktop.
ErrorCudaTorchDocker

To start a Docker container with Torch, I use:

nvidia-docker run -e TERM=dumb -i -t kaixhin/torch 

Running a main script like main.lua gives an error saying that cutorch is not found. Part of the error message is shown below:

Training a SPEN on the Data using tag_cmd.sh
tag_cmd.sh: 5: [: 0: unexpected operator
-gradient_clip 1.0 -optim_method adam -evaluation_frequency 25 -save_frequency 25 -adam_epsilon 1e-8 -batches_per_epoch 100 -learning_rate_decay 0.0 -learning_rate 0.001 -num_epochs 30 -training_mode pretrain_unaries
running in tag-runs/Mon_Jul__2_01:41:55_UTC_2018
{
  cudnn : 1
  inference_rtol : 1e-05
  profile : 0
  batch_size : 10
  icnn : 0
  problem : "SequenceTagging"
  init_line_search_step : 1
  inference_learning_rate : 0.1
  inference_learning_rate_power : 1
  init_classifier : ""
  test_list : "./data/sequence/crf-data.test.list"
  inference_learning_rate_decay : 0
  train_list : "./data/sequence/crf-data.train.list"
  max_inference_iters : 20
  evaluate_classifier_only : 0
  gpuid : 0
  training_configs : "tag-runs/Mon_Jul__2_01:41:55_UTC_2018/train-config"
  results_file : ""
  first_iter_to_penalize_convergence : 10
  continuous_outputs : 0
  penalize_all_iterates : 0
  mirror_descent : 1
  out_dir : "tag-runs/Mon_Jul__2_01:41:55_UTC_2018/results"
  line_search : 1
  model_file : "tag-runs/Mon_Jul__2_01:41:55_UTC_2018/model-"
  evaluate_spen_only : 0
  init_full_net : ""
  entropy_weight : 1
  unconstrained_iterates : 1
  problem_config : "tag-runs/Mon_Jul__2_01:41:55_UTC_2018/problem-config"
  convergence_regularization_weight : 0
  training_method : "E2E"
  inference_momentum : 0.5
  init_at_local_prediction : 0
  shuffle : 1
  learn_inference_hyperparams : 1
  first_iter_to_apply_loss : 10
}
USING GPU 0
/root/torch/install/bin/luajit: /root/torch/install/share/lua/5.1/trepl/init.lua:389: module 'cutorch' not found:No LuaRocks module found for cutorch
        no field package.preload['cutorch']
        no file '/root/.luarocks/share/lua/5.1/cutorch.lua'
        no file '/root/.luarocks/share/lua/5.1/cutorch/init.lua'
        no file '/root/torch/install/share/lua/5.1/cutorch.lua'
        no file '/root/torch/install/share/lua/5.1/cutorch/init.lua'
        no file './cutorch.lua'
        no file '/root/torch/install/share/luajit-2.1.0-beta1/cutorch.lua'
        no file '/usr/local/share/lua/5.1/cutorch.lua'
        no file '/usr/local/share/lua/5.1/cutorch/init.lua'
        no file 'util/cutorch.lua'
        no file 'optimize/cutorch.lua'
        no file '../torch-util/cutorch.lua'
        no file 'evaluate/cutorch.lua'
        no file 'batch/cutorch.lua'
        no file 'train/cutorch.lua'
        no file 'infer1d/cutorch.lua'
        no file '../cutorch.lua'
        no file 'model/cutorch.lua'
        no file 'infer/cutorch.lua'
        no file 'flags/cutorch.lua'
        no file '/root/.luarocks/lib/lua/5.1/cutorch.so'
        no file '/root/torch/install/lib/lua/5.1/cutorch.so'
        no file '/root/torch/install/lib/cutorch.so'
        no file './cutorch.so'
        no file '/usr/local/lib/lua/5.1/cutorch.so'
        no file '/usr/local/lib/lua/5.1/loadall.so'
stack traceback:
        [C]: in function 'error'
        /root/torch/install/share/lua/5.1/trepl/init.lua:389: in function 'require'
        main.lua:21: in main chunk
        [C]: in function 'dofile'
        /root/torch/install/lib/luarocks/rocks/trepl/scm-1/bin/th:150: in main chunk
        [C]: at 0x00406670

So, I tried to install cutorch. I set CUDA_TOOLKIT_ROOT_DIR to the path found on nlp7.

root@d752fba5c0f8:~# echo $CUDA_TOOLKIT_ROOT_DIR
/usr/local/cuda-8.0

root@d752fba5c0f8:~# luarocks install cutorch

Installing https://raw.githubusercontent.com/torch/rocks/master/cutorch-scm-1.rockspec...
Using https://raw.githubusercontent.com/torch/rocks/master/cutorch-scm-1.rockspec... switching to 'build' mode
Cloning into 'cutorch'...
remote: Counting objects: 229, done.
remote: Compressing objects: 100% (184/184), done.
remote: Total 229 (delta 62), reused 90 (delta 43), pack-reused 0
Receiving objects: 100% (229/229), 241.83 KiB | 0 bytes/s, done.
Resolving deltas: 100% (62/62), done.
Checking connectivity... done.
Warning: unmatched variable LUALIB

jopts=$(getconf _NPROCESSORS_CONF)

echo "Building on $jopts cores"
cmake -E make_directory build && cd build && cmake .. -DLUALIB= -DLUA_INCDIR=/root/torch/install/include -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/root/torch/install/bin/.." -DCMAKE_INSTALL_PREFIX="/root/torch/install/lib/luarocks/rocks/cutorch/scm-1" && make -j$jopts install

Building on 40 cores
-- The C compiler identification is GNU 4.8.4
-- The CXX compiler identification is GNU 4.8.4
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Found Torch7 in /root/torch/install
CMake Error at /usr/share/cmake-2.8/Modules/FindCUDA.cmake:548 (message):
  Specify CUDA_TOOLKIT_ROOT_DIR
Call Stack (most recent call first):
  CMakeLists.txt:7 (FIND_PACKAGE)


-- Configuring incomplete, errors occurred!
See also "/tmp/luarocks_cutorch-scm-1-7692/cutorch/build/CMakeFiles/CMakeOutput.log".

Error: Build error: Failed building.

Next, I tried installing a newer CUDA toolkit, but that failed.

root@d752fba5c0f8:~# sudo dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64

(Reading database ... 71358 files and directories currently installed.)
Preparing to unpack cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 ...
Unpacking cuda-repo-ubuntu1604-9-2-local (9.2.88-1) over (9.2.88-1) ...
Setting up cuda-repo-ubuntu1604-9-2-local (9.2.88-1) ...
root@d752fba5c0f8:~# sudo apt-get install cuda
Reading package lists... Done
Building dependency tree
Reading state information... Done
Some packages could not be installed. This may mean that you have
requested an impossible situation or if you are using the unstable
distribution that some required packages have not yet been created
or been moved out of Incoming.
The following information may help to resolve the situation:

The following packages have unmet dependencies:
 cuda : Depends: cuda-9-2 (>= 9.2.88) but it is not going to be installed
E: Unable to correct problems, you have held broken packages.

However, an older CUDA toolkit can be installed via sudo apt install nvidia-cuda-toolkit. But then I get an error during luarocks install cutorch saying that the CUDA toolkit is too old (see below).

root@d752fba5c0f8:~# sudo apt install nvidia-cuda-toolkit ### this statement runs fine. 

root@d752fba5c0f8:~# luarocks install cutorch

Installing https://raw.githubusercontent.com/torch/rocks/master/cutorch-scm-1.rockspec...
Using https://raw.githubusercontent.com/torch/rocks/master/cutorch-scm-1.rockspec... switching to 'build' mode
Cloning into 'cutorch'...
remote: Counting objects: 229, done.
remote: Compressing objects: 100% (184/184), done.
remote: Total 229 (delta 62), reused 90 (delta 43), pack-reused 0
Receiving objects: 100% (229/229), 241.83 KiB | 0 bytes/s, done.
Resolving deltas: 100% (62/62), done.
Checking connectivity... done.
Warning: unmatched variable LUALIB

jopts=$(getconf _NPROCESSORS_CONF)

echo "Building on $jopts cores"
cmake -E make_directory build && cd build && cmake .. -DLUALIB= -DLUA_INCDIR=/root/torch/install/include -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/root/torch/install/bin/.." -DCMAKE_INSTALL_PREFIX="/root/torch/install/lib/luarocks/rocks/cutorch/scm-1" && make -j$jopts install

Building on 40 cores
-- The C compiler identification is GNU 4.8.4
-- The CXX compiler identification is GNU 4.8.4
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Found Torch7 in /root/torch/install
CMake Error at /usr/share/cmake-2.8/Modules/FindPackageHandleStandardArgs.cmake:108 (message):
  Could NOT find CUDA: Found unsuitable version "5.5", but required is at
  least "6.5" (found /usr)
Call Stack (most recent call first):
  /usr/share/cmake-2.8/Modules/FindPackageHandleStandardArgs.cmake:313 (_FPHSA_FAILURE_MESSAGE)
  /usr/share/cmake-2.8/Modules/FindCUDA.cmake:806 (find_package_handle_standard_args)
  CMakeLists.txt:7 (FIND_PACKAGE)


-- Configuring incomplete, errors occurred!
See also "/tmp/luarocks_cutorch-scm-1-9531/cutorch/build/CMakeFiles/CMakeOutput.log".
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment