Skip to content

Instantly share code, notes, and snippets.

@alucard001
Created July 22, 2023 10:28
Show Gist options
  • Save alucard001/ed115328a82865961d020d46387cfd47 to your computer and use it in GitHub Desktop.
llama.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"mount_file_id": "16eiLkNtIBMiShmr5fm73TVz62HkMcStE",
"authorship_tag": "ABX9TyPxXP4j+tZj+/7gU6TQj2lR",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/alucard001/ed115328a82865961d020d46387cfd47/llama.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rXtDrQpRKPM3"
},
"outputs": [],
"source": [
"# One-time environment setup, commented out after the first run.\n",
"# NOTE(review): prefer `%pip install` over `!pip install` in notebooks so the\n",
"# install targets the running kernel's environment, and pin versions for\n",
"# reproducibility (e.g. `%pip install -q torch==2.0.1`).\n",
"# !pip install torch torchvision torchaudio\n",
"# !pip install -r drive/MyDrive/llama/requirements.txt"
]
},
{
"cell_type": "code",
"source": [
"import torch\n",
"\n",
"# Report the PyTorch build and the CUDA runtime status of this session.\n",
"print(\"Pytorch version: \", torch.__version__)\n",
"print(\"which cuda was your PyTorch built?:\", torch.version.cuda)\n",
"print(\"Is torch.cuda available?\", torch.cuda.is_available())\n",
"# Guard: torch.cuda.get_device_name(0) raises RuntimeError when no CUDA\n",
"# device is present, so only query it after confirming availability --\n",
"# this keeps the cell runnable on a CPU-only (non-GPU) Colab runtime.\n",
"if torch.cuda.is_available():\n",
"    print(\"torch CUDA device name: \", torch.cuda.get_device_name(0))\n",
"else:\n",
"    print(\"torch CUDA device name: <no CUDA device available>\")"
],
"metadata": {
"id": "yGLpIrxPZLO7",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "18488a85-445f-4d16-91cf-a4e3dd4437cc"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Pytorch version: 2.0.1+cu118\n",
"which cuda was your PyTorch built?: 11.8\n",
"Is torch.cuda available? True\n",
"torch CUDA device name: Tesla T4\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Run single-process Llama 2 7B text completion via torchrun.\n",
"# NOTE(review): the recorded output below ends with exitcode -9 (SIGKILL).\n",
"# On Colab this typically means the host's OOM killer terminated the process\n",
"# while loading the checkpoint -- a 7B model needs on the order of 14 GB of\n",
"# RAM just for fp16 weights; confirm the runtime has enough memory before\n",
"# re-running (TODO confirm: no error_file was produced, so the cause is\n",
"# inferred from the signal, not from an explicit OOM message).\n",
"!torchrun --nproc_per_node 1 example_text_completion.py \\\n",
" --ckpt_dir drive/MyDrive/llama/llama-2-7b/ \\\n",
" --tokenizer_path tokenizer.model \\\n",
" --max_seq_len 128 --max_batch_size 4"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5-BGd2HmbHS_",
"outputId": "828f5d9b-b03d-4261-e428-20518afa9ddc"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"> initializing model parallel with size 1\n",
"> initializing ddp with size 1\n",
"> initializing pipeline with size 1\n",
"ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 0 (pid: 1950) of binary: /usr/bin/python3\n",
"Traceback (most recent call last):\n",
" File \"/usr/local/bin/torchrun\", line 8, in <module>\n",
" sys.exit(main())\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 346, in wrapper\n",
" return f(*args, **kwargs)\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py\", line 794, in main\n",
" run(args)\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py\", line 785, in run\n",
" elastic_launch(\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py\", line 134, in __call__\n",
" return launch_agent(self._config, self._entrypoint, list(args))\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py\", line 250, in launch_agent\n",
" raise ChildFailedError(\n",
"torch.distributed.elastic.multiprocessing.errors.ChildFailedError: \n",
"=====================================================\n",
"example_text_completion.py FAILED\n",
"-----------------------------------------------------\n",
"Failures:\n",
" <NO_OTHER_FAILURES>\n",
"-----------------------------------------------------\n",
"Root Cause (first observed failure):\n",
"[0]:\n",
" time : 2023-07-22_10:24:43\n",
" host : f01c16657d58\n",
" rank : 0 (local_rank: 0)\n",
" exitcode : -9 (pid: 1950)\n",
" error_file: <N/A>\n",
" traceback : Signal 9 (SIGKILL) received by PID 1950\n",
"=====================================================\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "LLcsZiN0bTQ1"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment