Created
August 19, 2019 12:48
-
-
Save pat-coady/e4fd0773abe8f15b58b3624b1f39d3ea to your computer and use it in GitHub Desktop.
Train Custom TF Model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Train Custom TensorFlow Model\n", | |
"\n", | |
"see:\n", | |
"https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from azureml.core.runconfig import RunConfiguration\n", | |
"from azureml.core.conda_dependencies import CondaDependencies\n", | |
"from azureml.core import Workspace\n", | |
"from azureml.core.compute import ComputeTarget\n", | |
"from azureml.core.compute import AmlCompute\n", | |
"from azureml.core.compute_target import ComputeTargetException\n", | |
"from azureml.core.runconfig import DEFAULT_CPU_IMAGE\n", | |
"from azureml.core.runconfig import DEFAULT_GPU_IMAGE\n", | |
"from azureml.core import Experiment\n", | |
"from azureml.core import ScriptRunConfig\n", | |
"import os " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Initial Setup\n", | |
"\n", | |
"Install CLI: `brew update && brew install azure-cli`\n", | |
"\n", | |
"First use: `az login` (opens browser window for ADI SSO)\n", | |
"\n", | |
"Create resource group: `az group create --name myResourceGroup --location eastus`\n", | |
"\n", | |
    "Create ML workspace: `az ml workspace create -w MyWorkspace -g MyResourceGroup`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# grabbed config.json file from Azure web portal\n", | |
"# TODO - lookup method to use CLI or Python SDK method to get this file\n", | |
"\n", | |
"ws = Workspace.from_config(path=\"./.azureml/config.json\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[{'name': 'Standard_DS1_v2', 'vCPUs': 1, 'memoryGB': 3.5}, {'name': 'Standard_DS2_v2', 'vCPUs': 2, 'memoryGB': 7.0}, {'name': 'Standard_DS3_v2', 'vCPUs': 4, 'memoryGB': 14.0}, {'name': 'Standard_DS4_v2', 'vCPUs': 8, 'memoryGB': 28.0}, {'name': 'Standard_DS5_v2', 'vCPUs': 16, 'memoryGB': 56.0}, {'name': 'Standard_DS11_v2', 'vCPUs': 2, 'memoryGB': 14.0}, {'name': 'Standard_DS12_v2', 'vCPUs': 4, 'memoryGB': 28.0}, {'name': 'Standard_DS13_v2', 'vCPUs': 8, 'memoryGB': 56.0}, {'name': 'Standard_DS14_v2', 'vCPUs': 16, 'memoryGB': 112.0}, {'name': 'Standard_DS15_v2', 'vCPUs': 20, 'memoryGB': 140.0}, {'name': 'Standard_D1_v2', 'vCPUs': 1, 'memoryGB': 3.5}, {'name': 'Standard_D2_v2', 'vCPUs': 2, 'memoryGB': 7.0}, {'name': 'Standard_D3_v2', 'vCPUs': 4, 'memoryGB': 14.0}, {'name': 'Standard_D4_v2', 'vCPUs': 8, 'memoryGB': 28.0}, {'name': 'Standard_D11_v2', 'vCPUs': 2, 'memoryGB': 14.0}, {'name': 'Standard_D12_v2', 'vCPUs': 4, 'memoryGB': 28.0}, {'name': 'Standard_D13_v2', 'vCPUs': 8, 'memoryGB': 56.0}, {'name': 'Standard_D14_v2', 'vCPUs': 16, 'memoryGB': 112.0}, {'name': 'Standard_D1', 'vCPUs': 1, 'memoryGB': 3.5}, {'name': 'Standard_D2', 'vCPUs': 2, 'memoryGB': 7.0}, {'name': 'Standard_D3', 'vCPUs': 4, 'memoryGB': 14.0}, {'name': 'Standard_D4', 'vCPUs': 8, 'memoryGB': 28.0}, {'name': 'Standard_D11', 'vCPUs': 2, 'memoryGB': 14.0}, {'name': 'Standard_D12', 'vCPUs': 4, 'memoryGB': 28.0}, {'name': 'Standard_D13', 'vCPUs': 8, 'memoryGB': 56.0}, {'name': 'Standard_D14', 'vCPUs': 16, 'memoryGB': 112.0}, {'name': 'Standard_NV6', 'vCPUs': 6, 'memoryGB': 56.0}, {'name': 'Standard_NV12', 'vCPUs': 12, 'memoryGB': 112.0}, {'name': 'Standard_NV24', 'vCPUs': 24, 'memoryGB': 224.0}, {'name': 'Standard_NC6s_v2', 'vCPUs': 6, 'memoryGB': 112.0}, {'name': 'Standard_NC12s_v2', 'vCPUs': 12, 'memoryGB': 224.0}, {'name': 'Standard_NC24rs_v2', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_NC24s_v2', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_F2s_v2', 'vCPUs': 2, 'memoryGB': 4.0}, 
{'name': 'Standard_F4s_v2', 'vCPUs': 4, 'memoryGB': 8.0}, {'name': 'Standard_F8s_v2', 'vCPUs': 8, 'memoryGB': 16.0}, {'name': 'Standard_F16s_v2', 'vCPUs': 16, 'memoryGB': 32.0}, {'name': 'Standard_F32s_v2', 'vCPUs': 32, 'memoryGB': 64.0}, {'name': 'Standard_F64s_v2', 'vCPUs': 64, 'memoryGB': 128.0}, {'name': 'Standard_F72s_v2', 'vCPUs': 72, 'memoryGB': 144.0}, {'name': 'Standard_NC6', 'vCPUs': 6, 'memoryGB': 56.0}, {'name': 'Standard_NC12', 'vCPUs': 12, 'memoryGB': 112.0}, {'name': 'Standard_NC24', 'vCPUs': 24, 'memoryGB': 224.0}, {'name': 'Standard_NC24r', 'vCPUs': 24, 'memoryGB': 224.0}, {'name': 'Standard_ND6s', 'vCPUs': 6, 'memoryGB': 112.0}, {'name': 'Standard_ND12s', 'vCPUs': 12, 'memoryGB': 224.0}, {'name': 'Standard_ND24rs', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_ND24s', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_NC6s_v3', 'vCPUs': 6, 'memoryGB': 112.0}, {'name': 'Standard_NC12s_v3', 'vCPUs': 12, 'memoryGB': 224.0}, {'name': 'Standard_NC24rs_v3', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_NC24s_v3', 'vCPUs': 24, 'memoryGB': 448.0}]\n" | |
] | |
} | |
], | |
"source": [ | |
"# List VM families for Azure Machine Learning Compute\n", | |
"# Likely you'll have to request quota increase to use GPUs\n", | |
"# TODO - Can Microsoft bump up quotas for everyone at once, instead of one-at-a-time?\n", | |
"\n", | |
"print(AmlCompute.supported_vmsizes(workspace=ws))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"USE_GPU = True" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Option A: Run-based Compute\n", | |
"You can create Azure Machine Learning Compute as a compute target at run time. The compute is automatically created for your run. The compute is deleted automatically once the run completes. \n", | |
" \n", | |
"**Note from Azure docs:** *Run-based creation of Azure Machine Learning compute is currently in Preview. Don't use run-based creation if you use automated hyperparameter tuning or automated machine learning. To use hyperparameter tuning or automated machine learning, create a persistent compute target instead.*" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create a new runconfig object\n", | |
"run_compute = RunConfiguration()\n", | |
"\n", | |
"# Signal that you want to use AmlCompute to execute the script\n", | |
"run_compute.target = \"amlcompute\"\n", | |
"\n", | |
"# AmlCompute is created in the same region as your workspace\n", | |
"# Set the VM size for AmlCompute from the list of supported_vmsizes\n", | |
"# GPU-base has CUDA and CUDNN pre-installed\n", | |
"# supports conda_packages also\n", | |
"if USE_GPU:\n", | |
" run_compute.amlcompute.vm_size = 'Standard_NC6s_v2'\n", | |
" run_compute.environment.docker.base_image = DEFAULT_GPU_IMAGE\n", | |
" run_compute.environment.python.conda_dependencies = CondaDependencies.create(\n", | |
" pip_packages=['tensorflow-gpu==2.0.0-beta1', 'tensorflow-datasets'])\n", | |
"else:\n", | |
" run_compute.amlcompute.vm_size = 'Standard_DS4_v2'\n", | |
" run_compute.environment.docker.base_image = DEFAULT_CPU_IMAGE\n", | |
" run_compute.environment.python.conda_dependencies = CondaDependencies.create(\n", | |
" pip_packages=['tensorflow==2.0.0-beta1', 'tensorflow-datasets'])\n", | |
"\n", | |
"# Enable Docker\n", | |
"run_compute.environment.docker.enabled = True" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Option B: Persistent Compute\n", | |
"A persistent Azure Machine Learning Compute can be reused across jobs. The compute can be shared with other users in the workspace and is kept between jobs." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Choose a name for your cluster\n", | |
"cluster_name = \"my_cluster\"\n", | |
"\n", | |
"# Verify that cluster does not exist already\n", | |
"try:\n", | |
" cluster = ComputeTarget(workspace=ws, name=cluster_name)\n", | |
" print('Found existing cluster, use it.')\n", | |
"except ComputeTargetException:\n", | |
" if GPU:\n", | |
" vm_size = 'Standard_NC6s_v2'\n", | |
" else:\n", | |
" vm_size = 'Standard_DS4_v2'\n", | |
" compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size,\n", | |
" min_nodes=1,\n", | |
" max_nodes=4,\n", | |
" admin_username='pcoady',\n", | |
" admin_user_password='abc123')\n", | |
" cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n", | |
"\n", | |
"cluster.wait_for_completion(show_output=True)\n", | |
"# TODO - What is best practice for authentication?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cluster.list_nodes()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create a new runconfig object\n", | |
"persist_compute = RunConfiguration()\n", | |
"\n", | |
"# Use the cpu_cluster you created above. \n", | |
"persist_compute.target = cluster\n", | |
"\n", | |
"# Enable Docker\n", | |
"persist_compute.environment.docker.enabled = True\n", | |
"\n", | |
"# GPU-base has CUDA and CUDNN pre-installed\n", | |
"# Supports conda_packages also\n", | |
"if USE_GPU:\n", | |
" persist_compute.environment.docker.base_image = DEFAULT_GPU_IMAGE\n", | |
" persist_compute.environment.python.conda_dependencies = CondaDependencies.create(\n", | |
" pip_packages=['tensorflow-gpu==2.0.0-beta1', 'tensorflow-datasets'])\n", | |
"else:\n", | |
" persist_compute.environment.docker.base_image = DEFAULT_CPU_IMAGE\n", | |
" persist_compute.environment.python.conda_dependencies = CondaDependencies.create(\n", | |
" pip_packages=['tensorflow==2.0.0-beta1', 'tensorflow-datasets'])\n", | |
"\n", | |
"# Use conda_dependencies.yml to create a conda environment in the Docker image for execution\n", | |
"persist_compute.environment.python.user_managed_dependencies = False\n", | |
"\n", | |
"# Auto-prepare the Docker image when used for execution (if it is not already prepared)\n", | |
"# TODO: What does this do? Delete?\n", | |
"persist_compute.auto_prepare_environment = True\n", | |
"\n", | |
"# Specify CondaDependencies obj, add necessary packages\n", | |
"persist_compute.environment.python.conda_dependencies = CondaDependencies.create(\n", | |
" pip_packages=['tensorflow==2.0.0-beta1', 'tensorflow-datasets'])\n", | |
"\n", | |
"# TODO - What does WARNING below mean?\n", | |
"# TODO - How do I \"see\" the Docker environment created above above?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Take cluster down\n", | |
"cluster.delete()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Create an Experiment and Run" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"experiment_name = 'my_experiment'\n", | |
"exp = Experiment(workspace=ws, name=experiment_name)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"RunId: my_experiment_1566217437_c6c71141\n", | |
"Web View: https://mlworkspace.azure.ai/portal/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourceGroups/MyResourceGroup/providers/Microsoft.MachineLearningServices/workspaces/MyWorkspace/experiments/my_experiment/runs/my_experiment_1566217437_c6c71141\n", | |
"\n", | |
"Streaming azureml-logs/20_image_build_log.txt\n", | |
"=============================================\n", | |
"\n", | |
"2019/08/19 12:24:04 Downloading source code...\n", | |
"2019/08/19 12:24:05 Finished downloading source code\n", | |
"2019/08/19 12:24:06 Using acb_vol_10cd6c7f-bb53-453b-8f54-eac0665a5c8c as the home volume\n", | |
"2019/08/19 12:24:06 Creating Docker network: acb_default_network, driver: 'bridge'\n", | |
"2019/08/19 12:24:06 Successfully set up Docker network: acb_default_network\n", | |
"2019/08/19 12:24:06 Setting up Docker configuration...\n", | |
"2019/08/19 12:24:07 Successfully set up Docker configuration\n", | |
"2019/08/19 12:24:07 Logging in to registry: myworkspace0bc58e64.azurecr.io\n", | |
"2019/08/19 12:24:08 Successfully logged into myworkspace0bc58e64.azurecr.io\n", | |
"2019/08/19 12:24:08 Executing step ID: acb_step_0. Timeout(sec): 5400, Working directory: '', Network: 'acb_default_network'\n", | |
"2019/08/19 12:24:08 Scanning for dependencies...\n", | |
"2019/08/19 12:24:09 Successfully scanned dependencies\n", | |
"2019/08/19 12:24:09 Launching container with name: acb_step_0\n", | |
"Sending build context to Docker daemon 58.88kB\n", | |
"\n", | |
"Step 1/15 : FROM mcr.microsoft.com/azureml/base-gpu:intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04@sha256:72b4c4a403fe51e22875950cb0d740e751a4f6ff8ee1d8e7efc201d2aca3da54\n", | |
"sha256:72b4c4a403fe51e22875950cb0d740e751a4f6ff8ee1d8e7efc201d2aca3da54: Pulling from azureml/base-gpu\n", | |
"34667c7e4631: Pulling fs layer\n", | |
"d18d76a881a4: Pulling fs layer\n", | |
"119c7358fbfc: Pulling fs layer\n", | |
"2aaf13f3eff0: Pulling fs layer\n", | |
"28d5148dfcec: Pulling fs layer\n", | |
"454bc542fc4c: Pulling fs layer\n", | |
"369f77cbea49: Pulling fs layer\n", | |
"ac4ef821cc62: Pulling fs layer\n", | |
"9b9781a46f34: Pulling fs layer\n", | |
"ade089defcf2: Pulling fs layer\n", | |
"d019e3e4cfbf: Pulling fs layer\n", | |
"760466bbcaac: Pulling fs layer\n", | |
"248f2eda89bd: Pulling fs layer\n", | |
"0b3dd3eae0b6: Pulling fs layer\n", | |
"a7b7b3a17514: Pulling fs layer\n", | |
"3ee89eb8f0db: Pulling fs layer\n", | |
"d9710d2db7a8: Pulling fs layer\n", | |
"2aaf13f3eff0: Waiting\n", | |
"28d5148dfcec: Waiting\n", | |
"454bc542fc4c: Waiting\n", | |
"369f77cbea49: Waiting\n", | |
"760466bbcaac: Waiting\n", | |
"248f2eda89bd: Waiting\n", | |
"ac4ef821cc62: Waiting\n", | |
"0b3dd3eae0b6: Waiting\n", | |
"a7b7b3a17514: Waiting\n", | |
"3ee89eb8f0db: Waiting\n", | |
"d9710d2db7a8: Waiting\n", | |
"9b9781a46f34: Waiting\n", | |
"ade089defcf2: Waiting\n", | |
"d019e3e4cfbf: Waiting\n", | |
"d18d76a881a4: Verifying Checksum\n", | |
"d18d76a881a4: Download complete\n", | |
"119c7358fbfc: Verifying Checksum\n", | |
"119c7358fbfc: Download complete\n", | |
"2aaf13f3eff0: Download complete\n", | |
"\n", | |
"Execution Summary\n", | |
"=================\n", | |
"RunId: my_experiment_1566217437_c6c71141\n", | |
"Web View: https://mlworkspace.azure.ai/portal/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourceGroups/MyResourceGroup/providers/Microsoft.MachineLearningServices/workspaces/MyWorkspace/experiments/my_experiment/runs/my_experiment_1566217437_c6c71141\n" | |
] | |
}, | |
{ | |
"ename": "ActivityFailedException", | |
"evalue": "ActivityFailedException:\n\tMessage: Activity Failed:\n\"Detailed error not set on the Run. Please check the logs for details.\"\n\tInnerException None\n\tErrorResponse {\"error\": {\"message\": \"Activity Failed:\\n\\\"Detailed error not set on the Run. Please check the logs for details.\\\"\"}}", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mActivityFailedException\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-7-8a6eb9a50fe1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0msrc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mScriptRunConfig\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_directory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscript_folder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'train.py'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrun_compute\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mrun\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mexp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msubmit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mrun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshow_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;32m~/venvs/aml/lib/python3.6/site-packages/azureml/core/run.py\u001b[0m in \u001b[0;36mwait_for_completion\u001b[0;34m(self, show_output, wait_post_processing, raise_on_error)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[0mfile_handle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0mwait_post_processing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mwait_post_processing\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 532\u001b[0;31m raise_on_error=raise_on_error)\n\u001b[0m\u001b[1;32m 533\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_details\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 534\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/venvs/aml/lib/python3.6/site-packages/azureml/core/run.py\u001b[0m in \u001b[0;36m_stream_run_output\u001b[0;34m(self, file_handle, wait_post_processing, raise_on_error)\u001b[0m\n\u001b[1;32m 769\u001b[0m \u001b[0mfile_handle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 770\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 771\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mActivityFailedException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_details\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 772\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 773\u001b[0m \u001b[0mfile_handle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mActivityFailedException\u001b[0m: ActivityFailedException:\n\tMessage: Activity Failed:\n\"Detailed error not set on the Run. Please check the logs for details.\"\n\tInnerException None\n\tErrorResponse {\"error\": {\"message\": \"Activity Failed:\\n\\\"Detailed error not set on the Run. Please check the logs for details.\\\"\"}}" | |
] | |
} | |
], | |
"source": [ | |
"# run_config could also be persist_compute, or local_compute (no example for local_compute above)\n", | |
"script_folder = os.getcwd()\n", | |
"src = ScriptRunConfig(source_directory=script_folder, script='train.py', run_config=run_compute)\n", | |
"run = exp.submit(src)\n", | |
"run.wait_for_completion(show_output=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# TODO - What is best way to dry-run a training script that needs a GPU (which my local machine doesn't have)?\n", | |
"# Docker bring-up and tear-down too slow for this purpose." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# TODO - Crashes when I try to use GPU - can't figure out error from logs. (CPU runs OK)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment