@pat-coady
Created August 19, 2019 12:48
Train Custom TF Model
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train Custom TensorFlow Model\n",
"\n",
"see:\n",
"https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.core import Workspace\n",
"from azureml.core.compute import ComputeTarget\n",
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"from azureml.core.runconfig import DEFAULT_CPU_IMAGE\n",
"from azureml.core.runconfig import DEFAULT_GPU_IMAGE\n",
"from azureml.core import Experiment\n",
"from azureml.core import ScriptRunConfig\n",
"import os "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Initial Setup\n",
"\n",
"Install CLI: `brew update && brew install azure-cli`\n",
"\n",
"First use: `az login` (opens browser window for ADI SSO)\n",
"\n",
"Create resource group: `az group create --name myResourceGroup --location eastus`\n",
"\n",
"Crease ML workspace: `az ml workspace create -w MyWorkspace -g MyResourceGroup`"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# grabbed config.json file from Azure web portal\n",
"# TODO - lookup method to use CLI or Python SDK method to get this file\n",
"\n",
"ws = Workspace.from_config(path=\"./.azureml/config.json\")"
]
},
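{
"cell_type": "markdown",
"metadata": {},
"source": [
"Possible answer to the TODO above (untested sketch): fetch the existing workspace with `Workspace.get()` and cache a `config.json` locally with `write_config()`, instead of downloading it from the portal. The subscription ID below is a placeholder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: look up the workspace created with the CLI and cache its config locally\n",
"# '<my-subscription-id>' is a placeholder - substitute your own subscription ID\n",
"ws = Workspace.get(name='MyWorkspace',\n",
"                   subscription_id='<my-subscription-id>',\n",
"                   resource_group='MyResourceGroup')\n",
"\n",
"# By default this writes .azureml/config.json under the current directory,\n",
"# which Workspace.from_config() can then pick up\n",
"ws.write_config()"
]
},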
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'name': 'Standard_DS1_v2', 'vCPUs': 1, 'memoryGB': 3.5}, {'name': 'Standard_DS2_v2', 'vCPUs': 2, 'memoryGB': 7.0}, {'name': 'Standard_DS3_v2', 'vCPUs': 4, 'memoryGB': 14.0}, {'name': 'Standard_DS4_v2', 'vCPUs': 8, 'memoryGB': 28.0}, {'name': 'Standard_DS5_v2', 'vCPUs': 16, 'memoryGB': 56.0}, {'name': 'Standard_DS11_v2', 'vCPUs': 2, 'memoryGB': 14.0}, {'name': 'Standard_DS12_v2', 'vCPUs': 4, 'memoryGB': 28.0}, {'name': 'Standard_DS13_v2', 'vCPUs': 8, 'memoryGB': 56.0}, {'name': 'Standard_DS14_v2', 'vCPUs': 16, 'memoryGB': 112.0}, {'name': 'Standard_DS15_v2', 'vCPUs': 20, 'memoryGB': 140.0}, {'name': 'Standard_D1_v2', 'vCPUs': 1, 'memoryGB': 3.5}, {'name': 'Standard_D2_v2', 'vCPUs': 2, 'memoryGB': 7.0}, {'name': 'Standard_D3_v2', 'vCPUs': 4, 'memoryGB': 14.0}, {'name': 'Standard_D4_v2', 'vCPUs': 8, 'memoryGB': 28.0}, {'name': 'Standard_D11_v2', 'vCPUs': 2, 'memoryGB': 14.0}, {'name': 'Standard_D12_v2', 'vCPUs': 4, 'memoryGB': 28.0}, {'name': 'Standard_D13_v2', 'vCPUs': 8, 'memoryGB': 56.0}, {'name': 'Standard_D14_v2', 'vCPUs': 16, 'memoryGB': 112.0}, {'name': 'Standard_D1', 'vCPUs': 1, 'memoryGB': 3.5}, {'name': 'Standard_D2', 'vCPUs': 2, 'memoryGB': 7.0}, {'name': 'Standard_D3', 'vCPUs': 4, 'memoryGB': 14.0}, {'name': 'Standard_D4', 'vCPUs': 8, 'memoryGB': 28.0}, {'name': 'Standard_D11', 'vCPUs': 2, 'memoryGB': 14.0}, {'name': 'Standard_D12', 'vCPUs': 4, 'memoryGB': 28.0}, {'name': 'Standard_D13', 'vCPUs': 8, 'memoryGB': 56.0}, {'name': 'Standard_D14', 'vCPUs': 16, 'memoryGB': 112.0}, {'name': 'Standard_NV6', 'vCPUs': 6, 'memoryGB': 56.0}, {'name': 'Standard_NV12', 'vCPUs': 12, 'memoryGB': 112.0}, {'name': 'Standard_NV24', 'vCPUs': 24, 'memoryGB': 224.0}, {'name': 'Standard_NC6s_v2', 'vCPUs': 6, 'memoryGB': 112.0}, {'name': 'Standard_NC12s_v2', 'vCPUs': 12, 'memoryGB': 224.0}, {'name': 'Standard_NC24rs_v2', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_NC24s_v2', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_F2s_v2', 'vCPUs': 2, 'memoryGB': 4.0}, {'name': 'Standard_F4s_v2', 'vCPUs': 4, 'memoryGB': 8.0}, {'name': 'Standard_F8s_v2', 'vCPUs': 8, 'memoryGB': 16.0}, {'name': 'Standard_F16s_v2', 'vCPUs': 16, 'memoryGB': 32.0}, {'name': 'Standard_F32s_v2', 'vCPUs': 32, 'memoryGB': 64.0}, {'name': 'Standard_F64s_v2', 'vCPUs': 64, 'memoryGB': 128.0}, {'name': 'Standard_F72s_v2', 'vCPUs': 72, 'memoryGB': 144.0}, {'name': 'Standard_NC6', 'vCPUs': 6, 'memoryGB': 56.0}, {'name': 'Standard_NC12', 'vCPUs': 12, 'memoryGB': 112.0}, {'name': 'Standard_NC24', 'vCPUs': 24, 'memoryGB': 224.0}, {'name': 'Standard_NC24r', 'vCPUs': 24, 'memoryGB': 224.0}, {'name': 'Standard_ND6s', 'vCPUs': 6, 'memoryGB': 112.0}, {'name': 'Standard_ND12s', 'vCPUs': 12, 'memoryGB': 224.0}, {'name': 'Standard_ND24rs', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_ND24s', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_NC6s_v3', 'vCPUs': 6, 'memoryGB': 112.0}, {'name': 'Standard_NC12s_v3', 'vCPUs': 12, 'memoryGB': 224.0}, {'name': 'Standard_NC24rs_v3', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_NC24s_v3', 'vCPUs': 24, 'memoryGB': 448.0}]\n"
]
}
],
"source": [
"# List VM families for Azure Machine Learning Compute\n",
"# Likely you'll have to request quota increase to use GPUs\n",
"# TODO - Can Microsoft bump up quotas for everyone at once, instead of one-at-a-time?\n",
"\n",
"print(AmlCompute.supported_vmsizes(workspace=ws))"
]
},
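{
"cell_type": "markdown",
"metadata": {},
"source": [
"Small helper (plain post-processing of the output above): narrow the list down to the GPU families (NC/ND/NV series) before picking a `vm_size`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Filter supported_vmsizes() down to the GPU VM families (NC, ND, NV series)\n",
"vm_sizes = AmlCompute.supported_vmsizes(workspace=ws)\n",
"gpu_sizes = [vm for vm in vm_sizes\n",
"             if vm['name'].startswith(('Standard_NC', 'Standard_ND', 'Standard_NV'))]\n",
"for vm in gpu_sizes:\n",
"    print(vm['name'], '-', vm['vCPUs'], 'vCPUs,', vm['memoryGB'], 'GB RAM')"
]
},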
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"USE_GPU = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Option A: Run-based Compute\n",
"You can create Azure Machine Learning Compute as a compute target at run time. The compute is automatically created for your run. The compute is deleted automatically once the run completes. \n",
" \n",
"**Note from Azure docs:** *Run-based creation of Azure Machine Learning compute is currently in Preview. Don't use run-based creation if you use automated hyperparameter tuning or automated machine learning. To use hyperparameter tuning or automated machine learning, create a persistent compute target instead.*"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Create a new runconfig object\n",
"run_compute = RunConfiguration()\n",
"\n",
"# Signal that you want to use AmlCompute to execute the script\n",
"run_compute.target = \"amlcompute\"\n",
"\n",
"# AmlCompute is created in the same region as your workspace\n",
"# Set the VM size for AmlCompute from the list of supported_vmsizes\n",
"# GPU-base has CUDA and CUDNN pre-installed\n",
"# supports conda_packages also\n",
"if USE_GPU:\n",
" run_compute.amlcompute.vm_size = 'Standard_NC6s_v2'\n",
" run_compute.environment.docker.base_image = DEFAULT_GPU_IMAGE\n",
" run_compute.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=['tensorflow-gpu==2.0.0-beta1', 'tensorflow-datasets'])\n",
"else:\n",
" run_compute.amlcompute.vm_size = 'Standard_DS4_v2'\n",
" run_compute.environment.docker.base_image = DEFAULT_CPU_IMAGE\n",
" run_compute.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=['tensorflow==2.0.0-beta1', 'tensorflow-datasets'])\n",
"\n",
"# Enable Docker\n",
"run_compute.environment.docker.enabled = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Option B: Persistent Compute\n",
"A persistent Azure Machine Learning Compute can be reused across jobs. The compute can be shared with other users in the workspace and is kept between jobs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Choose a name for your cluster\n",
"cluster_name = \"my_cluster\"\n",
"\n",
"# Verify that cluster does not exist already\n",
"try:\n",
" cluster = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing cluster, use it.')\n",
"except ComputeTargetException:\n",
" if GPU:\n",
" vm_size = 'Standard_NC6s_v2'\n",
" else:\n",
" vm_size = 'Standard_DS4_v2'\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size,\n",
" min_nodes=1,\n",
" max_nodes=4,\n",
" admin_username='pcoady',\n",
" admin_user_password='abc123')\n",
" cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
"\n",
"cluster.wait_for_completion(show_output=True)\n",
"# TODO - What is best practice for authentication?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cluster.list_nodes()"
]
},
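{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two follow-ups on the persistent cluster (sketch): check its provisioning/node state with `get_status()`, and drop `min_nodes` to 0 with `update()` so idle nodes scale down instead of billing. The `serialize()` call on the status object is an assumption about its exact interface."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check the cluster's current provisioning and node state (sketch)\n",
"status = cluster.get_status()\n",
"print(status.serialize())  # assumption: the status object exposes serialize()\n",
"\n",
"# Allow the cluster to scale to zero nodes when idle, so it doesn't bill between jobs\n",
"cluster.update(min_nodes=0, max_nodes=4, idle_seconds_before_scaledown=1200)"
]
},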
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a new runconfig object\n",
"persist_compute = RunConfiguration()\n",
"\n",
"# Use the cpu_cluster you created above. \n",
"persist_compute.target = cluster\n",
"\n",
"# Enable Docker\n",
"persist_compute.environment.docker.enabled = True\n",
"\n",
"# GPU-base has CUDA and CUDNN pre-installed\n",
"# Supports conda_packages also\n",
"if USE_GPU:\n",
" persist_compute.environment.docker.base_image = DEFAULT_GPU_IMAGE\n",
" persist_compute.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=['tensorflow-gpu==2.0.0-beta1', 'tensorflow-datasets'])\n",
"else:\n",
" persist_compute.environment.docker.base_image = DEFAULT_CPU_IMAGE\n",
" persist_compute.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=['tensorflow==2.0.0-beta1', 'tensorflow-datasets'])\n",
"\n",
"# Use conda_dependencies.yml to create a conda environment in the Docker image for execution\n",
"persist_compute.environment.python.user_managed_dependencies = False\n",
"\n",
"# Auto-prepare the Docker image when used for execution (if it is not already prepared)\n",
"# TODO: What does this do? Delete?\n",
"persist_compute.auto_prepare_environment = True\n",
"\n",
"# Specify CondaDependencies obj, add necessary packages\n",
"persist_compute.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=['tensorflow==2.0.0-beta1', 'tensorflow-datasets'])\n",
"\n",
"# TODO - What does WARNING below mean?\n",
"# TODO - How do I \"see\" the Docker environment created above above?"
]
},
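{
"cell_type": "markdown",
"metadata": {},
"source": [
"Partial answer to the \"see the Docker environment\" TODO above (sketch): the base image name and the conda environment YAML derived from the `CondaDependencies` object can both be printed from the run config. The built Docker image itself lives in the workspace container registry (the `*.azurecr.io` registry visible in the build log further down)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect what will go into the Docker image for this run config:\n",
"# the base image plus the conda environment generated from CondaDependencies\n",
"print('Base Docker image:', persist_compute.environment.docker.base_image)\n",
"print(persist_compute.environment.python.conda_dependencies.serialize_to_string())"
]
},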
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Take cluster down\n",
"cluster.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an Experiment and Run"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"experiment_name = 'my_experiment'\n",
"exp = Experiment(workspace=ws, name=experiment_name)"
]
},
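{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sketch: list any previous runs of this experiment and their status with `Experiment.get_runs()`, which is handy when re-running the notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List previous runs of this experiment and their status\n",
"for prev_run in exp.get_runs():\n",
"    print(prev_run.id, prev_run.get_status())"
]
},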
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RunId: my_experiment_1566217437_c6c71141\n",
"Web View: https://mlworkspace.azure.ai/portal/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourceGroups/MyResourceGroup/providers/Microsoft.MachineLearningServices/workspaces/MyWorkspace/experiments/my_experiment/runs/my_experiment_1566217437_c6c71141\n",
"\n",
"Streaming azureml-logs/20_image_build_log.txt\n",
"=============================================\n",
"\n",
"2019/08/19 12:24:04 Downloading source code...\n",
"2019/08/19 12:24:05 Finished downloading source code\n",
"2019/08/19 12:24:06 Using acb_vol_10cd6c7f-bb53-453b-8f54-eac0665a5c8c as the home volume\n",
"2019/08/19 12:24:06 Creating Docker network: acb_default_network, driver: 'bridge'\n",
"2019/08/19 12:24:06 Successfully set up Docker network: acb_default_network\n",
"2019/08/19 12:24:06 Setting up Docker configuration...\n",
"2019/08/19 12:24:07 Successfully set up Docker configuration\n",
"2019/08/19 12:24:07 Logging in to registry: myworkspace0bc58e64.azurecr.io\n",
"2019/08/19 12:24:08 Successfully logged into myworkspace0bc58e64.azurecr.io\n",
"2019/08/19 12:24:08 Executing step ID: acb_step_0. Timeout(sec): 5400, Working directory: '', Network: 'acb_default_network'\n",
"2019/08/19 12:24:08 Scanning for dependencies...\n",
"2019/08/19 12:24:09 Successfully scanned dependencies\n",
"2019/08/19 12:24:09 Launching container with name: acb_step_0\n",
"Sending build context to Docker daemon 58.88kB\n",
"\n",
"Step 1/15 : FROM mcr.microsoft.com/azureml/base-gpu:intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04@sha256:72b4c4a403fe51e22875950cb0d740e751a4f6ff8ee1d8e7efc201d2aca3da54\n",
"sha256:72b4c4a403fe51e22875950cb0d740e751a4f6ff8ee1d8e7efc201d2aca3da54: Pulling from azureml/base-gpu\n",
"34667c7e4631: Pulling fs layer\n",
"d18d76a881a4: Pulling fs layer\n",
"119c7358fbfc: Pulling fs layer\n",
"2aaf13f3eff0: Pulling fs layer\n",
"28d5148dfcec: Pulling fs layer\n",
"454bc542fc4c: Pulling fs layer\n",
"369f77cbea49: Pulling fs layer\n",
"ac4ef821cc62: Pulling fs layer\n",
"9b9781a46f34: Pulling fs layer\n",
"ade089defcf2: Pulling fs layer\n",
"d019e3e4cfbf: Pulling fs layer\n",
"760466bbcaac: Pulling fs layer\n",
"248f2eda89bd: Pulling fs layer\n",
"0b3dd3eae0b6: Pulling fs layer\n",
"a7b7b3a17514: Pulling fs layer\n",
"3ee89eb8f0db: Pulling fs layer\n",
"d9710d2db7a8: Pulling fs layer\n",
"2aaf13f3eff0: Waiting\n",
"28d5148dfcec: Waiting\n",
"454bc542fc4c: Waiting\n",
"369f77cbea49: Waiting\n",
"760466bbcaac: Waiting\n",
"248f2eda89bd: Waiting\n",
"ac4ef821cc62: Waiting\n",
"0b3dd3eae0b6: Waiting\n",
"a7b7b3a17514: Waiting\n",
"3ee89eb8f0db: Waiting\n",
"d9710d2db7a8: Waiting\n",
"9b9781a46f34: Waiting\n",
"ade089defcf2: Waiting\n",
"d019e3e4cfbf: Waiting\n",
"d18d76a881a4: Verifying Checksum\n",
"d18d76a881a4: Download complete\n",
"119c7358fbfc: Verifying Checksum\n",
"119c7358fbfc: Download complete\n",
"2aaf13f3eff0: Download complete\n",
"\n",
"Execution Summary\n",
"=================\n",
"RunId: my_experiment_1566217437_c6c71141\n",
"Web View: https://mlworkspace.azure.ai/portal/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourceGroups/MyResourceGroup/providers/Microsoft.MachineLearningServices/workspaces/MyWorkspace/experiments/my_experiment/runs/my_experiment_1566217437_c6c71141\n"
]
},
{
"ename": "ActivityFailedException",
"evalue": "ActivityFailedException:\n\tMessage: Activity Failed:\n\"Detailed error not set on the Run. Please check the logs for details.\"\n\tInnerException None\n\tErrorResponse {\"error\": {\"message\": \"Activity Failed:\\n\\\"Detailed error not set on the Run. Please check the logs for details.\\\"\"}}",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mActivityFailedException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-8a6eb9a50fe1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0msrc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mScriptRunConfig\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_directory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscript_folder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'train.py'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrun_compute\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mrun\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mexp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msubmit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mrun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshow_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/venvs/aml/lib/python3.6/site-packages/azureml/core/run.py\u001b[0m in \u001b[0;36mwait_for_completion\u001b[0;34m(self, show_output, wait_post_processing, raise_on_error)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[0mfile_handle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0mwait_post_processing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mwait_post_processing\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 532\u001b[0;31m raise_on_error=raise_on_error)\n\u001b[0m\u001b[1;32m 533\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_details\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 534\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/venvs/aml/lib/python3.6/site-packages/azureml/core/run.py\u001b[0m in \u001b[0;36m_stream_run_output\u001b[0;34m(self, file_handle, wait_post_processing, raise_on_error)\u001b[0m\n\u001b[1;32m 769\u001b[0m \u001b[0mfile_handle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 770\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 771\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mActivityFailedException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_details\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 772\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 773\u001b[0m \u001b[0mfile_handle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mActivityFailedException\u001b[0m: ActivityFailedException:\n\tMessage: Activity Failed:\n\"Detailed error not set on the Run. Please check the logs for details.\"\n\tInnerException None\n\tErrorResponse {\"error\": {\"message\": \"Activity Failed:\\n\\\"Detailed error not set on the Run. Please check the logs for details.\\\"\"}}"
]
}
],
"source": [
"# run_config could also be persist_compute, or local_compute (no example for local_compute above)\n",
"script_folder = os.getcwd()\n",
"src = ScriptRunConfig(source_directory=script_folder, script='train.py', run_config=run_compute)\n",
"run = exp.submit(src)\n",
"run.wait_for_completion(show_output=True)"
]
},
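{
"cell_type": "markdown",
"metadata": {},
"source": [
"When a run fails like the one above, the driver and image-build logs usually contain the real error. A sketch for pulling them down with the Run API (`get_file_names()` / `download_file()`); the exact log file names are whatever the run actually produced."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List the files the failed run produced, then download the azureml-logs/ files\n",
"# locally so the full stack trace can be read (the Web View link above works too)\n",
"log_dir = 'failed_run_logs'\n",
"os.makedirs(log_dir, exist_ok=True)\n",
"for name in run.get_file_names():\n",
"    print(name)\n",
"    if name.startswith('azureml-logs/'):\n",
"        run.download_file(name, output_file_path=os.path.join(log_dir, name.split('/')[-1]))"
]
},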
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO - What is best way to dry-run a training script that needs a GPU (which my local machine doesn't have)?\n",
"# Docker bring-up and tear-down too slow for this purpose."
]
},
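{
"cell_type": "markdown",
"metadata": {},
"source": [
"One partial option for the dry-run question above (sketch): a local run config that executes `train.py` in the current Python environment on this machine. It skips the Docker bring-up, but it won't exercise the GPU code path. `local_compute` and `src_local` are names introduced here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: local run config - RunConfiguration targets 'local' by default\n",
"local_compute = RunConfiguration()\n",
"local_compute.environment.python.user_managed_dependencies = True  # reuse my local venv as-is\n",
"\n",
"src_local = ScriptRunConfig(source_directory=script_folder, script='train.py', run_config=local_compute)\n",
"# run = exp.submit(src_local)  # uncomment to actually submit the local smoke test"
]
},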
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO - Crashes when I try to use GPU - can't figure out error from logs. (CPU runs OK)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}