@pat-coady
Created August 19, 2019 12:48
Train Custom TF Model
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train Custom TensorFlow Model\n",
"\n",
"see:\n",
"https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.core import Workspace\n",
"from azureml.core.compute import ComputeTarget\n",
"from azureml.core.compute import AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"from azureml.core.runconfig import DEFAULT_CPU_IMAGE\n",
"from azureml.core.runconfig import DEFAULT_GPU_IMAGE\n",
"from azureml.core import Experiment\n",
"from azureml.core import ScriptRunConfig\n",
"import os "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Initial Setup\n",
"\n",
"Install CLI: `brew update && brew install azure-cli`\n",
"\n",
"First use: `az login` (opens browser window for ADI SSO)\n",
"\n",
"Create resource group: `az group create --name myResourceGroup --location eastus`\n",
"\n",
"Crease ML workspace: `az ml workspace create -w MyWorkspace -g MyResourceGroup`"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# grabbed config.json file from Azure web portal\n",
"# TODO - lookup method to use CLI or Python SDK method to get this file\n",
"\n",
"ws = Workspace.from_config(path=\"./.azureml/config.json\")"
]
},
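{
"cell_type": "markdown",
"metadata": {},
"source": [
"Possible answer to the TODO above (untested sketch): fetch the existing workspace with `Workspace.get()` and cache a `config.json` locally with `write_config()`, instead of downloading it from the portal. The subscription ID below is a placeholder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: look up the workspace created with the CLI and cache its config locally\n",
"# '<my-subscription-id>' is a placeholder - substitute your own subscription ID\n",
"ws = Workspace.get(name='MyWorkspace',\n",
"                   subscription_id='<my-subscription-id>',\n",
"                   resource_group='MyResourceGroup')\n",
"\n",
"# By default this writes .azureml/config.json under the current directory,\n",
"# which Workspace.from_config() can then pick up\n",
"ws.write_config()"
]
},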
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'name': 'Standard_DS1_v2', 'vCPUs': 1, 'memoryGB': 3.5}, {'name': 'Standard_DS2_v2', 'vCPUs': 2, 'memoryGB': 7.0}, {'name': 'Standard_DS3_v2', 'vCPUs': 4, 'memoryGB': 14.0}, {'name': 'Standard_DS4_v2', 'vCPUs': 8, 'memoryGB': 28.0}, {'name': 'Standard_DS5_v2', 'vCPUs': 16, 'memoryGB': 56.0}, {'name': 'Standard_DS11_v2', 'vCPUs': 2, 'memoryGB': 14.0}, {'name': 'Standard_DS12_v2', 'vCPUs': 4, 'memoryGB': 28.0}, {'name': 'Standard_DS13_v2', 'vCPUs': 8, 'memoryGB': 56.0}, {'name': 'Standard_DS14_v2', 'vCPUs': 16, 'memoryGB': 112.0}, {'name': 'Standard_DS15_v2', 'vCPUs': 20, 'memoryGB': 140.0}, {'name': 'Standard_D1_v2', 'vCPUs': 1, 'memoryGB': 3.5}, {'name': 'Standard_D2_v2', 'vCPUs': 2, 'memoryGB': 7.0}, {'name': 'Standard_D3_v2', 'vCPUs': 4, 'memoryGB': 14.0}, {'name': 'Standard_D4_v2', 'vCPUs': 8, 'memoryGB': 28.0}, {'name': 'Standard_D11_v2', 'vCPUs': 2, 'memoryGB': 14.0}, {'name': 'Standard_D12_v2', 'vCPUs': 4, 'memoryGB': 28.0}, {'name': 'Standard_D13_v2', 'vCPUs': 8, 'memoryGB': 56.0}, {'name': 'Standard_D14_v2', 'vCPUs': 16, 'memoryGB': 112.0}, {'name': 'Standard_D1', 'vCPUs': 1, 'memoryGB': 3.5}, {'name': 'Standard_D2', 'vCPUs': 2, 'memoryGB': 7.0}, {'name': 'Standard_D3', 'vCPUs': 4, 'memoryGB': 14.0}, {'name': 'Standard_D4', 'vCPUs': 8, 'memoryGB': 28.0}, {'name': 'Standard_D11', 'vCPUs': 2, 'memoryGB': 14.0}, {'name': 'Standard_D12', 'vCPUs': 4, 'memoryGB': 28.0}, {'name': 'Standard_D13', 'vCPUs': 8, 'memoryGB': 56.0}, {'name': 'Standard_D14', 'vCPUs': 16, 'memoryGB': 112.0}, {'name': 'Standard_NV6', 'vCPUs': 6, 'memoryGB': 56.0}, {'name': 'Standard_NV12', 'vCPUs': 12, 'memoryGB': 112.0}, {'name': 'Standard_NV24', 'vCPUs': 24, 'memoryGB': 224.0}, {'name': 'Standard_NC6s_v2', 'vCPUs': 6, 'memoryGB': 112.0}, {'name': 'Standard_NC12s_v2', 'vCPUs': 12, 'memoryGB': 224.0}, {'name': 'Standard_NC24rs_v2', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_NC24s_v2', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_F2s_v2', 'vCPUs': 2, 'memoryGB': 4.0}, {'name': 'Standard_F4s_v2', 'vCPUs': 4, 'memoryGB': 8.0}, {'name': 'Standard_F8s_v2', 'vCPUs': 8, 'memoryGB': 16.0}, {'name': 'Standard_F16s_v2', 'vCPUs': 16, 'memoryGB': 32.0}, {'name': 'Standard_F32s_v2', 'vCPUs': 32, 'memoryGB': 64.0}, {'name': 'Standard_F64s_v2', 'vCPUs': 64, 'memoryGB': 128.0}, {'name': 'Standard_F72s_v2', 'vCPUs': 72, 'memoryGB': 144.0}, {'name': 'Standard_NC6', 'vCPUs': 6, 'memoryGB': 56.0}, {'name': 'Standard_NC12', 'vCPUs': 12, 'memoryGB': 112.0}, {'name': 'Standard_NC24', 'vCPUs': 24, 'memoryGB': 224.0}, {'name': 'Standard_NC24r', 'vCPUs': 24, 'memoryGB': 224.0}, {'name': 'Standard_ND6s', 'vCPUs': 6, 'memoryGB': 112.0}, {'name': 'Standard_ND12s', 'vCPUs': 12, 'memoryGB': 224.0}, {'name': 'Standard_ND24rs', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_ND24s', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_NC6s_v3', 'vCPUs': 6, 'memoryGB': 112.0}, {'name': 'Standard_NC12s_v3', 'vCPUs': 12, 'memoryGB': 224.0}, {'name': 'Standard_NC24rs_v3', 'vCPUs': 24, 'memoryGB': 448.0}, {'name': 'Standard_NC24s_v3', 'vCPUs': 24, 'memoryGB': 448.0}]\n"
]
}
],
"source": [
"# List VM families for Azure Machine Learning Compute\n",
"# Likely you'll have to request quota increase to use GPUs\n",
"# TODO - Can Microsoft bump up quotas for everyone at once, instead of one-at-a-time?\n",
"\n",
"print(AmlCompute.supported_vmsizes(workspace=ws))"
]
},
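{
"cell_type": "markdown",
"metadata": {},
"source": [
"Small helper (plain post-processing of the output above): narrow the list down to the GPU families (NC/ND/NV series) before picking a `vm_size`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Filter supported_vmsizes() down to the GPU VM families (NC, ND, NV series)\n",
"vm_sizes = AmlCompute.supported_vmsizes(workspace=ws)\n",
"gpu_sizes = [vm for vm in vm_sizes\n",
"             if vm['name'].startswith(('Standard_NC', 'Standard_ND', 'Standard_NV'))]\n",
"for vm in gpu_sizes:\n",
"    print(vm['name'], '-', vm['vCPUs'], 'vCPUs,', vm['memoryGB'], 'GB RAM')"
]
},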
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"USE_GPU = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Option A: Run-based Compute\n",
"You can create Azure Machine Learning Compute as a compute target at run time. The compute is automatically created for your run. The compute is deleted automatically once the run completes. \n",
" \n",
"**Note from Azure docs:** *Run-based creation of Azure Machine Learning compute is currently in Preview. Don't use run-based creation if you use automated hyperparameter tuning or automated machine learning. To use hyperparameter tuning or automated machine learning, create a persistent compute target instead.*"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Create a new runconfig object\n",
"run_compute = RunConfiguration()\n",
"\n",
"# Signal that you want to use AmlCompute to execute the script\n",
"run_compute.target = \"amlcompute\"\n",
"\n",
"# AmlCompute is created in the same region as your workspace\n",
"# Set the VM size for AmlCompute from the list of supported_vmsizes\n",
"# GPU-base has CUDA and CUDNN pre-installed\n",
"# supports conda_packages also\n",
"if USE_GPU:\n",
" run_compute.amlcompute.vm_size = 'Standard_NC6s_v2'\n",
" run_compute.environment.docker.base_image = DEFAULT_GPU_IMAGE\n",
" run_compute.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=['tensorflow-gpu==2.0.0-beta1', 'tensorflow-datasets'])\n",
"else:\n",
" run_compute.amlcompute.vm_size = 'Standard_DS4_v2'\n",
" run_compute.environment.docker.base_image = DEFAULT_CPU_IMAGE\n",
" run_compute.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=['tensorflow==2.0.0-beta1', 'tensorflow-datasets'])\n",
"\n",
"# Enable Docker\n",
"run_compute.environment.docker.enabled = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Option B: Persistent Compute\n",
"A persistent Azure Machine Learning Compute can be reused across jobs. The compute can be shared with other users in the workspace and is kept between jobs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Choose a name for your cluster\n",
"cluster_name = \"my_cluster\"\n",
"\n",
"# Verify that cluster does not exist already\n",
"try:\n",
" cluster = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing cluster, use it.')\n",
"except ComputeTargetException:\n",
" if GPU:\n",
" vm_size = 'Standard_NC6s_v2'\n",
" else:\n",
" vm_size = 'Standard_DS4_v2'\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size,\n",
" min_nodes=1,\n",
" max_nodes=4,\n",
" admin_username='pcoady',\n",
" admin_user_password='abc123')\n",
" cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
"\n",
"cluster.wait_for_completion(show_output=True)\n",
"# TODO - What is best practice for authentication?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cluster.list_nodes()"
]
},
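{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two follow-ups on the persistent cluster (sketch): check its provisioning/node state with `get_status()`, and drop `min_nodes` to 0 with `update()` so idle nodes scale down instead of billing. The `serialize()` call on the status object is an assumption about its exact interface."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check the cluster's current provisioning and node state (sketch)\n",
"status = cluster.get_status()\n",
"print(status.serialize())  # assumption: the status object exposes serialize()\n",
"\n",
"# Allow the cluster to scale to zero nodes when idle, so it doesn't bill between jobs\n",
"cluster.update(min_nodes=0, max_nodes=4, idle_seconds_before_scaledown=1200)"
]
},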
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a new runconfig object\n",
"persist_compute = RunConfiguration()\n",
"\n",
"# Use the cpu_cluster you created above. \n",
"persist_compute.target = cluster\n",
"\n",
"# Enable Docker\n",
"persist_compute.environment.docker.enabled = True\n",
"\n",
"# GPU-base has CUDA and CUDNN pre-installed\n",
"# Supports conda_packages also\n",
"if USE_GPU:\n",
" persist_compute.environment.docker.base_image = DEFAULT_GPU_IMAGE\n",
" persist_compute.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=['tensorflow-gpu==2.0.0-beta1', 'tensorflow-datasets'])\n",
"else:\n",
" persist_compute.environment.docker.base_image = DEFAULT_CPU_IMAGE\n",
" persist_compute.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=['tensorflow==2.0.0-beta1', 'tensorflow-datasets'])\n",
"\n",
"# Use conda_dependencies.yml to create a conda environment in the Docker image for execution\n",
"persist_compute.environment.python.user_managed_dependencies = False\n",
"\n",
"# Auto-prepare the Docker image when used for execution (if it is not already prepared)\n",
"# TODO: What does this do? Delete?\n",
"persist_compute.auto_prepare_environment = True\n",
"\n",
"# Specify CondaDependencies obj, add necessary packages\n",
"persist_compute.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=['tensorflow==2.0.0-beta1', 'tensorflow-datasets'])\n",
"\n",
"# TODO - What does WARNING below mean?\n",
"# TODO - How do I \"see\" the Docker environment created above above?"
]
},
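{
"cell_type": "markdown",
"metadata": {},
"source": [
"Partial answer to the \"see the Docker environment\" TODO above (sketch): the base image name and the conda environment YAML derived from the `CondaDependencies` object can both be printed from the run config. The built Docker image itself lives in the workspace container registry (the `*.azurecr.io` registry visible in the build log further down)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect what will go into the Docker image for this run config:\n",
"# the base image plus the conda environment generated from CondaDependencies\n",
"print('Base Docker image:', persist_compute.environment.docker.base_image)\n",
"print(persist_compute.environment.python.conda_dependencies.serialize_to_string())"
]
},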
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Take cluster down\n",
"cluster.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an Experiment and Run"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"experiment_name = 'my_experiment'\n",
"exp = Experiment(workspace=ws, name=experiment_name)"
]
},
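{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sketch: list any previous runs of this experiment and their status with `Experiment.get_runs()`, which is handy when re-running the notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List previous runs of this experiment and their status\n",
"for prev_run in exp.get_runs():\n",
"    print(prev_run.id, prev_run.get_status())"
]
},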
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RunId: my_experiment_1566217437_c6c71141\n",
"Web View: https://mlworkspace.azure.ai/portal/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourceGroups/MyResourceGroup/providers/Microsoft.MachineLearningServices/workspaces/MyWorkspace/experiments/my_experiment/runs/my_experiment_1566217437_c6c71141\n",
"\n",
"Streaming azureml-logs/20_image_build_log.txt\n",
"=============================================\n",
"\n",
"2019/08/19 12:24:04 Downloading source code...\n",
"2019/08/19 12:24:05 Finished downloading source code\n",
"2019/08/19 12:24:06 Using acb_vol_10cd6c7f-bb53-453b-8f54-eac0665a5c8c as the home volume\n",
"2019/08/19 12:24:06 Creating Docker network: acb_default_network, driver: 'bridge'\n",
"2019/08/19 12:24:06 Successfully set up Docker network: acb_default_network\n",
"2019/08/19 12:24:06 Setting up Docker configuration...\n",
"2019/08/19 12:24:07 Successfully set up Docker configuration\n",
"2019/08/19 12:24:07 Logging in to registry: myworkspace0bc58e64.azurecr.io\n",
"2019/08/19 12:24:08 Successfully logged into myworkspace0bc58e64.azurecr.io\n",
"2019/08/19 12:24:08 Executing step ID: acb_step_0. Timeout(sec): 5400, Working directory: '', Network: 'acb_default_network'\n",
"2019/08/19 12:24:08 Scanning for dependencies...\n",
"2019/08/19 12:24:09 Successfully scanned dependencies\n",
"2019/08/19 12:24:09 Launching container with name: acb_step_0\n",
"Sending build context to Docker daemon 58.88kB\n",
"\n",
"Step 1/15 : FROM mcr.microsoft.com/azureml/base-gpu:intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04@sha256:72b4c4a403fe51e22875950cb0d740e751a4f6ff8ee1d8e7efc201d2aca3da54\n",
"sha256:72b4c4a403fe51e22875950cb0d740e751a4f6ff8ee1d8e7efc201d2aca3da54: Pulling from azureml/base-gpu\n",
"34667c7e4631: Pulling fs layer\n",
"d18d76a881a4: Pulling fs layer\n",
"119c7358fbfc: Pulling fs layer\n",
"2aaf13f3eff0: Pulling fs layer\n",
"28d5148dfcec: Pulling fs layer\n",
"454bc542fc4c: Pulling fs layer\n",
"369f77cbea49: Pulling fs layer\n",
"ac4ef821cc62: Pulling fs layer\n",
"9b9781a46f34: Pulling fs layer\n",
"ade089defcf2: Pulling fs layer\n",
"d019e3e4cfbf: Pulling fs layer\n",
"760466bbcaac: Pulling fs layer\n",
"248f2eda89bd: Pulling fs layer\n",
"0b3dd3eae0b6: Pulling fs layer\n",
"a7b7b3a17514: Pulling fs layer\n",
"3ee89eb8f0db: Pulling fs layer\n",
"d9710d2db7a8: Pulling fs layer\n",
"2aaf13f3eff0: Waiting\n",
"28d5148dfcec: Waiting\n",
"454bc542fc4c: Waiting\n",
"369f77cbea49: Waiting\n",
"760466bbcaac: Waiting\n",
"248f2eda89bd: Waiting\n",
"ac4ef821cc62: Waiting\n",
"0b3dd3eae0b6: Waiting\n",
"a7b7b3a17514: Waiting\n",
"3ee89eb8f0db: Waiting\n",
"d9710d2db7a8: Waiting\n",
"9b9781a46f34: Waiting\n",
"ade089defcf2: Waiting\n",
"d019e3e4cfbf: Waiting\n",
"d18d76a881a4: Verifying Checksum\n",
"d18d76a881a4: Download complete\n",
"119c7358fbfc: Verifying Checksum\n",
"119c7358fbfc: Download complete\n",
"2aaf13f3eff0: Download complete\n",
"\n",
"Execution Summary\n",
"=================\n",
"RunId: my_experiment_1566217437_c6c71141\n",
"Web View: https://mlworkspace.azure.ai/portal/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourceGroups/MyResourceGroup/providers/Microsoft.MachineLearningServices/workspaces/MyWorkspace/experiments/my_experiment/runs/my_experiment_1566217437_c6c71141\n"
]
},
{
"ename": "ActivityFailedException",
"evalue": "ActivityFailedException:\n\tMessage: Activity Failed:\n\"Detailed error not set on the Run. Please check the logs for details.\"\n\tInnerException None\n\tErrorResponse {\"error\": {\"message\": \"Activity Failed:\\n\\\"Detailed error not set on the Run. Please check the logs for details.\\\"\"}}",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mActivityFailedException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-8a6eb9a50fe1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0msrc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mScriptRunConfig\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_directory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscript_folder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'train.py'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrun_compute\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mrun\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mexp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msubmit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mrun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_completion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshow_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/venvs/aml/lib/python3.6/site-packages/azureml/core/run.py\u001b[0m in \u001b[0;36mwait_for_completion\u001b[0;34m(self, show_output, wait_post_processing, raise_on_error)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[0mfile_handle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0mwait_post_processing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mwait_post_processing\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 532\u001b[0;31m raise_on_error=raise_on_error)\n\u001b[0m\u001b[1;32m 533\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_details\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 534\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/venvs/aml/lib/python3.6/site-packages/azureml/core/run.py\u001b[0m in \u001b[0;36m_stream_run_output\u001b[0;34m(self, file_handle, wait_post_processing, raise_on_error)\u001b[0m\n\u001b[1;32m 769\u001b[0m \u001b[0mfile_handle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 770\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 771\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mActivityFailedException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_details\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 772\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 773\u001b[0m \u001b[0mfile_handle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mActivityFailedException\u001b[0m: ActivityFailedException:\n\tMessage: Activity Failed:\n\"Detailed error not set on the Run. Please check the logs for details.\"\n\tInnerException None\n\tErrorResponse {\"error\": {\"message\": \"Activity Failed:\\n\\\"Detailed error not set on the Run. Please check the logs for details.\\\"\"}}"
]
}
],
"source": [
"# run_config could also be persist_compute, or local_compute (no example for local_compute above)\n",
"script_folder = os.getcwd()\n",
"src = ScriptRunConfig(source_directory=script_folder, script='train.py', run_config=run_compute)\n",
"run = exp.submit(src)\n",
"run.wait_for_completion(show_output=True)"
]
},
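{
"cell_type": "markdown",
"metadata": {},
"source": [
"When a run fails like the one above, the driver and image-build logs usually contain the real error. A sketch for pulling them down with the Run API (`get_file_names()` / `download_file()`); the exact log file names are whatever the run actually produced."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List the files the failed run produced, then download the azureml-logs/ files\n",
"# locally so the full stack trace can be read (the Web View link above works too)\n",
"log_dir = 'failed_run_logs'\n",
"os.makedirs(log_dir, exist_ok=True)\n",
"for name in run.get_file_names():\n",
"    print(name)\n",
"    if name.startswith('azureml-logs/'):\n",
"        run.download_file(name, output_file_path=os.path.join(log_dir, name.split('/')[-1]))"
]
},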
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO - What is best way to dry-run a training script that needs a GPU (which my local machine doesn't have)?\n",
"# Docker bring-up and tear-down too slow for this purpose."
]
},
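{
"cell_type": "markdown",
"metadata": {},
"source": [
"One partial option for the dry-run question above (sketch): a local run config that executes `train.py` in the current Python environment on this machine. It skips the Docker bring-up, but it won't exercise the GPU code path. `local_compute` and `src_local` are names introduced here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: local run config - RunConfiguration targets 'local' by default\n",
"local_compute = RunConfiguration()\n",
"local_compute.environment.python.user_managed_dependencies = True  # reuse my local venv as-is\n",
"\n",
"src_local = ScriptRunConfig(source_directory=script_folder, script='train.py', run_config=local_compute)\n",
"# run = exp.submit(src_local)  # uncomment to actually submit the local smoke test"
]
},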
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO - Crashes when I try to use GPU - can't figure out error from logs. (CPU runs OK)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}