This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def generate_zbh1_dag(workers, num_microbatches): | |
num_workers = len(workers) | |
num_lead_microbatches = num_workers | |
with InputNode() as inp: | |
fwd_queues = [[] for _ in range(num_workers)] | |
bwd_queues = [[] for _ in range(num_workers)] | |
# Once a worker's counter reaches 0, it cannot execute another fwd until it | |
# executes a bwd first. | |
fwd_counter = [num_lead_microbatches - i for i in range(num_workers)] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ray | |
import ray.cluster_utils | |
from ray.experimental.channel.torch_tensor_type import TorchTensorType | |
from ray.dag import InputNode, MultiOutputNode | |
from typing import Optional | |
from ray.dag.compiled_dag_node import CompiledDAG | |
from argparse import ArgumentError, ArgumentParser |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Traceback (most recent call last): | |
File "/home/ray/default/skeleton_zb_h1.py", line 106, in <module> | |
ray.get(dag.execute(1)) | |
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper | |
return fn(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper | |
return func(*args, **kwargs) | |
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/_private/worker.py", line 2648, in get | |
return object_refs.get(timeout=timeout) | |
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/experimental/compiled_dag_ref.py", line 90, in get |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import torch | |
import torch.distributed as dist | |
import torch.nn as nn | |
import torch.nn.functional as F | |
from log_utils import rank_log, get_logger, verify_min_gpu_count | |
# ---- GPU check ------------ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ray | |
from ray.air.util.torch_dist import _init_torch_distributed | |
from ray.air._internal.util import find_free_port | |
from ray.dag.input_node import InputNode | |
from ray.dag.output_node import MultiOutputNode | |
from ray.experimental.channel.torch_tensor_type import TorchTensorType | |
import os | |
import torch | |
import torch.nn as nn | |
from torch.nn import functional as F |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ray | |
import torch | |
from ray.dag.input_node import InputNode | |
from ray.dag.output_node import MultiOutputNode | |
from ray.experimental.channel.torch_tensor_type import TorchTensorType | |
@ray.remote(num_gpus=1) | |
class MyActor: | |
def __init__(self): | |
pass |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ray | |
from ray.air.util.torch_dist import _init_torch_distributed | |
from ray.air._internal.util import find_free_port | |
from ray.dag.input_node import InputNode | |
from ray.dag.output_node import MultiOutputNode | |
from ray.experimental.channel.torch_tensor_type import TorchTensorType | |
import os | |
import torch | |
import torch.nn as nn | |
from torch.nn import functional as F |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ray | |
import torch | |
from ray.experimental.channel.torch_tensor_type import TorchTensorType | |
# shape = (4, 8192) | |
shape = (4, 24576) | |
@ray.remote(num_gpus=1) | |
class MyActor: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
import argparse | |
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
import torch.optim as optim | |
from torchvision import datasets, transforms | |
from torch.optim.lr_scheduler import StepLR | |
from ray.train.torch import TorchTrainer | |
from ray.train import ScalingConfig, RunConfig |
NewerOlder