Yunxuan Xiao woshiyyya
The actor died unexpectedly before finishing this task.
class_name: RayTrainWorker
actor_id: 9e6790a209b7c509e64301f305000000
pid: 35979
namespace: f205d617-4ee1-4fae-a76a-c3f2382b7527
ip: 172.24.101.245
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None. Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task
@woshiyyya
woshiyyya / run.py
Created April 9, 2024 18:14
Test Async Actor DDP
from collections import defaultdict
import os
import time

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

import ray
from ray.train._internal.utils import get_address_and_port

ray.init()
node_resources = {}
for node in ray.nodes():
    print(node, "\n")
    node_resources[node["NodeID"]] = node["Resources"]
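The preview cuts off after the cluster inspection loop. As a rough sketch of what a "Test Async Actor DDP" script could look like given only the imports above, the snippet below spawns Ray async actors that join one process group through get_address_and_port; the actor class, method names, gloo backend, and two-worker, single-node setup are assumptions, not the gist's actual code.

# Hypothetical sketch only; details are guesses based on the imports above.
import os

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

import ray
from ray.train._internal.utils import get_address_and_port


@ray.remote
class AsyncDDPWorker:
    # "async def" methods make this an async actor in Ray.
    async def setup(self, rank: int, world_size: int, addr: str, port: int):
        os.environ["MASTER_ADDR"] = addr
        os.environ["MASTER_PORT"] = str(port)
        dist.init_process_group("gloo", rank=rank, world_size=world_size)
        self.model = DDP(nn.Linear(4, 1))

    async def train_step(self):
        loss = self.model(torch.randn(8, 4)).sum()
        loss.backward()  # DDP all-reduces gradients across workers here
        return loss.item()


ray.init()
addr, port = get_address_and_port()  # assumes a single-node cluster for the rendezvous
world_size = 2
workers = [AsyncDDPWorker.remote() for _ in range(world_size)]
ray.get([w.setup.remote(i, world_size, addr, port) for i, w in enumerate(workers)])
print(ray.get([w.train_step.remote() for w in workers]))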
#!/usr/bin/env python3
# pylint: skip-file
import os
import torch
from torch import distributed as dist
from torchvision.models import resnet18
from torchvision.datasets import FashionMNIST
from torchvision.transforms import ToTensor, Normalize, Compose
from torch.utils.data import DataLoader, DistributedSampler
import ray
import ray.train
import numpy as np
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig
from dataclasses import dataclass
@dataclass
class DummyDataclass:
    ...
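This preview is truncated at the dataclass stub. Based only on the imports shown, a minimal Ray Train loop for resnet18 on FashionMNIST might look like the sketch below; the hyperparameters, the 1-channel conv1 patch, and the worker count are assumptions rather than the gist's actual code.

# Hedged sketch: not the gist's actual training loop.
import torch
from torch.utils.data import DataLoader
from torchvision.datasets import FashionMNIST
from torchvision.models import resnet18
from torchvision.transforms import Compose, Normalize, ToTensor

import ray.train
import ray.train.torch
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer


def train_func(config):
    transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])
    dataset = FashionMNIST("/tmp/data", train=True, download=True, transform=transform)
    # prepare_data_loader adds a DistributedSampler and moves batches to the right device.
    loader = ray.train.torch.prepare_data_loader(
        DataLoader(dataset, batch_size=128, shuffle=True)
    )
    model = resnet18(num_classes=10)
    # FashionMNIST images are grayscale; swap the stock 3-channel stem (assumption).
    model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
    model = ray.train.torch.prepare_model(model)  # wraps the model in DDP
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    loss_fn = torch.nn.CrossEntropyLoss()
    for epoch in range(config.get("epochs", 1)):
        for images, labels in loader:
            optimizer.zero_grad()
            loss = loss_fn(model(images), labels)
            loss.backward()
            optimizer.step()
        ray.train.report({"epoch": epoch, "loss": loss.item()})


trainer = TorchTrainer(
    train_func,
    train_loop_config={"epochs": 1},
    scaling_config=ScalingConfig(num_workers=2),
)
trainer.fit()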
import os
import time
import torch
from torch import nn
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer
from ray.train.torch.xla import TorchXLAConfig
from torchvision.datasets import mnist
"""
Cluster: 16 x A10G GPUs
Command: python precompute_latents.py --subset_size 50 --mode debug
"""
import argparse
import io
import pandas as pd
import pyarrow.dataset as pds
import os
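The docstring above documents the intended invocation. A minimal argument parser matching that command line might look like the following sketch; the flag types, choices, and defaults are guesses, not the script's real CLI.

# Hypothetical sketch of the CLI described in the docstring above.
import argparse

parser = argparse.ArgumentParser(description="Precompute latents on a Ray cluster.")
parser.add_argument("--subset_size", type=int, default=50,
                    help="Number of samples to process (e.g. 50 in debug runs).")
parser.add_argument("--mode", choices=["debug", "full"], default="debug",
                    help="Run mode; the docstring shows --mode debug.")
args = parser.parse_args()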
# This script was tested with PR https://github.com/ray-project/ray/pull/39130 from the AWS team.
# It configures the required environment variables for Neuron XLA.
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_backend # noqa: F401
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer, prepare_model
from ray.train.torch.xla import TorchXLAConfig
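Based on these imports, the training entry point presumably wires a torch_xla loop into TorchTrainer with TorchXLAConfig. A rough sketch follows; the model, worker count, and the neuron_cores resource request are assumptions, not the gist's actual script.

# Hedged sketch: not the gist's actual Neuron XLA script.
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_backend  # noqa: F401

from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer
from ray.train.torch.xla import TorchXLAConfig


def train_func():
    device = xm.xla_device()  # one XLA device (e.g. a NeuronCore) per worker
    model = nn.Linear(16, 1).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    for _ in range(10):
        x = torch.randn(32, 16).to(device)
        loss = model(x).pow(2).mean()
        optimizer.zero_grad()
        loss.backward()
        xm.optimizer_step(optimizer)  # all-reduces grads and marks the XLA step


trainer = TorchTrainer(
    train_func,
    torch_config=TorchXLAConfig(),
    scaling_config=ScalingConfig(num_workers=2, resources_per_worker={"neuron_cores": 1}),
)
trainer.fit()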
@woshiyyya
woshiyyya / torch_ddp.py
Last active October 23, 2023 21:16
Torch_DDP_Example
import os
import tempfile
import torch
from torch import nn
from torch.nn.parallel import DistributedDataParallel
import ray
from ray.train import Checkpoint, CheckpointConfig, RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer
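The Torch_DDP_Example preview stops at the imports. Continuing from them, a minimal train function that reports checkpoints through Ray Train could look like this sketch; the model, synthetic data, and checkpoint cadence are assumptions rather than the gist's full example.

# Hedged sketch built only from the imports above.
import os
import tempfile

import torch
from torch import nn

import ray.train
import ray.train.torch
from ray.train import Checkpoint, CheckpointConfig, RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer


def train_func(config):
    model = ray.train.torch.prepare_model(nn.Linear(8, 1))  # wraps in DistributedDataParallel
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    for epoch in range(config.get("epochs", 3)):
        inputs, targets = torch.randn(32, 8), torch.randn(32, 1)
        loss = nn.functional.mse_loss(model(inputs), targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Report metrics plus a checkpoint each epoch.
        with tempfile.TemporaryDirectory() as tmpdir:
            torch.save(model.state_dict(), os.path.join(tmpdir, "model.pt"))
            ray.train.report(
                {"epoch": epoch, "loss": loss.item()},
                checkpoint=Checkpoint.from_directory(tmpdir),
            )


trainer = TorchTrainer(
    train_func,
    train_loop_config={"epochs": 3},
    scaling_config=ScalingConfig(num_workers=2),
    run_config=RunConfig(checkpoint_config=CheckpointConfig(num_to_keep=2)),
)
result = trainer.fit()
print(result.metrics)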