Skip to content

Instantly share code, notes, and snippets.

View woshiyyya's full-sized avatar
zzz

Yunxuan Xiao woshiyyya

zzz
View GitHub Profile
View ray_neuron_training.py
import os
import time
import torch
from torch import nn
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer
from ray.train.torch.xla import TorchXLAConfig
from torchvision.datasets import mnist
View precompute_latents.py
"""
Cluster: 16 x A10G GPUs
Command: python precompute_latents.py --subset_size 50 --mode debug
"""
import argparse
import io
import pandas as pd
import pyarrow.dataset as pds
import os
View trainium-2node.py
# This script was tested with the PR (https://github.com/ray-project/ray/pull/39130) from the AWS team.
# It configures the required environment variables for Neuron XLA.
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_backend # noqa: F401
View ray_trainium_ddp.py
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_backend # noqa: F401
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer, prepare_model
from ray.train.torch.xla import TorchXLAConfig
@woshiyyya
woshiyyya / torch_ddp.py
Last active October 23, 2023 21:16
Torch_DDP_Example
View torch_ddp.py
import os
import tempfile
import torch
from torch import nn
from torch.nn.parallel import DistributedDataParallel
import ray
from ray.train import Checkpoint, CheckpointConfig, RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer
View requirements-repro-262.txt
accelerate==0.19.0
adal==1.2.7
aiofiles==22.1.0
aiohttp==3.8.5
aiohttp-cors==0.7.0
aiorwlock==1.3.0
aiosignal==1.3.1
aiosqlite==0.19.0
alabaster==0.7.13
anyio==3.7.1
View requirements-repro-nightly.txt
accelerate==0.19.0
adal==1.2.7
aiofiles==22.1.0
aiohttp==3.8.5
aiohttp-cors==0.7.0
aiorwlock==1.3.0
aiosignal==1.3.1
aiosqlite==0.19.0
alabaster==0.7.13
anyio==3.7.1
View requirements-release-test.txt
about-time==4.2.1
absl-py==1.4.0
accelerate==0.19.0
adal==1.2.7
aim==3.17.5
aim-ui==3.17.5
aimrecords==0.0.7
aimrocks==0.4.0
aioboto3==11.2.0
aiobotocore==2.5.0
View deepspeed_torch_trainer.py
# Minimal Example adapted from https://huggingface.co/docs/transformers/training
import deepspeed
import evaluate
import torch
from datasets import load_dataset
from deepspeed.accelerator import get_accelerator
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
AutoModelForSequenceClassification,
View accelerate_torch_trainer.py
import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
get_linear_schedule_with_warmup,
set_seed,