-
-
Save EnisBerk/f5a71482f14b4a127a53e3aab1fceff3 to your computer and use it in GitHub Desktop.
Test of the PyTorch distributed package on Gradient: script to run on the slave node.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import torch | |
import torch.distributed as dist | |
from torch.multiprocessing import Process | |
import argparse | |
import socket | |
import time | |
import requests | |
def run(rank, size):
    """Sum-reduce a tensor of ones across the default process group.

    Creates a 10-element tensor of ones, moves it to GPU 0 when CUDA is
    available (it stays on the CPU otherwise), all-reduces it in place with
    SUM and prints the first element along with a few diagnostics.

    The default process group must already be initialized when this is
    called, because ``dist.all_reduce`` is a collective operation.

    Args:
        rank: Rank of the calling process; only used in the final log line.
        size: Number of processes; unused here, accepted so the function fits
            the ``fn(rank, size)`` callback shape used by ``init_processes``.
    """
    gpu_id = 0
    mn = torch.ones(10)
    if torch.cuda.is_available():
        print("cuda:{}".format(gpu_id))
        device = torch.device("cuda:{}".format(gpu_id))
    else:
        device = "cpu"
    mn = mn.to(device)
    print(device)
    print(mn.is_cuda)
    print(mn.is_sparse)
    # ``dist.reduce_op`` is a deprecated alias; ``dist.ReduceOp`` is the
    # supported enum for selecting the reduction.
    dist.all_reduce(mn, op=dist.ReduceOp.SUM)
    # Bring the result back to host memory before printing it.
    mn = mn.to("cpu")
    print(mn[0])
    print("done", rank)
def init_processes(rank, size, fn, master_ip=None, backend="nccl"):
    """Initialize the distributed environment, then call ``fn(rank, size)``.

    Two rendezvous modes are supported:

    * ``master_ip`` given: the ``MASTER_ADDR``, ``MASTER_PORT`` (fixed to
      29500), ``WORLD_SIZE`` and ``RANK`` environment variables are set and
      the process group is created with the default init method, which
      reads those variables.
    * ``master_ip`` is ``None``: the process group is created through the
      ``file:///storage`` init method.

    Args:
        rank: Rank of this process within the group.
        size: Total number of processes in the group (world size).
        fn: Callable invoked as ``fn(rank, size)`` once the group is ready.
        master_ip: Address of the rank-0 host, or ``None`` to use the shared
            file rendezvous.
        backend: Name of the ``torch.distributed`` backend to use.
    """
    print("started init at{}".format(rank))
    if master_ip is not None:
        print("using ip initilisation")
        os.environ['MASTER_ADDR'] = master_ip
        os.environ['MASTER_PORT'] = '29500'
        os.environ['WORLD_SIZE'] = str(size)
        os.environ['RANK'] = str(rank)
        dist.init_process_group(backend)
    else:
        print("using storage initilisation")
        # NOTE(review): the file:// init method expects the path of a shared
        # *file*; '/storage' looks like a directory here - confirm, and if so
        # change it together with the master-side script.
        dist.init_process_group(
            backend,
            init_method='file:///storage',
            rank=rank,
            world_size=size,
        )
    print("end init at{}".format(rank))
    fn(rank, size)
    print("fn called at{}".format(rank))
def main():
    """Wait for the master's IP file, then join the process group as rank 1.

    Polls ``/storage/masterip_tmp12.txt`` every 10 seconds until it exists
    and its first line holds a non-empty address, then starts the
    distributed test with a world size of 2 and this process as rank 1.
    """
    file_path = '/storage/masterip_tmp12.txt'
    master_ip = ""
    while True:
        if os.path.exists(file_path):
            with open(file_path, "r") as masterip_tmp:
                master_ip = masterip_tmp.readline().strip()
        # The file may already exist while still being empty (created but
        # not yet written); keep polling instead of handing an empty address
        # to the rendezvous.
        if master_ip:
            break
        time.sleep(10)
        print("waiting for master")
    print("My master's IP", master_ip)
    size = 2
    rank = 1
    init_processes(rank, size, run, master_ip)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment