-
-
Save EnisBerk/1c1c10df476acd60cb28bffeab5bc2f7 to your computer and use it in GitHub Desktop.
Test of the PyTorch distributed package on Paperspace Gradient; this script runs on the master node.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import torch | |
import torch.distributed as dist | |
from torch.multiprocessing import Process | |
import argparse | |
import socket | |
import time | |
import requests | |
def run(rank, size):
    """Sum a ten-element tensor of ones across all ranks and print the result.

    The default process group must already be initialized (see
    ``dist.init_process_group``) before this function is called.

    Args:
        rank: Rank of this process in the group. Also used to choose the
            CUDA device when a GPU is available.
        size: Number of processes in the group. Unused here; kept so the
            function can be called as ``fn(rank, size)``.
    """
    mn = torch.ones(10)
    if torch.cuda.is_available():
        # The rank is global across machines, so wrap it by the local GPU
        # count; otherwise a rank >= device_count() names a GPU that does
        # not exist on this machine.
        device_name = "cuda:{}".format(rank % torch.cuda.device_count())
        print(device_name)
        device = torch.device(device_name)
    else:
        device = "cpu"
    print(device)

    mn = mn.to(device)
    print(mn.is_cuda)
    print(mn.is_sparse)

    # dist.ReduceOp is the supported spelling; dist.reduce_op is deprecated.
    dist.all_reduce(mn, op=dist.ReduceOp.SUM)

    # Bring the reduced tensor back to the host before printing an element.
    mn = mn.to("cpu")
    print(mn[0])
    print("done", rank)
def init_processes(rank, size, fn, master_ip=None, backend="nccl",
                   master_port="29500"):
    """Initialize the distributed environment, then call ``fn(rank, size)``.

    Args:
        rank: Rank of this process within the group.
        size: Total number of processes in the group (world size).
        fn: Callable invoked as ``fn(rank, size)`` after initialization.
        master_ip: Address of the master machine. When given, rendezvous is
            configured through the MASTER_ADDR / MASTER_PORT / WORLD_SIZE /
            RANK environment variables. When ``None``, the
            ``file:///storage`` init method is used instead.
        backend: Backend name passed to ``dist.init_process_group``.
        master_port: Port exported as MASTER_PORT. Only used together with
            ``master_ip``; defaults to the previously hard-coded 29500.
    """
    print("started init at{}".format(rank))
    # we need those so process can talk to each other including over a network
    if master_ip is not None:
        print("using ip initilisation")
        os.environ['MASTER_ADDR'] = master_ip
        os.environ['MASTER_PORT'] = str(master_port)
        os.environ['WORLD_SIZE'] = str(size)
        os.environ['RANK'] = str(rank)
        # No init_method given: the settings are read from the environment
        # variables exported just above.
        dist.init_process_group(backend)
    else:
        print("using storage initilisation")
        # NOTE(review): the file:// init method expects the path of a file,
        # while /storage looks like a directory here -- confirm this path
        # against the worker script before relying on this branch.
        dist.init_process_group(
            backend,
            init_method='file:///storage',
            rank=rank,
            world_size=size,
        )
    print("end init at{}".format(rank))
    fn(rank, size)
    print("fn called at{}".format(rank))
def main():
    """Publish this machine's private IP to shared storage and start rank 0.

    Queries the Paperspace metadata endpoint for the current machine, writes
    its ``privateIpAddress`` to ``/storage/masterip_tmp12.txt``, waits five
    seconds, and then runs ``run`` as rank 0 of a two-process group.

    Raises:
        requests.RequestException: If the metadata request fails, times out,
            or returns an HTTP error status.
        RuntimeError: If the metadata response has no ``privateIpAddress``.
    """
    file_path = '/storage/masterip_tmp12.txt'

    # Get the IP address of the current machine. A timeout keeps the script
    # from hanging forever if the metadata service does not answer.
    response = requests.get(
        'https://metadata.paperspace.com/meta-data/machine',
        timeout=10,
    )
    response.raise_for_status()
    master_ip = response.json().get("privateIpAddress")
    # master_ip=response.get("publicIpAddress",None)
    print("Master IP", master_ip)
    if master_ip is None:
        # Fail with a clear message instead of the TypeError that
        # file.write(None) would raise below.
        raise RuntimeError(
            "metadata response does not contain 'privateIpAddress'"
        )

    # The file is only written, never read back here, so plain "w" suffices.
    with open(file_path, "w") as masterip_tmp:
        masterip_tmp.write(master_ip)

    # Presumably gives the other processes time to pick up the address file
    # before rendezvous starts -- TODO confirm against the worker script.
    time.sleep(5)

    size = 2
    rank = 0
    init_processes(rank, size, run, master_ip)
# Script entry point: start the master only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment