Skip to content

Instantly share code, notes, and snippets.

@EnisBerk
Last active November 30, 2018 22:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EnisBerk/1c1c10df476acd60cb28bffeab5bc2f7 to your computer and use it in GitHub Desktop.
Save EnisBerk/1c1c10df476acd60cb28bffeab5bc2f7 to your computer and use it in GitHub Desktop.
Test for using the PyTorch distributed package on Gradient; this is the script to run on the master node.
import os
import torch
import torch.distributed as dist
from torch.multiprocessing import Process
import argparse
import socket
import time
import requests
def run(rank, size):
    """Sum a vector of ones across all ranks and print the result.

    Builds a 10-element tensor of ones, moves it to this rank's device,
    all-reduces it with SUM over the default process group, and prints the
    first element of the reduced tensor. Because every rank contributes ones,
    that element equals the number of participating ranks.

    The default process group must already be initialized before this is
    called.

    Args:
        rank: Rank of this process; also used to pick the CUDA device.
        size: World size. Not read by the body; kept so the function matches
            the ``fn(rank, size)`` callback signature.
    """
    mn = torch.ones(10)
    if torch.cuda.is_available():
        # Wrap the rank onto the GPUs visible on this machine so a rank that
        # is >= the local GPU count (e.g. rank 1 on a single-GPU worker) does
        # not request a device that does not exist. Ranks below the local GPU
        # count map to the same device as before.
        index = rank % max(torch.cuda.device_count(), 1)
        print("cuda:{}".format(index))
        device = torch.device("cuda:{}".format(index))
    else:
        device = torch.device("cpu")
    print(device)

    mn = mn.to(device)
    print(mn.is_cuda)
    print(mn.is_sparse)

    # ``dist.reduce_op`` is the deprecated spelling; ``ReduceOp`` is the
    # supported enum.
    dist.all_reduce(mn, op=dist.ReduceOp.SUM)

    # Bring the result back to host memory before printing it.
    mn = mn.to("cpu")
    print(mn[0])
    print("done", rank)
def init_processes(rank, size, fn, master_ip=None, backend="nccl",
                   master_port="29500"):
    """Initialize the distributed environment, then call ``fn(rank, size)``.

    Two rendezvous modes are supported:

    * ``master_ip`` given: the master address/port, world size and rank are
      exported as environment variables and the process group is created
      with the ``env://`` init method.
    * ``master_ip`` is ``None``: the process group is created with the
      ``file:///storage`` init method.

    Args:
        rank: Rank of this process.
        size: Total number of processes (world size).
        fn: Callable invoked as ``fn(rank, size)`` once the group is ready.
        master_ip: Address of the rank-0 machine, or ``None`` to use the
            file-based rendezvous.
        backend: Name of the ``torch.distributed`` backend to use.
        master_port: Port exported as ``MASTER_PORT`` for the ``env://``
            rendezvous. Defaults to the previously hard-coded ``29500``.
    """
    print("started init at{}".format(rank))
    if master_ip is not None:
        print("using ip initilisation")
        # The env:// init method reads these variables so that processes can
        # find each other, including over a network.
        os.environ['MASTER_ADDR'] = master_ip
        os.environ['MASTER_PORT'] = str(master_port)
        os.environ['WORLD_SIZE'] = str(size)
        os.environ['RANK'] = str(rank)
        dist.init_process_group(backend, init_method="env://")
    else:
        print("using storage initilisation")
        # NOTE(review): the file:// init method expects the path of a regular
        # file on a shared filesystem; '/storage' looks like a directory.
        # Left unchanged because every participating process must use the
        # same path -- confirm against the worker script before changing.
        dist.init_process_group(
            backend,
            init_method='file:///storage',
            rank=rank,
            world_size=size,
        )
    print("end init at{}".format(rank))
    fn(rank, size)
    print("fn called at{}".format(rank))
def main():
    """Run the master (rank 0) side of the two-process all-reduce test.

    Looks up this machine's private IP from the Paperspace metadata service,
    writes it to a file under ``/storage``, waits a few seconds, and then
    joins the process group as rank 0 of 2 and executes ``run``.

    Raises:
        requests.RequestException: If the metadata request fails, times out,
            or returns an HTTP error status.
        RuntimeError: If the metadata response has no ``privateIpAddress``.
    """
    file_path = '/storage/masterip_tmp12.txt'

    # Get the IP address of the current machine. The timeout keeps the script
    # from hanging forever if the metadata service is unreachable.
    response = requests.get(
        'https://metadata.paperspace.com/meta-data/machine',
        timeout=10,
    )
    response.raise_for_status()
    master_ip = response.json().get("privateIpAddress", None)
    print("Master IP", master_ip)
    if not master_ip:
        # Fail with a clear message instead of a TypeError from write(None).
        raise RuntimeError(
            "metadata response did not contain 'privateIpAddress'"
        )

    # Write the address out and close the file *before* sleeping, so the
    # content is flushed to disk while we wait.
    with open(file_path, "w") as masterip_tmp:
        masterip_tmp.write(master_ip)

    # NOTE(review): presumably gives the other process time to read the file
    # before rendezvous starts -- confirm against the worker script.
    time.sleep(5)

    size = 2
    rank = 0
    init_processes(rank, size, run, master_ip)
# Script entry point: only start the master process when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment