
Xinfeng GD06

  • UC Santa Barbara
  • https://seal.ece.ucsb.edu/location
import argparse
from datetime import datetime
import numpy as np
import os
import logging
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
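
The preview above stops at the imports. As a rough, hedged sketch of how these pieces (torch.distributed, torch.multiprocessing, and DDP) typically fit together, and not this gist's actual training code, a minimal CPU-only setup using the gloo backend might look like this; the toy model, tensor sizes, and master address/port are placeholder assumptions.

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

def worker(rank, world_size):
    # Placeholder rendezvous settings; gloo avoids assuming NCCL/GPUs are available.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    model = DDP(torch.nn.Linear(16, 1))  # DDP all-reduces gradients across ranks
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    for _ in range(10):
        loss = model(torch.randn(8, 16)).sum()
        opt.zero_grad()
        loss.backward()
        opt.step()
    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = 2  # placeholder process count
    mp.spawn(worker, args=(world_size,), nprocs=world_size)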
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <memory.h>
#include <string.h>
#include <math.h>
void DataInit(float* ptr, int length)
{
    srand(7);
    /* The preview is truncated here; a plausible completion: fill the buffer with pseudo-random values. */
    for (int i = 0; i < length; ++i) {
        ptr[i] = (float)rand() / (float)RAND_MAX;
    }
}
#include <stdlib.h>
#include <stdio.h>
#include <mkldnn.h>
#include <sys/time.h>
#include <memory.h>
/* Error-checking macro for mkldnn calls (the truncated tail of the preview is completed here). */
#define CHECK(f) do { \
    mkldnn_status_t s = f; \
    if (s != mkldnn_success) { \
        printf("[%s:%d] error: %s returned %d\n", __FILE__, __LINE__, #f, s); \
        exit(2); \
    } \
} while (0)
@GD06
GD06 / Makefile
Last active August 10, 2017 04:48
Update the Makefile to use only the GNU thread library
MKLROOT=/home/security/intel/mkl
MKLDNNROOT=/home/security/.local
COMMON_FLAGS=-O4 -std=c++11
all: main.o mkldnn_conv.o im2col_mkl.o
	g++ $(COMMON_FLAGS) -o main $^ \
	-L ${MKLDNNROOT}/lib -lmkldnn -lmklml_intel \
	-Wl,--start-group \
	${MKLROOT}/lib/intel64/libmkl_intel_lp64.a \
	${MKLROOT}/lib/intel64/libmkl_gnu_thread.a \
@GD06
GD06 / nv-topo-matrix.txt
Created June 15, 2017 07:10
The GPU interconnect topology of a machine equipped with eight Titan X GPUs
        GPU0   GPU1   GPU2   GPU3   GPU4   GPU5   GPU6   GPU7   mlx4_0  CPU Affinity
GPU0    X      PIX    PHB    PHB    SOC    SOC    SOC    SOC    SOC     0-9,20-29
GPU1    PIX    X      PHB    PHB    SOC    SOC    SOC    SOC    SOC     0-9,20-29
GPU2    PHB    PHB    X      PIX    SOC    SOC    SOC    SOC    SOC     0-9,20-29
GPU3    PHB    PHB    PIX    X      SOC    SOC    SOC    SOC    SOC     0-9,20-29
GPU4    SOC    SOC    SOC    SOC    X      PIX    PHB    PHB    PHB     10-19,30-39
GPU5    SOC    SOC    SOC    SOC    PIX    X      PHB    PHB    PHB     10-19,30-39
GPU6    SOC    SOC    SOC    SOC    PHB    PHB    X      PIX    PHB     10-19,30-39
GPU7    SOC    SOC    SOC    SOC    PHB    PHB    PIX    X      PHB     10-19,30-39
mlx4_0  SOC    SOC    SOC    SOC    PHB    PHB    PHB    PHB    X
@GD06
GD06 / tf_multiGPU.py
Created June 15, 2017 06:11
A Python script to test the scalability of TensorFlow on a single machine equipped with multiple GPUs.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
import os
import argparse
import pwd
import re
import csv
import numpy as np
import pickle
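
Only the imports survive in this preview. As a hedged illustration of the kind of single-machine multi-GPU scaling measurement the description refers to, and not the gist's actual script, a minimal TensorFlow 1.x sketch could look like the following; the matmul workload, matrix size, iteration count, and GPU counts are placeholder assumptions.

import time
import tensorflow as tf

def scaling_benchmark(num_gpus, dim=4096, iters=50):
    # One independent matmul tower per GPU; throughput should scale with num_gpus.
    tf.reset_default_graph()
    towers = []
    for i in range(num_gpus):
        with tf.device('/gpu:%d' % i):
            a = tf.random_normal([dim, dim])
            b = tf.random_normal([dim, dim])
            towers.append(tf.reduce_sum(tf.matmul(a, b)))
    total = tf.add_n(towers)
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(total)  # warm-up
        start = time.time()
        for _ in range(iters):
            sess.run(total)
        return iters / (time.time() - start)  # steps per second

for n in (1, 2, 4, 8):
    print('%d GPU(s): %.2f steps/s' % (n, scaling_benchmark(n)))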
@GD06
GD06 / training.py
Created May 3, 2017 10:03
Worker 1 exits after producing its first output
'''
Distributed TensorFlow 0.8.0 example of using data parallelism and shared model parameters.
Trains a simple sigmoid neural network on MNIST for 20 epochs on three machines using one parameter server.
Replace the hardcoded host URLs below with your own hosts.
Run like this:
pc-01$ python example.py --job_name="ps" --task_index=0
pc-02$ python example.py --job_name="worker" --task_index=0
pc-03$ python example.py --job_name="worker" --task_index=1
@GD06
GD06 / training.py
Last active July 25, 2017 11:44
Distributed training script for the workers and the parameter server
'''
Distributed TensorFlow 0.8.0 example of using data parallelism and shared model parameters.
Trains a simple sigmoid neural network on MNIST for 20 epochs on three machines using one parameter server.
Replace the hardcoded host URLs below with your own hosts.
Run like this:
pc-01$ python example.py --job_name="ps" --task_index=0
pc-02$ python example.py --job_name="worker" --task_index=0
pc-03$ python example.py --job_name="worker" --task_index=1
'''
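
The launch commands above only show the flags. As a hedged sketch of the TensorFlow 0.8-style cluster boilerplate that such flags usually drive, rather than necessarily this gist's exact code, the parameter-server/worker setup could look like the following; the host addresses and model variables are placeholder assumptions.

import argparse
import tensorflow as tf

parser = argparse.ArgumentParser()
parser.add_argument('--job_name', choices=['ps', 'worker'])
parser.add_argument('--task_index', type=int, default=0)
FLAGS = parser.parse_args()

# Placeholder hosts; replace with the real pc-01/pc-02/pc-03 addresses.
cluster = tf.train.ClusterSpec({
    'ps': ['pc-01:2222'],
    'worker': ['pc-02:2222', 'pc-03:2222'],
})
server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)

if FLAGS.job_name == 'ps':
    server.join()  # the parameter server only hosts variables
else:
    # Variables are placed on the ps job; compute ops stay on this worker.
    with tf.device(tf.train.replica_device_setter(
            worker_device='/job:worker/task:%d' % FLAGS.task_index,
            cluster=cluster)):
        weights = tf.Variable(tf.zeros([784, 10]))  # placeholder model parameters
        # ... build the model and training op here ...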
@GD06
GD06 / run.sh
Last active May 3, 2017 10:01
Run the distributed training on localhost across multiple processes
#!/bin/bash -e
# Start the parameter server on CPU only (no visible GPUs), in the background.
CUDA_VISIBLE_DEVICES='' python3 training.py --job_name="ps" --task_index=0 &
# Give the parameter server time to come up before launching the workers.
sleep 60
# Pin each worker to its own GPU and capture its stderr in a separate log file.
CUDA_VISIBLE_DEVICES='0' python3 training.py --job_name="worker" --task_index=0 2> worker_1_log &
CUDA_VISIBLE_DEVICES='1' python3 training.py --job_name="worker" --task_index=1 2> worker_2_log