shibacow/.bashrc

## .bashrc
# Shared-library search path for the CUDA HPL build: MKL, the Intel compiler
# runtime (unversioned path and the 2017.0.098 install), the HPL CUDA DGEMM
# directory and the Open MPI libraries.
#
# The value has to be ONE shell word. The previous hard-wrapped form ended the
# assignment at the first line break and ran the remaining fragments as
# commands. Inside double quotes a trailing backslash joins lines without
# inserting anything, so the continuation lines must start at column 0.
export LD_LIBRARY_PATH="/opt/intel/mkl/lib/intel64:\
/opt/intel/compilers_and_libraries/linux/lib/intel64:\
/home/ec2-user/hpl-2.0_FERMI_v15/src/cuda:\
/usr/lib64/openmpi/lib:\
/opt/intel/compilers_and_libraries_2017.0.098/linux/compiler/lib/intel64_lin"

# Search the Open MPI bin directory before the rest of PATH.
export PATH="/usr/lib64/openmpi/bin:$PATH"

## HPL.dat
HPLinpack benchmark input file
Innovative Computing Laboratory, University of Tennessee
HPL.out      output file name (if any)
6            device out (6=stdout,7=stderr,file)
2           # of problems sizes (N)
65536 81920 100000 110000 160000 180000 39007 39000  20960 364160 359424 276480 138240 115200 23040 354432 236160 95040 9600 20737 16129 16128 Ns
2             # of NBs
768 1536 640 768 896 960 1024 1152 1280 384 640 960 768 640 256  960 512 768 1152         NBs
0            PMAP process mapping (0=Row-,1=Column-major)
1            # of process grids (P x Q)
1       Ps
1       Qs
16.0         threshold
1            # of panel fact
0 1 2        PFACTs (0=left, 1=Crout, 2=Right)
1            # of recursive stopping criterium
4 2 8          NBMINs (>= 1)
1            # of panels in recursion
2            NDIVs
1            # of recursive panel fact.
0 1 2        RFACTs (0=left, 1=Crout, 2=Right)
1            # of broadcast
0 2          BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
1            # of lookahead depth
1 0          DEPTHs (>=0)
2            SWAP (0=bin-exch,1=long,2=mix)
128          swapping threshold
1            L1 in (0=transposed,1=no-transposed) form
1            U  in (0=transposed,1=no-transposed) form
1            Equilibration (0=no,1=yes)
8            memory alignment in double (> 0)

## make.CUDA
[ec2-user@ip-10-0-0-192 hpl-2.0_FERMI_v15]$ more Make.CUDA
#
#     This is just a sample Make.
#     The user may need to edit:
#         1.) TOPdir
#         2.) MPI variables (MPdir,MPinc,MPlib)
#         3.) MKL BLAS variables (LAdir, LAinc, LAlib)
#         4.) The Compiler and Compiler/Linker Options (CC,CCFLAGS)
#

#
#  -- High Performance Computing Linpack Benchmark (HPL)
#     HPL - 1.0a - January 20, 2004
#     Antoine P. Petitet
#     University of Tennessee, Knoxville
#     Innovative Computing Laboratories
#     (C) Copyright 2000-2004 All Rights Reserved
#
#  -- Copyright notice and Licensing terms:
#
#  Redistribution  and  use in  source and binary forms, with or without
#  modification, are  permitted provided  that the following  conditions
#  are met:
#
#  1. Redistributions  of  source  code  must retain the above copyright
#  notice, this list of conditions and the following disclaimer.
#
#  2. Redistributions in binary form must reproduce  the above copyright
#  notice, this list of conditions,  and the following disclaimer in the
#  documentation and/or other materials provided with the distribution.
#
#  3. All  advertising  materials  mentioning  features  or  use of this
#  software must display the following acknowledgement:
#  This  product  includes  software  developed  at  the  University  of
#  Tennessee, Knoxville, Innovative Computing Laboratories.
#
#  4. The name of the  University,  the name of the  Laboratory,  or the
#  names  of  its  contributors  may  not  be used to endorse or promote
#  products  derived   from   this  software  without  specific  written
#  permission.
#
#  -- Disclaimer:
#
#  THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
#  OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
#  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#
# ----------------------------------------------------------------------
# - shell --------------------------------------------------------------
# ----------------------------------------------------------------------
#
# Shell that make uses to execute recipe lines.
SHELL        = /bin/sh
#
# Names of the basic file utilities. No rule in the visible part of this
# file uses them; presumably the HPL makefiles that include this file do
# -- confirm.
CD           = cd
CP           = cp
# ln: -f replaces an existing destination, -s creates a symbolic link.
LN_S         = ln -fs
# mkdir: -p creates missing parents and does not fail on existing dirs.
MKDIR        = mkdir -p
# rm: -f ignores missing files and never prompts.
RM           = /bin/rm -f
TOUCH        = touch
#
# ----------------------------------------------------------------------
# - Platform identifier ------------------------------------------------
# ----------------------------------------------------------------------
#
# ARCH names this build configuration; it is used below as the last path
# component of BINdir and LIBdir, and in the include path $(INCdir)/$(ARCH).
ARCH         = CUDA
#
# ----------------------------------------------------------------------
# - HPL Directory Structure / HPL library ------------------------------
# ----------------------------------------------------------------------
#
# Set TOPdir to the location of where this is being built
# The ifndef keeps a TOPdir that is already set (for example the
# TOPdir=... argument added by the MAKE definition at the end of this
# file) and only otherwise falls back to the hard-coded default.
ifndef  TOPdir
TOPdir = /home/ec2-user/hpl-2.0_FERMI_v15
endif
# Header directory, and the per-ARCH output directories for executables
# and libraries.
INCdir       = $(TOPdir)/include
BINdir       = $(TOPdir)/bin/$(ARCH)
LIBdir       = $(TOPdir)/lib/$(ARCH)
#
# Static HPL library; it is the first entry of HPL_LIBS.
HPLlib       = $(LIBdir)/libhpl.a
#
# ----------------------------------------------------------------------
# - Message Passing library (MPI) --------------------------------------
# ----------------------------------------------------------------------
# MPinc tells the  C  compiler where to find the Message Passing library
# header files,  MPlib  is defined  to be the name of  the library to be
# used. The variable MPdir is only used for defining MPinc and MPlib.
#
#MPdir        = /opt/intel/mpi/3.0
#MPinc        = -I$(MPdir)/include64
#MPlib        = $(MPdir)/lib64/libmpi.a
#MPlib        = $(MPdir)/lib64/libmpich.a
#
# ----------------------------------------------------------------------
# - Linear Algebra library (BLAS) -----------------------------
# ----------------------------------------------------------------------
# LAinc tells the  C  compiler where to find the Linear Algebra  library
# header files,  LAlib  is defined  to be the name of  the library to be
# used. The variable LAdir is only used for defining LAinc and LAlib.
#
#LAdir        = $(TOPdir)/../../lib/em64t
#LAdir        = /share/apps/intel/mkl/10.2.4.032/libem64t
# Directory searched for the MKL libraries named in LAlib (-lmkl_*).
LAdir        = /opt/intel/mkl/lib/intel64
#LAMP5dir = /opt/intel/compilers_and_libraries/linux/lib/intel64
# Directory searched for libiomp5 (see -liomp5 in LAlib). The path is
# pinned to the 2017.0.098 compiler install and must be updated when a
# different version is installed.
LAMP5dir= /opt/intel/compilers_and_libraries_2017.0.098/linux/compiler/lib/intel64_lin
#LAinc        =
# Include path for the MKL headers.
LAinc        = -I/opt/intel/mkl/include
# CUDA
#LAlib        = -L /home/cuda/Fortran_Cuda_Blas  -ldgemm -L/usr/local/cuda/lib -lcublas  -L$(LAdir) -lmkl -lguide -lpthread
# Link line, in order: libdgemm from $(TOPdir)/src/cuda; libcuda,
# libcudart and libcublas from /usr/local/cuda/lib64; the three MKL
# libraries from LAdir; libiomp5 from LAMP5dir.
# NOTE(review): libdgemm is presumably the GPU DGEMM wrapper built in
# src/cuda -- confirm. The directories used here must also be on
# LD_LIBRARY_PATH at run time if these are shared libraries.
LAlib        = -L $(TOPdir)/src/cuda  -ldgemm -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -L$(LAdir) -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -L$(LAMP5dir) -liomp5
#
# ----------------------------------------------------------------------
# - F77 / C interface --------------------------------------------------
# ----------------------------------------------------------------------
# You can skip this section  if and only if  you are not planning to use
# a  BLAS  library featuring a Fortran 77 interface.  Otherwise,  it  is
# necessary  to  fill out the  F2CDEFS  variable  with  the  appropriate
# options.  **One and only one**  option should be chosen in **each** of
# the 3 following categories:
#
# 1) name space (How C calls a Fortran 77 routine)
#
# -DAdd_              : all lower case and a suffixed underscore  (Suns,
#                       Intel, ...),                           [default]
# -DNoChange          : all lower case (IBM RS6000),
# -DUpCase            : all upper case (Cray),
# -DAdd__             : the FORTRAN compiler in use is f2c.
#
# 2) C and Fortran 77 integer mapping
#
# -DF77_INTEGER=int   : Fortran 77 INTEGER is a C int,         [default]
# -DF77_INTEGER=long  : Fortran 77 INTEGER is a C long,
# -DF77_INTEGER=short : Fortran 77 INTEGER is a C short.
#
# 3) Fortran 77 string handling
#
# -DStringSunStyle    : The string address is passed at the string loca-
#                       tion on the stack, and the string length is then
#                       passed as  an  F77_INTEGER  after  all  explicit
#                       stack arguments,                       [default]
# -DStringStructPtr   : The address  of  a  structure  is  passed  by  a
#                       Fortran 77  string,  and the structure is of the
#                       form: struct {char *cp; F77_INTEGER len;},
# -DStringStructVal   : A structure is passed by value for each  Fortran
#                       77 string,  and  the  structure is  of the form:
#                       struct {char *cp; F77_INTEGER len;},
# -DStringCrayStyle   : Special option for  Cray  machines,  which  uses
#                       Cray  fcd  (fortran  character  descriptor)  for
#                       interoperation.
#
# One option from each of the three categories above: name space,
# integer mapping, string handling.
# NOTE(review): the table above lists -DAdd_ as the default (Suns, Intel)
# and -DAdd__ as the f2c convention, yet -DAdd__ is selected here while
# LAlib links Intel MKL -- confirm this is intentional.
F2CDEFS      = -DAdd__ -DF77_INTEGER=int -DStringSunStyle
#
# ----------------------------------------------------------------------
# - HPL includes / libraries / specifics -------------------------------
# ----------------------------------------------------------------------
#
# Include search path: HPL headers (generic and per-ARCH), the BLAS
# headers from LAinc, the MPI headers from MPinc, and the CUDA headers.
# MPinc and MPlib are only assigned in commented-out lines of the MPI
# section above, so both expand to nothing unless they are set elsewhere;
# presumably the mpicc wrapper supplies the MPI headers and libraries
# -- confirm.
HPL_INCLUDES = -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) -I/usr/local/cuda/include
# Link inputs: the HPL static library first, then BLAS/CUDA, then MPI.
HPL_LIBS     = $(HPLlib) $(LAlib) $(MPlib)
#
# - Compile time options -----------------------------------------------
#
# -DHPL_COPY_L           force the copy of the panel L before bcast;
# -DHPL_CALL_CBLAS       call the cblas interface;
# -DHPL_DETAILED_TIMING  enable detailed timers;
# -DASYOUGO              enable timing information as you go (nonintrusive)
# -DASYOUGO2             slightly intrusive timing information
# -DASYOUGO2_DISPLAY     display detailed DGEMM information
# -DENDEARLY             end the problem early
# -DFASTSWAP             insert to use DLASWP instead of HPL code
#
# By default HPL will:
#    *) not copy L before broadcast,
#    *) call the BLAS Fortran 77 interface,
#    *) not display detailed timing information.
#
# Only the CUDA preprocessor macro is defined; none of the optional
# switches listed above is enabled. -DCUDA itself is not in that list --
# presumably it selects the GPU code paths of this HPL variant; confirm.
HPL_OPTS     =  -DCUDA
# ----------------------------------------------------------------------
#
# All preprocessor definitions and include paths handed to the compiler;
# CCFLAGS and CCNOOPT below both start from this value.
HPL_DEFS     = $(F2CDEFS) $(HPL_OPTS) $(HPL_INCLUDES)
#
# ----------------------------------------------------------------------
# - Compilers / linkers - Optimization flags ---------------------------
#
# next two lines for GNU Compilers:
# The C compiler is the MPI wrapper mpicc (found through PATH).
CC      = mpicc
# Optimized build flags: HPL_DEFS plus -O3, loop unrolling, no frame
# pointer, extra warnings (-W -Wall) and OpenMP support.
# NOTE(review): LINKFLAGS reuses these flags, so -fopenmp is also passed
# at link time while LAlib links libiomp5 -- confirm that only one OpenMP
# runtime ends up in the executable.
CCFLAGS = $(HPL_DEFS) -fomit-frame-pointer -O3 -funroll-loops -W -Wall -fopenmp
# next two lines for Intel Compilers:
#CC      = mpicc
#CCFLAGS = $(HPL_DEFS) -O3 -axS -w -fomit-frame-pointer -funroll-loops -openmp
#
# Flags for sources that must be built without optimization: same
# definitions, -O0, warnings suppressed (-w). Not used in this file.
CCNOOPT      = $(HPL_DEFS) -O0 -w
#
# On some platforms,  it is necessary  to use the Fortran linker to find
# the Fortran internals used in the BLAS library.
#
# Here the C compiler driver is also the linker, with the same flags as
# compilation.
LINKER       = $(CC)
#LINKFLAGS    = $(CCFLAGS) -static_mpi
LINKFLAGS    = $(CCFLAGS)
#
# Static library tooling: "ar r" inserts or replaces archive members.
ARCHIVER     = ar
ARFLAGS      = r
# RANLIB is set to echo, so a ranlib step only prints its arguments and
# does not touch the archive.
RANLIB       = echo
#
# ----------------------------------------------------------------------
# Overrides make's built-in MAKE variable so that sub-makes started via
# $(MAKE) receive the current TOPdir on their command line.
MAKE = make TOPdir=$(TOPdir)

## p2.8xlarge_bench_result
================================================================================
[ec2-user@ip-xxxx-xxxx-xxxx-xxxx CUDA]$ mpirun -np 8 -host localhost ./run_linpack
================================================================================
HPLinpack 2.0  --  High-Performance Linpack benchmark  --   September 10, 2008
Written by A. Petitet and R. Clint Whaley,  Innovative Computing Laboratory, UTK
Modified by Piotr Luszczek, Innovative Computing Laboratory, UTK
Modified by Julien Langou, University of Colorado Denver
================================================================================

An explanation of the input/output parameters follows:
T/V    : Wall time / encoded variant.
N      : The order of the coefficient matrix A.
NB     : The partitioning blocking factor.
P      : The number of process rows.
Q      : The number of process columns.
Time   : Time in seconds to solve the linear system.
Gflops : Rate of execution for solving the linear system.

The following parameter values will be used:

N      :   65536    81920
NB     :     768     1536     2048
PMAP   : Row-major process mapping
P      :       2
Q      :       4
PFACT  :    Left
NBMIN  :       4
NDIV   :       2
RFACT  :    Left
BCAST  :   1ring
DEPTH  :       1
SWAP   : Mix (threshold = 128)
L1     : no-transposed form
U      : no-transposed form
EQUIL  : yes
ALIGN  : 8 double precision words

--------------------------------------------------------------------------------

- The matrix A is randomly generated for each test.
- The following scaled residual check will be computed:
      ||Ax-b||_oo / ( eps * ( || x ||_oo * || A ||_oo + || b ||_oo ) * N )
- The relative machine precision (eps) is taken to be               1.110223e-16
- Computational tests pass if scaled residuals are less than                16.0

================================================================================
T/V                N    NB     P     Q               Time                 Gflops
--------------------------------------------------------------------------------
WR10L2L4       65536   768     2     4             126.13              1.488e+03
--------------------------------------------------------------------------------
||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0030666 ...... PASSED
================================================================================
T/V                N    NB     P     Q               Time                 Gflops
--------------------------------------------------------------------------------
WR10L2L4       65536  1536     2     4             106.63              1.760e+03
--------------------------------------------------------------------------------
||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0033980 ...... PASSED
================================================================================
T/V                N    NB     P     Q               Time                 Gflops
--------------------------------------------------------------------------------
WR10L2L4       65536  2048     2     4              90.45              2.075e+03
--------------------------------------------------------------------------------
||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0032682 ...... PASSED
================================================================================
T/V                N    NB     P     Q               Time                 Gflops
--------------------------------------------------------------------------------
WR10L2L4       81920   768     2     4             231.72              1.582e+03
--------------------------------------------------------------------------------
||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0029593 ...... PASSED
================================================================================
T/V                N    NB     P     Q               Time                 Gflops
--------------------------------------------------------------------------------
WR10L2L4       81920  1536     2     4             184.96              1.982e+03
--------------------------------------------------------------------------------
||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0031443 ...... PASSED
================================================================================
T/V                N    NB     P     Q               Time                 Gflops
--------------------------------------------------------------------------------
WR10L2L4       81920  2048     2     4             151.98              2.412e+03
--------------------------------------------------------------------------------
||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0030877 ...... PASSED
================================================================================

Finished      6 tests with the following results:
              6 tests completed and passed residual checks,
              0 tests completed and failed residual checks,
              0 tests skipped because of illegal input values.
--------------------------------------------------------------------------------

End of Tests.

## run_linpack
#!/bin/bash
#
# run_linpack - per-rank launcher for the CUDA-enabled xhpl binary.
#
# Started once per MPI process, e.g.:
#   mpirun -np 8 -host localhost ./run_linpack
#
# It exports the thread counts and the CUDA split settings, puts the HPL
# CUDA library directory on LD_LIBRARY_PATH and then runs xhpl. The exit
# status is the exit status of xhpl.
#
# Optional environment overrides (the defaults are the previous hard-coded
# values):
#   HPL_DIR            root of the HPL tree
#   CPU_CORES_PER_GPU  threads given to each MPI process

# Treat a reference to an unset variable as an error instead of expanding it
# to an empty string.
set -u

# Location of HPL.
HPL_DIR=${HPL_DIR:-/home/ec2-user/hpl-2.0_FERMI_v15}

# Number of CPU cores ( per GPU used = per MPI process ).
CPU_CORES_PER_GPU=${CPU_CORES_PER_GPU:-4}

# Same thread count for each threading runtime that may be in use.
# FOR MKL
export MKL_NUM_THREADS="$CPU_CORES_PER_GPU"
# FOR GOTO
export GOTO_NUM_THREADS="$CPU_CORES_PER_GPU"
# FOR OMP
export OMP_NUM_THREADS="$CPU_CORES_PER_GPU"

# Presumably keeps MKL from lowering the thread count set above -- confirm
# against the MKL documentation.
export MKL_DYNAMIC=FALSE

# hint: for 2050 or 2070 card
#       try 350/(350 + MKL_NUM_THREADS*4*cpu frequency in GHz)
export CUDA_DGEMM_SPLIT=0.80

# hint: try CUDA_DGEMM_SPLIT - 0.10
export CUDA_DTRSM_SPLIT=0.70

# Prepend the HPL CUDA library directory. The ${VAR:+...} form adds the ":"
# separator only when LD_LIBRARY_PATH already has a value: a plain
# "dir:$LD_LIBRARY_PATH" would end in ":" when the variable is empty, and the
# dynamic loader treats an empty entry as the current directory.
export LD_LIBRARY_PATH="$HPL_DIR/src/cuda${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# Quoted so that an HPL_DIR containing spaces still resolves to one path.
"$HPL_DIR/bin/CUDA/xhpl"

## xhpl_benchmark_result
[ec2-user@ip-xxx-xxx-xxx-xxx CUDA]$ mpirun -np 1 -host localhost ./run_linpack
================================================================================
HPLinpack 2.0  --  High-Performance Linpack benchmark  --   September 10, 2008
Written by A. Petitet and R. Clint Whaley,  Innovative Computing Laboratory, UTK
Modified by Piotr Luszczek, Innovative Computing Laboratory, UTK
Modified by Julien Langou, University of Colorado Denver
================================================================================

An explanation of the input/output parameters follows:
T/V    : Wall time / encoded variant.
N      : The order of the coefficient matrix A.
NB     : The partitioning blocking factor.
P      : The number of process rows.
Q      : The number of process columns.
Time   : Time in seconds to solve the linear system.
Gflops : Rate of execution for solving the linear system.

The following parameter values will be used:

N      :   65536    81920
NB     :     768     1536
PMAP   : Row-major process mapping
P      :       1
Q      :       1
PFACT  :    Left
NBMIN  :       4
NDIV   :       2
RFACT  :    Left
BCAST  :   1ring
DEPTH  :       1
SWAP   : Mix (threshold = 128)
L1     : no-transposed form
U      : no-transposed form
EQUIL  : yes
ALIGN  : 4 double precision words

--------------------------------------------------------------------------------

- The matrix A is randomly generated for each test.
- The following scaled residual check will be computed:
      ||Ax-b||_oo / ( eps * ( || x ||_oo * || A ||_oo + || b ||_oo ) * N )
- The relative machine precision (eps) is taken to be               1.110223e-16
- Computational tests pass if scaled residuals are less than                16.0

================================================================================
T/V                N    NB     P     Q               Time                 Gflops
--------------------------------------------------------------------------------
WR10L2L4       65536   768     1     1             242.50              7.738e+02
--------------------------------------------------------------------------------
||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0038208 ...... PASSED
================================================================================
T/V                N    NB     P     Q               Time                 Gflops
--------------------------------------------------------------------------------
WR10L2L4       65536  1536     1     1             235.32              7.975e+02
--------------------------------------------------------------------------------
||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0043131 ...... PASSED
================================================================================
T/V                N    NB     P     Q               Time                 Gflops
--------------------------------------------------------------------------------
WR10L2L4       81920   768     1     1             449.06              8.162e+02
--------------------------------------------------------------------------------
||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0040796 ...... PASSED
================================================================================
T/V                N    NB     P     Q               Time                 Gflops
--------------------------------------------------------------------------------
WR10L2L4       81920  1536     1     1             454.85              8.058e+02
--------------------------------------------------------------------------------
||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=        0.0047358 ...... PASSED
================================================================================

Finished      4 tests with the following results:
              4 tests completed and passed residual checks,
              0 tests completed and failed residual checks,
              0 tests skipped because of illegal input values.
--------------------------------------------------------------------------------

End of Tests.
================================================================================
	export LD_LIBRARY_PATH=/opt/intel/mkl/lib/intel64:/opt/intel/compilers_and_libraries/linux/lib/intel64:/home/ec2-user/hpl-2.0_FERMI_v15/src/cuda:/usr/lib64/openmpi/lib:/opt/intel/compilers_and_libraries_2017.0.098/linux/compiler/lib/intel64_lin
	export PATH=/usr/lib64/openmpi/bin:$PATH
	HPLinpack benchmark input file
	Innovative Computing Laboratory, University of Tennessee
	HPL.out output file name (if any)
	6 device out (6=stdout,7=stderr,file)
	2 # of problems sizes (N)
	65536 81920 100000 110000 160000 180000 39007 39000 20960 364160 359424 276480 138240 115200 23040 354432 236160 95040 9600 20737 16129 16128 Ns
	2 # of NBs
	768 1536 640 768 896 960 1024 1152 1280 384 640 960 768 640 256 960 512 768 1152 NBs
	0 PMAP process mapping (0=Row-,1=Column-major)
	1 # of process grids (P x Q)
	1 Ps
	1 Qs
	16.0 threshold
	1 # of panel fact
	0 1 2 PFACTs (0=left, 1=Crout, 2=Right)
	1 # of recursive stopping criterium
	4 2 8 NBMINs (>= 1)
	1 # of panels in recursion
	2 NDIVs
	1 # of recursive panel fact.
	0 1 2 RFACTs (0=left, 1=Crout, 2=Right)
	1 # of broadcast
	0 2 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
	1 # of lookahead depth
	1 0 DEPTHs (>=0)
	2 SWAP (0=bin-exch,1=long,2=mix)
	128 swapping threshold
	1 L1 in (0=transposed,1=no-transposed) form
	1 U in (0=transposed,1=no-transposed) form
	1 Equilibration (0=no,1=yes)
	8 memory alignment in double (> 0)
	[ec2-user@ip-10-0-0-192 hpl-2.0_FERMI_v15]$ more Make.CUDA
	#
	# This is just a sample Make.
	# The user may need to edit:
	# 1.) TOPdir
	# 2.) MPI variables (MPdir,MPinc,MPlib)
	# 3.) MKL BLAS variables (LAdir, LAinc, LAlib)
	# 4.) The Compiler and Compiler/Linker Options (CC,CCFLAGS)
	#

	#
	# -- High Performance Computing Linpack Benchmark (HPL)
	# HPL - 1.0a - January 20, 2004
	# Antoine P. Petitet
	# University of Tennessee, Knoxville
	# Innovative Computing Laboratories
	# (C) Copyright 2000-2004 All Rights Reserved
	#
	# -- Copyright notice and Licensing terms:
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions
	# are met:
	#
	# 1. Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	#
	# 2. Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions, and the following disclaimer in the
	# documentation and/or other materials provided with the distribution.
	#
	# 3. All advertising materials mentioning features or use of this
	# software must display the following acknowledgement:
	# This product includes software developed at the University of
	# Tennessee, Knoxville, Innovative Computing Laboratories.
	#
	# 4. The name of the University, the name of the Laboratory, or the
	# names of its contributors may not be used to endorse or promote
	# products derived from this software without specific written
	# permission.
	#
	# -- Disclaimer:
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
	# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	# DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
	# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
	# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
	# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
	# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	#
	# ----------------------------------------------------------------------
	# - shell --------------------------------------------------------------
	# ----------------------------------------------------------------------
	#
	SHELL = /bin/sh
	#
	CD = cd
	CP = cp
	LN_S = ln -fs
	MKDIR = mkdir -p
	RM = /bin/rm -f
	TOUCH = touch
	#
	# ----------------------------------------------------------------------
	# - Platform identifier ------------------------------------------------
	# ----------------------------------------------------------------------
	#
	ARCH = CUDA
	#
	# ----------------------------------------------------------------------
	# - HPL Directory Structure / HPL library ------------------------------
	# ----------------------------------------------------------------------
	#
	# Set TOPdir to the location of where this is being built
	ifndef TOPdir
	TOPdir = /home/ec2-user/hpl-2.0_FERMI_v15
	endif
	INCdir = $(TOPdir)/include
	BINdir = $(TOPdir)/bin/$(ARCH)
	LIBdir = $(TOPdir)/lib/$(ARCH)
	#
	HPLlib = $(LIBdir)/libhpl.a
	#
	# ----------------------------------------------------------------------
	# - Message Passing library (MPI) --------------------------------------
	# ----------------------------------------------------------------------
	# MPinc tells the C compiler where to find the Message Passing library
	# header files, MPlib is defined to be the name of the library to be
	# used. The variable MPdir is only used for defining MPinc and MPlib.
	#
	#MPdir = /opt/intel/mpi/3.0
	#MPinc = -I$(MPdir)/include64
	#MPlib = $(MPdir)/lib64/libmpi.a
	#MPlib = $(MPdir)/lib64/libmpich.a
	#
	# ----------------------------------------------------------------------
	# - Linear Algebra library (BLAS) -----------------------------
	# ----------------------------------------------------------------------
	# LAinc tells the C compiler where to find the Linear Algebra library
	# header files, LAlib is defined to be the name of the library to be
	# used. The variable LAdir is only used for defining LAinc and LAlib.
	#
	#LAdir = $(TOPdir)/../../lib/em64t
	#LAdir = /share/apps/intel/mkl/10.2.4.032/libem64t
	LAdir = /opt/intel/mkl/lib/intel64
	#LAMP5dir = /opt/intel/compilers_and_libraries/linux/lib/intel64
	LAMP5dir= /opt/intel/compilers_and_libraries_2017.0.098/linux/compiler/lib/intel64_lin
	#LAinc =
	LAinc = -I/opt/intel/mkl/include
	# CUDA
	#LAlib = -L /home/cuda/Fortran_Cuda_Blas -ldgemm -L/usr/local/cuda/lib -lcublas -L$(LAdir) -lmkl -lguide -lpthread
	LAlib = -L $(TOPdir)/src/cuda -ldgemm -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -L$(LAdir) -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -L$(LAMP5dir) -liomp5
	#
	# ----------------------------------------------------------------------
	# - F77 / C interface --------------------------------------------------
	# ----------------------------------------------------------------------
	# You can skip this section if and only if you are not planning to use
	# a BLAS library featuring a Fortran 77 interface. Otherwise, it is
	# necessary to fill out the F2CDEFS variable with the appropriate
	# options. One and only one option should be chosen in each of
	# the 3 following categories:
	#
	# 1) name space (How C calls a Fortran 77 routine)
	#
	# -DAdd_ : all lower case and a suffixed underscore (Suns,
	# Intel, ...), [default]
	# -DNoChange : all lower case (IBM RS6000),
	# -DUpCase : all upper case (Cray),
	# -DAdd__ : the FORTRAN compiler in use is f2c.
	#
	# 2) C and Fortran 77 integer mapping
	#
	# -DF77_INTEGER=int : Fortran 77 INTEGER is a C int, [default]
	# -DF77_INTEGER=long : Fortran 77 INTEGER is a C long,
	# -DF77_INTEGER=short : Fortran 77 INTEGER is a C short.
	#
	# 3) Fortran 77 string handling
	#
	# -DStringSunStyle : The string address is passed at the string loca-
	# tion on the stack, and the string length is then
	# passed as an F77_INTEGER after all explicit
	# stack arguments, [default]
	# -DStringStructPtr : The address of a structure is passed by a
	# Fortran 77 string, and the structure is of the
	# form: struct {char *cp; F77_INTEGER len;},
	# -DStringStructVal : A structure is passed by value for each Fortran
	# 77 string, and the structure is of the form:
	# struct {char *cp; F77_INTEGER len;},
	# -DStringCrayStyle : Special option for Cray machines, which uses
	# Cray fcd (fortran character descriptor) for
	# interoperation.
	#
	F2CDEFS = -DAdd__ -DF77_INTEGER=int -DStringSunStyle
	#
	# ----------------------------------------------------------------------
	# - HPL includes / libraries / specifics -------------------------------
	# ----------------------------------------------------------------------
	#
	HPL_INCLUDES = -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) -I/usr/local/cuda/include
	HPL_LIBS = $(HPLlib) $(LAlib) $(MPlib)
	#
	# - Compile time options -----------------------------------------------
	#
	# -DHPL_COPY_L force the copy of the panel L before bcast;
	# -DHPL_CALL_CBLAS call the cblas interface;
	# -DHPL_DETAILED_TIMING enable detailed timers;
	# -DASYOUGO enable timing information as you go (nonintrusive)
	# -DASYOUGO2 slightly intrusive timing information
	# -DASYOUGO2_DISPLAY display detailed DGEMM information
	# -DENDEARLY end the problem early
	# -DFASTSWAP insert to use DLASWP instead of HPL code
	#
	# By default HPL will:
	# *) not copy L before broadcast,
	# *) call the BLAS Fortran 77 interface,
	# *) not display detailed timing information.
	#
	HPL_OPTS = -DCUDA
	# ----------------------------------------------------------------------
	#
	HPL_DEFS = $(F2CDEFS) $(HPL_OPTS) $(HPL_INCLUDES)
	#
	# ----------------------------------------------------------------------
	# - Compilers / linkers - Optimization flags ---------------------------
	#
	# next two lines for GNU Compilers:
	# (these are the active settings; the Intel variant further down is
	# commented out)
	CC = mpicc
	CCFLAGS = $(HPL_DEFS) -fomit-frame-pointer -O3 -funroll-loops -W -Wall -fopenmp
	# next two lines for Intel Compilers:
	#CC = mpicc
	#CCFLAGS = $(HPL_DEFS) -O3 -axS -w -fomit-frame-pointer -funroll-loops -openmp
	#
	# Unoptimised flag set: the same $(HPL_DEFS), but with -O0 and -w in
	# place of the optimisation and warning options used in CCFLAGS.
	CCNOOPT = $(HPL_DEFS) -O0 -w
	#
	# On some platforms, it is necessary to use the Fortran linker to find
	# the Fortran internals used in the BLAS library.
	#
	# Here the link step reuses the C compiler command and its flags; the
	# -static_mpi variant of LINKFLAGS is kept only as a commented-out
	# alternative.
	LINKER = $(CC)
	#LINKFLAGS = $(CCFLAGS) -static_mpi
	LINKFLAGS = $(CCFLAGS)
	#
	ARCHIVER = ar
	ARFLAGS = r
	# RANLIB is set to echo, so a $(RANLIB) step only prints its arguments
	# (presumably because `ar` already maintains the archive index -- confirm).
	RANLIB = echo
	#
	# ----------------------------------------------------------------------
	# Every $(MAKE) invocation carries TOPdir on its command line.
	MAKE = make TOPdir=$(TOPdir)
	================================================================================
	[ec2-user@ip-xxxx-xxxx-xxxx-xxxx CUDA]$ mpirun -np 8 -host localhost ./run_linpack
	================================================================================
	HPLinpack 2.0 -- High-Performance Linpack benchmark -- September 10, 2008
	Written by A. Petitet and R. Clint Whaley, Innovative Computing Laboratory, UTK
	Modified by Piotr Luszczek, Innovative Computing Laboratory, UTK
	Modified by Julien Langou, University of Colorado Denver
	================================================================================

	An explanation of the input/output parameters follows:
	T/V : Wall time / encoded variant.
	N : The order of the coefficient matrix A.
	NB : The partitioning blocking factor.
	P : The number of process rows.
	Q : The number of process columns.
	Time : Time in seconds to solve the linear system.
	Gflops : Rate of execution for solving the linear system.

	The following parameter values will be used:

	N : 65536 81920
	NB : 768 1536 2048
	PMAP : Row-major process mapping
	P : 2
	Q : 4
	PFACT : Left
	NBMIN : 4
	NDIV : 2
	RFACT : Left
	BCAST : 1ring
	DEPTH : 1
	SWAP : Mix (threshold = 128)
	L1 : no-transposed form
	U : no-transposed form
	EQUIL : yes
	ALIGN : 8 double precision words

	--------------------------------------------------------------------------------

	- The matrix A is randomly generated for each test.
	- The following scaled residual check will be computed:
	\|\|Ax-b\|\|_oo / ( eps * ( \|\| x \|\|_oo * \|\| A \|\|_oo + \|\| b \|\|_oo ) * N )
	- The relative machine precision (eps) is taken to be 1.110223e-16
	- Computational tests pass if scaled residuals are less than 16.0

	================================================================================
	T/V N NB P Q Time Gflops
	--------------------------------------------------------------------------------
	WR10L2L4 65536 768 2 4 126.13 1.488e+03
	--------------------------------------------------------------------------------
	\|\|Ax-b\|\|_oo/(eps(\|\|A\|\|_oo\|\|x\|\|_oo+\|\|b\|\|_oo)*N)= 0.0030666 ...... PASSED
	================================================================================
	T/V N NB P Q Time Gflops
	--------------------------------------------------------------------------------
	WR10L2L4 65536 1536 2 4 106.63 1.760e+03
	--------------------------------------------------------------------------------
	\|\|Ax-b\|\|_oo/(eps(\|\|A\|\|_oo\|\|x\|\|_oo+\|\|b\|\|_oo)*N)= 0.0033980 ...... PASSED
	================================================================================
	T/V N NB P Q Time Gflops
	--------------------------------------------------------------------------------
	WR10L2L4 65536 2048 2 4 90.45 2.075e+03
	--------------------------------------------------------------------------------
	\|\|Ax-b\|\|_oo/(eps(\|\|A\|\|_oo\|\|x\|\|_oo+\|\|b\|\|_oo)*N)= 0.0032682 ...... PASSED
	================================================================================
	T/V N NB P Q Time Gflops
	--------------------------------------------------------------------------------
	WR10L2L4 81920 768 2 4 231.72 1.582e+03
	--------------------------------------------------------------------------------
	\|\|Ax-b\|\|_oo/(eps(\|\|A\|\|_oo\|\|x\|\|_oo+\|\|b\|\|_oo)*N)= 0.0029593 ...... PASSED
	================================================================================
	T/V N NB P Q Time Gflops
	--------------------------------------------------------------------------------
	WR10L2L4 81920 1536 2 4 184.96 1.982e+03
	--------------------------------------------------------------------------------
	\|\|Ax-b\|\|_oo/(eps(\|\|A\|\|_oo\|\|x\|\|_oo+\|\|b\|\|_oo)*N)= 0.0031443 ...... PASSED
	================================================================================
	T/V N NB P Q Time Gflops
	--------------------------------------------------------------------------------
	WR10L2L4 81920 2048 2 4 151.98 2.412e+03
	--------------------------------------------------------------------------------
	\|\|Ax-b\|\|_oo/(eps(\|\|A\|\|_oo\|\|x\|\|_oo+\|\|b\|\|_oo)*N)= 0.0030877 ...... PASSED
	================================================================================

	Finished 6 tests with the following results:
	6 tests completed and passed residual checks,
	0 tests completed and failed residual checks,
	0 tests skipped because of illegal input values.
	--------------------------------------------------------------------------------

	End of Tests.
	#!/bin/bash
	#
	# run_linpack -- launcher for the CUDA build of HPL (xhpl).
	#
	# Meant to be started through MPI, e.g. `mpirun -np N ./run_linpack`.
	# It exports the host-side thread counts and the CUDA_*_SPLIT tuning
	# values, puts the HPL CUDA library directory first on LD_LIBRARY_PATH,
	# and then runs xhpl. The exit status is that of xhpl.

	# location of HPL
	HPL_DIR=/home/ec2-user/hpl-2.0_FERMI_v15

	# Number of CPU cores ( per GPU used = per MPI process )
	CPU_CORES_PER_GPU=4

	# The same core count is exported under each name so that whichever
	# threading layer is in use (MKL, GotoBLAS or plain OpenMP) picks it up.
	# FOR MKL
	export MKL_NUM_THREADS="$CPU_CORES_PER_GPU"
	# FOR GOTO
	export GOTO_NUM_THREADS="$CPU_CORES_PER_GPU"
	# FOR OMP
	export OMP_NUM_THREADS="$CPU_CORES_PER_GPU"

	export MKL_DYNAMIC=FALSE

	# NOTE(review): presumably the share of DGEMM work given to the GPU --
	# confirm against the README shipped with this HPL build.
	# hint: for 2050 or 2070 card
	# try 350/(350 + MKL_NUM_THREADS*4*cpu frequency in GHz)
	export CUDA_DGEMM_SPLIT=0.80

	# hint: try CUDA_DGEMM_SPLIT - 0.10
	export CUDA_DTRSM_SPLIT=0.70

	# Prepend the HPL CUDA library directory. The ${VAR:+...} form adds the
	# ':' separator only when LD_LIBRARY_PATH already has a value; a bare
	# trailing ':' would leave an empty entry, which the dynamic loader
	# treats as the current working directory.
	export LD_LIBRARY_PATH="$HPL_DIR/src/cuda${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

	# Quoted so the path survives word splitting if HPL_DIR ever has spaces.
	"$HPL_DIR/bin/CUDA/xhpl"
	[ec2-user@ip-xxx-xxx-xxx-xxx CUDA]$ mpirun -np 1 -host localhost ./run_linpack
	================================================================================
	HPLinpack 2.0 -- High-Performance Linpack benchmark -- September 10, 2008
	Written by A. Petitet and R. Clint Whaley, Innovative Computing Laboratory, UTK
	Modified by Piotr Luszczek, Innovative Computing Laboratory, UTK
	Modified by Julien Langou, University of Colorado Denver
	================================================================================

	An explanation of the input/output parameters follows:
	T/V : Wall time / encoded variant.
	N : The order of the coefficient matrix A.
	NB : The partitioning blocking factor.
	P : The number of process rows.
	Q : The number of process columns.
	Time : Time in seconds to solve the linear system.
	Gflops : Rate of execution for solving the linear system.

	The following parameter values will be used:

	N : 65536 81920
	NB : 768 1536
	PMAP : Row-major process mapping
	P : 1
	Q : 1
	PFACT : Left
	NBMIN : 4
	NDIV : 2
	RFACT : Left
	BCAST : 1ring
	DEPTH : 1
	SWAP : Mix (threshold = 128)
	L1 : no-transposed form
	U : no-transposed form
	EQUIL : yes
	ALIGN : 4 double precision words

	--------------------------------------------------------------------------------

	- The matrix A is randomly generated for each test.
	- The following scaled residual check will be computed:
	\|\|Ax-b\|\|_oo / ( eps * ( \|\| x \|\|_oo * \|\| A \|\|_oo + \|\| b \|\|_oo ) * N )
	- The relative machine precision (eps) is taken to be 1.110223e-16
	- Computational tests pass if scaled residuals are less than 16.0

	================================================================================
	T/V N NB P Q Time Gflops
	--------------------------------------------------------------------------------
	WR10L2L4 65536 768 1 1 242.50 7.738e+02
	--------------------------------------------------------------------------------
	\|\|Ax-b\|\|_oo/(eps(\|\|A\|\|_oo\|\|x\|\|_oo+\|\|b\|\|_oo)*N)= 0.0038208 ...... PASSED
	================================================================================
	T/V N NB P Q Time Gflops
	--------------------------------------------------------------------------------
	WR10L2L4 65536 1536 1 1 235.32 7.975e+02
	--------------------------------------------------------------------------------
	\|\|Ax-b\|\|_oo/(eps(\|\|A\|\|_oo\|\|x\|\|_oo+\|\|b\|\|_oo)*N)= 0.0043131 ...... PASSED
	================================================================================
	T/V N NB P Q Time Gflops
	--------------------------------------------------------------------------------
	WR10L2L4 81920 768 1 1 449.06 8.162e+02
	--------------------------------------------------------------------------------
	\|\|Ax-b\|\|_oo/(eps(\|\|A\|\|_oo\|\|x\|\|_oo+\|\|b\|\|_oo)*N)= 0.0040796 ...... PASSED
	================================================================================
	T/V N NB P Q Time Gflops
	--------------------------------------------------------------------------------
	WR10L2L4 81920 1536 1 1 454.85 8.058e+02
	--------------------------------------------------------------------------------
	\|\|Ax-b\|\|_oo/(eps(\|\|A\|\|_oo\|\|x\|\|_oo+\|\|b\|\|_oo)*N)= 0.0047358 ...... PASSED
	================================================================================

	Finished 4 tests with the following results:
	4 tests completed and passed residual checks,
	0 tests completed and failed residual checks,
	0 tests skipped because of illegal input values.
	--------------------------------------------------------------------------------

	End of Tests.
	================================================================================