SixZero Sixzero

## gist:8f9c8cab34731e6cffff498ee6c513cc
using CUDA

function distributed_matmul(C, A::CuMatrix{Float32}, B::CuMatrix{Float32}, num_gpus::Int)
    m, n = size(A)
    n, k = size(B)

    # Ensure at least `num_gpus` GPUs are available
    @assert CUDA.ndevices() >= num_gpus "Not enough GPUs available"

    # Calculate rows per GPU

## gist:3312071709aadc9e7e6fcc1290cfd58a
#%%
# Copyright 2020 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software

## test_gpu_bigFN_compilation.jl
using BenchmarkTools
using CUDA
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 1.00


test_sum(a, b, c) = begin
  I = (blockIdx().x - 1) * blockDim().x + threadIdx().x
  if I > 1000
    return
  end

## Zygote simple local min problem


# Fixed inputs `X` and targets `Y`; the trailing `;;;` in each literal makes
# them 2×1×1 Float32 arrays.
X = Float32[-0.31125240132442067; 0.8163067649323273;;;]
Y = Float32[5.7064323; 2.599511;;;]

# Fixed one-element Float32 vectors `b` and `w` (presumably the starting
# parameters passed to `modl` — confirm against its body).
# Disabled random alternatives:
# w = randn(1,1,1) .* ones(2,1,1)
# b = randn(1,1,1) .* ones(2,1,1)
b = Float32[25.510088]
w = Float32[0.15980364]
modl(X,w,b) = begin

## example.jl


# Three independently drawn standard-normal vectors of length 1024.
a, b, c = randn(1024), randn(1024), randn(1024)

# Time the fused in-place elementwise sum twice (`@. c = a + b` expands to
# `c .= a .+ b`); presumably the second measurement is the one free of
# compilation overhead.
@time @. c = a + b
@time @. c = a + b
;

## assign_test.jl
#%%
using BenchmarkTools
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 2.50
using Base: @opaque

# Opaque closure returning the elementwise sum of two vectors.
add_opaq = @opaque (lhs::Vector, rhs::Vector) -> lhs .+ rhs
# Return a new vector holding the elementwise sum of `a` and `b`.
# `@noinline` hints the compiler not to inline this function.
@noinline function add(a::Vector, b::Vector)
    return @. a + b
end
# Overwrite `c` with the elementwise sum of `a` and `b` and return `c`.
# `@noinline` hints the compiler not to inline this function.
@noinline function add!(c::Vector, a::Vector, b::Vector)
    @. c = a + b
    return c
end
# Return a new vector holding the elementwise sum of `a` and `b`.
# `@inline` hints the compiler to inline this function at call sites.
@inline function add_inline(a::Vector, b::Vector)
    return @. a + b
end
#%%
	using CUDA

	function distributed_matmul(C, A::CuMatrix{Float32}, B::CuMatrix{Float32}, num_gpus::Int)
	m, n = size(A)
	n, k = size(B)

	# Ensure at least `num_gpus` GPUs are available
	@assert CUDA.ndevices() >= num_gpus "Not enough GPUs available"

	# Calculate rows per GPU
	#%%
	# Copyright 2020 DeepMind Technologies Limited. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	using BenchmarkTools
	using CUDA
	BenchmarkTools.DEFAULT_PARAMETERS.seconds = 1.00


	test_sum(a, b, c) = begin
	I = (blockIdx().x - 1) * blockDim().x + threadIdx().x
	if I > 1000
	return
	end


	# Fixed inputs `X` and targets `Y`; the trailing `;;;` in each literal makes
	# them 2×1×1 Float32 arrays.
	X = Float32[-0.31125240132442067; 0.8163067649323273;;;]
	Y = Float32[5.7064323; 2.599511;;;]

	# Fixed one-element Float32 vectors `b` and `w` (presumably the starting
	# parameters passed to `modl` — confirm against its body).
	# Disabled random alternatives:
	# w = randn(1,1,1) .* ones(2,1,1)
	# b = randn(1,1,1) .* ones(2,1,1)
	b = Float32[25.510088]
	w = Float32[0.15980364]
	modl(X,w,b) = begin


	# Three independently drawn standard-normal vectors of length 1024.
	a, b, c = randn(1024), randn(1024), randn(1024)

	# Time the fused in-place elementwise sum twice (`@. c = a + b` expands to
	# `c .= a .+ b`); presumably the second measurement is the one free of
	# compilation overhead.
	@time @. c = a + b
	@time @. c = a + b
	;
	#%%
	using BenchmarkTools
	BenchmarkTools.DEFAULT_PARAMETERS.seconds = 2.50
	using Base: @opaque

	# Opaque closure returning the elementwise sum of two vectors.
	add_opaq = @opaque (lhs::Vector, rhs::Vector) -> lhs .+ rhs
	# Return a new vector holding the elementwise sum of `a` and `b`.
	# `@noinline` hints the compiler not to inline this function.
	@noinline function add(a::Vector, b::Vector)
		return @. a + b
	end
	# Overwrite `c` with the elementwise sum of `a` and `b` and return `c`.
	# `@noinline` hints the compiler not to inline this function.
	@noinline function add!(c::Vector, a::Vector, b::Vector)
		@. c = a + b
		return c
	end
	# Return a new vector holding the elementwise sum of `a` and `b`.
	# `@inline` hints the compiler to inline this function at call sites.
	@inline function add_inline(a::Vector, b::Vector)
		return @. a + b
	end
	#%%