ailzhang/gist:568991eb2a2f685c09562699b849d9fd

## gistfile1.txt
import torch
import torch.nn as nn
import time
import subprocess
import argparse
import numpy as np
from torch.autograd import Variable
import pdb
def linearforward(batchsize, dim_in, dim_out):
    """Time one forward pass of a freshly built ``nn.Linear`` layer on the GPU.

    A random ``(batchsize, dim_in)`` float32 input and a new
    ``nn.Linear(dim_in, dim_out)`` layer are created and moved to the current
    CUDA device. Only the forward call itself is inside the timed region.

    Args:
        batchsize: Number of rows in the input matrix.
        dim_in: Number of input features of the linear layer.
        dim_out: Number of output features of the linear layer.

    Returns:
        Elapsed wall-clock time of the forward pass, in seconds (float).
    """
    data = torch.FloatTensor(np.random.random_sample([batchsize, dim_in]))
    # The Variable wrapper is kept from the original code; it is presumably
    # there for pre-0.4 PyTorch, where modules only accept Variables.
    data_in = Variable(data.cuda())
    net = nn.Linear(dim_in, dim_out).cuda()

    # Drain all pending GPU work (uploads, parameter init) so that none of it
    # leaks into the timed region.
    torch.cuda.synchronize()
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # with system clock adjustments and may be too coarse for short kernels.
    start = time.perf_counter()
    net(data_in)
    # Wait for the forward kernels to actually finish before reading the clock.
    torch.cuda.synchronize()
    return time.perf_counter() - start


# Peak throughput in TFLOP/s used to normalise the measured numbers
# (presumably the P100 single-precision peak -- confirm for the target card).
P100_PEAK_TFLOPS = 9.3


def parse_cublas_gflops(output):
    """Extract the GFlop/s figure from ``matrixMulCUBLAS`` output.

    The number is taken from between the ``Performance=`` and ``GFlop/s``
    markers, e.g. ``Performance= 1234.56 GFlop/s``. Whitespace around the
    number is optional.

    Args:
        output: Captured stdout of the benchmark binary, as ``bytes`` or ``str``.

    Returns:
        The reported throughput in GFlop/s, as a float.

    Raises:
        ValueError: If either marker is missing or the text between them is
            not a valid number.
    """
    if isinstance(output, bytes):
        # Decode instead of str(bytes): the latter yields the "b'...'" repr.
        output = output.decode("utf-8", errors="replace")
    _, start_marker, tail = output.partition("Performance=")
    value, end_marker, _ = tail.partition("GFlop/s")
    if not (start_marker and end_marker):
        raise ValueError(
            "no 'Performance= <n> GFlop/s' entry in matrixMulCUBLAS output: %r"
            % (output,)
        )
    return float(value)


def main():
    """Benchmark ``nn.Linear`` forward passes against a cuBLAS baseline.

    For every (batchsize, dim_in, dim_out) configuration the forward pass is
    timed ``rep`` times, the first ``warmup`` timings are discarded, and the
    achieved FLOP/s is compared with the figure reported by the external
    ``./matrixMulCUBLAS`` binary. Results are printed and saved to
    ``p100_pt3.npy``.
    """
    # NOTE(review): (21740, 2174) is listed twice; the repeat is benchmarked
    # again and simply overwrites the same result keys.
    shapes = [(22764, 2276), (2276, 30), (21740, 2174), (2174, 1024),
              (1024, 63), (21740, 2174), (2174, 1087), (1024, 1024)]
    sizes = []
    for batch in (32, 64, 128, 256):
        for dim_in, dim_out in shapes:
            sizes.append((batch, dim_in, dim_out))
            sizes.append((batch, dim_out, dim_in))
            sizes.append((dim_in, batch, dim_out))

    rep = 200
    warmup = 100
    result = {}
    for batchsize, dim_in, dim_out in sizes:
        timings = [linearforward(batchsize, dim_in, dim_out) for _ in range(rep)]
        # Only the post-warm-up runs contribute to the average.
        avg = sum(timings[warmup:]) / (rep - warmup)
        # One multiply and one add per (row, input, output) triple.
        flops = batchsize * dim_in * dim_out * 2 / avg

        # cublas baseline
        cu_run = subprocess.run(
            ['./matrixMulCUBLAS',
             '--hA=' + str(batchsize),
             '--wA=' + str(dim_in),
             '--wB=' + str(dim_out)],
            stdout=subprocess.PIPE,
        )
        cu_flops = parse_cublas_gflops(cu_run.stdout)

        # Tuple layout: (PyTorch FLOP/s, PyTorch fraction of peak,
        #                cuBLAS GFlop/s, cuBLAS fraction of peak).
        result[(batchsize, dim_in, dim_out)] = (
            flops,
            flops / (P100_PEAK_TFLOPS * 10**12),
            cu_flops,
            cu_flops / 1000 / P100_PEAK_TFLOPS,
        )
    print(result)
    # A dict is stored as a pickled object array; reload it with
    # np.load('p100_pt3.npy', allow_pickle=True).item().
    np.save('p100_pt3.npy', result)


if __name__ == "__main__":
    main()
	import torch
	import torch.nn as nn
	import time
	import subprocess
	import argparse
	import numpy as np
	from torch.autograd import Variable
	import pdb
	def linearforward(batchsize, dim_in, dim_out):
	    """Time one forward pass of a freshly built ``nn.Linear`` layer on the GPU.

	    A random ``(batchsize, dim_in)`` float32 input and a new
	    ``nn.Linear(dim_in, dim_out)`` layer are created and moved to the current
	    CUDA device. Only the forward call itself is inside the timed region.

	    Args:
	        batchsize: Number of rows in the input matrix.
	        dim_in: Number of input features of the linear layer.
	        dim_out: Number of output features of the linear layer.

	    Returns:
	        Elapsed wall-clock time of the forward pass, in seconds (float).
	    """
	    data = torch.FloatTensor(np.random.random_sample([batchsize, dim_in]))
	    # The Variable wrapper is kept from the original code; it is presumably
	    # there for pre-0.4 PyTorch, where modules only accept Variables.
	    data_in = Variable(data.cuda())
	    net = nn.Linear(dim_in, dim_out).cuda()

	    # Drain all pending GPU work (uploads, parameter init) so that none of it
	    # leaks into the timed region.
	    torch.cuda.synchronize()
	    # perf_counter() is monotonic and high-resolution; time.time() can jump
	    # with system clock adjustments and may be too coarse for short kernels.
	    start = time.perf_counter()
	    net(data_in)
	    # Wait for the forward kernels to actually finish before reading the clock.
	    torch.cuda.synchronize()
	    return time.perf_counter() - start



	# Peak throughput in TFLOP/s used to normalise the measured numbers
	# (presumably the P100 single-precision peak -- confirm for the target card).
	P100_PEAK_TFLOPS = 9.3


	def parse_cublas_gflops(output):
	    """Extract the GFlop/s figure from ``matrixMulCUBLAS`` output.

	    The number is taken from between the ``Performance=`` and ``GFlop/s``
	    markers, e.g. ``Performance= 1234.56 GFlop/s``. Whitespace around the
	    number is optional.

	    Args:
	        output: Captured stdout of the benchmark binary, as ``bytes`` or ``str``.

	    Returns:
	        The reported throughput in GFlop/s, as a float.

	    Raises:
	        ValueError: If either marker is missing or the text between them is
	            not a valid number.
	    """
	    if isinstance(output, bytes):
	        # Decode instead of str(bytes): the latter yields the "b'...'" repr.
	        output = output.decode("utf-8", errors="replace")
	    _, start_marker, tail = output.partition("Performance=")
	    value, end_marker, _ = tail.partition("GFlop/s")
	    if not (start_marker and end_marker):
	        raise ValueError(
	            "no 'Performance= <n> GFlop/s' entry in matrixMulCUBLAS output: %r"
	            % (output,)
	        )
	    return float(value)


	def main():
	    """Benchmark ``nn.Linear`` forward passes against a cuBLAS baseline.

	    For every (batchsize, dim_in, dim_out) configuration the forward pass is
	    timed ``rep`` times, the first ``warmup`` timings are discarded, and the
	    achieved FLOP/s is compared with the figure reported by the external
	    ``./matrixMulCUBLAS`` binary. Results are printed and saved to
	    ``p100_pt3.npy``.
	    """
	    # NOTE(review): (21740, 2174) is listed twice; the repeat is benchmarked
	    # again and simply overwrites the same result keys.
	    shapes = [(22764, 2276), (2276, 30), (21740, 2174), (2174, 1024),
	              (1024, 63), (21740, 2174), (2174, 1087), (1024, 1024)]
	    sizes = []
	    for batch in (32, 64, 128, 256):
	        for dim_in, dim_out in shapes:
	            sizes.append((batch, dim_in, dim_out))
	            sizes.append((batch, dim_out, dim_in))
	            sizes.append((dim_in, batch, dim_out))

	    rep = 200
	    warmup = 100
	    result = {}
	    for batchsize, dim_in, dim_out in sizes:
	        timings = [linearforward(batchsize, dim_in, dim_out) for _ in range(rep)]
	        # Only the post-warm-up runs contribute to the average.
	        avg = sum(timings[warmup:]) / (rep - warmup)
	        # One multiply and one add per (row, input, output) triple.
	        flops = batchsize * dim_in * dim_out * 2 / avg

	        # cublas baseline
	        cu_run = subprocess.run(
	            ['./matrixMulCUBLAS',
	             '--hA=' + str(batchsize),
	             '--wA=' + str(dim_in),
	             '--wB=' + str(dim_out)],
	            stdout=subprocess.PIPE,
	        )
	        cu_flops = parse_cublas_gflops(cu_run.stdout)

	        # Tuple layout: (PyTorch FLOP/s, PyTorch fraction of peak,
	        #                cuBLAS GFlop/s, cuBLAS fraction of peak).
	        result[(batchsize, dim_in, dim_out)] = (
	            flops,
	            flops / (P100_PEAK_TFLOPS * 10**12),
	            cu_flops,
	            cu_flops / 1000 / P100_PEAK_TFLOPS,
	        )
	    print(result)
	    # A dict is stored as a pickled object array; reload it with
	    # np.load('p100_pt3.npy', allow_pickle=True).item().
	    np.save('p100_pt3.npy', result)


	if __name__ == "__main__":
	    main()