@mkolod
Last active October 21, 2020 21:03
# NOTE: The network here is not meant to make any sense. It's just for measuring perf impact.
import torch
import torch.nn.functional as F
from time import time


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # A stack of 21 linear layers, purely to generate enough GPU work to measure.
        fcs = [torch.nn.Linear(10, 100)] + [torch.nn.Linear(100, 100) for _ in range(20)]
        self.fcs = torch.nn.Sequential(*fcs)

    def forward(self, x):
        return self.fcs(x)


class Combine(torch.nn.Module):
    def __init__(self, fork=False):
        super(Combine, self).__init__()
        self.fork = fork
        self.branch0 = Net()
        self.branch1 = Net()

    @torch.jit.export
    def forward_forked(self, x):
        # Launch both branches asynchronously and block on both futures.
        # Parallel execution only kicks in once the module is compiled
        # with torch.jit.script.
        fut0 = torch.jit.fork(self.branch0, x)
        fut1 = torch.jit.fork(self.branch1, x)
        return torch.jit.wait(fut0) + torch.jit.wait(fut1)

    @torch.jit.export
    def forward_reg(self, x):
        # Sequential baseline: run one branch after the other.
        return self.branch0(x) + self.branch1(x)

    def forward(self, x):
        return self.forward_forked(x) if self.fork else self.forward_reg(x)


if __name__ == '__main__':
    combine_reg = torch.jit.script(Combine(fork=False).cuda().eval())
    combine_forked = torch.jit.script(Combine(fork=True).cuda().eval())
    x = torch.randn(10, 10).cuda()
    # Warm up both variants so JIT compilation and CUDA setup
    # don't pollute the measurement.
    for _ in range(50):
        res0 = combine_reg(x)
        res1 = combine_forked(x)
    torch.cuda.synchronize()
    start_reg = time()
    res0 = combine_reg(x)
    torch.cuda.synchronize()
    end_reg = time()
    res1 = combine_forked(x)
    torch.cuda.synchronize()
    end_forked = time()
    print(f"exec time of non-forked: {(end_reg - start_reg) * 1000:.3f} ms")
    print(f"exec time of forked: {(end_forked - end_reg) * 1000:.3f} ms")
mkolod commented Oct 12, 2020

Tested on:

  • Intel(R) Xeon(R) Gold 6136 CPU @ 3.00GHz
  • NVIDIA Titan RTX, clocks locked at 1,620 MHz
  • PyTorch 1.6

Result:

python torch_fork.py
exec time of non-forked: 2.935 ms
exec time of forked: 2.004 ms
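
Host-side time() around a single iteration is sensitive to launch overhead and scheduler jitter; CUDA events are one way to cross-check the numbers. A minimal sketch reusing x and the two scripted modules from the script above (time_cuda and iters are illustrative names, not from the gist):

def time_cuda(fn, x, iters=100):
    # Record events on the current stream and average over many iterations.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn(x)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # elapsed_time returns milliseconds

print(f"non-forked: {time_cuda(combine_reg, x):.3f} ms")
print(f"forked:     {time_cuda(combine_forked, x):.3f} ms")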

mkolod commented Oct 12, 2020

Before (left) and after forking (right) in nvvp

[screenshot: nvvp timeline comparison, 2020-10-12]
