LRU with self-attention
Gist by @harpone, created March 23, 2023
import math

import torch


def forward_sequential(h, xs, U, W, nu, theta):
    """Forward pass through the network sequentially over an input `xs` of any length.

    NOTE: has no batch dimension; to be batched with `vmap`.

    Args:
        h (torch.Tensor): shape [D_h]; initial state
        xs (torch.Tensor): shape [T, D_x]; input sequence
        U (torch.Tensor): parameter matrix of shape [D_h, D_x]
        W (torch.Tensor): parameter matrix of shape [D_h, D_x]
        nu (torch.Tensor): parameter vector of shape [D_h]
        theta (torch.Tensor): parameter vector of shape [D_h]

    Returns:
        hs (torch.Tensor): shape [T, D_h]; output sequence
    """
    T = xs.shape[0]
    D_h = h.shape[0]
    # The recurrence is complex-valued, so the output buffer must be complex as well.
    hs = torch.zeros(T, D_h, dtype=torch.cfloat, device=xs.device)
    for t in range(T):
        # h_t = exp(U x_t - nu - i * theta) * h_{t-1} + W x_t
        h = torch.exp(U @ xs[t] - nu - theta * 1j) * h + W @ xs[t]
        hs[t] = h
    return hs.real
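

# Minimal sanity check of a single recurrence step (an added sketch, not part of the
# original gist; the underscore-prefixed names are only for this check). With T = 1 the
# output should equal the real part of exp(U x_0 - nu - i * theta) * h + W x_0.
_h = torch.randn(4)
_x = torch.randn(1, 3)
_U, _W = torch.randn(4, 3), torch.randn(4, 3)
_nu, _theta = torch.rand(4), torch.rand(4)
_step = (torch.exp(_U @ _x[0] - _nu - _theta * 1j) * _h + _W @ _x[0]).real
assert torch.allclose(forward_sequential(_h, _x, _U, _W, _nu, _theta)[0], _step)
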
def forward_parallel(h, xs, U, W, nu, theta):
    """Forward pass through the network in parallel over an input `xs` of any length,
    using the exact solution of the recurrence relation.

    NOTE: has no batch dimension; to be batched with `vmap`.

    Args:
        h (torch.Tensor): shape [D_h]; initial state
        xs (torch.Tensor): shape [T, D_x]; input sequence
        U (torch.Tensor): parameter matrix of shape [D_h, D_x]
        W (torch.Tensor): parameter matrix of shape [D_h, D_x]
        nu (torch.Tensor): parameter vector of shape [D_h]
        theta (torch.Tensor): parameter vector of shape [D_h]

    Returns:
        hs (torch.Tensor): shape [T, D_h]; output sequence
    """
    gammas = torch.cumsum(torch.matmul(xs, U.T) - nu - theta * 1j, dim=0)  # [T, D_h]
    betas = torch.matmul(xs, W.T)  # [T, D_h]
    source = torch.cumsum(torch.exp(-gammas) * betas, dim=0)  # [T, D_h]
    hs = torch.exp(gammas) * (h[None] + source)
    return hs.real
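

# Why the parallel form works (a sketch of the reasoning, not part of the original gist):
# with lambda_t = exp(U x_t - nu - i * theta) and gamma_t = sum_{s <= t} (U x_s - nu - i * theta),
# unrolling h_t = lambda_t * h_{t-1} + W x_t gives
#     h_t = exp(gamma_t) * (h_0 + sum_{s <= t} exp(-gamma_s) * W x_s),
# which is exactly the pair of cumulative sums computed in `forward_parallel`.
#
# Quick equivalence check between the two implementations (an added sketch; the small
# scales and short length are chosen so exp(cumsum(...)) stays well within float32
# range, otherwise the comparison is dominated by overflow).
_h = torch.randn(8)
_xs = torch.randn(16, 4)
_U, _W = 0.1 * torch.randn(8, 4), torch.randn(8, 4)
_nu = torch.linspace(0.001, 0.5, 8)
_theta = torch.linspace(0.0, 2.0, 8)
_hs_seq = forward_sequential(_h, _xs, _U, _W, _nu, _theta)
_hs_par = forward_parallel(_h, _xs, _U, _W, _nu, _theta)
print(torch.allclose(_hs_seq, _hs_par, atol=1e-3))  # should print True (float32 round-off)
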
#### Benchmark code:
device = torch.device("cuda")

D_h = 256
D_x = 64
U = torch.randn(D_h, D_x, device=device)
W = torch.randn(D_h, D_x, device=device)
nu = torch.linspace(0.001, 0.5, D_h, device=device)
theta = torch.linspace(0, 2 * math.pi * (D_h - 1) / D_h, D_h, device=device)

T = 1024
xs = torch.randn(T, D_x, device=device)
h = torch.randn(D_h, device=device)


def sequential_timer():
    hs_seq = forward_sequential(h, xs, U, W, nu, theta)
    torch.cuda.synchronize()


def parallel_timer():
    hs_par = forward_parallel(h, xs, U, W, nu, theta)
    torch.cuda.synchronize()
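

# How this might be batched and timed (an added sketch; the use of torch.func.vmap and
# torch.utils.benchmark here is an assumption on top of the original gist).
# `forward_parallel` is purely functional, so it maps cleanly over a leading batch
# dimension; the in-place write in `forward_sequential` makes it less vmap-friendly.
from torch.func import vmap
from torch.utils import benchmark

batched_parallel = vmap(forward_parallel, in_dims=(0, 0, None, None, None, None))
hs_batch = batched_parallel(h.expand(32, -1), xs.expand(32, -1, -1), U, W, nu, theta)  # [32, T, D_h]

t_seq = benchmark.Timer(stmt="sequential_timer()", globals={"sequential_timer": sequential_timer})
t_par = benchmark.Timer(stmt="parallel_timer()", globals={"parallel_timer": parallel_timer})
print(t_seq.timeit(10))
print(t_par.timeit(10))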