BarclayII/ppr.py

## ppr.py
import scipy.sparse as ssp
import numpy as np
from sklearn.preprocessing import normalize
from numba.typed import Dict
from numba import jit, types, njit, prange


@njit(parallel=True, nogil=True)
def reverse_push_bipartite(A_indptr, A_indices, A_data, nL, nR, r_max, alpha):
    '''
    Compute significant PPRs based on 2-hop random walks with restart on the
    right side.

    A_indptr: CSC indptr
    A_indices: CSC indices
    A_data: CSC data
    nL: number of nodes on left side
    nR: number of nodes on right side
    r_max: residual, a small positive real number
    alpha: restart probability

    Return:
    List[Dict[int, float]]
        For nL <= v < nL + nR, if PPR(u, v) > r_max, then it would show up as

            PPR(u, v) = result[v - nL][u]

        with an error of o(r_max)
    '''
    result = [Dict.empty(key_type=types.int64, value_type=types.float64)
              for _ in range(nL, nL + nR)]

    for t in prange(nL, nL + nR):
        r = Dict.empty(key_type=types.int64, value_type=types.float64)
        p = Dict.empty(key_type=types.int64, value_type=types.float64)
        r[t] = 1.
        while True:
            for v, r_v in r.items():
                if r_v > r_max:
                    break
            else:
                break
            A_v_indices = A_indices[A_indptr[v]:A_indptr[v+1]]
            A_v_data = A_data[A_indptr[v]:A_indptr[v+1]]
            a = alpha if v >= nL else 0
            for i, d in zip(A_v_indices, A_v_data):
                r[i] = r.get(i, 0) + (1 - a) * d * r_v
            if a > 0:
                p[v] = p.get(v, 0) + a * r_v
            del r[v]

        result[t - nL] = p
        print(t)

    return result

if __name__ == '__main__':
    # 0, 1, 2, 3 are on the left
    # 4, 5, 6, 7 are on the right
    # We want to compute all PPR(u, v) > 0.01 with restart probability 0.2,
    # based on 2-hop random walks on the right side.
    A = ssp.coo_matrix(
            (np.ones(9),
              # source nodes                destination nodes
             ([0, 0, 1, 2, 3, 4, 5, 6, 7], [5, 6, 6, 7, 6, 0, 1, 2, 3])))
    A = normalize(A, norm='l1', axis=1).tocsc()
    p = reverse_push_bipartite(
            A.indptr, A.indices, A.data, 4, 4, 0.01, 0.2)
    print('Converting to dict')
    p = [dict(d) for d in p]
    for i, d in enumerate(p):
        for u, v in d.items():
            print('PPR(%d, %d) = %f' % (u, 4 + i, v))
# Result:
# PPR(4, 4) = 0.200000
# PPR(5, 5) = 0.200000
# PPR(4, 5) = 0.080000
# PPR(6, 6) = 0.551456
# PPR(4, 6) = 0.390993
# PPR(5, 6) = 0.439320
# PPR(7, 6) = 0.439320
# PPR(7, 7) = 0.551456
# PPR(6, 7) = 0.439320
# PPR(4, 7) = 0.310993
# PPR(5, 7) = 0.351456
	import scipy.sparse as ssp
	import numpy as np
	from sklearn.preprocessing import normalize
	from numba.typed import Dict
	from numba import jit, types, njit, prange


	@njit(parallel=True, nogil=True)
	def reverse_push_bipartite(A_indptr, A_indices, A_data, nL, nR, r_max, alpha):
	'''
	Compute significant PPRs based on 2-hop random walks with restart on the
	right side.

	A_indptr: CSC indptr
	A_indices: CSC indices
	A_data: CSC data
	nL: number of nodes on left side
	nR: number of nodes on right side
	r_max: residual, a small positive real number
	alpha: restart probability

	Return:
	List[Dict[int, float]]
	For nL <= v < nL + nR, if PPR(u, v) > r_max, then it would show up as

	PPR(u, v) = result[v - nL][u]

	with an error of o(r_max)
	'''
	result = [Dict.empty(key_type=types.int64, value_type=types.float64)
	for _ in range(nL, nL + nR)]

	for t in prange(nL, nL + nR):
	r = Dict.empty(key_type=types.int64, value_type=types.float64)
	p = Dict.empty(key_type=types.int64, value_type=types.float64)
	r[t] = 1.
	while True:
	for v, r_v in r.items():
	if r_v > r_max:
	break
	else:
	break
	A_v_indices = A_indices[A_indptr[v]:A_indptr[v+1]]
	A_v_data = A_data[A_indptr[v]:A_indptr[v+1]]
	a = alpha if v >= nL else 0
	for i, d in zip(A_v_indices, A_v_data):
	r[i] = r.get(i, 0) + (1 - a) * d * r_v
	if a > 0:
	p[v] = p.get(v, 0) + a * r_v
	del r[v]

	result[t - nL] = p
	print(t)

	return result

	if __name__ == '__main__':
	# 0, 1, 2, 3 are on the left
	# 4, 5, 6, 7 are on the right
	# We want to compute all PPR(u, v) > 0.01 with restart probability 0.2,
	# based on 2-hop random walks on the right side.
	A = ssp.coo_matrix(
	(np.ones(9),
	# source nodes destination nodes
	([0, 0, 1, 2, 3, 4, 5, 6, 7], [5, 6, 6, 7, 6, 0, 1, 2, 3])))
	A = normalize(A, norm='l1', axis=1).tocsc()
	p = reverse_push_bipartite(
	A.indptr, A.indices, A.data, 4, 4, 0.01, 0.2)
	print('Converting to dict')
	p = [dict(d) for d in p]
	for i, d in enumerate(p):
	for u, v in d.items():
	print('PPR(%d, %d) = %f' % (u, 4 + i, v))
	# Result:
	# PPR(4, 4) = 0.200000
	# PPR(5, 5) = 0.200000
	# PPR(4, 5) = 0.080000
	# PPR(6, 6) = 0.551456
	# PPR(4, 6) = 0.390993
	# PPR(5, 6) = 0.439320
	# PPR(7, 6) = 0.439320
	# PPR(7, 7) = 0.551456
	# PPR(6, 7) = 0.439320
	# PPR(4, 7) = 0.310993
	# PPR(5, 7) = 0.351456