public
Created

  • Download Gist
pagerank.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
from numpy import *
 
def transposeLinkMatrix(
outGoingLinks = [[]]
):
"""
Transpose the link matrix. The link matrix contains the pages each page points to.
However, what we want is to know which pages point to a given page. However, we
will still want to know how many links each page contains (so store that in a separate array),
as well as which pages contain no links at all (leaf nodes).
 
@param outGoingLinks outGoingLinks[ii] contains the indices of pages pointed to by page ii
@return a tuple of (incomingLinks, numOutGoingLinks, leafNodes)
"""
 
nPages = len(outGoingLinks)
# incomingLinks[ii] will contain the indices jj of the pages linking to page ii
incomingLinks = [[] for ii in range(nPages)]
# the number of links in each page
numLinks = zeros(nPages, int32)
# the indices of the leaf nodes
leafNodes = []
 
for ii in range(nPages):
if len(outGoingLinks[ii]) == 0:
leafNodes.append(ii)
else:
numLinks[ii] = len(outGoingLinks[ii])
# transpose the link matrix
for jj in outGoingLinks[ii]:
incomingLinks[jj].append(ii)
incomingLinks = [array(ii) for ii in incomingLinks]
numLinks = array(numLinks)
leafNodes = array(leafNodes)
 
 
def pageRankGenerator(
At = [array((), int32)],
numLinks = array((), int32),
ln = array((), int32),
alpha = 0.85,
convergence = 0.01,
checkSteps = 10
):
"""
Compute an approximate page rank vector of N pages to within some convergence factor.
@param At a sparse square matrix with N rows. At[ii] contains the indices of pages jj linking to ii.
@param numLinks iNumLinks[ii] is the number of links going out from ii.
@param ln contains the indices of pages without links
@param alpha a value between 0 and 1. Determines the relative importance of "stochastic" links.
@param convergence a relative convergence criterion. smaller means better, but more expensive.
@param checkSteps check for convergence after so many steps
"""
 
# the number of "pages"
N = len(At)
 
# the number of "pages without links"
M = ln.shape[0]
 
# initialize: single-precision should be good enough
iNew = ones((N,), float32) / N
iOld = ones((N,), float32) / N
 
done = False
while not done:
 
# normalize every now and then for numerical stability
iNew /= sum(iNew)
 
for step in range(checkSteps):
 
# swap arrays
iOld, iNew = iNew, iOld
 
# an element in the 1 x I vector.
# all elements are identical.
oneIv = (1 - alpha) * sum(iOld) / N
 
# an element of the A x I vector.
# all elements are identical.
oneAv = 0.0
if M > 0:
oneAv = alpha * sum(iOld.take(ln, axis = 0)) / N
 
# the elements of the H x I multiplication
ii = 0
while ii < N:
page = At[ii]
h = 0
if page.shape[0]:
h = alpha * dot(
iOld.take(page, axis = 0),
1. / numLinks.take(page, axis = 0)
)
iNew[ii] = h + oneAv + oneIv
ii += 1
 
diff = iNew - iOld
done = (sqrt(dot(diff, diff)) / N < convergence)
 
yield iNew
 
def pageRank(
linkMatrix = [[]],
alpha = 0.85,
convergence = 0.01,
checkSteps = 10
):
"""
Convenience wrap for the link matrix transpose and the generator.
"""
incomingLinks, numLinks, leafNodes = transposeLinkMatrix(linkMatrix)
 
for gr in pageRankGenerator(incomingLinks, numLinks, leafNodes,
alpha = alpha,
convergence = convergence,
checkSteps = checkSteps):
final = gr
 
return final
usage_example.py
Python
1 2 3 4 5 6 7 8 9 10 11
#!/usr/bin/env python
 
from pageRank import pageRank
 
# a graph with one link from page one to page two
links = [
[1],
[]
]
 
print pageRank(links, alpha = 1.0)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.