Skip to content

Instantly share code, notes, and snippets.

@cloudaice
Created October 16, 2012 01:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cloudaice/3896844 to your computer and use it in GitHub Desktop.
Save cloudaice/3896844 to your computer and use it in GitHub Desktop.
一个和pagerank相关的算法Algorithm in 126 Lines
# PageRank algorithm
# By Peter Bengtsson
# http://www.peterbe.com/
# mail@peterbe.com
#
# Requires the numarray module
# http://www.stsci.edu/resources/software_hardware/numarray
from numarray import *
import numarray.linear_algebra as la
def _sum_sequence(seq):
""" sums up a sequence """
def _add(x,y): return x+y
return reduce(_add, seq, 0)
class PageRanker:
def __init__(self, p, webmatrix):
assert p>=0 and p <= 1
self.p = float(p)
if type(webmatrix)in [type([]), type(())]:
webmatrix = array(webmatrix, Float)
assert webmatrix.shape[0]==webmatrix.shape[1]
self.webmatrix = webmatrix
# create the deltamatrix
imatrix = identity(webmatrix.shape[0], Float)
for i in range(webmatrix.shape[0]):
imatrix[i] = imatrix[i]*sum(webmatrix[i,:])
deltamatrix = la.inverse(imatrix)
self.deltamatrix = deltamatrix
# create the fmatrix
self.fmatrix = ones(webmatrix.shape, Float)
self.sigma = webmatrix.shape[0]
# calculate the Stochastic matrix
_f_normalized = (self.sigma**-1)*self.fmatrix
_randmatrix = (1-p)*_f_normalized
_linkedmatrix = p * matrixmultiply(deltamatrix, webmatrix)
M = _randmatrix + _linkedmatrix
self.stochasticmatrix = M
self.invariantmeasure = ones((1, webmatrix.shape[0]), Float)
def improve_guess(self, times=1):
for i in range(times):
self._improve()
def _improve(self):
self.invariantmeasure = matrixmultiply(self.invariantmeasure, self.stochasticmatrix)
def get_invariant_measure(self):
return self.invariantmeasure
def getPageRank(self):
sum = _sum_sequence(self.invariantmeasure[0])
copy = self.invariantmeasure[0]
for i in range(len(copy)):
copy[i] = copy[i]/sum
return copy
if __name__=='__main__':
# Example usage
web = ((0, 1, 0, 0),
(0, 0, 1, 0),
(0, 0, 0, 1),
(1, 0, 0, 0))
pr = PageRanker(0.85, web)
pr.improve_guess(100)
print pr.getPageRank()
#!/usr/bin/env python
from numpy import *
def pageRankGenerator(
At = [array((), int32)],
numLinks = array((), int32),
ln = array((), int32),
alpha = 0.85,
convergence = 0.01,
checkSteps = 10
):
"""
Compute an approximate page rank vector of N pages to within some convergence factor.
@param At a sparse square matrix with N rows. At[ii] contains the indices of pages jj linking to ii.
@param numLinks iNumLinks[ii] is the number of links going out from ii.
@param ln contains the indices of pages without links
@param alpha a value between 0 and 1. Determines the relative importance of "stochastic" links.
@param convergence a relative convergence criterion. smaller means better, but more expensive.
@param checkSteps check for convergence after so many steps
"""
# the number of "pages"
N = len(At)
# the number of "pages without links"
M = ln.shape[0]
# initialize: single-precision should be good enough
iNew = ones((N,), float32) / N
iOld = ones((N,), float32) / N
done = False
while not done:
# normalize every now and then for numerical stability
iNew /= sum(iNew)
for step in range(checkSteps):
# swap arrays
iOld, iNew = iNew, iOld
# an element in the 1 x I vector.
# all elements are identical.
oneIv = (1 - alpha) * sum(iOld) / N
# an element of the A x I vector.
# all elements are identical.
oneAv = 0.0
if M > 0:
oneAv = alpha * sum(iOld.take(ln, axis = 0)) * M / N
# the elements of the H x I multiplication
ii = 0
while ii < N:
page = At[ii]
h = 0
if page.shape[0]:
h = alpha * dot(
iOld.take(page, axis = 0),
1. / numLinks.take(page, axis = 0)
)
iNew[ii] = h + oneAv + oneIv
ii += 1
diff = iNew - iOld
done = (sqrt(dot(diff, diff)) / N < convergence)
yield iNew
def transposeLinkMatrix(
outGoingLinks = [[]]
):
"""
Transpose the link matrix. The link matrix contains the pages each page points to.
But what we want is to know which pages point to a given page, while retaining information
about how many links each page contains (so store that in a separate array),
as well as which pages contain no links at all (leaf nodes).
@param outGoingLinks outGoingLinks[ii] contains the indices of pages pointed to by page ii
@return a tuple of (incomingLinks, numOutGoingLinks, leafNodes)
"""
nPages = len(outGoingLinks)
# incomingLinks[ii] will contain the indices jj of the pages linking to page ii
incomingLinks = [[] for ii in range(nPages)]
# the number of links in each page
numLinks = zeros(nPages, int32)
# the indices of the leaf nodes
leafNodes = []
for ii in range(nPages):
if len(outGoingLinks[ii]) == 0:
leafNodes.append(ii)
else:
numLinks[ii] = len(outGoingLinks[ii])
# transpose the link matrix
for jj in outGoingLinks[ii]:
incomingLinks[jj].append(ii)
incomingLinks = [array(ii) for ii in incomingLinks]
numLinks = array(numLinks)
leafNodes = array(leafNodes)
return incomingLinks, numLinks, leafNodes
def pageRank(
linkMatrix = [[]],
alpha = 0.85,
convergence = 0.01,
checkSteps = 10
):
"""
Convenience wrap for the link matrix transpose and the generator.
@see pageRankGenerator for parameter description
"""
incomingLinks, numLinks, leafNodes = transposeLinkMatrix(linkMatrix)
for gr in pageRankGenerator(incomingLinks, numLinks, leafNodes, alpha = alpha, convergence = convergence, checkSteps = checkSteps):
final = gr
return final
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment