import math
from collections import deque

import numpy as np

class PageVertex:
    """
    Representation of a single web page and its linked pages.
    """

    def __init__(self, id):
        """Initialize attributes of a new PageVertex instance.

        Parameters:
        id(str): unique identifier of the instance, e.g. 'A'

        Returns: None
        """
        self.page_id = id
        # map each linked page_id --> PageVertex obj
        self.neighbors = dict()
        # each outlink from this page carries the same weight
        self.link_weight = 0

    def __set_link_weight(self, num_links=None):
        """Set the weight this instance passes along each of its
        outlinks to be the inverse of its number of neighbors.

        Parameters:
        num_links(int): the number of outlinks of this instance

        Returns: None
        """
        if num_links is None:
            num_links = len(self.neighbors)
        self.link_weight = (1 / num_links)
        return None

    def add_link(self, page):
        """Link another PageVertex from this instance.

        Parameters:
        page(PageVertex): the PageVertex object the outlink leads to

        Returns: None
        """
        # add the neighbor first, then recalculate the link weight,
        # so adding the same page twice cannot skew the count
        self.neighbors[page.get_id()] = page
        self.__set_link_weight()
        return None

    def __str__(self):
        '''Output the PageVertex and its linked neighbors.'''
        neighbor_ids = list(self.neighbors.keys())
        return f'{self.page_id} adjacent to {neighbor_ids}'

    def __repr__(self):
        '''Output the list of neighbors of this PageVertex.'''
        return self.__str__()

    def get_neighbors(self):
        """Return the PageVertex instances that are linked by
        this instance.
        """
        return list(self.neighbors.values())

    def get_neighbors_with_weights(self):
        """Return the PageVertex instances that are linked by
        this instance, along with the weight of each link.
        """
        neighbors = self.get_neighbors()
        # pair each neighbor with the weight of the outlink leading
        # to it - the edge self -> n carries *this* page's link weight
        neighbor_weights = list()
        for n in neighbors:
            pair = (n, self.link_weight)
            neighbor_weights.append(pair)
        return neighbor_weights

    def get_id(self):
        '''Return the id of this vertex.'''
        return self.page_id

    def has_neighbor(self, page_id):
        '''Return True or False based on whether this page links to the other.'''
        return page_id in self.neighbors
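
# Note on link weights: a PageVertex splits its endorsement evenly across
# its outlinks, so a page with a single outlink passes along a weight of
# 1.0, a page with two outlinks passes 0.5 along each, and so on.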

class InternetGraph:
    """
    Represents a composition of all PageVertex instances
    in the network. Is a directed, weighted, and
    not necessarily connected graph.
    """

    def __init__(self):
        '''Construct a new InternetGraph instance.'''
        # map each page_id --> PageVertex obj
        self.pages = dict()

    def add_page_by_id(self, page_id):
        """Instantiate a new PageVertex, then add it to
        the InternetGraph.

        Parameters:
        page_id(str): the id of the new PageVertex to be added

        Returns: None
        """
        self.pages[page_id] = PageVertex(page_id)
        return None

    def add_page_by_obj(self, page):
        """Add a PageVertex instance to the dictionary
        of all PageVertex instances.

        Parameters:
        page(PageVertex): the PageVertex to be added

        Returns: None
        """
        self.pages[page.get_id()] = page
        return None

    def get_pages(self):
        '''Return a list of all PageVertex instances.'''
        return list(self.pages.values())

    def contains_page(self, page_id):
        """Return True if a PageVertex exists with 'page_id'.

        Parameters:
        page_id(str): the id of the PageVertex being queried

        Returns: bool: True only if the id is found in self.pages
        """
        return page_id in self.pages

    def get_page(self, page_id):
        """Return the PageVertex with a matching page_id.

        Parameters:
        page_id(str): the id of the PageVertex being queried

        Returns: PageVertex instance, or else raises KeyError.
        """
        # raise error, or return object
        if page_id not in self.pages:
            raise KeyError(f'PageVertex {page_id} not found.')
        return self.pages[page_id]

    def link_pages(self, page1_id, page2_id):
        """Add a link from PageVertex 1 to PageVertex 2.

        Parameters:
        page1_id(str): the id of the PageVertex where the link originates
        page2_id(str): the id of the PageVertex where the link ends

        Returns: None
        """
        page1_obj, page2_obj = (
            self.get_page(page1_id),
            self.get_page(page2_id)
        )
        page1_obj.add_link(page2_obj)
        return None

    def __str__(self):
        '''Return the PageVertices in this instance.'''
        return f'InternetGraph with PageVertices: {self.get_pages()}'
"""What's the PageRank rating of each page?"""
def compute_inlink_values(self):
"""Return a dict of the total endorsement given
to each PageVertex.
Parameters: None
Returns:
list: rankings is an array of the eigenvalues computed from
an square matrix of each PageVertex's endorsements
Complexity Analysis:
The runtime of this implementation scales quadratically
with the size of P, the number of PageVertexs in the InternetGraph.
It also scales with the size of L, the number of links
in the InternetGraph, because that is the sum total of addition
operations we will use to calculate the total endorsement
received by each PageVertex.
This runtime can be expressed in Big O as O(P^2 + L)
"""
# compute how much endorsement each PageVertex got
all_page_ids = list(self.pages.keys()) # O(P)
inlinks = list()
for page_id1 in all_page_ids: # P^2 iterations
endorsements = list()
for page_id2 in all_page_ids:
# get the two Page objects
pag1, page2 = (
self.get_page(page_id1),
self.get_page(page_id2)
)
# use outlinks to see endorsements from others
if page2.has_neighbor(page_id1) is True:
endorsement_value = page2.link_weight
# otherwise no endorsement was given
else:
endorsement_value = 0
# add the single endorsement
endorsements.append(endorsement_value)
# add the PageVertex's total endorsement vector
inlinks.append(endorsements)
# Rank the sites by finding the "eigenvalues" after 50 iterations
rankings = np.array([1/len(self.pages) for page in self.pages]) # O(P)
for i in range(50):
rankings = np.dot(inlinks, rankings) # O(P^2)
return rankings
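
    # Worked example (using the demo graph built in __main__ below, where
    # A->B; B->C,D; C->A,D; D->A,B): row i, column j of `inlinks` holds
    # the endorsement page i receives from page j, so the matrix is:
    #
    #         A    B    C    D
    #   A  [ 0.0, 0.0, 0.5, 0.5 ]
    #   B  [ 1.0, 0.0, 0.0, 0.5 ]
    #   C  [ 0.0, 0.5, 0.0, 0.0 ]
    #   D  [ 0.0, 0.5, 0.5, 0.0 ]
    #
    # Each column sums to 1, since every page splits its endorsement
    # evenly across its outlinks; that is what lets the repeated np.dot
    # calls above converge to a stable ranking vector.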
    def bucket_ranked_pages(self, page_eigs):
        """Group each PageVertex under its PageRank rating.

        Parameters:
        page_eigs(dict): hash map between page ids and
                         their eigenvector values

        Returns:
        dict: each PR rating is mapped to a list
              of the page ids which have that rating

        Complexity Analysis:
        The runtime of this method scales linearithmically with
        the size of P, the number of PageVertices. Therefore it
        is expressed in Big O as O(P log P).
        """
        # store the number of pages to rank
        num_rankings = len(page_eigs)  # O(P)
        # store possible ratings we can give out (scale 1-10)
        num_possible_ranks = 10
        # compute the length of each rating "bucket"
        bucket_len = math.ceil(num_rankings / num_possible_ranks)  # O(1)
        # sort the pages by the greatest eigenvector values
        sorted_pages = list(page_eigs.items())
        sorted_pages.sort(reverse=True, key=lambda p_eig: p_eig[1])  # O(P log P)
        # map each rating to a list of the appropriate pages
        rating_pages = dict()
        for rating in range(1, num_possible_ranks + 1):  # 10 iterations
            # store a list of the page ids
            pages = list()
            # compute indices of the pages we want
            start = 0 + (bucket_len * (rating - 1))
            end = start + bucket_len
            # add the pages in that bucket
            for page, eig in sorted_pages[start:end]:  # O(P / 10) in total
                pages.append(page)
            # map to the rating - if we haven't already added all pages
            if len(pages) > 0:
                rating_pages[rating] = pages
        return rating_pages
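
    # For instance: with 4 pages and 10 possible ratings, bucket_len is
    # ceil(4 / 10) = 1, so each page lands in its own bucket - the
    # highest-ranked page gets rating 1, the next gets rating 2, and so
    # on, leaving ratings 5-10 empty (and thus omitted from the dict).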
    def rank_pages(self):
        """
        Return the PageRank rating for each page.

        Parameters: None

        Returns:
        dict: maps each rating (int, from 1-10, 1 being highest) to a
              list of ids (str) of the PageVertex instances holding that
              rating, in order of highest rating first, i.e. all
              PageRanks of 1 come first, then the 2's, and so on

        Complexity Analysis:
        The runtime of this method grows in relation to P and L
        asymptotically, which represent the number of PageVertex
        instances and the total number of links in the InternetGraph
        respectively. The Big O notation for the combined runtime of
        the helper methods would be O(P^2 + L).
        """
        # compute the ranking vector over all pages
        rankings_vector = self.compute_inlink_values()
        # map all pages to their eigenvector values
        page_eigs = dict(zip(list(self.pages), rankings_vector))
        # convert to a dict of PageRank ratings
        rating_pages = self.bucket_ranked_pages(page_eigs)  # O(P log P)
        return rating_pages
"""What pages can I reach N links away from this page?"""
def find_pages_n_away(self, start_id, link_distance):
"""
Find and return all pages n links away.
In this implementation, if a page has multiple
paths to it from the start that differ in distance,
then the shortest distance is used to determine if
should be returned or not. If that shortest distance
is less than the link_distance argument, the PageVertex
will not be returned at all.
For example:
Suppose start_id = A, and link_distance = 2,
and PageVertex A can reached PageVertex C
using either 1 link or 2 links - then PageVertex C will not
be returned, as the function considers C to only to be 1 link
away from A.
Parmeters:
start_id (str): The id of the start PageVertex.
link_distance (int): The distance from the
start vertex we want
Returns:
List<str>: All PageVertex ids that are 'link_distance' away
Complexity Analysis:
This function attempts to process each PageVertex, by way of
each link in the InternetGraph. It scales in linear proportion to
P and L as they grow asymptotically larger, therefore
the Big O notation of this function is O(P + L).
"""
# check to make sure we have a valid start_id
if not self.contains_page(start_id):
raise KeyError(f"PageVertex {start_id}.")
# Store the starting page in a variable
start_page_obj = self.get_page(start_id)
# Keep a count of steps taken from start so far
steps = -1
# Keep a dict of PageVertex ids, mapped to their distance from start
page_distances = dict()
# queue of vertices to visit next
queue = deque()
queue.append(start_page_obj)
# Perform a BFS
while steps < link_distance:
# init a list of neighbors to process next
neighbors = list()
# Dequeue all the pages in the queue
while len(queue) > 0:
current_page_obj = queue.popleft()
current_page_id = current_page_obj.get_id()
# add the current PageVertex to the dict
if current_page_id not in page_distances:
page_distances[current_page_id] = steps + 1
# Keep track of page to process on next iteration
neighbors.extend(current_page_obj.get_neighbors())
# enqueue the vertices to visit on the next iteration
for neighbor in neighbors:
queue.append(neighbor)
# Increment the steps taken so far
steps += 1
# Return the ids of pages, target distance away
pages_n_away = list()
for page_id in page_distances:
if page_distances[page_id] == link_distance:
pages_n_away.append(page_id)
return pages_n_away
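
    # Tracing the demo graph in __main__: find_pages_n_away('B', 2)
    # records B at distance 0, then C and D at distance 1, then A at
    # distance 2 (reached via both C and D), so only ['A'] is returned.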
"""What's the Shortest Weighted Path Between 2 PageVertexs (by links)?"""
def find_shortest_path(self, start_id, target_id):
"""
Use Dijkstra's Algorithm to return the total weight
of the shortest path from a start page
to a destination.
Parameters:
start_id(str): the id of the PageVertex where the path begins
target_id(str): the id of the PageVertex where the path ends
Returns:
float: the total weight of the edges along the shortest path
from the starting to target PageVertex
Complexity Analysis:
The runtime of this method asymptotically increases quadratically
with respect to P, the number of PageVertexs. The longest step is
finding the minimum PageVertex to add, since an ordinary array is
currently used to simulate using a binary min heap.
Overall, the runtime of this method is O(P^2).
"""
# Check that both start and target PageVertexs valid
if self.contains_page(start_id) is False: # O(P)
raise KeyError(f'{start_id} not found in InternetGraph!')
elif self.contains_page(target_id) is False: # O(P)
raise KeyError(f'{target_id} not found in InternetGraph!')
# A: initialize all page distances to INFINITY away
page_weight = dict() # O(1)
for page_obj in self.pages.values(): # P iterations
page_weight[page_obj] = float('inf')
# B: Calculate Shortest Paths from Start PageVertex
start_page = self.pages[start_id] # O(1)
page_weight[start_page] = 0
while len(list(page_weight.items())) > 0: # P iterations
# Get the minimum-distance remaining PageVertex
min_distance = min(list(page_weight.values())) # O(P)
min_page = None
# find the minumum-weighted PageVertex
for page in page_weight: # P iterations
if page_weight[page] == min_distance:
min_page = page
# If target found, return its distance
if min_page.page_id == target_id:
return page_weight[min_page]
# B: List the PageVertex's neighbors
neighbor_weights = min_page.get_neighbors_with_weights() # O(L)
# C: Update the PageVertex's neighbors
for neighbor, weight in neighbor_weights:
if neighbor in page_weight:
current_distance = page_weight[neighbor]
# Update ONLY to reduce the weight of the distance
new_dist = weight + page_weight[min_page]
if new_dist < current_distance:
page_weight[neighbor] = new_dist
# remove the min weighted page from the dict
del page_weight[min_page]
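
    # Tracing the demo graph in __main__: find_shortest_path('B', 'D')
    # processes B first (weight 0), relaxes C and D to 0.5 each (B splits
    # its weight across 2 outlinks), and returns 0.5 once D comes up as
    # the minimum-weighted remaining page.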


if __name__ == "__main__":
    # Runner Script
    # A: instantiate the PageVertex instances and the InternetGraph
    pageA = PageVertex('A')
    pageB = PageVertex('B')
    pageC = PageVertex('C')
    pageD = PageVertex('D')
    pageA.add_link(pageB)
    pageB.add_link(pageC)
    pageB.add_link(pageD)
    pageC.add_link(pageA)
    pageC.add_link(pageD)
    pageD.add_link(pageA)
    pageD.add_link(pageB)
    internet = InternetGraph()
    internet.add_page_by_obj(pageA)
    internet.add_page_by_obj(pageB)
    internet.add_page_by_obj(pageC)
    internet.add_page_by_obj(pageD)
    # B: test PageRank
    rankings = internet.rank_pages()
    print(f'Final rankings: {rankings}')
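    # with this graph the power iteration should settle near B ~ 0.35,
    # D ~ 0.26, A ~ 0.22, C ~ 0.17, so the printed ratings come out to
    # {1: ['B'], 2: ['D'], 3: ['A'], 4: ['C']} - one page per bucket,
    # since ceil(4 / 10) = 1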
    # C: see which PageVertex instances can be reached from 'B'
    neighbors = internet.find_pages_n_away('B', 2)
    print('Shortest distance neighbors 2 links away ' +
          f'from B: {neighbors}')
    # D: find the length of the shortest path
    b_to_d = internet.find_shortest_path('B', 'D')
    print(f'Minimum weight of path from B to D: {b_to_d}')
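    # E: the graph can also be grown by id, via add_page_by_id and
    # link_pages; the page ids 'E' and 'F' below are arbitrary examples,
    # not part of the original demo
    internet.add_page_by_id('E')
    internet.add_page_by_id('F')
    internet.link_pages('E', 'F')
    print(internet.get_page('E'))  # E adjacent to ['F']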