Skip to content

Instantly share code, notes, and snippets.

@lelambonzo
Created December 29, 2023 12:11
Show Gist options
  • Save lelambonzo/11db1d9a2a9b95c10f163fa2d96e467c to your computer and use it in GitHub Desktop.
Save lelambonzo/11db1d9a2a9b95c10f163fa2d96e467c to your computer and use it in GitHub Desktop.
PageRank algorithm, an effective method for assessing the significance of web pages in a network.
import networkx as nx
import os
from bs4 import BeautifulSoup
def parse_html(file_path):
    """Return the ``href`` value of every ``<a>`` tag in an HTML file.

    Args:
        file_path: Path of the HTML file; it is read as UTF-8 text.

    Returns:
        A list of ``href`` strings in the order ``find_all`` yields the
        anchors. Anchors without an ``href`` attribute are not included.
    """
    # The document is parsed while the handle is still open.
    with open(file_path, 'r', encoding='utf-8') as handle:
        document = BeautifulSoup(handle, 'html.parser')

    hrefs = []
    for anchor in document.find_all('a', href=True):
        hrefs.append(anchor.get('href'))
    return hrefs
def simple_pagerank(directory: str) -> dict:
    """Compute PageRank scores for the HTML files in *directory*.

    Every entry of *directory* whose name ends in ``.html`` becomes a node.
    A directed edge ``page -> target`` is added for each ``href`` found in
    ``page`` that exactly matches the name of such an entry.

    Args:
        directory: Path of the directory to scan (only its direct entries
            are considered).

    Returns:
        The result of ``nx.pagerank`` on the link graph, keyed by file name.
    """
    graph = nx.DiGraph()

    # List the directory once. Previously ``os.listdir`` was re-run for every
    # link of every page, costing a directory scan plus a linear search each
    # time; a set built up front gives constant-time membership tests.
    html_files = [
        name for name in os.listdir(directory) if name.endswith('.html')
    ]
    known_pages = set(html_files)

    for file_name in html_files:
        graph.add_node(file_name)

        # Only links that name another HTML file in the same directory
        # become edges; everything else (external URLs, anchors, ...) is
        # ignored.
        links = parse_html(os.path.join(directory, file_name))
        for link in links:
            if link in known_pages:
                graph.add_edge(file_name, link)

    page_ranks = nx.pagerank(graph)
    return page_ranks
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment