Skip to content

Instantly share code, notes, and snippets.

@AlexanderCollins
Created April 8, 2024 07:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AlexanderCollins/7da3071c56d49333687ca4d781ad243d to your computer and use it in GitHub Desktop.
Save AlexanderCollins/7da3071c56d49333687ca4d781ad243d to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
def tag_visible(element) -> bool:
    """
    Decide whether a parsed text node belongs to the visible page content.

    A node is treated as invisible when its parent tag is one of the
    non-rendered containers checked below, or when the node itself is an
    HTML comment.

    Parameters:
        element: The BeautifulSoup element to check visibility for.

    Returns:
        bool: True if the element is visible, False otherwise.
    """
    # A detached node has ``parent is None``; read the name defensively so the
    # check falls through to the comment test instead of raising
    # AttributeError.
    parent_name = getattr(element.parent, 'name', None)
    if parent_name in ('style', 'script', 'head', 'title', 'meta', '[document]'):
        return False
    # Comments are text nodes too, but they are never rendered.
    return not isinstance(element, Comment)
class SiteTextFetcherUtility:
    """
    A utility class for fetching the content of a website by its URL and
    extracting visible text using BeautifulSoup.

    All methods are class methods, so no instance is needed.
    """

    # requests waits indefinitely unless a timeout is supplied, so every fetch
    # uses this many seconds unless the caller passes another value.
    DEFAULT_TIMEOUT = 10

    @classmethod
    def fetch_site(cls, url: str, timeout: float = DEFAULT_TIMEOUT) -> bytes:
        """
        Class method to fetch the content of a website given its URL.

        Parameters:
            url (str): The URL of the website to fetch.
            timeout (float): Value passed to ``requests.get`` as ``timeout``
                (seconds). Defaults to ``DEFAULT_TIMEOUT``.

        Returns:
            bytes: The raw HTML content of the website.

        Raises:
            requests.exceptions.HTTPError: If the server answers with an
                error status (raised by ``raise_for_status``).
            requests.exceptions.RequestException: For other request failures
                such as a timeout or connection error.
        """
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for bad responses
        return response.content

    @classmethod
    def extract_text(cls, html_content) -> str:
        """
        Class method to extract visible text from HTML content.

        Text nodes are filtered with ``tag_visible``, stripped of surrounding
        whitespace, and joined with single spaces. Nodes that are empty after
        stripping are dropped so they do not leave runs of spaces in the
        result.

        Parameters:
            html_content (bytes): The HTML content from which to extract text.

        Returns:
            str: The extracted visible text.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        # ``find_all(string=True)`` is the current spelling of the legacy
        # ``findAll(text=True)`` call.
        text_nodes = soup.find_all(string=True)
        fragments = (node.strip() for node in text_nodes if tag_visible(node))
        return " ".join(fragment for fragment in fragments if fragment)

    @classmethod
    def fetch_and_extract(cls, url: str, timeout: float = DEFAULT_TIMEOUT) -> str:
        """
        Class method to fetch a website by URL and extract all visible text
        from it.

        Parameters:
            url (str): The URL of the website to fetch and extract text from.
            timeout (float): Forwarded to ``fetch_site``. Defaults to
                ``DEFAULT_TIMEOUT``.

        Returns:
            str: The visible text extracted from the website.

        Raises:
            requests.exceptions.RequestException: Propagated from
                ``fetch_site`` when the request fails.
        """
        site_content = cls.fetch_site(url, timeout=timeout)
        return cls.extract_text(site_content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment