# pszemraj/extract_article.py

## extract_article.py
"""
Defines a function for extracting the article from a web page as markdown.


Example usage:

url = 'https://www.businessinsider.com/hundreds-google-workers-walk-out-zurich-protest-over-layoffs-2023-3'
markdown = extract_article(url)
print(markdown)
"""

import requests
import re
from bs4 import BeautifulSoup


def extract_article(url, strip_links=True, timeout=30):
    """
    Fetch a web page and return the content of its <article> element as markdown.

    The output is built from the first <h1> inside the article (as a level-1
    heading), the first <img> (as a markdown image) and the text of every <p>,
    each followed by a blank line. The word count of the result is printed.

    :param url: str, URL of the website to extract the article from
    :param strip_links: bool, decides whether or not to strip markdown-style
        ``[text](target)`` spans from the paragraph text (by default true)
    :param timeout: float or (connect, read) tuple, seconds handed to
        ``requests.get`` so a stalled server cannot block forever (by default 30)
    :return: str, article content in markdown format; an empty string when the
        page has no <article> element or the content cannot be processed
    :raises Exception: if the response status code is not 200. Errors raised by
        ``requests.get`` itself are not caught here and propagate to the caller.
    """
    # Make a request to the URL
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            f"Request to {url} failed with status code {response.status_code}"
        )

    link_pattern = re.compile(r"\[.*?\]\(.*?\)")

    try:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # find() returns None when the page has no <article>; report that
        # explicitly instead of failing later with an opaque AttributeError.
        article = soup.find("article")
        if article is None:
            print(
                "Error extracting article content: "
                f"no <article> element found at {url}"
            )
            return ""

        # Remove unwanted elements from the article
        for element in article.find_all(["script", "style"]):
            element.extract()

        # Collect the markdown pieces and join once at the end
        parts = []

        # Add the article title
        title_elem = article.find("h1")
        if title_elem is not None:
            parts.append(f"\n# {title_elem.get_text()}\n\n")

        # Add the article image; skip it when there is no usable source and
        # fall back to empty alt text rather than emitting the string "None".
        image = article.find("img")
        if image is not None:
            src = image.get("src")
            if src:
                alt = image.get("alt") or ""
                parts.append(f"![{alt}]({src})\n\n")

        # Add the article content
        for paragraph in article.find_all("p"):
            text = paragraph.get_text()
            if strip_links:
                text = link_pattern.sub("", text)  # Strip links
            parts.append(f"{text}\n\n")

        markdown = "".join(parts)

        # Log the number of words
        word_count = len(re.findall(r"\b\w+\b", markdown))
        print(f"The article contains {word_count} words.")

        return markdown

    except Exception as e:
        # Best-effort boundary: any parsing failure yields an empty result
        print(f"Error extracting article content: {e}")
        return ""
	"""
	Defines a function for extracting the article from a web page as markdown.


	Example usage:

	url = 'https://www.businessinsider.com/hundreds-google-workers-walk-out-zurich-protest-over-layoffs-2023-3'
	markdown = extract_article(url)
	print(markdown)
	"""

	import requests
	import re
	from bs4 import BeautifulSoup


	def extract_article(url, strip_links=True, timeout=30):
		"""
		Fetch a web page and return the content of its <article> element as markdown.

		The output is built from the first <h1> inside the article (as a level-1
		heading), the first <img> (as a markdown image) and the text of every <p>,
		each followed by a blank line. The word count of the result is printed.

		:param url: str, URL of the website to extract the article from
		:param strip_links: bool, decides whether or not to strip markdown-style
			``[text](target)`` spans from the paragraph text (by default true)
		:param timeout: float or (connect, read) tuple, seconds handed to
			``requests.get`` so a stalled server cannot block forever (by default 30)
		:return: str, article content in markdown format; an empty string when the
			page has no <article> element or the content cannot be processed
		:raises Exception: if the response status code is not 200. Errors raised by
			``requests.get`` itself are not caught here and propagate to the caller.
		"""
		# Make a request to the URL
		response = requests.get(url, timeout=timeout)
		if response.status_code != 200:
			raise Exception(
				f"Request to {url} failed with status code {response.status_code}"
			)

		# Non-greedy quantifiers restored: without the "*" the pattern only
		# matched links whose text and target were at most one character long.
		link_pattern = re.compile(r"\[.*?\]\(.*?\)")

		try:
			# Parse the HTML content using BeautifulSoup
			soup = BeautifulSoup(response.content, "html.parser")

			# find() returns None when the page has no <article>; report that
			# explicitly instead of failing later with an opaque AttributeError.
			article = soup.find("article")
			if article is None:
				print(
					"Error extracting article content: "
					f"no <article> element found at {url}"
				)
				return ""

			# Remove unwanted elements from the article
			for element in article.find_all(["script", "style"]):
				element.extract()

			# Collect the markdown pieces and join once at the end
			parts = []

			# Add the article title
			title_elem = article.find("h1")
			if title_elem is not None:
				parts.append(f"\n# {title_elem.get_text()}\n\n")

			# Add the article image; skip it when there is no usable source and
			# fall back to empty alt text rather than emitting the string "None".
			image = article.find("img")
			if image is not None:
				src = image.get("src")
				if src:
					alt = image.get("alt") or ""
					parts.append(f"![{alt}]({src})\n\n")

			# Add the article content
			for paragraph in article.find_all("p"):
				text = paragraph.get_text()
				if strip_links:
					text = link_pattern.sub("", text)  # Strip links
				parts.append(f"{text}\n\n")

			markdown = "".join(parts)

			# Log the number of words
			word_count = len(re.findall(r"\b\w+\b", markdown))
			print(f"The article contains {word_count} words.")

			return markdown

		except Exception as e:
			# Best-effort boundary: any parsing failure yields an empty result
			print(f"Error extracting article content: {e}")
			return ""