Given a URL to a website, extracts the article text from the page, formatted as markdown.
""" | |
define fn for extracting articles | |
Example usage: | |
url = 'https://www.businessinsider.com/hundreds-google-workers-walk-out-zurich-protest-over-layoffs-2023-3' | |
markdown = extract_article(url) | |
print(markdown) | |
""" | |
import requests | |
import re | |
from bs4 import BeautifulSoup | |
def extract_article(url, strip_links=True): | |
""" | |
Given a URL to a website, extracts all the text article from the URL formatted as markdown. | |
:param url: str, URL of the website to extract the article from | |
:param strip_links: bool, decides whether or not to strip links (by default true) | |
:return: str, article content in markdown format | |
""" | |
# Make a request to the URL | |
response = requests.get(url) | |
if response.status_code != 200: | |
raise Exception( | |
f"Request to {url} failed with status code {response.status_code}" | |
) | |
try: | |
# Parse the HTML content using BeautifulSoup | |
soup = BeautifulSoup(response.content, "html.parser") | |
# Find the article content | |
article = soup.find("article") | |
# Remove unwanted elements from the article | |
for element in article.find_all(["script", "style"]): | |
element.extract() | |
# Convert the article to markdown format | |
markdown = "" | |
# Add the article title | |
title_elem = article.find("h1") | |
if title_elem: | |
title = title_elem.get_text() | |
markdown += f"\n# {title}\n\n" | |
# Add the article image | |
image = article.find("img") | |
if image: | |
alt = image.get("alt") | |
src = image.get("src") | |
markdown += f"![{alt}]({src})\n\n" | |
# Add the article content | |
for paragraph in article.find_all("p"): | |
text = paragraph.get_text() | |
if strip_links: | |
try: | |
text = re.sub(r"\[.*?\]\(.*?\)", "", text) # Strip links | |
except re.error as e: | |
print(f"Error stripping links from article text: {e}") | |
markdown += f"{text}\n\n" | |
# Log the number of words | |
try: | |
word_count = len(re.findall(r"\b\w+\b", markdown)) | |
print(f"The article contains {word_count} words.") | |
except TypeError: | |
print("The article content is empty.") | |
return markdown | |
except Exception as e: | |
print(f"Error extracting article content: {e}") | |
return "" |
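
A minimal usage sketch, assuming the gist's example URL still resolves; the output file name is illustrative only:

from pathlib import Path

url = "https://www.businessinsider.com/hundreds-google-workers-walk-out-zurich-protest-over-layoffs-2023-3"
markdown = extract_article(url, strip_links=True)
if markdown:
    # Save the extracted markdown next to the script (hypothetical file name)
    Path("article.md").write_text(markdown, encoding="utf-8")
    print("Saved article.md")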
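
One design note: paragraph.get_text() returns plain text, so the markdown-link regex above rarely has anything to match; it acts as a safety net rather than the main stripping mechanism. A sketch of an HTML-level alternative, assuming you want links dropped before text extraction (strip_html_links is a hypothetical helper, not part of the gist):

from bs4.element import Tag

def strip_html_links(article: Tag) -> Tag:
    """Replace each <a> tag with its plain text so no links reach the markdown."""
    for link in article.find_all("a"):
        link.unwrap()  # bs4's unwrap() keeps the tag's contents, drops the tag itself
    return article

Called on the <article> tag right after the script/style cleanup, this removes links once for every downstream get_text() call.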