Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created March 21, 2023 01:55
Show Gist options
  • Save pszemraj/356782b2e94da45acaf20b0b3ef8d903 to your computer and use it in GitHub Desktop.
Save pszemraj/356782b2e94da45acaf20b0b3ef8d903 to your computer and use it in GitHub Desktop.
Given the URL of a web page, extracts the article text from that page and formats it as Markdown.
"""
Defines `extract_article`, a function that extracts an article from a web page as Markdown.
Example usage:
url = 'https://www.businessinsider.com/hundreds-google-workers-walk-out-zurich-protest-over-layoffs-2023-3'
markdown = extract_article(url)
print(markdown)
"""
import requests
import re
from bs4 import BeautifulSoup
def extract_article(url, strip_links=True, timeout=30):
    """
    Given a URL to a website, extract the article text formatted as markdown.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. Only
    the first ``<article>`` element is considered: its first ``<h1>`` becomes
    the heading, its first ``<img>`` (if it has a ``src``) becomes an image
    line, and every ``<p>`` becomes a paragraph. The word count of the result
    is printed to stdout.

    :param url: str, URL of the website to extract the article from
    :param strip_links: bool, if true (default) remove markdown-style
        ``[text](target)`` patterns from the paragraph text
    :param timeout: seconds to wait for the server before giving up; passed
        straight to ``requests.get`` (default 30)
    :return: str, article content in markdown format, or "" if the page has
        no ``<article>`` element or extraction fails
    :raises Exception: if the response status code is not 200
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            f"Request to {url} failed with status code {response.status_code}"
        )

    try:
        soup = BeautifulSoup(response.content, "html.parser")

        # find() returns None when the page has no <article>; report that
        # explicitly instead of tripping an AttributeError further down.
        article = soup.find("article")
        if article is None:
            print(
                "Error extracting article content: "
                f"no <article> element found at {url}"
            )
            return ""

        # Drop elements whose text is code/styling rather than prose.
        for element in article.find_all(["script", "style"]):
            element.extract()

        # Collect the pieces and join once at the end.
        parts = []

        title_elem = article.find("h1")
        if title_elem:
            # Collapse whitespace so the heading stays on a single line.
            title = " ".join(title_elem.get_text().split())
            parts.append(f"\n# {title}\n\n")

        image = article.find("img")
        if image:
            src = image.get("src")
            # A missing alt must not render as the literal text "None", and
            # an image without a src has nothing to link to.
            alt = image.get("alt") or ""
            if src:
                parts.append(f"![{alt}]({src})\n\n")

        for paragraph in article.find_all("p"):
            text = paragraph.get_text()
            if strip_links:
                text = re.sub(r"\[.*?\]\(.*?\)", "", text)  # Strip links
            parts.append(f"{text}\n\n")

        markdown = "".join(parts)

        # Log the number of words
        word_count = len(re.findall(r"\b\w+\b", markdown))
        print(f"The article contains {word_count} words.")
        return markdown
    except Exception as e:
        # Best-effort contract: parsing problems yield an empty string
        # rather than propagating to the caller.
        print(f"Error extracting article content: {e}")
        return ""
@pszemraj
Copy link
Author

additional cleaning ezmode:

pip install -U -q clean-text
# NOTE(review): `clean` is presumably `from cleantext import clean` (the clean-text package installed above) — confirm.
# `markdown` is expected to be the string returned by extract_article(url).
cln_markdown = clean(markdown, lower=False, no_urls=True, no_emails=True, no_phone_numbers=True,)
print(cln_markdown)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment