Skip to content

Instantly share code, notes, and snippets.

@thingsitried
Created March 7, 2025 06:16
Now we extract the URL, the link text, and the last-modified date of our leads.
from bs4 import BeautifulSoup
import re
import csv
import requests
def get_last_modified(url):
    """Return the Last-Modified header for *url*, or "Unknown".

    A HEAD request is used so only headers (no body) are transferred.

    Parameters
    ----------
    url : str
        Absolute URL to probe.

    Returns
    -------
    str
        The Last-Modified header value, or "Unknown" when the request
        fails or the server omits the header.
    """
    try:
        # allow_redirects=True: unlike the other requests verbs,
        # requests.head() does NOT follow redirects by default, so without
        # it a redirected URL reports the 3xx response's headers (which
        # usually lack Last-Modified) instead of the real resource's.
        response = requests.head(url, timeout=5, allow_redirects=True)
        return response.headers.get("Last-Modified", "Unknown")
    except requests.RequestException:
        # Network errors, timeouts, bad schemes, etc. are all best-effort.
        return "Unknown"
def extract_urls_from_html(html_path):
    """Extract all URLs, their link text, and last-modified dates from an HTML file.

    Parameters
    ----------
    html_path : str
        Path to the HTML file to scan.

    Returns
    -------
    list[tuple[str, str, str]]
        Unique (url, link_text, last_modified) triples. Plain-text URLs
        (not inside an <a> tag) get an empty link text.
    """
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    url_data = set()  # set avoids duplicate (url, text, date) triples

    # Extract URLs from <a href="..."> tags along with their visible text.
    for count, link in enumerate(soup.find_all("a", href=True)):
        url = link["href"].strip()
        link_text = link.get_text(strip=True)  # visible link text
        # Only absolute http(s) URLs can answer a HEAD request; relative
        # paths, fragments, mailto: etc. would previously each burn up to
        # the 5 s timeout inside get_last_modified before returning the
        # same "Unknown" — skip the network round-trip for them.
        if url.startswith(("http://", "https://")):
            last_modified = get_last_modified(url)
        else:
            last_modified = "Unknown"
        url_data.add((url, link_text, last_modified))
        print(f"{count} : {url} : {link_text} : {last_modified}")

    # Also pick up URLs that appear as plain text (outside any <a> tag).
    url_pattern = re.compile(r"https?://[^\s\"'>]+")
    for text in soup.stripped_strings:
        for match in url_pattern.findall(text):
            last_modified = get_last_modified(match)
            url_data.add((match, "", last_modified))  # no link text for plain-text URLs

    return list(url_data)
def save_urls_to_csv(url_data, output_file):
    """Write (URL, link text, last-modified) rows to a CSV file.

    Parameters
    ----------
    url_data : iterable of tuple
        (url, link_text, last_modified) triples to persist.
    output_file : str
        Destination CSV path; overwritten if it already exists.
    """
    header = ["URL", "Link Text", "Last Modified"]
    # newline="" per the csv module docs, so the writer controls line endings.
    with open(output_file, "w", newline="", encoding="utf-8") as handle:
        csv_writer = csv.writer(handle)
        csv_writer.writerow(header)
        for row in url_data:
            csv_writer.writerow(row)
    print(f"URLs saved to {output_file}")
if __name__ == "__main__":
    # Entry point: scan the exported leads page and dump the findings to CSV.
    html_path = "leads-filtered-utf8.htm"  # Change this to your HTML file path
    extracted_urls = extract_urls_from_html(html_path)
    if not extracted_urls:
        print("No URLs found in the HTML file.")
    else:
        output_file = "lead_urls.csv"
        print("Extracted URLs:")
        for url, text, last_modified in extracted_urls:
            print(f"URL: {url}, Link Text: {text}, Last Modified: {last_modified}")
        save_urls_to_csv(extracted_urls, output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment