Created
March 7, 2025 06:16
Now we extract the URL, the link text, and the last-modified date for each of our leads.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re | |
import csv | |
import requests | |
def get_last_modified(url):
    """Return the Last-Modified header for *url*, or "Unknown".

    Uses a HEAD request so only headers are transferred.

    Parameters:
        url: absolute URL to probe.

    Returns:
        The Last-Modified header value as a string, or "Unknown" when the
        request fails or the server does not send the header.
    """
    try:
        # HEAD does not follow redirects by default; follow them so we read
        # the headers of the final resource, not of a 301/302 stub that
        # usually carries no Last-Modified header.
        response = requests.head(url, timeout=5, allow_redirects=True)
        return response.headers.get("Last-Modified", "Unknown")
    except requests.RequestException:
        # Timeouts, connection errors, malformed URLs — best-effort fallback.
        return "Unknown"
def extract_urls_from_html(html_path):
    """Extract all URLs, their link text, and last-modified dates from an HTML file.

    Parameters:
        html_path: path to the HTML file to parse.

    Returns:
        A de-duplicated list of (url, link_text, last_modified) tuples.
        URLs found only as plain text get an empty link text.
    """
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    url_data = set()      # set de-duplicates (url, text, modified) triples
    modified_cache = {}   # one HEAD request per distinct URL, not per occurrence

    def _cached_last_modified(url):
        # Memoize network lookups: the same URL often appears both as an
        # <a href> and as plain text, or in several links.
        if url not in modified_cache:
            modified_cache[url] = get_last_modified(url)
        return modified_cache[url]

    # URLs from <a href="..."> together with their visible link text.
    count = 0
    for link in soup.find_all("a", href=True):
        url = link["href"].strip()
        link_text = link.get_text(strip=True)  # visible link text
        if url.startswith(("http://", "https://")):
            last_modified = _cached_last_modified(url)
        else:
            # mailto:, javascript:, anchors, relative paths — a HEAD request
            # is guaranteed to fail, so skip the network round-trip.
            last_modified = "Unknown"
        url_data.add((url, link_text, last_modified))
        print(f"{count} : {url} : {link_text} : {last_modified}")
        count += 1

    # URLs appearing as plain text anywhere in the document.
    url_pattern = re.compile(r"https?://[^\s\"'>]+")
    for text in soup.stripped_strings:
        for match in url_pattern.findall(text):
            # No link text for plain-text URLs.
            url_data.add((match, "", _cached_last_modified(match)))

    return list(url_data)
def save_urls_to_csv(url_data, output_file):
    """Write (URL, link text, last modified) rows to *output_file* as CSV.

    Parameters:
        url_data: iterable of (url, link_text, last_modified) tuples.
        output_file: destination CSV path (overwritten if it exists).
    """
    header = ["URL", "Link Text", "Last Modified"]
    # newline="" lets the csv module control line endings portably.
    with open(output_file, "w", newline="", encoding="utf-8") as handle:
        csv_out = csv.writer(handle)
        csv_out.writerow(header)
        for row in url_data:
            csv_out.writerow(row)
    print(f"URLs saved to {output_file}")
if __name__ == "__main__":
    # Script entry point: parse the exported leads page, report what was
    # found, and persist the results as CSV.
    html_path = "leads-filtered-utf8.htm"  # Change this to your HTML file path
    extracted_urls = extract_urls_from_html(html_path)
    if not extracted_urls:
        print("No URLs found in the HTML file.")
    else:
        output_file = "lead_urls.csv"
        print("Extracted URLs:")
        for url, text, last_modified in extracted_urls:
            print(f"URL: {url}, Link Text: {text}, Last Modified: {last_modified}")
        save_urls_to_csv(extracted_urls, output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment