Created
March 7, 2025 06:16
Now we extract the URL, the link text, and the last-modified date for each of our leads.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re | |
import csv | |
import requests | |
def get_last_modified(url):
    """Return the Last-Modified header for *url*, or "Unknown".

    Uses a HEAD request so only headers are transferred.

    Parameters:
        url: absolute URL to probe.

    Returns:
        The Last-Modified header value as a string, or "Unknown" when the
        request fails or the server does not send the header.
    """
    try:
        # HEAD does not follow redirects by default; follow them so we read
        # the headers of the final resource, not of a 301/302 stub that
        # usually carries no Last-Modified header.
        response = requests.head(url, timeout=5, allow_redirects=True)
        return response.headers.get("Last-Modified", "Unknown")
    except requests.RequestException:
        # Timeouts, connection errors, malformed URLs — best-effort fallback.
        return "Unknown"
def extract_urls_from_html(html_path):
    """Extract all URLs, their link text, and last-modified dates from an HTML file.

    Parameters:
        html_path: path to the HTML file to parse.

    Returns:
        A de-duplicated list of (url, link_text, last_modified) tuples.
        URLs found only as plain text get an empty link text.
    """
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    url_data = set()      # set de-duplicates (url, text, modified) triples
    modified_cache = {}   # one HEAD request per distinct URL, not per occurrence

    def _cached_last_modified(url):
        # Memoize network lookups: the same URL often appears both as an
        # <a href> and as plain text, or in several links.
        if url not in modified_cache:
            modified_cache[url] = get_last_modified(url)
        return modified_cache[url]

    # URLs from <a href="..."> together with their visible link text.
    count = 0
    for link in soup.find_all("a", href=True):
        url = link["href"].strip()
        link_text = link.get_text(strip=True)  # visible link text
        if url.startswith(("http://", "https://")):
            last_modified = _cached_last_modified(url)
        else:
            # mailto:, javascript:, anchors, relative paths — a HEAD request
            # is guaranteed to fail, so skip the network round-trip.
            last_modified = "Unknown"
        url_data.add((url, link_text, last_modified))
        print(f"{count} : {url} : {link_text} : {last_modified}")
        count += 1

    # URLs appearing as plain text anywhere in the document.
    url_pattern = re.compile(r"https?://[^\s\"'>]+")
    for text in soup.stripped_strings:
        for match in url_pattern.findall(text):
            # No link text for plain-text URLs.
            url_data.add((match, "", _cached_last_modified(match)))

    return list(url_data)
def save_urls_to_csv(url_data, output_file):
    """Write (URL, link text, last modified) rows to *output_file* as CSV.

    Parameters:
        url_data: iterable of (url, link_text, last_modified) tuples.
        output_file: destination CSV path (overwritten if it exists).
    """
    header = ["URL", "Link Text", "Last Modified"]
    # newline="" lets the csv module control line endings portably.
    with open(output_file, "w", newline="", encoding="utf-8") as handle:
        csv_out = csv.writer(handle)
        csv_out.writerow(header)
        for row in url_data:
            csv_out.writerow(row)
    print(f"URLs saved to {output_file}")
if __name__ == "__main__":
    # Script entry point: parse the exported leads page, report what was
    # found, and persist the results as CSV.
    html_path = "leads-filtered-utf8.htm"  # Change this to your HTML file path
    extracted_urls = extract_urls_from_html(html_path)
    if not extracted_urls:
        print("No URLs found in the HTML file.")
    else:
        output_file = "lead_urls.csv"
        print("Extracted URLs:")
        for url, text, last_modified in extracted_urls:
            print(f"URL: {url}, Link Text: {text}, Last Modified: {last_modified}")
        save_urls_to_csv(extracted_urls, output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment