# This script is designed to extract meta tags and page titles from
# URLs listed in a sitemap and save the results to a CSV file.
#
# The script accepts two arguments:
#
#   Sitemap URL: The URL of the sitemap to process.
#   CSV File Name: The name of the CSV file to which the results will be saved.
#
# Run the script using the following command:
#   python <file_name_of_script> <sitemap_url> <output_csv_file>
#
# Note that sitemaps are typically found at <website>/sitemap.xml.
#
# Requires the requests, beautifulsoup4, and lxml packages (lxml provides
# the XML parser used to read the sitemap).
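#
# For example, with a hypothetical script name, sitemap URL, and output file
# (placeholder values, shown only for illustration):
#   python extract_meta.py https://example.com/sitemap.xml meta_tags.csv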

import argparse
import csv

import requests
from bs4 import BeautifulSoup


# Function to extract meta tags
def extract_meta_tags(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    meta_description = soup.find('meta', property='og:description')
    # Use .get() so a tag without a 'content' attribute doesn't raise a KeyError
    return meta_description.get('content', '') if meta_description else ''


# Function to extract page title
def extract_page_title(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_title = soup.title.text if soup.title else ''
    return page_title


# Function to fetch sitemap URLs
def fetch_sitemap_urls(sitemap_url):
    response = requests.get(sitemap_url)
    soup = BeautifulSoup(response.content, 'xml')
    urls = soup.find_all('loc')
    return [url.text for url in urls]


# Main function
def main():
    # Argument parser setup
    parser = argparse.ArgumentParser(description='Extract meta tags and page titles from a sitemap.')
    parser.add_argument('sitemap', type=str, help='The URL of the sitemap.')
    parser.add_argument('csvfile', type=str, help='The name of the output CSV file.')
    args = parser.parse_args()

    # Fetch URLs from the sitemap
    urls = fetch_sitemap_urls(args.sitemap)

    # Write to CSV file
    with open(args.csvfile, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['URL', 'Page Title', 'Meta Description'])
        for url in urls:
            page_title = extract_page_title(url)
            meta_description = extract_meta_tags(url)
            writer.writerow([url, page_title, meta_description])

    print(f"CSV file '{args.csvfile}' created successfully.")


if __name__ == "__main__":
    main()