Last active
July 18, 2024 14:19
-
-
Save eshaben/3de7db754a90e7fa68e86682e0763ca7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script is designed to extract meta tags and page titles from
# URLs listed in a sitemap and save the results to a CSV file.
#
# The script accepts two arguments:
#
# Sitemap URL: The URL of the sitemap to process.
# CSV File Name: The name of the CSV file to which the results will be saved.
#
# Run the script using the following command:
# python <file_name_of_script> <sitemap_url> <output_csv_file>
#
# Note that sitemaps are typically in this format: <website>/sitemap.xml
import requests | |
from bs4 import BeautifulSoup | |
import csv | |
import argparse | |
# Function to extract meta tags
def extract_meta_tags(url):
    """Return the og:description meta tag content of *url*, or '' if absent.

    Fetches the page over HTTP and parses the HTML with BeautifulSoup.
    """
    # A timeout keeps the crawl from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    meta_description = soup.find('meta', property='og:description')
    # .get() avoids a KeyError on a malformed tag missing its content attribute.
    return meta_description.get('content', '') if meta_description else ''
# Function to extract page title
def extract_page_title(url):
    """Return the <title> text of *url*, or '' if the page has no title."""
    # A timeout keeps the crawl from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_title = soup.title.text if soup.title else ''
    return page_title
# Function to fetch sitemap URLs
def fetch_sitemap_urls(sitemap_url):
    """Return the list of page URLs (<loc> entries) from *sitemap_url*.

    Raises requests.HTTPError if the sitemap cannot be fetched, instead of
    silently parsing an error page into an empty list.
    """
    # A timeout keeps the script from hanging on an unreachable sitemap host.
    response = requests.get(sitemap_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'xml')
    urls = soup.find_all('loc')
    return [url.text for url in urls]
# Main function
def main():
    """Parse CLI arguments, crawl each sitemap URL, and write titles and
    meta descriptions to the requested CSV file."""
    # Argument parser setup
    parser = argparse.ArgumentParser(description='Extract meta tags and page titles from a sitemap.')
    parser.add_argument('sitemap', type=str, help='The URL of the sitemap.')
    parser.add_argument('csvfile', type=str, help='The name of the output CSV file.')
    args = parser.parse_args()

    # Fetch URLs from the sitemap
    urls = fetch_sitemap_urls(args.sitemap)

    # Write to CSV file
    with open(args.csvfile, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['URL', 'Page Title', 'Meta Description'])
        for url in urls:
            # One unreachable or broken page should not abort the whole
            # crawl and discard everything written so far.
            try:
                page_title = extract_page_title(url)
                meta_description = extract_meta_tags(url)
            except requests.RequestException as exc:
                print(f"Skipping {url}: {exc}")
                continue
            writer.writerow([url, page_title, meta_description])
    print(f"CSV file '{args.csvfile}' created successfully.")

if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment