Last active
July 18, 2024 14:19
-
-
Save eshaben/3de7db754a90e7fa68e86682e0763ca7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script is designed to extract meta tags and page titles from
# URLs listed in a sitemap and save the results to a CSV file.
#
# The script accepts two arguments:
#
# Sitemap URL: The URL of the sitemap to process.
# CSV File Name: The name of the CSV file to which the results will be saved.
#
# Run the script using the following command:
# python <file_name_of_script> <sitemap_url> <output_csv_file>
#
# Note that sitemaps are typically in this format: <website>/sitemap.xml
import requests | |
from bs4 import BeautifulSoup | |
import csv | |
import argparse | |
# Function to extract meta tags
def extract_meta_tags(url):
    """Return the og:description meta tag content of *url*, or '' if absent.

    Fetches the page over HTTP and parses the HTML with BeautifulSoup.
    """
    # A timeout keeps the crawl from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    meta_description = soup.find('meta', property='og:description')
    # .get() avoids a KeyError on a malformed tag missing its content attribute.
    return meta_description.get('content', '') if meta_description else ''
# Function to extract page title
def extract_page_title(url):
    """Return the <title> text of *url*, or '' if the page has no title."""
    # A timeout keeps the crawl from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_title = soup.title.text if soup.title else ''
    return page_title
# Function to fetch sitemap URLs
def fetch_sitemap_urls(sitemap_url):
    """Return the list of page URLs (<loc> entries) from *sitemap_url*.

    Raises requests.HTTPError if the sitemap cannot be fetched, instead of
    silently parsing an error page into an empty list.
    """
    # A timeout keeps the script from hanging on an unreachable sitemap host.
    response = requests.get(sitemap_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'xml')
    urls = soup.find_all('loc')
    return [url.text for url in urls]
# Main function
def main():
    """Parse CLI arguments, crawl each sitemap URL, and write titles and
    meta descriptions to the requested CSV file."""
    # Argument parser setup
    parser = argparse.ArgumentParser(description='Extract meta tags and page titles from a sitemap.')
    parser.add_argument('sitemap', type=str, help='The URL of the sitemap.')
    parser.add_argument('csvfile', type=str, help='The name of the output CSV file.')
    args = parser.parse_args()

    # Fetch URLs from the sitemap
    urls = fetch_sitemap_urls(args.sitemap)

    # Write to CSV file
    with open(args.csvfile, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['URL', 'Page Title', 'Meta Description'])
        for url in urls:
            # One unreachable or broken page should not abort the whole
            # crawl and discard everything written so far.
            try:
                page_title = extract_page_title(url)
                meta_description = extract_meta_tags(url)
            except requests.RequestException as exc:
                print(f"Skipping {url}: {exc}")
                continue
            writer.writerow([url, page_title, meta_description])
    print(f"CSV file '{args.csvfile}' created successfully.")

if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment