Skip to content

Instantly share code, notes, and snippets.

@salvatorecapolupo
Last active July 4, 2024 17:46
Show Gist options
  • Save salvatorecapolupo/f5629d5212e727829e1b6bdf91ea5ef6 to your computer and use it in GitHub Desktop.
Save salvatorecapolupo/f5629d5212e727829e1b6bdf91ea5ef6 to your computer and use it in GitHub Desktop.
Questo codice Python estrae automaticamente title e meta description dalle pagine elencate in una qualsiasi sitemap XML e li salva in un file Excel. Lo script esplora ricorsivamente tutte le sitemap XML annidate e, per ogni URL HTML trovato, estrae il titolo e la meta description. Spiegone: https://trovalost.it/estrae-title-me…
# https://trovalost.it/sitemap_index.xml
import requests
from bs4 import BeautifulSoup
import pandas as pd
def extract_title_meta(url, timeout=10):
    """Fetch a page and return its title and meta description.

    Args:
        url: Address of the HTML page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            single unresponsive host cannot stall the whole run.

    Returns:
        A ``(title, meta_description)`` tuple of strings, both stripped of
        surrounding whitespace. ``title`` is ``'No title'`` when the page
        has no usable ``<title>`` text; ``meta_description`` is ``''`` when
        no description is declared. If the download or the parsing raises,
        the tuple is ``('Error', <exception message>)`` instead.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # ``.string`` is None for an empty <title> or one containing nested
        # markup; fall back to the placeholder rather than leaking None.
        title = 'No title'
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            title = str(title_tag.string).strip() or 'No title'

        meta_description = ''
        for meta in soup.find_all('meta'):
            if (meta.get('name') or '').lower() == 'description':
                # A description tag without ``content`` yields '' instead of
                # raising KeyError and discarding the title already found.
                meta_description = (meta.get('content') or '').strip()
                break

        return title, meta_description
    except Exception as e:
        # Deliberately broad: this is the per-page boundary of a long crawl,
        # and the failure is recorded in the output row instead of aborting.
        return 'Error', str(e)
def _loc_text(entry):
    """Return the stripped text of an entry's ``<loc>`` child, or ''."""
    loc_tag = entry.find('loc')
    if loc_tag is None:
        return ''
    return loc_tag.text.strip()


def extract_urls_from_sitemap(sitemap_url, timeout=10, _visited=None):
    """Return the page URLs listed in a sitemap, following nested sitemaps.

    ``<sitemap>`` entries are fetched recursively and their URLs are added
    first; the ``<url>`` entries of this document follow. Progress is
    reported through ``print_progress`` after each entry.

    Args:
        sitemap_url: Address of the sitemap (or sitemap index) to download.
        timeout: Seconds passed to ``requests.get`` as its timeout.
        _visited: Internal set of sitemap addresses already fetched during
            this crawl; callers should leave it unset.

    Returns:
        A list of URL strings in document order. Entries without a usable
        ``<loc>`` are skipped, and a sitemap that was already visited
        contributes an empty list.

    Exceptions raised by ``requests.get`` are not caught here and propagate
    to the caller.
    """
    # Remember fetched sitemaps so indexes that reference each other (or
    # themselves) cannot recurse forever.
    if _visited is None:
        _visited = set()
    if sitemap_url in _visited:
        return []
    _visited.add(sitemap_url)

    response = requests.get(sitemap_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'xml')

    # Query the tree once per tag type and reuse the results below.
    sitemap_entries = soup.find_all('sitemap')
    url_entries = soup.find_all('url')
    total_urls = len(url_entries) + len(sitemap_entries)
    processed_urls = 0
    urls = []

    for entry in sitemap_entries:
        processed_urls += 1
        loc = _loc_text(entry)
        if not loc:
            # Entry has no usable <loc>: nothing to fetch.
            continue
        urls.extend(extract_urls_from_sitemap(loc, timeout, _visited))
        print_progress(processed_urls, total_urls, loc)

    for entry in url_entries:
        processed_urls += 1
        loc = _loc_text(entry)
        if not loc:
            continue
        urls.append(loc)
        print_progress(processed_urls, total_urls, loc)

    return urls
def print_progress(processed, total, url):
    """Print the completion percentage of a batch along with the current item.

    Args:
        processed: Number of items handled so far.
        total: Number of items in the batch.
        url: The item just handled; echoed verbatim in the message.
    """
    # An empty batch has no meaningful ratio; report 0% instead of raising
    # ZeroDivisionError.
    progress = processed / total * 100 if total else 0.0
    print(f'Progresso: {progress:.2f}% - URL: {url}')
# Root sitemap (or sitemap index) that seeds the crawl.
sitemap_url = 'https://trovalost.it/sitemap_index.xml'

# Collect the page URLs returned for the root sitemap.
urls = extract_urls_from_sitemap(sitemap_url)

# Fetch each page in turn and build one spreadsheet row per URL.
data = []
total_pages = len(urls)
for position, page_url in enumerate(urls, start=1):
    page_title, page_description = extract_title_meta(page_url)
    row = {
        'URL': page_url,
        'Title': page_title,
        'Meta Description': page_description,
    }
    data.append(row)
    print(f'Estrazione dati URL {position}/{total_pages} - URL: {page_url}')

# Tabulate the rows and write them to an Excel workbook (no index column).
df = pd.DataFrame(data)
df.to_excel('sitemap_data.xlsx', index=False)
print("Dati estratti e salvati in 'sitemap_data.xlsx'")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment