Skip to content

Instantly share code, notes, and snippets.

@kxrz
Created June 19, 2024 15:57
Show Gist options
  • Select an option

  • Save kxrz/bfd43736b5bbeef3243eec09f0d0990c to your computer and use it in GitHub Desktop.

Select an option

Save kxrz/bfd43736b5bbeef3243eec09f0d0990c to your computer and use it in GitHub Desktop.
Scraping text from sitemap.xml
import requests
from bs4 import BeautifulSoup
from langdetect import detect, DetectorFactory
from markdownify import markdownify as md
import unicodedata
# langdetect is non-deterministic by default; pinning the seed makes
# repeated runs classify the same text identically (reproducibility).
DetectorFactory.seed = 0
def get_urls_from_sitemap(sitemap_url, timeout=30):
    """Fetch a sitemap.xml and return the list of page URLs it contains.

    Args:
        sitemap_url: URL of the sitemap.xml file.
        timeout: seconds to wait for the HTTP response (defaults to 30).

    Returns:
        List of URL strings, one per ``<url><loc>`` entry in the sitemap.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    print("Fetching sitemap...")
    # A timeout prevents the script from hanging forever on a dead server.
    response = requests.get(sitemap_url, timeout=timeout)
    # Fail fast on HTTP errors instead of parsing an error page as XML.
    response.raise_for_status()
    response.encoding = 'utf-8'
    # NOTE: the 'xml' parser requires lxml to be installed.
    soup = BeautifulSoup(response.content, 'xml')
    # Guard against malformed <url> entries that lack a <loc> child,
    # which would otherwise raise AttributeError on .text.
    urls = [url.loc.text for url in soup.find_all('url') if url.loc is not None]
    print(f"Found {len(urls)} URLs in the sitemap.")
    return urls
def clean_text(text):
    """Normalize *text* while actually preserving accented characters.

    Bug fix: the original decomposed with NFKD and stripped every
    combining mark (Unicode category 'Mn'), which *removed* accents
    ('é' -> 'e') despite the stated intent ("en conservant les accents").
    NFC recomposition keeps accented letters as single code points;
    only control/invalid characters (category 'C*') are dropped, with
    newlines and tabs kept since the scraped text relies on them.

    Args:
        text: arbitrary Unicode string.

    Returns:
        The NFC-normalized string with control characters removed.
    """
    normalized = unicodedata.normalize('NFC', text)
    return ''.join(
        c for c in normalized
        if c in '\n\t' or not unicodedata.category(c).startswith('C')
    )
def is_french(text):
    """Return True if langdetect classifies *text* as French.

    Returns False both for non-French text and when detection fails
    (langdetect raises LangDetectException on empty or undetectable
    input). The original bare ``except:`` also swallowed
    KeyboardInterrupt/SystemExit; narrowed to ``except Exception``.

    Args:
        text: text sample to classify.

    Returns:
        bool: True only when the detected language code is 'fr'.
    """
    try:
        return detect(text) == 'fr'
    except Exception:  # detection failure (e.g. empty/too-short text)
        return False
def scrape_page_text(url, timeout=30):
    """Download *url* and return its French text content as markdown.

    Args:
        url: page URL to scrape.
        timeout: seconds to wait for the HTTP response (defaults to 30).

    Returns:
        Tuple ``(title, cleaned_text, word_count)`` when the page text
        is detected as French, otherwise ``("", "", 0)``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    print(f"Scraping text from {url}...")
    # Timeout + status check: don't hang, and don't scrape error pages.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.content, 'html.parser')
    # soup.title.string is None when <title> contains nested markup;
    # fall back so callers can safely call title.upper().
    title = (soup.title.string or 'No Title') if soup.title else 'No Title'
    # Keep only the structural text elements (headings, paragraphs, list items).
    texts = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])
    full_text = "\n".join(str(text) for text in texts)
    # Convert the HTML fragments to markdown before language detection.
    markdown_text = md(full_text)
    if not is_french(markdown_text):
        return "", "", 0
    cleaned_text = clean_text(markdown_text)
    word_count = len(cleaned_text.split())
    return title, cleaned_text, word_count
# Sitemap URL of the site to scrape (replace with the real domain).
sitemap_url = 'https://URLDUSITE.com/page-sitemap.xml'

# Collect every page URL listed in the sitemap.
urls = get_urls_from_sitemap(sitemap_url)

# Scrape each page and append its French content to one markdown file.
with open('site_content.md', 'w', encoding='utf-8', errors='replace') as f:
    for url in urls:
        try:
            title, page_text, word_count = scrape_page_text(url)
            if not page_text:  # skip pages with no French content
                continue
            separator = '=' * 40
            # Batch the header + body into a single writelines call.
            f.writelines([
                f'{separator}\n{separator}\n',
                f'TITRE: {title.upper()}\n',
                f'URL: {url.lower()}\n',
                f'NOMBRE DE MOTS: {word_count}\n',
                f'{separator}\n{separator}\n',
                f'{page_text}\n\n',
            ])
            print(f"Finished scraping {url}.")
        except Exception as e:
            # A single bad page should not abort the whole run.
            print(f"Error scraping {url}: {e}")

print("All pages have been scraped and saved to site_content.md.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment