@kxrz
Created August 26, 2024 16:02
Content analysis with a Python script and ChatGPT
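The script below walks a sitemap, keeps the pages detected as French, and dumps them as Markdown into site_content.md. The gist does not list its dependencies, but judging from the imports, something like the following should install them (lxml is assumed, since BeautifulSoup's 'xml' parser requires it):

    pip install requests beautifulsoup4 lxml langdetect markdownify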
import requests
from bs4 import BeautifulSoup
from langdetect import detect, DetectorFactory
from markdownify import markdownify as md
import unicodedata
# Fix seed for language detection to ensure reproducibility
DetectorFactory.seed = 0
# Extract all page URLs listed in the sitemap
def get_urls_from_sitemap(sitemap_url):
    print("Fetching sitemap...")
    response = requests.get(sitemap_url, timeout=30)
    soup = BeautifulSoup(response.content, 'xml')
    urls = [url.loc.text for url in soup.find_all('url')]
    print(f"Found {len(urls)} URLs in the sitemap.")
    return urls
# Clean the text: recompose characters (NFC) and drop non-printable ones,
# keeping accented letters and line breaks intact
def clean_text(text):
    normalized_text = unicodedata.normalize('NFC', text)
    return ''.join(c for c in normalized_text if c.isprintable() or c in '\n\t')
# Detect whether the text is in French
def is_french(text):
    try:
        return detect(text) == 'fr'
    except Exception:
        return False
# Scrape a page and return its title, Markdown content, and word count
def scrape_page_text(url):
    print(f"Scraping text from {url}...")
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get the page title
    title = soup.title.string if soup.title else 'No Title'
    # Extract the main textual elements as raw HTML
    texts = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])
    full_text = "\n".join(str(text) for text in texts)
    # Convert the HTML to Markdown
    markdown_text = md(full_text)
    # Keep only pages written in French
    if is_french(markdown_text):
        cleaned_text = clean_text(markdown_text)
        word_count = len(cleaned_text.split())
        return title, cleaned_text, word_count
    return "", "", 0
# Sitemap URL
sitemap_url = 'https://leswww.com/sitemap.xml'
# Extract the URLs
urls = get_urls_from_sitemap(sitemap_url)
# Scrape the French pages and save their content
with open('site_content.md', 'w', encoding='utf-8', errors='replace') as f:
    for url in urls:
        try:
            title, page_text, word_count = scrape_page_text(url)
            if page_text:  # Only write non-empty texts
                separator = '=' * 40
                f.write(f'{separator}\n{separator}\n')
                f.write(f'TITRE: {title.upper()}\n')
                f.write(f'URL: {url.lower()}\n')
                f.write(f'NOMBRE DE MOTS: {word_count}\n')
                f.write(f'{separator}\n{separator}\n')
                f.write(f'{page_text}\n\n')
            print(f"Finished scraping {url}.")
        except Exception as e:
            print(f"Error scraping {url}: {e}")

print("All pages have been scraped and saved to site_content.md.")