Created August 26, 2024 16:02
Content analysis with a Python script and ChatGPT
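This script fetches every URL listed in a site's sitemap, keeps only the pages detected as French, converts their content to Markdown, and writes the result to site_content.md so it can later be handed to ChatGPT for analysis. It depends on the requests, beautifulsoup4, lxml (required by BeautifulSoup's 'xml' parser), langdetect, and markdownify packages.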
import requests
from bs4 import BeautifulSoup
from langdetect import detect, DetectorFactory
from markdownify import markdownify as md
import unicodedata

# Fix seed for language detection to ensure reproducibility
DetectorFactory.seed = 0
# Function to extract the URLs from the sitemap
def get_urls_from_sitemap(sitemap_url):
    print("Fetching sitemap...")
    response = requests.get(sitemap_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.content, 'xml')
    urls = [url.loc.text for url in soup.find_all('url')]
    print(f"Found {len(urls)} URLs in the sitemap.")
    return urls
# Function to clean the text by dropping invalid (control/format) characters
# while keeping the accents: NFKC normalization keeps accented letters as
# single composed characters, so they survive the category filter below
def clean_text(text):
    normalized_text = unicodedata.normalize('NFKC', text)
    return ''.join(c for c in normalized_text
                   if unicodedata.category(c)[0] != 'C' or c in '\n\t')
# Function to detect whether the text is French
def is_french(text):
    try:
        return detect(text) == 'fr'
    except Exception:
        return False
# Function to scrape the text of a page
def scrape_page_text(url):
    print(f"Scraping text from {url}...")
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get the page title
    title = soup.title.string if soup.title else 'No Title'
    # Collect the headings, paragraphs and list items as raw HTML
    texts = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])
    full_text = "\n".join([str(text) for text in texts])
    # Convert the HTML to Markdown
    markdown_text = md(full_text)
    if is_french(markdown_text):
        cleaned_text = clean_text(markdown_text)
        word_count = len(cleaned_text.split())
        return title, cleaned_text, word_count
    else:
        return "", "", 0
# Sitemap URL
sitemap_url = 'https://leswww.com/sitemap.xml'

# Extract the URLs
urls = get_urls_from_sitemap(sitemap_url)

# Scrape and save the content of the French-language pages
with open('site_content.md', 'w', encoding='utf-8', errors='replace') as f:
    for url in urls:
        try:
            title, page_text, word_count = scrape_page_text(url)
            if page_text:  # Only write non-empty texts
                separator = '=' * 40
                f.write(f'{separator}\n{separator}\n')
                f.write(f'TITRE: {title.upper()}\n')
                f.write(f'URL: {url.lower()}\n')
                f.write(f'NOMBRE DE MOTS: {word_count}\n')
                f.write(f'{separator}\n{separator}\n')
                f.write(f'{page_text}\n\n')
            print(f"Finished scraping {url}.")
        except Exception as e:
            print(f"Error scraping {url}: {e}")

print("All pages have been scraped and saved to site_content.md.")