me-suzy/py

## py
import os
import re
import unidecode
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from datetime import datetime

def add_content_to_meta(html_content, content_to_add):
    """Swap the description meta tag's content for ``content_to_add``.

    The first ``<meta name="description" content="...">`` tag in
    ``html_content`` is located, and every occurrence of that exact tag text
    is replaced by a tag carrying the new content. Double quotes are removed
    from ``content_to_add`` so they cannot terminate the attribute early.

    Args:
        html_content: HTML text to search.
        content_to_add: Replacement description text.

    Returns:
        The updated HTML, or ``html_content`` unchanged when no description
        meta tag is present.
    """
    found = re.search(r'<meta name="description" content="(.*?)">', html_content)
    if found is None:
        return html_content

    sanitized = content_to_add.replace('"', '')
    replacement_tag = f'<meta name="description" content="{sanitized}">'
    return html_content.replace(found.group(0), replacement_tag)

def _runs_to_markup(runs):
    """Join ``runs`` into one string, wrapping emphasized text in markers.

    Bold+italic text is wrapped in ``***``, bold in ``**`` and italic in
    ``*``. Runs without text are skipped, and neighbouring runs that share
    the same formatting are merged, so a phrase that the document stores as
    several runs yields a single marked segment instead of back-to-back
    markers such as ``**a****b**``.
    """
    segments = []  # [marker, text] pairs in document order
    for run in runs:
        if not run.text:
            # An empty run would otherwise emit bare markers like "****".
            continue
        if run.bold and run.italic:
            marker = '***'
        elif run.bold:
            marker = '**'
        elif run.italic:
            marker = '*'
        else:
            marker = ''
        if segments and segments[-1][0] == marker:
            segments[-1][1] += run.text
        else:
            segments.append([marker, run.text])
    return "".join(f'{marker}{text}{marker}' for marker, text in segments)


def extract_data_from_docx(file_path):
    """Read a .docx file and split it into ``(title, body)`` articles.

    A non-empty paragraph whose alignment is CENTER starts a new article and
    supplies its title. Every other paragraph that follows a title is added
    to that article's body, with bold/italic runs encoded using ``*``
    markers. Paragraphs that appear before the first title are ignored.

    Args:
        file_path: Path of the .docx document to read.

    Returns:
        A list of ``(title, body)`` tuples, where ``body`` is a list with one
        marked-up string per paragraph.
    """
    doc = Document(file_path)
    articles = []
    current_title = None
    current_body = []

    for para in doc.paragraphs:
        stripped_text = para.text.strip()
        # NOTE(review): only alignment set on the paragraph itself is
        # compared here; centering inherited from a style may not be
        # detected -- confirm against the source documents.
        if para.alignment == WD_PARAGRAPH_ALIGNMENT.CENTER and stripped_text:
            if current_title:
                articles.append((current_title, current_body))
                current_body = []
            current_title = stripped_text
        elif current_title:
            current_body.append(_runs_to_markup(para.runs))

    # Flush the article that was still being collected at end of document.
    if current_title:
        articles.append((current_title, current_body))

    return articles

def remove_diacritics(text):
    """Return ``text`` transliterated by ``unidecode.unidecode``."""
    transliterated = unidecode.unidecode(text)
    return transliterated

def generate_filename(title):
    """Build an ``.html`` file name from an article title.

    The title is lower-cased and passed through ``remove_diacritics``; every
    run of characters other than ``a-z``, ``0-9`` and ``-`` becomes a hyphen,
    repeated hyphens collapse into one, and hyphens at either end are
    removed before ``.html`` is appended.

    Args:
        title: Article title.

    Returns:
        The generated file name, e.g. ``some-title.html``.
    """
    slug = remove_diacritics(title.lower())
    # First pass replaces disallowed characters, second pass collapses hyphens.
    for pattern in (r'[^a-z0-9\-]+', r'-+'):
        slug = re.sub(pattern, '-', slug)
    slug = slug.strip('-')
    return f"{slug}.html"

# Emphasis markers found in body paragraphs: ***bold italic***, **bold** and
# *italic*. The longest marker is tried first so that ``***x***`` is not
# misread as a bold segment ``*x`` followed by a stray ``*``.
_EMPHASIS_RE = re.compile(
    r'\*\*\*(?P<bold_italic>.*?)\*\*\*'
    r'|\*\*(?P<bold>.*?)\*\*'
    r'|\*(?P<italic>.*?)\*'
)


def _split_emphasis(text):
    """Split ``text`` into ``(style, content)`` pairs, dropping empty pieces.

    ``style`` is one of ``'plain'``, ``'bold'``, ``'italic'`` or
    ``'bold_italic'``; ``content`` is the text without its markers.
    """
    segments = []
    cursor = 0
    for match in _EMPHASIS_RE.finditer(text):
        if match.start() > cursor:
            segments.append(('plain', text[cursor:match.start()]))
        style = match.lastgroup
        if match.group(style):
            segments.append((style, match.group(style)))
        cursor = match.end()
    if cursor < len(text):
        segments.append(('plain', text[cursor:]))
    return segments


def _render_segment(style, content, paragraph_is_bold):
    """Return the HTML for one segment of a paragraph."""
    if style in ('italic', 'bold_italic'):
        content = f'<em>{content}</em>'
    # Inside a bold paragraph the <p> class already carries the bold style,
    # so no extra span is emitted.
    if style in ('bold', 'bold_italic') and not paragraph_is_bold:
        content = f'<span class="text_obisnuit2">{content}</span>'
    return content


def format_body(body):
    """Render body paragraphs as HTML ``<p>`` elements.

    Each paragraph is stripped and its ``*`` emphasis markers are converted:
    bold becomes ``<span class="text_obisnuit2">``, italic becomes ``<em>``
    and bold+italic becomes both. A paragraph gets the ``text_obisnuit2``
    class (and no inner bold spans) when it starts with ``Leadership:`` or
    when all of its non-whitespace text is bold; otherwise it gets the
    ``text_obisnuit`` class.

    Args:
        body: Iterable of paragraph strings using ``*`` emphasis markers.

    Returns:
        The concatenated HTML, one ``<p>...</p>`` line per paragraph, or an
        empty string for an empty body.
    """
    rendered = []
    for paragraph in body:
        paragraph = paragraph.strip()
        segments = _split_emphasis(paragraph)

        # Decide "entirely bold" from the parsed segments rather than from
        # the first and last characters, so that "**a** mid **b**" is not
        # mistaken for a fully bold paragraph.
        visible_styles = [style for style, content in segments if content.strip()]
        entirely_bold = bool(visible_styles) and all(
            style in ('bold', 'bold_italic') for style in visible_styles
        )
        paragraph_is_bold = entirely_bold or paragraph.startswith("Leadership:")

        css_class = "text_obisnuit2" if paragraph_is_bold else "text_obisnuit"
        inner_html = "".join(
            _render_segment(style, content, paragraph_is_bold)
            for style, content in segments
        )
        rendered.append(f'<p class="{css_class}">{inner_html}</p>\n')

    return "".join(rendered)

def update_html_content(html_content, title, first_sentence, body, filename):
    """Fill the HTML template with one article's data.

    The following parts of ``html_content`` are replaced: the date in every
    ``On <date>, in`` phrase, the ``<title>`` element, the
    ``<h1 class="den_articol" itemprop="name">`` element, every ``zzz.html``
    placeholder, the description meta tag, and everything between the
    ``<!-- SASA-1 -->`` and ``<!-- SASA-2 -->`` comments.

    Args:
        html_content: Template HTML.
        title: Article title; used as-is in the ``<h1>`` and passed through
            ``remove_diacritics`` for the ``<title>`` element.
        first_sentence: Text for the description meta tag. Emphasis markers
            (``*``, ``**``, ``***``) are removed and double quotes are
            written as ``&quot;``.
        body: Paragraph strings, rendered with ``format_body``.
        filename: File name substituted for ``zzz.html``.

    Returns:
        The filled-in HTML.
    """
    title_without_diacritics = remove_diacritics(title)

    # Stamp the date before any document text is inserted, so an article
    # sentence such as "On Monday, in the park" is not rewritten.
    current_date = datetime.now().strftime("%B %d, %Y")
    date_phrase = f'On {current_date}, in'
    html_content = re.sub(r'On .*?, in', lambda _m: date_phrase, html_content)

    # Replacements are supplied through callables: a plain replacement string
    # is scanned for backslash escapes, so document text containing "\" would
    # raise re.error or be read as a group reference.
    title_tag = f'<title>{title_without_diacritics} | Neculai Fantanaru (en)</title>'
    html_content = re.sub(r'<title>.*?</title>', lambda _m: title_tag, html_content)

    heading_tag = f'<h1 class="den_articol" itemprop="name">{title}</h1>'
    html_content = re.sub(
        r'<h1 class="den_articol" itemprop="name">.*?</h1>',
        lambda _m: heading_tag,
        html_content,
    )

    html_content = html_content.replace('zzz.html', filename)

    # The description is plain attribute text: drop the emphasis markers and
    # keep double quotes from closing the attribute.
    description = re.sub(r'(\*{1,3})(.*?)\1', r'\2', first_sentence).strip()
    description = description.replace('"', '&quot;')
    meta_desc = f'<meta name="description" content="{description}">'
    html_content = re.sub(
        r'<meta name="description" content=".*?">',
        lambda _m: meta_desc,
        html_content,
    )

    formatted_body = format_body(body)
    body_section = f'<!-- SASA-1 -->\n{formatted_body}\n<!-- SASA-2 -->'
    html_content = re.sub(
        r'<!-- SASA-1 -->.*?<!-- SASA-2 -->',
        lambda _m: body_section,
        html_content,
        flags=re.DOTALL,
    )

    return html_content

def post_process_html(html_content):
    """Replace non-breaking-space tokens in ``html_content`` with spaces.

    Three tokens are replaced, in this order: the literal string ``NBSP``,
    the U+00A0 character, and the ``&nbsp;`` HTML entity.

    Args:
        html_content: HTML text to clean.

    Returns:
        The text with every such token replaced by a regular space.
    """
    for token in ("NBSP", "\u00A0", "&nbsp;"):
        html_content = html_content.replace(token, " ")
    return html_content

def main():
    """Generate one HTML file per article found in ``bebe.docx``.

    Articles are read with ``extract_data_from_docx``, merged into the
    ``index.html`` template with ``update_html_content``, cleaned with
    ``post_process_html`` and written to the ``output`` directory under the
    name returned by ``generate_filename``. Progress and errors are printed.
    The function returns early when an input file is missing or the
    document holds no articles; articles without any text are skipped.
    """
    docx_path = "bebe.docx"
    html_path = "index.html"
    output_dir = "output"

    for required_path in (docx_path, html_path):
        if not os.path.exists(required_path):
            print(f"Error: File '{required_path}' not found.")
            return

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    articles = extract_data_from_docx(docx_path)

    if not articles:
        print("No articles found in the document.")
        return

    # The template is identical for every article, so read it only once.
    with open(html_path, 'r', encoding='utf-8') as file:
        template_html = file.read()

    for title, body in articles:
        filename = generate_filename(title)
        print(f"Processing article: {title}")
        print(f"Generated filename: {filename}")

        # Use the first paragraph that has text for the meta description; a
        # leading blank paragraph would otherwise produce an empty one.
        first_paragraph = next((text for text in body if text.strip()), None)
        if first_paragraph is None:
            print(f"Warning: Empty body for article '{title}'. Skipping.")
            continue

        updated_html = update_html_content(template_html, title, first_paragraph, body, filename)

        # Post-processing is applied right before saving.
        updated_html = post_process_html(updated_html)

        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(updated_html)

        print(f"Saved article as: {output_path}")

    print("All articles have been processed successfully.")


if __name__ == "__main__":
    main()
	import os
	import re
	import unidecode
	from docx import Document
	from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
	from datetime import datetime

	def add_content_to_meta(html_content, content_to_add):
	    """Swap the description meta tag's content for ``content_to_add``.

	    The first ``<meta name="description" content="...">`` tag in
	    ``html_content`` is located, and every occurrence of that exact tag
	    text is replaced by a tag carrying the new content. Double quotes are
	    removed from ``content_to_add`` so they cannot terminate the attribute
	    early.

	    Args:
	        html_content: HTML text to search.
	        content_to_add: Replacement description text.

	    Returns:
	        The updated HTML, or ``html_content`` unchanged when no
	        description meta tag is present.
	    """
	    found = re.search(r'<meta name="description" content="(.*?)">', html_content)
	    if found is None:
	        return html_content

	    sanitized = content_to_add.replace('"', '')
	    replacement_tag = f'<meta name="description" content="{sanitized}">'
	    return html_content.replace(found.group(0), replacement_tag)

	def _runs_to_markup(runs):
	    """Join ``runs`` into one string, wrapping emphasized text in markers.

	    Bold+italic text is wrapped in ``***``, bold in ``**`` and italic in
	    ``*``. Runs without text are skipped, and neighbouring runs that share
	    the same formatting are merged, so a phrase that the document stores
	    as several runs yields a single marked segment.
	    """
	    segments = []  # [marker, text] pairs in document order
	    for run in runs:
	        if not run.text:
	            # An empty run would otherwise emit bare markers like "****".
	            continue
	        if run.bold and run.italic:
	            marker = '***'
	        elif run.bold:
	            marker = '**'
	        elif run.italic:
	            marker = '*'
	        else:
	            marker = ''
	        if segments and segments[-1][0] == marker:
	            segments[-1][1] += run.text
	        else:
	            segments.append([marker, run.text])
	    return "".join(f'{marker}{text}{marker}' for marker, text in segments)


	def extract_data_from_docx(file_path):
	    """Read a .docx file and split it into ``(title, body)`` articles.

	    A non-empty paragraph whose alignment is CENTER starts a new article
	    and supplies its title. Every other paragraph that follows a title is
	    added to that article's body, with bold/italic runs encoded using
	    ``*`` markers. Paragraphs before the first title are ignored.

	    Args:
	        file_path: Path of the .docx document to read.

	    Returns:
	        A list of ``(title, body)`` tuples, where ``body`` is a list with
	        one marked-up string per paragraph.
	    """
	    doc = Document(file_path)
	    articles = []
	    current_title = None
	    current_body = []

	    for para in doc.paragraphs:
	        stripped_text = para.text.strip()
	        if para.alignment == WD_PARAGRAPH_ALIGNMENT.CENTER and stripped_text:
	            if current_title:
	                articles.append((current_title, current_body))
	                current_body = []
	            current_title = stripped_text
	        elif current_title:
	            current_body.append(_runs_to_markup(para.runs))

	    # Flush the article that was still being collected at end of document.
	    if current_title:
	        articles.append((current_title, current_body))

	    return articles

	def remove_diacritics(text):
	    """Return ``text`` transliterated by ``unidecode.unidecode``."""
	    return unidecode.unidecode(text)

	def generate_filename(title):
	    """Build an ``.html`` file name from an article title.

	    The title is lower-cased and passed through ``remove_diacritics``;
	    every run of characters other than ``a-z``, ``0-9`` and ``-`` becomes
	    a hyphen, repeated hyphens collapse into one, and hyphens at either
	    end are removed before ``.html`` is appended.

	    Args:
	        title: Article title.

	    Returns:
	        The generated file name, e.g. ``some-title.html``.
	    """
	    normalized_title = remove_diacritics(title.lower())
	    normalized_title = re.sub(r'[^a-z0-9\-]+', '-', normalized_title)
	    normalized_title = re.sub(r'-+', '-', normalized_title).strip('-')
	    return f"{normalized_title}.html"

	# Emphasis markers found in body paragraphs: ***bold italic***, **bold**
	# and *italic*. The longest marker is tried first so that ``***x***`` is
	# not misread as a bold segment ``*x`` followed by a stray ``*``.
	_EMPHASIS_RE = re.compile(
	    r'\*\*\*(?P<bold_italic>.*?)\*\*\*'
	    r'|\*\*(?P<bold>.*?)\*\*'
	    r'|\*(?P<italic>.*?)\*'
	)


	def _split_emphasis(text):
	    """Split ``text`` into ``(style, content)`` pairs, dropping empty pieces.

	    ``style`` is one of ``'plain'``, ``'bold'``, ``'italic'`` or
	    ``'bold_italic'``; ``content`` is the text without its markers.
	    """
	    segments = []
	    cursor = 0
	    for match in _EMPHASIS_RE.finditer(text):
	        if match.start() > cursor:
	            segments.append(('plain', text[cursor:match.start()]))
	        style = match.lastgroup
	        if match.group(style):
	            segments.append((style, match.group(style)))
	        cursor = match.end()
	    if cursor < len(text):
	        segments.append(('plain', text[cursor:]))
	    return segments


	def _render_segment(style, content, paragraph_is_bold):
	    """Return the HTML for one segment of a paragraph."""
	    if style in ('italic', 'bold_italic'):
	        content = f'<em>{content}</em>'
	    # Inside a bold paragraph the <p> class already carries the bold
	    # style, so no extra span is emitted.
	    if style in ('bold', 'bold_italic') and not paragraph_is_bold:
	        content = f'<span class="text_obisnuit2">{content}</span>'
	    return content


	def format_body(body):
	    """Render body paragraphs as HTML ``<p>`` elements.

	    Each paragraph is stripped and its ``*`` emphasis markers are
	    converted: bold becomes ``<span class="text_obisnuit2">``, italic
	    becomes ``<em>`` and bold+italic becomes both. A paragraph gets the
	    ``text_obisnuit2`` class (and no inner bold spans) when it starts with
	    ``Leadership:`` or when all of its non-whitespace text is bold;
	    otherwise it gets the ``text_obisnuit`` class.

	    Args:
	        body: Iterable of paragraph strings using ``*`` emphasis markers.

	    Returns:
	        The concatenated HTML, one ``<p>...</p>`` line per paragraph, or
	        an empty string for an empty body.
	    """
	    rendered = []
	    for paragraph in body:
	        paragraph = paragraph.strip()
	        segments = _split_emphasis(paragraph)

	        # Decide "entirely bold" from the parsed segments rather than from
	        # the first and last characters of the paragraph.
	        visible_styles = [style for style, content in segments if content.strip()]
	        entirely_bold = bool(visible_styles) and all(
	            style in ('bold', 'bold_italic') for style in visible_styles
	        )
	        paragraph_is_bold = entirely_bold or paragraph.startswith("Leadership:")

	        css_class = "text_obisnuit2" if paragraph_is_bold else "text_obisnuit"
	        inner_html = "".join(
	            _render_segment(style, content, paragraph_is_bold)
	            for style, content in segments
	        )
	        rendered.append(f'<p class="{css_class}">{inner_html}</p>\n')

	    return "".join(rendered)

	def update_html_content(html_content, title, first_sentence, body, filename):
	    """Fill the HTML template with one article's data.

	    The following parts of ``html_content`` are replaced: the date in
	    every ``On <date>, in`` phrase, the ``<title>`` element, the
	    ``<h1 class="den_articol" itemprop="name">`` element, every
	    ``zzz.html`` placeholder, the description meta tag, and everything
	    between the ``<!-- SASA-1 -->`` and ``<!-- SASA-2 -->`` comments.

	    Args:
	        html_content: Template HTML.
	        title: Article title; used as-is in the ``<h1>`` and passed
	            through ``remove_diacritics`` for the ``<title>`` element.
	        first_sentence: Text for the description meta tag. Emphasis
	            markers (``*``, ``**``, ``***``) are removed and double quotes
	            are written as ``&quot;``.
	        body: Paragraph strings, rendered with ``format_body``.
	        filename: File name substituted for ``zzz.html``.

	    Returns:
	        The filled-in HTML.
	    """
	    title_without_diacritics = remove_diacritics(title)

	    # Stamp the date before any document text is inserted, so an article
	    # sentence such as "On Monday, in the park" is not rewritten.
	    current_date = datetime.now().strftime("%B %d, %Y")
	    date_phrase = f'On {current_date}, in'
	    html_content = re.sub(r'On .*?, in', lambda _m: date_phrase, html_content)

	    # Replacements are supplied through callables: a plain replacement
	    # string is scanned for backslash escapes, so document text containing
	    # "\" would raise re.error or be read as a group reference.
	    title_tag = f'<title>{title_without_diacritics} | Neculai Fantanaru (en)</title>'
	    html_content = re.sub(r'<title>.*?</title>', lambda _m: title_tag, html_content)

	    heading_tag = f'<h1 class="den_articol" itemprop="name">{title}</h1>'
	    html_content = re.sub(
	        r'<h1 class="den_articol" itemprop="name">.*?</h1>',
	        lambda _m: heading_tag,
	        html_content,
	    )

	    html_content = html_content.replace('zzz.html', filename)

	    # The description is plain attribute text: drop the emphasis markers
	    # and keep double quotes from closing the attribute.
	    description = re.sub(r'(\*{1,3})(.*?)\1', r'\2', first_sentence).strip()
	    description = description.replace('"', '&quot;')
	    meta_desc = f'<meta name="description" content="{description}">'
	    html_content = re.sub(
	        r'<meta name="description" content=".*?">',
	        lambda _m: meta_desc,
	        html_content,
	    )

	    formatted_body = format_body(body)
	    body_section = f'<!-- SASA-1 -->\n{formatted_body}\n<!-- SASA-2 -->'
	    html_content = re.sub(
	        r'<!-- SASA-1 -->.*?<!-- SASA-2 -->',
	        lambda _m: body_section,
	        html_content,
	        flags=re.DOTALL,
	    )

	    return html_content

	def post_process_html(html_content):
	    """Replace non-breaking-space tokens in ``html_content`` with spaces.

	    Three tokens are replaced, in this order: the literal string
	    ``NBSP``, the U+00A0 character, and the ``&nbsp;`` HTML entity.

	    Args:
	        html_content: HTML text to clean.

	    Returns:
	        The text with every such token replaced by a regular space.
	    """
	    # Literal string "NBSP".
	    html_content = html_content.replace("NBSP", " ")

	    # Non-breaking space character (U+00A0).
	    html_content = html_content.replace("\u00A0", " ")

	    # HTML entity; restored here because the mangled copy had it rendered
	    # to a plain space, which made this replacement a no-op.
	    html_content = html_content.replace("&nbsp;", " ")

	    return html_content

	def main():
	    """Generate one HTML file per article found in ``bebe.docx``.

	    Articles are read with ``extract_data_from_docx``, merged into the
	    ``index.html`` template with ``update_html_content``, cleaned with
	    ``post_process_html`` and written to the ``output`` directory under
	    the name returned by ``generate_filename``. Progress and errors are
	    printed. The function returns early when an input file is missing or
	    the document holds no articles; articles without any text are skipped.
	    """
	    docx_path = "bebe.docx"
	    html_path = "index.html"
	    output_dir = "output"

	    for required_path in (docx_path, html_path):
	        if not os.path.exists(required_path):
	            print(f"Error: File '{required_path}' not found.")
	            return

	    if not os.path.exists(output_dir):
	        os.makedirs(output_dir)
	        print(f"Created output directory: {output_dir}")

	    articles = extract_data_from_docx(docx_path)

	    if not articles:
	        print("No articles found in the document.")
	        return

	    # The template is identical for every article, so read it only once.
	    with open(html_path, 'r', encoding='utf-8') as file:
	        template_html = file.read()

	    for title, body in articles:
	        filename = generate_filename(title)
	        print(f"Processing article: {title}")
	        print(f"Generated filename: {filename}")

	        # Use the first paragraph that has text for the meta description.
	        first_paragraph = next((text for text in body if text.strip()), None)
	        if first_paragraph is None:
	            print(f"Warning: Empty body for article '{title}'. Skipping.")
	            continue

	        updated_html = update_html_content(template_html, title, first_paragraph, body, filename)

	        # Post-processing is applied right before saving.
	        updated_html = post_process_html(updated_html)

	        output_path = os.path.join(output_dir, filename)
	        with open(output_path, 'w', encoding='utf-8') as file:
	            file.write(updated_html)

	        print(f"Saved article as: {output_path}")

	    print("All articles have been processed successfully.")


	if __name__ == "__main__":
	    main()