-
-
Save me-suzy/7572a32ecb90c7faf3d159b16a362118 to your computer and use it in GitHub Desktop.
iulia-FINAL-docx-FINAL-engleza
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import unidecode | |
from docx import Document | |
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT | |
from datetime import datetime | |
def add_content_to_meta(html_content, content_to_add):
    """Swap the content of the page's description meta tag.

    Searches *html_content* for the first
    ``<meta name="description" content="...">`` tag. If one is found, every
    occurrence of that exact tag text is replaced by a tag whose content is
    *content_to_add* with all double quotes removed. If no such tag is
    present, *html_content* is returned unchanged.
    """
    found = re.search(r'<meta name="description" content="(.*?)">', html_content)
    if found is None:
        return html_content

    # Double quotes would terminate the attribute value early, so drop them.
    sanitized = content_to_add.replace('"', '')
    replacement_tag = f'<meta name="description" content="{sanitized}">'
    return html_content.replace(found.group(0), replacement_tag)
def extract_data_from_docx(file_path):
    """Split a .docx document into ``(title, body)`` articles.

    A paragraph whose ``alignment`` is ``WD_PARAGRAPH_ALIGNMENT.CENTER`` and
    whose text is not blank starts a new article and becomes its title.
    Every following paragraph, up to the next title, is appended to that
    article's body. Paragraphs that appear before the first title are
    ignored.

    Body paragraphs are rebuilt run by run, with lightweight markers around
    formatted runs: ``***text***`` for bold+italic, ``**text**`` for bold,
    ``*text*`` for italic.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        A list of ``(title, body)`` tuples, where ``body`` is a list of
        marked-up paragraph strings (possibly empty strings for blank
        paragraphs).
    """
    doc = Document(file_path)
    articles = []
    current_title = None
    current_body = []

    for para in doc.paragraphs:
        stripped_text = para.text.strip()
        # NOTE(review): this only sees alignment set on the paragraph itself;
        # alignment inherited from a style presumably reads as None - confirm.
        if para.alignment == WD_PARAGRAPH_ALIGNMENT.CENTER and stripped_text:
            if current_title:
                articles.append((current_title, current_body))
            current_body = []
            current_title = stripped_text
        elif current_title:
            pieces = []
            for run in para.runs:
                if not run.text:
                    # An empty formatted run would emit bare markers such as
                    # "****", which are later misparsed as markup.
                    continue
                if run.bold and run.italic:
                    pieces.append(f'***{run.text}***')
                elif run.bold:
                    pieces.append(f'**{run.text}**')
                elif run.italic:
                    pieces.append(f'*{run.text}*')
                else:
                    pieces.append(run.text)
            current_body.append("".join(pieces))

    # Flush the article that was still open when the document ended.
    if current_title:
        articles.append((current_title, current_body))
    return articles
def remove_diacritics(text):
    """Return *text* as transliterated by ``unidecode.unidecode``."""
    transliterated = unidecode.unidecode(text)
    return transliterated
def generate_filename(title):
    """Build an ASCII, hyphen-separated ``.html`` file name from *title*.

    The title is transliterated with ``remove_diacritics``, lower-cased, and
    every run of characters outside ``a-z``/``0-9`` is collapsed into a
    single hyphen; leading and trailing hyphens are removed.

    Args:
        title: Article title, possibly containing diacritics or punctuation.

    Returns:
        The slug followed by ``.html``.
    """
    # Lower-case AFTER transliteration: the transliterator may emit capital
    # letters for characters that have no lower-case form of their own, and
    # those capitals would otherwise be replaced by hyphens below.
    ascii_title = remove_diacritics(title).lower()
    # One pass is enough: existing hyphens are part of the "non-alphanumeric"
    # class, so mixed runs such as " - " collapse to a single hyphen.
    slug = re.sub(r'[^a-z0-9]+', '-', ascii_title).strip('-')
    return f"{slug}.html"
# Matches one marked-up segment. The triple-asterisk form must come first so
# that "***x***" is not consumed as "**" + "*x" + "**" followed by a stray "*".
_INLINE_MARKUP_RE = re.compile(r'(\*\*\*.+?\*\*\*|\*\*.*?\*\*|\*.*?\*)')


def _parse_inline_markup(paragraph):
    """Split *paragraph* into ``(style, text)`` pairs.

    ``style`` is one of ``'plain'``, ``'italic'``, ``'bold'`` or
    ``'bold_italic'``; ``text`` has its asterisk markers removed.
    """
    segments = []
    # re.split with one capturing group alternates: plain text at even
    # indexes, matched markup at odd indexes.
    for index, part in enumerate(_INLINE_MARKUP_RE.split(paragraph)):
        if index % 2 == 0:
            if part:
                segments.append(('plain', part))
        elif len(part) >= 7 and part.startswith('***') and part.endswith('***'):
            segments.append(('bold_italic', part[3:-3]))
        elif len(part) >= 4 and part.startswith('**'):
            segments.append(('bold', part[2:-2]))
        else:
            segments.append(('italic', part[1:-1]))
    return segments


def _render_segment(style, text, bold_paragraph):
    """Render one parsed segment as HTML.

    When *bold_paragraph* is true the enclosing ``<p>`` already carries the
    bold class, so bold segments are emitted without their own ``<span>``.
    """
    if style == 'plain':
        return text
    if style == 'italic':
        return f'<em>{text}</em>'
    inner = f'<em>{text}</em>' if style == 'bold_italic' else text
    if bold_paragraph:
        return inner
    return f'<span class="text_obisnuit2">{inner}</span>'


def format_body(body):
    """Convert marked-up paragraphs into HTML ``<p>`` elements.

    Each paragraph is stripped and rendered on its own line:

    * Paragraphs starting with ``"Leadership:"`` are emitted verbatim inside
      ``<p class="text_obisnuit2">``.
    * Paragraphs made up only of bold (or bold+italic) segments, optionally
      separated by whitespace, are emitted inside
      ``<p class="text_obisnuit2">`` without inner ``<span>`` wrappers.
    * All other paragraphs are emitted inside ``<p class="text_obisnuit">``
      with ``**bold**`` rendered as ``<span class="text_obisnuit2">``,
      ``*italic*`` as ``<em>`` and ``***both***`` as a span wrapping an
      ``<em>``.

    Args:
        body: Iterable of paragraph strings using ``*``/``**``/``***``
            markers.

    Returns:
        The concatenated HTML, one ``<p>...</p>`` plus newline per paragraph
        (an empty string for an empty *body*).
    """
    rendered = []
    for paragraph in body:
        paragraph = paragraph.strip()
        if paragraph.startswith("Leadership:"):
            rendered.append(f'<p class="text_obisnuit2">{paragraph}</p>\n')
            continue

        segments = _parse_inline_markup(paragraph)
        has_bold = any(style in ('bold', 'bold_italic') for style, _ in segments)
        # Decide "entirely bold" from the parsed segments instead of from the
        # first/last two characters, so "**a** mid **b**" is not mistaken for
        # a single bold span and sliced into unbalanced tags.
        only_bold = all(
            style in ('bold', 'bold_italic')
            or (style == 'plain' and not text.strip())
            for style, text in segments
        )
        bold_paragraph = has_bold and only_bold

        inner = "".join(
            _render_segment(style, text, bold_paragraph)
            for style, text in segments
        )
        css_class = "text_obisnuit2" if bold_paragraph else "text_obisnuit"
        rendered.append(f'<p class="{css_class}">{inner}</p>\n')
    return "".join(rendered)
def update_html_content(html_content, title, first_sentence, body, filename):
    """Fill the HTML template with one article's data.

    Replaces, in order: the ``On <date>, in`` phrase (with today's date), the
    ``<title>`` element, the ``<h1 class="den_articol" itemprop="name">``
    heading, every ``zzz.html`` placeholder, the description meta tag, and
    everything between the ``<!-- SASA-1 -->`` and ``<!-- SASA-2 -->``
    markers.

    Args:
        html_content: Template HTML text.
        title: Article title (used verbatim in the heading, transliterated
            in ``<title>``).
        first_sentence: Text for the description meta tag; emphasis markers
            and double quotes are removed before insertion.
        body: List of marked-up paragraphs, rendered with ``format_body``.
        filename: File name that replaces the ``zzz.html`` placeholder.

    Returns:
        The updated HTML text.
    """
    # Stamp the date on the pristine template first. Doing it after the
    # article is inserted would also rewrite any "On ..., in" phrase that
    # happens to occur in the title or body text.
    current_date = datetime.now().strftime("%B %d, %Y")
    date_phrase = f'On {current_date}, in'
    html_content = re.sub(r'On .*?, in', lambda _m: date_phrase, html_content)

    # All replacements below are passed as callables: a plain replacement
    # string would have its backslashes and group references (e.g. "\1",
    # "\d") interpreted by re.sub, corrupting or rejecting document text.
    title_without_diacritics = remove_diacritics(title)
    title_tag = f'<title>{title_without_diacritics} | Neculai Fantanaru (en)</title>'
    html_content = re.sub(r'<title>.*?</title>', lambda _m: title_tag, html_content)

    heading = f'<h1 class="den_articol" itemprop="name">{title}</h1>'
    html_content = re.sub(
        r'<h1 class="den_articol" itemprop="name">.*?</h1>',
        lambda _m: heading,
        html_content,
    )

    html_content = html_content.replace('zzz.html', filename)

    # The first paragraph still carries */**/*** emphasis markers and may
    # contain double quotes; neither belongs inside an attribute value.
    description = re.sub(r'\*+', '', first_sentence).replace('"', '').strip()
    meta_desc = f'<meta name="description" content="{description}">'
    html_content = re.sub(
        r'<meta name="description" content=".*?">',
        lambda _m: meta_desc,
        html_content,
    )

    formatted_body = format_body(body)
    body_section = f'<!-- SASA-1 -->\n{formatted_body}\n<!-- SASA-2 -->'
    html_content = re.sub(
        r'<!-- SASA-1 -->.*?<!-- SASA-2 -->',
        lambda _m: body_section,
        html_content,
        flags=re.DOTALL,
    )
    return html_content
def post_process_html(html_content):
    """Replace every non-breaking-space representation with a plain space.

    Three forms are normalised: the literal text ``NBSP``, the U+00A0
    character, and the ``&nbsp;`` HTML entity.

    Args:
        html_content: HTML text to clean up.

    Returns:
        The text with all three forms replaced by a regular space.
    """
    # Literal "NBSP" text -> space.
    html_content = html_content.replace("NBSP", " ")
    # Non-breaking space character (U+00A0) -> regular space.
    html_content = html_content.replace("\u00A0", " ")
    # HTML entity -> regular space. The entity must be spelled out here; a
    # literal that has itself been decoded to a space makes this a no-op.
    html_content = html_content.replace("&nbsp;", " ")
    return html_content
def main():
    """Generate one HTML file per article found in ``bebe.docx``.

    Reads the articles from ``bebe.docx``, fills the ``index.html`` template
    for each of them and writes the result to ``output/<slug>.html``.
    Progress and problems are reported on standard output; articles with an
    empty body are skipped with a warning.
    """
    docx_path = "bebe.docx"
    html_path = "index.html"
    output_dir = "output"

    if not os.path.exists(docx_path):
        print(f"Error: File '{docx_path}' not found.")
        return
    if not os.path.exists(html_path):
        print(f"Error: File '{html_path}' not found.")
        return
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    articles = extract_data_from_docx(docx_path)
    if not articles:
        print("No articles found in the document.")
        return

    # The template is the same for every article, so read it once instead of
    # re-opening the file on each loop iteration.
    with open(html_path, 'r', encoding='utf-8') as file:
        template_html = file.read()

    for title, body in articles:
        filename = generate_filename(title)
        print(f"Processing article: {title}")
        print(f"Generated filename: {filename}")

        if not body:
            print(f"Warning: Empty body for article '{title}'. Skipping.")
            continue

        updated_html = update_html_content(template_html, title, body[0], body, filename)
        # Apply post-processing right before saving.
        updated_html = post_process_html(updated_html)

        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(updated_html)
        print(f"Saved article as: {output_path}")

    print("All articles have been processed successfully.")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment