Skip to content

Instantly share code, notes, and snippets.

@me-suzy
Last active July 5, 2024 10:12
Show Gist options
  • Save me-suzy/7572a32ecb90c7faf3d159b16a362118 to your computer and use it in GitHub Desktop.
Save me-suzy/7572a32ecb90c7faf3d159b16a362118 to your computer and use it in GitHub Desktop.
iulia-FINAL-docx-FINAL-engleza
import os
import re
import unidecode
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from datetime import datetime
def add_content_to_meta(html_content, content_to_add):
    """Swap the text of the ``<meta name="description">`` tag in an HTML string.

    The first description tag found by the regex is used as the needle, and
    every identical copy of that tag in ``html_content`` is replaced. Double
    quotes are removed from ``content_to_add`` before it is placed inside the
    double-quoted ``content`` attribute.

    Args:
        html_content: HTML markup to search.
        content_to_add: New description text.

    Returns:
        The updated markup, or ``html_content`` unchanged when no description
        tag is found.
    """
    found = re.search(r'<meta name="description" content="(.*?)">', html_content)
    if not found:
        return html_content

    sanitized = content_to_add.replace('"', '')
    replacement_tag = f'<meta name="description" content="{sanitized}">'
    return html_content.replace(found.group(0), replacement_tag)
def _run_to_markup(run):
    """Return a run's text wrapped in asterisk markers for its bold/italic state."""
    if run.bold and run.italic:
        marker = '***'
    elif run.bold:
        marker = '**'
    elif run.italic:
        marker = '*'
    else:
        return run.text
    return f'{marker}{run.text}{marker}'


def extract_data_from_docx(file_path):
    """Split a .docx document into ``(title, body)`` articles.

    A centre-aligned paragraph with non-blank text starts a new article and
    supplies its title. Every following paragraph, until the next such title,
    becomes one body entry in which bold runs are wrapped in ``**``, italic
    runs in ``*`` and bold-italic runs in ``***``. Paragraphs that come before
    the first title are ignored.

    Args:
        file_path: Path of the .docx file, passed to ``Document``.

    Returns:
        A list of ``(title, paragraphs)`` tuples in document order, where
        ``paragraphs`` is a list of marked-up strings.
    """
    articles = []
    title = None
    body = []

    for paragraph in Document(file_path).paragraphs:
        is_centered = paragraph.alignment == WD_PARAGRAPH_ALIGNMENT.CENTER
        heading = paragraph.text.strip() if is_centered else ""

        if heading:
            # A new title closes the article collected so far.
            if title:
                articles.append((title, body))
                body = []
            title = heading
            continue

        if not title:
            continue
        body.append("".join(_run_to_markup(run) for run in paragraph.runs))

    # Flush the article that was still open when the document ended.
    if title:
        articles.append((title, body))
    return articles
def remove_diacritics(text):
    """Return ``text`` transliterated by ``unidecode.unidecode``.

    Args:
        text: The string to transliterate.

    Returns:
        Whatever ``unidecode.unidecode`` produces for ``text``.
    """
    transliterated = unidecode.unidecode(text)
    return transliterated
def generate_filename(title):
    """Build a lowercase, hyphen-separated ``.html`` file name from a title.

    Args:
        title: The article title.

    Returns:
        The slug followed by ``.html``. Each run of characters other than
        ``a-z`` and ``0-9`` becomes a single hyphen, and leading/trailing
        hyphens are removed.
    """
    # Transliterate BEFORE lower-casing: the transliteration step can emit
    # capital letters for non-Latin input, and every character outside
    # [a-z0-9] is turned into a hyphen below, so lower-casing first would
    # silently drop those letters from the file name.
    ascii_title = remove_diacritics(title).lower()
    # One pass is enough: a run of mixed hyphens and other separators
    # collapses into a single hyphen.
    slug = re.sub(r'[^a-z0-9]+', '-', ascii_title).strip('-')
    return f"{slug}.html"
# Inline emphasis markers: ***bold italic***, **bold**, *italic*. The
# three-star alternative comes first so bold-italic text is not mis-read as
# bold text that starts with a stray asterisk.
_INLINE_MARKUP = re.compile(
    r'\*\*\*(?P<both>.*?)\*\*\*'
    r'|\*\*(?P<bold>.*?)\*\*'
    r'|\*(?P<italic>.*?)\*'
)


def _split_inline_markup(paragraph):
    """Split a paragraph into ``(kind, text)`` tokens: plain, bold, italic or both."""
    tokens = []
    cursor = 0
    for match in _INLINE_MARKUP.finditer(paragraph):
        if match.start() > cursor:
            tokens.append(('plain', paragraph[cursor:match.start()]))
        kind = match.lastgroup
        tokens.append((kind, match.group(kind)))
        cursor = match.end()
    if cursor < len(paragraph):
        tokens.append(('plain', paragraph[cursor:]))
    return tokens


def _is_entirely_bold(tokens):
    """Return True when all non-whitespace text in the tokens is bold."""
    has_bold = False
    for kind, text in tokens:
        if kind in ('bold', 'both'):
            has_bold = True
        elif kind == 'italic' or text.strip():
            return False
    return has_bold


def _render_token(kind, text, inside_bold_paragraph=False):
    """Render one token as HTML; bold needs no span inside an all-bold paragraph."""
    if kind == 'italic':
        return f'<em>{text}</em>'
    if kind == 'both':
        emphasised = f'<em>{text}</em>'
        if inside_bold_paragraph:
            return emphasised
        return f'<span class="text_obisnuit2">{emphasised}</span>'
    if kind == 'bold' and not inside_bold_paragraph:
        return f'<span class="text_obisnuit2">{text}</span>'
    return text


def format_body(body):
    """Convert marked-up paragraphs into ``<p>`` elements.

    Rules, applied to each paragraph after stripping surrounding whitespace:

    * A paragraph starting with ``Leadership:`` is emitted verbatim inside
      ``<p class="text_obisnuit2">``.
    * A paragraph whose non-whitespace text is entirely bold is emitted inside
      ``<p class="text_obisnuit2">`` without inner bold spans.
    * Any other paragraph is emitted inside ``<p class="text_obisnuit">`` with
      ``**bold**`` rendered as ``<span class="text_obisnuit2">``, ``*italic*``
      as ``<em>`` and ``***both***`` as an ``<em>`` nested in that span.

    Args:
        body: Iterable of paragraph strings using asterisk emphasis markers.

    Returns:
        The concatenated HTML, one ``<p>...</p>`` per paragraph, each followed
        by a newline. An empty ``body`` yields an empty string.
    """
    rendered = []
    for paragraph in body:
        paragraph = paragraph.strip()
        if paragraph.startswith("Leadership:"):
            rendered.append(f'<p class="text_obisnuit2">{paragraph}</p>\n')
            continue

        tokens = _split_inline_markup(paragraph)
        # Decide from the parsed tokens, not from the first/last characters:
        # "**a** mid **b**" starts and ends with "**" but is not fully bold.
        all_bold = _is_entirely_bold(tokens)
        css_class = "text_obisnuit2" if all_bold else "text_obisnuit"
        inner = "".join(_render_token(kind, text, all_bold) for kind, text in tokens)
        rendered.append(f'<p class="{css_class}">{inner}</p>\n')
    return "".join(rendered)
def _plain_description(text):
    """Reduce a marked-up paragraph to text usable in a double-quoted attribute."""
    # Drop asterisk emphasis markers (*, ** or ***) but keep the text inside.
    text = re.sub(r'(\*{1,3})(.*?)\1', r'\2', text)
    # Same rule as add_content_to_meta: a double quote would end the
    # content="..." attribute early.
    return text.replace('"', '')


def update_html_content(html_content, title, first_sentence, body, filename):
    """Fill the HTML template with one article.

    Replaces, in this order: the ``On <date>, in`` stamp (with today's date),
    the ``<title>`` element, the ``<h1 class="den_articol">`` heading, every
    ``zzz.html`` placeholder, the meta description, and everything between the
    ``<!-- SASA-1 -->`` and ``<!-- SASA-2 -->`` markers.

    Args:
        html_content: Template markup.
        title: Article title; used as-is in the heading and transliterated
            via ``remove_diacritics`` in ``<title>``.
        first_sentence: Text for the meta description; emphasis markers and
            double quotes are removed before insertion.
        body: Paragraph strings passed to ``format_body``.
        filename: File name substituted for ``zzz.html``.

    Returns:
        The filled-in markup.
    """
    # Stamp the date first, while only template text is present. Running this
    # pattern after the article is inserted would also rewrite sentences in
    # the article itself, such as "On the other hand, in ...".
    current_date = datetime.now().strftime("%B %d, %Y")
    html_content = re.sub(r'On .*?, in', f'On {current_date}, in', html_content)

    # Each replacement below is passed as a callable so the inserted text is
    # used literally. A plain string would be treated as a replacement
    # template, where a backslash in the article text is an escape sequence.
    title_without_diacritics = remove_diacritics(title)
    page_title = f'<title>{title_without_diacritics} | Neculai Fantanaru (en)</title>'
    html_content = re.sub(r'<title>.*?</title>', lambda _m: page_title, html_content)

    heading = f'<h1 class="den_articol" itemprop="name">{title}</h1>'
    html_content = re.sub(
        r'<h1 class="den_articol" itemprop="name">.*?</h1>',
        lambda _m: heading,
        html_content,
    )

    html_content = html_content.replace('zzz.html', filename)

    meta_desc = f'<meta name="description" content="{_plain_description(first_sentence)}">'
    html_content = re.sub(
        r'<meta name="description" content=".*?">',
        lambda _m: meta_desc,
        html_content,
    )

    formatted_body = format_body(body)
    article = f'<!-- SASA-1 -->\n{formatted_body}\n<!-- SASA-2 -->'
    html_content = re.sub(
        r'<!-- SASA-1 -->.*?<!-- SASA-2 -->',
        lambda _m: article,
        html_content,
        flags=re.DOTALL,
    )
    return html_content
def post_process_html(html_content):
    """Replace every non-breaking-space spelling with a regular space.

    Three spellings are handled, in this order: the literal text ``NBSP``,
    the U+00A0 character, and the ``&nbsp;`` HTML entity.

    Args:
        html_content: Markup to clean.

    Returns:
        The markup with each occurrence replaced by a single ordinary space.
    """
    for nbsp_spelling in ("NBSP", "\u00A0", "&nbsp;"):
        html_content = html_content.replace(nbsp_spelling, " ")
    return html_content
def main():
    """Generate one HTML page per article found in ``bebe.docx``.

    Reads the ``index.html`` template, fills it for every article extracted
    from the document, and writes each result into the ``output`` directory
    under a file name derived from the article title. Progress and problems
    are reported with ``print``. The function returns early when an input
    file is missing or the document contains no articles.
    """
    docx_path = "bebe.docx"
    html_path = "index.html"
    output_dir = "output"

    for required_path in (docx_path, html_path):
        if not os.path.exists(required_path):
            print(f"Error: File '{required_path}' not found.")
            return

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    articles = extract_data_from_docx(docx_path)
    if not articles:
        print("No articles found in the document.")
        return

    # The template is the same for every article, so read it once instead of
    # re-opening the file on each loop iteration.
    with open(html_path, 'r', encoding='utf-8') as file:
        template_html = file.read()

    for title, body in articles:
        filename = generate_filename(title)
        print(f"Processing article: {title}")
        print(f"Generated filename: {filename}")

        # The description comes from the first paragraph that has text; a
        # blank line right after the title would otherwise yield an empty
        # description. A body with no text at all is treated as empty.
        first_sentence = next((text for text in body if text.strip()), None)
        if first_sentence is None:
            print(f"Warning: Empty body for article '{title}'. Skipping.")
            continue

        updated_html = update_html_content(template_html, title, first_sentence, body, filename)
        # Apply post-processing right before saving.
        updated_html = post_process_html(updated_html)

        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(updated_html)
        print(f"Saved article as: {output_path}")

    print("All articles have been processed successfully.")
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment