Skip to content

Instantly share code, notes, and snippets.

@thibaudcolas
Last active July 2, 2024 16:45
Show Gist options
  • Save thibaudcolas/cb8e644abda4c6b7f0298fa917d8a3bc to your computer and use it in GitHub Desktop.
Save thibaudcolas/cb8e644abda4c6b7f0298fa917d8a3bc to your computer and use it in GitHub Desktop.
Real-world example of Wagtail structured content importing

A custom parser for wagtail-content-import. It uses mammoth to convert the .docx file to HTML and then parses that HTML, rather than reading the document with python-docx.

Settings:

WAGTAILCONTENTIMPORT_DOCX_PARSER = "content_import_parsers.DocxHTMLParser"

Mapper to a page’s body StreamBlock:

class BodyStreamBLockContentMapper(streamfield.StreamFieldMapper):
    """Map parsed document elements onto a page's body StreamBlock.

    Each attribute name below is one of the element "type" values that
    DocxHTMLParser.parse() emits ("heading", "subheading", "html", "image",
    "embed"), and is assigned the converter that handles that element type.
    """

    # NOTE(review): the string passed to each converter is presumably the name
    # of the target block in the StreamBlock - confirm against the page model.
    heading = converters.TextConverter("heading")
    subheading = converters.TextConverter("subheading")
    # Rich-text HTML runs emitted as "html" elements go to the "paragraph" block.
    html = converters.RichTextConverter("paragraph")
    # Project-specific converters (custom_converters is not shown in this file).
    image = custom_converters.ImageBlockConverter("image")
    embed = custom_converters.EmbedConverter("embed")
import bs4
import mammoth
from django.core import exceptions, validators
from wagtail.embeds import embeds
from wagtail.embeds import exceptions as embed_exceptions
from wagtail_content_import.parsers import base as base_parser
class DocxHTMLParser(base_parser.DocumentParser):
    """Parse a .docx document into wagtail-content-import stream elements.

    The document is converted to HTML with mammoth, and every top-level node
    of the resulting ``<body>`` becomes (part of) an element dict:

    * the first non-empty ``<h1>`` is used as the page title;
    * later ``<h1>`` and all ``<h2>`` become ``"heading"`` elements;
    * ``<h3>``-``<h6>`` become ``"subheading"`` elements;
    * ``<img>`` tags become ``"image"`` elements;
    * a node whose text is a single embeddable URL becomes an ``"embed"``;
    * any other node with text is accumulated into an ``"html"`` element.
    """

    _HEADING_TAGS = ("h1", "h2")
    _SUBHEADING_TAGS = ("h3", "h4", "h5", "h6")
    _URL_PREFIXES = ("http:", "https:")

    def __init__(self, document):
        """Store the uploaded document so that ``parse()`` can read it."""
        self.document = document

    def close_paragraph(self, block, stream_data):
        """Flush the pending rich-text fragments into one ``"html"`` element.

        Args:
            block: list of HTML strings accumulated so far; emptied in place.
            stream_data: list of element dicts the joined HTML is appended to.
        """
        if block:
            stream_data.append({"type": "html", "value": "".join(block)})
        block.clear()

    @staticmethod
    def _image_element(img):
        """Build the ``"image"`` element dict for an ``<img>`` tag."""
        return {
            "type": "image",
            "value": img.get("src"),
            "title": img.get("alt", ""),
        }

    @staticmethod
    def _find_embed(url):
        """Return the embed for ``url``, or ``None`` if it cannot be embedded."""
        try:
            validators.URLValidator()(url)
            # get_embed raises EmbedException subclasses for URLs that no
            # provider supports; treat those like "no embed" so that the text
            # is kept as a paragraph instead of aborting the whole import.
            return embeds.get_embed(url)
        except (exceptions.ValidationError, embed_exceptions.EmbedException):
            return None

    def _append_text_or_embed(self, tag, block, stream_data):
        """Emit an embed for a URL-only node, else buffer the node's HTML."""
        # Strip before testing the prefix so that surrounding whitespace does
        # not prevent a URL from being recognised.
        url = tag.text.strip()
        if url.startswith(self._URL_PREFIXES):
            embed = self._find_embed(url)
            if embed:
                self.close_paragraph(block, stream_data)
                stream_data.append({"type": "embed", "value": embed})
                return
        # Not a URL, or a URL without an embed: keep the content as rich text.
        block.append(str(tag))

    def parse(self):
        """Convert the document and return its title and stream elements.

        Returns:
            A dict ``{"title": str, "elements": list}`` where ``elements`` is
            the ordered list of element dicts described on the class.
        """
        html = mammoth.convert_to_html(self.document).value
        soup = bs4.BeautifulSoup(html, "html5lib")

        # Remove all inline styles and classes
        for node in soup.body.descendants:
            if isinstance(node, bs4.Tag):
                for attr in ("class", "style"):
                    node.attrs.pop(attr, None)

        title = ""
        stream_data = []
        # Run through contents and populate stream
        current_paragraph_block = []

        for tag in soup.body.contents:
            if isinstance(tag, bs4.NavigableString):
                text = str(tag)
                # Whitespace between tags carries no content; skip it rather
                # than emitting empty "html" elements.
                if text.strip():
                    # Flush first so the text keeps its position relative to
                    # the paragraphs buffered before it.
                    self.close_paragraph(current_paragraph_block, stream_data)
                    stream_data.append({"type": "html", "value": text})
                continue

            if tag.name == "h1" and not title:
                # The first non-empty <h1> is the page title, not an element.
                title = tag.text
            elif tag.name in self._HEADING_TAGS:
                self.close_paragraph(current_paragraph_block, stream_data)
                stream_data.append({"type": "heading", "value": tag.text})
            elif tag.name in self._SUBHEADING_TAGS:
                self.close_paragraph(current_paragraph_block, stream_data)
                stream_data.append({"type": "subheading", "value": tag.text})
            elif tag.name == "img":
                # Break the paragraph and add an image
                self.close_paragraph(current_paragraph_block, stream_data)
                stream_data.append(self._image_element(tag))
            elif tag.text:
                self._append_text_or_embed(
                    tag, current_paragraph_block, stream_data
                )

            # Images nested in any tag (including ones without text, which the
            # chain above ignores) are emitted as separate elements.
            # NOTE(review): when the tag also had text, its buffered HTML still
            # contains these <img> tags - confirm the rich-text converter
            # handles or strips them.
            images = tag.find_all("img")
            if images:
                # Break the paragraph and add images
                self.close_paragraph(current_paragraph_block, stream_data)
                stream_data.extend(self._image_element(img) for img in images)

        self.close_paragraph(current_paragraph_block, stream_data)
        return {"title": title, "elements": stream_data}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment