Skip to content

Instantly share code, notes, and snippets.

@reZach
Last active March 19, 2021 01:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save reZach/bc0cbadd95584fa205b5b443a554c8bc to your computer and use it in GitHub Desktop.
Save reZach/bc0cbadd95584fa205b5b443a554c8bc to your computer and use it in GitHub Desktop.
C# Merging Runs in a Microsoft Word.docx file, and converting it to HTML
// Cleans up adjacent runs that can be merged, e.g. turns:
/*
<w:p w:rsidR="00D242F1" w:rsidP="00D242F1" w:rsidRDefault="005F6285" w14:paraId="66169407" w14:textId="101467E0"
xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml">
<w:pPr>
<w:spacing w:after="0" w:line="240" w:lineRule="auto" />
<w:jc w:val="center" />
<w:rPr>
<w:rFonts w:ascii="Garamond" w:hAnsi="Garamond" />
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:ascii="Garamond" w:hAnsi="Garamond" />
</w:rPr>
<w:t>hey, I</w:t>
</w:r>
<w:r>
<w:rPr>
<w:rFonts w:ascii="Garamond" w:hAnsi="Garamond" />
</w:rPr>
<w:t> am split and don't need to be!</w:t>
</w:r>
</w:p>
into
<w:p w:rsidR="00D242F1" w:rsidP="00D242F1" w:rsidRDefault="005F6285" w14:paraId="66169407" w14:textId="101467E0"
xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml">
<w:pPr>
<w:spacing w:after="0" w:line="240" w:lineRule="auto" />
<w:jc w:val="center" />
<w:rPr>
<w:rFonts w:ascii="Garamond" w:hAnsi="Garamond" />
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:ascii="Garamond" w:hAnsi="Garamond" />
</w:rPr>
<w:t>hey, I am split and don't need to be!</w:t>
</w:r>
</w:p>
*/
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using System;
using System.Collections.Generic;
using System.Linq;
namespace openxml
{
public class Program
{
/// <summary>
/// Program entry point: processes a single Word document via <see cref="FillDocument"/>.
/// </summary>
/// <param name="args">
/// Command-line arguments. When the first argument is present and non-blank it is used as the
/// path of the .docx file; otherwise the default "Official.docx" is used.
/// </param>
public static void Main(string[] args)
{
    // The path used to be hard-coded; it is now only the fallback so that the
    // tool can be pointed at any document without recompiling.
    string filename = args != null && args.Length > 0 && !string.IsNullOrWhiteSpace(args[0])
        ? args[0]
        : "Official.docx";

    FillDocument(filename);
}
/// <summary>
/// Opens the given Word document in editable mode, merges its mergeable runs
/// and then runs the HTML conversion on it.
/// </summary>
/// <param name="filename">Path of the .docx file to open.</param>
/// <remarks>
/// The HTML string produced by <see cref="ConvertDocxToHTML"/> is not used by this method.
/// </remarks>
public static void FillDocument(string filename)
{
    // Disposed automatically when the method exits.
    using WordprocessingDocument package = WordprocessingDocument.Open(filename, true);

    MergeRuns(package);
    _ = ConvertDocxToHTML(package);
}
/// <summary>
/// Renders the top-level paragraphs and tables of a Word document as an HTML page.
/// </summary>
/// <param name="document">The opened Word document to render.</param>
/// <returns>
/// A complete HTML document string. When the package has no main part or no body,
/// the page is returned with an empty <c>body</c> element.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
public static string ConvertDocxToHTML(WordprocessingDocument document)
{
    if (document == null)
        throw new ArgumentNullException(nameof(document));

    var html = new System.Text.StringBuilder(@"
<!DOCTYPE html>
<html lang='en-US'>
<head></head>
<body>");

    // Ask for the body explicitly: the first child of the document element is
    // not guaranteed to be the body (e.g. a w:background element may precede it).
    Body body = document.MainDocumentPart?.Document?.Body;
    if (body != null)
    {
        foreach (OpenXmlElement xmlElement in body.ChildElements)
        {
            if (xmlElement is Paragraph paragraph)
                html.Append(RenderParagraph(paragraph));
            else if (xmlElement is Table table)
                html.Append(RenderTable(table));
            // Any other body-level element is not rendered.
        }
    }

    html.Append(@"</body></html>");
    return html.ToString();
}
/// <summary>
/// Merges mergeable runs in every top-level paragraph of the document body and in every
/// paragraph that is a direct child of a cell of a top-level table, then saves the
/// main document part.
/// </summary>
/// <param name="document">The Word document, opened for editing.</param>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
public static void MergeRuns(WordprocessingDocument document)
{
    if (document == null)
        throw new ArgumentNullException(nameof(document));

    Document mainDocument = document.MainDocumentPart?.Document;

    // Use the Body property rather than ChildElements[0]: the body is not
    // necessarily the first child of the document element.
    Body body = mainDocument?.Body;
    if (body == null)
        return; // Nothing to merge, nothing to save.

    // For each element in the body
    foreach (OpenXmlElement xmlElement in body.ChildElements)
    {
        if (xmlElement is Paragraph paragraph)
        {
            MergeRunsParagraph(paragraph);
        }
        else if (xmlElement is Table table)
        {
            // Merge runs within ALL paragraphs of each cell (previously only the
            // first paragraph of a cell was processed, although RenderTable
            // renders every paragraph of the cell).
            foreach (TableRow tableRow in table.Elements<TableRow>().ToList())
            {
                foreach (TableCell tableCell in tableRow.Elements<TableCell>().ToList())
                {
                    foreach (Paragraph cellParagraph in tableCell.Elements<Paragraph>().ToList())
                    {
                        MergeRunsParagraph(cellParagraph);
                    }
                }
            }
        }
    }

    // Save updates
    mainDocument.Save();
}
/// <summary>
/// Finds each group of directly adjacent runs in a paragraph and hands every group of
/// two or more runs to <see cref="MergeRunsParagraphRuns"/>.
/// </summary>
/// <param name="paragraph">The paragraph whose child runs are examined.</param>
/// <exception cref="ArgumentNullException"><paramref name="paragraph"/> is null.</exception>
public static void MergeRunsParagraph(Paragraph paragraph)
{
    if (paragraph == null)
        throw new ArgumentNullException(nameof(paragraph));

    // Iterate over a snapshot: merging removes and adds children of the paragraph,
    // and walking the live child list while it changes would revisit the freshly
    // created run and merge it again with later runs.
    List<OpenXmlElement> children = paragraph.ChildElements.ToList();
    List<Run> consecutiveRuns = new List<Run>();

    foreach (OpenXmlElement element in children)
    {
        if (element is Run run)
        {
            consecutiveRuns.Add(run);
            continue;
        }

        // A non-run element ends the current group.
        if (consecutiveRuns.Count > 1)
            MergeRunsParagraphRuns(consecutiveRuns, paragraph);

        // Always start a new group here, even if the previous one held a single
        // run; otherwise runs separated by another element would be treated as
        // adjacent and merged across it.
        consecutiveRuns.Clear();
    }

    // Merge the trailing group, if any.
    if (consecutiveRuns.Count > 1)
        MergeRunsParagraphRuns(consecutiveRuns, paragraph);
}
/// <summary>
/// Replaces the given runs with one run holding their combined text, provided all of
/// them use the same ASCII font (compared case-insensitively; a missing font only
/// matches another missing font).
/// </summary>
/// <param name="paragraphRuns">The adjacent runs to merge. Fewer than two runs is a no-op.</param>
/// <param name="root">The paragraph that receives the merged run.</param>
/// <remarks>
/// The merged run carries a copy of the first run's properties and a single text element.
/// Breaks are flattened to the literal text "&lt;br /&gt;". Run children other than
/// text and break elements are not carried over.
/// </remarks>
/// <exception cref="ArgumentNullException">
/// <paramref name="paragraphRuns"/> or <paramref name="root"/> is null.
/// </exception>
public static void MergeRunsParagraphRuns(List<Run> paragraphRuns, Paragraph root)
{
    if (paragraphRuns == null)
        throw new ArgumentNullException(nameof(paragraphRuns));
    if (root == null)
        throw new ArgumentNullException(nameof(root));

    // A single run (or none) has nothing to be merged with.
    if (paragraphRuns.Count < 2)
        return;

    Run firstRun = paragraphRuns[0];
    RunProperties firstProperties = firstRun.GetFirstChild<RunProperties>();

    // Compare properties of runs; only merge when they match.
    // todo - should do a deep compare here; for now only the ASCII font is compared.
    // Equality is transitive, so comparing every run with the first one is
    // equivalent to comparing all pairs.
    string firstFont = GetAsciiFontName(firstProperties);
    foreach (Run run in paragraphRuns.Skip(1))
    {
        string font = GetAsciiFontName(run.GetFirstChild<RunProperties>());
        if (!string.Equals(firstFont, font, StringComparison.OrdinalIgnoreCase))
            return;
    }

    // Save off combined text
    var combinedText = new System.Text.StringBuilder();
    foreach (Run run in paragraphRuns)
    {
        foreach (OpenXmlElement runChild in run.ChildElements)
        {
            if (runChild is Break)
                combinedText.Append("<br />");
            else if (runChild is Text text)
                // Append this text element only. Appending run.InnerText here would
                // repeat the whole run's text once per text element in the run.
                combinedText.Append(text.Text);
        }
    }

    // Create new run with properties
    Run mergedRun = new Run();
    if (firstProperties != null)
        mergedRun.AppendChild((RunProperties)firstProperties.Clone());

    // xml:space="preserve" keeps leading/trailing blanks of the merged text.
    mergedRun.AppendChild(new Text(combinedText.ToString()) { Space = SpaceProcessingModeValues.Preserve });

    // Put the merged run where the first original run was, so the order of the
    // paragraph's content is kept. This must happen before the runs are removed.
    if (ReferenceEquals(firstRun.Parent, root))
        root.InsertBefore(mergedRun, firstRun);
    else
        root.AddChild(mergedRun);

    // Remove runs
    foreach (Run run in paragraphRuns)
        run.Remove();
}

// Returns the ASCII font name of the given run properties, or null when the
// properties, their fonts element or the ascii attribute are missing.
private static string GetAsciiFontName(RunProperties properties)
{
    return properties?.RunFonts?.Ascii?.Value;
}
/// <summary>
/// Renders a Word paragraph as an HTML <c>p</c> element.
/// </summary>
/// <param name="paragraph">The paragraph to render.</param>
/// <returns>
/// A fixed empty placeholder paragraph when the paragraph has no direct child runs;
/// otherwise a <c>p</c> element containing the HTML-encoded text of all direct child runs
/// (bold runs wrapped in <c>strong</c>), styled with the paragraph's alignment and
/// spacing and the first run's ASCII font.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="paragraph"/> is null.</exception>
public static string RenderParagraph(Paragraph paragraph)
{
    if (paragraph == null)
        throw new ArgumentNullException(nameof(paragraph));

    ParagraphProperties paragraphProperties = paragraph.GetFirstChild<ParagraphProperties>();
    List<Run> runs = paragraph.Elements<Run>().ToList();

    // If we have no text in the paragraph,
    // render an empty paragraph
    if (runs.Count == 0)
        return "<p style='margin:0px;height:16px;'></p>";

    List<string> styles = new List<string>();

    // Paragraph properties
    string textAlign = paragraphProperties?.Justification?.Val ?? string.Empty;
    // Word calls full justification "both"; the CSS keyword is "justify".
    if (string.Equals(textAlign, "both", StringComparison.OrdinalIgnoreCase))
        textAlign = "justify";
    if (!string.IsNullOrEmpty(textAlign))
        styles.Add($"text-align:{textAlign}");

    string marginTop = paragraphProperties?.SpacingBetweenLines?.Before ?? string.Empty;
    string marginBottom = paragraphProperties?.SpacingBetweenLines?.After ?? string.Empty;
    // No "before" spacing set -> explicit zero top margin.
    if (string.IsNullOrEmpty(marginTop))
        styles.Add($"margin-top:0px");
    // Only an "after" spacing of exactly "0" is translated.
    if (string.Equals(marginBottom, "0", StringComparison.OrdinalIgnoreCase))
        styles.Add($"margin-bottom:0px");

    // Run properties. Runs are allowed to have no properties / fonts at all,
    // so every step is null-checked. The font of the first run is used for the
    // whole paragraph.
    string fontFamily = runs[0].GetFirstChild<RunProperties>()?.RunFonts?.Ascii?.Value;
    if (!string.IsNullOrEmpty(fontFamily))
        styles.Add($"font-family:'{fontFamily}'");

    // Render every run, not just the first one, so that text in runs that could
    // not be merged is not dropped.
    var content = new System.Text.StringBuilder();
    foreach (Run run in runs)
    {
        // Encode the document text so characters such as '<' and '&' cannot break
        // the markup. The run-merging step stores line breaks as the literal text
        // "<br />", so that marker is turned back into a real tag afterwards.
        string runText = System.Net.WebUtility.HtmlEncode(run.InnerText)
            .Replace("&lt;br /&gt;", "<br />");

        // <w:b/> means bold, but <w:b w:val="0"/> explicitly switches bold off.
        Bold bold = run.GetFirstChild<RunProperties>()?.Bold;
        bool isBold = bold != null && (bold.Val?.Value ?? true);

        content.Append(isBold ? AddTag("strong", runText) : runText);
    }
    string text = content.ToString();

    return @$"<p style=""{string.Join(';', styles)}"">
{text}
</p>";
}
/// <summary>
/// Renders a Word table as an HTML <c>table</c> element, one <c>tr</c> per table row and
/// one <c>td</c> per cell, each cell containing its rendered paragraphs.
/// </summary>
/// <param name="table">The table to render.</param>
/// <returns>
/// The HTML for the table. A cell gets a percentage <c>width</c> style when a width is
/// known for the grid column at the same index; otherwise it is emitted without a style.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="table"/> is null.</exception>
public static string RenderTable(Table table)
{
    if (table == null)
        throw new ArgumentNullException(nameof(table));

    // Only real grid columns are considered; the grid may hold other child
    // elements, and the grid itself may be missing.
    TableGrid tableGrid = table.GetFirstChild<TableGrid>();
    List<decimal> gridWidths = (tableGrid?.Elements<GridColumn>() ?? Enumerable.Empty<GridColumn>())
        .Select(gridColumn => ParseTableMeasure(gridColumn.Width?.Value))
        .ToList();

    // Tables frequently declare no usable width (e.g. w:w="0" with type "auto").
    // Fall back to the sum of the grid columns instead of dividing by zero.
    // NOTE(review): a table width given in a unit other than the grid's (e.g. type
    // "pct") is still used as-is, as before - confirm whether that needs handling.
    decimal tableWidth = ParseTableMeasure(table.GetFirstChild<TableProperties>()?.TableWidth?.Width?.Value);
    if (tableWidth <= 0)
        tableWidth = gridWidths.Sum();

    // One entry per grid column (null = width unknown) so that the index of a
    // cell always lines up with the index of its column.
    List<int?> cellWidths = gridWidths
        .Select(width => width > 0 && tableWidth > 0
            ? (int?)(int)Math.Floor((width / tableWidth) * 100m)
            : null)
        .ToList();

    List<string> tableRows = new List<string>();
    foreach (TableRow tr in table.Elements<TableRow>())
    {
        var row = new System.Text.StringBuilder("<tr>");
        int columnIndex = 0;
        foreach (TableCell tableCell in tr.Elements<TableCell>())
        {
            // A row may contain more cells than there are known column widths.
            int? width = columnIndex < cellWidths.Count ? cellWidths[columnIndex] : null;
            row.Append(width.HasValue ? $"<td style=\"width:{width.Value}%\">" : "<td>");

            foreach (Paragraph tcParagraph in tableCell.Elements<Paragraph>())
                row.Append(RenderParagraph(tcParagraph));

            row.Append("</td>");
            columnIndex++;
        }
        row.Append("</tr>");
        tableRows.Add(row.ToString());
    }

    return @$"<table>
<tbody>
{string.Join('\n', tableRows)}
</tbody>
</table>";
}

// Parses a width attribute value culture-independently; returns 0 when the value
// is missing or not a number.
private static decimal ParseTableMeasure(string value)
{
    return decimal.TryParse(
        value,
        System.Globalization.NumberStyles.Number,
        System.Globalization.CultureInfo.InvariantCulture,
        out decimal parsed)
        ? parsed
        : 0m;
}
/// <summary>
/// Wraps <paramref name="source"/> in an opening and closing HTML tag named <paramref name="tag"/>.
/// </summary>
/// <param name="tag">The tag name, without angle brackets.</param>
/// <param name="source">The content placed between the two tags.</param>
/// <returns>The string <c>&lt;tag&gt;source&lt;/tag&gt;</c>.</returns>
private static string AddTag(string tag, string source)
    => string.Concat("<", tag, ">") + source + string.Concat("</", tag, ">");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment