Skip to content

Instantly share code, notes, and snippets.

@ryanohs
Last active August 2, 2021 02:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ryanohs/795caa5e3be0f8c9d9e0c6d1efd88989 to your computer and use it in GitHub Desktop.
Save ryanohs/795caa5e3be0f8c9d9e0c6d1efd88989 to your computer and use it in GitHub Desktop.
/// <summary>
/// Converts a Word (.docx) document into a tab-indented, bullet-prefixed plain-text outline.
/// </summary>
/// <remarks>
/// Paragraphs are arranged into a tree according to their style: <c>Heading1</c> is level 1,
/// <c>Heading2</c> level 2, <c>Normal</c> level 3 and <c>ListParagraph</c> level 4 plus the
/// paragraph's list indent level. A paragraph with a positive <c>IndentationBefore</c> is not
/// a node of its own; its text is appended as block text to the node that precedes it.
/// The tree is then written depth-first, one line per node, indented with one tab per tree depth.
/// </remarks>
public class DocxToJournalTransformer
{
    /// <summary>
    /// Sentinel returned by <see cref="GetIndentLevel"/> for a paragraph that continues the
    /// previous node as block text instead of starting a new node.
    /// </summary>
    private const int BlockTextIndent = -2;

    /// <summary>
    /// Loads the document at <paramref name="docxFilename"/> and writes its outline to
    /// <paramref name="outputStream"/> as UTF-8 text.
    /// </summary>
    /// <param name="docxFilename">Path of the .docx file to read.</param>
    /// <param name="outputStream">Destination stream. It is left open when this method returns.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="docxFilename"/> or <paramref name="outputStream"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="NotSupportedException">
    /// A non-empty paragraph uses a style other than Heading1, Heading2, Normal or ListParagraph.
    /// </exception>
    public void Transform(string docxFilename, Stream outputStream)
    {
        // Fail before parsing the document rather than after all the work has been done.
        if (docxFilename == null)
        {
            throw new ArgumentNullException(nameof(docxFilename));
        }

        if (outputStream == null)
        {
            throw new ArgumentNullException(nameof(outputStream));
        }

        var root = BuildTree(docxFilename);

        // leaveOpen: the stream belongs to the caller, who may want to rewind and read it.
        using (var writer = new StreamWriter(outputStream, Encoding.UTF8, bufferSize: 1024, leaveOpen: true))
        {
            // The root is a synthetic container, so it starts one level above the first real node.
            WriteNode(root, -1, writer);
        }
    }

    /// <summary>
    /// Reads every non-blank paragraph of the document and returns the synthetic root of the outline tree.
    /// </summary>
    private Node BuildTree(string docxFilename)
    {
        var root = new Node();

        using (var document = DocX.Load(docxFilename))
        {
            var lastNode = root;

            foreach (var p in document.Paragraphs)
            {
                if (string.IsNullOrWhiteSpace(p.Text))
                {
                    continue;
                }

                var currIndent = GetIndentLevel(p);

                if (currIndent == BlockTextIndent)
                {
                    // Continuation text: accumulate on the most recent node, one line per paragraph.
                    lastNode.BlockText = string.IsNullOrEmpty(lastNode.BlockText)
                        ? p.Text
                        : lastNode.BlockText + "\n" + p.Text;
                    continue;
                }

                var parent = GetParent(currIndent, lastNode);
                var node = new Node
                {
                    // The paragraph element's "paraId" attribute, when present; otherwise null.
                    Id = p.Xml.Attributes().FirstOrDefault(a => a.Name.LocalName == "paraId")?.Value,
                    Parent = parent,
                    Indent = currIndent,
                    Text = p.Text,
                    Type = GetNodeType(p),
                    StyleId = p.StyleId
                };

                parent.Children.Add(node);
                lastNode = node;
            }
        }

        return root;
    }

    /// <summary>
    /// Classifies a paragraph by highlight colour: any green run makes it a task, otherwise any
    /// yellow run makes it an inspiration, otherwise it is a plain note.
    /// </summary>
    private NodeType GetNodeType(Paragraph paragraph)
    {
        // Read the property once; NOTE(review): the getter may rebuild the run list on each access.
        var runs = paragraph.MagicText;
        if (runs == null)
        {
            return NodeType.Note;
        }

        if (runs.Any(t => t.formatting?.Highlight == Highlight.green))
        {
            return NodeType.Task;
        }

        if (runs.Any(t => t.formatting?.Highlight == Highlight.yellow))
        {
            return NodeType.Inspiration;
        }

        return NodeType.Note;
    }

    /// <summary>
    /// Walks up from <paramref name="lastNode"/> to the nearest ancestor whose indent is strictly
    /// smaller than <paramref name="currIndent"/>; that ancestor becomes the new node's parent.
    /// </summary>
    private Node GetParent(int currIndent, Node lastNode)
    {
        var parent = lastNode;

        // Stop at the root (the only node without a parent) so an unexpected indent of 0 or
        // less attaches to the root instead of dereferencing null.
        while (parent.Parent != null && currIndent <= parent.Indent)
        {
            parent = parent.Parent;
        }

        return parent;
    }

    /// <summary>
    /// Maps a paragraph to its outline level, or to <see cref="BlockTextIndent"/> when the
    /// paragraph has a positive indentation and therefore continues the previous node.
    /// </summary>
    /// <exception cref="NotSupportedException">The paragraph's style is not one of the known styles.</exception>
    private int GetIndentLevel(Paragraph paragraph)
    {
        if (paragraph.IndentationBefore > 0)
        {
            return BlockTextIndent;
        }

        switch (paragraph.StyleId)
        {
            case "Heading1":
                return 1;
            case "Heading2":
                return 2;
            case "Normal":
                return 3;
            case "ListParagraph":
                // Nested list items sit below top-level list items.
                return 4 + (paragraph.IndentLevel ?? 0);
            default:
                // A specific exception type instead of bare Exception; still caught by catch (Exception).
                throw new NotSupportedException($"Unknown StyleId {paragraph.StyleId} at text {paragraph.Text}");
        }
    }

    /// <summary>
    /// Writes <paramref name="node"/>, its block text and then all of its descendants, depth-first.
    /// </summary>
    /// <param name="node">Node to write.</param>
    /// <param name="indentLevel">Number of tabs before the node's line; -1 for the synthetic root, which has no line of its own.</param>
    /// <param name="streamWriter">Destination writer.</param>
    private void WriteNode(Node node, int indentLevel, StreamWriter streamWriter)
    {
        if (indentLevel > -1)
        {
            var tabs = new string('\t', indentLevel);
            var bullet = GetBullet(node.Type);
            streamWriter.WriteLine($"{tabs}{bullet}{node.Text}");
        }

        if (node.BlockText != null)
        {
            // Block text is indented one level deeper than its owner. This also covers the root:
            // block text found before the first real node is written at column 0 rather than lost.
            var blockTabs = new string('\t', indentLevel + 1);
            foreach (var line in node.BlockText.Split('\n'))
            {
                streamWriter.WriteLine($"{blockTabs}{line}");
            }
        }

        foreach (var child in node.Children)
        {
            WriteNode(child, indentLevel + 1, streamWriter);
        }
    }

    /// <summary>
    /// Returns the line prefix used for a node of the given type.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="nodeType"/> is not a defined <see cref="NodeType"/> value.</exception>
    private string GetBullet(NodeType nodeType)
    {
        switch (nodeType)
        {
            case NodeType.Title:
                return "";
            case NodeType.Note:
                return "- ";
            case NodeType.Task:
                return "* ";
            case NodeType.Event:
                return "o ";
            case NodeType.Inspiration:
                return "! ";
            default:
                throw new ArgumentOutOfRangeException(nameof(nodeType), nodeType, null);
        }
    }
}
/// <summary>
/// One entry of the outline tree built by <c>DocxToJournalTransformer</c>. A node created with
/// no property values acts as the tree's root container.
/// </summary>
public class Node
{
    /// <summary>Creates a node with an empty child list.</summary>
    public Node()
    {
        Children = new List<Node>();
    }

    /// <summary>Value of the source paragraph's <c>paraId</c> XML attribute, or <c>null</c> if it has none.</summary>
    public string Id { get; set; }

    /// <summary>Kind of entry; determines the bullet written in front of <see cref="Text"/>.</summary>
    public NodeType Type { get; set; }

    /// <summary>Text of the source paragraph.</summary>
    public string Text { get; set; }

    /// <summary>Style id of the source paragraph.</summary>
    public string StyleId { get; set; }

    /// <summary>Outline level derived from the paragraph style; compared against ancestors to find a new node's parent.</summary>
    public int Indent { get; set; }

    /// <summary>Continuation text attached to this node, with <c>\n</c> between paragraphs; <c>null</c> when there is none.</summary>
    public string BlockText { get; set; }

    /// <summary>Node this one is nested under; <c>null</c> for the root.</summary>
    public Node Parent { get; set; }

    /// <summary>Nodes nested directly under this one, in document order.</summary>
    public List<Node> Children { get; set; }
}
/// <summary>
/// Kind of outline entry. Each kind is written with its own line prefix by
/// <c>DocxToJournalTransformer.GetBullet</c>.
/// </summary>
public enum NodeType
{
    /// <summary>Written with no prefix.</summary>
    Title = 0,

    /// <summary>Written with the prefix <c>"- "</c>; the type used when a paragraph has no green or yellow highlight.</summary>
    Note = 1,

    /// <summary>Written with the prefix <c>"* "</c>; used for paragraphs containing green-highlighted text.</summary>
    Task = 2,

    /// <summary>Written with the prefix <c>"o "</c>.</summary>
    Event = 3,

    /// <summary>Written with the prefix <c>"! "</c>; used for paragraphs containing yellow-highlighted text.</summary>
    Inspiration = 4
}
/// <summary>
/// Exploratory xUnit test that runs <c>DocxToJournalTransformer</c> against a document on the
/// local disk and echoes the generated outline to the test output. It makes no assertions.
/// </summary>
public class DocXTesting
{
    private readonly ITestOutputHelper _testOutputHelper;

    /// <summary>Receives the output helper that the test writes to.</summary>
    public DocXTesting(ITestOutputHelper testOutputHelper)
    {
        _testOutputHelper = testOutputHelper;
    }

    /// <summary>
    /// Transforms the document into an in-memory stream, then writes every produced line to the test output.
    /// </summary>
    [Fact]
    public void TransformDocument()
    {
        var buffer = new MemoryStream();
        new DocxToJournalTransformer().Transform(@"C:\code\docx\All Notes.docx", buffer);

        // Rewind so the reader starts at the first byte the transformer wrote.
        buffer.Seek(0, SeekOrigin.Begin);

        using (var reader = new StreamReader(buffer))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                _testOutputHelper.WriteLine(line);
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment