Skip to content

Instantly share code, notes, and snippets.

@aspose-com-gists
Last active December 6, 2021 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aspose-com-gists/27785844db7029d5d9558f808c89c2b8 to your computer and use it in GitHub Desktop.
Save aspose-com-gists/27785844db7029d5d9558f808c89c2b8 to your computer and use it in GitHub Desktop.
Extract Text from Word Documents in C#
// Open the source document.
Document sourceDoc = new Document("document.docx");

// Pick the markers from the last section: the paragraph at 0-based index 2 and the table at index 0.
Paragraph firstMarker = (Paragraph)sourceDoc.LastSection.GetChild(NodeType.Paragraph, 2, true);
Table lastMarker = (Table)sourceDoc.LastSection.GetChild(NodeType.Table, 0, true);

// Copy everything from the paragraph through the table; passing true keeps both markers in the result.
ArrayList copiedNodes = ExtractContent(firstMarker, lastMarker, true);

// Build a new document from the copied nodes and write it to disk.
GenerateDocument(sourceDoc, copiedNodes).Save("output.docx");
// Load the source Word document.
Document doc = new Document("document.docx");

// Gather the paragraphs formatted with each of the two heading styles.
List<Paragraph> parasStyleHeading1 = ParagraphsByStyleName(doc, "Heading 1");
List<Paragraph> parasStyleHeading3 = ParagraphsByStyleName(doc, "Heading 3");

// Indexing [0] on an empty list would throw ArgumentOutOfRangeException with no hint
// about the cause, so report the missing style explicitly instead.
if (parasStyleHeading1.Count == 0)
{
    throw new System.InvalidOperationException("The document contains no paragraph with the style \"Heading 1\"");
}
if (parasStyleHeading3.Count == 0)
{
    throw new System.InvalidOperationException("The document contains no paragraph with the style \"Heading 3\"");
}

// Use the first paragraph of each style as the markers (Paragraph converts to Node implicitly).
Node startPara1 = parasStyleHeading1[0];
Node endPara1 = parasStyleHeading3[0];

// Extract the content between these nodes in the document. Don't include the markers themselves.
ArrayList extractedNodes = ExtractContent(startPara1, endPara1, false);

// Insert the content into a new document and save it to disk.
Document dstDoc = GenerateDocument(doc, extractedNodes);
dstDoc.Save("output.docx");
// Open the source document.
Document sourceDoc = new Document("document.docx");

// Pick the markers from the first section's body; GetChild indexes are 0-based,
// so these are the paragraphs at index 6 and index 10.
Paragraph firstMarker = (Paragraph)sourceDoc.FirstSection.Body.GetChild(NodeType.Paragraph, 6, true);
Paragraph lastMarker = (Paragraph)sourceDoc.FirstSection.Body.GetChild(NodeType.Paragraph, 10, true);

// Copy everything from the first marker through the last; passing true keeps both markers in the result.
ArrayList copiedNodes = ExtractContent(firstMarker, lastMarker, true);

// Build a new document from the copied nodes and write it to disk.
GenerateDocument(sourceDoc, copiedNodes).Save("output.docx");
/// <summary>
/// Collects copies of the block-level nodes located between two marker nodes of one document.
/// All trimming is performed on clones; the nodes of the source document are only read.
/// </summary>
/// <param name="startNode">Marker at which extraction begins. May be a body-level node or a node nested inside one.</param>
/// <param name="endNode">Marker at which extraction stops. May be a body-level node or a node nested inside one.</param>
/// <param name="isInclusive">Forwarded to ProcessMarker to decide whether the markers themselves are kept.</param>
/// <returns>An ArrayList holding the cloned nodes in document order.</returns>
/// <exception cref="ArgumentException">
/// Thrown by VerifyParameterNodes for invalid markers, or when the end marker is never reached
/// while walking forward from the start marker.
/// </exception>
public static ArrayList ExtractContent(Node startNode, Node endNode, bool isInclusive)
{
    // First check that the nodes passed to this method are valid for use.
    VerifyParameterNodes(startNode, endNode);

    // Create a list to store the extracted nodes.
    ArrayList nodes = new ArrayList();

    // Keep the original markers: ProcessMarker needs them to split the first and last block when a marker is inline.
    Node originalStartNode = startNode;
    Node originalEndNode = endNode;

    // Extraction works on the direct children of the body, so climb from each marker to its body-level ancestor.
    while (startNode.ParentNode.NodeType != NodeType.Body)
    {
        startNode = startNode.ParentNode;
    }
    while (endNode.ParentNode.NodeType != NodeType.Body)
    {
        endNode = endNode.ParentNode;
    }

    bool isExtracting = true;
    bool isStartingNode = true;

    // The current node we are extracting from the document.
    Node currNode = startNode;

    while (isExtracting)
    {
        // Clone the current node and its children to obtain a copy.
        Node cloneNode = currNode.Clone(true);
        bool isEndingNode = currNode.Equals(endNode);

        if ((isStartingNode || isEndingNode) && cloneNode.IsComposite)
        {
            // Each marker is processed separately by ProcessMarker.
            if (isStartingNode)
            {
                ProcessMarker((CompositeNode)cloneNode, nodes, originalStartNode, isInclusive, isStartingNode, isEndingNode);
                isStartingNode = false;
            }

            // This check must stay separate: the start and end markers may share one block-level node.
            if (isEndingNode)
            {
                ProcessMarker((CompositeNode)cloneNode, nodes, originalEndNode, isInclusive, isStartingNode, isEndingNode);
                isExtracting = false;
            }
        }
        else
        {
            // Node is not a start or end marker, simply add the copy to the list.
            nodes.Add(cloneNode);
        }

        if (isExtracting)
        {
            // Advance to the next body-level node, crossing into later sections when this one is exhausted.
            currNode = FindNextBodyLevelNode(currNode);
            if (currNode == null)
            {
                // Previously this situation surfaced as a NullReferenceException.
                throw new ArgumentException("The end node was not found after the start node in the document");
            }
        }
    }

    // Return the nodes between the node markers.
    return nodes;
}

// Returns the body-level node that follows 'current' in document order, skipping sections
// that have no body or an empty body; returns null when the document has no further content.
private static Node FindNextBodyLevelNode(Node current)
{
    if (current.NextSibling != null)
    {
        return current.NextSibling;
    }

    Node section = current.GetAncestor(NodeType.Section);
    if (section == null)
    {
        return null;
    }

    for (Node sibling = section.NextSibling; sibling != null; sibling = sibling.NextSibling)
    {
        Section nextSection = sibling as Section;
        if (nextSection != null && nextSection.Body != null && nextSection.Body.FirstChild != null)
        {
            return nextSection.Body.FirstChild;
        }
    }

    return null;
}
/// <summary>
/// Returns every paragraph of the document whose paragraph style name equals the given name.
/// </summary>
/// <param name="doc">Document whose paragraphs are examined.</param>
/// <param name="styleName">Style name to match exactly, e.g. "Heading 1".</param>
/// <returns>The matching paragraphs in the order the document yields them; empty when none match.</returns>
public static List<Paragraph> ParagraphsByStyleName(Document doc, string styleName)
{
    List<Paragraph> matches = new List<Paragraph>();

    // Walk all paragraphs of the document and keep the ones carrying the requested style.
    foreach (Paragraph candidate in doc.GetChildNodes(NodeType.Paragraph, true))
    {
        string candidateStyle = candidate.ParagraphFormat.Style.Name;
        if (candidateStyle == styleName)
        {
            matches.Add(candidate);
        }
    }

    return matches;
}
/// <summary>
/// Validates the marker nodes handed to ExtractContent and throws when they cannot be used.
/// </summary>
/// <param name="startNode">Marker where extraction is meant to begin.</param>
/// <param name="endNode">Marker where extraction is meant to end.</param>
/// <exception cref="ArgumentException">
/// Thrown when a marker is null, the markers belong to different documents, a marker is not
/// located inside a body, or the end marker's block precedes the start marker's block.
/// </exception>
private static void VerifyParameterNodes(Node startNode, Node endNode)
{
    // The order in which these checks are done is important: each one relies on the previous ones.
    if (startNode == null)
    {
        throw new ArgumentException("Start node cannot be null");
    }
    if (endNode == null)
    {
        throw new ArgumentException("End node cannot be null");
    }
    if (!startNode.Document.Equals(endNode.Document))
    {
        throw new ArgumentException("Start node and end node must belong to the same document");
    }
    if (startNode.GetAncestor(NodeType.Body) == null || endNode.GetAncestor(NodeType.Body) == null)
    {
        throw new ArgumentException("Start node and end node must be a child or descendant of a body");
    }

    // Check the end node is after the start node in the DOM tree.
    // Compare the sections first; only when both markers share a section compare positions inside its body.
    Section startSection = (Section)startNode.GetAncestor(NodeType.Section);
    Section endSection = (Section)endNode.GetAncestor(NodeType.Section);
    int startIndex = startSection.ParentNode.IndexOf(startSection);
    int endIndex = endSection.ParentNode.IndexOf(endSection);

    if (startIndex == endIndex)
    {
        // IndexOf only finds direct children of the body and yields -1 for anything nested deeper,
        // so compare the body-level ancestors; otherwise nested markers would always pass this check.
        int startPosition = startSection.Body.IndexOf(GetBodyLevelAncestor(startNode));
        int endPosition = endSection.Body.IndexOf(GetBodyLevelAncestor(endNode));
        if (startPosition > endPosition)
        {
            throw new ArgumentException("The end node must be after the start node in the body");
        }
    }
    else if (startIndex > endIndex)
    {
        throw new ArgumentException("The section of end node must be after the section start node");
    }
}

// Climbs from 'node' to the ancestor (or the node itself) whose parent is the body.
private static Node GetBodyLevelAncestor(Node node)
{
    while (node.ParentNode != null && node.ParentNode.NodeType != NodeType.Body)
    {
        node = node.ParentNode;
    }
    return node;
}
/// <summary>
/// Tells whether a node counts as inline: it has a Paragraph or Table ancestor
/// and is not itself a Paragraph or a Table.
/// </summary>
/// <param name="node">Node to classify.</param>
/// <returns>true for an inline node; false otherwise.</returns>
private static bool IsInline(Node node)
{
    // A paragraph or table is never treated as inline, even when it is nested inside
    // another paragraph or table (for example a paragraph inside a comment).
    NodeType type = node.NodeType;
    if (type == NodeType.Paragraph || type == NodeType.Table)
    {
        return false;
    }

    return node.GetAncestor(NodeType.Paragraph) != null
        || node.GetAncestor(NodeType.Table) != null;
}
/// <summary>
/// Handles the cloned block that contains a start or end marker: trims the clone's children
/// relative to the marker and appends the clone to the result list when appropriate.
/// </summary>
/// <param name="cloneNode">Clone of the body-level node that holds the marker; it is modified in place.</param>
/// <param name="nodes">Result list that receives the clone.</param>
/// <param name="node">The original marker node from the source document.</param>
/// <param name="isInclusive">Whether the marker itself is kept.</param>
/// <param name="isStartMarker">true when the block is being processed as the start marker.</param>
/// <param name="isEndMarker">true when the block is (also) the one holding the end marker.</param>
private static void ProcessMarker(CompositeNode cloneNode, ArrayList nodes, Node node, bool isInclusive, bool isStartMarker, bool isEndMarker)
{
    // When both flags are set the same block will be processed again as the end marker,
    // so adding it to the list is left to that second pass.
    bool sharedBlockFirstPass = isStartMarker && isEndMarker;

    // Block-level marker: there is nothing to trim, only decide whether the whole clone is kept.
    if (!IsInline(node))
    {
        if (isInclusive && !sharedBlockFirstPass)
        {
            nodes.Add(cloneNode);
        }
        return;
    }

    Node marker = node;

    // FieldStart marker; for simplicity the FieldStart and FieldEnd are assumed to share a paragraph.
    // An excluded start marker skips past the field, an included end marker extends to the field's end
    // so the field is kept whole. Both cases reduce to "the two flags differ".
    if (marker.NodeType == NodeType.FieldStart && isStartMarker != isInclusive)
    {
        while (marker.NextSibling != null && marker.NodeType != NodeType.FieldEnd)
        {
            marker = marker.NextSibling;
        }
    }

    // For a marker on a comment range end, move forward to the Comment node that follows it
    // so the comment itself can be included.
    if (marker.NodeType == NodeType.CommentRangeEnd)
    {
        while (marker.NextSibling != null && marker.NodeType != NodeType.Comment)
        {
            marker = marker.NextSibling;
        }
    }

    // Locate the marker's counterpart inside the clone by child index. When the start and end
    // markers share a block, the start pass may already have removed children from the clone,
    // so the index is shifted by the difference in child counts (zero when nothing was removed).
    int removedSoFar = marker.ParentNode.ChildNodes.Count - cloneNode.ChildNodes.Count;
    Node cloneMarker = cloneNode.ChildNodes[marker.ParentNode.IndexOf(marker) - removedSoFar];

    // Start marker: drop children before the marker (and the marker itself unless inclusive).
    // End marker: drop children after the marker (and the marker itself unless inclusive).
    bool removing = isStartMarker;
    bool keepScanning = true;
    Node cursor = cloneNode.FirstChild;
    while (keepScanning && cursor != null)
    {
        Node candidate = cursor;
        // Advance before a possible removal so the walk is not broken by detaching the candidate.
        cursor = cursor.NextSibling;

        bool spareCandidate = false;
        if (candidate.Equals(cloneMarker))
        {
            if (isStartMarker)
            {
                keepScanning = false;
                if (isInclusive)
                {
                    removing = false;
                }
            }
            else
            {
                removing = true;
                spareCandidate = isInclusive;
            }
        }

        if (removing && !spareCandidate)
        {
            candidate.Remove();
        }
    }

    // Trimming may leave the clone empty; an empty clone is not added to the result.
    if (!sharedBlockFirstPass && cloneNode.HasChildNodes)
    {
        nodes.Add(cloneNode);
    }
}
/// <summary>
/// Builds a new document whose first section body contains imported copies of the given nodes.
/// </summary>
/// <param name="srcDoc">Document the nodes originate from; used as the import source.</param>
/// <param name="nodes">Nodes to import, in the order they should appear.</param>
/// <returns>The newly created document.</returns>
public static Document GenerateDocument(Document srcDoc, ArrayList nodes)
{
    Document dstDoc = new Document();

    // Clear the body of the new document so only imported content ends up in it.
    var targetBody = dstDoc.FirstSection.Body;
    targetBody.RemoveAllChildren();

    // Import node by node, keeping the formatting the nodes had in the source document.
    NodeImporter importer = new NodeImporter(srcDoc, dstDoc, ImportFormatMode.KeepSourceFormatting);
    foreach (Node sourceNode in nodes)
    {
        targetBody.AppendChild(importer.ImportNode(sourceNode, true));
    }

    return dstDoc;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment