Learn how to extract text from Word documents using C#: https://blog.aspose.com/2021/12/06/extract-text-from-word-in-csharp/
Last active
December 6, 2021 14:15
-
-
Save aspose-com-gists/27785844db7029d5d9558f808c89c2b8 to your computer and use it in GitHub Desktop.
Extract Text from Word Documents in C#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Open the source Word file.
Document sourceDoc = new Document("document.docx");

// Markers: the third paragraph (0-based index 2) and the first table found in the last section.
var lastSection = sourceDoc.LastSection;
Paragraph firstMarker = (Paragraph)lastSection.GetChild(NodeType.Paragraph, 2, true);
Table lastMarker = (Table)lastSection.GetChild(NodeType.Table, 0, true);

// Pull out everything from the paragraph through the table, markers included.
ArrayList extracted = ExtractContent(firstMarker, lastMarker, true);

// Build a fresh document from the extracted nodes and write it to disk.
Document resultDoc = GenerateDocument(sourceDoc, extracted);
resultDoc.Save("output.docx");
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Open the source Word file.
Document sourceDoc = new Document("document.docx");

// Collect every paragraph formatted with each of the two heading styles.
List<Paragraph> heading1Paragraphs = ParagraphsByStyleName(sourceDoc, "Heading 1");
List<Paragraph> heading3Paragraphs = ParagraphsByStyleName(sourceDoc, "Heading 3");

// The first paragraph of each style serves as a marker (a Paragraph is already a Node).
Node firstMarker = heading1Paragraphs[0];
Node lastMarker = heading3Paragraphs[0];

// Pull out the content lying between the two headings, leaving the headings themselves out.
ArrayList extracted = ExtractContent(firstMarker, lastMarker, false);

// Build a fresh document from the extracted nodes and write it to disk.
Document resultDoc = GenerateDocument(sourceDoc, extracted);
resultDoc.Save("output.docx");
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Open the source Word file.
Document sourceDoc = new Document("document.docx");

// Pick the marker paragraphs from the first section's body (GetChild indexes from 0).
var firstBody = sourceDoc.FirstSection.Body;
Paragraph firstMarker = (Paragraph)firstBody.GetChild(NodeType.Paragraph, 6, true);
Paragraph lastMarker = (Paragraph)firstBody.GetChild(NodeType.Paragraph, 10, true);

// Pull out everything from the first marker through the last one, markers included.
ArrayList extracted = ExtractContent(firstMarker, lastMarker, true);

// Build a fresh document from the extracted nodes and write it to disk.
Document resultDoc = GenerateDocument(sourceDoc, extracted);
resultDoc.Save("output.docx");
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Collects deep clones of the block-level nodes that lie between two marker nodes.
/// The markers may be block-level nodes themselves or nodes nested inside one; in the
/// latter case the first and last block are trimmed by <c>ProcessMarker</c>.
/// </summary>
/// <param name="startNode">Marker at which extraction begins.</param>
/// <param name="endNode">Marker at which extraction ends.</param>
/// <param name="isInclusive">Passed to <c>ProcessMarker</c> to decide whether the markers themselves are kept.</param>
/// <returns>The cloned nodes, in document order.</returns>
/// <exception cref="ArgumentException">
/// Thrown by <c>VerifyParameterNodes</c> for invalid markers, or when the end of the
/// document is reached before the end marker has been encountered.
/// </exception>
public static ArrayList ExtractContent(Node startNode, Node endNode, bool isInclusive)
{
    // First check that the nodes passed to this method are valid for use.
    VerifyParameterNodes(startNode, endNode);

    // Create a list to store the extracted nodes.
    ArrayList nodes = new ArrayList();

    // Keep a record of the original nodes passed to this method so we can split marker nodes if needed.
    Node originalStartNode = startNode;
    Node originalEndNode = endNode;

    // Extraction works on the direct children of the body, so climb from each marker
    // to the ancestor whose parent is the body.
    while (startNode.ParentNode.NodeType != NodeType.Body)
        startNode = startNode.ParentNode;
    while (endNode.ParentNode.NodeType != NodeType.Body)
        endNode = endNode.ParentNode;

    bool isExtracting = true;
    bool isStartingNode = true;
    bool isEndingNode = false;

    // The current node we are extracting from the document.
    Node currNode = startNode;

    while (isExtracting)
    {
        // Clone the current node and its children to obtain a copy.
        Node cloneNode = currNode.Clone(true);
        isEndingNode = currNode.Equals(endNode);

        if ((isStartingNode || isEndingNode) && cloneNode.IsComposite)
        {
            // Each marker is handled by ProcessMarker, which decides what part of the clone is kept.
            if (isStartingNode)
            {
                ProcessMarker((CompositeNode)cloneNode, nodes, originalStartNode, isInclusive, isStartingNode, isEndingNode);
                isStartingNode = false;
            }

            // Kept as a separate conditional because the start and end block may be the same node.
            if (isEndingNode)
            {
                ProcessMarker((CompositeNode)cloneNode, nodes, originalEndNode, isInclusive, isStartingNode, isEndingNode);
                isExtracting = false;
            }
        }
        else
        {
            // Node is not a start or end marker, simply add the copy to the list.
            nodes.Add(cloneNode);
        }

        if (currNode.NextSibling == null && isExtracting)
        {
            // The current body is exhausted; continue in the next section that actually has content.
            Section nextSection = (Section)currNode.GetAncestor(NodeType.Section).NextSibling;
            while (nextSection != null && (nextSection.Body == null || nextSection.Body.FirstChild == null))
                nextSection = (Section)nextSection.NextSibling;

            // Running off the end of the document means the end marker does not follow the
            // start marker. Report that explicitly instead of failing on a null dereference.
            if (nextSection == null)
                throw new ArgumentException("The end node was not found after the start node in the document");

            currNode = nextSection.Body.FirstChild;
        }
        else
        {
            // Move to the next node in the body.
            currNode = currNode.NextSibling;
        }
    }

    // Return the nodes between the node markers.
    return nodes;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Returns every paragraph in the document whose paragraph style has the given name.
/// </summary>
/// <param name="doc">Document to search (all paragraphs at any depth are considered).</param>
/// <param name="styleName">Style name to match exactly.</param>
/// <returns>Matching paragraphs in the order the document yields them; empty if none match.</returns>
public static List<Paragraph> ParagraphsByStyleName(Document doc, string styleName)
{
    List<Paragraph> matches = new List<Paragraph>();

    foreach (Paragraph candidate in doc.GetChildNodes(NodeType.Paragraph, true))
    {
        // Skip anything formatted with a different style.
        if (candidate.ParagraphFormat.Style.Name != styleName)
            continue;

        matches.Add(candidate);
    }

    return matches;
}
/// <summary>
/// Validates the marker nodes handed to <c>ExtractContent</c>.
/// </summary>
/// <param name="startNode">Marker at which extraction begins.</param>
/// <param name="endNode">Marker at which extraction ends.</param>
/// <exception cref="ArgumentException">
/// Either node is null, the nodes belong to different documents, either node is not
/// inside a body, or the end node precedes the start node.
/// </exception>
private static void VerifyParameterNodes(Node startNode, Node endNode)
{
    // The order in which these checks are done is important: each one relies on the previous.
    if (startNode == null)
        throw new ArgumentException("Start node cannot be null");
    if (endNode == null)
        throw new ArgumentException("End node cannot be null");

    if (!startNode.Document.Equals(endNode.Document))
        throw new ArgumentException("Start node and end node must belong to the same document");

    if (startNode.GetAncestor(NodeType.Body) == null || endNode.GetAncestor(NodeType.Body) == null)
        throw new ArgumentException("Start node and end node must be a child or descendant of a body");

    // Check the end node is after the start node in the DOM tree.
    // Compare sections first; only when both markers share a section compare positions in its body.
    Section startSection = (Section)startNode.GetAncestor(NodeType.Section);
    Section endSection = (Section)endNode.GetAncestor(NodeType.Section);

    int startIndex = startSection.ParentNode.IndexOf(startSection);
    int endIndex = endSection.ParentNode.IndexOf(endSection);

    if (startIndex == endIndex)
    {
        // IndexOf only finds direct children, so compare the body-level ancestors of the
        // markers. Comparing the markers themselves yields -1 for nested nodes, which
        // wrongly rejects a block-level start paired with an inline end and lets
        // out-of-order nested markers through.
        int startPosition = startSection.Body.IndexOf(GetAncestorInBody(startNode));
        int endPosition = endSection.Body.IndexOf(GetAncestorInBody(endNode));

        if (startPosition > endPosition)
            throw new ArgumentException("The end node must be after the start node in the body");
    }
    else if (startIndex > endIndex)
        throw new ArgumentException("The section of end node must be after the section start node");
}

/// <summary>
/// Climbs from <paramref name="node"/> to the ancestor (or the node itself) whose parent is a body.
/// The caller must already have verified that the node has a body ancestor.
/// </summary>
private static Node GetAncestorInBody(Node node)
{
    while (node.ParentNode.NodeType != NodeType.Body)
        node = node.ParentNode;
    return node;
}
/// <summary>
/// Tells whether a node counts as inline: it sits somewhere inside a paragraph or a table
/// and is not itself a paragraph or a table.
/// </summary>
private static bool IsInline(Node node)
{
    // A paragraph or table is never inline, even when it is nested inside another
    // paragraph or table (for example a paragraph inside a comment).
    if (node.NodeType == NodeType.Paragraph || node.NodeType == NodeType.Table)
        return false;

    return node.GetAncestor(NodeType.Paragraph) != null
        || node.GetAncestor(NodeType.Table) != null;
}
/// <summary>
/// Handles the cloned block that contains a start or end marker: either adds the block
/// as a whole, or strips the children lying outside the marker and then adds what is left.
/// </summary>
/// <param name="cloneNode">Clone of the block-level node that holds the marker.</param>
/// <param name="nodes">Result list that receives the clone when it is to be kept.</param>
/// <param name="node">The original marker node as passed by the caller.</param>
/// <param name="isInclusive">Whether the marker itself is part of the extracted content.</param>
/// <param name="isStartMarker">True while the start marker is being processed.</param>
/// <param name="isEndMarker">True when the block is the one containing the end marker.</param>
private static void ProcessMarker(CompositeNode cloneNode, ArrayList nodes, Node node, bool isInclusive, bool isStartMarker, bool isEndMarker)
{
    // True only on the start-marker pass over a block that also holds the end marker;
    // the clone is then added on the later end-marker pass instead.
    bool isSharedBlock = isStartMarker && isEndMarker;

    // Block-level marker: nothing to trim, just decide whether the whole clone is kept.
    if (!IsInline(node))
    {
        if (!isSharedBlock && isInclusive)
            nodes.Add(cloneNode);
        return;
    }

    // FieldStart marker (the field is assumed to start and end in the same paragraph):
    // an excluded start marker skips past the field, an included end marker extends to
    // the field's end so that the field is not cut in half.
    if (node.NodeType == NodeType.FieldStart && isStartMarker != isInclusive)
    {
        while (node.NextSibling != null && node.NodeType != NodeType.FieldEnd)
            node = node.NextSibling;
    }

    // For a CommentRangeEnd marker move forward to the Comment node that follows it,
    // so the comment itself is covered.
    if (node.NodeType == NodeType.CommentRangeEnd)
    {
        while (node.NextSibling != null && node.NodeType != NodeType.Comment)
            node = node.NextSibling;
    }

    // Locate the marker's counterpart inside the clone by index. When both markers share
    // a block, the start pass may already have removed children from the clone, so the
    // index is shifted by the difference in child counts (zero when nothing was removed).
    int alreadyRemoved = node.ParentNode.ChildNodes.Count - cloneNode.ChildNodes.Count;
    Node marker = cloneNode.ChildNodes[node.ParentNode.IndexOf(node) - alreadyRemoved];

    // Start marker: remove children up to the marker. End marker: remove children from the marker on.
    bool isRemoving = isStartMarker;
    Node child = cloneNode.FirstChild;
    while (child != null)
    {
        // Remember the successor before the child is possibly detached.
        Node following = child.NextSibling;
        bool keepThisChild = false;
        bool stopAfterThisChild = false;

        if (child.Equals(marker))
        {
            if (isStartMarker)
            {
                stopAfterThisChild = true;
                if (isInclusive)
                    isRemoving = false;
            }
            else
            {
                isRemoving = true;
                keepThisChild = isInclusive;
            }
        }

        if (isRemoving && !keepThisChild)
            child.Remove();

        if (stopAfterThisChild)
            break;

        child = following;
    }

    // Trimming may have emptied the clone; an empty clone is not added.
    if (!isSharedBlock && cloneNode.HasChildNodes)
        nodes.Add(cloneNode);
}
/// <summary>
/// Builds a new document whose body consists of the given nodes, imported from the
/// source document with their source formatting kept.
/// </summary>
/// <param name="srcDoc">Document the nodes were taken from.</param>
/// <param name="nodes">Nodes to import, in the order they should appear.</param>
/// <returns>The newly created document.</returns>
public static Document GenerateDocument(Document srcDoc, ArrayList nodes)
{
    Document result = new Document();

    // Clear the body of the new document so only imported content ends up in it.
    var targetBody = result.FirstSection.Body;
    targetBody.RemoveAllChildren();

    // Import every node together with its children and append it to the new body.
    NodeImporter importer = new NodeImporter(srcDoc, result, ImportFormatMode.KeepSourceFormatting);
    foreach (Node source in nodes)
        targetBody.AppendChild(importer.ImportNode(source, true));

    return result;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment