aspose-com-gists/extract-text-nodes.cs

## readme.md

      
    Raw
  

              readme.md
            
          
    Learn how to extract text from Word documents using C#: https://blog.aspose.com/2021/12/06/extract-text-from-word-in-csharp/

  
## extract-text-nodes.cs
// Open the source Word document.
Document sourceDoc = new Document("document.docx");

// Choose the markers from the last section: the paragraph at index 2 and the table at index 0,
// searching the whole subtree of the section (deep search).
Section lastSection = sourceDoc.LastSection;
Paragraph startPara = (Paragraph)lastSection.GetChild(NodeType.Paragraph, 2, true);
Table endTable = (Table)lastSection.GetChild(NodeType.Table, 0, true);

// Clone everything from the start paragraph through the end table, markers included.
ArrayList extracted = ExtractContent(startPara, endTable, true);

// Build a new document from the clones and write it to disk.
Document outputDoc = GenerateDocument(sourceDoc, extracted);
outputDoc.Save("output.docx");

## extract-text-paragraphs-styles.cs
// Load Word document
Document doc = new Document("document.docx");

// Gather a list of the paragraphs using the respective heading styles.
List<Paragraph> parasStyleHeading1 = ParagraphsByStyleName(doc, "Heading 1");
List<Paragraph> parasStyleHeading3 = ParagraphsByStyleName(doc, "Heading 3");

// Either list may be empty when the document does not use the style. Fail with a descriptive
// message instead of letting the indexer below throw an opaque ArgumentOutOfRangeException.
if (parasStyleHeading1.Count == 0)
    throw new InvalidOperationException("The document contains no paragraph with the \"Heading 1\" style");
if (parasStyleHeading3.Count == 0)
    throw new InvalidOperationException("The document contains no paragraph with the \"Heading 3\" style");

// Use the first instance of the paragraphs with those styles.
// (Paragraph converts to Node implicitly, so no cast is required.)
Node startPara1 = parasStyleHeading1[0];
Node endPara1 = parasStyleHeading3[0];

// Extract the content between these nodes in the document. Don't include these markers in the extraction.
ArrayList extractedNodes = ExtractContent(startPara1, endPara1, false);

// Insert the content into a new document and save it to disk.
Document dstDoc = GenerateDocument(doc, extractedNodes);
dstDoc.Save("output.docx");

## extract-text-paragraphs.cs
// Open the source Word document.
Document sourceDoc = new Document("document.docx");

// Pick the marker paragraphs from the first section's body. GetChild uses a 0-based index,
// so indices 6 and 10 select the 7th and 11th paragraphs found by the deep search.
Body firstBody = sourceDoc.FirstSection.Body;
Paragraph startPara = (Paragraph)firstBody.GetChild(NodeType.Paragraph, 6, true);
Paragraph endPara = (Paragraph)firstBody.GetChild(NodeType.Paragraph, 10, true);

// Clone everything from the start paragraph through the end paragraph, markers included.
ArrayList extracted = ExtractContent(startPara, endPara, true);

// Build a new document from the clones and write it to disk.
Document outputDoc = GenerateDocument(sourceDoc, extracted);
outputDoc.Save("output.docx");

## extract-text.cs
/// <summary>
/// Collects deep clones of the block-level nodes that lie between two marker nodes of the same document.
/// </summary>
/// <param name="startNode">Marker where extraction begins; either a direct child of a body or a node nested inside one.</param>
/// <param name="endNode">Marker where extraction ends; either a direct child of a body or a node nested inside one.</param>
/// <param name="isInclusive">Passed through to <c>ProcessMarker</c> to decide whether the markers themselves are kept.</param>
/// <returns>The cloned nodes in document order. The source document is not modified; only clones are trimmed.</returns>
/// <exception cref="ArgumentException">Thrown by <c>VerifyParameterNodes</c> when the markers are not usable.</exception>
public static ArrayList ExtractContent(Node startNode, Node endNode, bool isInclusive)
{
    // First check that the nodes passed to this method are valid for use.
    VerifyParameterNodes(startNode, endNode);

    // Create a list to store the extracted nodes.
    ArrayList nodes = new ArrayList();

    // Keep a record of the original nodes passed to this method so we can split marker nodes if needed.
    Node originalStartNode = startNode;
    Node originalEndNode = endNode;

    // Extraction works on direct children of a body, so climb from each marker to the ancestor
    // whose parent is the body. The first and last of these are split later if the markers are inline.
    while (startNode.ParentNode.NodeType != NodeType.Body)
        startNode = startNode.ParentNode;

    while (endNode.ParentNode.NodeType != NodeType.Body)
        endNode = endNode.ParentNode;

    bool isStartingNode = true;
    // The current node we are extracting from the document.
    Node currNode = startNode;

    // Walk the body children from the start block to the end block. A null current node means the
    // document ran out of content before the end block was reached; stop instead of dereferencing null.
    while (currNode != null)
    {
        // Clone the current node and its children to obtain a copy.
        Node cloneNode = currNode.Clone(true);
        bool isEndingNode = currNode.Equals(endNode);

        if ((isStartingNode || isEndingNode) && cloneNode.IsComposite)
        {
            // Each marker is processed separately. The two conditionals stay separate because the
            // block-level start and end markers may be the same node.
            if (isStartingNode)
                ProcessMarker((CompositeNode)cloneNode, nodes, originalStartNode, isInclusive, true, isEndingNode);

            // By the time the end marker is processed the node no longer counts as the start marker.
            if (isEndingNode)
                ProcessMarker((CompositeNode)cloneNode, nodes, originalEndNode, isInclusive, false, true);
        }
        else
        {
            // Node is not a start or end marker (or cannot be split), simply add the copy to the list.
            nodes.Add(cloneNode);
        }

        // Only the very first visited node is the start marker, whichever branch handled it.
        isStartingNode = false;

        // The end block terminates extraction regardless of which branch handled it.
        if (isEndingNode)
            break;

        // Move to the next node in the body. When the body is exhausted, continue with the first
        // body child of a following section, skipping sections that have no body or an empty body.
        Node nextNode = currNode.NextSibling;
        Section section = (Section)currNode.GetAncestor(NodeType.Section);
        while (nextNode == null && section != null)
        {
            section = section.NextSibling as Section;
            if (section != null && section.Body != null)
                nextNode = section.Body.FirstChild;
        }

        currNode = nextNode;
    }

    // Return the nodes between the node markers.
    return nodes;
}

## text-extraction-helpers.cs
/// <summary>
/// Returns every paragraph of the document whose paragraph style has the given name.
/// </summary>
/// <param name="doc">Document to search; paragraphs are collected with a deep search.</param>
/// <param name="styleName">Style name to match exactly (ordinal string equality).</param>
/// <returns>The matching paragraphs in enumeration order; empty when none match.</returns>
public static List<Paragraph> ParagraphsByStyleName(Document doc, string styleName)
{
    List<Paragraph> matches = new List<Paragraph>();

    // Deep search: enumerates paragraphs at any depth below the document node.
    foreach (Paragraph candidate in doc.GetChildNodes(NodeType.Paragraph, true))
    {
        string candidateStyle = candidate.ParagraphFormat.Style.Name;
        if (string.Equals(candidateStyle, styleName))
        {
            matches.Add(candidate);
        }
    }

    return matches;
}
/// <summary>
/// Validates the two marker nodes before extraction: both must be non-null, belong to the same
/// document, live under a body, and the end marker must not come before the start marker.
/// </summary>
/// <param name="startNode">Marker where extraction begins.</param>
/// <param name="endNode">Marker where extraction ends.</param>
/// <exception cref="ArgumentException">Thrown when any of the conditions above is violated.</exception>
private static void VerifyParameterNodes(Node startNode, Node endNode)
{
    // The order in which these checks are done is important: each one relies on those before it.
    if (startNode == null)
        throw new ArgumentException("Start node cannot be null");
    if (endNode == null)
        throw new ArgumentException("End node cannot be null");

    if (!startNode.Document.Equals(endNode.Document))
        throw new ArgumentException("Start node and end node must belong to the same document");

    if (startNode.GetAncestor(NodeType.Body) == null || endNode.GetAncestor(NodeType.Body) == null)
        throw new ArgumentException("Start node and end node must be a child or descendant of a body");

    // Check the end node is after the start node in the DOM tree.
    // First compare the sections; only when both markers share a section compare positions in its body.
    Section startSection = (Section)startNode.GetAncestor(NodeType.Section);
    Section endSection = (Section)endNode.GetAncestor(NodeType.Section);

    int startIndex = startSection.ParentNode.IndexOf(startSection);
    int endIndex = endSection.ParentNode.IndexOf(endSection);

    if (startIndex == endIndex)
    {
        // Body.IndexOf only locates direct children of the body. An inline marker (a run, a field
        // start, ...) is not a direct child, so compare the body-level ancestors of the markers
        // rather than the markers themselves. The body ancestor is guaranteed by the check above.
        Node startBlock = startNode;
        while (startBlock.ParentNode.NodeType != NodeType.Body)
            startBlock = startBlock.ParentNode;

        Node endBlock = endNode;
        while (endBlock.ParentNode.NodeType != NodeType.Body)
            endBlock = endBlock.ParentNode;

        // Markers sharing one block-level node pass this check; their relative order inside that
        // block is not verified here.
        if (startSection.Body.IndexOf(startBlock) > endSection.Body.IndexOf(endBlock))
            throw new ArgumentException("The end node must be after the start node in the body");
    }
    else if (startIndex > endIndex)
        throw new ArgumentException("The section of end node must be after the section start node");
}
/// <summary>
/// Tells whether a node counts as inline: it has a paragraph or table ancestor and is not itself
/// a paragraph or a table.
/// </summary>
/// <param name="node">Node to classify.</param>
/// <returns><c>true</c> for inline nodes, <c>false</c> otherwise.</returns>
private static bool IsInline(Node node)
{
    // Paragraphs and tables are never inline, even when nested under another paragraph or table
    // (for example a paragraph inside a comment that itself sits in a paragraph).
    if (node.NodeType == NodeType.Paragraph || node.NodeType == NodeType.Table)
        return false;

    bool hasParagraphAncestor = node.GetAncestor(NodeType.Paragraph) != null;
    return hasParagraphAncestor || node.GetAncestor(NodeType.Table) != null;
}
/// <summary>
/// Handles the cloned block that contains a start or end marker: either adds the whole clone, or
/// trims the clone's children before/after the marker and then adds what is left.
/// </summary>
/// <param name="cloneNode">Clone of the block-level node that holds the marker; trimmed in place.</param>
/// <param name="nodes">Result list the clone is appended to.</param>
/// <param name="node">The original marker node (from the source document, not the clone).</param>
/// <param name="isInclusive">Whether the marker itself is kept.</param>
/// <param name="isStartMarker">True when the marker is processed as the start marker.</param>
/// <param name="isEndMarker">True when the block also holds the end marker.</param>
private static void ProcessMarker(CompositeNode cloneNode, ArrayList nodes, Node node, bool isInclusive, bool isStartMarker, bool isEndMarker)
{
    // When one block holds both markers it is processed twice; the clone is only added on the
    // second pass (where isStartMarker is false) so it does not end up in the list twice.
    bool bothMarkersInBlock = isStartMarker && isEndMarker;

    // Block-level marker: nothing to trim, just decide whether the clone is included.
    if (!IsInline(node))
    {
        if (isInclusive && !bothMarkersInBlock)
            nodes.Add(cloneNode);
        return;
    }

    // FieldStart marker: an excluded start marker skips past the whole field, and an included end
    // marker moves to the field end so the field is kept. Both cases reduce to "flags differ".
    // For simplicity FieldStart and FieldEnd are assumed to share a paragraph.
    if (node.NodeType == NodeType.FieldStart && isStartMarker != isInclusive)
    {
        while (node.NextSibling != null && node.NodeType != NodeType.FieldEnd)
            node = node.NextSibling;
    }

    // CommentRangeEnd marker: advance to the Comment node that follows so the comment is included.
    if (node.NodeType == NodeType.CommentRangeEnd)
    {
        while (node.NextSibling != null && node.NodeType != NodeType.Comment)
            node = node.NextSibling;
    }

    // Map the marker onto the clone by child index. If an earlier pass already removed children
    // from the clone, the count difference shifts the index back (it is zero otherwise).
    CompositeNode markerParent = node.ParentNode;
    int alreadyRemoved = markerParent.ChildNodes.Count - cloneNode.ChildNodes.Count;
    Node markerInClone = cloneNode.ChildNodes[markerParent.IndexOf(node) - alreadyRemoved];

    // Trim the clone: a start marker removes the children before it, an end marker those after it.
    bool removing = isStartMarker;
    Node child = cloneNode.FirstChild;

    while (child != null)
    {
        // Capture the successor first; Remove() detaches the child from its siblings.
        Node following = child.NextSibling;

        if (child.Equals(markerInClone))
        {
            // The marker itself survives only for inclusive extraction.
            if (!isInclusive)
                child.Remove();

            // Nothing after a start marker is touched.
            if (isStartMarker)
                break;

            // Everything after an end marker is removed.
            removing = true;
        }
        else if (removing)
        {
            child.Remove();
        }

        child = following;
    }

    // Trimming may leave the clone empty; only add it when something remains.
    if (!bothMarkersInBlock && cloneNode.HasChildNodes)
        nodes.Add(cloneNode);
}
/// <summary>
/// Creates a new document whose first section's body contains imported copies of the given nodes.
/// </summary>
/// <param name="srcDoc">Document the nodes originate from; used as the import source.</param>
/// <param name="nodes">Nodes to import, appended in list order.</param>
/// <returns>The newly created document.</returns>
public static Document GenerateDocument(Document srcDoc, ArrayList nodes)
{
    // Start from a blank document and clear whatever its body initially contains.
    Document result = new Document();
    result.FirstSection.Body.RemoveAllChildren();

    // One importer is shared by all nodes; the source formatting of each node is kept.
    NodeImporter importer = new NodeImporter(srcDoc, result, ImportFormatMode.KeepSourceFormatting);

    foreach (Node sourceNode in nodes)
    {
        // Deep-import the node into the new document, then attach it at the end of the body.
        result.FirstSection.Body.AppendChild(importer.ImportNode(sourceNode, true));
    }

    return result;
}
	// Open the source Word document.
	Document sourceDoc = new Document("document.docx");

	// Choose the markers from the last section: the paragraph at index 2 and the table at index 0,
	// searching the whole subtree of the section (deep search).
	Section lastSection = sourceDoc.LastSection;
	Paragraph startPara = (Paragraph)lastSection.GetChild(NodeType.Paragraph, 2, true);
	Table endTable = (Table)lastSection.GetChild(NodeType.Table, 0, true);

	// Clone everything from the start paragraph through the end table, markers included.
	ArrayList extracted = ExtractContent(startPara, endTable, true);

	// Build a new document from the clones and write it to disk.
	Document outputDoc = GenerateDocument(sourceDoc, extracted);
	outputDoc.Save("output.docx");
	// Load Word document
	Document doc = new Document("document.docx");

	// Gather a list of the paragraphs using the respective heading styles.
	List<Paragraph> parasStyleHeading1 = ParagraphsByStyleName(doc, "Heading 1");
	List<Paragraph> parasStyleHeading3 = ParagraphsByStyleName(doc, "Heading 3");

	// Either list may be empty when the document does not use the style. Fail with a descriptive
	// message instead of letting the indexer below throw an opaque ArgumentOutOfRangeException.
	if (parasStyleHeading1.Count == 0)
	    throw new InvalidOperationException("The document contains no paragraph with the \"Heading 1\" style");
	if (parasStyleHeading3.Count == 0)
	    throw new InvalidOperationException("The document contains no paragraph with the \"Heading 3\" style");

	// Use the first instance of the paragraphs with those styles.
	// (Paragraph converts to Node implicitly, so no cast is required.)
	Node startPara1 = parasStyleHeading1[0];
	Node endPara1 = parasStyleHeading3[0];

	// Extract the content between these nodes in the document. Don't include these markers in the extraction.
	ArrayList extractedNodes = ExtractContent(startPara1, endPara1, false);

	// Insert the content into a new document and save it to disk.
	Document dstDoc = GenerateDocument(doc, extractedNodes);
	dstDoc.Save("output.docx");
	// Open the source Word document.
	Document sourceDoc = new Document("document.docx");

	// Pick the marker paragraphs from the first section's body. GetChild uses a 0-based index,
	// so indices 6 and 10 select the 7th and 11th paragraphs found by the deep search.
	Body firstBody = sourceDoc.FirstSection.Body;
	Paragraph startPara = (Paragraph)firstBody.GetChild(NodeType.Paragraph, 6, true);
	Paragraph endPara = (Paragraph)firstBody.GetChild(NodeType.Paragraph, 10, true);

	// Clone everything from the start paragraph through the end paragraph, markers included.
	ArrayList extracted = ExtractContent(startPara, endPara, true);

	// Build a new document from the clones and write it to disk.
	Document outputDoc = GenerateDocument(sourceDoc, extracted);
	outputDoc.Save("output.docx");
	/// <summary>
	/// Collects deep clones of the block-level nodes that lie between two marker nodes of the same document.
	/// </summary>
	/// <param name="startNode">Marker where extraction begins; either a direct child of a body or a node nested inside one.</param>
	/// <param name="endNode">Marker where extraction ends; either a direct child of a body or a node nested inside one.</param>
	/// <param name="isInclusive">Passed through to <c>ProcessMarker</c> to decide whether the markers themselves are kept.</param>
	/// <returns>The cloned nodes in document order. The source document is not modified; only clones are trimmed.</returns>
	/// <exception cref="ArgumentException">Thrown by <c>VerifyParameterNodes</c> when the markers are not usable.</exception>
	public static ArrayList ExtractContent(Node startNode, Node endNode, bool isInclusive)
	{
	    // First check that the nodes passed to this method are valid for use.
	    VerifyParameterNodes(startNode, endNode);

	    // Create a list to store the extracted nodes.
	    ArrayList nodes = new ArrayList();

	    // Keep a record of the original nodes passed to this method so we can split marker nodes if needed.
	    Node originalStartNode = startNode;
	    Node originalEndNode = endNode;

	    // Extraction works on direct children of a body, so climb from each marker to the ancestor
	    // whose parent is the body. The first and last of these are split later if the markers are inline.
	    while (startNode.ParentNode.NodeType != NodeType.Body)
	        startNode = startNode.ParentNode;

	    while (endNode.ParentNode.NodeType != NodeType.Body)
	        endNode = endNode.ParentNode;

	    bool isStartingNode = true;
	    // The current node we are extracting from the document.
	    Node currNode = startNode;

	    // Walk the body children from the start block to the end block. A null current node means the
	    // document ran out of content before the end block was reached; stop instead of dereferencing null.
	    while (currNode != null)
	    {
	        // Clone the current node and its children to obtain a copy.
	        Node cloneNode = currNode.Clone(true);
	        bool isEndingNode = currNode.Equals(endNode);

	        if ((isStartingNode || isEndingNode) && cloneNode.IsComposite)
	        {
	            // Each marker is processed separately. The two conditionals stay separate because the
	            // block-level start and end markers may be the same node.
	            if (isStartingNode)
	                ProcessMarker((CompositeNode)cloneNode, nodes, originalStartNode, isInclusive, true, isEndingNode);

	            // By the time the end marker is processed the node no longer counts as the start marker.
	            if (isEndingNode)
	                ProcessMarker((CompositeNode)cloneNode, nodes, originalEndNode, isInclusive, false, true);
	        }
	        else
	        {
	            // Node is not a start or end marker (or cannot be split), simply add the copy to the list.
	            nodes.Add(cloneNode);
	        }

	        // Only the very first visited node is the start marker, whichever branch handled it.
	        isStartingNode = false;

	        // The end block terminates extraction regardless of which branch handled it.
	        if (isEndingNode)
	            break;

	        // Move to the next node in the body. When the body is exhausted, continue with the first
	        // body child of a following section, skipping sections that have no body or an empty body.
	        Node nextNode = currNode.NextSibling;
	        Section section = (Section)currNode.GetAncestor(NodeType.Section);
	        while (nextNode == null && section != null)
	        {
	            section = section.NextSibling as Section;
	            if (section != null && section.Body != null)
	                nextNode = section.Body.FirstChild;
	        }

	        currNode = nextNode;
	    }

	    // Return the nodes between the node markers.
	    return nodes;
	}
	/// <summary>
	/// Returns every paragraph of the document whose paragraph style has the given name.
	/// </summary>
	/// <param name="doc">Document to search; paragraphs are collected with a deep search.</param>
	/// <param name="styleName">Style name to match exactly (ordinal string equality).</param>
	/// <returns>The matching paragraphs in enumeration order; empty when none match.</returns>
	public static List<Paragraph> ParagraphsByStyleName(Document doc, string styleName)
	{
	    List<Paragraph> matches = new List<Paragraph>();

	    // Deep search: enumerates paragraphs at any depth below the document node.
	    foreach (Paragraph candidate in doc.GetChildNodes(NodeType.Paragraph, true))
	    {
	        string candidateStyle = candidate.ParagraphFormat.Style.Name;
	        if (string.Equals(candidateStyle, styleName))
	        {
	            matches.Add(candidate);
	        }
	    }

	    return matches;
	}
	/// <summary>
	/// Validates the two marker nodes before extraction: both must be non-null, belong to the same
	/// document, live under a body, and the end marker must not come before the start marker.
	/// </summary>
	/// <param name="startNode">Marker where extraction begins.</param>
	/// <param name="endNode">Marker where extraction ends.</param>
	/// <exception cref="ArgumentException">Thrown when any of the conditions above is violated.</exception>
	private static void VerifyParameterNodes(Node startNode, Node endNode)
	{
	    // The order in which these checks are done is important: each one relies on those before it.
	    if (startNode == null)
	        throw new ArgumentException("Start node cannot be null");
	    if (endNode == null)
	        throw new ArgumentException("End node cannot be null");

	    if (!startNode.Document.Equals(endNode.Document))
	        throw new ArgumentException("Start node and end node must belong to the same document");

	    if (startNode.GetAncestor(NodeType.Body) == null || endNode.GetAncestor(NodeType.Body) == null)
	        throw new ArgumentException("Start node and end node must be a child or descendant of a body");

	    // Check the end node is after the start node in the DOM tree.
	    // First compare the sections; only when both markers share a section compare positions in its body.
	    Section startSection = (Section)startNode.GetAncestor(NodeType.Section);
	    Section endSection = (Section)endNode.GetAncestor(NodeType.Section);

	    int startIndex = startSection.ParentNode.IndexOf(startSection);
	    int endIndex = endSection.ParentNode.IndexOf(endSection);

	    if (startIndex == endIndex)
	    {
	        // Body.IndexOf only locates direct children of the body. An inline marker (a run, a field
	        // start, ...) is not a direct child, so compare the body-level ancestors of the markers
	        // rather than the markers themselves. The body ancestor is guaranteed by the check above.
	        Node startBlock = startNode;
	        while (startBlock.ParentNode.NodeType != NodeType.Body)
	            startBlock = startBlock.ParentNode;

	        Node endBlock = endNode;
	        while (endBlock.ParentNode.NodeType != NodeType.Body)
	            endBlock = endBlock.ParentNode;

	        // Markers sharing one block-level node pass this check; their relative order inside that
	        // block is not verified here.
	        if (startSection.Body.IndexOf(startBlock) > endSection.Body.IndexOf(endBlock))
	            throw new ArgumentException("The end node must be after the start node in the body");
	    }
	    else if (startIndex > endIndex)
	        throw new ArgumentException("The section of end node must be after the section start node");
	}
	/// <summary>
	/// Tells whether a node counts as inline: it has a paragraph or table ancestor and is not itself
	/// a paragraph or a table.
	/// </summary>
	/// <param name="node">Node to classify.</param>
	/// <returns><c>true</c> for inline nodes, <c>false</c> otherwise.</returns>
	private static bool IsInline(Node node)
	{
	    // Paragraphs and tables are never inline, even when nested under another paragraph or table
	    // (for example a paragraph inside a comment that itself sits in a paragraph).
	    if (node.NodeType == NodeType.Paragraph || node.NodeType == NodeType.Table)
	        return false;

	    return node.GetAncestor(NodeType.Paragraph) != null || node.GetAncestor(NodeType.Table) != null;
	}
	/// <summary>
	/// Handles the cloned block that contains a start or end marker: either adds the whole clone, or
	/// trims the clone's children before/after the marker and then adds what is left.
	/// </summary>
	/// <param name="cloneNode">Clone of the block-level node that holds the marker; trimmed in place.</param>
	/// <param name="nodes">Result list the clone is appended to.</param>
	/// <param name="node">The original marker node (from the source document, not the clone).</param>
	/// <param name="isInclusive">Whether the marker itself is kept.</param>
	/// <param name="isStartMarker">True when the marker is processed as the start marker.</param>
	/// <param name="isEndMarker">True when the block also holds the end marker.</param>
	private static void ProcessMarker(CompositeNode cloneNode, ArrayList nodes, Node node, bool isInclusive, bool isStartMarker, bool isEndMarker)
	{
	    // If we are dealing with a block level node just see if it should be included and add it to the list.
	    if (!IsInline(node))
	    {
	        // Don't add the node twice if the markers are the same node.
	        if (!(isStartMarker && isEndMarker))
	        {
	            if (isInclusive)
	                nodes.Add(cloneNode);
	        }
	        return;
	    }

	    // If a marker is a FieldStart node check if it's to be included or not.
	    // We assume for simplicity that the FieldStart and FieldEnd appear in the same paragraph.
	    if (node.NodeType == NodeType.FieldStart)
	    {
	        // If the marker is a start node and is not to be included then skip to the end of the field.
	        // If the marker is an end node and it is to be included then move to the end field so the field will not be removed.
	        if ((isStartMarker && !isInclusive) || (!isStartMarker && isInclusive))
	        {
	            while (node.NextSibling != null && node.NodeType != NodeType.FieldEnd)
	                node = node.NextSibling;
	        }
	    }

	    // If either marker is part of a comment then to include the comment itself we need to move
	    // the pointer forward to the Comment node found after the CommentRangeEnd node.
	    if (node.NodeType == NodeType.CommentRangeEnd)
	    {
	        while (node.NextSibling != null && node.NodeType != NodeType.Comment)
	            node = node.NextSibling;
	    }

	    // Find the corresponding node in our cloned node by index. If the start and end node are the
	    // same block, some child nodes might already have been removed from the clone; subtracting
	    // the count difference (zero when nothing was removed) yields the right index.
	    int indexDiff = node.ParentNode.ChildNodes.Count - cloneNode.ChildNodes.Count;
	    node = cloneNode.ChildNodes[node.ParentNode.IndexOf(node) - indexDiff];

	    // Remove the nodes up to/from the marker.
	    bool isProcessing = true;
	    bool isRemoving = isStartMarker;
	    Node nextNode = cloneNode.FirstChild;

	    while (isProcessing && nextNode != null)
	    {
	        Node currentNode = nextNode;
	        bool isSkip = false;

	        if (currentNode.Equals(node))
	        {
	            if (isStartMarker)
	            {
	                // Start marker reached: stop after this node; keep it only when inclusive.
	                isProcessing = false;
	                if (isInclusive)
	                    isRemoving = false;
	            }
	            else
	            {
	                // End marker reached: remove everything after it; keep the marker only when inclusive.
	                isRemoving = true;
	                if (isInclusive)
	                    isSkip = true;
	            }
	        }

	        // Advance before removing, because Remove() detaches the node from its siblings.
	        nextNode = nextNode.NextSibling;
	        if (isRemoving && !isSkip)
	            currentNode.Remove();
	    }

	    // After processing the composite node may have become empty. If it has, don't include it.
	    if (!(isStartMarker && isEndMarker))
	    {
	        if (cloneNode.HasChildNodes)
	            nodes.Add(cloneNode);
	    }
	}
	/// <summary>
	/// Creates a new document whose first section's body contains imported copies of the given nodes.
	/// </summary>
	/// <param name="srcDoc">Document the nodes originate from; used as the import source.</param>
	/// <param name="nodes">Nodes to import, appended in list order.</param>
	/// <returns>The newly created document.</returns>
	public static Document GenerateDocument(Document srcDoc, ArrayList nodes)
	{
	    // Start from a blank document and clear whatever its body initially contains.
	    Document result = new Document();
	    result.FirstSection.Body.RemoveAllChildren();

	    // One importer is shared by all nodes; the source formatting of each node is kept.
	    NodeImporter importer = new NodeImporter(srcDoc, result, ImportFormatMode.KeepSourceFormatting);

	    foreach (Node sourceNode in nodes)
	    {
	        // Deep-import the node into the new document, then attach it at the end of the body.
	        result.FirstSection.Body.AppendChild(importer.ImportNode(sourceNode, true));
	    }

	    return result;
	}