aspose-com-gists/extract-text-nodes.py

## readme.md

      
    Raw
  

              readme.md
            
          
    Learn how to extract text from Word documents in Python: https://blog.aspose.com/2021/11/25/extract-text-from-word-docx-in-python/

  
## extract-text-nodes.py
# Example: extract the content between a paragraph and a table of the last
# section (markers included) and save it as a new document.
# NOTE(review): relies on `aw` (presumably `import aspose.words as aw`) and on
# extract_content / generate_document being in scope — confirm the imports.

# Load the source document.
doc = aw.Document("Extract content.docx")

# Markers: the paragraph at index 2 and the table at index 0 of the last section.
start_para = doc.last_section.get_child(aw.NodeType.PARAGRAPH, 2, True).as_paragraph()
end_table = doc.last_section.get_child(aw.NodeType.TABLE, 0, True).as_table()

# Extract the content between these nodes. Passing True keeps the markers in the result.
extracted_nodes = extract_content(start_para, end_table, True)

# Generate a document containing the extracted content. The list must be passed
# under the name it was assigned to above (a mismatched name raises NameError).
dst_doc = generate_document(doc, extracted_nodes)

# Save the resulting document.
dst_doc.save("extract_content_between_nodes.docx")

## extract-text-paragraphs-styles.py
# Example: extract the content lying between the first "Heading 1" paragraph
# and the first "Heading 3" paragraph, leaving both headings out of the result.

# Open the source document.
doc = aw.Document("Extract content.docx")

# Collect every paragraph carrying each of the two heading styles.
heading1_paragraphs = paragraphs_by_style_name(doc, "Heading 1")
heading3_paragraphs = paragraphs_by_style_name(doc, "Heading 3")

# The first match of each style serves as a marker.
start_marker = heading1_paragraphs[0]
end_marker = heading3_paragraphs[0]

# Pull out what sits between the markers. False excludes the markers themselves.
extracted = extract_content(start_marker, end_marker, False)

# Assemble the extracted nodes into a new document and write it out.
result_doc = generate_document(doc, extracted)
result_doc.save("extract_content_between_paragraphs_based_on-Styles.docx")

## extract-text-paragraphs.py
# Example: extract the content from the paragraph at index 6 through the
# paragraph at index 10 of the first section's body, markers included.

# Open the source document.
doc = aw.Document("Extract content.docx")

# Both markers are looked up in the body of the first section.
first_body = doc.first_section.body
start_marker = first_body.get_child(aw.NodeType.PARAGRAPH, 6, True).as_paragraph()
end_marker = first_body.get_child(aw.NodeType.PARAGRAPH, 10, True).as_paragraph()

# Pull out what sits between the markers. True keeps the markers in the result.
extracted = extract_content(start_marker, end_marker, True)

# Assemble the extracted nodes into a new document and write it out.
result_doc = generate_document(doc, extracted)
result_doc.save("extract_content_between_paragraphs.docx")

## extract-text.py
def extract_content(startNode : aw.Node, endNode : aw.Node, isInclusive : bool):
    """Collect copies of the content lying between two marker nodes.

    The walk happens over the block-level children of the body (the nodes
    returned by get_ancestor_in_body). Every block between the markers is
    cloned; the first and last blocks are trimmed by process_marker when the
    markers are nested inside them.

    Args:
        startNode: Node where extraction begins.
        endNode: Node where extraction ends.
        isInclusive: When true, the marker nodes are kept in the result.

    Returns:
        A list of cloned nodes in the order they were visited.

    Raises:
        ValueError: If verify_parameter_nodes rejects the markers, or if the
            document runs out of content before the end node is reached.
    """
    # First, check that the nodes passed to this function are valid for use.
    verify_parameter_nodes(startNode, endNode)

    # Clones of the extracted nodes are accumulated here.
    nodes = []

    # If the end marker is a CommentRangeEnd and the markers are included, move the
    # end marker forward to the Comment node found after it (when there is one).
    if endNode.node_type == aw.NodeType.COMMENT_RANGE_END and isInclusive:
        comment = find_next_node(aw.NodeType.COMMENT, endNode.next_sibling)
        if comment is not None:
            endNode = comment

    # Keep the markers as passed in; process_marker needs them to split the
    # first and last blocks when a marker is nested inside a block.
    original_start = startNode
    original_end = endNode

    # Extraction works on the direct children of the body, so climb to them.
    startNode = get_ancestor_in_body(startNode)
    endNode = get_ancestor_in_body(endNode)

    is_extracting = True
    is_starting_node = True
    # The block currently being copied.
    curr_node = startNode

    while is_extracting:
        # Work on a deep copy so the source document is left untouched.
        clone_node = curr_node.clone(True)
        is_ending_node = curr_node == endNode

        if is_starting_node or is_ending_node:
            # The end marker is processed first: trimming for the start marker
            # removes leading children and would shift the indexes used to
            # locate the end marker inside the clone.
            if is_ending_node:
                # `not is_starting_node`: do not add the block twice when both
                # markers live in the same block.
                process_marker(clone_node, nodes, original_end, curr_node,
                               isInclusive, False, not is_starting_node, False)
                is_extracting = False

            # A separate `if` (not `elif`): the start and end block may be the same node.
            if is_starting_node:
                process_marker(clone_node, nodes, original_start, curr_node,
                               isInclusive, True, True, False)
                is_starting_node = False
        else:
            # A block strictly between the markers is copied whole.
            nodes.append(clone_node)

        if curr_node.next_sibling is None and is_extracting:
            # The current body is exhausted: continue with the first block of
            # the next section, skipping sections whose body has no children.
            section = curr_node.get_ancestor(aw.NodeType.SECTION).next_sibling
            curr_node = None
            while section is not None and curr_node is None:
                curr_node = section.as_section().body.first_child
                section = section.next_sibling
            if curr_node is None:
                raise ValueError("Reached the end of the document before the end node was found")
        else:
            # Move to the next block in the same body.
            curr_node = curr_node.next_sibling

    # When the end marker is itself a non-composite, block-level node and the
    # markers are included, the following paragraph is appended as well.
    if isInclusive and original_end == endNode and not original_end.is_composite:
        include_next_paragraph(endNode, nodes)

    return nodes

## text-extraction-helpers.py
def verify_parameter_nodes(start_node: aw.Node, end_node: aw.Node):
    """Validate a pair of extraction markers.

    Args:
        start_node: Marker where extraction begins.
        end_node: Marker where extraction ends.

    Raises:
        ValueError: If either node is None, the nodes belong to different
            documents, either node has no Body ancestor, or the end node
            comes before the start node.
    """
    # The None checks run first because every later check dereferences the nodes.
    if start_node is None:
        raise ValueError("Start node cannot be None")
    if end_node is None:
        raise ValueError("End node cannot be None")

    if start_node.document != end_node.document:
        raise ValueError("Start node and end node must belong to the same document")

    both_in_body = (start_node.get_ancestor(aw.NodeType.BODY) is not None
                    and end_node.get_ancestor(aw.NodeType.BODY) is not None)
    if not both_in_body:
        raise ValueError("Start node and end node must be a child or descendant of a body")

    # Ordering check: compare the positions of the owning sections first and,
    # only when both markers share a section, the positions inside its body.
    start_section = start_node.get_ancestor(aw.NodeType.SECTION).as_section()
    end_section = end_node.get_ancestor(aw.NodeType.SECTION).as_section()

    start_section_pos = start_section.parent_node.index_of(start_section)
    end_section_pos = end_section.parent_node.index_of(end_section)

    if start_section_pos > end_section_pos:
        raise ValueError("The section of end node must be after the section start node")

    if start_section_pos == end_section_pos:
        start_block_pos = start_section.body.index_of(get_ancestor_in_body(start_node))
        end_block_pos = end_section.body.index_of(get_ancestor_in_body(end_node))
        if start_block_pos > end_block_pos:
            raise ValueError("The end node must be after the start node in the body")


def find_next_node(node_type: aw.NodeType, from_node: aw.Node):
    """Find the first node of a given type at or after *from_node*.

    The search visits *from_node*, its descendants, and then each following
    sibling with its descendants, in that order.

    Args:
        node_type: The node type to look for.
        from_node: Where the search starts; may be None.

    Returns:
        The first matching node, or None when there is no match (or when
        *from_node* is None).
    """
    # Siblings are walked with a loop rather than recursion so that a long run
    # of siblings cannot exhaust the recursion limit; recursion depth is now
    # bounded by how deeply the nodes are nested.
    current = from_node
    while current is not None:
        if current.node_type == node_type:
            return current

        if current.is_composite:
            # Search the subtree below this node before moving on.
            found = find_next_node(node_type, current.as_composite_node().first_child)
            if found is not None:
                return found

        current = current.next_sibling

    return None


def is_inline(node: aw.Node):
    """Tell whether *node* lies inside a paragraph or table without being one.

    Returns True only when the node has a Paragraph or Table ancestor and its
    own type is neither Paragraph nor Table.
    """
    has_block_ancestor = (node.get_ancestor(aw.NodeType.PARAGRAPH) is not None
                          or node.get_ancestor(aw.NodeType.TABLE) is not None)
    if not has_block_ancestor:
        return False

    # A paragraph or table can itself be nested inside another paragraph or
    # table; such a node is still a block and therefore not inline.
    is_block = node.node_type == aw.NodeType.PARAGRAPH or node.node_type == aw.NodeType.TABLE
    return not is_block


def process_marker(clone_node: aw.Node, nodes, node: aw.Node, block_level_ancestor: aw.Node,
    is_inclusive: bool, is_start_marker: bool, can_add: bool, force_add: bool):
    """Trim a cloned block around a marker and append it to *nodes*.

    Args:
        clone_node: Deep copy of *block_level_ancestor*; it is modified in place.
        nodes: List that receives the processed clone.
        node: The marker node in the source document.
        block_level_ancestor: The body-level node that is, or contains, *node*.
        is_inclusive: Whether the marker itself is kept.
        is_start_marker: True for the start marker, False for the end marker.
        can_add: When false, the clone is trimmed but never appended.
        force_add: Append the clone even if trimming left it without children.
    """
    # The marker is the block itself: nothing to trim, just decide whether to add it.
    if node == block_level_ancestor:
        if can_add and is_inclusive:
            nodes.append(clone_node)
        return

    # The marker is nested inside the block, so the block (and therefore its
    # clone) must be a composite node.
    assert clone_node.is_composite

    # FieldStart markers: the FieldStart and FieldEnd are assumed, for
    # simplicity, to appear in the same paragraph.
    if node.node_type == aw.NodeType.FIELD_START:
        # An excluded start marker skips to the end of the field; an included
        # end marker moves to the end of the field so the field is not cut.
        if (is_start_marker and not is_inclusive) or (not is_start_marker and is_inclusive):
            while node.next_sibling is not None and node.node_type != aw.NodeType.FIELD_END:
                node = node.next_sibling

    # The chain from the marker up to (excluding) the block; it has more than
    # one entry when the marker sits several levels below the block.
    node_branch = fill_self_and_parents(node, block_level_ancestor)

    # Walk the chain from the outermost entry down to the marker, locating the
    # corresponding node in the clone by child index. The step of -1 is
    # required: without it the descending range is empty and nothing is trimmed.
    current_clone_node = clone_node
    for depth in range(len(node_branch) - 1, -1, -1):

        current_node = node_branch[depth]
        node_index = current_node.parent_node.index_of(current_node)
        clone_children = current_clone_node.as_composite_node().get_child_nodes(aw.NodeType.ANY, False)
        current_clone_node = clone_children[node_index]

        # Ancestors of the marker (depth > 0) are always kept; only the marker
        # itself (depth == 0) honours is_inclusive.
        remove_nodes_outside_of_range(current_clone_node, is_inclusive or (depth > 0), is_start_marker)

    # Trimming may leave the clone without children; add it then only when forced.
    if can_add and (force_add or clone_node.as_composite_node().has_child_nodes):
        nodes.append(clone_node)


def remove_nodes_outside_of_range(marker_node: aw.Node, is_inclusive: bool, is_start_marker: bool):
    """Remove the siblings of *marker_node* that fall outside the extracted range.

    For a start marker, the siblings before the marker are removed and the walk
    stops at the marker. For an end marker, the siblings after the marker are
    removed. The marker itself is removed unless *is_inclusive* is true.
    """
    sibling = marker_node.parent_node.first_child
    # A start marker removes from the first sibling on; an end marker starts
    # removing only once the marker has been reached.
    removing = is_start_marker

    while sibling is not None:
        # Fetch the successor up front so the walk does not depend on a node
        # that may have just been removed.
        successor = sibling.next_sibling

        if sibling == marker_node:
            if not is_inclusive:
                sibling.remove()
            if is_start_marker:
                # Everything after a start marker is kept.
                return
            removing = True
        elif removing:
            sibling.remove()

        sibling = successor


def fill_self_and_parents(node: aw.Node, till_node: aw.Node):
    """Return *node* and its ancestors up to, but excluding, *till_node*.

    The list starts with *node* itself and ends with the direct child of
    *till_node*; it is empty when *node* equals *till_node*.
    """
    branch = []

    while node != till_node:
        branch.append(node)
        # Climb one level towards till_node.
        node = node.parent_node

    return branch


def include_next_paragraph(node: aw.Node, nodes):
    """Append the paragraph that follows *node* to *nodes*, if there is one.

    Args:
        node: The node after which the search for a paragraph starts.
        nodes: List that receives the processed clone of the paragraph's
            body-level ancestor.
    """
    next_paragraph = find_next_node(aw.NodeType.PARAGRAPH, node.next_sibling)
    # Check for None before converting: calling as_paragraph() on a missing
    # result would fail when nothing follows the node.
    if next_paragraph is None:
        return

    paragraph = next_paragraph.as_paragraph()

    # Use the first child as the marker when there is one; a paragraph without
    # content serves as its own marker.
    marker_node = paragraph.first_child if paragraph.has_child_nodes else paragraph
    root_node = get_ancestor_in_body(paragraph)

    # The marker is inclusive only when it is the paragraph itself; force_add
    # keeps the clone even if trimming leaves it without children.
    process_marker(root_node.clone(True), nodes, marker_node, root_node,
        marker_node == paragraph, False, True, True)


def get_ancestor_in_body(start_node: aw.Node):
    """Climb from *start_node* to the node whose parent is a Body.

    Returns *start_node* itself when its parent is already a Body.
    """
    candidate = start_node
    while True:
        parent = candidate.parent_node
        if parent.node_type == aw.NodeType.BODY:
            return candidate
        candidate = parent
def generate_document(src_doc: aw.Document, nodes):
    """Build a new document whose body holds imported copies of *nodes*.

    Args:
        src_doc: The document the nodes come from.
        nodes: The nodes to import, in the order they should appear.

    Returns:
        The newly created document.
    """
    dst_doc = aw.Document()

    # Start from an empty body: drop the children a new document comes with.
    target_body = dst_doc.first_section.body
    target_body.remove_all_children()

    # The importer carries nodes across documents, keeping the source formatting.
    importer = aw.NodeImporter(src_doc, dst_doc, aw.ImportFormatMode.KEEP_SOURCE_FORMATTING)

    for source_node in nodes:
        target_body.append_child(importer.import_node(source_node, True))

    return dst_doc


def paragraphs_by_style_name(doc: aw.Document, style_name: str):
    """Return the paragraphs of *doc* whose style is named *style_name*.

    Args:
        doc: The document to search.
        style_name: Exact style name to match.

    Returns:
        A list of matching paragraphs, in the order the document yields them.
    """
    candidates = (node.as_paragraph() for node in doc.get_child_nodes(aw.NodeType.PARAGRAPH, True))
    return [para for para in candidates if para.paragraph_format.style.name == style_name]
	# Example: extract the content between a paragraph and a table of the last
	# section (markers included) and save it as a new document.
	# NOTE(review): relies on `aw` (presumably `import aspose.words as aw`) and on
	# extract_content / generate_document being in scope — confirm the imports.

	# Load the source document.
	doc = aw.Document("Extract content.docx")

	# Markers: the paragraph at index 2 and the table at index 0 of the last section.
	start_para = doc.last_section.get_child(aw.NodeType.PARAGRAPH, 2, True).as_paragraph()
	end_table = doc.last_section.get_child(aw.NodeType.TABLE, 0, True).as_table()

	# Extract the content between these nodes. Passing True keeps the markers in the result.
	extracted_nodes = extract_content(start_para, end_table, True)

	# Generate a document containing the extracted content. The list must be passed
	# under the name it was assigned to above (a mismatched name raises NameError).
	dst_doc = generate_document(doc, extracted_nodes)

	# Save the resulting document.
	dst_doc.save("extract_content_between_nodes.docx")
	# Example: extract the content lying between the first "Heading 1" paragraph
	# and the first "Heading 3" paragraph, leaving both headings out of the result.

	# Open the source document.
	doc = aw.Document("Extract content.docx")

	# Collect every paragraph carrying each of the two heading styles.
	heading1_paragraphs = paragraphs_by_style_name(doc, "Heading 1")
	heading3_paragraphs = paragraphs_by_style_name(doc, "Heading 3")

	# The first match of each style serves as a marker.
	start_marker = heading1_paragraphs[0]
	end_marker = heading3_paragraphs[0]

	# Pull out what sits between the markers. False excludes the markers themselves.
	extracted = extract_content(start_marker, end_marker, False)

	# Assemble the extracted nodes into a new document and write it out.
	result_doc = generate_document(doc, extracted)
	result_doc.save("extract_content_between_paragraphs_based_on-Styles.docx")
	# Example: extract the content from the paragraph at index 6 through the
	# paragraph at index 10 of the first section's body, markers included.

	# Open the source document.
	doc = aw.Document("Extract content.docx")

	# Both markers are looked up in the body of the first section.
	first_body = doc.first_section.body
	start_marker = first_body.get_child(aw.NodeType.PARAGRAPH, 6, True).as_paragraph()
	end_marker = first_body.get_child(aw.NodeType.PARAGRAPH, 10, True).as_paragraph()

	# Pull out what sits between the markers. True keeps the markers in the result.
	extracted = extract_content(start_marker, end_marker, True)

	# Assemble the extracted nodes into a new document and write it out.
	result_doc = generate_document(doc, extracted)
	result_doc.save("extract_content_between_paragraphs.docx")
	def extract_content(startNode : aw.Node, endNode : aw.Node, isInclusive : bool):
		"""Collect copies of the content lying between two marker nodes.

		The walk happens over the block-level children of the body (the nodes
		returned by get_ancestor_in_body). Every block between the markers is
		cloned; the first and last blocks are trimmed by process_marker when the
		markers are nested inside them.

		Args:
			startNode: Node where extraction begins.
			endNode: Node where extraction ends.
			isInclusive: When true, the marker nodes are kept in the result.

		Returns:
			A list of cloned nodes in the order they were visited.

		Raises:
			ValueError: If verify_parameter_nodes rejects the markers, or if the
				document runs out of content before the end node is reached.
		"""
		# First, check that the nodes passed to this function are valid for use.
		verify_parameter_nodes(startNode, endNode)

		# Clones of the extracted nodes are accumulated here.
		nodes = []

		# If the end marker is a CommentRangeEnd and the markers are included, move the
		# end marker forward to the Comment node found after it (when there is one).
		if endNode.node_type == aw.NodeType.COMMENT_RANGE_END and isInclusive:
			comment = find_next_node(aw.NodeType.COMMENT, endNode.next_sibling)
			if comment is not None:
				endNode = comment

		# Keep the markers as passed in; process_marker needs them to split the
		# first and last blocks when a marker is nested inside a block.
		original_start = startNode
		original_end = endNode

		# Extraction works on the direct children of the body, so climb to them.
		startNode = get_ancestor_in_body(startNode)
		endNode = get_ancestor_in_body(endNode)

		is_extracting = True
		is_starting_node = True
		# The block currently being copied.
		curr_node = startNode

		while is_extracting:
			# Work on a deep copy so the source document is left untouched.
			clone_node = curr_node.clone(True)
			is_ending_node = curr_node == endNode

			if is_starting_node or is_ending_node:
				# The end marker is processed first: trimming for the start marker
				# removes leading children and would shift the indexes used to
				# locate the end marker inside the clone.
				if is_ending_node:
					# `not is_starting_node`: do not add the block twice when both
					# markers live in the same block.
					process_marker(clone_node, nodes, original_end, curr_node,
						isInclusive, False, not is_starting_node, False)
					is_extracting = False

				# A separate `if` (not `elif`): the start and end block may be the same node.
				if is_starting_node:
					process_marker(clone_node, nodes, original_start, curr_node,
						isInclusive, True, True, False)
					is_starting_node = False
			else:
				# A block strictly between the markers is copied whole.
				nodes.append(clone_node)

			if curr_node.next_sibling is None and is_extracting:
				# The current body is exhausted: continue with the first block of
				# the next section, skipping sections whose body has no children.
				section = curr_node.get_ancestor(aw.NodeType.SECTION).next_sibling
				curr_node = None
				while section is not None and curr_node is None:
					curr_node = section.as_section().body.first_child
					section = section.next_sibling
				if curr_node is None:
					raise ValueError("Reached the end of the document before the end node was found")
			else:
				# Move to the next block in the same body.
				curr_node = curr_node.next_sibling

		# When the end marker is itself a non-composite, block-level node and the
		# markers are included, the following paragraph is appended as well.
		if isInclusive and original_end == endNode and not original_end.is_composite:
			include_next_paragraph(endNode, nodes)

		return nodes
	def verify_parameter_nodes(start_node: aw.Node, end_node: aw.Node):
		"""Validate a pair of extraction markers.

		Args:
			start_node: Marker where extraction begins.
			end_node: Marker where extraction ends.

		Raises:
			ValueError: If either node is None, the nodes belong to different
				documents, either node has no Body ancestor, or the end node
				comes before the start node.
		"""
		# The None checks run first because every later check dereferences the nodes.
		if start_node is None:
			raise ValueError("Start node cannot be None")
		if end_node is None:
			raise ValueError("End node cannot be None")

		if start_node.document != end_node.document:
			raise ValueError("Start node and end node must belong to the same document")

		both_in_body = (start_node.get_ancestor(aw.NodeType.BODY) is not None
			and end_node.get_ancestor(aw.NodeType.BODY) is not None)
		if not both_in_body:
			raise ValueError("Start node and end node must be a child or descendant of a body")

		# Ordering check: compare the positions of the owning sections first and,
		# only when both markers share a section, the positions inside its body.
		start_section = start_node.get_ancestor(aw.NodeType.SECTION).as_section()
		end_section = end_node.get_ancestor(aw.NodeType.SECTION).as_section()

		start_section_pos = start_section.parent_node.index_of(start_section)
		end_section_pos = end_section.parent_node.index_of(end_section)

		if start_section_pos > end_section_pos:
			raise ValueError("The section of end node must be after the section start node")

		if start_section_pos == end_section_pos:
			start_block_pos = start_section.body.index_of(get_ancestor_in_body(start_node))
			end_block_pos = end_section.body.index_of(get_ancestor_in_body(end_node))
			if start_block_pos > end_block_pos:
				raise ValueError("The end node must be after the start node in the body")


	def find_next_node(node_type: aw.NodeType, from_node: aw.Node):
		"""Find the first node of a given type at or after *from_node*.

		The search visits *from_node*, its descendants, and then each following
		sibling with its descendants, in that order.

		Args:
			node_type: The node type to look for.
			from_node: Where the search starts; may be None.

		Returns:
			The first matching node, or None when there is no match (or when
			*from_node* is None).
		"""
		# Siblings are walked with a loop rather than recursion so that a long run
		# of siblings cannot exhaust the recursion limit; recursion depth is now
		# bounded by how deeply the nodes are nested.
		current = from_node
		while current is not None:
			if current.node_type == node_type:
				return current

			if current.is_composite:
				# Search the subtree below this node before moving on.
				found = find_next_node(node_type, current.as_composite_node().first_child)
				if found is not None:
					return found

			current = current.next_sibling

		return None


	def is_inline(node: aw.Node):
		"""Tell whether *node* lies inside a paragraph or table without being one.

		Returns True only when the node has a Paragraph or Table ancestor and its
		own type is neither Paragraph nor Table.
		"""
		has_block_ancestor = (node.get_ancestor(aw.NodeType.PARAGRAPH) is not None
			or node.get_ancestor(aw.NodeType.TABLE) is not None)
		if not has_block_ancestor:
			return False

		# A paragraph or table can itself be nested inside another paragraph or
		# table; such a node is still a block and therefore not inline.
		is_block = node.node_type == aw.NodeType.PARAGRAPH or node.node_type == aw.NodeType.TABLE
		return not is_block


	def process_marker(clone_node: aw.Node, nodes, node: aw.Node, block_level_ancestor: aw.Node,
		is_inclusive: bool, is_start_marker: bool, can_add: bool, force_add: bool):
		"""Trim a cloned block around a marker and append it to *nodes*.

		Args:
			clone_node: Deep copy of *block_level_ancestor*; it is modified in place.
			nodes: List that receives the processed clone.
			node: The marker node in the source document.
			block_level_ancestor: The body-level node that is, or contains, *node*.
			is_inclusive: Whether the marker itself is kept.
			is_start_marker: True for the start marker, False for the end marker.
			can_add: When false, the clone is trimmed but never appended.
			force_add: Append the clone even if trimming left it without children.
		"""
		# The marker is the block itself: nothing to trim, just decide whether to add it.
		if node == block_level_ancestor:
			if can_add and is_inclusive:
				nodes.append(clone_node)
			return

		# The marker is nested inside the block, so the block (and therefore its
		# clone) must be a composite node.
		assert clone_node.is_composite

		# FieldStart markers: the FieldStart and FieldEnd are assumed, for
		# simplicity, to appear in the same paragraph.
		if node.node_type == aw.NodeType.FIELD_START:
			# An excluded start marker skips to the end of the field; an included
			# end marker moves to the end of the field so the field is not cut.
			if (is_start_marker and not is_inclusive) or (not is_start_marker and is_inclusive):
				while node.next_sibling is not None and node.node_type != aw.NodeType.FIELD_END:
					node = node.next_sibling

		# The chain from the marker up to (excluding) the block; it has more than
		# one entry when the marker sits several levels below the block.
		node_branch = fill_self_and_parents(node, block_level_ancestor)

		# Walk the chain from the outermost entry down to the marker, locating the
		# corresponding node in the clone by child index. The step of -1 is
		# required: without it the descending range is empty and nothing is trimmed.
		current_clone_node = clone_node
		for depth in range(len(node_branch) - 1, -1, -1):

			current_node = node_branch[depth]
			node_index = current_node.parent_node.index_of(current_node)
			clone_children = current_clone_node.as_composite_node().get_child_nodes(aw.NodeType.ANY, False)
			current_clone_node = clone_children[node_index]

			# Ancestors of the marker (depth > 0) are always kept; only the marker
			# itself (depth == 0) honours is_inclusive.
			remove_nodes_outside_of_range(current_clone_node, is_inclusive or (depth > 0), is_start_marker)

		# Trimming may leave the clone without children; add it then only when forced.
		if can_add and (force_add or clone_node.as_composite_node().has_child_nodes):
			nodes.append(clone_node)


	def remove_nodes_outside_of_range(marker_node: aw.Node, is_inclusive: bool, is_start_marker: bool):
		"""Remove the siblings of *marker_node* that fall outside the extracted range.

		For a start marker, the siblings before the marker are removed and the walk
		stops at the marker. For an end marker, the siblings after the marker are
		removed. The marker itself is removed unless *is_inclusive* is true.
		"""
		sibling = marker_node.parent_node.first_child
		# A start marker removes from the first sibling on; an end marker starts
		# removing only once the marker has been reached.
		removing = is_start_marker

		while sibling is not None:
			# Fetch the successor up front so the walk does not depend on a node
			# that may have just been removed.
			successor = sibling.next_sibling

			if sibling == marker_node:
				if not is_inclusive:
					sibling.remove()
				if is_start_marker:
					# Everything after a start marker is kept.
					return
				removing = True
			elif removing:
				sibling.remove()

			sibling = successor


	def fill_self_and_parents(node: aw.Node, till_node: aw.Node):
		"""Return *node* and its ancestors up to, but excluding, *till_node*.

		The list starts with *node* itself and ends with the direct child of
		*till_node*; it is empty when *node* equals *till_node*.
		"""
		branch = []

		while node != till_node:
			branch.append(node)
			# Climb one level towards till_node.
			node = node.parent_node

		return branch


	def include_next_paragraph(node: aw.Node, nodes):
		"""Append the paragraph that follows *node* to *nodes*, if there is one.

		Args:
			node: The node after which the search for a paragraph starts.
			nodes: List that receives the processed clone of the paragraph's
				body-level ancestor.
		"""
		next_paragraph = find_next_node(aw.NodeType.PARAGRAPH, node.next_sibling)
		# Check for None before converting: calling as_paragraph() on a missing
		# result would fail when nothing follows the node.
		if next_paragraph is None:
			return

		paragraph = next_paragraph.as_paragraph()

		# Use the first child as the marker when there is one; a paragraph without
		# content serves as its own marker.
		marker_node = paragraph.first_child if paragraph.has_child_nodes else paragraph
		root_node = get_ancestor_in_body(paragraph)

		# The marker is inclusive only when it is the paragraph itself; force_add
		# keeps the clone even if trimming leaves it without children.
		process_marker(root_node.clone(True), nodes, marker_node, root_node,
			marker_node == paragraph, False, True, True)


	def get_ancestor_in_body(start_node: aw.Node):
		"""Climb from *start_node* to the node whose parent is a Body.

		Returns *start_node* itself when its parent is already a Body.
		"""
		candidate = start_node
		while True:
			parent = candidate.parent_node
			if parent.node_type == aw.NodeType.BODY:
				return candidate
			candidate = parent
	def generate_document(src_doc: aw.Document, nodes):
		"""Build a new document whose body holds imported copies of *nodes*.

		Args:
			src_doc: The document the nodes come from.
			nodes: The nodes to import, in the order they should appear.

		Returns:
			The newly created document.
		"""
		dst_doc = aw.Document()

		# Start from an empty body: drop the children a new document comes with.
		target_body = dst_doc.first_section.body
		target_body.remove_all_children()

		# The importer carries nodes across documents, keeping the source formatting.
		importer = aw.NodeImporter(src_doc, dst_doc, aw.ImportFormatMode.KEEP_SOURCE_FORMATTING)

		for source_node in nodes:
			target_body.append_child(importer.import_node(source_node, True))

		return dst_doc


	def paragraphs_by_style_name(doc: aw.Document, style_name: str):
		"""Return the paragraphs of *doc* whose style is named *style_name*.

		Args:
			doc: The document to search.
			style_name: Exact style name to match.

		Returns:
			A list of matching paragraphs, in the order the document yields them.
		"""
		candidates = (node.as_paragraph() for node in doc.get_child_nodes(aw.NodeType.PARAGRAPH, True))
		return [para for para in candidates if para.paragraph_format.style.name == style_name]