conholdate-gists/ExtractTextFromWord_Java.md

## ExtractTextFromWord_Java.md

      
    Raw
  

              ExtractTextFromWord_Java.md
            
          
    Learn how to extract text from Word documents using Java: https://blog.conholdate.com/2021/10/13/extract-text-from-word-documents-using-java/
This article covers the following topics:

Extract Text from Word Documents using Java
Extract Text from Document Pages using Java
Get Text From Specific Index using Java
Extract Formatted Text from DOCX using Java
Extract Text by Table of Contents using Java


## ExtractTextFromWord_Java_ExtractFormattedText.java
// Open the Word document; try-with-resources closes the parser when the block exits
try (Parser parser = new Parser("C:\\Files\\sample.docx")) {
    // Request the document text in HTML mode and read it through a TextReader
    try (TextReader reader = parser.getFormattedText(new FormattedTextOptions(FormattedTextMode.Html))) {
        // The reader is null when formatted text extraction isn't supported,
        // so print a notice instead of dereferencing it
        System.out.println(reader == null ? "Formatted text extraction isn't supported" : reader.readToEnd());
    }
}

## ExtractTextFromWord_Java_ExtractHighlight.java
// Create an instance of Parser class
try (Parser parser = new Parser("C:\\Files\\sample.docx")) {
    // Extract a highlight:
    HighlightItem hl = parser.getHighlight(0, true, new HighlightOptions(8));
    // Check if highlight extraction is supported
    if (hl == null) {
        System.out.println("Highlight extraction isn't supported");
        return;
    }
    // Print an extracted highlight
    System.out.println(String.format("At %d: %s", hl.getPosition(), hl.getText()));
}

## ExtractTextFromWord_Java_ExtractText.java
// Open the Word document; try-with-resources releases the parser once extraction finishes
// (previously the parser was created but never closed)
try (Parser parser = new Parser("C:\\Files\\sample.docx")) {
    // Extract the raw text into the reader
    try (TextReader reader = parser.getText()) {
        // The reader is null when text extraction isn't supported,
        // so print a notice instead of dereferencing it
        System.out.println(reader == null ? "Text extraction isn't supported" : reader.readToEnd());
    }
}

## ExtractTextFromWord_Java_ExtractTextFromPages.java
// Open the Word document; try-with-resources closes the parser on every exit path,
// including the early returns below (previously the parser was never closed)
try (Parser parser = new Parser("C:\\Files\\sample.docx")) {
    // Stop early when the document doesn't support text extraction
    if (!parser.getFeatures().isText()) {
        System.out.println("The document doesn't support text extraction.");
        return;
    }

    // Get the document info
    IDocumentInfo documentInfo = parser.getDocumentInfo();

    // Stop early when there are no pages to iterate over
    if (documentInfo.getPageCount() == 0) {
        System.out.println("The document has zero pages.");
        return;
    }

    // Iterate over pages; p is the zero-based index passed to getText
    for (int p = 0; p < documentInfo.getPageCount(); p++) {
        // Print a one-based page number alongside the total page count
        System.out.println(String.format("Page number: %d/%d", p + 1, documentInfo.getPageCount()));
        // Extract the text of the current page into the reader
        try (TextReader reader = parser.getText(p)) {
            // No null check here: text extraction support was verified above
            System.out.println(reader.readToEnd());
        }
    }
}

## ExtractTextFromWord_Java_ExtractTOC.java
// Open the Word document; the parser is closed automatically when the try block exits
try (Parser parser = new Parser("C:\\Files\\sampleTOC.docx")) {
    // Fetch the table of contents; null means TOC extraction isn't supported
    Iterable<TocItem> contents = parser.getToc();
    if (contents != null) {
        // Walk the TOC entries and print the text each one extracts
        for (TocItem entry : contents) {
            try (TextReader chapterReader = entry.extractText()) {
                // Separator line printed before each entry's text
                System.out.println("----");
                System.out.println(chapterReader.readToEnd());
            }
        }
    } else {
        System.out.println("Table of contents extraction isn't supported");
    }
}
	// Open the Word document; try-with-resources closes the parser when the block exits
	try (Parser parser = new Parser("C:\\Files\\sample.docx")) {
		// Request the document text in HTML mode and read it through a TextReader
		try (TextReader reader = parser.getFormattedText(new FormattedTextOptions(FormattedTextMode.Html))) {
			// The reader is null when formatted text extraction isn't supported,
			// so print a notice instead of dereferencing it
			System.out.println(reader == null ? "Formatted text extraction isn't supported" : reader.readToEnd());
		}
	}
	// Create an instance of Parser class
	try (Parser parser = new Parser("C:\\Files\\sample.docx")) {
	// Extract a highlight:
	HighlightItem hl = parser.getHighlight(0, true, new HighlightOptions(8));
	// Check if highlight extraction is supported
	if (hl == null) {
	System.out.println("Highlight extraction isn't supported");
	return;
	}
	// Print an extracted highlight
	System.out.println(String.format("At %d: %s", hl.getPosition(), hl.getText()));
	}
	// Open the Word document; try-with-resources releases the parser once extraction finishes
	// (previously the parser was created but never closed)
	try (Parser parser = new Parser("C:\\Files\\sample.docx")) {
		// Extract the raw text into the reader
		try (TextReader reader = parser.getText()) {
			// The reader is null when text extraction isn't supported,
			// so print a notice instead of dereferencing it
			System.out.println(reader == null ? "Text extraction isn't supported" : reader.readToEnd());
		}
	}
	// Open the Word document; try-with-resources closes the parser on every exit path,
	// including the early returns below (previously the parser was never closed)
	try (Parser parser = new Parser("C:\\Files\\sample.docx")) {
		// Stop early when the document doesn't support text extraction
		if (!parser.getFeatures().isText()) {
			System.out.println("The document doesn't support text extraction.");
			return;
		}

		// Get the document info
		IDocumentInfo documentInfo = parser.getDocumentInfo();

		// Stop early when there are no pages to iterate over
		if (documentInfo.getPageCount() == 0) {
			System.out.println("The document has zero pages.");
			return;
		}

		// Iterate over pages; p is the zero-based index passed to getText
		for (int p = 0; p < documentInfo.getPageCount(); p++) {
			// Print a one-based page number alongside the total page count
			System.out.println(String.format("Page number: %d/%d", p + 1, documentInfo.getPageCount()));
			// Extract the text of the current page into the reader
			try (TextReader reader = parser.getText(p)) {
				// No null check here: text extraction support was verified above
				System.out.println(reader.readToEnd());
			}
		}
	}
	// Open the Word document; the parser is closed automatically when the try block exits
	try (Parser parser = new Parser("C:\\Files\\sampleTOC.docx")) {
		// Fetch the table of contents; null means TOC extraction isn't supported
		Iterable<TocItem> contents = parser.getToc();
		if (contents != null) {
			// Walk the TOC entries and print the text each one extracts
			for (TocItem entry : contents) {
				try (TextReader chapterReader = entry.extractText()) {
					// Separator line printed before each entry's text
					System.out.println("----");
					System.out.println(chapterReader.readToEnd());
				}
			}
		} else {
			System.out.println("Table of contents extraction isn't supported");
		}
	}