Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extract Text from DOC or DOCX using C#
Extract Text from DOC or DOCX using C#
1. Extract Text from DOCX using C#
2. Get Formatted Text from DOCX using C#
3. Extract Formatted Text from Pages using C#
// Extracts the whole document as HTML-formatted text and prints it to the console.
// The Parser is created inside a using statement so that it is disposed
// deterministically once extraction is finished, even if an exception is thrown.
using (Parser parser = new Parser(@"C:\Files\sample.docx"))
{
    // Extract a formatted text into the reader.
    // A using statement accepts a null resource, so no extra guard is needed here.
    using (TextReader reader = parser.GetFormattedText(new FormattedTextOptions(FormattedTextMode.Html)))
    {
        // Print a formatted text from the document.
        // If formatted text extraction isn't supported, the reader is null.
        Console.WriteLine(reader == null ? "Formatted text extraction isn't suppported" : reader.ReadToEnd());
    }
}
// Prints every page of the document as HTML-formatted text, one page at a time.
// The parser is disposed automatically when the using block is left.
using (Parser parser = new Parser(@"C:\Files\sample.docx"))
{
    // Bail out early when the document cannot provide formatted text.
    bool formattedTextSupported = parser.Features.FormattedText;
    if (!formattedTextSupported)
    {
        Console.WriteLine("Document isn't supports formatted text extraction.");
        return;
    }

    // Document information is needed for the page count.
    IDocumentInfo documentInfo = parser.GetDocumentInfo();

    // Nothing to print for a document without pages.
    if (documentInfo.PageCount == 0)
    {
        Console.WriteLine("Document hasn't pages.");
        return;
    }

    // Walk the pages using a 1-based page number; the API expects a 0-based index.
    for (int pageNumber = 1; pageNumber <= documentInfo.PageCount; pageNumber++)
    {
        // Header line: current page number out of the total.
        Console.WriteLine("Page {0}/{1}", pageNumber, documentInfo.PageCount);

        // Request the page content as HTML.
        FormattedTextOptions htmlOptions = new FormattedTextOptions(FormattedTextMode.Html);
        using (TextReader pageReader = parser.GetFormattedText(pageNumber - 1, htmlOptions))
        {
            // No null check here: formatted text support was verified above.
            string pageHtml = pageReader.ReadToEnd();
            Console.WriteLine(pageHtml);
        }
    }
}
// Extracts the plain text of the document and prints it to the console.
// The Parser is created inside a using statement so that it is disposed
// deterministically once extraction is finished, even if an exception is thrown.
using (Parser parser = new Parser(@"C:\Files\sample.docx"))
{
    // Extract a text into the reader.
    // A using statement accepts a null resource, so no extra guard is needed here.
    using (TextReader reader = parser.GetText())
    {
        // Print a text from the document.
        // If text extraction isn't supported, the reader is null.
        Console.WriteLine(reader == null ? "Text extraction isn't supported" : reader.ReadToEnd());
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment