Skip to content

Instantly share code, notes, and snippets.

@conholdate-gists
Created May 21, 2021 11:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save conholdate-gists/f97b365916bb38e917d273418dc88fc2 to your computer and use it in GitHub Desktop.
Save conholdate-gists/f97b365916bb38e917d273418dc88fc2 to your computer and use it in GitHub Desktop.
Extract Text from DOC or DOCX using C#
Extract Text from DOC or DOCX using C#
1. Extract Text from DOCX using C#
2. Get Formatted Text from DOCX using C#
3. Extract Formatted Text from Pages using C#
// Prints the whole document as HTML-formatted text.
// The parser is created in a using statement so that it is disposed as soon as
// extraction finishes, even if reading the text throws.
using (Parser parser = new Parser(@"C:\Files\sample.docx"))
// Extract the formatted text into the reader
using (TextReader reader = parser.GetFormattedText(new FormattedTextOptions(FormattedTextMode.Html)))
{
    // If formatted text extraction isn't supported, the reader is null;
    // a using statement tolerates a null resource, so only the read needs guarding.
    Console.WriteLine(reader == null ? "Formatted text extraction isn't supported" : reader.ReadToEnd());
}
// Prints the HTML-formatted text of the document one page at a time.
using (Parser docParser = new Parser(@"C:\Files\sample.docx"))
{
    // Stop early when the document cannot provide formatted text
    if (!docParser.Features.FormattedText)
    {
        Console.WriteLine("Document isn't supports formatted text extraction.");
        return;
    }

    // Stop early when the document reports no pages
    IDocumentInfo info = docParser.GetDocumentInfo();
    if (info.PageCount == 0)
    {
        Console.WriteLine("Document hasn't pages.");
        return;
    }

    // pageNumber is the 1-based number shown to the user; the API call takes the 0-based index
    for (int pageNumber = 1; pageNumber <= info.PageCount; pageNumber++)
    {
        // Print the "current/total" page header
        string header = string.Format("Page {0}/{1}", pageNumber, info.PageCount);
        Console.WriteLine(header);

        // Extract this page's formatted text into a reader
        FormattedTextOptions htmlOptions = new FormattedTextOptions(FormattedTextMode.Html);
        using (TextReader pageReader = docParser.GetFormattedText(pageNumber - 1, htmlOptions))
        {
            // No null check here: formatted text support was verified above
            string pageHtml = pageReader.ReadToEnd();
            Console.WriteLine(pageHtml);
        }
    }
}
// Prints the plain text of the whole document.
// The parser is created in a using statement so that it is disposed as soon as
// extraction finishes, even if reading the text throws.
using (Parser parser = new Parser(@"C:\Files\sample.docx"))
// Extract the text into the reader
using (TextReader reader = parser.GetText())
{
    // If text extraction isn't supported, the reader is null;
    // a using statement tolerates a null resource, so only the read needs guarding.
    Console.WriteLine(reader == null ? "Text extraction isn't supported" : reader.ReadToEnd());
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment