Last active
August 19, 2022 13:30
-
-
Save deanebarker/525108d74d5304fee82b01b23865335d to your computer and use it in GitHub Desktop.
A quick and dirty way to extract text from Word document via C#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is some rough/stub code for extracting raw text from a Word document.
// A modern Word document (.docx) is just a zip file. Extract it.
// Find a file called word/document.xml. That contains the text of the document.
// Paragraphs are in "w:p" tags, and text is in "w:t" tags.
// Iterate the "w:p" tags, then concatenate all the "w:t" tags inside each of them.
/// <summary>
/// Reads an extracted <c>word/document.xml</c> file and collects the plain text
/// of every non-blank paragraph into a list, one entry per paragraph.
/// </summary>
void Main()
{
    var doc = XDocument.Parse(File.ReadAllText(" [path to word/document.xml] "));

    // WordprocessingML namespace; bound to the "w" prefix in the comments above (w:p, w:t).
    XNamespace nsW = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    var paras = new List<string>();
    foreach (var para in doc.Root.Descendants(nsW + "p"))
    {
        // A paragraph's text may be split across several "t" elements;
        // join them back together in document order.
        var line = string.Join("", para.Descendants(nsW + "t").Select(x => x.Value)).Trim();

        // Don't add blanks...
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }

        // Keep the paragraph; without this the extracted text is thrown away
        // and the list stays empty.
        paras.Add(line);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment