deanebarker/ExtractTextFromWord.cs

## ExtractTextFromWord.cs
// This is some rough/stub code for extracting raw text from a Word document
// A modern Word document (.docx) is just a zip file. Extract it.
// Find a file called word/document.xml. That contains the text of the document.
// Paragraphs are in "w:p" tags, and text is in "w:t".
// Iterate the "p" tags, then concatenate all the "t" tags inside them

// Reads word/document.xml (already extracted from the .docx zip) and collects
// the text of every non-blank paragraph into a list, one entry per paragraph.
void Main()
{
  // Replace the placeholder with the real path to the extracted word/document.xml.
  var doc = XDocument.Parse(File.ReadAllText(" [path to word/document.xml] "));

  // WordprocessingML namespace; the "w:" prefix in the document maps to this URI.
  XNamespace nsW = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

  var paras = new List<string>();
  foreach (var para in doc.Root.Descendants(nsW + "p"))
  {
    // A paragraph's text can be split across several "w:t" elements;
    // join them with no separator to rebuild the paragraph text.
    var line = string.Join("", para.Descendants(nsW + "t").Select(x => x.Value)).Trim();

    // Don't add blanks...
    if (string.IsNullOrWhiteSpace(line))
    {
      continue;
    }

    // Keep the paragraph; without this the list stays empty and the
    // extracted text is discarded.
    paras.Add(line);
  }

  // "paras" now holds the trimmed text of each non-blank paragraph.
}
	// This is some rough/stub code for extracting raw text from a Word document
	// A modern Word document (.docx) is just a zip file. Extract it.
	// Find a file called word/document.xml. That contains the text of the document.
	// Paragraphs are in "w:p" tags, and text is in "w:t".
	// Iterate the "p" tags, then concatenate all the "t" tags inside them

	// Reads word/document.xml (already extracted from the .docx zip) and collects
	// the text of every non-blank paragraph into a list, one entry per paragraph.
	void Main()
	{
		// Replace the placeholder with the real path to the extracted word/document.xml.
		var doc = XDocument.Parse(File.ReadAllText(" [path to word/document.xml] "));

		// WordprocessingML namespace; the "w:" prefix in the document maps to this URI.
		XNamespace nsW = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

		var paras = new List<string>();
		foreach (var para in doc.Root.Descendants(nsW + "p"))
		{
			// A paragraph's text can be split across several "w:t" elements;
			// join them with no separator to rebuild the paragraph text.
			var line = string.Join("", para.Descendants(nsW + "t").Select(x => x.Value)).Trim();

			// Don't add blanks...
			if (string.IsNullOrWhiteSpace(line))
			{
				continue;
			}

			// Keep the paragraph; without this the list stays empty and the
			// extracted text is discarded.
			paras.Add(line);
		}

		// "paras" now holds the trimmed text of each non-blank paragraph.
	}