Skip to content

Instantly share code, notes, and snippets.

@RobThree
Created February 19, 2015 17:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RobThree/6eb86b8d732b39cc6284 to your computer and use it in GitHub Desktop.
Save RobThree/6eb86b8d732b39cc6284 to your computer and use it in GitHub Desktop.
Word table extractor
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Xml;
//Usage:
// var tables = new WordTableExtractor().ExtractTables(@"path\to\my\file.docx");
/// <summary>
/// Extracts the text content of every table found in a part of a .docx (Office Open XML) package.
/// </summary>
/// <remarks>
/// Each table is returned as a sequence of rows, and each row as an array of trimmed cell texts.
/// Tables are located with the XPath <c>//w:tbl</c>, so tables nested inside a cell are also
/// returned as separate tables, and their text is part of the enclosing cell's text as well.
/// Only <c>w:tr</c> elements that are direct children of a table, and <c>w:tc</c> elements that
/// are direct children of a row, are considered.
/// </remarks>
public class WordTableExtractor
{
    // Namespace URI of WordprocessingML, bound to the "w" prefix in the XPath queries below.
    private const string WordprocessingNamespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    /// <summary>
    /// Extracts all tables from the main document part (<c>word/document.xml</c>) of a .docx file.
    /// </summary>
    /// <param name="filename">Path of the .docx file to read.</param>
    /// <returns>One item per table; each table is a sequence of rows; each row is an array of cell texts.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="filename"/> is null.</exception>
    /// <exception cref="FileNotFoundException">The package does not contain <c>word/document.xml</c>.</exception>
    /// <exception cref="XmlException">The part is not well-formed XML or contains a DTD.</exception>
    public IEnumerable<IEnumerable<string[]>> ExtractTables(string filename)
    {
        return ExtractTables(ReadDocxFile(filename, "word/document.xml"));
    }

    /// <summary>
    /// Extracts all tables from a specific part of a .docx file.
    /// </summary>
    /// <param name="filename">Path of the .docx file to read.</param>
    /// <param name="entry">Full name of the zip entry to read; matched case-insensitively.</param>
    /// <returns>One item per table; each table is a sequence of rows; each row is an array of cell texts.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="filename"/> or <paramref name="entry"/> is null.</exception>
    /// <exception cref="FileNotFoundException">The package does not contain <paramref name="entry"/>.</exception>
    /// <exception cref="XmlException">The part is not well-formed XML or contains a DTD.</exception>
    public IEnumerable<IEnumerable<string[]>> ExtractTables(string filename, string entry)
    {
        return ExtractTables(ReadDocxFile(filename, entry));
    }

    /// <summary>
    /// Collects the tables of an already loaded document part.
    /// </summary>
    private static IEnumerable<IEnumerable<string[]>> ExtractTables(XmlDocument doc)
    {
        var nsmgr = new XmlNamespaceManager(doc.NameTable);
        nsmgr.AddNamespace("w", WordprocessingNamespace);

        // Materialize eagerly: the result can then be enumerated repeatedly without re-running
        // the XPath queries, and it does not keep the whole XML DOM alive.
        return SelectNodes(doc.DocumentElement, "//w:tbl", nsmgr) //Find all tables
            .Select(table => ExtractRows(table, nsmgr))
            .ToList();
    }

    /// <summary>
    /// Reads the rows of a single table; each row becomes an array of trimmed cell texts.
    /// </summary>
    private static IEnumerable<string[]> ExtractRows(XmlElement table, XmlNamespaceManager nsmgr)
    {
        return SelectNodes(table, "w:tr", nsmgr) //Find the table's rows
            .Select(row => SelectNodes(row, "w:tc", nsmgr) //Find their cells
                .Select(cell => cell.InnerText.Trim()) //Get their textcontent
                .ToArray())
            .ToList();
    }

    /// <summary>
    /// Runs an XPath query relative to <paramref name="element"/> and returns the matches as elements.
    /// </summary>
    private static IEnumerable<XmlElement> SelectNodes(XmlElement element, string xpath, XmlNamespaceManager nsmgr)
    {
        return element.SelectNodes(xpath, nsmgr).Cast<XmlElement>();
    }

    /// <summary>
    /// Opens the .docx package at <paramref name="path"/> and loads the part named
    /// <paramref name="entry"/> into an <see cref="XmlDocument"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="entry"/> is null.</exception>
    /// <exception cref="FileNotFoundException">No zip entry with the requested name exists.</exception>
    private static XmlDocument ReadDocxFile(string path, string entry)
    {
        if (path == null)
            throw new ArgumentNullException("filename");
        if (entry == null)
            throw new ArgumentNullException("entry");

        // A .docx file may come from an untrusted source. Package parts are not supposed to
        // contain DTDs, so reject them outright and never resolve external resources; this
        // closes off entity-expansion and external-entity (XXE) attacks.
        var settings = new XmlReaderSettings
        {
            DtdProcessing = DtdProcessing.Prohibit,
            XmlResolver = null
        };

        using (var file = File.OpenRead(path))
        using (var archive = new ZipArchive(file, ZipArchiveMode.Read))
        {
            var part = archive.Entries.FirstOrDefault(e => e.FullName.Equals(entry, StringComparison.OrdinalIgnoreCase));
            if (part == null)
                throw new FileNotFoundException(string.Format("Entry '{0}' was not found in '{1}'", entry, path), path);

            using (var stream = part.Open())
            using (var reader = XmlReader.Create(stream, settings))
            {
                var document = new XmlDocument();
                document.Load(reader);
                return document;
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment