Created
October 2, 2013 11:30
-
-
Save GeertVL-zz/6792352 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Xml; | |
namespace TOCParser
{
    /// <summary>
    /// Reads an XHTML document from disk and converts selected <c>div</c> elements
    /// (looked up in the XHTML namespace) into a tree of <see cref="Navigator"/> items.
    /// </summary>
    public class XHtmlDocParser
    {
        private const string XhtmlPrefix = "xhtml";
        private const string XhtmlNamespaceUri = "http://www.w3.org/1999/xhtml";

        private const string TocContainerXPath = "//xhtml:div[@id='wkb_view_toc']";
        private const string LinksContainerXPath = "//xhtml:div[@id='wkb_view_links']";

        // NOTE: contains(@class, ...) is a substring match, so any class value that
        // merely contains 'tocitem' / 'tocleaf' is selected as well.
        private const string TocItemXPath = "xhtml:div[contains(@class, 'tocitem')]";
        private const string TocLeafXPath = "xhtml:div[contains(@class, 'tocleaf')]";

        /// <summary>
        /// Parses the table of contents found under the <c>div</c> with id
        /// <c>wkb_view_toc</c> and returns its first top-level 'tocitem' entry.
        /// </summary>
        /// <param name="documentPath">Path of the XHTML document to parse.</param>
        /// <returns>
        /// The first TOC item with its descendants, or <c>null</c> when the TOC container
        /// is missing, contains no 'tocitem' entries, or the document could not be
        /// processed (the failure is written to the console).
        /// </returns>
        /// <exception cref="FileNotFoundException">
        /// <paramref name="documentPath"/> does not point to an existing file.
        /// </exception>
        public Navigator ParseTOC(string documentPath)
        {
            EnsureDocumentExists(documentPath);

            try
            {
                XmlDocument doc = LoadDocument(documentPath);
                XmlNamespaceManager namespaces = CreateNamespaceManager(doc);

                XmlNode tocNode = doc.SelectSingleNode(TocContainerXPath, namespaces);
                if (tocNode == null)
                {
                    return null;
                }

                List<Navigator> roots = GetTocItemNodes(tocNode, namespaces);

                // We only expect one tocitem as the root.
                return roots.Count > 0 ? roots[0] : null;
            }
            catch (Exception ex)
            {
                // Best effort: callers get null instead of an exception.
                Console.WriteLine("Could not load document: " + documentPath);
                Console.WriteLine(ex.Message);
            }

            return null;
        }

        /// <summary>
        /// Collects every <c>div</c> with id <c>wkb_view_links</c> and returns a root
        /// <see cref="Navigator"/> holding one child per match.
        /// </summary>
        /// <remarks>
        /// Each match currently contributes an empty placeholder child; the matched
        /// node's markup is only echoed to the console.
        /// </remarks>
        /// <param name="documentPath">Path of the XHTML document to parse.</param>
        /// <returns>
        /// The root navigator, or <c>null</c> when the document could not be processed
        /// (the failure is written to the console).
        /// </returns>
        /// <exception cref="FileNotFoundException">
        /// <paramref name="documentPath"/> does not point to an existing file.
        /// </exception>
        public Navigator ParseLinks(string documentPath)
        {
            EnsureDocumentExists(documentPath);

            try
            {
                XmlDocument doc = LoadDocument(documentPath);
                XmlNamespaceManager namespaces = CreateNamespaceManager(doc);
                XmlNodeList linkNodes = doc.SelectNodes(LinksContainerXPath, namespaces);

                var navigator = new Navigator();

                // The TOC code assigns Children explicitly before adding to it, so do not
                // rely on Navigator creating the list itself.
                if (navigator.Children == null)
                {
                    navigator.Children = new List<Navigator>();
                }

                foreach (XmlNode node in linkNodes)
                {
                    Console.WriteLine(node.OuterXml);
                    navigator.Children.Add(new Navigator());
                }

                return navigator;
            }
            catch (Exception ex)
            {
                // Best effort: callers get null instead of an exception.
                Console.WriteLine("Could not load document: {0} : {1}", documentPath, ex.Message);
            }

            return null;
        }

        /// <summary>
        /// Throws when <paramref name="documentPath"/> does not refer to an existing file.
        /// </summary>
        private static void EnsureDocumentExists(string documentPath)
        {
            if (!File.Exists(documentPath))
            {
                throw new FileNotFoundException("The document to parse is not found.", documentPath);
            }
        }

        /// <summary>
        /// Loads the file into an <see cref="XmlDocument"/> whose resolver is cleared,
        /// so external resources referenced by the document are not resolved.
        /// </summary>
        private static XmlDocument LoadDocument(string documentPath)
        {
            var doc = new XmlDocument();
            doc.XmlResolver = null;
            doc.Load(documentPath);
            return doc;
        }

        /// <summary>
        /// Creates a namespace manager that maps the <c>xhtml</c> prefix used by the
        /// XPath queries in this class to the XHTML namespace.
        /// </summary>
        private static XmlNamespaceManager CreateNamespaceManager(XmlDocument doc)
        {
            var namespaces = new XmlNamespaceManager(doc.NameTable);
            namespaces.AddNamespace(XhtmlPrefix, XhtmlNamespaceUri);
            return namespaces;
        }

        /// <summary>
        /// Builds an expandable navigator for every direct 'tocitem' child of
        /// <paramref name="itemContainer"/>, recursing into nested items. Each item's
        /// children are its 'tocleaf' entries followed by its nested 'tocitem' entries.
        /// </summary>
        private static List<Navigator> GetTocItemNodes(XmlNode itemContainer, XmlNamespaceManager namespaces)
        {
            var navigatorItems = new List<Navigator>();
            string parentIdentifier = GetAttributeValue(itemContainer, "id");

            foreach (XmlNode tocItemNode in itemContainer.SelectNodes(TocItemXPath, namespaces))
            {
                string label = GetLabel(tocItemNode);

                // Leaves first, then nested items.
                var children = new List<Navigator>();
                children.AddRange(GetTocLeafNodes(tocItemNode, namespaces));
                children.AddRange(GetTocItemNodes(tocItemNode, namespaces));

                navigatorItems.Add(new Navigator
                {
                    Identifier = GetAttributeValue(tocItemNode, "id"),
                    DisplayLabel = label,
                    Description = label,
                    Expandable = true,
                    ParentIdentifier = parentIdentifier,
                    Children = children,
                    ItemCount = children.Count
                });
            }

            return navigatorItems;
        }

        /// <summary>
        /// Builds a non-expandable navigator for every direct 'tocleaf' child of
        /// <paramref name="leafContainer"/>.
        /// </summary>
        private static List<Navigator> GetTocLeafNodes(XmlNode leafContainer, XmlNamespaceManager namespaces)
        {
            var navigatorItems = new List<Navigator>();
            string parentIdentifier = GetAttributeValue(leafContainer, "id");

            foreach (XmlNode tocLeafNode in leafContainer.SelectNodes(TocLeafXPath, namespaces))
            {
                string label = GetLabel(tocLeafNode);

                navigatorItems.Add(new Navigator
                {
                    Identifier = GetAttributeValue(tocLeafNode, "id"),
                    DisplayLabel = label,
                    Description = label,
                    Expandable = false,
                    ItemCount = 0,
                    ParentIdentifier = parentIdentifier
                });
            }

            return navigatorItems;
        }

        /// <summary>
        /// Returns the value of the named attribute, or <c>null</c> when the node has no
        /// such attribute. A single TOC entry without an id therefore no longer aborts
        /// the whole parse.
        /// </summary>
        private static string GetAttributeValue(XmlNode node, string attributeName)
        {
            if (node == null || node.Attributes == null)
            {
                return null;
            }

            XmlAttribute attribute = node.Attributes[attributeName];
            return attribute == null ? null : attribute.Value;
        }

        /// <summary>
        /// Returns the inner text of the node's first child, which is used as the entry's
        /// label, or an empty string when the node has no children.
        /// </summary>
        private static string GetLabel(XmlNode node)
        {
            XmlNode firstChild = node.FirstChild;
            return firstChild == null ? string.Empty : firstChild.InnerText;
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment