Skip to content

Instantly share code, notes, and snippets.

@GeertVL-zz
Created October 2, 2013 11:30
Show Gist options
  • Save GeertVL-zz/6792352 to your computer and use it in GitHub Desktop.
Save GeertVL-zz/6792352 to your computer and use it in GitHub Desktop.
using System;
using System.Collections.Generic;
using System.IO;
using System.Xml;
namespace TOCParser
{
/// <summary>
/// Reads an XHTML document from disk and converts selected <c>div</c> elements
/// (the table of contents and the links section) into <see cref="Navigator"/> trees.
/// </summary>
/// <remarks>
/// The parser keeps no per-call state: the namespace manager built for a document
/// is passed explicitly to the helpers instead of being stored in a field.
/// </remarks>
public class XHtmlDocParser
{
    /// <summary>
    /// Parses the table of contents found in the <c>div</c> whose id is <c>wkb_view_toc</c>.
    /// </summary>
    /// <param name="documentPath">Path of the XHTML document to read.</param>
    /// <returns>
    /// The first top-level "tocitem" entry with its nested children, or <c>null</c> when the
    /// TOC container or any item is missing, or when loading/parsing the document failed
    /// (the failure is written to the console).
    /// </returns>
    /// <exception cref="FileNotFoundException">The file does not exist.</exception>
    public Navigator ParseTOC(string documentPath)
    {
        EnsureDocumentExists(documentPath);

        try
        {
            XmlNamespaceManager namespaces;
            XmlDocument doc = LoadDocument(documentPath, out namespaces);

            XmlNode tocNode = doc.SelectSingleNode(@"//xhtml:div[@id='wkb_view_toc']", namespaces);
            if (tocNode != null)
            {
                List<Navigator> navigators = GetTocItemNodes(tocNode, namespaces);
                if (navigators.Count > 0)
                {
                    // We only expect one tocitem as the root.
                    return navigators[0];
                }
            }
        }
        catch (Exception ex)
        {
            // Deliberately broad: callers rely on a null result instead of an exception
            // for any load or parse failure.
            Console.WriteLine("Could not load document: " + documentPath);
            Console.WriteLine(ex.Message);
        }

        return null;
    }

    /// <summary>
    /// Collects the <c>div</c> elements whose id is <c>wkb_view_links</c> under a single root navigator.
    /// </summary>
    /// <param name="documentPath">Path of the XHTML document to read.</param>
    /// <returns>
    /// A root <see cref="Navigator"/> holding one (currently empty) child per matching element,
    /// or <c>null</c> when loading/parsing the document failed (the failure is written to the console).
    /// </returns>
    /// <exception cref="FileNotFoundException">The file does not exist.</exception>
    public Navigator ParseLinks(string documentPath)
    {
        EnsureDocumentExists(documentPath);

        try
        {
            XmlNamespaceManager namespaces;
            XmlDocument doc = LoadDocument(documentPath, out namespaces);

            XmlNodeList linkNodes = doc.SelectNodes(@"//xhtml:div[@id='wkb_view_links']", namespaces);

            var navigator = new Navigator();
            // NOTE(review): Navigator is declared elsewhere; the TOC code assigns Children
            // explicitly before adding to it, so do not assume the constructor creates the list.
            if (navigator.Children == null)
            {
                navigator.Children = new List<Navigator>();
            }

            foreach (XmlNode node in linkNodes)
            {
                // Dumps the raw markup of every link container to the console.
                Console.WriteLine(node.OuterXml);
                navigator.Children.Add(new Navigator());
            }

            return navigator;
        }
        catch (Exception ex)
        {
            // Deliberately broad: callers rely on a null result instead of an exception.
            Console.WriteLine("Could not load document: {0} : {1}", documentPath, ex.Message);
        }

        return null;
    }

    /// <summary>Throws when <paramref name="documentPath"/> does not point to an existing file.</summary>
    private static void EnsureDocumentExists(string documentPath)
    {
        if (!File.Exists(documentPath))
        {
            throw new FileNotFoundException("The document to parse is not found.", documentPath);
        }
    }

    /// <summary>
    /// Loads the document and builds a namespace manager that maps the <c>xhtml</c> prefix
    /// to the XHTML namespace, as required by the XPath queries in this class.
    /// </summary>
    private static XmlDocument LoadDocument(string documentPath, out XmlNamespaceManager namespaces)
    {
        var doc = new XmlDocument();
        // No resolver: external resources referenced by the document are not resolved.
        doc.XmlResolver = null;
        doc.Load(documentPath);

        namespaces = new XmlNamespaceManager(doc.NameTable);
        namespaces.AddNamespace("xhtml", "http://www.w3.org/1999/xhtml");
        return doc;
    }

    /// <summary>
    /// Builds one expandable navigator per direct child <c>div</c> of
    /// <paramref name="itemContainer"/> whose class attribute contains "tocitem".
    /// Each navigator's children are its leaf entries followed by its nested items.
    /// </summary>
    private static List<Navigator> GetTocItemNodes(XmlNode itemContainer, XmlNamespaceManager namespaces)
    {
        var navigatorItems = new List<Navigator>();
        string parentId = GetAttributeValue(itemContainer, "id");

        XmlNodeList tocItemNodes = itemContainer.SelectNodes(@"xhtml:div[contains(@class, 'tocitem')]", namespaces);
        foreach (XmlNode tocItemNode in tocItemNodes)
        {
            string label = GetFirstChildText(tocItemNode);

            var children = new List<Navigator>();
            children.AddRange(GetTocLeafNodes(tocItemNode, namespaces));
            children.AddRange(GetTocItemNodes(tocItemNode, namespaces));

            var navigatorItem = new Navigator
            {
                Identifier = GetAttributeValue(tocItemNode, "id"),
                DisplayLabel = label,
                Description = label,
                Expandable = true,
                ParentIdentifier = parentId
            };
            navigatorItem.Children = children;
            navigatorItem.ItemCount = children.Count;

            navigatorItems.Add(navigatorItem);
        }

        return navigatorItems;
    }

    /// <summary>
    /// Builds one non-expandable navigator per direct child <c>div</c> of
    /// <paramref name="leafContainer"/> whose class attribute contains "tocleaf".
    /// </summary>
    private static List<Navigator> GetTocLeafNodes(XmlNode leafContainer, XmlNamespaceManager namespaces)
    {
        var navigatorItems = new List<Navigator>();
        string parentId = GetAttributeValue(leafContainer, "id");

        XmlNodeList tocLeafNodes = leafContainer.SelectNodes(@"xhtml:div[contains(@class, 'tocleaf')]", namespaces);
        foreach (XmlNode tocLeafNode in tocLeafNodes)
        {
            string label = GetFirstChildText(tocLeafNode);

            navigatorItems.Add(new Navigator
            {
                Identifier = GetAttributeValue(tocLeafNode, "id"),
                DisplayLabel = label,
                Description = label,
                Expandable = false,
                ItemCount = 0,
                ParentIdentifier = parentId
            });
        }

        return navigatorItems;
    }

    /// <summary>
    /// Returns the value of the named attribute, or <c>null</c> when the node has no such
    /// attribute, so a single malformed element does not abort the whole parse.
    /// </summary>
    private static string GetAttributeValue(XmlNode node, string attributeName)
    {
        if (node == null || node.Attributes == null)
        {
            return null;
        }

        XmlAttribute attribute = node.Attributes[attributeName];
        return attribute == null ? null : attribute.Value;
    }

    /// <summary>
    /// Returns the inner text of the node's first child, or an empty string when the node has no children.
    /// </summary>
    private static string GetFirstChildText(XmlNode node)
    {
        XmlNode firstChild = node.FirstChild;
        return firstChild == null ? string.Empty : firstChild.InnerText;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment