Kagee/dom.java

## dom.java
/*

The input documents sometimes have characters that are not valid as UTF-8 text. Where/how can I filter/remove them?

[Fatal Error] week.html:191:320: An invalid XML character (Unicode: 0x19) was found in the element content of the document.
org.xml.sax.SAXParseException: An invalid XML character (Unicode: 0x19) was found in the element content of the document.
        at com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(DOMParser.java:249)
        at com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(DocumentBuilderImpl.java:284)
        at dom.main(dom.java:22)
*/

import java.io.*;
import java.util.Arrays;
import java.net.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.*;

/**
 * Command-line scraper: parses week.html as XHTML, picks the ninth table
 * element in document order and prints six text fields per user row.
 *
 * Characters that XML 1.0 forbids (for example 0x19) are replaced with a
 * space while the file is read, so a stray control character in the input no
 * longer aborts the parse with a SAXParseException.
 */
public class dom {
	/** File that is parsed; a relative name is resolved against the working directory. */
	private static final String INPUT_FILE = "week.html";

	/** Character encoding used to decode the input file. */
	private static final String INPUT_ENCODING = "iso-8859-1";

	/** Zero-based position of the table that holds the user rows. */
	private static final int TABLE_INDEX = 8;

	/**
	 * Number of user rows to read. The earlier code assigned 30 and then
	 * immediately overwrote it with 1, so 1 is the value that was in effect.
	 */
	private static final int NUM_USERS = 1;

	/** Number of fields collected for each user row. */
	private static final int NUM_FIELDS = 6;

	/**
	 * Parses the input file, extracts the user rows and prints them with
	 * {@link Arrays#deepToString(Object[])}. Any failure is reported as a
	 * stack trace on standard error.
	 *
	 * @param argv command-line arguments; not used
	 */
	public static void main(String argv[]) {
		try {
			Document doc = parseInput();
			doc.getDocumentElement().normalize();
			NodeList tableList = doc.getElementsByTagName("table");

			Node table = tableList.item(TABLE_INDEX);
			if (table == null) {
				throw new IllegalStateException(INPUT_FILE + " has only " + tableList.getLength()
						+ " table elements, need at least " + (TABLE_INDEX + 1));
			}
			NodeList tableChildren = table.getChildNodes();

			String[][] a = new String[NUM_USERS][NUM_FIELDS];
			for (int i = 0; i < NUM_USERS; i++) {
				// Rows are taken from children 2, 4, 6, ... of the table node.
				// NOTE(review): presumably the skipped children are whitespace
				// text nodes and a header row -- confirm against week.html.
				int rowIndex = (i + 1) * 2;
				Node row = tableChildren.item(rowIndex);
				if (row == null) {
					throw new IllegalStateException("table " + TABLE_INDEX + " has no child node " + rowIndex);
				}
				a[i] = readRow(row);
			}
			System.out.println(Arrays.deepToString(a));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Builds a DOM tree from {@link #INPUT_FILE}.
	 *
	 * The file is decoded here (instead of handing the parser a bare system
	 * id plus an encoding hint) so that illegal characters can be filtered
	 * out before the parser sees them.
	 *
	 * @return the parsed document
	 * @throws IOException if the input file or the local entity file cannot be read
	 * @throws ParserConfigurationException if no document builder can be created
	 * @throws SAXException if the filtered input is still not well-formed
	 */
	private static Document parseInput() throws IOException, ParserConfigurationException, SAXException {
		DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
		DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();

		// Serve the XHTML transitional DTD reference from a local file.
		LocalDTDResolver localDTDResolver =
				new LocalDTDResolver("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
						new File("xhtml-lat1.ent"));
		dBuilder.setEntityResolver(localDTDResolver);

		Reader reader = new XmlCharFilterReader(
				new InputStreamReader(new FileInputStream(INPUT_FILE), INPUT_ENCODING));
		try {
			InputSource inputSource = new InputSource(reader);
			// Keep the system id so error messages and relative references
			// still refer to the input file.
			inputSource.setSystemId(INPUT_FILE);
			return dBuilder.parse(inputSource);
		} finally {
			reader.close();
		}
	}

	/**
	 * Collects the six fields of one user row.
	 *
	 * @param row table child node whose own children are the cells
	 * @return array of {@link #NUM_FIELDS} strings read from cells 2 to 7
	 */
	private static String[] readRow(Node row) {
		NodeList cells = row.getChildNodes();
		String[] fields = new String[NUM_FIELDS];

		// Field 0 sits one level deeper than the others (cell -> child -> text).
		fields[0] = cells.item(2).getFirstChild().getFirstChild().getNodeValue();

		// Fields 1..4 are the value of the first child of cells 3..6.
		for (int col = 1; col <= 4; col++) {
			fields[col] = cells.item(col + 2).getFirstChild().getNodeValue();
		}

		// The last cell holds text in the HTML; when it has several child
		// nodes, take the text of the whole cell so the full URL is captured.
		Node lastCell = cells.item(7);
		Node firstChild = lastCell.getFirstChild();
		if (lastCell.getChildNodes().getLength() > 1) {
			lastCell.normalize();
			fields[5] = lastCell.getTextContent();
		} else {
			fields[5] = firstChild.getTextContent();
		}
		return fields;
	}

	/**
	 * Reader that replaces every character not allowed by the XML 1.0 Char
	 * production with a space. Replacing (rather than dropping) keeps line and
	 * column positions in parser messages aligned with the original file.
	 */
	private static final class XmlCharFilterReader extends FilterReader {
		XmlCharFilterReader(Reader in) {
			super(in);
		}

		@Override
		public int read() throws IOException {
			int c = super.read();
			if (c == -1 || isLegalXmlChar((char) c)) {
				return c;
			}
			return ' ';
		}

		@Override
		public int read(char[] buf, int off, int len) throws IOException {
			int count = super.read(buf, off, len);
			// count is -1 at end of stream, in which case the loop does not run.
			for (int i = off; i < off + count; i++) {
				if (!isLegalXmlChar(buf[i])) {
					buf[i] = ' ';
				}
			}
			return count;
		}

		/**
		 * Tests a single UTF-16 code unit. Surrogate halves are let through
		 * because they cannot be judged in isolation; the parser still
		 * validates them as pairs.
		 */
		private static boolean isLegalXmlChar(char c) {
			return c == 0x9 || c == 0xA || c == 0xD || (c >= 0x20 && c <= 0xFFFD);
		}
	}
}
/**
 * Entity resolver that redirects one specific system identifier to a local
 * file and leaves every other entity to the parser.
 */
class LocalDTDResolver implements EntityResolver {
	/** System identifier that is answered with the local file. */
	String mySystemIdToIntercept;
	/** Local file served in place of the intercepted entity. */
	File myLocalDtdPath;
	/** URL form of {@link #myLocalDtdPath}, computed once in the constructor. */
	URL localDtdFileAsUrl;

	/**
	 * @param systemIdToIntercept system identifier to redirect
	 * @param localDtdPath local file to serve for that identifier
	 * @throws MalformedURLException if the file's URI cannot be converted to a URL
	 */
	public LocalDTDResolver( String systemIdToIntercept, File localDtdPath ) throws MalformedURLException {
		mySystemIdToIntercept = systemIdToIntercept;
		myLocalDtdPath = localDtdPath;
		localDtdFileAsUrl = myLocalDtdPath.toURI().toURL();
	}

	/**
	 * Returns an input source pointing at the local file when the requested
	 * system identifier matches the intercepted one.
	 *
	 * @param publicId public identifier of the entity; not used
	 * @param systemId system identifier of the entity; may be null
	 * @return an input source for the local file, or null so that the parser
	 *         resolves the entity itself
	 */
	@Override
	public InputSource resolveEntity (String publicId, String systemId) {
		// A null system id can never match; checking it first avoids a
		// NullPointerException inside the parser callback.
		if (systemId != null && systemId.equals( mySystemIdToIntercept )) {
			return new InputSource( localDtdFileAsUrl.toString() );
		}
		// Null asks the parser to open the system identifier itself.
		return null;
	}
}
	/*

	The input documents sometimes have characters that are not valid as UTF-8 text. Where/how can I filter/remove them?

	[Fatal Error] week.html:191:320: An invalid XML character (Unicode: 0x19) was found in the element content of the document.
	org.xml.sax.SAXParseException: An invalid XML character (Unicode: 0x19) was found in the element content of the document.
	at com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(DOMParser.java:249)
	at com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(DocumentBuilderImpl.java:284)
	at dom.main(dom.java:22)
	*/

	import java.io.*;
	import java.util.Arrays;
	import java.net.*;
	import javax.xml.parsers.*;
	import org.w3c.dom.*;
	import org.xml.sax.*;

	/**
	 * Command-line scraper: parses week.html as XHTML, picks the ninth table
	 * element in document order and prints six text fields per user row.
	 *
	 * Characters that XML 1.0 forbids (for example 0x19) are replaced with a
	 * space while the file is read, so a stray control character in the input no
	 * longer aborts the parse with a SAXParseException.
	 */
	public class dom {
		/** File that is parsed; a relative name is resolved against the working directory. */
		private static final String INPUT_FILE = "week.html";

		/** Character encoding used to decode the input file. */
		private static final String INPUT_ENCODING = "iso-8859-1";

		/** Zero-based position of the table that holds the user rows. */
		private static final int TABLE_INDEX = 8;

		/**
		 * Number of user rows to read. The earlier code assigned 30 and then
		 * immediately overwrote it with 1, so 1 is the value that was in effect.
		 */
		private static final int NUM_USERS = 1;

		/** Number of fields collected for each user row. */
		private static final int NUM_FIELDS = 6;

		/**
		 * Parses the input file, extracts the user rows and prints them with
		 * {@link Arrays#deepToString(Object[])}. Any failure is reported as a
		 * stack trace on standard error.
		 *
		 * @param argv command-line arguments; not used
		 */
		public static void main(String argv[]) {
			try {
				Document doc = parseInput();
				doc.getDocumentElement().normalize();
				NodeList tableList = doc.getElementsByTagName("table");

				Node table = tableList.item(TABLE_INDEX);
				if (table == null) {
					throw new IllegalStateException(INPUT_FILE + " has only " + tableList.getLength()
							+ " table elements, need at least " + (TABLE_INDEX + 1));
				}
				NodeList tableChildren = table.getChildNodes();

				String[][] a = new String[NUM_USERS][NUM_FIELDS];
				for (int i = 0; i < NUM_USERS; i++) {
					// Rows are taken from children 2, 4, 6, ... of the table node.
					// NOTE(review): presumably the skipped children are whitespace
					// text nodes and a header row -- confirm against week.html.
					int rowIndex = (i + 1) * 2;
					Node row = tableChildren.item(rowIndex);
					if (row == null) {
						throw new IllegalStateException("table " + TABLE_INDEX + " has no child node " + rowIndex);
					}
					a[i] = readRow(row);
				}
				System.out.println(Arrays.deepToString(a));
			} catch (Exception e) {
				e.printStackTrace();
			}
		}

		/**
		 * Builds a DOM tree from {@link #INPUT_FILE}.
		 *
		 * The file is decoded here (instead of handing the parser a bare system
		 * id plus an encoding hint) so that illegal characters can be filtered
		 * out before the parser sees them.
		 *
		 * @return the parsed document
		 * @throws IOException if the input file or the local entity file cannot be read
		 * @throws ParserConfigurationException if no document builder can be created
		 * @throws SAXException if the filtered input is still not well-formed
		 */
		private static Document parseInput() throws IOException, ParserConfigurationException, SAXException {
			DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
			DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();

			// Serve the XHTML transitional DTD reference from a local file.
			LocalDTDResolver localDTDResolver =
					new LocalDTDResolver("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
							new File("xhtml-lat1.ent"));
			dBuilder.setEntityResolver(localDTDResolver);

			Reader reader = new XmlCharFilterReader(
					new InputStreamReader(new FileInputStream(INPUT_FILE), INPUT_ENCODING));
			try {
				InputSource inputSource = new InputSource(reader);
				// Keep the system id so error messages and relative references
				// still refer to the input file.
				inputSource.setSystemId(INPUT_FILE);
				return dBuilder.parse(inputSource);
			} finally {
				reader.close();
			}
		}

		/**
		 * Collects the six fields of one user row.
		 *
		 * @param row table child node whose own children are the cells
		 * @return array of {@link #NUM_FIELDS} strings read from cells 2 to 7
		 */
		private static String[] readRow(Node row) {
			NodeList cells = row.getChildNodes();
			String[] fields = new String[NUM_FIELDS];

			// Field 0 sits one level deeper than the others (cell -> child -> text).
			fields[0] = cells.item(2).getFirstChild().getFirstChild().getNodeValue();

			// Fields 1..4 are the value of the first child of cells 3..6.
			for (int col = 1; col <= 4; col++) {
				fields[col] = cells.item(col + 2).getFirstChild().getNodeValue();
			}

			// The last cell holds text in the HTML; when it has several child
			// nodes, take the text of the whole cell so the full URL is captured.
			Node lastCell = cells.item(7);
			Node firstChild = lastCell.getFirstChild();
			if (lastCell.getChildNodes().getLength() > 1) {
				lastCell.normalize();
				fields[5] = lastCell.getTextContent();
			} else {
				fields[5] = firstChild.getTextContent();
			}
			return fields;
		}

		/**
		 * Reader that replaces every character not allowed by the XML 1.0 Char
		 * production with a space. Replacing (rather than dropping) keeps line and
		 * column positions in parser messages aligned with the original file.
		 */
		private static final class XmlCharFilterReader extends FilterReader {
			XmlCharFilterReader(Reader in) {
				super(in);
			}

			@Override
			public int read() throws IOException {
				int c = super.read();
				if (c == -1 || isLegalXmlChar((char) c)) {
					return c;
				}
				return ' ';
			}

			@Override
			public int read(char[] buf, int off, int len) throws IOException {
				int count = super.read(buf, off, len);
				// count is -1 at end of stream, in which case the loop does not run.
				for (int i = off; i < off + count; i++) {
					if (!isLegalXmlChar(buf[i])) {
						buf[i] = ' ';
					}
				}
				return count;
			}

			/**
			 * Tests a single UTF-16 code unit. Surrogate halves are let through
			 * because they cannot be judged in isolation; the parser still
			 * validates them as pairs.
			 */
			private static boolean isLegalXmlChar(char c) {
				return c == 0x9 || c == 0xA || c == 0xD || (c >= 0x20 && c <= 0xFFFD);
			}
		}
	}
	/**
	 * Entity resolver that redirects one specific system identifier to a local
	 * file and leaves every other entity to the parser.
	 */
	class LocalDTDResolver implements EntityResolver {
		/** System identifier that is answered with the local file. */
		String mySystemIdToIntercept;
		/** Local file served in place of the intercepted entity. */
		File myLocalDtdPath;
		/** URL form of {@link #myLocalDtdPath}, computed once in the constructor. */
		URL localDtdFileAsUrl;

		/**
		 * @param systemIdToIntercept system identifier to redirect
		 * @param localDtdPath local file to serve for that identifier
		 * @throws MalformedURLException if the file's URI cannot be converted to a URL
		 */
		public LocalDTDResolver( String systemIdToIntercept, File localDtdPath ) throws MalformedURLException {
			mySystemIdToIntercept = systemIdToIntercept;
			myLocalDtdPath = localDtdPath;
			localDtdFileAsUrl = myLocalDtdPath.toURI().toURL();
		}

		/**
		 * Returns an input source pointing at the local file when the requested
		 * system identifier matches the intercepted one.
		 *
		 * @param publicId public identifier of the entity; not used
		 * @param systemId system identifier of the entity; may be null
		 * @return an input source for the local file, or null so that the parser
		 *         resolves the entity itself
		 */
		@Override
		public InputSource resolveEntity (String publicId, String systemId) {
			// A null system id can never match; checking it first avoids a
			// NullPointerException inside the parser callback.
			if (systemId != null && systemId.equals( mySystemIdToIntercept )) {
				return new InputSource( localDtdFileAsUrl.toString() );
			}
			// Null asks the parser to open the system identifier itself.
			return null;
		}
	}