Skip to content

Instantly share code, notes, and snippets.

@Kagee
Created October 2, 2011 21:22
Show Gist options
  • Save Kagee/1257983 to your computer and use it in GitHub Desktop.
Save Kagee/1257983 to your computer and use it in GitHub Desktop.
/*
1. The input documents sometimes have characters that are not valid as UTF-8 text. Where/how can I filter/remove them?
hildenae@sektober:~/Dokumenter/Kildekode/statsparser$ java dom
com.sun.org.apache.xerces.internal.impl.io.MalformedByteSequenceException: Invalid byte 1 of 1-byte UTF-8 sequence.
at com.sun.org.apache.xerces.internal.impl.io.UTF8Reader.invalidByte(UTF8Reader.java:684)
at com.sun.org.apache.xerces.internal.impl.io.UTF8Reader.read(UTF8Reader.java:554)
at com.sun.org.apache.xerces.internal.impl.XMLEntityScanner.load(XMLEntityScanner.java:1742)
at com.sun.org.apache.xerces.internal.impl.XMLEntityScanner.skipChar(XMLEntityScanner.java:1416)
at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl$FragmentContentDriver.next(XMLDocumentFragmentScannerImpl.java:2792)
at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(XMLDocumentScannerImpl.java:648)
at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanDocument(XMLDocumentFragmentScannerImpl.java:511)
at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:808)
at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:737)
at com.sun.org.apache.xerces.internal.parsers.XMLParser.parse(XMLParser.java:119)
at com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(DOMParser.java:235)
at com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(DocumentBuilderImpl.java:284)
at javax.xml.parsers.DocumentBuilder.parse(DocumentBuilder.java:124)
at dom.main(dom.java:65)
*/
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.Hashtable;
import java.io.*;
import java.io.FilterInputStream;
import java.util.Arrays;
import java.net.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.*;
/**
 * Command-line tool that parses the XHTML file {@code week.html} from the
 * current working directory and prints six text fields per user row taken
 * from the table at index 8 of the document.
 */
public class dom {
    /**
     * Reads {@code week.html}, extracts the per-user columns and prints them
     * with {@link Arrays#deepToString(Object[])}. Any failure is reported by
     * printing the stack trace; nothing is rethrown.
     *
     * @param argv command-line arguments (unused)
     */
    public static void main(String argv[]) {
        try {
            InputStream is = new BufferedInputStream(
                    new FileInputStream("week.html"));
            try {
                // The encoding hint only takes effect when the InputSource itself is
                // handed to the parser. Passing just its byte stream (as before)
                // silently dropped the hint, so the parser fell back to UTF-8 and
                // failed on ISO-8859-1 bytes.
                InputSource inputSource = new InputSource(new ReplacingInputStream(is));
                inputSource.setEncoding("iso-8859-1");

                DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
                DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
                // Serve the DTD reference from a local file instead of the given URL.
                LocalDTDResolver localDTDResolver = new LocalDTDResolver(
                        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
                        new File("xhtml-lat1.ent"));
                dBuilder.setEntityResolver(localDTDResolver);

                Document doc = dBuilder.parse(inputSource);
                doc.getDocumentElement().normalize();

                NodeList tableList = doc.getElementsByTagName("table");
                // Only the first user row is read for now.
                int numUsers = 1;
                String[][] a = new String[numUsers][6];

                Node nNode = tableList.item(8);
                if (nNode == null) {
                    throw new IllegalStateException(
                            "Expected at least 9 <table> elements but found "
                                    + tableList.getLength());
                }

                NodeList rows = nNode.getChildNodes();
                for (int i = 0; i < numUsers; i++) {
                    // User rows sit at every second child index, starting at 2.
                    NodeList cells = rows.item((i + 1) * 2).getChildNodes();

                    // The first column's text is nested one element deeper than the others.
                    a[i][0] = cells.item(2).getFirstChild().getFirstChild().getNodeValue();
                    for (int col = 1; col <= 4; col++) {
                        a[i][col] = cells.item(col + 2).getFirstChild().getNodeValue();
                    }

                    // This is a text in the HTML. Do some more work to get full URLs:
                    // when the cell holds several child nodes, take the text of the whole
                    // cell rather than of its first child only.
                    Node n = cells.item(7).getFirstChild();
                    Node parent = n.getParentNode();
                    if (parent.getChildNodes().getLength() > 1) {
                        parent.normalize();
                        a[i][5] = parent.getTextContent();
                    } else {
                        a[i][5] = n.getTextContent();
                    }
                }
                System.out.println(Arrays.deepToString(a));
            } finally {
                // Always release the file handle, even when parsing fails.
                is.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
/**
 * Input stream filter that replaces every control byte which is not a legal
 * XML 1.0 character with a space (0x20).
 *
 * <p>Bytes below 0x20 are replaced unless they are tab (0x09), line feed
 * (0x0A) or carriage return (0x0D). All other bytes pass through unchanged.
 * Both the single-byte and the bulk read paths are filtered, so consumers
 * that read into a buffer also see the cleaned data.
 */
class ReplacingInputStream extends FilterInputStream {
    /** Replacement written in place of a forbidden control byte. */
    private static final int SPACE = 0x20;

    /**
     * @param in the underlying stream to read from and filter
     */
    public ReplacingInputStream(InputStream in) {
        super(in);
    }

    /**
     * Reads one byte, substituting a space for forbidden control bytes.
     *
     * @return the (possibly replaced) byte as 0-255, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read() throws IOException {
        int value = super.read();
        if (value != -1 && isForbidden(value)) {
            return SPACE;
        }
        return value;
    }

    /**
     * Reads up to {@code len} bytes into {@code b} and then replaces forbidden
     * control bytes inside the freshly filled region.
     *
     * <p>This override is required because {@link FilterInputStream} forwards
     * bulk reads straight to the wrapped stream without calling {@link #read()}.
     *
     * @return the number of bytes read, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        int count = super.read(b, off, len);
        // count is -1 at end of stream, so the loop body is skipped in that case.
        for (int i = off; i < off + count; i++) {
            if (isForbidden(b[i] & 0xFF)) {
                b[i] = (byte) SPACE;
            }
        }
        return count;
    }

    /** Tells whether the unsigned byte value is a control code XML 1.0 forbids. */
    private static boolean isForbidden(int value) {
        return value < 0x20 && value != 0x9 && value != 0xA && value != 0xD;
    }
}
/**
 * Entity resolver that redirects one specific system identifier to a local
 * file and leaves every other entity to the parser's default resolution.
 */
class LocalDTDResolver implements EntityResolver {
    /** System identifier that is redirected to the local file. */
    final String mySystemIdToIntercept;
    /** Local file served in place of the intercepted entity. */
    final File myLocalDtdPath;
    /** {@link #myLocalDtdPath} converted to a URL once, at construction time. */
    final URL localDtdFileAsUrl;

    /**
     * @param systemIdToIntercept system identifier to redirect
     * @param localDtdPath local file to serve instead
     * @throws MalformedURLException if the file path cannot be converted to a URL
     */
    public LocalDTDResolver( String systemIdToIntercept, File localDtdPath ) throws MalformedURLException {
        mySystemIdToIntercept = systemIdToIntercept;
        myLocalDtdPath = localDtdPath;
        localDtdFileAsUrl = myLocalDtdPath.toURI().toURL();
    }

    /**
     * Resolves the intercepted system identifier to the local file.
     *
     * @param publicId public identifier of the entity (ignored)
     * @param systemId system identifier of the entity; may be {@code null}
     * @return an {@link InputSource} pointing at the local file when
     *         {@code systemId} equals the intercepted identifier, otherwise
     *         {@code null} so the parser uses its default behaviour
     */
    @Override
    public InputSource resolveEntity (String publicId, String systemId) {
        // Guard against a null systemId instead of failing with a NullPointerException.
        if (systemId != null && systemId.equals( mySystemIdToIntercept )) {
            return new InputSource( localDtdFileAsUrl.toString() );
        }
        // Not ours: returning null asks the parser to resolve the entity itself.
        return null;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment