Skip to content

Instantly share code, notes, and snippets.

@Kagee
Created October 1, 2011 18:58
Show Gist options
  • Save Kagee/1256495 to your computer and use it in GitHub Desktop.
Save Kagee/1256495 to your computer and use it in GitHub Desktop.
How to filter out invalid UTF-8 characters
/*
The input documents sometimes have characters that are not valid as UTF-8 text. Where/how can I filter/remove them?
[Fatal Error] week.html:191:320: An invalid XML character (Unicode: 0x19) was found in the element content of the document.
org.xml.sax.SAXParseException: An invalid XML character (Unicode: 0x19) was found in the element content of the document.
at com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(DOMParser.java:249)
at com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(DocumentBuilderImpl.java:284)
at dom.main(dom.java:22)
*/
import java.io.*;
import java.util.Arrays;
import java.net.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.xml.sax.*;
/**
 * Parses {@code week.html} as XHTML and prints selected cells of one of its tables.
 *
 * <p>The file is decoded as ISO-8859-1 and every character that XML 1.0 does not
 * allow (for example the control character U+0019) is removed before the text is
 * handed to the parser, so such characters no longer abort the parse with a
 * {@code SAXParseException}. Numeric character references that expand to a
 * forbidden character (such as {@code &#x19;}) are NOT rewritten and would still
 * be rejected by the parser.
 */
public class dom {

    /** File that is parsed; also used as the system id reported in parser messages. */
    private static final String INPUT_FILE = "week.html";

    /** Encoding used to decode {@link #INPUT_FILE}. */
    private static final String INPUT_ENCODING = "iso-8859-1";

    /** Zero-based position, among all {@code <table>} elements, of the table that is read. */
    private static final int USER_TABLE_INDEX = 8;

    /** Number of rows extracted from the table. */
    private static final int NUM_USERS = 1;

    /** Number of values extracted per row. */
    private static final int FIELDS_PER_USER = 6;

    /**
     * Parses the input file, extracts {@link #NUM_USERS} rows of
     * {@link #FIELDS_PER_USER} values each and prints them with
     * {@link Arrays#deepToString(Object[])}. Any failure is reported by printing
     * its stack trace; nothing is rethrown.
     *
     * @param argv command-line arguments (unused)
     */
    public static void main(String argv[]) {
        try {
            // Hand the parser already-decoded, already-cleaned text. A character
            // stream makes the parser skip its own byte decoding, so the encoding
            // is applied in readSanitized instead of via InputSource.setEncoding.
            InputSource inputSource =
                    new InputSource(new StringReader(readSanitized(INPUT_FILE, INPUT_ENCODING)));
            // Keep the file name as system id so parser diagnostics still name the file.
            inputSource.setSystemId(INPUT_FILE);

            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            // Serve the XHTML DTD request from a local file instead of fetching it.
            LocalDTDResolver localDTDResolver = new LocalDTDResolver(
                    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
                    new File("xhtml-lat1.ent"));
            dBuilder.setEntityResolver(localDTDResolver);

            Document doc = dBuilder.parse(inputSource);
            // Merges adjacent text nodes across the whole tree, so no further
            // per-node normalize() calls are needed below.
            doc.getDocumentElement().normalize();

            NodeList tableList = doc.getElementsByTagName("table");
            Node userTable = tableList.item(USER_TABLE_INDEX);
            if (userTable == null) {
                throw new IllegalStateException("expected at least " + (USER_TABLE_INDEX + 1)
                        + " <table> elements in " + INPUT_FILE + ", found " + tableList.getLength());
            }

            String[][] a = new String[NUM_USERS][FIELDS_PER_USER];
            NodeList rows = userTable.getChildNodes();
            for (int i = 0; i < NUM_USERS; i++) {
                // Data rows are read from child indices 2, 4, 6, ...
                // NOTE(review): presumably the odd indices are whitespace text nodes
                // and index 0 is a header row - confirm against the actual HTML.
                NodeList cells = rows.item((i + 1) * 2).getChildNodes();

                // The first value is nested one element deeper than the others.
                a[i][0] = cells.item(2).getFirstChild().getFirstChild().getNodeValue();
                // Values 1..4 come from cells 3..6, each read from the cell's first child.
                for (int field = 1; field <= 4; field++) {
                    a[i][field] = cells.item(field + 2).getFirstChild().getNodeValue();
                }

                // The last cell is text in the HTML and may be split over several
                // child nodes (the original note says this is needed to get full
                // URLs): with more than one child take the whole cell's text,
                // otherwise the text of its only child.
                Node lastCell = cells.item(7);
                Node textSource = lastCell.getChildNodes().getLength() > 1
                        ? lastCell
                        : lastCell.getFirstChild();
                a[i][5] = textSource.getTextContent();
            }
            System.out.println(Arrays.deepToString(a));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads a whole file as text and removes every character XML 1.0 forbids.
     *
     * @param path        file to read, resolved against the working directory
     * @param charsetName name of the charset used to decode the file's bytes
     * @return the decoded file content without characters that are invalid in XML 1.0
     * @throws IOException if the file cannot be opened or read, or the charset is unsupported
     */
    private static String readSanitized(String path, String charsetName) throws IOException {
        InputStream bytes = new FileInputStream(path);
        try {
            Reader in = new BufferedReader(new InputStreamReader(bytes, charsetName));
            StringBuilder raw = new StringBuilder();
            char[] buffer = new char[8192];
            int count;
            while ((count = in.read(buffer)) != -1) {
                raw.append(buffer, 0, count);
            }
            return stripInvalidXmlChars(raw.toString());
        } finally {
            // Closing the underlying stream is enough; the readers hold no other resource.
            bytes.close();
        }
    }

    /**
     * Returns {@code text} with every code point that is not a legal XML 1.0
     * character removed. Unpaired surrogates are removed as well.
     *
     * @param text text to clean; must not be {@code null}
     * @return the cleaned text (equal to {@code text} when nothing had to be removed)
     */
    static String stripInvalidXmlChars(String text) {
        StringBuilder clean = new StringBuilder(text.length());
        int index = 0;
        while (index < text.length()) {
            // Work on code points so valid surrogate pairs are kept intact.
            int codePoint = text.codePointAt(index);
            if (isValidXmlChar(codePoint)) {
                clean.appendCodePoint(codePoint);
            }
            index += Character.charCount(codePoint);
        }
        return clean.toString();
    }

    /**
     * Tests a code point against the XML 1.0 {@code Char} production:
     * {@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}.
     */
    private static boolean isValidXmlChar(int codePoint) {
        return codePoint == 0x9
                || codePoint == 0xA
                || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }
}
/**
 * Entity resolver that answers requests for one specific system identifier with
 * a local file and leaves every other entity to the parser's default handling.
 */
class LocalDTDResolver implements EntityResolver {
    /** System identifier whose resolution is redirected to the local file. */
    String mySystemIdToIntercept;
    /** Local file served in place of the intercepted entity. */
    File myLocalDtdPath;
    /** {@link #myLocalDtdPath} converted to a URL once, at construction time. */
    URL localDtdFileAsUrl;

    /**
     * @param systemIdToIntercept system identifier to redirect
     * @param localDtdPath        local file to serve for that identifier
     * @throws MalformedURLException if the file's URI cannot be converted to a URL
     */
    public LocalDTDResolver( String systemIdToIntercept, File localDtdPath ) throws MalformedURLException {
        mySystemIdToIntercept = systemIdToIntercept;
        myLocalDtdPath = localDtdPath;
        localDtdFileAsUrl = myLocalDtdPath.toURI().toURL();
    }

    /**
     * Resolves the intercepted system identifier to the local file.
     *
     * @param publicId public identifier of the entity (ignored)
     * @param systemId system identifier of the entity; may be {@code null}
     * @return an input source pointing at the local file when {@code systemId}
     *         equals the intercepted identifier, otherwise {@code null} so the
     *         parser falls back to its default resolution
     */
    @Override
    public InputSource resolveEntity (String publicId, String systemId) {
        // Guard against a null system id instead of failing with a NullPointerException.
        if (systemId != null && systemId.equals( mySystemIdToIntercept )) {
            return new InputSource( localDtdFileAsUrl.toString() );
        }
        // Returning null requests the parser's default behaviour.
        return null;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment