Skip to content

Instantly share code, notes, and snippets.

@ealdent
Created April 17, 2010 15:35
Show Gist options
  • Save ealdent/369627 to your computer and use it in GitHub Desktop.
Save ealdent/369627 to your computer and use it in GitHub Desktop.
import java.io.*;
// See http://martinharrigan.blogspot.com/2008/07/citeseers-dataset.html
/**
 * Command-line utility that copies a text file while removing every character
 * that is not permitted by the XML 1.0 {@code Char} production.
 *
 * <p>Usage: {@code java XMLFix cs.xml} writes the cleaned copy to {@code cs.xml.out}.
 */
public class XMLFix {
    /**
     * (Credit for the original version of this method goes to Daryl Beattie
     * http://mail-archives.apache.org/mod_mbox/xml-xalan-dev/200012.mbox/%3CD5CE45D3889FD3118BFF00508B555AAC04A3E6BD@SCREAMMSG%3E)
     *
     * <p>Returns a copy of the input that contains only characters allowed by the
     * XML 1.0 standard. For reference, please see
     * <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">the standard</a>:
     * {@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}.
     *
     * <p>The input is examined one Unicode code point at a time, so a well-formed
     * surrogate pair (a character in the range #x10000-#x10FFFF) is kept intact,
     * while an unpaired surrogate is removed.
     *
     * @param in The String whose non-valid characters we want to remove.
     * @return The in String, stripped of non-valid characters; an empty String
     *         if the input is null or empty.
     */
    public static String stripNonValidXMLCharacters(String in) {
        if (in == null || in.length() == 0) {
            return ""; // vacancy test.
        }
        StringBuilder out = new StringBuilder(in.length());
        int i = 0;
        while (i < in.length()) {
            // codePointAt joins a high/low surrogate pair into one code point;
            // a lone surrogate is returned as-is and rejected by the range check.
            int codePoint = in.codePointAt(i);
            if (isValidXMLCodePoint(codePoint)) {
                out.appendCodePoint(codePoint);
            }
            i += Character.charCount(codePoint);
        }
        return out.toString();
    }

    /** Tells whether a code point matches the XML 1.0 {@code Char} production. */
    private static boolean isValidXMLCodePoint(int codePoint) {
        return codePoint == 0x9
                || codePoint == 0xA
                || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }

    /**
     * Pass the file name you want to correct as the first argument. (e.g. cs.xml)
     *
     * <p>The cleaned text is written to a file with the same name plus {@code ".out"}.
     * The input is processed line by line; each line is written back followed by the
     * platform line separator, so line breaks are preserved (though normalized).
     * Both files are read and written with the platform default charset.
     *
     * <p>I/O failures are reported on standard error as {@code "IOError: <message>"}.
     * If no file name is given, a usage message is printed on standard error.
     */
    public static void main(String args[]) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: java XMLFix <file>");
            return;
        }
        String inputPath = args[0];
        try {
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(new FileInputStream(inputPath)));
            // try/finally guarantees both streams are closed even when the copy fails midway.
            try {
                BufferedWriter writer =
                        new BufferedWriter(new FileWriter(new File(inputPath + ".out")));
                try {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        writer.write(stripNonValidXMLCharacters(line));
                        // readLine() drops the terminator, so put one back.
                        writer.newLine();
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            System.err.println("IOError: " + e.getMessage());
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment