Created
April 17, 2010 15:35
-
-
Save ealdent/369627 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*; | |
// See http://martinharrigan.blogspot.com/2008/07/citeseers-dataset.html | |
public class XMLFix {

    /**
     * (Credit for the original version of this method goes to Daryl Beattie
     * http://mail-archives.apache.org/mod_mbox/xml-xalan-dev/200012.mbox/%3CD5CE45D3889FD3118BFF00508B555AAC04A3E6BD@SCREAMMSG%3E)
     *
     * Removes every character that is not allowed by the {@code Char}
     * production of the XML 1.0 standard. For reference, please see
     * <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">the
     * standard</a>.
     *
     * The input is examined one Unicode code point at a time rather than one
     * UTF-16 {@code char} at a time, so a well-formed surrogate pair
     * (a code point in 0x10000-0x10FFFF) is kept intact, while an unpaired
     * surrogate is removed.
     *
     * @param in The String whose non-valid characters we want to remove.
     * @return The in String, stripped of non-valid characters; an empty
     *         String if the input is null or empty.
     */
    public static String stripNonValidXMLCharacters(String in) {
        if (in == null || in.length() == 0) {
            return ""; // vacancy test.
        }
        StringBuilder out = new StringBuilder(in.length());
        int i = 0;
        while (i < in.length()) {
            // For an unpaired surrogate codePointAt returns the surrogate
            // value itself, which falls outside every valid range below.
            int codePoint = in.codePointAt(i);
            if (isValidXMLCodePoint(codePoint)) {
                out.appendCodePoint(codePoint);
            }
            // Advance by 2 chars for a surrogate pair, 1 otherwise.
            i += Character.charCount(codePoint);
        }
        return out.toString();
    }

    /**
     * Tells whether a code point matches the XML 1.0 {@code Char} production:
     * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF].
     */
    private static boolean isValidXMLCodePoint(int codePoint) {
        return codePoint == 0x9
                || codePoint == 0xA
                || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }

    /**
     * Closes a stream if it was opened, reporting (not propagating) a failure
     * in the same format main uses for other I/O errors.
     */
    private static void close(Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            System.err.println("IOError: " + e.getMessage());
        }
    }

    /**
     * Pass the file name you want to correct as the first argument. (e.g. cs.xml)
     *
     * The cleaned copy is written next to the input as
     * {@code <file name>.out}. The input is processed line by line and each
     * line is written back followed by a single '\n', so line structure is
     * preserved (line terminators are normalized to '\n' and the output
     * always ends with one).
     *
     * NOTE: both files are read/written using the platform default charset.
     */
    public static void main(String args[]) {
        if (args.length < 1) {
            System.err.println("Usage: java XMLFix <file>");
            return;
        }
        BufferedReader br = null;
        Writer output = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(args[0])));
            output = new BufferedWriter(new FileWriter(new File(args[0] + ".out")));
            String strLine;
            while ((strLine = br.readLine()) != null) {
                output.write(stripNonValidXMLCharacters(strLine));
                // readLine() drops the line terminator; put one back so the
                // lines are not glued together in the output.
                output.write('\n');
            }
        } catch (IOException e) {
            System.err.println("IOError: " + e.getMessage());
        } finally {
            // Always release both files, even when reading or writing failed.
            close(output);
            close(br);
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment