Skip to content

Instantly share code, notes, and snippets.

@DanielJomphe
Created June 14, 2010 13:14
Show Gist options
  • Save DanielJomphe/437660 to your computer and use it in GitHub Desktop.
Save DanielJomphe/437660 to your computer and use it in GitHub Desktop.
/**
 * Applies a very default Google Caja HTML cajoling pipeline.
 * <p>
 * There are three things we might not like about this naive, default implementation:
 * <ol>
 * <li>It rewrites URLs by prefixing and suffixing them with stuff.</li>
 * <li>It uses Google Caja's default whitelists. They accept all valid markup that's not deemed insecure.
 * We might prefer to provide more restrictive whitelists.</li>
 * <li>It will be slow in an app where many users use it concurrently. This is not the case for our
 * first client's needs, so we don't care for now.</li>
 * </ol>
 */
public class MyCaja {
    /**
     * Matches the opening tag of the script element that follows the HTML in the cajoler's output.
     * Compiled once instead of on every call; fully qualified so no extra import is required.
     */
    private static final java.util.regex.Pattern SCRIPT_OPEN_TAG =
            java.util.regex.Pattern.compile("<script[^>]*>");

    /**
     * Applies a whitelist to all HTML and CSS contents in the provided <code>weirdHtml</code>, and
     * drops everything from the first <code>&lt;script&gt;</code> opening tag of the cajoled output onwards.
     *
     * @param weirdHtml The markup to sanitize.
     * @param charsetName The charset to use throughout: it is handed to the cajoler's input and used to
     *        decode the cajoler's output bytes. If ever you'd like the output to be in a different charset
     *        than this one, take a look at this method's code to see what parameter you may add.
     * @return The sanitized version of <code>weirdHtml</code>.
     * @throws MyCajaException if <code>sanitize</code> couldn't do its job for any reason; the original
     *         failure is available through {@link Throwable#getCause()}.
     * @see MyCaja
     */
    public static String sanitize(String weirdHtml, String charsetName) throws MyCajaException {
        // Keep a handle on the writer: PrintWriter(OutputStream) buffers without auto-flush.
        final PrintWriter messageWriter = new PrintWriter(System.out);
        final EchoingMessageQueue mq = new EchoingMessageQueue(messageWriter, new MessageContext(), false);
        BuildInfo.getInstance().addBuildInfo(mq);
        final String htmlAndJs;
        try {
            htmlAndJs = cajoleHtml(weirdHtml, charsetName, mq);
        } catch (final Exception e) {
            // Preserve the cause so callers can find out what actually went wrong.
            throw new MyCajaException("Could not sanitize the provided HTML", e);
        } finally {
            // Make sure any message echoed to the queue's writer actually reaches System.out.
            messageWriter.flush();
        }
        /* The objective in our way of using Caja was to write as little custom code as possible; KISS.
         * I feel Caja is still a moving target, so the less we custom-write, the less we'll have to maintain.
         * Due to this decision, what we get out of htmlHandler.apply(...) is both html and a very small javascript function.
         * I don't believe this superfluous javascript handling comes with too big of a performance hit for this client's needs,
         * but if it does, we might want to try to inline HtmlHandler's class here and remove the superfluous javascript handling.
         */
        // A limit of 2 stops at the first script tag and always yields at least one element,
        // even when the output consists of nothing but the script tag itself.
        return SCRIPT_OPEN_TAG.split(htmlAndJs, 2)[0];
    }

    /**
     * Runs <code>weirdHtml</code> through Caja's {@code HtmlHandler} and returns whatever the handler wrote,
     * decoded with <code>charsetName</code>.
     *
     * @param weirdHtml The markup to cajole.
     * @param charsetName The charset declared for the input and used to decode the handler's output bytes.
     * @param mq The message queue handed to the handler.
     * @return The handler's raw output (html followed by a script element, see {@link #sanitize}).
     * @throws IOException declared by the underlying Caja and stream calls.
     * @throws UnsupportedContentTypeException declared by the underlying Caja calls.
     */
    private static String cajoleHtml(String weirdHtml, String charsetName, final EchoingMessageQueue mq)
            throws IOException, UnsupportedContentTypeException {
        final InputSource inputSource = InputSource.UNKNOWN;
        final String contentType = ContentType.HTML.mimeType;
        final FetchedData in = FetchedData.fromReader(new StringReader(weirdHtml), inputSource, contentType, charsetName);
        // http://daniel.com: if there was a constructor that accepts an UriPolicy, that would be great. We don't need to rewrite URLs.
        final HtmlHandler htmlHandler = new HtmlHandler(BuildInfo.getInstance(), "http://daniel.com", UriFetcher.NULL_NETWORK);
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        htmlHandler.apply(
                inputSource.getUri(),
                null, // would otherwise be Transform.CAJOLE
                null, // would otherwise be Lists.newArrayList(Directive.CAJITA)
                new NullContentHandlerArgs(),
                contentType, contentType,
                new StrictContentTypeCheck(),
                in, out,
                mq);
        return out.toString(charsetName);
    }

    /**
     * Small manual demo: sanitizes an empty string, then the contents of an HTML file, and prints both results.
     *
     * @param args Optional: <code>args[0]</code> is the path of the file to sanitize;
     *        defaults to <code>C:/input.html</code> when absent.
     * @throws IOException if the input file cannot be read.
     * @throws MyCajaException if sanitizing fails.
     */
    public static void main(String[] args) throws IOException, MyCajaException {
        final String charsetName = "ISO-8859-1";
        final String inputPath = args.length > 0 ? args[0] : "C:/input.html";
        System.out.println(sanitize("", charsetName));
        // Read the file with the same charset we sanitize with, rather than the platform default.
        System.out.println(sanitize(FileUtils.readFileToString(new File(inputPath), charsetName), charsetName));
    }

    /**
     * HtmlHandler only uses it to check arg MODULE_CALLBACK, which we don't care about.
     * Each time this code is maintained, please make sure HtmlHandler didn't start using more args.
     */
    protected static class NullContentHandlerArgs extends ContentHandlerArgs {
        /**
         * @param name The name of the requested argument (ignored).
         * @return Always <code>null</code>: no argument is ever provided.
         */
        @Override
        public String get(String name) {
            return null;
        }
    }

    /**
     * Thrown by {@link MyCaja#sanitize(String, String)} when the markup could not be sanitized.
     */
    public static class MyCajaException extends Exception {
        private static final long serialVersionUID = 0L;

        /** Creates an exception without message nor cause (kept for backward compatibility). */
        public MyCajaException() {
            super();
        }

        /**
         * @param message A description of what failed.
         * @param cause The underlying failure.
         */
        public MyCajaException(String message, Throwable cause) {
            super(message, cause);
        }

        /**
         * @param cause The underlying failure.
         */
        public MyCajaException(Throwable cause) {
            super(cause);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment