Skip to content

Instantly share code, notes, and snippets.

@wfwei
Created May 23, 2012 13:11
Show Gist options
  • Save wfwei/2775168 to your computer and use it in GitHub Desktop.
Save wfwei/2775168 to your computer and use it in GitHub Desktop.
Charset detection in Tika
/**
 * Determines the character encoding of HTML content and stores it in
 * {@code metadata} under {@code Metadata.CONTENT_ENCODING}.
 * <p>
 * The encoding is looked up in this order:
 * <ol>
 * <li>TIKA-332: a supported charset declared in a meta http-equiv tag found
 *     in the first {@code META_TAG_BUFFER_SIZE} bytes of the content;</li>
 * <li>the first supported match reported by {@code CharsetDetector}, which is
 *     given the content-encoding hint (or, failing that, the charset parameter
 *     of the content-type hint) already present in {@code metadata};</li>
 * <li>{@code DEFAULT_CHARSET}, or the platform default charset if
 *     {@code DEFAULT_CHARSET} is not supported.</li>
 * </ol>
 * <p>
 * TODO: Move this into core, along with CharsetDetector
 *
 * @param stream   the content to inspect; it is marked, read and reset here,
 *                 so it is expected to support {@code mark}/{@code reset}
 * @param metadata source of the content-encoding/content-type hints; receives
 *                 the chosen encoding
 * @return the name of the chosen encoding, never {@code null}
 * @throws IOException if the stream cannot be read or reset
 */
private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
    // Read the head of the document as raw bytes. A Reader must not be layered
    // on top of the stream here: it may read ahead past the mark limit, which
    // would make the reset() below fail or rewind to the wrong position.
    stream.mark(META_TAG_BUFFER_SIZE);
    byte[] head = new byte[META_TAG_BUFFER_SIZE];
    int headLength = 0;
    int read;
    // A single read() may return fewer bytes than requested, so keep reading
    // until the buffer is full or the stream ends.
    while (headLength < head.length
            && (read = stream.read(head, headLength, head.length - headLength)) != -1) {
        headLength += read;
    }
    stream.reset();

    if (headLength > 0) {
        // US-ASCII is enough: we only care about the
        // <meta http-equiv="Content-type" content="text/html; charset=xxx"> tag.
        String metaString = new String(head, 0, headLength, "us-ascii");
        Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString);
        if (m.find()) {
            // TIKA-349: flexible handling of attributes
            // We have one or more x or x=y attributes, separated by ';'
            for (String attr : m.group(1).split(";")) {
                String[] keyValue = attr.trim().split("=");
                // Trim the key so that "charset = xxx" is recognised as well.
                if ((keyValue.length == 2) && keyValue[0].trim().equalsIgnoreCase("charset")) {
                    // TIKA-459: improve charset handling.
                    String charset = CharsetUtils.clean(keyValue[1]);
                    if (CharsetUtils.isSupported(charset)) {
                        metadata.set(Metadata.CONTENT_ENCODING, charset);
                        return charset;
                    }
                }
            }
        }
    }

    // No (valid) charset in a meta http-equiv tag, see if it's in the passed
    // content-encoding hint, or the passed content-type hint.
    CharsetDetector detector = new CharsetDetector();
    String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
    String incomingType = metadata.get(Metadata.CONTENT_TYPE);
    if (incomingCharset == null && incomingType != null) {
        // TIKA-341: Use charset in content-type
        MediaType mt = MediaType.parse(incomingType);
        if (mt != null) {
            String charset = mt.getParameters().get("charset");
            if (charset != null) {
                try {
                    if (Charset.isSupported(charset)) {
                        incomingCharset = charset;
                    }
                } catch (IllegalArgumentException ignored) {
                    // The hint comes from outside and may not be a legal charset
                    // name; Charset.isSupported rejects such names with an
                    // exception. Ignore the hint rather than abort detection.
                }
            }
        }
    }
    if (incomingCharset != null) {
        detector.setDeclaredEncoding(incomingCharset);
    }

    // TIKA-341 without enabling input filtering (stripping of tags) the
    // short HTML tests don't work well.
    detector.enableInputFilter(true);
    detector.setText(stream);
    for (CharsetMatch match : detector.detectAll()) {
        if (Charset.isSupported(match.getName())) {
            metadata.set(Metadata.CONTENT_ENCODING, match.getName());
            // TIKA-339: Don't set language, as it's typically not a very good
            // guess, and it can create ambiguity if another (better) language
            // value is specified by a meta tag in the HTML (or via HTTP response
            // header).
            break;
        }
    }

    // Still nothing (no hint and no supported detector match): fall back to a default.
    String encoding = metadata.get(Metadata.CONTENT_ENCODING);
    if (encoding == null) {
        if (Charset.isSupported(DEFAULT_CHARSET)) {
            encoding = DEFAULT_CHARSET;
        } else {
            encoding = Charset.defaultCharset().name();
        }
        metadata.set(Metadata.CONTENT_ENCODING, encoding);
    }
    return encoding;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment