Created
May 23, 2012 13:11
-
-
Save wfwei/2775168 to your computer and use it in GitHub Desktop.
Charset detection in Tika
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* TIKA-332: Check for meta http-equiv tag with charset info in | |
* HTML content. | |
* <p> | |
* TODO: Move this into core, along with CharsetDetector | |
*/ | |
private String getEncoding(InputStream stream, Metadata metadata) throws IOException { | |
stream.mark(META_TAG_BUFFER_SIZE); | |
char[] buffer = new char[META_TAG_BUFFER_SIZE]; | |
InputStreamReader isr = new InputStreamReader(stream, "us-ascii"); | |
// why use "us-ascii" cz: we only care about the <meta http-equiv="Content-type" content="text/html; charset=xxx"> tag | |
int bufferSize = isr.read(buffer); | |
stream.reset(); | |
if (bufferSize != -1) { | |
String metaString = new String(buffer, 0, bufferSize); | |
Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString); | |
if (m.find()) { | |
// TIKA-349: flexible handling of attributes | |
// We have one or more x or x=y attributes, separated by ';' | |
String[] attrs = m.group(1).split(";"); | |
for (String attr : attrs) { | |
String[] keyValue = attr.trim().split("="); | |
if ((keyValue.length == 2) && keyValue[0].equalsIgnoreCase("charset")) { | |
// TIKA-459: improve charset handling. | |
String charset = CharsetUtils.clean(keyValue[1]); | |
if (CharsetUtils.isSupported(charset)) { | |
metadata.set(Metadata.CONTENT_ENCODING, charset); | |
return charset; | |
} | |
} | |
} | |
} | |
} | |
// No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding | |
// hint, or the passed content-type hint. | |
CharsetDetector detector = new CharsetDetector(); | |
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING); | |
String incomingType = metadata.get(Metadata.CONTENT_TYPE); | |
if (incomingCharset == null && incomingType != null) { | |
// TIKA-341: Use charset in content-type | |
MediaType mt = MediaType.parse(incomingType); | |
if (mt != null) { | |
String charset = mt.getParameters().get("charset"); | |
if ((charset != null) && Charset.isSupported(charset)) { | |
incomingCharset = charset; | |
} | |
} | |
} | |
if (incomingCharset != null) { | |
detector.setDeclaredEncoding(incomingCharset); | |
} | |
// TIKA-341 without enabling input filtering (stripping of tags) the | |
// short HTML tests don't work well. | |
detector.enableInputFilter(true); | |
detector.setText(stream); | |
for (CharsetMatch match : detector.detectAll()) { | |
if (Charset.isSupported(match.getName())) { | |
metadata.set(Metadata.CONTENT_ENCODING, match.getName()); | |
// TIKA-339: Don't set language, as it's typically not a very good | |
// guess, and it can create ambiguity if another (better) language | |
// value is specified by a meta tag in the HTML (or via HTTP response | |
// header). | |
/* | |
String language = match.getLanguage(); | |
if (language != null) { | |
metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage()); | |
metadata.set(Metadata.LANGUAGE, match.getLanguage()); | |
} | |
*/ | |
break; | |
} | |
} | |
String encoding = metadata.get(Metadata.CONTENT_ENCODING); | |
if (encoding == null) { | |
if (Charset.isSupported(DEFAULT_CHARSET)) { | |
encoding = DEFAULT_CHARSET; | |
} else { | |
encoding = Charset.defaultCharset().name(); | |
} | |
metadata.set(Metadata.CONTENT_ENCODING, encoding); | |
} | |
return encoding; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment