wfwei/getEncoding

## getEncoding
 /**
     * Determines the character encoding of an HTML stream and records it in
     * the given metadata under {@code Metadata.CONTENT_ENCODING}.
     * <p>
     * The sources are consulted in this order:
     * <ol>
     * <li>TIKA-332: a charset found by {@code HTTP_EQUIV_PATTERN} in the first
     *     {@code META_TAG_BUFFER_SIZE} bytes of the stream;</li>
     * <li>the result of {@code CharsetDetector}, seeded with the incoming
     *     content-encoding hint or (TIKA-341) the charset parameter of the
     *     incoming content-type hint;</li>
     * <li>any content-encoding value already present in the metadata;</li>
     * <li>{@code DEFAULT_CHARSET}, or the platform default charset if
     *     {@code DEFAULT_CHARSET} is not supported by this JVM.</li>
     * </ol>
     * <p>
     * TODO: Move this into core, along with CharsetDetector
     *
     * @param stream   the HTML content; {@code mark}/{@code reset} are called
     *                 on it, so it is assumed to support marking (not checked
     *                 here - confirm against callers)
     * @param metadata source of the encoding/type hints and destination of
     *                 the resulting content-encoding value
     * @return the name of the chosen encoding
     * @throws IOException if reading from or resetting the stream fails
     */
    private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
        // Read the head of the document as raw bytes. Going through an
        // InputStreamReader is unsafe here: a reader may pull more bytes from
        // the underlying stream than it returns, which can run past the mark
        // limit and make reset() fail. Looping also copes with short reads.
        stream.mark(META_TAG_BUFFER_SIZE);
        byte[] head = new byte[META_TAG_BUFFER_SIZE];
        int headLength = 0;
        while (headLength < head.length) {
            int count = stream.read(head, headLength, head.length - headLength);
            if (count == -1) {
                break;
            }
            headLength += count;
        }
        stream.reset();

        if (headLength > 0) {
            // US-ASCII is enough: we only care about the
            // <meta http-equiv="Content-type" content="text/html; charset=xxx"> tag.
            String metaString = new String(head, 0, headLength, "us-ascii");
            Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString);
            if (m.find()) {
                // TIKA-349: flexible handling of attributes
                // We have one or more x or x=y attributes, separated by ';'
                String[] attrs = m.group(1).split(";");
                for (String attr : attrs) {
                    String[] keyValue = attr.trim().split("=");
                    // Trim the key so that "charset = utf-8" is recognized too.
                    if ((keyValue.length == 2) && keyValue[0].trim().equalsIgnoreCase("charset")) {
                        // TIKA-459: improve charset handling.
                        String charset = CharsetUtils.clean(keyValue[1]);
                        if (CharsetUtils.isSupported(charset)) {
                            metadata.set(Metadata.CONTENT_ENCODING, charset);
                            return charset;
                        }
                    }
                }
            }
        }

        // No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding
        // hint, or the passed content-type hint.
        CharsetDetector detector = new CharsetDetector();
        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
        if (incomingCharset == null && incomingType != null) {
            // TIKA-341: Use charset in content-type
            MediaType mt = MediaType.parse(incomingType);
            if (mt != null) {
                String charset = mt.getParameters().get("charset");
                if (charset != null) {
                    try {
                        if (Charset.isSupported(charset)) {
                            incomingCharset = charset;
                        }
                    } catch (IllegalArgumentException ignored) {
                        // Charset.isSupported throws IllegalCharsetNameException
                        // for syntactically illegal names. A malformed header
                        // value is only a bad hint, so drop it and let the
                        // detector decide.
                    }
                }
            }
        }

        if (incomingCharset != null) {
            detector.setDeclaredEncoding(incomingCharset);
        }

        // TIKA-341 without enabling input filtering (stripping of tags) the
        // short HTML tests don't work well.
        detector.enableInputFilter(true);
        detector.setText(stream);
        for (CharsetMatch match : detector.detectAll()) {
            if (Charset.isSupported(match.getName())) {
                // TIKA-339: Don't set language, as it's typically not a very good
                // guess, and it can create ambiguity if another (better) language
                // value is specified by a meta tag in the HTML (or via HTTP response
                // header).
                metadata.set(Metadata.CONTENT_ENCODING, match.getName());
                break;
            }
        }

        // Either the detector's pick or a pre-existing hint; otherwise fall
        // back to a default that this JVM is guaranteed to support.
        String encoding = metadata.get(Metadata.CONTENT_ENCODING);
        if (encoding == null) {
            if (Charset.isSupported(DEFAULT_CHARSET)) {
                encoding = DEFAULT_CHARSET;
            } else {
                encoding = Charset.defaultCharset().name();
            }

            metadata.set(Metadata.CONTENT_ENCODING, encoding);
        }

        return encoding;
    }
	/**
	 * Determines the character encoding of an HTML stream and records it in
	 * the given metadata under {@code Metadata.CONTENT_ENCODING}.
	 * <p>
	 * The sources are consulted in this order:
	 * <ol>
	 * <li>TIKA-332: a charset found by {@code HTTP_EQUIV_PATTERN} in the first
	 *     {@code META_TAG_BUFFER_SIZE} bytes of the stream;</li>
	 * <li>the result of {@code CharsetDetector}, seeded with the incoming
	 *     content-encoding hint or (TIKA-341) the charset parameter of the
	 *     incoming content-type hint;</li>
	 * <li>any content-encoding value already present in the metadata;</li>
	 * <li>{@code DEFAULT_CHARSET}, or the platform default charset if
	 *     {@code DEFAULT_CHARSET} is not supported by this JVM.</li>
	 * </ol>
	 * <p>
	 * TODO: Move this into core, along with CharsetDetector
	 *
	 * @param stream   the HTML content; {@code mark}/{@code reset} are called
	 *                 on it, so it is assumed to support marking (not checked
	 *                 here - confirm against callers)
	 * @param metadata source of the encoding/type hints and destination of
	 *                 the resulting content-encoding value
	 * @return the name of the chosen encoding
	 * @throws IOException if reading from or resetting the stream fails
	 */
	private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
		// Read the head of the document as raw bytes. Going through an
		// InputStreamReader is unsafe here: a reader may pull more bytes from
		// the underlying stream than it returns, which can run past the mark
		// limit and make reset() fail. Looping also copes with short reads.
		stream.mark(META_TAG_BUFFER_SIZE);
		byte[] head = new byte[META_TAG_BUFFER_SIZE];
		int headLength = 0;
		while (headLength < head.length) {
			int count = stream.read(head, headLength, head.length - headLength);
			if (count == -1) {
				break;
			}
			headLength += count;
		}
		stream.reset();

		if (headLength > 0) {
			// US-ASCII is enough: we only care about the
			// <meta http-equiv="Content-type" content="text/html; charset=xxx"> tag.
			String metaString = new String(head, 0, headLength, "us-ascii");
			Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString);
			if (m.find()) {
				// TIKA-349: flexible handling of attributes
				// We have one or more x or x=y attributes, separated by ';'
				String[] attrs = m.group(1).split(";");
				for (String attr : attrs) {
					String[] keyValue = attr.trim().split("=");
					// Trim the key so that "charset = utf-8" is recognized too.
					if ((keyValue.length == 2) && keyValue[0].trim().equalsIgnoreCase("charset")) {
						// TIKA-459: improve charset handling.
						String charset = CharsetUtils.clean(keyValue[1]);
						if (CharsetUtils.isSupported(charset)) {
							metadata.set(Metadata.CONTENT_ENCODING, charset);
							return charset;
						}
					}
				}
			}
		}

		// No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding
		// hint, or the passed content-type hint.
		CharsetDetector detector = new CharsetDetector();
		String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
		String incomingType = metadata.get(Metadata.CONTENT_TYPE);
		if (incomingCharset == null && incomingType != null) {
			// TIKA-341: Use charset in content-type
			MediaType mt = MediaType.parse(incomingType);
			if (mt != null) {
				String charset = mt.getParameters().get("charset");
				if (charset != null) {
					try {
						if (Charset.isSupported(charset)) {
							incomingCharset = charset;
						}
					} catch (IllegalArgumentException ignored) {
						// Charset.isSupported throws IllegalCharsetNameException
						// for syntactically illegal names. A malformed header
						// value is only a bad hint, so drop it and let the
						// detector decide.
					}
				}
			}
		}

		if (incomingCharset != null) {
			detector.setDeclaredEncoding(incomingCharset);
		}

		// TIKA-341 without enabling input filtering (stripping of tags) the
		// short HTML tests don't work well.
		detector.enableInputFilter(true);
		detector.setText(stream);
		for (CharsetMatch match : detector.detectAll()) {
			if (Charset.isSupported(match.getName())) {
				// TIKA-339: Don't set language, as it's typically not a very good
				// guess, and it can create ambiguity if another (better) language
				// value is specified by a meta tag in the HTML (or via HTTP response
				// header).
				metadata.set(Metadata.CONTENT_ENCODING, match.getName());
				break;
			}
		}

		// Either the detector's pick or a pre-existing hint; otherwise fall
		// back to a default that this JVM is guaranteed to support.
		String encoding = metadata.get(Metadata.CONTENT_ENCODING);
		if (encoding == null) {
			if (Charset.isSupported(DEFAULT_CHARSET)) {
				encoding = DEFAULT_CHARSET;
			} else {
				encoding = Charset.defaultCharset().name();
			}

			metadata.set(Metadata.CONTENT_ENCODING, encoding);
		}

		return encoding;
	}