Skip to content

Instantly share code, notes, and snippets.

@JobsDong
Last active August 29, 2015 14:03
Show Gist options
  • Save JobsDong/e07cd3aff253ad37b449 to your computer and use it in GitHub Desktop.
Save JobsDong/e07cd3aff253ad37b449 to your computer and use it in GitHub Desktop.
Charset detection for web crawling
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static helpers used while crawling: validating HTTP method names,
 * normalizing header maps, and detecting the character set of a response
 * either from its Content-Type header or from the meta tags at the start
 * of the body.
 */
public class HttpUtils {

    /** Lower-case HTTP method names accepted by {@link #islegalMethod(String)}. */
    private static final Set<String> METHODS = new HashSet<String>(Arrays.asList("get", "post"));

    /** Number of leading body bytes that are scanned for meta tags. */
    private static final int META_TAG_BUFFER_SIZE = 8192;

    /** Matches an opening meta tag and captures its attribute text in group 1. */
    private static final Pattern HTTP_META_PATTERN = Pattern.compile(
            "(?is)<\\s*meta\\s+([^<>]+)"
    );

    /**
     * Matches {@code charset=<name>} (optionally quoted) case-insensitively and
     * captures the candidate charset name in group 1.
     */
    private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile(
            ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
    );

    private static final Charset ASCII = Charset.forName("US-ASCII");

    /**
     * Tells whether the given HTTP method name is one of the supported methods
     * ("get" or "post"), ignoring case.
     *
     * @param method the method name; may be {@code null}
     * @return {@code true} if the method is supported, {@code false} otherwise
     *         (including for {@code null})
     */
    public static boolean islegalMethod(String method) {
        // Locale.ROOT keeps the comparison independent of the JVM default locale.
        return method != null && METHODS.contains(method.toLowerCase(Locale.ROOT));
    }

    /**
     * Returns a copy of the given header map with all names lower-cased.
     * Entries with a {@code null} name or value are dropped. When several
     * names collapse to the same lower-case name, their values are joined
     * with {@code ";"} in the iteration order of the input map.
     *
     * @param headers the headers to normalize; may be {@code null}
     * @return a new mutable map, empty if {@code headers} is {@code null}
     */
    public static Map<String, String> formatHeaders(Map<String, String> headers) {
        Map<String, String> formatted = new HashMap<String, String>();
        if (headers == null) {
            return formatted;
        }
        for (Map.Entry<String, String> entry : headers.entrySet()) {
            String name = entry.getKey();
            String value = entry.getValue();
            if (name == null || value == null) {
                continue;
            }
            // Locale.ROOT: header names are ASCII tokens and must not be
            // lower-cased with locale-specific rules (e.g. Turkish dotless i).
            String lowKey = name.toLowerCase(Locale.ROOT);
            String previous = formatted.get(lowKey);
            formatted.put(lowKey, previous == null ? value : previous + ";" + value);
        }
        return formatted;
    }

    /**
     * Converts an array of headers into a map keyed by lower-cased header name.
     * If the same name occurs more than once, the last value wins.
     *
     * @param headers the headers to convert; may be {@code null}
     * @return a new mutable map, empty if {@code headers} is {@code null}
     */
    public static Map<String, String> convertHeaders(Header[] headers) {
        Map<String, String> converted = new HashMap<String, String>();
        if (headers == null) {
            return converted;
        }
        for (Header header : headers) {
            converted.put(header.getName().toLowerCase(Locale.ROOT), header.getValue());
        }
        return converted;
    }

    /**
     * Detects the charset declared in the entity's Content-Type header.
     *
     * @param entity the HTTP entity to inspect
     * @return the first declared charset that this JVM supports, or
     *         {@code null} if the entity has no Content-Type header or the
     *         header declares no supported charset
     * @throws NullPointerException if {@code entity} is {@code null}
     */
    public static Charset detect(HttpEntity entity) {
        if (entity == null) {
            throw new NullPointerException("entity can't be null");
        }
        // getContentType() returns null when the content type is unknown.
        Header contentType = entity.getContentType();
        if (contentType == null) {
            return null;
        }
        String value = contentType.getValue();
        if (value == null) {
            return null;
        }
        // The pattern is case-insensitive, so no lower-casing is needed.
        return findSupportedCharset(FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(value));
    }

    /**
     * Detects the charset declared in a meta tag within the first
     * {@code META_TAG_BUFFER_SIZE} bytes of the stream. The stream is marked
     * before reading and reset afterwards, so the caller can still read the
     * content from the position it had on entry.
     *
     * @param input the content stream; must support mark/reset
     * @return the first declared charset that this JVM supports, or
     *         {@code null} if none is found
     * @throws NullPointerException if {@code input} is {@code null}
     * @throws IOException if the stream does not support mark/reset or if
     *         reading or resetting fails
     */
    public static Charset detect(InputStream input) throws IOException {
        if (input == null) {
            throw new NullPointerException("content can't be null");
        }
        // Fail before consuming anything: without mark support the bytes read
        // below could not be handed back to the caller.
        if (!input.markSupported()) {
            throw new IOException("input stream does not support mark/reset");
        }

        // Read enough of the stream to capture possible meta tags.
        input.mark(META_TAG_BUFFER_SIZE);
        byte[] buffer = new byte[META_TAG_BUFFER_SIZE];
        int total = 0;
        while (total < buffer.length) {
            int read = input.read(buffer, total, buffer.length - total);
            if (read == -1) {
                break;
            }
            total += read;
        }
        input.reset();

        // Interpret the head as ASCII and look for meta tags carrying a
        // character encoding hint.
        String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, total)).toString();
        Matcher metaTags = HTTP_META_PATTERN.matcher(head);
        Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher("");
        while (metaTags.find()) {
            charsetMatcher.reset(metaTags.group(1));
            Charset found = findSupportedCharset(charsetMatcher);
            if (found != null) {
                return found;
            }
        }
        return null;
    }

    /**
     * Walks the remaining matches of a charset-attribute matcher and returns
     * the first captured name that resolves to a supported charset.
     */
    private static Charset findSupportedCharset(Matcher candidates) {
        while (candidates.find()) {
            String name = candidates.group(1);
            try {
                if (Charset.isSupported(name)) {
                    return Charset.forName(name);
                }
            } catch (IllegalArgumentException ignored) {
                // Both isSupported and forName reject syntactically illegal
                // names (e.g. one starting with '-') with an
                // IllegalArgumentException subclass; treat that as
                // "not a usable hint" and try the next candidate.
            }
        }
        return null;
    }
}
@JobsDong
Copy link
Author

HttpEntity is a class from Apache HttpClient 4.3.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment