Skip to content

Instantly share code, notes, and snippets.

@jnioche
Created March 14, 2017 17:00
Show Gist options
  • Save jnioche/6141308519694b5c57d4fbd45d5990ac to your computer and use it in GitHub Desktop.
Save jnioche/6141308519694b5c57d4fbd45d5990ac to your computer and use it in GitHub Desktop.
Example of code I used for passing cookies to the protocol implementation in Nutch
package org.apache.nutch.protocol.httpclient;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.nutch.crawl.CrawlDatum;
/**
 * Converts the cookies stored in the metadata of a {@link CrawlDatum} into
 * {@link Cookie} objects suitable for the protocol request.
 *
 * <p>
 * The cookies are read from the metadata entry {@link #cookiesKey}. The value
 * is a tab-separated list of cookie strings, each of the form
 * {@code name=value} optionally followed by {@code ;}-separated attributes
 * ({@code secure}, {@code path=}, {@code domain=}, {@code expires=}).
 *
 * @see <a href="http://en.wikipedia.org/wiki/HTTP_cookie">HTTP cookie</a>
 **/
public class CookieConverter {

  /** Metadata key {@code nutch.use.cookies}; not read by this class. */
  public final static Text useCookiesKey = new Text("nutch.use.cookies");

  /** Metadata key under which the serialized cookies are stored. */
  public final static Text cookiesKey = new Text("nutch.cookies");

  /**
   * Format used to parse the {@code expires} attribute. Day and month names
   * in cookie dates are English, hence the fixed locale. The instance is not
   * thread-safe: this class only uses it while holding the class lock.
   */
  public static SimpleDateFormat dateFormat = new SimpleDateFormat(
      "EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);

  /**
   * Returns the cookies found in the metadata of {@code datum} which apply to
   * {@code targetURL}.
   *
   * <p>
   * A cookie is dropped when its domain or path does not match the URL, when
   * it is marked secure and the URL is not https, or when its expiry date is
   * in the past. Entries without a {@code name=value} pair are skipped. An
   * {@code expires} value that cannot be parsed is ignored and the cookie is
   * kept without an expiry date.
   *
   * @param datum
   *          the datum whose metadata may hold cookies under
   *          {@link #cookiesKey}
   * @param targetURL
   *          the URL about to be fetched
   * @return a new, possibly empty, list of matching cookies; never null
   */
  public static List<Cookie> getCookies(CrawlDatum datum, URL targetURL) {
    List<Cookie> list = new ArrayList<Cookie>();
    Writable cookies = datum.getMetaData().get(cookiesKey);
    if (cookies == null) {
      return list;
    }
    String serialized = cookies.toString();
    if (serialized.length() == 0) {
      return list;
    }

    Date now = new Date();
    for (String cs : serialized.split("\t")) {
      String[] tokens = cs.split(";");
      // split() yields an empty array for strings made only of ';'
      if (tokens.length == 0) {
        continue;
      }
      int equals = tokens[0].indexOf('=');
      if (equals < 0) {
        // not a name=value pair: skip instead of failing on substring()
        continue;
      }
      String name = tokens[0].substring(0, equals);
      String value = tokens[0].substring(equals + 1);

      String expires = null;
      String domain = null;
      String path = null;
      boolean secure = false;
      for (int i = 1; i < tokens.length; i++) {
        String attribute = tokens[i].trim();
        if (attribute.equalsIgnoreCase("secure")) {
          secure = true;
        } else if (hasPrefix(attribute, "path=")) {
          path = attribute.substring(5);
        } else if (hasPrefix(attribute, "domain=")) {
          domain = attribute.substring(7);
        } else if (hasPrefix(attribute, "expires=")) {
          expires = attribute.substring(8);
        }
      }

      BasicClientCookie cookie = new BasicClientCookie(name, value);

      // check domain
      if (domain != null) {
        cookie.setDomain(domain);
        if (!domainMatches(targetURL.getHost(), domain)) {
          continue;
        }
      }

      // check path
      if (path != null) {
        cookie.setPath(path);
        if (!pathMatches(targetURL.getPath(), path)) {
          continue;
        }
      }

      // check secure
      if (secure) {
        cookie.setSecure(secure);
        if (!targetURL.getProtocol().equalsIgnoreCase("https")) {
          continue;
        }
      }

      // check expiration
      if (expires != null) {
        try {
          // the expiry date must be set before isExpired() can report it
          cookie.setExpiryDate(parseExpires(expires));
          if (cookie.isExpired(now)) {
            continue;
          }
        } catch (ParseException ignored) {
          // best effort: an unparseable date leaves the cookie without an
          // expiry date rather than discarding it
        }
      }

      list.add(cookie);
    }
    return list;
  }

  /** Case-insensitive, locale-independent {@code startsWith}. */
  private static boolean hasPrefix(String s, String prefix) {
    return s.regionMatches(true, 0, prefix, 0, prefix.length());
  }

  /**
   * Tells whether {@code host} is {@code domain} or one of its subdomains. A
   * leading dot in {@code domain} is ignored and the comparison is
   * case-insensitive. An empty domain places no restriction on the host.
   */
  private static boolean domainMatches(String host, String domain) {
    String expected = domain.startsWith(".") ? domain.substring(1) : domain;
    if (expected.length() == 0) {
      return true;
    }
    String h = host.toLowerCase(Locale.ROOT);
    String d = expected.toLowerCase(Locale.ROOT);
    return h.equals(d) || h.endsWith("." + d);
  }

  /**
   * Tells whether {@code cookiePath} covers {@code requestPath}: both are
   * equal, or {@code cookiePath} is a prefix ending at a {@code /} boundary.
   * An empty request path is treated as {@code /}; an empty cookie path
   * places no restriction on the request path.
   */
  private static boolean pathMatches(String requestPath, String cookiePath) {
    if (cookiePath.length() == 0) {
      return true;
    }
    String normalized = requestPath.length() == 0 ? "/" : requestPath;
    if (!normalized.startsWith(cookiePath)) {
      return false;
    }
    return normalized.length() == cookiePath.length()
        || cookiePath.endsWith("/")
        || normalized.charAt(cookiePath.length()) == '/';
  }

  /** Parses an {@code expires} value, serializing access to the format. */
  private static synchronized Date parseExpires(String expires)
      throws ParseException {
    return dateFormat.parse(expires);
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment