Skip to content

Instantly share code, notes, and snippets.

Last active June 9, 2017 14:33
Show Gist options
  • Save isspek/7ee1d78a8a512abb000a3fac09cf2177 to your computer and use it in GitHub Desktop.
Save isspek/7ee1d78a8a512abb000a3fac09cf2177 to your computer and use it in GitHub Desktop.
Modification of HttpManagement in GERBIL for proxy servers
/**
 * This file is part of General Entity Annotator Benchmark.
 *
 * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.gerbil.http;
import java.util.concurrent.Semaphore;
import org.aksw.gerbil.config.GerbilConfiguration;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.DefaultProxyRoutePlanner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.carrotsearch.hppc.ObjectLongOpenHashMap;
public class HttpManagement {
private static final Logger LOGGER = LoggerFactory.getLogger(HttpManagement.class);
public static final String MAXIMUM_TIME_TO_WAIT_KEY = "org.aksw.gerbil.annotator.http.HttpManagement.maxWaitingTime";
public static final String CHECK_INTERVAL_KEY = "org.aksw.gerbil.annotator.http.HttpManagement.checkInterval";
public static final String PROXY_HOST_KEY = "org.aksw.gerbil.annotator.http.HttpManagement.proxyHost";
public static final String PROXY_PORT_KEY = "org.aksw.gerbil.annotator.http.HttpManagement.proxyPort";
* TODO move this list into the property files.
private static final String BLOCKING_DOMAINS[] = new String[] { "", "",
"", "", "", "", "", "",
"", "", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "",
"", "" };
public static final long DEFAULT_WAITING_TIME = 60000;
public static final long DEFAULT_CHECK_INTERVAL = 10000;
* The time the system should wait before sending a new request to a domain
* that could block the system.
private static final long BLOCKING_DOMAIN_WAITING_TIME = 500;
private static final String INTERRUPTER_THREAD_NAME = "HttpInterrupter";
private static final String USER_AGENT_STRING = "GERBIL/" + GerbilConfiguration.getGerbilVersion()
+ " (";
private static HttpManagement instance;
public synchronized static HttpManagement getInstance() {
if (instance == null) {
long maxWaitingTime = DEFAULT_WAITING_TIME;
try {
maxWaitingTime = GerbilConfiguration.getInstance().getLong(MAXIMUM_TIME_TO_WAIT_KEY);
} catch (Exception e) {
LOGGER.warn("Couldn't load maximum time to wait from configuration. Using default "
long checkInterval = DEFAULT_CHECK_INTERVAL;
try {
checkInterval = GerbilConfiguration.getInstance().getLong(CHECK_INTERVAL_KEY);
} catch (Exception e) {
LOGGER.warn("Couldn't load check interval from configuration. Using default " + DEFAULT_CHECK_INTERVAL
+ "ms.", e);
InterruptingObserver interruptingObserver = new InterruptingObserver(maxWaitingTime, checkInterval);
Thread t = new Thread(interruptingObserver);
instance = new HttpManagement(interruptingObserver, USER_AGENT_STRING);
for (int i = 0; i < BLOCKING_DOMAINS.length; ++i) {
return instance;
protected InterruptingObserver interruptingObserver;
protected CloseableHttpClient client;
protected String userAgent;
protected Semaphore blockingDomainMappingMutex = new Semaphore(1);
protected ObjectLongOpenHashMap<String> blockingDomainTimestampMapping = new ObjectLongOpenHashMap<String>();
protected HttpManagement(InterruptingObserver interruptingObserver, String userAgent) {
this.interruptingObserver = interruptingObserver;
this.client = generateHttpClientBuilder().build();
public void reportStart(HttpRequestEmitter emitter, HttpUriRequest request) {
// get the permission to send
interruptingObserver.reportStart(emitter, request);
protected void getStartPermission(HttpUriRequest request) {
try {
} catch (InterruptedException e) {
LOGGER.error("Interrupted while waiting for mutex to access the list of blocking domains. Aborting.");
long timeToSleep = 0;
try {
String host = request.getURI().getHost();
if ((host == null) || (!blockingDomainTimestampMapping.containsKey(host))) {
// we are allowed to use lget and lset since the mutex is securing
// the hashmap
long lastRequestTimeStamp = blockingDomainTimestampMapping.lget();
long currentTime = System.currentTimeMillis();
timeToSleep = BLOCKING_DOMAIN_WAITING_TIME - (currentTime - lastRequestTimeStamp);
if (timeToSleep > 0) {
blockingDomainTimestampMapping.lset(currentTime + timeToSleep);
} else {
} finally {
if (timeToSleep > 0) {
try {
} catch (InterruptedException e) {
LOGGER.error("Interrupted while waiting for permission. Sending will start now.");
public void reportEnd(HttpRequestEmitter emitter, HttpUriRequest request) {
interruptingObserver.reportEnd(emitter, request);
public void setMaxWaitingTime(long maxWaitingTime) {
public void setCheckInterval(long checkInterval) {
public long getMaxWaitingTime() {
return interruptingObserver.getMaxWaitingTime();
public long getCheckInterval() {
return interruptingObserver.getCheckInterval();
public CloseableHttpClient getDefaultClient() {
return client;
* Adds a domain that might block HTTP clients if they are sending too many
* requests.
public void addBlockingDomain(String domain) {
try {
} catch (InterruptedException e) {
LOGGER.error("Interrupted while waiting for mutex to access the list of blocking domains. Aborting.");
try {
blockingDomainTimestampMapping.put(domain, 0);
} finally {
* Creates a HttpClientBuilder with the default settings of GERBIL.
* @return a HttpClientBuilder with the default settings of GERBIL.
public HttpClientBuilder generateHttpClientBuilder() {
HttpClientBuilder builder = HttpClientBuilder.create();
String proxyHost = GerbilConfiguration.getInstance().getString(PROXY_HOST_KEY);
int proxyPort = GerbilConfiguration.getInstance().getInt(PROXY_PORT_KEY);
HttpHost proxy = new HttpHost(proxyHost, proxyPort);
DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);
return builder;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment