Created
June 30, 2020 15:46
-
-
Save kris-sigur/2c86c3ad22b0d9b399357c2dff8916ce to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package is.landsbokasafn.crawler.deciderules; | |
import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL;

import java.net.InetAddress;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.commons.net.util.SubnetUtils;
import org.archive.modules.CrawlURI;
import org.archive.modules.deciderules.PredicatedDecideRule;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.springframework.beans.factory.annotation.Autowired;
/** | |
* <p> | |
* Variant on the {@link org.archive.modules.deciderules.IpAddressSetDecideRule} | |
* that also accepts address ranges in CIDR notation instead of just individual | |
* IP addresses. Note that the CIDR addresses are expanded and stored as | |
* individual addresses. This is suitable for small-ish subnets, but for larger | |
* ranges use {@link IpAddressCidrSetDecideRule} | |
* </p> | |
* | |
* <p> | |
* IpAddressSetDecideRule must be used with | |
* org.archive.crawler.prefetch.Preselector#setRecheckScope(boolean) set to true | |
* because it relies on Heritrix' dns lookup to establish the ip address for a | |
* URI before it can run. | |
* </p> | |
* | |
* <pre> | |
* <bean class="org.archive.modules.deciderules.IpAddressCidrSetDecideRule"> | |
* <property name="ipAddresseCidr"> | |
* <set> | |
* <value>127.0.0.1</value> | |
* <value>69.89.27.0/24</value> | |
* </set> | |
* </property> | |
* <property name='decision' value='REJECT' /> | |
* </bean> | |
* </pre> | |
* | |
* @author Travis Wellman <travis@archive.org> | |
* @author Kristinn Sigurðsson | |
*/ | |
public class IpAddressSetDecideRule extends PredicatedDecideRule { | |
private static final Logger logger = Logger.getLogger(IpAddressSetDecideRule.class.getName()); | |
private static final long serialVersionUID = -3670434739183271441L; | |
private Set<String> ipAddresses; | |
/** | |
* @return the addresses being matched | |
*/ | |
public Set<String> getIpAddresses() { | |
return Collections.unmodifiableSet(ipAddresses); | |
} | |
/** | |
* @param ipAddresses the addresses to match | |
*/ | |
public void setIpAddresses(Set<String> ipAddresses) { | |
this.ipAddresses = new HashSet<>(); | |
for (String address : ipAddresses) { | |
addIpAddress(address); | |
} | |
} | |
public void addIpAddress(String address) { | |
if (address.matches("[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}/[0-9]{1,2}")) { | |
// Address in CIDR notation | |
try { | |
for (String cidrAdd : (new SubnetUtils(address)).getInfo().getAllAddresses()) { | |
this.ipAddresses.add(cidrAdd); | |
} | |
} catch (Exception e) { | |
logger.severe("Invalid CIDR address specified: " + address); | |
} | |
} else { | |
this.ipAddresses.add(address); | |
} | |
} | |
@Override | |
protected boolean evaluate(CrawlURI curi) { | |
String hostAddress = getHostAddress(curi); | |
return hostAddress != null && ipAddresses.contains(hostAddress.intern()); | |
} | |
transient protected ServerCache serverCache; | |
public ServerCache getServerCache() { | |
return this.serverCache; | |
} | |
@Autowired | |
public void setServerCache(ServerCache serverCache) { | |
this.serverCache = serverCache; | |
} | |
/** | |
* from WriterPoolProcessor | |
* | |
* @param curi CrawlURI | |
* @return String of IP address or null if unable to determine IP address | |
*/ | |
protected String getHostAddress(CrawlURI curi) { | |
// special handling for DNS URIs: want address of DNS server | |
if (curi.getUURI().getScheme().toLowerCase().equals("dns")) { | |
return (String) curi.getData().get(A_DNS_SERVER_IP_LABEL); | |
} | |
// otherwise, host referenced in URI | |
// TODO:FIXME: have fetcher insert exact IP contacted into curi, | |
// use that rather than inferred by CrawlHost lookup | |
String addr = null; | |
try { | |
CrawlHost crlh = getServerCache().getHostFor(curi.getUURI()); | |
if (crlh == null) { | |
return null; | |
} | |
InetAddress inetadd = crlh.getIP(); | |
if (inetadd == null) { | |
return null; | |
} | |
addr = inetadd.getHostAddress(); | |
} catch (Exception e) { | |
// Log error and continue (return null) | |
logger.log(Level.WARNING, "Error looking up IP for URI " + curi.getURI(), e); | |
} | |
return addr; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment