@jnioche
Created March 21, 2022 13:47
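The snippet below configures a crawler4j CrawlController backed by URL Frontier: a shared BasicURLNormalizer, a polite PageFetcher, robots.txt handling, and a URLFrontierConfiguration pointing at a URL Frontier service on localhost.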
// NOTE: imports assume crawler4j-core and crawler-commons; the package of
// URLFrontierConfiguration (crawler4j's URL Frontier module) may differ by version.
import crawlercommons.filters.basic.BasicURLNormalizer;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.FrontierConfiguration;
import edu.uci.ics.crawler4j.frontier.URLFrontierConfiguration;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

protected CrawlController init() throws Exception {
    final CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("/tmp"); // folder for intermediate crawl data
    config.setPolitenessDelay(800); // delay between requests to the same host, in ms
    config.setMaxDepthOfCrawling(3);
    config.setIncludeBinaryContentInCrawling(false);
    config.setResumableCrawling(true); // allow the crawl to be resumed after a restart
    config.setHaltOnError(false);
    // URL normalizer shared by the fetcher and the frontier, with IDN normalization disabled
    final BasicURLNormalizer normalizer = BasicURLNormalizer.newBuilder()
            .idnNormalization(BasicURLNormalizer.IdnNormalization.NONE)
            .build();
    final PageFetcher pageFetcher = new PageFetcher(config, normalizer);
    final RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    // skip the robots.txt check when adding seeds; it is done later on demand
    robotstxtConfig.setSkipCheckForSeeds(true);
    final int maxQueues = 10;
    final int port = 7071; // URL Frontier's default gRPC port
    final FrontierConfiguration frontierConfiguration =
            new URLFrontierConfiguration(config, maxQueues, "localhost", port);
    final RobotstxtServer robotstxtServer =
            new RobotstxtServer(robotstxtConfig, pageFetcher, frontierConfiguration.getWebURLFactory());
    return new CrawlController(config, normalizer, pageFetcher, robotstxtServer, frontierConfiguration);
}
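
For context, a minimal usage sketch of the controller above. It assumes a URL Frontier service is already running on localhost:7071 and that MyCrawler is a hypothetical WebCrawler subclass; neither appears in the original snippet.

// Minimal usage sketch. Assumptions: MyCrawler extends WebCrawler (hypothetical),
// and a URL Frontier instance is reachable on localhost:7071.
public void crawl() throws Exception {
    final CrawlController controller = init();
    controller.addSeed("https://example.com/"); // robots check skipped for seeds (see config above)
    final int numberOfCrawlers = 4;
    // blocks until the crawl finishes; crawlers pull URLs from the shared frontier
    controller.start(MyCrawler.class, numberOfCrawlers);
}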