Created
March 21, 2022 13:47
-
-
Save jnioche/040983264ea8043a9efa7198bf509789 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
protected CrawlController init() throws Exception { | |
final CrawlConfig config = new CrawlConfig(); | |
config.setCrawlStorageFolder(“/tmp”); | |
config.setPolitenessDelay(800); | |
config.setMaxDepthOfCrawling(3); | |
config.setIncludeBinaryContentInCrawling(false); | |
config.setResumableCrawling(true); | |
config.setHaltOnError(false); | |
final BasicURLNormalizer normalizer = BasicURLNormalizer.newBuilder().idnNormalization(BasicURLNormalizer.IdnNormalization.NONE).build(); | |
final PageFetcher pageFetcher = new PageFetcher(config, normalizer); | |
final RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); | |
robotstxtConfig.setSkipCheckForSeeds(true); // we skip the robots checks for adding seeds (will be checked later on demand) | |
final int maxQueues = 10; | |
final int port = 10; | |
final FrontierConfiguration frontierConfiguration = new URLFrontierConfiguration(config, maxQueues, "localhost", port); | |
final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher, frontierConfiguration.getWebURLFactory()); | |
return new CrawlController(config, normalizer, pageFetcher, robotstxtServer, frontierConfiguration); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment