Skip to content

Instantly share code, notes, and snippets.

@martinsson
Created October 18, 2011 20:47
Show Gist options
  • Save martinsson/1296674 to your computer and use it in GitHub Desktop.
Save martinsson/1296674 to your computer and use it in GitHub Desktop.
Simple != Simple to use
public class BatchCrawler {
public BatchCrawler(Job job) {
this.job = job;
this.timeout = ConfigurationService.getInt(SCAN_TIMEOUT);
httpCrawler = new RetryingHttpCrawler();
}
/**
 * Visits every URL of the job and records the content of each fetched page.
 *
 * <p>Each URL from {@code job.getUrls()} is fetched through the crawler with
 * the configured timeout, and the URL is added to the result together with
 * the content of the page that was returned.
 *
 * @return the result accumulated over all URLs of the job
 */
public CrawlResult crawl() {
    CrawlResult crawlResult = new CrawlResult();
    for (Url url : job.getUrls()) {
        Page page = httpCrawler.visit(url, timeout);
        // Add to the local result that is returned below. The previous code
        // added to an undeclared `result`, so the returned object stayed empty
        // (or the method did not compile at all).
        crawlResult.add(url, page.content());
    }
    return crawlResult;
}
/**
 * Creates a batch crawler with every collaborator supplied by the caller.
 *
 * <p>Unlike the single-argument constructor, nothing is read from
 * {@code ConfigurationService} here, so callers (tests in particular) decide
 * the timeout and the crawler implementation themselves.
 *
 * @param job     the job whose URLs will be crawled
 * @param timeout the timeout handed to the crawler on every visit
 * @param crawler the crawler used to fetch each URL
 */
public BatchCrawler(Job job, int timeout, Crawler crawler) {
    this.job = job;
    this.timeout = timeout;
    // Store into the same field the other constructor initialises and crawl()
    // reads; assigning a differently named `crawler` field would leave
    // `httpCrawler` unset for instances created through this constructor.
    // NOTE(review): field declarations are not visible here; confirm that
    // `httpCrawler` is declared with a type assignable from Crawler.
    this.httpCrawler = crawler;
}
public static retryingBatchCrawlerWithSystemConfig(Job job) {
int timeout = ConfigurationService.getInt(SCAN_TIMEOUT);
int[] retryintervals = ConfigurationService.getIntArray(RETRY_INTERVALS);
Crawler retryingHttpCrawler = new RetryingHttpCrawler(retryIntervals);
return new BatchCrawler(job, timeout, retryingHttpCrawler)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment