Last active
August 29, 2015 14:09
-
-
Save sheimi/5043bf0efb49ee8d326b to your computer and use it in GitHub Desktop.
code in blog.sheimi.me: 2012-05-13-hadoop-source-code-01 (1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Code snippet from Nutch: inject rootUrlDir into crawlDb, then run up to
// `depth` rounds of generate -> fetch -> (parse) -> update.
// Every tool below is constructed from the same getConf() configuration.
Injector injector = new Injector(getConf());
Generator generator = new Generator(getConf());
Fetcher fetcher = new Fetcher(getConf());
ParseSegment parseSegment = new ParseSegment(getConf());
CrawlDb crawlDbTool = new CrawlDb(getConf());
// NOTE(review): linkDbTool is not used anywhere in this excerpt; presumably
// it is needed by code that follows the loop - confirm before removing.
LinkDb linkDbTool = new LinkDb(getConf());

injector.inject(crawlDb, rootUrlDir);

// Declared outside the loop so the number of completed rounds is still
// readable once the loop has ended (normally or through the break).
int i;
for (i = 0; i < depth; i++) {
  // Generate new segment(s) for this round, stamped with the current time.
  // NOTE(review): the -1 argument presumably means "let the generator pick
  // the number of fetch lists" - confirm against Generator.generate.
  long generateTime = System.currentTimeMillis();
  Path[] segs = generator.generate(crawlDb, segments, -1, topN, generateTime);
  if (segs == null) {
    // A null result is treated as "nothing left to fetch": stop early.
    LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
    break;
  }

  // NOTE(review): only the first generated segment is fetched and parsed,
  // while the whole array is handed to update() below - fine if generate()
  // yields a single segment for these arguments; confirm.
  Path segment = segs[0];
  fetcher.fetch(segment, threads); // fetch it
  if (!Fetcher.isParsing(job)) {
    // Parse as a separate step only when Fetcher.isParsing(job) is false.
    parseSegment.parse(segment); // parse it, if needed
  }

  // NOTE(review): the two `true` flags presumably enable URL normalizing
  // and filtering during the update - confirm against CrawlDb.update.
  crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment