Created
August 2, 2010 18:18
-
-
Save sgonyea/505065 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0"?> | |
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
<configuration> | |
<property> | |
<name>mapred.job.tracker</name> | |
<value>ec2-1-2-3-4.us-west-1.compute.amazonaws.com:8021</value> | |
</property> | |
<!-- Enable Hue plugins | |
<property> | |
<name>mapred.jobtracker.plugins</name> | |
<value>org.apache.hadoop.thriftfs.ThriftJobTrackerPlugin</value> | |
<description>Comma-separated list of jobtracker plug-ins to be activated. | |
</description> | |
</property> --> | |
<property> | |
<name>mapred.local.dir</name> | |
<value>/hadoop/mapred/local</value> | |
<final>true</final> | |
</property> | |
<property> | |
<name>mapred.system.dir</name> | |
<value>/hadoop/mapred/system</value> | |
</property> | |
<property> | |
<name>mapred.compress.map.output</name> | |
<value>true</value> | |
</property> | |
<property> | |
<name>mapred.output.compress</name> | |
<value>true</value> | |
</property> | |
<property> | |
<name>mapred.child.java.opts</name> | |
<value>-Xmx2048m</value> | |
</property> | |
<property> | |
<name>mapred.job.tracker.handler.count</name> | |
<value>7</value> | |
</property> | |
<property> | |
<name>mapred.map.tasks.speculative.execution</name> | |
<value>false</value> | |
</property> | |
<property> | |
<name>mapred.reduce.tasks.speculative.execution</name> | |
<value>false</value> | |
</property> | |
<property> | |
<name>mapred.map.tasks</name> | |
<value>181</value> | |
</property> | |
<property> | |
<name>mapred.reduce.tasks</name> | |
<value>41</value> | |
</property> | |
<property> | |
<name>mapred.submit.replication</name> | |
<value>2</value> | |
</property> | |
<property> | |
<name>mapred.job.reuse.jvm.num.tasks</name> | |
<value>-1</value> | |
</property> | |
<property> | |
<name>mapred.tasktracker.map.tasks.maximum</name> | |
<value>3</value> | |
<final>true</final> | |
</property> | |
<property> | |
<name>mapred.tasktracker.reduce.tasks.maximum</name> | |
<value>1</value> | |
<final>true</final> | |
</property> | |
</configuration> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- These are snippets from my config files, relevant to performance, which I'd appreciate feedback on --> | |
<?xml version="1.0"?> | |
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
<!-- Put site-specific property overrides in this file. --> | |
<configuration> | |
<property> | |
<name>fetcher.threads.fetch</name> | |
<value>320</value> | |
<description>The number of FetcherThreads the fetcher should use. | |
This also determines the maximum number of requests that are | |
made at once (each FetcherThread handles one connection).</description> | |
</property> | |
<property> | |
<name>fetcher.max.exceptions.per.queue</name> | |
<value>-1</value> | |
<description>The maximum number of protocol-level exceptions (e.g. timeouts) per | |
host (or IP) queue. Once this value is reached, any remaining entries from this | |
queue are purged, effectively stopping the fetching from this host/IP. The default | |
value of -1 deactivates this limit. | |
</description> | |
</property> | |
<property> | |
<name>db.max.outlinks.per.page</name> | |
<value>256</value> | |
<description>The maximum number of outlinks that we'll process for a page. | |
If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks | |
will be processed for a page; otherwise, all outlinks will be processed. | |
</description> | |
</property> | |
<property> | |
<name>search.response.default.numrows</name> | |
<value>50</value> | |
<description> | |
The default number of rows to return if none is specified. | |
</description> | |
</property> | |
<property> | |
<name>searcher.filter.cache.size</name> | |
<value>32</value> | |
<description> | |
Maximum number of filters to cache. Filters can accelerate certain | |
field-based queries, like language, document format, etc. Each | |
filter requires one bit of RAM per page. So, with a 10 million page | |
index, a cache size of 16 consumes two bytes per page, or 20MB. | |
</description> | |
</property> | |
<property> | |
<name>http.timeout</name> | |
<value>15000</value> | |
<description>The default network timeout, in milliseconds.</description> | |
</property> | |
<!-- web db properties --> | |
<property> | |
<name>db.fetch.interval.default</name> | |
<value>604800</value> | |
<description>The default number of seconds between re-fetches of a page (604800 seconds = 7 days as configured here; the shipped Nutch default is 30 days). | |
</description> | |
</property> | |
<property> | |
<name>db.fetch.interval.max</name> | |
<value>864000</value> | |
<description>The maximum number of seconds between re-fetches of a page | |
(864000 seconds = 10 days as configured here; the shipped Nutch default is 90 days). After this period every page in the db will be re-tried, no | |
matter what its status is. | |
</description> | |
</property> | |
<property> | |
<name>db.fetch.schedule.class</name> | |
<value>org.apache.nutch.crawl.DefaultFetchSchedule</value> | |
<description>The implementation of fetch schedule. DefaultFetchSchedule simply | |
adds the original fetchInterval to the last fetch time, regardless of | |
page changes.</description> | |
</property> | |
<property> | |
<name>db.ignore.external.links</name> | |
<value>true</value> | |
<description>If true, outlinks leading from a page to external hosts | |
will be ignored. This is an effective way to limit the crawl to include | |
only initially injected hosts, without creating complex URLFilters. | |
</description> | |
</property> | |
<property> | |
<name>fetcher.verbose</name> | |
<value>true</value> | |
<description>If true, fetcher will log more verbosely.</description> | |
</property> | |
<!-- urlpartitioner properties --> | |
<property> | |
<name>crawl.gen.delay</name> | |
<value>86400000</value> | |
<description> | |
This value defines how long we should keep the lock on records | |
in CrawlDb that were just selected for fetching. If these records are not updated | |
in the meantime, the lock is canceled, i.e. they become eligible for selecting. | |
Default value of this is 7 days. NOTE: the stock text describes the unit both as days and as milliseconds; 86400000 is one day if read as milliseconds. Confirm the unit against the Generator source of your Nutch version. | |
</description> | |
</property> | |
<property> | |
<name>fetcher.server.delay</name> | |
<value>2</value> | |
<description>The number of seconds the fetcher will delay between | |
successive requests to the same server (default 5).</description> | |
</property> | |
<property> | |
<name>http.redirect.max</name> | |
<value>8</value> | |
<description>The maximum number of redirects the fetcher will follow when | |
trying to fetch a page. If set to negative or 0, fetcher won't immediately | |
follow redirected URLs, instead it will record them for later fetching. | |
</description> | |
</property> | |
<property> | |
<name>plugin.includes</name> | |
<value>protocol-http|urlfilter-regex|parse-(text|html)|index-(basic|anchor|urlmeta)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value> | |
<!-- <value>protocol-http|urlfilter-regex|parse-(tika)|index-(basic|anchor|urlmeta)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value> --> | |
<!-- <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value> --> | |
<description>Regular expression naming plugin directory names to | |
include. Any plugin not matching this expression is excluded. | |
In any case you need to at least include the nutch-extensionpoints plugin. By | |
default Nutch includes crawling just HTML and plain text via HTTP, | |
and basic indexing and search plugins. In order to use HTTPS please enable | |
protocol-httpclient, but be aware of possible intermittent problems with the | |
underlying commons-httpclient library. Nutch now also includes integration with Tika | |
to leverage Tika's parsing capabilities for multiple content types. The existing Nutch | |
parser implementations will likely be phased out in the next release or so, as such, it is | |
a good idea to begin migrating away from anything not provided by parse-tika. | |
</description> | |
</property> | |
<property> | |
<name>http.useHttp11</name> | |
<value>true</value> | |
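<description>NOTE: at the moment this works only for protocol-httpclient. | |
If true, use HTTP 1.1; if false, use HTTP 1.0. (The plugin.includes value in this file selects protocol-http, | |
not protocol-httpclient, so this setting presumably has no effect here; verify.) | |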
</description> | |
</property> | |
</configuration> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- This is on every server's own mapred-site hadoop config --> | |
<?xml version="1.0"?> | |
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
<configuration> | |
<property> | |
<name>mapred.job.tracker</name> | |
<value>ec2-1-2-3-4.us-west-1.compute.amazonaws.com:8021</value> | |
</property> | |
<!-- Enable Hue plugins | |
<property> | |
<name>mapred.jobtracker.plugins</name> | |
<value>org.apache.hadoop.thriftfs.ThriftJobTrackerPlugin</value> | |
<description>Comma-separated list of jobtracker plug-ins to be activated. | |
</description> | |
</property> --> | |
<property> | |
<name>mapred.local.dir</name> | |
<value>/hadoop/mapred/local</value> | |
<!-- <final>true</final> --> | |
</property> | |
<property> | |
<name>mapred.system.dir</name> | |
<value>/hadoop/mapred/system</value> | |
</property> | |
<property> | |
<name>mapred.compress.map.output</name> | |
<value>true</value> | |
</property> | |
<property> | |
<name>mapred.output.compress</name> | |
<value>true</value> | |
</property> | |
<property> | |
<name>mapred.child.java.opts</name> | |
<value>-Xmx2048m</value> | |
</property> | |
<property> | |
<name>mapred.job.tracker.handler.count</name> | |
<value>7</value> | |
</property> | |
<property> | |
<name>mapred.map.tasks.speculative.execution</name> | |
<value>true</value> | |
</property> | |
<property> | |
<name>mapred.map.tasks</name> | |
<value>3</value> | |
</property> | |
<property> | |
<name>mapred.reduce.tasks</name> | |
<value>1</value> | |
</property> | |
<property> | |
<name>mapred.submit.replication</name> | |
<value>2</value> | |
</property> | |
<property> | |
<name>mapred.job.reuse.jvm.num.tasks</name> | |
<value>-1</value> | |
</property> | |
<property> | |
<name>mapred.tasktracker.map.tasks.maximum</name> | |
<value>3</value> | |
</property> | |
<property> | |
<name>mapred.tasktracker.reduce.tasks.maximum</name> | |
<value>1</value> | |
</property> | |
</configuration> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment