@sgonyea
Created August 2, 2010 18:18
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>ec2-1-2-3-4.us-west-1.compute.amazonaws.com:8021</value>
</property>
<!-- Enable Hue plugins
<property>
<name>mapred.jobtracker.plugins</name>
<value>org.apache.hadoop.thriftfs.ThriftJobTrackerPlugin</value>
<description>Comma-separated list of jobtracker plug-ins to be activated.
</description>
</property> -->
<property>
<name>mapred.local.dir</name>
<value>/hadoop/mapred/local</value>
<final>true</final>
</property>
<property>
<name>mapred.system.dir</name>
<value>/hadoop/mapred/system</value>
</property>
<property>
<name>mapred.compress.map.output</name>
<value>true</value>
</property>
<property>
<name>mapred.output.compress</name>
<value>true</value>
</property>
<property>
<name>mapred.child.java.opts</name>
<value>-Xmx2048m</value>
</property>
<property>
<name>mapred.job.tracker.handler.count</name>
<value>7</value>
</property>
<property>
<name>mapred.map.tasks.speculative.execution</name>
<value>false</value>
</property>
<property>
<name>mapred.reduce.tasks.speculative.execution</name>
<value>false</value>
</property>
<property>
<name>mapred.map.tasks</name>
<value>181</value>
</property>
<property>
<name>mapred.reduce.tasks</name>
<value>41</value>
</property>
<property>
<name>mapred.submit.replication</name>
<value>2</value>
</property>
<property>
<name>mapred.job.reuse.jvm.num.tasks</name>
<value>-1</value>
</property>
<property>
<name>mapred.tasktracker.map.tasks.maximum</name>
<value>3</value>
<final>true</final>
</property>
<property>
<name>mapred.tasktracker.reduce.tasks.maximum</name>
<value>1</value>
<final>true</final>
</property>
</configuration>
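<!-- For feedback purposes, here is how I'd sanity-check the same tuning per-job through
the old mapred API. A minimal sketch, assuming Hadoop 0.20-era classes (to match the
mapred.* names above); the class name is mine, not part of any real job. -->

import org.apache.hadoop.mapred.JobConf;

public class TuningCheck {
    public static void main(String[] args) {
        // JobConf pulls in mapred-default.xml and mapred-site.xml from the
        // classpath, so the cluster-wide values above apply here.
        JobConf conf = new JobConf(TuningCheck.class);

        // Per-job equivalents of the settings above.
        conf.setNumMapTasks(181);                 // mapred.map.tasks (a hint, not a hard cap)
        conf.setNumReduceTasks(41);               // mapred.reduce.tasks
        conf.setCompressMapOutput(true);          // mapred.compress.map.output
        conf.setMapSpeculativeExecution(false);   // mapred.map.tasks.speculative.execution
        conf.setReduceSpeculativeExecution(false);
        conf.setNumTasksToExecutePerJvm(-1);      // mapred.job.reuse.jvm.num.tasks: reuse JVMs indefinitely

        // Confirm what actually got loaded from the site file.
        System.out.println("job tracker = " + conf.get("mapred.job.tracker"));
        System.out.println("child opts  = " + conf.get("mapred.child.java.opts"));
    }
}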
<!-- These are snippets from my config files, relevant to performance, which I'd appreciate feedback on -->
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fetcher.threads.fetch</name>
<value>320</value>
<description>The number of FetcherThreads the fetcher should use.
This also determines the maximum number of requests that are
made at once (each FetcherThread handles one connection).</description>
</property>
<property>
<name>fetcher.max.exceptions.per.queue</name>
<value>-1</value>
<description>The maximum number of protocol-level exceptions (e.g. timeouts) per
host (or IP) queue. Once this value is reached, any remaining entries from this
queue are purged, effectively stopping the fetching from this host/IP. The default
value of -1 deactivates this limit.
</description>
</property>
<property>
<name>db.max.outlinks.per.page</name>
<value>256</value>
<description>The maximum number of outlinks that we'll process for a page.
If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
will be processed for a page; otherwise, all outlinks will be processed.
</description>
</property>
<property>
<name>search.response.default.numrows</name>
<value>50</value>
<description>
The default number of rows to return if none is specified.
</description>
</property>
<property>
<name>searcher.filter.cache.size</name>
<value>32</value>
<description>
Maximum number of filters to cache. Filters can accelerate certain
field-based queries, like language, document format, etc. Each
filter requires one bit of RAM per page. So, with a 10 million page
index, a cache size of 16 consumes two bytes per page, or 20MB.
</description>
</property>
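<!-- Worked check of the note above (my arithmetic, not from the Nutch docs):
each cached filter costs 1 bit of RAM per page, so with the cache size of 32
set here, a 10 million page index pays 32 bits = 4 bytes per page, roughly
40MB; the 16-filter example in the description works out to 2 bytes/page,
or 20MB, as stated. -->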
<property>
<name>http.timeout</name>
<value>15000</value>
<description>The default network timeout, in milliseconds.</description>
</property>
<!-- web db properties -->
<property>
<name>db.fetch.interval.default</name>
<value>604800</value>
<description>The default number of seconds between re-fetches of a page (604800 seconds = 7 days).
</description>
</property>
<property>
<name>db.fetch.interval.max</name>
<value>864000</value>
<description>The maximum number of seconds between re-fetches of a page
(864000 seconds = 10 days). After this period every page in the db will be
re-tried, regardless of its status.
</description>
</property>
<property>
<name>db.fetch.schedule.class</name>
<value>org.apache.nutch.crawl.DefaultFetchSchedule</value>
<description>The implementation of fetch schedule. DefaultFetchSchedule simply
adds the original fetchInterval to the last fetch time, regardless of
page changes.</description>
</property>
<property>
<name>db.ignore.external.links</name>
<value>true</value>
<description>If true, outlinks leading from a page to external hosts
will be ignored. This is an effective way to limit the crawl to include
only initially injected hosts, without creating complex URLFilters.
</description>
</property>
<property>
<name>fetcher.verbose</name>
<value>true</value>
<description>If true, fetcher will log more verbosely.</description>
</property>
<!-- urlpartitioner properties -->
<property>
<name>crawl.gen.delay</name>
<value>86400000</value>
<description>
This value, expressed in milliseconds, defines how long we should keep the lock on records
in CrawlDb that were just selected for fetching. If these records are not updated
in the meantime, the lock is canceled, i.e. they become eligible for selection again.
The default is 7 days (604800000 ms); the value above is 1 day (86400000 ms).
</description>
</property>
<property>
<name>fetcher.server.delay</name>
<value>2</value>
<description>The number of seconds the fetcher will delay between
successive requests to the same server (default: 5).</description>
</property>
<property>
<name>http.redirect.max</name>
<value>8</value>
<description>The maximum number of redirects the fetcher will follow when
trying to fetch a page. If set to negative or 0, fetcher won't immediately
follow redirected URLs; instead it will record them for later fetching.
</description>
</property>
<property>
<name>plugin.includes</name>
<value>protocol-http|urlfilter-regex|parse-(text|html)|index-(basic|anchor|urlmeta)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
<!-- <value>protocol-http|urlfilter-regex|parse-(tika)|index-(basic|anchor|urlmeta)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value> -->
<!-- <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value> -->
<description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded.
In any case you must at least include the nutch-extensionpoints plugin. By
default Nutch includes crawling just HTML and plain text via HTTP,
and basic indexing and search plugins. In order to use HTTPS please enable
protocol-httpclient, but be aware of possible intermittent problems with the
underlying commons-httpclient library. Nutch now also includes integration with Tika
to leverage Tika's parsing capabilities for multiple content types. The existing Nutch
parser implementations will likely be phased out in the next release or so; as such, it is
a good idea to begin migrating away from anything not provided by parse-tika.
</description>
</property>
<property>
<name>http.useHttp11</name>
<value>true</value>
<description>NOTE: at the moment this works only for protocol-httpclient.
If true, use HTTP/1.1; if false, use HTTP/1.0.
</description>
</property>
</configuration>
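<!-- A quick way to confirm what Nutch actually resolves from this file. A minimal
sketch, assuming Nutch's NutchConfiguration helper is on the classpath; the class
name and fallback defaults are mine (the fallbacks mirror Nutch's stock defaults,
which is where the "30 days"/"90 days" wording in the shipped descriptions comes from). -->

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class NutchConfCheck {
    public static void main(String[] args) {
        // Loads nutch-default.xml and then nutch-site.xml, so site overrides win.
        Configuration conf = NutchConfiguration.create();

        int threads = conf.getInt("fetcher.threads.fetch", 10);
        long fetchInterval = conf.getLong("db.fetch.interval.default", 2592000);
        long maxInterval = conf.getLong("db.fetch.interval.max", 7776000);

        System.out.println("fetcher threads   = " + threads);
        // Both intervals are in seconds: 604800 s = 7 days, 864000 s = 10 days.
        System.out.println("re-fetch interval = " + fetchInterval / 86400 + " days");
        System.out.println("max re-fetch gap  = " + maxInterval / 86400 + " days");
        System.out.println("ignore ext links  = " + conf.getBoolean("db.ignore.external.links", false));
    }
}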
<!-- This is on every server's own mapred-site hadoop config -->
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>ec2-1-2-3-4.us-west-1.compute.amazonaws.com:8021</value>
</property>
<!-- Enable Hue plugins
<property>
<name>mapred.jobtracker.plugins</name>
<value>org.apache.hadoop.thriftfs.ThriftJobTrackerPlugin</value>
<description>Comma-separated list of jobtracker plug-ins to be activated.
</description>
</property> -->
<property>
<name>mapred.local.dir</name>
<value>/hadoop/mapred/local</value>
<!-- <final>true</final> -->
</property>
<property>
<name>mapred.system.dir</name>
<value>/hadoop/mapred/system</value>
</property>
<property>
<name>mapred.compress.map.output</name>
<value>true</value>
</property>
<property>
<name>mapred.output.compress</name>
<value>true</value>
</property>
<property>
<name>mapred.child.java.opts</name>
<value>-Xmx2048m</value>
</property>
<property>
<name>mapred.job.tracker.handler.count</name>
<value>7</value>
</property>
<property>
<name>mapred.map.tasks.speculative.execution</name>
<value>true</value>
</property>
<property>
<name>mapred.map.tasks</name>
<value>3</value>
</property>
<property>
<name>mapred.reduce.tasks</name>
<value>1</value>
</property>
<property>
<name>mapred.submit.replication</name>
<value>2</value>
</property>
<property>
<name>mapred.job.reuse.jvm.num.tasks</name>
<value>-1</value>
</property>
<property>
<name>mapred.tasktracker.map.tasks.maximum</name>
<value>3</value>
</property>
<property>
<name>mapred.tasktracker.reduce.tasks.maximum</name>
<value>1</value>
</property>
</configuration>
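<!-- One thing I keep re-checking on the slaves: the slot counts plus the 2GB child
heap above imply a worst-case memory commitment per node. A back-of-the-envelope
sketch in plain Java; the numbers come from the properties above, except the daemon
allowance, which is my rough assumption rather than a measured value. -->

public class SlotMemoryEstimate {
    public static void main(String[] args) {
        int mapSlots = 3;        // mapred.tasktracker.map.tasks.maximum
        int reduceSlots = 1;     // mapred.tasktracker.reduce.tasks.maximum
        int childHeapMb = 2048;  // from mapred.child.java.opts (-Xmx2048m)
        int daemonMb = 1024;     // rough allowance for TaskTracker + DataNode (assumption)

        // (3 + 1) * 2048 + 1024 = 9216 MB: every slot filled means ~9 GB committed
        // before the OS page cache gets anything, so size instances accordingly.
        int worstCaseMb = (mapSlots + reduceSlots) * childHeapMb + daemonMb;
        System.out.println("worst-case commitment: " + worstCaseMb + " MB");
    }
}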