Skip to content

Instantly share code, notes, and snippets.

@jorgelbg
Created May 26, 2016 01:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jorgelbg/689b1d66d116fa55a1ee14d7193d71b4 to your computer and use it in GitHub Desktop.
Save jorgelbg/689b1d66d116fa55a1ee14d7193d71b4 to your computer and use it in GitHub Desktop.
Default configuration for a Nutch REST job with solr.server.url set
{
"anchorIndexingFilter.deduplicate": "false",
"cosine.goldstandard.file": "goldstandard.txt",
"crawl.gen.delay": "604800000",
"db.fetch.interval.default": "2592000",
"db.fetch.interval.max": "7776000",
"db.fetch.retry.max": "3",
"db.fetch.schedule.adaptive.dec_rate": "0.2",
"db.fetch.schedule.adaptive.inc_rate": "0.4",
"db.fetch.schedule.adaptive.max_interval": "31536000.0",
"db.fetch.schedule.adaptive.min_interval": "60.0",
"db.fetch.schedule.adaptive.sync_delta": "true",
"db.fetch.schedule.adaptive.sync_delta_rate": "0.3",
"db.fetch.schedule.class": "org.apache.nutch.crawl.DefaultFetchSchedule",
"db.fetch.schedule.mime.file": "adaptive-mimetypes.txt",
"db.ignore.external.exemptions.file": "db-ignore-external-exemptions.txt",
"db.ignore.external.links": "false",
"db.ignore.external.links.mode": "byHost",
"db.ignore.internal.links": "false",
"db.injector.overwrite": "false",
"db.injector.update": "false",
"db.max.anchor.length": "100",
"db.max.outlinks.per.page": "100",
"db.preserve.backup": "true",
"db.score.count.filtered": "false",
"db.score.injected": "1.0",
"db.score.link.external": "1.0",
"db.score.link.internal": "1.0",
"db.signature.class": "org.apache.nutch.crawl.MD5Signature",
"db.signature.text_profile.min_token_len": "2",
"db.signature.text_profile.quant_rate": "0.01",
"db.update.additions.allowed": "true",
"db.update.max.inlinks": "10000",
"db.update.purge.404": "false",
"db.url.filters": "false",
"db.url.normalizers": "false",
"dfs.ha.fencing.ssh.connect-timeout": "30000",
"elastic.index": "nutch",
"elastic.max.bulk.docs": "250",
"elastic.max.bulk.size": "2500500",
"elastic.port": "9300",
"encodingdetector.charset.min.confidence": "-1",
"fetcher.bandwidth.target": "-1",
"fetcher.bandwidth.target.check.everyNSecs": "30",
"fetcher.follow.outlinks.depth": "-1",
"fetcher.follow.outlinks.depth.divisor": "2",
"fetcher.follow.outlinks.ignore.external": "true",
"fetcher.follow.outlinks.num.links": "4",
"fetcher.max.crawl.delay": "30",
"fetcher.max.exceptions.per.queue": "-1",
"fetcher.maxNum.threads": "25",
"fetcher.parse": "false",
"fetcher.queue.depth.multiplier": "50",
"fetcher.queue.mode": "byHost",
"fetcher.server.delay": "5.0",
"fetcher.server.min.delay": "0.0",
"fetcher.store.content": "true",
"fetcher.threads.fetch": "10",
"fetcher.threads.per.queue": "1",
"fetcher.threads.timeout.divisor": "2",
"fetcher.throughput.threshold.check.after": "5",
"fetcher.throughput.threshold.pages": "-1",
"fetcher.throughput.threshold.retries": "5",
"fetcher.timelimit.mins": "-1",
"fetcher.verbose": "false",
"file.blocksize": "67108864",
"file.bytes-per-checksum": "512",
"file.client-write-packet-size": "65536",
"file.content.ignored": "true",
"file.content.limit": "65536",
"file.crawl.parent": "true",
"file.crawl.redirect_noncanonical": "true",
"file.replication": "1",
"file.stream-buffer-size": "4096",
"fs.AbstractFileSystem.file.impl": "org.apache.hadoop.fs.local.LocalFs",
"fs.AbstractFileSystem.hdfs.impl": "org.apache.hadoop.fs.Hdfs",
"fs.AbstractFileSystem.viewfs.impl": "org.apache.hadoop.fs.viewfs.ViewFs",
"fs.automatic.close": "true",
"fs.client.resolve.remote.symlinks": "true",
"fs.defaultFS": "file:///",
"fs.df.interval": "60000",
"fs.du.interval": "600000",
"fs.ftp.host": "0.0.0.0",
"fs.ftp.host.port": "21",
"fs.permissions.umask-mode": "022",
"fs.s3.block.size": "67108864",
"fs.s3.buffer.dir": "${hadoop.tmp.dir}/s3",
"fs.s3.maxRetries": "4",
"fs.s3.sleepTimeSeconds": "10",
"fs.s3n.block.size": "67108864",
"fs.s3n.multipart.copy.block.size": "5368709120",
"fs.s3n.multipart.uploads.block.size": "67108864",
"fs.s3n.multipart.uploads.enabled": "false",
"fs.swift.impl": "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem",
"fs.trash.checkpoint.interval": "0",
"fs.trash.interval": "0",
"ftp.blocksize": "67108864",
"ftp.bytes-per-checksum": "512",
"ftp.client-write-packet-size": "65536",
"ftp.content.limit": "65536",
"ftp.follow.talk": "false",
"ftp.keep.connection": "false",
"ftp.password": "anonymous@example.com",
"ftp.replication": "3",
"ftp.server.timeout": "100000",
"ftp.stream-buffer-size": "4096",
"ftp.timeout": "60000",
"ftp.username": "anonymous",
"generate.count.mode": "host",
"generate.max.count": "-1",
"generate.min.interval": "-1",
"generate.min.score": "0",
"generate.update.crawldb": "false",
"ha.failover-controller.cli-check.rpc-timeout.ms": "20000",
"ha.failover-controller.graceful-fence.connection.retries": "1",
"ha.failover-controller.graceful-fence.rpc-timeout.ms": "5000",
"ha.failover-controller.new-active.rpc-timeout.ms": "60000",
"ha.health-monitor.check-interval.ms": "1000",
"ha.health-monitor.connect-retry-interval.ms": "1000",
"ha.health-monitor.rpc-timeout.ms": "45000",
"ha.health-monitor.sleep-after-disconnect.ms": "1000",
"ha.zookeeper.acl": "world:anyone:rwcda",
"ha.zookeeper.parent-znode": "/hadoop-ha",
"ha.zookeeper.session-timeout.ms": "5000",
"hadoop.common.configuration.version": "0.23.0",
"hadoop.http.authentication.kerberos.keytab": "${user.home}/hadoop.keytab",
"hadoop.http.authentication.kerberos.principal": "HTTP/_HOST@LOCALHOST",
"hadoop.http.authentication.signature.secret.file": "${user.home}/hadoop-http-auth-signature-secret",
"hadoop.http.authentication.simple.anonymous.allowed": "true",
"hadoop.http.authentication.token.validity": "36000",
"hadoop.http.authentication.type": "simple",
"hadoop.http.filter.initializers": "org.apache.hadoop.http.lib.StaticUserWebFilter",
"hadoop.http.staticuser.user": "dr.who",
"hadoop.jetty.logs.serve.aliases": "true",
"hadoop.job.history.user.location": "${hadoop.log.dir}/history/user",
"hadoop.kerberos.kinit.command": "kinit",
"hadoop.rpc.protection": "authentication",
"hadoop.rpc.socket.factory.class.default": "org.apache.hadoop.net.StandardSocketFactory",
"hadoop.security.authentication": "simple",
"hadoop.security.authorization": "false",
"hadoop.security.group.mapping": "org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback",
"hadoop.security.group.mapping.ldap.directory.search.timeout": "10000",
"hadoop.security.group.mapping.ldap.search.attr.group.name": "cn",
"hadoop.security.group.mapping.ldap.search.attr.member": "member",
"hadoop.security.group.mapping.ldap.search.filter.group": "(objectClass=group)",
"hadoop.security.group.mapping.ldap.search.filter.user": "(&(objectClass=user)(sAMAccountName={0}))",
"hadoop.security.group.mapping.ldap.ssl": "false",
"hadoop.security.groups.cache.secs": "300",
"hadoop.security.groups.cache.warn.after.ms": "5000",
"hadoop.security.instrumentation.requires.admin": "false",
"hadoop.security.uid.cache.secs": "14400",
"hadoop.ssl.client.conf": "ssl-client.xml",
"hadoop.ssl.enabled": "false",
"hadoop.ssl.hostname.verifier": "DEFAULT",
"hadoop.ssl.keystores.factory.class": "org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory",
"hadoop.ssl.require.client.cert": "false",
"hadoop.ssl.server.conf": "ssl-server.xml",
"hadoop.tmp.dir": "/tmp/hadoop-${user.name}",
"hadoop.user.group.static.mapping.overrides": "dr.who=;",
"hadoop.util.hash.type": "murmur",
"hadoop.work.around.non.threadsafe.getpwuid": "false",
"headings": "h1,h2",
"headings.multivalued": "false",
"hostdb.check.failed": "true",
"hostdb.check.known": "true",
"hostdb.check.new": "true",
"hostdb.force.check": "false",
"hostdb.num.resolvers.threads": "25",
"hostdb.numeric.fields": "_rs_",
"hostdb.percentiles": "50,75,95,99",
"hostdb.purge.failed.hosts.threshold": "3",
"hostdb.recheck.interval": "86400000",
"hostdb.string.fields": "Content-Type",
"hostdb.url.filter": "false",
"hostdb.url.normalize": "false",
"http.accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"http.accept.language": "en-us,en-gb,en;q=0.7,*;q=0.3",
"http.agent.name": "Orion",
"http.agent.rotate": "false",
"http.agent.rotate.file": "agents.txt",
"http.agent.version": "Nutch-1.12-SNAPSHOT",
"http.auth.file": "httpclient-auth.xml",
"http.content.limit": "65536",
"http.enable.if.modified.since.header": "true",
"http.max.delays": "100",
"http.redirect.max": "0",
"http.robots.403.allow": "true",
"http.store.responsetime": "true",
"http.timeout": "10000",
"http.useHttp11": "false",
"http.verbose": "false",
"index.geoip.usage": "insightsService",
"index.links.hosts.only": "false",
"index.links.inlinks.host.ignore": "false",
"index.links.outlinks.host.ignore": "false",
"index.parse.md": "metatag.description,metatag.keywords",
"index.static.fieldsep": ",",
"index.static.keysep": ":",
"index.static.valuesep": " ",
"indexer.add.domain": "false",
"indexer.delete.robots.noindex": "false",
"indexer.delete.skipped.by.indexingfilter": "false",
"indexer.max.content.length": "-1",
"indexer.max.title.length": "100",
"indexer.score.power": "0.5",
"indexer.skip.notmodified": "false",
"interactiveselenium.handlers": "DefaultHandler",
"io.bytes.per.checksum": "512",
"io.compression.codec.bzip2.library": "system-native",
"io.file.buffer.size": "4096",
"io.map.index.interval": "128",
"io.map.index.skip": "0",
"io.mapfile.bloom.error.rate": "0.005",
"io.mapfile.bloom.size": "1048576",
"io.native.lib.available": "true",
"io.seqfile.compress.blocksize": "1000000",
"io.seqfile.lazydecompress": "true",
"io.seqfile.local.dir": "${hadoop.tmp.dir}/io/local",
"io.seqfile.sorter.recordlimit": "1000000",
"io.serializations": "org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization",
"io.skip.checksum.errors": "false",
"ipc.client.connect.max.retries": "10",
"ipc.client.connect.max.retries.on.timeouts": "45",
"ipc.client.connect.retry.interval": "1000",
"ipc.client.connect.timeout": "20000",
"ipc.client.connection.maxidletime": "10000",
"ipc.client.fallback-to-simple-auth-allowed": "false",
"ipc.client.idlethreshold": "4000",
"ipc.client.kill.max": "10",
"ipc.client.tcpnodelay": "false",
"ipc.server.listen.queue.size": "128",
"ipc.server.tcpnodelay": "false",
"lang.analyze.max.length": "2048",
"lang.extraction.policy": "detect,identify",
"lang.identification.only.certain": "false",
"libselenium.page.load.delay": "3",
"link.analyze.damping.factor": "0.85f",
"link.analyze.initial.score": "1.0f",
"link.analyze.num.iterations": "10",
"link.delete.gone": "false",
"link.ignore.internal.domain": "true",
"link.ignore.internal.host": "true",
"link.ignore.limit.domain": "true",
"link.ignore.limit.page": "true",
"link.loops.depth": "2",
"link.score.updater.clear.score": "0.0f",
"linkdb.ignore.external.links": "false",
"linkdb.ignore.internal.links": "true",
"linkdb.max.inlinks": "10000",
"map.sort.class": "org.apache.hadoop.util.QuickSort",
"mapred.child.java.opts": "-Xmx200m",
"mapreduce.am.max-attempts": "2",
"mapreduce.app-submission.cross-platform": "false",
"mapreduce.client.completion.pollinterval": "5000",
"mapreduce.client.output.filter": "FAILED",
"mapreduce.client.progressmonitor.pollinterval": "1000",
"mapreduce.client.submit.file.replication": "10",
"mapreduce.cluster.acls.enabled": "false",
"mapreduce.cluster.local.dir": "${hadoop.tmp.dir}/mapred/local",
"mapreduce.cluster.temp.dir": "${hadoop.tmp.dir}/mapred/temp",
"mapreduce.fileoutputcommitter.marksuccessfuljobs": "false",
"mapreduce.framework.name": "local",
"mapreduce.ifile.readahead": "true",
"mapreduce.ifile.readahead.bytes": "4194304",
"mapreduce.input.fileinputformat.list-status.num-threads": "1",
"mapreduce.input.fileinputformat.split.minsize": "0",
"mapreduce.job.acl-modify-job": " ",
"mapreduce.job.acl-view-job": " ",
"mapreduce.job.classloader": "false",
"mapreduce.job.classloader.system.classes": "java.,javax.,org.apache.commons.logging.,org.apache.log4j.,org.apache.hadoop.",
"mapreduce.job.committer.setup.cleanup.needed": "true",
"mapreduce.job.complete.cancel.delegation.tokens": "true",
"mapreduce.job.counters.max": "120",
"mapreduce.job.end-notification.max.attempts": "5",
"mapreduce.job.end-notification.max.retry.interval": "5000",
"mapreduce.job.end-notification.retry.attempts": "0",
"mapreduce.job.end-notification.retry.interval": "1000",
"mapreduce.job.hdfs-servers": "${fs.defaultFS}",
"mapreduce.job.jvm.numtasks": "1",
"mapreduce.job.map.output.collector.class": "org.apache.hadoop.mapred.MapTask$MapOutputBuffer",
"mapreduce.job.maps": "2",
"mapreduce.job.max.split.locations": "10",
"mapreduce.job.maxtaskfailures.per.tracker": "3",
"mapreduce.job.queuename": "default",
"mapreduce.job.reduce.shuffle.consumer.plugin.class": "org.apache.hadoop.mapreduce.task.reduce.Shuffle",
"mapreduce.job.reduce.slowstart.completedmaps": "0.05",
"mapreduce.job.reduces": "1",
"mapreduce.job.speculative.slownodethreshold": "1.0",
"mapreduce.job.speculative.slowtaskthreshold": "1.0",
"mapreduce.job.speculative.speculativecap": "0.1",
"mapreduce.job.split.metainfo.maxsize": "10000000",
"mapreduce.job.token.tracking.ids.enabled": "false",
"mapreduce.job.ubertask.enable": "false",
"mapreduce.job.ubertask.maxmaps": "9",
"mapreduce.job.ubertask.maxreduces": "1",
"mapreduce.job.userlog.retain.hours": "24",
"mapreduce.jobhistory.address": "0.0.0.0:10020",
"mapreduce.jobhistory.admin.acl": "*",
"mapreduce.jobhistory.admin.address": "0.0.0.0:10033",
"mapreduce.jobhistory.cleaner.enable": "true",
"mapreduce.jobhistory.cleaner.interval-ms": "86400000",
"mapreduce.jobhistory.client.thread-count": "10",
"mapreduce.jobhistory.datestring.cache.size": "200000",
"mapreduce.jobhistory.done-dir": "${yarn.app.mapreduce.am.staging-dir}/history/done",
"mapreduce.jobhistory.http.policy": "HTTP_ONLY",
"mapreduce.jobhistory.intermediate-done-dir": "${yarn.app.mapreduce.am.staging-dir}/history/done_intermediate",
"mapreduce.jobhistory.joblist.cache.size": "20000",
"mapreduce.jobhistory.keytab": "/etc/security/keytab/jhs.service.keytab",
"mapreduce.jobhistory.loadedjobs.cache.size": "5",
"mapreduce.jobhistory.max-age-ms": "604800000",
"mapreduce.jobhistory.minicluster.fixed.ports": "false",
"mapreduce.jobhistory.move.interval-ms": "180000",
"mapreduce.jobhistory.move.thread-count": "3",
"mapreduce.jobhistory.principal": "jhs/_HOST@REALM.TLD",
"mapreduce.jobhistory.recovery.enable": "false",
"mapreduce.jobhistory.recovery.store.class": "org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService",
"mapreduce.jobhistory.recovery.store.fs.uri": "${hadoop.tmp.dir}/mapred/history/recoverystore",
"mapreduce.jobhistory.webapp.address": "0.0.0.0:19888",
"mapreduce.jobtracker.address": "local",
"mapreduce.jobtracker.expire.trackers.interval": "600000",
"mapreduce.jobtracker.handler.count": "10",
"mapreduce.jobtracker.heartbeats.in.second": "100",
"mapreduce.jobtracker.http.address": "0.0.0.0:50030",
"mapreduce.jobtracker.instrumentation": "org.apache.hadoop.mapred.JobTrackerMetricsInst",
"mapreduce.jobtracker.jobhistory.block.size": "3145728",
"mapreduce.jobtracker.jobhistory.lru.cache.size": "5",
"mapreduce.jobtracker.jobhistory.task.numberprogresssplits": "12",
"mapreduce.jobtracker.maxtasks.perjob": "-1",
"mapreduce.jobtracker.persist.jobstatus.active": "true",
"mapreduce.jobtracker.persist.jobstatus.dir": "/jobtracker/jobsInfo",
"mapreduce.jobtracker.persist.jobstatus.hours": "1",
"mapreduce.jobtracker.restart.recover": "false",
"mapreduce.jobtracker.retiredjobs.cache.size": "1000",
"mapreduce.jobtracker.staging.root.dir": "${hadoop.tmp.dir}/mapred/staging",
"mapreduce.jobtracker.system.dir": "${hadoop.tmp.dir}/mapred/system",
"mapreduce.jobtracker.taskcache.levels": "2",
"mapreduce.jobtracker.taskscheduler": "org.apache.hadoop.mapred.JobQueueTaskScheduler",
"mapreduce.jobtracker.tasktracker.maxblacklists": "4",
"mapreduce.local.clientfactory.class.name": "org.apache.hadoop.mapred.LocalClientFactory",
"mapreduce.map.cpu.vcores": "1",
"mapreduce.map.log.level": "INFO",
"mapreduce.map.maxattempts": "4",
"mapreduce.map.output.compress": "false",
"mapreduce.map.output.compress.codec": "org.apache.hadoop.io.compress.DefaultCodec",
"mapreduce.map.skip.maxrecords": "0",
"mapreduce.map.skip.proc.count.autoincr": "true",
"mapreduce.map.sort.spill.percent": "0.80",
"mapreduce.map.speculative": "true",
"mapreduce.output.fileoutputformat.compress": "false",
"mapreduce.output.fileoutputformat.compress.codec": "org.apache.hadoop.io.compress.DefaultCodec",
"mapreduce.output.fileoutputformat.compress.type": "RECORD",
"mapreduce.reduce.cpu.vcores": "1",
"mapreduce.reduce.input.buffer.percent": "0.0",
"mapreduce.reduce.log.level": "INFO",
"mapreduce.reduce.markreset.buffer.percent": "0.0",
"mapreduce.reduce.maxattempts": "4",
"mapreduce.reduce.merge.inmem.threshold": "1000",
"mapreduce.reduce.shuffle.connect.timeout": "180000",
"mapreduce.reduce.shuffle.input.buffer.percent": "0.70",
"mapreduce.reduce.shuffle.memory.limit.percent": "0.25",
"mapreduce.reduce.shuffle.merge.percent": "0.66",
"mapreduce.reduce.shuffle.parallelcopies": "5",
"mapreduce.reduce.shuffle.read.timeout": "180000",
"mapreduce.reduce.shuffle.retry-delay.max.ms": "60000",
"mapreduce.reduce.skip.maxgroups": "0",
"mapreduce.reduce.skip.proc.count.autoincr": "true",
"mapreduce.reduce.speculative": "true",
"mapreduce.shuffle.connection-keep-alive.enable": "false",
"mapreduce.shuffle.connection-keep-alive.timeout": "5",
"mapreduce.shuffle.max.connections": "0",
"mapreduce.shuffle.max.threads": "0",
"mapreduce.shuffle.port": "13562",
"mapreduce.shuffle.ssl.enabled": "false",
"mapreduce.shuffle.ssl.file.buffer.size": "65536",
"mapreduce.shuffle.transfer.buffer.size": "131072",
"mapreduce.task.files.preserve.failedtasks": "false",
"mapreduce.task.io.sort.factor": "10",
"mapreduce.task.io.sort.mb": "100",
"mapreduce.task.merge.progress.records": "10000",
"mapreduce.task.profile": "false",
"mapreduce.task.profile.map.params": "${mapreduce.task.profile.params}",
"mapreduce.task.profile.maps": "0-2",
"mapreduce.task.profile.reduce.params": "${mapreduce.task.profile.params}",
"mapreduce.task.profile.reduces": "0-2",
"mapreduce.task.skip.start.attempts": "2",
"mapreduce.task.timeout": "600000",
"mapreduce.task.tmp.dir": "./tmp",
"mapreduce.task.userlog.limit.kb": "0",
"mapreduce.tasktracker.dns.interface": "default",
"mapreduce.tasktracker.dns.nameserver": "default",
"mapreduce.tasktracker.healthchecker.interval": "60000",
"mapreduce.tasktracker.healthchecker.script.timeout": "600000",
"mapreduce.tasktracker.http.address": "0.0.0.0:50060",
"mapreduce.tasktracker.http.threads": "40",
"mapreduce.tasktracker.indexcache.mb": "10",
"mapreduce.tasktracker.instrumentation": "org.apache.hadoop.mapred.TaskTrackerMetricsInst",
"mapreduce.tasktracker.local.dir.minspacekill": "0",
"mapreduce.tasktracker.local.dir.minspacestart": "0",
"mapreduce.tasktracker.map.tasks.maximum": "2",
"mapreduce.tasktracker.outofband.heartbeat": "false",
"mapreduce.tasktracker.reduce.tasks.maximum": "2",
"mapreduce.tasktracker.report.address": "127.0.0.1:0",
"mapreduce.tasktracker.taskcontroller": "org.apache.hadoop.mapred.DefaultTaskController",
"mapreduce.tasktracker.taskmemorymanager.monitoringinterval": "5000",
"mapreduce.tasktracker.tasks.sleeptimebeforesigkill": "5000",
"metatags.names": "description,keywords",
"mime.type.magic": "true",
"mimetype.filter.file": "mimetype-filter.txt",
"moreIndexingFilter.indexMimeTypeParts": "true",
"moreIndexingFilter.mapMimeTypes": "false",
"net.topology.impl": "org.apache.hadoop.net.NetworkTopology",
"net.topology.node.switch.mapping.impl": "org.apache.hadoop.net.ScriptBasedMapping",
"net.topology.script.number.args": "100",
"nfs3.mountd.port": "4242",
"nfs3.server.port": "2049",
"nutch.conf.uuid": "e2ce3f49-7b1d-4c13-aac1-e07478f02673",
"parse.filter.urls": "true",
"parse.normalize.urls": "true",
"parse.plugin.file": "parse-plugins.xml",
"parsefilter.naivebayes.trainfile": "naivebayes-train.txt",
"parsefilter.naivebayes.wordlist": "naivebayes-wordlist.txt",
"parser.caching.forbidden.policy": "content",
"parser.character.encoding.default": "windows-1252",
"parser.html.form.use_action": "false",
"parser.html.impl": "neko",
"parser.skip.truncated": "true",
"parser.timeout": "30",
"partition.url.mode": "byHost",
"plugin.auto-activation": "true",
"plugin.folders": "plugins",
"plugin.includes": "protocol-httpclient|urlfilter-regex|parse-(tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)",
"rpc.metrics.quantile.enable": "false",
"s3.blocksize": "67108864",
"s3.bytes-per-checksum": "512",
"s3.client-write-packet-size": "65536",
"s3.replication": "3",
"s3.stream-buffer-size": "4096",
"s3native.blocksize": "67108864",
"s3native.bytes-per-checksum": "512",
"s3native.client-write-packet-size": "65536",
"s3native.replication": "3",
"s3native.stream-buffer-size": "4096",
"scoring.depth.max": "1000",
"scoring.similarity.model": "cosine",
"scoring.similarity.ngrams": "1",
"scoring.similarity.stopword.file": "stopwords.txt",
"selenium.driver": "firefox",
"selenium.firefox.allowed.hosts": "localhost",
"selenium.firefox.binary.timeout": "45",
"selenium.firefox.enable.flash": "false",
"selenium.firefox.load.image": "1",
"selenium.firefox.load.stylesheet": "1",
"selenium.grid.driver": "firefox",
"selenium.hub.host": "localhost",
"selenium.hub.path": "/wd/hub",
"selenium.hub.port": "4444",
"selenium.hub.protocol": "http",
"selenium.take.screenshot": "false",
"solr.auth": "false",
"solr.commit.index": "true",
"solr.commit.size": "250",
"solr.mapping.file": "solrindex-mapping.xml",
"solr.server.type": "http",
"solr.server.url": "http://127.0.0.1:8983/solr/",
"store.http.headers": "false",
"store.http.request": "false",
"store.ip.address": "false",
"subcollection.default.fieldname": "subcollection",
"tfile.fs.input.buffer.size": "262144",
"tfile.fs.output.buffer.size": "262144",
"tfile.io.chunk.size": "1048576",
"tika.extractor": "none",
"tika.extractor.boilerpipe.algorithm": "ArticleExtractor",
"tika.uppercase.element.names": "true",
"urlfilter.automaton.file": "automaton-urlfilter.txt",
"urlfilter.domain.file": "domain-urlfilter.txt",
"urlfilter.prefix.file": "prefix-urlfilter.txt",
"urlfilter.regex.file": "regex-urlfilter.txt",
"urlfilter.suffix.file": "suffix-urlfilter.txt",
"urlnormalizer.loop.count": "1",
"urlnormalizer.order": "org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer",
"urlnormalizer.regex.file": "regex-normalize.xml",
"yarn.acl.enable": "false",
"yarn.admin.acl": "*",
"yarn.am.liveness-monitor.expiry-interval-ms": "600000",
"yarn.app.mapreduce.am.command-opts": "-Xmx1024m",
"yarn.app.mapreduce.am.container.log.backups": "0",
"yarn.app.mapreduce.am.container.log.limit.kb": "0",
"yarn.app.mapreduce.am.job.committer.cancel-timeout": "60000",
"yarn.app.mapreduce.am.job.committer.commit-window": "10000",
"yarn.app.mapreduce.am.job.task.listener.thread-count": "30",
"yarn.app.mapreduce.am.resource.cpu-vcores": "1",
"yarn.app.mapreduce.am.resource.mb": "1536",
"yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms": "1000",
"yarn.app.mapreduce.am.staging-dir": "/tmp/hadoop-yarn/staging",
"yarn.app.mapreduce.client-am.ipc.max-retries": "3",
"yarn.app.mapreduce.client-am.ipc.max-retries-on-timeouts": "3",
"yarn.app.mapreduce.client.max-retries": "3",
"yarn.app.mapreduce.task.container.log.backups": "0",
"yarn.client.application-client-protocol.poll-interval-ms": "200",
"yarn.client.failover-proxy-provider": "org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider",
"yarn.client.failover-retries": "0",
"yarn.client.failover-retries-on-socket-timeouts": "0",
"yarn.client.max-nodemanagers-proxies": "500",
"yarn.client.nodemanager-client-async.thread-pool-max-size": "500",
"yarn.http.policy": "HTTP_ONLY",
"yarn.ipc.rpc.class": "org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC",
"yarn.ipc.serializer.type": "protocolbuffers",
"yarn.log-aggregation-enable": "false",
"yarn.log-aggregation.retain-check-interval-seconds": "-1",
"yarn.log-aggregation.retain-seconds": "-1",
"yarn.nm.liveness-monitor.expiry-interval-ms": "600000",
"yarn.nodemanager.address": "${yarn.nodemanager.hostname}:0",
"yarn.nodemanager.admin-env": "MALLOC_ARENA_MAX=$MALLOC_ARENA_MAX",
"yarn.nodemanager.aux-services.mapreduce_shuffle.class": "org.apache.hadoop.mapred.ShuffleHandler",
"yarn.nodemanager.container-executor.class": "org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor",
"yarn.nodemanager.container-manager.thread-count": "20",
"yarn.nodemanager.container-monitor.interval-ms": "3000",
"yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled": "false",
"yarn.nodemanager.delete.debug-delay-sec": "0",
"yarn.nodemanager.delete.thread-count": "4",
"yarn.nodemanager.disk-health-checker.interval-ms": "120000",
"yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage": "100.0",
"yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb": "0",
"yarn.nodemanager.disk-health-checker.min-healthy-disks": "0.25",
"yarn.nodemanager.env-whitelist": "JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,HADOOP_YARN_HOME",
"yarn.nodemanager.health-checker.interval-ms": "600000",
"yarn.nodemanager.health-checker.script.timeout-ms": "1200000",
"yarn.nodemanager.hostname": "0.0.0.0",
"yarn.nodemanager.keytab": "/etc/krb5.keytab",
"yarn.nodemanager.linux-container-executor.cgroups.hierarchy": "/hadoop-yarn",
"yarn.nodemanager.linux-container-executor.cgroups.mount": "false",
"yarn.nodemanager.linux-container-executor.nonsecure-mode.local-user": "nobody",
"yarn.nodemanager.linux-container-executor.nonsecure-mode.user-pattern": "^[_.A-Za-z0-9][-@_.A-Za-z0-9]{0,255}?[$]?$",
"yarn.nodemanager.linux-container-executor.resources-handler.class": "org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler",
"yarn.nodemanager.local-cache.max-files-per-directory": "8192",
"yarn.nodemanager.local-dirs": "${hadoop.tmp.dir}/nm-local-dir",
"yarn.nodemanager.localizer.address": "${yarn.nodemanager.hostname}:8040",
"yarn.nodemanager.localizer.cache.cleanup.interval-ms": "600000",
"yarn.nodemanager.localizer.cache.target-size-mb": "10240",
"yarn.nodemanager.localizer.client.thread-count": "5",
"yarn.nodemanager.localizer.fetch.thread-count": "4",
"yarn.nodemanager.log-aggregation.compression-type": "none",
"yarn.nodemanager.log-dirs": "${yarn.log.dir}/userlogs",
"yarn.nodemanager.log.retain-seconds": "10800",
"yarn.nodemanager.pmem-check-enabled": "true",
"yarn.nodemanager.process-kill-wait.ms": "2000",
"yarn.nodemanager.remote-app-log-dir": "/tmp/logs",
"yarn.nodemanager.remote-app-log-dir-suffix": "logs",
"yarn.nodemanager.resource.cpu-vcores": "8",
"yarn.nodemanager.resource.memory-mb": "8192",
"yarn.nodemanager.resourcemanager.connect.retry_interval.secs": "30",
"yarn.nodemanager.resourcemanager.connect.wait.secs": "900",
"yarn.nodemanager.resourcemanager.minimum.version": "NONE",
"yarn.nodemanager.sleep-delay-before-sigkill.ms": "250",
"yarn.nodemanager.vmem-check-enabled": "true",
"yarn.nodemanager.vmem-pmem-ratio": "2.1",
"yarn.nodemanager.webapp.address": "${yarn.nodemanager.hostname}:8042",
"yarn.resourcemanager.address": "${yarn.resourcemanager.hostname}:8032",
"yarn.resourcemanager.admin.address": "${yarn.resourcemanager.hostname}:8033",
"yarn.resourcemanager.admin.client.thread-count": "1",
"yarn.resourcemanager.am.max-attempts": "2",
"yarn.resourcemanager.amliveliness-monitor.interval-ms": "1000",
"yarn.resourcemanager.application-tokens.master-key-rolling-interval-secs": "86400",
"yarn.resourcemanager.client.thread-count": "50",
"yarn.resourcemanager.configuration.provider-class": "org.apache.hadoop.yarn.LocalConfigurationProvider",
"yarn.resourcemanager.connect.max-wait.ms": "900000",
"yarn.resourcemanager.connect.retry-interval.ms": "30000",
"yarn.resourcemanager.container-tokens.master-key-rolling-interval-secs": "86400",
"yarn.resourcemanager.container.liveness-monitor.interval-ms": "600000",
"yarn.resourcemanager.delayed.delegation-token.removal-interval-ms": "30000",
"yarn.resourcemanager.fs.state-store.retry-policy-spec": "2000, 500",
"yarn.resourcemanager.fs.state-store.uri": "${hadoop.tmp.dir}/yarn/system/rmstore",
"yarn.resourcemanager.ha.automatic-failover.embedded": "true",
"yarn.resourcemanager.ha.automatic-failover.enabled": "true",
"yarn.resourcemanager.ha.automatic-failover.zk-base-path": "/yarn-leader-election",
"yarn.resourcemanager.ha.enabled": "false",
"yarn.resourcemanager.history-writer.multi-threaded-dispatcher.pool-size": "10",
"yarn.resourcemanager.hostname": "0.0.0.0",
"yarn.resourcemanager.keytab": "/etc/krb5.keytab",
"yarn.resourcemanager.max-completed-applications": "10000",
"yarn.resourcemanager.nm.liveness-monitor.interval-ms": "1000",
"yarn.resourcemanager.nodemanager.minimum.version": "NONE",
"yarn.resourcemanager.nodemanagers.heartbeat-interval-ms": "1000",
"yarn.resourcemanager.recovery.enabled": "false",
"yarn.resourcemanager.resource-tracker.address": "${yarn.resourcemanager.hostname}:8031",
"yarn.resourcemanager.resource-tracker.client.thread-count": "50",
"yarn.resourcemanager.scheduler.address": "${yarn.resourcemanager.hostname}:8030",
"yarn.resourcemanager.scheduler.class": "org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler",
"yarn.resourcemanager.scheduler.client.thread-count": "50",
"yarn.resourcemanager.scheduler.monitor.enable": "false",
"yarn.resourcemanager.scheduler.monitor.policies": "org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy",
"yarn.resourcemanager.state-store.max-completed-applications": "${yarn.resourcemanager.max-completed-applications}",
"yarn.resourcemanager.store.class": "org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore",
"yarn.resourcemanager.webapp.address": "${yarn.resourcemanager.hostname}:8088",
"yarn.resourcemanager.webapp.https.address": "${yarn.resourcemanager.hostname}:8090",
"yarn.resourcemanager.zk-acl": "world:anyone:rwcda",
"yarn.resourcemanager.zk-num-retries": "500",
"yarn.resourcemanager.zk-retry-interval-ms": "2000",
"yarn.resourcemanager.zk-state-store.parent-path": "/rmstore",
"yarn.resourcemanager.zk-timeout-ms": "10000",
"yarn.scheduler.maximum-allocation-mb": "8192",
"yarn.scheduler.maximum-allocation-vcores": "32",
"yarn.scheduler.minimum-allocation-mb": "1024",
"yarn.scheduler.minimum-allocation-vcores": "1",
"yarn.timeline-service.address": "${yarn.timeline-service.hostname}:10200",
"yarn.timeline-service.enabled": "true",
"yarn.timeline-service.generic-application-history.enabled": "false",
"yarn.timeline-service.generic-application-history.fs-history-store.compression-type": "none",
"yarn.timeline-service.generic-application-history.fs-history-store.uri": "${hadoop.log.dir}/yarn/system/history",
"yarn.timeline-service.generic-application-history.store-class": "org.apache.hadoop.yarn.server.applicationhistoryservice.FileSystemApplicationHistoryStore",
"yarn.timeline-service.handler-thread-count": "10",
"yarn.timeline-service.hostname": "0.0.0.0",
"yarn.timeline-service.leveldb-timeline-store.path": "${yarn.log.dir}/timeline",
"yarn.timeline-service.leveldb-timeline-store.read-cache-size": "104857600",
"yarn.timeline-service.leveldb-timeline-store.start-time-read-cache-size": "10000",
"yarn.timeline-service.leveldb-timeline-store.start-time-write-cache-size": "10000",
"yarn.timeline-service.leveldb-timeline-store.ttl-interval-ms": "300000",
"yarn.timeline-service.store-class": "org.apache.hadoop.yarn.server.applicationhistoryservice.timeline.LeveldbTimelineStore",
"yarn.timeline-service.ttl-enable": "true",
"yarn.timeline-service.ttl-ms": "604800000",
"yarn.timeline-service.webapp.address": "${yarn.timeline-service.hostname}:8188",
"yarn.timeline-service.webapp.https.address": "${yarn.timeline-service.hostname}:8190"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment