pzgz/gmetad.conf

## gmetad.conf
# This is an example of a Ganglia Meta Daemon configuration file
#                http://ganglia.sourceforge.net/
#
# $Id: gmetad.conf 667 2006-07-20 08:49:41Z knobi1 $
#
#-------------------------------------------------------------------------------
# Setting the debug_level to 1 will keep the daemon in the foreground and
# show only error messages. Setting this value higher than 1 will make
# gmetad output debugging information and stay in the foreground.
# default: 0
# debug_level 10
#
#-------------------------------------------------------------------------------
# What to monitor. The most important section of this file.
#
# The data_source tag specifies either a cluster or a grid to
# monitor. If we detect the source is a cluster, we will maintain a complete
# set of RRD databases for it, which can be used to create historical
# graphs of the metrics. If the source is a grid (it comes from another gmetad),
# we will only maintain summary RRDs for it.
#
# Format:
# data_source "my cluster" [polling interval] address1:port address2:port ...
#
# The keyword 'data_source' must immediately be followed by a unique
# string which identifies the source, then an optional polling interval in
# seconds. The source will be polled at this interval on average.
# If the polling interval is omitted, 15 seconds is assumed.
#
# A list of machines which service the data source follows, in the
# format ip:port, or name:port. If a port is not specified then 8649
# (the default gmond port) is assumed.
# default: There is no default value
#
# data_source "my cluster" 10 localhost  my.machine.edu:8649  1.2.3.5:8655
# data_source "my grid" 50 1.3.4.7:8655 grid.org:8651 grid-backup.org:8651
# data_source "another source" 1.3.4.7:8655  1.3.4.8

# Three data sources, one host each. No polling interval is given, so each
# source is polled at the default interval (15 seconds, per the format notes
# above).
data_source "Hadoop" dspprod-m:8649
data_source "Console" dspprod-c:8649
data_source "App" dspprod-a1:8649

# Name of the grid that wraps all of the data sources above.
gridname "Allyes Dsp Hangzhou"

# 15 seconds for 1 month, 6 minutes for 1 year
# Custom round-robin archives, replacing the defaults listed below:
#   1 step per row   x 178560 rows -> 15-second rows kept for ~31 days
#   24 steps per row x  88800 rows -> 6-minute rows kept for ~370 days
# (the arithmetic assumes a 15-second RRD step -- TODO confirm for this gmetad)
RRAs "RRA:AVERAGE:0.5:1:178560" "RRA:AVERAGE:0.5:24:88800"
# Store the RRD files here instead of the default "/var/lib/ganglia/rrds".
rrd_rootdir "/data/ganglia/rrds"
#
# Round-Robin Archives
# You can specify custom Round-Robin archives here (defaults are listed below)
#
# RRAs "RRA:AVERAGE:0.5:1:244" "RRA:AVERAGE:0.5:24:244" "RRA:AVERAGE:0.5:168:244" "RRA:AVERAGE:0.5:672:244" \
#      "RRA:AVERAGE:0.5:5760:374"
#

#
#-------------------------------------------------------------------------------
# Scalability mode. If on, we summarize over downstream grids, and respect
# authority tags. If off, we take on 2.5.0-era behavior: we do not wrap our output
# in <GRID></GRID> tags, we ignore all <GRID> tags we see, and always assume
# we are the "authority" on data source feeds. This approach does not scale to
# large groups of clusters, but is provided for backwards compatibility.
# default: on
# scalable off
#
#-------------------------------------------------------------------------------
# The name of this Grid. All the data sources above will be wrapped in a GRID
# tag with this name.
# default: Unspecified
# gridname "MyGrid"
#
#-------------------------------------------------------------------------------
# The authority URL for this grid. Used by other gmetads to locate graphs
# for our data sources. Generally points to a ganglia/
# website on this machine.
# default: "http://hostname/ganglia/",
#   where hostname is the name of this machine, as defined by gethostname().
# authority "http://mycluster.org/newprefix/"
#
#-------------------------------------------------------------------------------
# List of machines this gmetad will share XML with. Localhost
# is always trusted.
# default: There is no default value
# trusted_hosts 127.0.0.1 169.229.50.165 my.gmetad.org
#
#-------------------------------------------------------------------------------
# If you want any host which connects to the gmetad XML to receive
# data, then set this value to "on"
# default: off
# all_trusted on
#
#-------------------------------------------------------------------------------
# If you don't want gmetad to setuid then set this to off
# default: on
# setuid off
#
#-------------------------------------------------------------------------------
# User gmetad will setuid to (defaults to "ganglia")
# default: "ganglia"
# setuid_username "ganglia"
#
#-------------------------------------------------------------------------------
# The port gmetad will answer requests for XML
# default: 8651
# xml_port 8651
#
#-------------------------------------------------------------------------------
# The port gmetad will answer queries for XML. This facility allows
# simple subtree and summation views of the XML tree.
# default: 8652
# interactive_port 8652
#
#-------------------------------------------------------------------------------
# The number of threads answering XML requests
# default: 4
# server_threads 10
#
#-------------------------------------------------------------------------------
# Where gmetad stores its round-robin databases
# default: "/var/lib/ganglia/rrds"
# rrd_rootdir "/some/other/place"

## gmond.conf
/* This configuration is as close to 2.5.x default behavior as possible
   The values closely match ./gmond/metric.h definitions in 2.5.x */
/* Daemon-wide settings for this gmond instance. */
globals {
  daemonize = yes
  setuid = yes
  user = ganglia
  debug_level = 0
  max_udp_msg_len = 1472
  mute = no
  deaf = no
  allow_extra_data = yes
  host_dmax = 86400 /* secs (24 hours); set this so that dead hosts get removed */
  cleanup_threshold = 300 /* secs */
  gexec = no
  send_metadata_interval = 30 /* secs; set this, otherwise gmond sometimes gets stuck */
}

/*
 * The cluster attributes specified will be used as part of the <CLUSTER>
 * tag that will wrap all hosts collected by this instance.
 */
cluster {
  /* Placeholder: replace with the real cluster name before deploying. */
  name = "cluster_name_to_change"
  owner = "Allyes"
  latlong = "unspecified"
  url = "unspecified"
}

/* The host section describes attributes of the host, like the location */
host {
  location = "unspecified"
}

/* Feel free to specify as many udp_send_channels as you like.  Gmond
   used to only support having a single channel */
udp_send_channel {
  #bind_hostname = yes # Highly recommended, soon to be default.
                       # This option tells gmond to use a source address
                       # that resolves to the machine's hostname.  Without
                       # this, the metrics may appear to come from any
                       # interface and the DNS names associated with
                       # those IPs will be used to create the RRDs.
  /* Placeholder: replace with the host this channel should send to. */
  host = host_name_to_change
  port = 8649
  ttl = 1
}

/* You can specify as many udp_recv_channels as you like as well. */
udp_recv_channel {
  port = 8649
}

/* You can specify as many tcp_accept_channels as you like to share
   an xml description of the state of the cluster */
tcp_accept_channel {
  port = 8649
}

/* Each metrics module that is referenced by gmond must be specified and
   loaded. If the module has been statically linked with gmond, it does
   not require a load path. However all dynamically loadable modules must
   include a load path. */
modules {
  module {
    name = "core_metrics"
  }
  module {
    name = "cpu_module"
    path = "modcpu.so"
  }
  module {
    name = "disk_module"
    path = "moddisk.so"
  }
  module {
    name = "load_module"
    path = "modload.so"
  }
  module {
    name = "mem_module"
    path = "modmem.so"
  }
  module {
    name = "net_module"
    path = "modnet.so"
  }
  module {
    name = "proc_module"
    path = "modproc.so"
  }
  module {
    name = "sys_module"
    path = "modsys.so"
  }
}

include ('/etc/ganglia/conf.d/*.conf')

/* The old internal 2.5.x metric array has been replaced by the following
   collection_group directives.  What follows is the default behavior for
   collecting and sending metrics that is as close to 2.5.x behavior as
   possible. */

/* This collection group will cause a heartbeat (or beacon) to be sent every
   20 seconds.  In the heartbeat is the GMOND_STARTED data which expresses
   the age of the running gmond. */
collection_group {
  collect_once = yes
  time_threshold = 20
  metric {
    name = "heartbeat"
  }
}

/* This collection group will send general info about this host every
   1200 secs.
   This information doesn't change between reboots and is only collected
   once. */
collection_group {
  collect_once = yes
  time_threshold = 1200
  metric {
    name = "cpu_num"
    title = "CPU Count"
  }
  metric {
    name = "cpu_speed"
    title = "CPU Speed"
  }
  metric {
    name = "mem_total"
    title = "Memory Total"
  }
  /* Should this be here? Swap can be added/removed between reboots. */
  metric {
    name = "swap_total"
    title = "Swap Space Total"
  }
  metric {
    name = "boottime"
    title = "Last Boot Time"
  }
  metric {
    name = "machine_type"
    title = "Machine Type"
  }
  metric {
    name = "os_name"
    title = "Operating System"
  }
  metric {
    name = "os_release"
    title = "Operating System Release"
  }
  metric {
    name = "location"
    title = "Location"
  }
}

/* This collection group will send the status of gexecd for this host
   every 300 secs.*/
/* Unlike 2.5.x the default behavior is to report gexecd OFF. */
collection_group {
  collect_once = yes
  time_threshold = 300
  metric {
    name = "gexec"
    title = "Gexec Status"
  }
}

/* This collection group will collect the CPU status info every 20 secs.
   The time threshold is set to 90 seconds.  In honesty, this
   time_threshold could be set significantly higher to reduce
   unnecessary network chatter. */
collection_group {
  collect_every = 20
  time_threshold = 90
  /* CPU status */
  metric {
    name = "cpu_user"
    value_threshold = "1.0"
    title = "CPU User"
  }
  metric {
    name = "cpu_system"
    value_threshold = "1.0"
    title = "CPU System"
  }
  metric {
    name = "cpu_idle"
    value_threshold = "5.0"
    title = "CPU Idle"
  }
  metric {
    name = "cpu_nice"
    value_threshold = "1.0"
    title = "CPU Nice"
  }
  metric {
    name = "cpu_aidle"
    value_threshold = "5.0"
    title = "CPU aidle"
  }
  metric {
    name = "cpu_wio"
    value_threshold = "1.0"
    title = "CPU wio"
  }
  /* The next two metrics are optional if you want more detail...
     ... since they are accounted for in cpu_system.
  metric {
    name = "cpu_intr"
    value_threshold = "1.0"
    title = "CPU intr"
  }
  metric {
    name = "cpu_sintr"
    value_threshold = "1.0"
    title = "CPU sintr"
  }
  */
}

/* This collection group collects the load averages every 20 secs and
   sends them at least every 90 secs. */
collection_group {
  collect_every = 20
  time_threshold = 90
  /* Load Averages */
  metric {
    name = "load_one"
    value_threshold = "1.0"
    title = "One Minute Load Average"
  }
  metric {
    name = "load_five"
    value_threshold = "1.0"
    title = "Five Minute Load Average"
  }
  metric {
    name = "load_fifteen"
    value_threshold = "1.0"
    title = "Fifteen Minute Load Average"
  }
}

/* This group collects the number of running and total processes */
collection_group {
  collect_every = 80
  time_threshold = 950
  metric {
    name = "proc_run"
    value_threshold = "1.0"
    title = "Total Running Processes"
  }
  metric {
    name = "proc_total"
    value_threshold = "1.0"
    title = "Total Processes"
  }
}

/* This collection group grabs the volatile memory metrics every 40 secs and
   sends them at least every 180 secs.  This time_threshold can be increased
   significantly to reduce unneeded network traffic. */
collection_group {
  collect_every = 40
  time_threshold = 180
  metric {
    name = "mem_free"
    value_threshold = "1024.0"
    title = "Free Memory"
  }
  metric {
    name = "mem_shared"
    value_threshold = "1024.0"
    title = "Shared Memory"
  }
  metric {
    name = "mem_buffers"
    value_threshold = "1024.0"
    title = "Memory Buffers"
  }
  metric {
    name = "mem_cached"
    value_threshold = "1024.0"
    title = "Cached Memory"
  }
  metric {
    name = "swap_free"
    value_threshold = "1024.0"
    title = "Free Swap Space"
  }
}

/* This collection group collects the network traffic metrics (bytes and
   packets, in and out) every 40 secs and sends them at least every
   300 secs. */
collection_group {
  collect_every = 40
  time_threshold = 300
  metric {
    name = "bytes_out"
    value_threshold = 4096
    title = "Bytes Sent"
  }
  metric {
    name = "bytes_in"
    value_threshold = 4096
    title = "Bytes Received"
  }
  metric {
    name = "pkts_in"
    value_threshold = 256
    title = "Packets Received"
  }
  metric {
    name = "pkts_out"
    value_threshold = 256
    title = "Packets Sent"
  }
}

/* Different than 2.5.x default since the old config made no sense */
collection_group {
  collect_every = 1800
  time_threshold = 3600
  metric {
    name = "disk_total"
    value_threshold = 1.0
    title = "Total Disk Space"
  }
}

/* This collection group collects disk_free and part_max_used every 40 secs
   and sends them at least every 180 secs. */
collection_group {
  collect_every = 40
  time_threshold = 180
  metric {
    name = "disk_free"
    value_threshold = 1.0
    title = "Disk Space Available"
  }
  metric {
    name = "part_max_used"
    value_threshold = 1.0
    title = "Maximum Disk Space Used"
  }
}
	# This is an example of a Ganglia Meta Daemon configuration file
	# http://ganglia.sourceforge.net/
	#
	# $Id: gmetad.conf 667 2006-07-20 08:49:41Z knobi1 $
	#
	#-------------------------------------------------------------------------------
	# Setting the debug_level to 1 will keep the daemon in the foreground and
	# show only error messages. Setting this value higher than 1 will make
	# gmetad output debugging information and stay in the foreground.
	# default: 0
	# debug_level 10
	#
	#-------------------------------------------------------------------------------
	# What to monitor. The most important section of this file.
	#
	# The data_source tag specifies either a cluster or a grid to
	# monitor. If we detect the source is a cluster, we will maintain a complete
	# set of RRD databases for it, which can be used to create historical
	# graphs of the metrics. If the source is a grid (it comes from another gmetad),
	# we will only maintain summary RRDs for it.
	#
	# Format:
	# data_source "my cluster" [polling interval] address1:port address2:port ...
	#
	# The keyword 'data_source' must immediately be followed by a unique
	# string which identifies the source, then an optional polling interval in
	# seconds. The source will be polled at this interval on average.
	# If the polling interval is omitted, 15 seconds is assumed.
	#
	# A list of machines which service the data source follows, in the
	# format ip:port, or name:port. If a port is not specified then 8649
	# (the default gmond port) is assumed.
	# default: There is no default value
	#
	# data_source "my cluster" 10 localhost my.machine.edu:8649 1.2.3.5:8655
	# data_source "my grid" 50 1.3.4.7:8655 grid.org:8651 grid-backup.org:8651
	# data_source "another source" 1.3.4.7:8655 1.3.4.8

	data_source "Hadoop" dspprod-m:8649
	data_source "Console" dspprod-c:8649
	data_source "App" dspprod-a1:8649

	gridname "Allyes Dsp Hangzhou"

	# 15 seconds for 1 month, 6 minutes for 1 year
	RRAs "RRA:AVERAGE:0.5:1:178560" "RRA:AVERAGE:0.5:24:88800"
	rrd_rootdir "/data/ganglia/rrds"
	#
	# Round-Robin Archives
	# You can specify custom Round-Robin archives here (defaults are listed below)
	#
	# RRAs "RRA:AVERAGE:0.5:1:244" "RRA:AVERAGE:0.5:24:244" "RRA:AVERAGE:0.5:168:244" "RRA:AVERAGE:0.5:672:244" \
	# "RRA:AVERAGE:0.5:5760:374"
	#

	#
	#-------------------------------------------------------------------------------
	# Scalability mode. If on, we summarize over downstream grids, and respect
	# authority tags. If off, we take on 2.5.0-era behavior: we do not wrap our output
	# in <GRID></GRID> tags, we ignore all <GRID> tags we see, and always assume
	# we are the "authority" on data source feeds. This approach does not scale to
	# large groups of clusters, but is provided for backwards compatibility.
	# default: on
	# scalable off
	#
	#-------------------------------------------------------------------------------
	# The name of this Grid. All the data sources above will be wrapped in a GRID
	# tag with this name.
	# default: Unspecified
	# gridname "MyGrid"
	#
	#-------------------------------------------------------------------------------
	# The authority URL for this grid. Used by other gmetads to locate graphs
	# for our data sources. Generally points to a ganglia/
	# website on this machine.
	# default: "http://hostname/ganglia/",
	# where hostname is the name of this machine, as defined by gethostname().
	# authority "http://mycluster.org/newprefix/"
	#
	#-------------------------------------------------------------------------------
	# List of machines this gmetad will share XML with. Localhost
	# is always trusted.
	# default: There is no default value
	# trusted_hosts 127.0.0.1 169.229.50.165 my.gmetad.org
	#
	#-------------------------------------------------------------------------------
	# If you want any host which connects to the gmetad XML to receive
	# data, then set this value to "on"
	# default: off
	# all_trusted on
	#
	#-------------------------------------------------------------------------------
	# If you don't want gmetad to setuid then set this to off
	# default: on
	# setuid off
	#
	#-------------------------------------------------------------------------------
	# User gmetad will setuid to (defaults to "ganglia")
	# default: "ganglia"
	# setuid_username "ganglia"
	#
	#-------------------------------------------------------------------------------
	# The port gmetad will answer requests for XML
	# default: 8651
	# xml_port 8651
	#
	#-------------------------------------------------------------------------------
	# The port gmetad will answer queries for XML. This facility allows
	# simple subtree and summation views of the XML tree.
	# default: 8652
	# interactive_port 8652
	#
	#-------------------------------------------------------------------------------
	# The number of threads answering XML requests
	# default: 4
	# server_threads 10
	#
	#-------------------------------------------------------------------------------
	# Where gmetad stores its round-robin databases
	# default: "/var/lib/ganglia/rrds"
	# rrd_rootdir "/some/other/place"
	/* This configuration is as close to 2.5.x default behavior as possible
	The values closely match ./gmond/metric.h definitions in 2.5.x */
	/* Daemon-wide settings for this gmond instance. */
	globals {
	daemonize = yes
	setuid = yes
	user = ganglia
	debug_level = 0
	max_udp_msg_len = 1472
	mute = no
	deaf = no
	allow_extra_data = yes
	host_dmax = 86400 /* secs (24 hours); set this so that dead hosts get removed */
	cleanup_threshold = 300 /* secs */
	gexec = no
	send_metadata_interval = 30 /* secs; set this, otherwise gmond sometimes gets stuck */
	}

	/*
	* The cluster attributes specified will be used as part of the <CLUSTER>
	* tag that will wrap all hosts collected by this instance.
	*/
	cluster {
	name = "cluster_name_to_change"
	owner = "Allyes"
	latlong = "unspecified"
	url = "unspecified"
	}

	/* The host section describes attributes of the host, like the location */
	host {
	location = "unspecified"
	}

	/* Feel free to specify as many udp_send_channels as you like. Gmond
	used to only support having a single channel */
	udp_send_channel {
	#bind_hostname = yes # Highly recommended, soon to be default.
	# This option tells gmond to use a source address
	# that resolves to the machine's hostname. Without
	# this, the metrics may appear to come from any
	# interface and the DNS names associated with
	# those IPs will be used to create the RRDs.
	host = host_name_to_change
	port = 8649
	ttl = 1
	}

	/* You can specify as many udp_recv_channels as you like as well. */
	udp_recv_channel {
	port = 8649
	}

	/* You can specify as many tcp_accept_channels as you like to share
	an xml description of the state of the cluster */
	tcp_accept_channel {
	port = 8649
	}

	/* Each metrics module that is referenced by gmond must be specified and
	loaded. If the module has been statically linked with gmond, it does
	not require a load path. However all dynamically loadable modules must
	include a load path. */
	modules {
	module {
	name = "core_metrics"
	}
	module {
	name = "cpu_module"
	path = "modcpu.so"
	}
	module {
	name = "disk_module"
	path = "moddisk.so"
	}
	module {
	name = "load_module"
	path = "modload.so"
	}
	module {
	name = "mem_module"
	path = "modmem.so"
	}
	module {
	name = "net_module"
	path = "modnet.so"
	}
	module {
	name = "proc_module"
	path = "modproc.so"
	}
	module {
	name = "sys_module"
	path = "modsys.so"
	}
	}

	include ('/etc/ganglia/conf.d/*.conf')

	/* The old internal 2.5.x metric array has been replaced by the following
	collection_group directives. What follows is the default behavior for
	collecting and sending metrics that is as close to 2.5.x behavior as
	possible. */

	/* This collection group will cause a heartbeat (or beacon) to be sent every
	20 seconds. In the heartbeat is the GMOND_STARTED data which expresses
	the age of the running gmond. */
	collection_group {
	collect_once = yes
	time_threshold = 20
	metric {
	name = "heartbeat"
	}
	}

	/* This collection group will send general info about this host every
	1200 secs.
	This information doesn't change between reboots and is only collected
	once. */
	collection_group {
	collect_once = yes
	time_threshold = 1200
	metric {
	name = "cpu_num"
	title = "CPU Count"
	}
	metric {
	name = "cpu_speed"
	title = "CPU Speed"
	}
	metric {
	name = "mem_total"
	title = "Memory Total"
	}
	/* Should this be here? Swap can be added/removed between reboots. */
	metric {
	name = "swap_total"
	title = "Swap Space Total"
	}
	metric {
	name = "boottime"
	title = "Last Boot Time"
	}
	metric {
	name = "machine_type"
	title = "Machine Type"
	}
	metric {
	name = "os_name"
	title = "Operating System"
	}
	metric {
	name = "os_release"
	title = "Operating System Release"
	}
	metric {
	name = "location"
	title = "Location"
	}
	}

	/* This collection group will send the status of gexecd for this host
	every 300 secs.*/
	/* Unlike 2.5.x the default behavior is to report gexecd OFF. */
	collection_group {
	collect_once = yes
	time_threshold = 300
	metric {
	name = "gexec"
	title = "Gexec Status"
	}
	}

	/* This collection group will collect the CPU status info every 20 secs.
	The time threshold is set to 90 seconds. In honesty, this
	time_threshold could be set significantly higher to reduce
	unnecessary network chatter. */
	collection_group {
	collect_every = 20
	time_threshold = 90
	/* CPU status */
	metric {
	name = "cpu_user"
	value_threshold = "1.0"
	title = "CPU User"
	}
	metric {
	name = "cpu_system"
	value_threshold = "1.0"
	title = "CPU System"
	}
	metric {
	name = "cpu_idle"
	value_threshold = "5.0"
	title = "CPU Idle"
	}
	metric {
	name = "cpu_nice"
	value_threshold = "1.0"
	title = "CPU Nice"
	}
	metric {
	name = "cpu_aidle"
	value_threshold = "5.0"
	title = "CPU aidle"
	}
	metric {
	name = "cpu_wio"
	value_threshold = "1.0"
	title = "CPU wio"
	}
	/* The next two metrics are optional if you want more detail...
	... since they are accounted for in cpu_system.
	metric {
	name = "cpu_intr"
	value_threshold = "1.0"
	title = "CPU intr"
	}
	metric {
	name = "cpu_sintr"
	value_threshold = "1.0"
	title = "CPU sintr"
	}
	*/
	}

	collection_group {
	collect_every = 20
	time_threshold = 90
	/* Load Averages */
	metric {
	name = "load_one"
	value_threshold = "1.0"
	title = "One Minute Load Average"
	}
	metric {
	name = "load_five"
	value_threshold = "1.0"
	title = "Five Minute Load Average"
	}
	metric {
	name = "load_fifteen"
	value_threshold = "1.0"
	title = "Fifteen Minute Load Average"
	}
	}

	/* This group collects the number of running and total processes */
	collection_group {
	collect_every = 80
	time_threshold = 950
	metric {
	name = "proc_run"
	value_threshold = "1.0"
	title = "Total Running Processes"
	}
	metric {
	name = "proc_total"
	value_threshold = "1.0"
	title = "Total Processes"
	}
	}

	/* This collection group grabs the volatile memory metrics every 40 secs and
	sends them at least every 180 secs. This time_threshold can be increased
	significantly to reduce unneeded network traffic. */
	collection_group {
	collect_every = 40
	time_threshold = 180
	metric {
	name = "mem_free"
	value_threshold = "1024.0"
	title = "Free Memory"
	}
	metric {
	name = "mem_shared"
	value_threshold = "1024.0"
	title = "Shared Memory"
	}
	metric {
	name = "mem_buffers"
	value_threshold = "1024.0"
	title = "Memory Buffers"
	}
	metric {
	name = "mem_cached"
	value_threshold = "1024.0"
	title = "Cached Memory"
	}
	metric {
	name = "swap_free"
	value_threshold = "1024.0"
	title = "Free Swap Space"
	}
	}

	collection_group {
	collect_every = 40
	time_threshold = 300
	metric {
	name = "bytes_out"
	value_threshold = 4096
	title = "Bytes Sent"
	}
	metric {
	name = "bytes_in"
	value_threshold = 4096
	title = "Bytes Received"
	}
	metric {
	name = "pkts_in"
	value_threshold = 256
	title = "Packets Received"
	}
	metric {
	name = "pkts_out"
	value_threshold = 256
	title = "Packets Sent"
	}
	}

	/* Different than 2.5.x default since the old config made no sense */
	collection_group {
	collect_every = 1800
	time_threshold = 3600
	metric {
	name = "disk_total"
	value_threshold = 1.0
	title = "Total Disk Space"
	}
	}

	collection_group {
	collect_every = 40
	time_threshold = 180
	metric {
	name = "disk_free"
	value_threshold = 1.0
	title = "Disk Space Available"
	}
	metric {
	name = "part_max_used"
	value_threshold = 1.0
	title = "Maximum Disk Space Used"
	}
	}