Skip to content

Instantly share code, notes, and snippets.

@ruanjf
Last active December 19, 2015 18:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ruanjf/6000151 to your computer and use it in GitHub Desktop.
Save ruanjf/6000151 to your computer and use it in GitHub Desktop.
linux下监控进程的脚本,记录cpu、内存、netstat(本进程/TCP/ALL)、文件打开数,预警支持发邮件、记录日志
# code by rjf
# 启动方式
# cd /root/monitoring/ && nohup ./mt.sh &
# 替换路径
###########################################
# 监控程序(默认监听init)
p_name=java
# 监控网卡(默认监听eth0)
eth=eth0
# 监控时间间隔(单位秒)
s_time=900
# 抵达监控上限后的监控时间间隔(单位秒)
sr_time=300
# 超过上限的次数累计达到该值后,才执行采样并发送短信或者邮件(默认3次)
over_time=
# 采样输出位置(以斜杠结尾,默认脚本所在目录)
sample_dir=
# 抵达监控上限后的采样执行命令(可写多个命令)
run=netstat -anp|grep ${pid}/${p_name} > ${sample_dir}netstat_${p_name}_${gdate}.txt
#run=jstack -l ${pid} > ${sample_dir}jstack_${gdate}.txt
#run=jmap -dump:format=b,file=${sample_dir}${p_name}.heap.${gdate}.bin ${pid}
# 发送邮件到(支持多个)
mail_on=true
mail_to=ruanjiefeng@gmail.com
mail_to=r_jf@sina.cn
# 发短信到(基于http方式,支持多个)
msg_on=
msg_url=
msg_to=
# 监控cpu上限(cpu使用率参考top)
max_cpu=49
# 监控内存上限(mem使用百分比参考top)
max_mem=40
# 监控连接数(程序的)
max_netstat=400
# 文件打开最大数(按系统全部打开文件数 lsof_a 判断)
max_lsof=10000
#!/bin/bash
# Process monitor: start-up section.
# The script uses bash arrays, so it must be run by bash rather than a
# plain POSIX /bin/sh.

# Set to "true" to print progress messages while running.
debug="false"

# Absolute directory of this script, with a trailing slash. If the cd fails,
# pwd is not run, so the current directory is never mistaken for it.
dir="$(cd "$(dirname "$0")" && pwd)/"

# config.ini is expected next to the script; it must exist and be non-empty.
config_file="${dir}config.ini"
if [ ! -s "${config_file}" ]; then
  echo "配置文件不存在 ${config_file},退出监听"
  exit 1
fi
# Stores the config file's last-modification timestamp in the global
# config_file_modify.
# Globals: config_file (read), config_file_modify (written)
# The timestamp is requested with GNU stat's %y format instead of grepping
# the "Modify" line of the default output, because that label is translated
# in non-English locales and the old parse then yielded an empty value.
getconfigmodify() {
  config_file_modify=$(stat -c '%y' -- "${config_file}")
}
# Take the config file's timestamp at start-up and keep a copy in
# c_file_modify as the baseline value.
getconfigmodify
c_file_modify="${config_file_modify}"

# Program name to monitor until the configuration supplies another one.
p_name='init'
# Looks up the PID of the monitored program and stores it in the global pid
# (left empty when nothing matches).
# Globals: p_name (read), pid (written)
# `ps ux` lists the current user's processes; the first line that matches
# p_name (as a grep pattern) and does not contain "grep" supplies the PID
# from its second column. p_name is quoted so that a name containing spaces
# stays one pattern instead of being split into a pattern plus file names.
setpid() {
  pid=$(ps ux | grep -e "${p_name}" | grep -v grep | awk 'NR == 1 {print $2}')
}
# Resolve the PID for the default program name.
setpid

# Built-in defaults for the monitoring settings.
s_time=900            # seconds between checks
sr_time=300           # seconds between checks after a limit was exceeded
over_time=3           # number of exceedances required before alerting
sample_dir="${dir}"   # where samples and logs are written
mail_on='false'       # e-mail alerts disabled
msg_on='false'        # HTTP/SMS alerts disabled
max_cpu=100           # CPU limit
max_mem=40            # memory limit
max_netstat=400       # connection-count limit
max_lsof=10000        # open-file limit

# Running count of exceedances seen so far.
over=0

# Address and network interface; eth stays empty until configured.
ip='127.0.0.1'
eth=''
# Reads the IPv4 address of interface $eth into the global ip.
# Globals: eth (read), ip (written only when an address was found)
# Does nothing when eth is empty. Both ifconfig layouts are understood:
#   legacy:  "inet addr:192.168.1.10  Bcast:..."
#   current: "inet 192.168.1.10  netmask ..."
# When no address can be parsed, the previous value of ip is kept instead of
# being overwritten with an empty string.
setip() {
  local detected
  [ -n "$eth" ] || return 0
  detected=$(ifconfig "$eth" | awk '
    $1 == "inet" {
      addr = $2
      sub(/^addr:/, "", addr)   # legacy layout prefixes the address
      print addr
      exit
    }')
  if [ -n "$detected" ]; then
    ip=$detected
  fi
}
# Resolve the address for the default interface setting.
setip

# gdate holds the literal text ${date} (not a timestamp). A config value
# written with ${gdate} therefore keeps a ${date} placeholder after it has
# been expanded once.
gdate="\${date}"
# Loads settings from $config_file into the script's global variables.
#
# File format: one "name=value" per line. Empty lines and any line that
# contains a '#' are skipped (so values themselves cannot contain '#').
# Entries with an empty value are ignored and leave the current setting
# untouched. The keys run, mail_to and msg_to may repeat; each occurrence is
# appended to the arrays runs, mails_to and msgs_to.
#
# The value is everything after the FIRST '=', so commands such as
# "jmap -dump:format=b,file=..." are kept whole. Shell variables in the value
# (${pid}, ${p_name}, ${sample_dir}, ${gdate}, ...) are expanded with their
# values at the moment the line is read, which is why key order matters.
#
# NOTE(review): the expansion uses eval, so $(...) or backticks in the config
# file are executed. The file must only be writable by trusted users.
#
# Globals written: p_name, eth, s_time, sr_time, over_time, sample_dir, runs,
#   mail_on, mails_to, msg_on, msg_url, msgs_to, max_cpu, max_mem,
#   max_netstat, max_lsof, turn_time (plus pid/ip via setpid/setip).
getconfig() {
  local line name raw value
  # "|| [ -n "$line" ]" keeps a final line that has no trailing newline.
  while read -r line || [ -n "$line" ]; do
    case "$line" in
      '' | *'#'*) continue ;;
    esac
    case "$line" in
      *=*) ;;
      *) continue ;;
    esac

    name=${line%%=*}
    name=${name//[[:space:]]/}
    raw=${line#*=}
    # Escape embedded double quotes so they survive the eval below as
    # literal characters instead of terminating the string.
    raw=${raw//\"/\\\"}
    value=''
    eval "value=\"${raw}\""
    [ -n "$value" ] || continue

    case "$name" in
      p_name)
        p_name=$value
        setpid
        ;;
      eth)
        eth=$value
        setip
        ;;
      s_time) s_time=$value ;;
      sr_time) sr_time=$value ;;
      over_time) over_time=$value ;;
      sample_dir) sample_dir=$value ;;
      run) runs+=("$value") ;;
      mail_on) mail_on=$value ;;
      mail_to) mails_to+=("$value") ;;
      msg_on) msg_on=$value ;;
      msg_url) msg_url=$value ;;
      msg_to) msgs_to+=("$value") ;;
      max_cpu) max_cpu=$value ;;
      max_mem) max_mem=$value ;;
      max_netstat) max_netstat=$value ;;
      max_lsof) max_lsof=$value ;;
    esac
  done < "${config_file}"
  # Start with the normal polling interval.
  turn_time=$s_time
}
# Apply the settings from the configuration file.
getconfig

# Alert subject and alert body; both start out empty.
send_title=''
send_content=''
# Prints the current local time as a compact YYYYmmddHHMMSS stamp.
# Outputs: the timestamp on stdout (capture it with $(getdate)).
# Returns: the exit status of date.
# The stamp is written to stdout because `return` can only carry a small
# numeric exit status, never a 14-digit timestamp.
getdate() {
  date +"%Y%m%d%H%M%S"
}
# Mails the current alert to every configured recipient, one message each.
# Globals: mails_to (read, array), send_title (read, subject),
#          send_content (read, body)
# Each array element is passed to mail as a single argument, so an entry such
# as "Ops Team <ops@example.com>" is no longer split into several recipients.
# The arguments after "--" are passed through unchanged.
# NOTE(review): presumably they set the sender address via the underlying
# MTA - verify against the installed mail(1).
sendmail() {
  local email
  for email in "${mails_to[@]}"; do
    printf '%s\n' "${send_content}" \
      | mail -s "${send_title}" "${email}" -- -f web_error@rongji.com
  done
}
# Issues one HTTP request to msg_url per configured message recipient.
# Globals: msg_url (read), msgs_to (read, array),
#          http_code (written: status code printed by the last curl call)
# Does nothing when msg_url is empty. The array is expanded quoted so each
# configured entry triggers exactly one request, even if it contains spaces.
# NOTE(review): the recipient itself is never added to the request; every
# iteration fetches the same msg_url. Confirm how the gateway is meant to
# receive the recipient and message text.
sendmsg() {
  local msg
  [ -n "$msg_url" ] || return 0
  for msg in "${msgs_to[@]}"; do
    [ -n "$msg" ] || continue
    # 10 s connect timeout and 10 s overall limit; response body discarded.
    http_code=$(curl -o /dev/null -s -m 10 --connect-timeout 10 \
      -w '%{http_code}' "$msg_url")
  done
}
# Runs every configured sampling command (the runs array) in order.
# Arguments: $1 - when "one", each command is printed before it is run
#                 (the same happens when debug is "true").
# Globals:   runs (read, array), debug (read)
# Before each command the variable date is set to a fresh YYYYmmddHHMMSS
# stamp, so a ${date} placeholder inside the command text expands to it.
# The command text is handed to eval as one quoted string; evaluating an
# unquoted copy would word-split and glob it first and collapse whitespace
# inside quoted arguments.
sampling() {
  local i run date
  for ((i = 0; i < ${#runs[@]}; i++)); do
    date=$(date +"%Y%m%d%H%M%S")
    run=${runs[i]}
    if [ "$debug" = "true" ] || [ "$1" = "one" ]; then
      printf '%s\n' "${run}"
    fi
    eval "${run}"
  done
}
# Dispatches the current alert over every enabled channel.
# Globals: mail_on, msg_on (read) - a channel is used only when its switch
#          is exactly the string "true".
send() {
  case "$mail_on" in
    true) sendmail ;;
  esac
  case "$msg_on" in
    true) sendmsg ;;
  esac
}
# Appends the current measurement to the per-day log file and stores the
# same text in send_content.
# Globals read:    sample_dir, p_name, cpu, mem, nets_p_name, nets_tcp,
#                  nets_all, lsof_p, lsof_all, send_title
# Globals written: log_file (path of today's log), send_content (entry text)
# The log path is quoted so a sample_dir containing spaces works, and printf
# writes the entry verbatim (no backslash interpretation). As before, each
# entry is followed by one empty line; the append creates a missing file.
addlog() {
  log_file="${sample_dir}monitoring_${p_name}_$(date +"%Y-%m-%d").log"
  send_content="[$(date +"%Y-%m-%d %H:%M:%S")] cpu=${cpu} mem=${mem} netstat_p=${nets_p_name} netstat_t=${nets_tcp} netstat_a=${nets_all} lsof_p=${lsof_p} lsof_a=${lsof_all} ${send_title}"
  printf '%s\n\n' "${send_content}" >> "${log_file}"
}
# Takes one measurement of the monitored process, logs it, and raises an
# alert once limits have been exceeded often enough.
#
# Globals read:    pid, p_name, max_cpu, max_mem, max_netstat, max_lsof,
#                  over_time, s_time, sr_time, debug
# Globals written: cpu, mem, time, nets_p_name, nets_tcp, nets_all, lsof_p,
#                  lsof_all, send_title, send_content, over, turn_time
#
# Each exceeded limit prepends a "name[value] " tag to send_title. After
# addlog has recorded the sample, a non-empty title counts as one
# exceedance: over is incremented, and once it reaches over_time the
# sampling commands run, the alert is sent and over restarts at 0. The next
# poll then uses sr_time; with no exceedance it uses s_time.
monitoring() {
  cpu=0
  mem=0
  time=0
  # Line 8 of top's batch output is taken as the process row; columns 9-11
  # become cpu, mem and time. If that line is missing the zeros above stay.
  eval $(top -b -n 1 -p ${pid} | awk 'NR == 8 {print "cpu="$9" mem="$10" time="$11}')

  # Connection counts: lines for this process, then the totals that
  # `netstat -tnp` and `netstat -anp` print.
  nets_p_name=$(netstat -apn | grep ${pid}/${p_name} | wc -l)
  nets_tcp=$(netstat -tnp | wc -l)
  nets_all=$(netstat -anp | wc -l)

  # lsof line counts: for this process and for the whole system.
  lsof_p=$(lsof -p ${pid} | wc -l)
  lsof_all=$(lsof | wc -l)

  # cpu and mem can be fractional, so awk does the comparison.
  if [ "$(echo "$cpu $max_cpu" | awk '{print ($1 > $2)?"1":"0"}')" -eq "1" ]; then
    send_title="cpu_use[${cpu}] ${send_title}"
  fi
  if [ "$(echo "$mem $max_mem" | awk '{print ($1 > $2)?"1":"0"}')" -eq "1" ]; then
    send_title="mem_use[${mem}] ${send_title}"
  fi
  if [ "$nets_p_name" -gt "$max_netstat" ]; then
    send_title="netstat[${nets_p_name}] ${send_title}"
  fi
  # The open-file limit is checked against the system-wide count.
  if [ "$lsof_all" -gt "$max_lsof" ]; then
    send_title="lsof[${lsof_all}] ${send_title}"
  fi

  addlog

  # Nothing exceeded: keep polling at the normal interval.
  if [ -z "$send_title" ]; then
    turn_time=$s_time
    return
  fi

  if [ "$debug" = "true" ]; then
    echo -e "[$(date +"%Y-%m-%d %H:%M:%S")] 存在异常 ${send_title}\n"
  fi
  over=$((over + 1))
  if [ "$over" -ge "$over_time" ]; then
    sampling
    send
    over=0
    if [ "$debug" = "true" ]; then
      echo "发送信息"
    fi
  fi
  # Poll at the shorter interval and clear the alert text for the next round.
  turn_time=$sr_time
  unset send_title
  unset send_content
}
# ---- Main program ---------------------------------------------------------
# Usage: mt.sh        run the monitoring loop forever
#        mt.sh one    run the sampling commands once and exit

# Refuse to start when no matching process was found.
if [ -z "$pid" ]; then
  echo "指定程序不存在 ${p_name},退出监听"
  exit 1
fi

if [ "$debug" = "true" ] || [ "$1" = "one" ]; then
  echo "监听程序pid $pid"
  echo -e "监听程序名称 $p_name\n"
fi
if [ "$debug" = "true" ]; then
  echo "开始监听...."
fi

# One-shot mode. The mode argument is forwarded so that sampling prints each
# command it runs, and a completed run now exits with status 0 (success)
# instead of 1.
if [ "$1" = "one" ]; then
  echo "开始采样 ${pid}/${p_name}"
  sampling "$1"
  echo "完成采样"
  exit 0
fi

while true; do
  getconfigmodify
  plc=$(top -b -n 1 -p "${pid}" | wc -l)
  # Reload the configuration when the file's timestamp changed or when top
  # printed exactly 8 lines.
  # NOTE(review): 8 lines presumably means top showed no process row, i.e.
  # the process went away and the PID must be looked up again - verify
  # against the installed top.
  if [ "${c_file_modify}" != "${config_file_modify}" ] || [ "$plc" -eq "8" ]; then
    if [ "$debug" = "true" ]; then
      echo "更新配置文件"
    fi
    # The list settings are appended to on load, so clear them first.
    unset runs
    unset mails_to
    unset msgs_to
    getconfig
    c_file_modify=$config_file_modify
  fi
  monitoring
  # turn_time is set by monitoring (normal or shortened interval).
  sleep "$turn_time"
  if [ "$debug" = "true" ]; then
    echo "重新开始...."
  fi
done
[2012-10-01 00:02:20] cpu=0 mem=8.6 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 00:17:21] cpu=2 mem=8.5 netstat_p=13 netstat_t=43 netstat_a=452 lsof_p=436 lsof_a=6414
[2012-10-01 00:32:23] cpu=0 mem=8.5 netstat_p=13 netstat_t=43 netstat_a=452 lsof_p=436 lsof_a=6414
[2012-10-01 00:47:29] cpu=0 mem=8.5 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 01:02:31] cpu=0 mem=8.5 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6348
[2012-10-01 01:17:32] cpu=0 mem=8.5 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6348
[2012-10-01 01:32:33] cpu=0 mem=8.5 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6347
[2012-10-01 01:47:35] cpu=0 mem=8.5 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 02:02:36] cpu=0 mem=8.5 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 02:17:37] cpu=0 mem=8.5 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 02:32:38] cpu=0 mem=8.5 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 02:47:40] cpu=0 mem=8.5 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 03:02:41] cpu=0 mem=8.5 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 03:17:42] cpu=0 mem=8.5 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 03:32:44] cpu=0 mem=8.4 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 03:47:45] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 04:02:46] cpu=0 mem=8.4 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6346
[2012-10-01 04:17:47] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 04:32:49] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 04:47:50] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 05:02:51] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 05:17:52] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 05:32:54] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 05:47:55] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 06:02:56] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 06:17:58] cpu=0 mem=8.4 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 06:32:59] cpu=0 mem=8.4 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 06:48:00] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 07:03:02] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 07:18:08] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 07:33:09] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 07:48:10] cpu=0 mem=8.4 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 08:03:11] cpu=0 mem=8.4 netstat_p=13 netstat_t=40 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 08:18:13] cpu=0 mem=8.3 netstat_p=12 netstat_t=40 netstat_a=449 lsof_p=435 lsof_a=6345
[2012-10-01 08:33:14] cpu=0 mem=8.3 netstat_p=12 netstat_t=40 netstat_a=449 lsof_p=435 lsof_a=6345
[2012-10-01 08:48:15] cpu=0 mem=8.3 netstat_p=12 netstat_t=40 netstat_a=449 lsof_p=435 lsof_a=6345
[2012-10-01 09:03:16] cpu=0 mem=8.3 netstat_p=12 netstat_t=40 netstat_a=449 lsof_p=435 lsof_a=6345
[2012-10-01 09:18:17] cpu=0 mem=8.3 netstat_p=12 netstat_t=41 netstat_a=450 lsof_p=435 lsof_a=6378
[2012-10-01 09:33:18] cpu=0 mem=8.3 netstat_p=12 netstat_t=41 netstat_a=450 lsof_p=435 lsof_a=6379
[2012-10-01 09:48:20] cpu=0 mem=8.3 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 10:03:22] cpu=0 mem=8.3 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 10:18:23] cpu=0 mem=8.3 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 10:33:24] cpu=0 mem=8.3 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 10:48:26] cpu=0 mem=8.2 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 11:03:27] cpu=0 mem=8.2 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 11:18:33] cpu=0 mem=8.2 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 11:33:35] cpu=0 mem=8.2 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 11:48:36] cpu=0 mem=8.2 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 12:03:37] cpu=0 mem=8.2 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 12:18:39] cpu=0 mem=8.2 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 12:33:40] cpu=0 mem=8.2 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 12:48:42] cpu=0 mem=8.2 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 13:03:43] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 13:18:44] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 13:33:46] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 13:48:47] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 14:03:49] cpu=0 mem=8.1 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 14:18:50] cpu=0 mem=8.1 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 14:33:52] cpu=0 mem=8.1 netstat_p=13 netstat_t=43 netstat_a=452 lsof_p=436 lsof_a=6346
[2012-10-01 14:48:53] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 15:03:54] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 15:18:56] cpu=0 mem=8.1 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 15:33:57] cpu=0 mem=8.1 netstat_p=13 netstat_t=47 netstat_a=456 lsof_p=436 lsof_a=6554
[2012-10-01 15:48:59] cpu=0 mem=8.2 netstat_p=13 netstat_t=47 netstat_a=456 lsof_p=436 lsof_a=6554
[2012-10-01 16:04:00] cpu=0 mem=8.2 netstat_p=13 netstat_t=40 netstat_a=449 lsof_p=436 lsof_a=6346
[2012-10-01 16:19:01] cpu=0 mem=8.3 netstat_p=13 netstat_t=40 netstat_a=449 lsof_p=436 lsof_a=6346
[2012-10-01 16:34:03] cpu=0 mem=8.3 netstat_p=13 netstat_t=40 netstat_a=449 lsof_p=436 lsof_a=6346
[2012-10-01 16:49:04] cpu=0 mem=8.3 netstat_p=13 netstat_t=40 netstat_a=449 lsof_p=436 lsof_a=6346
[2012-10-01 17:04:06] cpu=0 mem=8.3 netstat_p=13 netstat_t=40 netstat_a=449 lsof_p=436 lsof_a=6346
[2012-10-01 17:19:07] cpu=0 mem=8.3 netstat_p=13 netstat_t=40 netstat_a=449 lsof_p=436 lsof_a=6346
[2012-10-01 17:34:08] cpu=0 mem=8.3 netstat_p=13 netstat_t=40 netstat_a=449 lsof_p=436 lsof_a=6346
[2012-10-01 17:49:10] cpu=0 mem=8.3 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 18:04:11] cpu=0 mem=8.3 netstat_p=13 netstat_t=40 netstat_a=449 lsof_p=436 lsof_a=6346
[2012-10-01 18:19:13] cpu=0 mem=8.2 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6380
[2012-10-01 18:34:14] cpu=0 mem=8.2 netstat_p=13 netstat_t=40 netstat_a=450 lsof_p=436 lsof_a=6380
[2012-10-01 18:49:20] cpu=0 mem=8.2 netstat_p=12 netstat_t=39 netstat_a=448 lsof_p=435 lsof_a=6345
[2012-10-01 19:04:22] cpu=0 mem=8.2 netstat_p=12 netstat_t=39 netstat_a=448 lsof_p=435 lsof_a=6345
[2012-10-01 19:19:23] cpu=0 mem=8.2 netstat_p=12 netstat_t=39 netstat_a=448 lsof_p=435 lsof_a=6345
[2012-10-01 19:34:25] cpu=0 mem=8.2 netstat_p=12 netstat_t=39 netstat_a=448 lsof_p=435 lsof_a=6345
[2012-10-01 19:49:26] cpu=0 mem=8.2 netstat_p=12 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 20:04:28] cpu=0 mem=8.2 netstat_p=13 netstat_t=40 netstat_a=449 lsof_p=436 lsof_a=6346
[2012-10-01 20:19:29] cpu=0 mem=8.2 netstat_p=14 netstat_t=41 netstat_a=450 lsof_p=437 lsof_a=6347
[2012-10-01 20:34:36] cpu=2 mem=8.1 netstat_p=13 netstat_t=40 netstat_a=449 lsof_p=436 lsof_a=6346
[2012-10-01 20:49:37] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 21:04:39] cpu=0 mem=8.1 netstat_p=13 netstat_t=43 netstat_a=452 lsof_p=436 lsof_a=6380
[2012-10-01 21:19:45] cpu=0 mem=8.1 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6380
[2012-10-01 21:34:47] cpu=0 mem=8.1 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6346
[2012-10-01 21:49:48] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 22:04:49] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 22:19:51] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 22:34:53] cpu=0 mem=8.1 netstat_p=13 netstat_t=42 netstat_a=451 lsof_p=436 lsof_a=6346
[2012-10-01 22:49:54] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 23:04:56] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 23:19:57] cpu=0 mem=8.1 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 23:35:00] cpu=0 mem=8.0 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
[2012-10-01 23:50:01] cpu=0 mem=8.0 netstat_p=13 netstat_t=41 netstat_a=450 lsof_p=436 lsof_a=6346
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment