@Wildcarde
Last active May 4, 2021 05:13
Check NFS script. This is designed to run as a cron job on a system. It retrieves a list of all NFS mounts from the `mount` command and verifies that each is still mounted and not stale. If a mount is hung, it issues a lazy unmount and then attempts a remount. Activity is logged to syslog and is grepable via the tag `CHECKNFS`.

Notes on using parallel pcap captures and automatic termination of the capture job on the Isilon.

On Isilon nodes:

Run the capstart.sh command to begin rotating parallel tcpdump captures on all nodes with external interfaces.

On computation node:

  • The scripts check-nfs.sh and check-nfs-datacapture.sh must be loaded onto the node and stored in /root to be used via cron.
  • Configure an SSH key on the node that can reach a specific node on the storage cluster as root (required to terminate the server-side tcpdump processes by calling capstop.sh).
  • Make sure the file /tmp/nfsissuecaptured does not exist.
  • Run `echo 0 > /tmp/nfsmonitor.semaphore` to reset the event notification from previous captures.
  • Configure check-nfs.sh to run every 10 minutes via cron.
  • Configure check-nfs-datacapture.sh to run every 2 minutes so that it catches events quickly.
  • Launch a new tcpdump capture with `tcpdump -w /tmp/pcaps/$(date '+%Y-%m-%dT%H.%M.%S').$(hostname).em1.pcap -i em1 -s 640 -C 250 -W 100 -Z root`
    • It is recommended to launch this in a tmux session on the individual node, with additional panes running:
    • `tail -F /var/log/messages | grep CAPTURE`
    • `tail -F /var/log/cron -n 40`
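The two cron cadences above could be expressed as root crontab entries along these lines (the /root paths follow the note above and are an assumption; adjust to wherever the scripts actually live):

```
# every 10 minutes: verify NFS mounts and flag hangs (hypothetical path)
*/10 * * * * /root/check-nfs.sh
# every 2 minutes: react quickly once an event has been flagged
*/2 * * * * /root/check-nfs-datacapture.sh
```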
# start rotating captures on the external-facing nodes (9-13):
isi_for_array -n9-13 'nohup /bin/bash /ifs/data/Isilon_Support/pcaps/capture_control.sh > /ifs/data/Isilon_Support/pcaps/logfile 2>&1 &'
# stop the captures:
isi_for_array -n9-13 killall -SIGINT capture_control.sh
isi_for_array -n9-13 killall -SIGINT tcpdump
sleep 2
# verify nothing is left running:
isi_for_array -s ps auxw | egrep "capture_control|tcpdump"
#!/bin/bash
# rotating tcpdump on an Isilon node (this appears to be the capture_control.sh
# invoked above): -s 640 caps each packet at 640 bytes, -C 250 rotates the
# output file at roughly 250 MB, and -W 100 keeps at most 100 files.
tcpdump -w /ifs/data/Isilon_Support/pcaps/$(hostname)/$(date '+%Y-%m-%dT%H.%M.%S').$(hostname).bxe1.pcap -i bxe1 -s 640 -C 250 -W 100
#!/usr/bin/env bash
#check-nfs-datacapture.sh: this script handles a few things:
# 1) stop the tcpdump capture running on the node
# 2) clone pcap output to the sink admin folder
# 3) remote into bucket and terminate the pcap and related processes
#also:
## reaper program to remove old stat block captures from isiperf; check if
#they are older than 5 hours and, if so, remove them.
#files will be located in /ifs/data/Isilon_Support/corecollect; this program
#should call the reaper if it's still searching for problems and skip it if the run has finished.
#kill commands for tcpdump
#check the value written by the nfsmonitor script
read -r errorcode 2>/dev/null < /tmp/nfsmonitor.semaphore
#once the issue has been captured, set this flag so the script doesn't keep attempting to run
if [ -f /tmp/nfsissuecaptured ]; then
    #logger "CAPTUREPCAP: Issue Captured Already"
    exit 0
fi
#if empty or something other than 1, quit and do nothing
#the initial file has no value in it, so errorcode is actually null
if [ -z "$errorcode" ]; then
    #logger "CAPTUREPCAP: No Issue Found"
    exit 0
fi
#if the capture state has been reset by echoing 0 into the semaphore, this applies
if [ "$errorcode" != "1" ]; then
    #logger "CAPTUREPCAP: No Issue Found"
    exit 0
fi
logger "CAPTUREPCAP: Issue discovered, capturing"
#kill all tcpdump tasks
killall -SIGINT tcpdump
sleep 2 #make sure the kill finishes
#turn off this script for the future
touch /tmp/nfsissuecaptured
#ssh into the bucket node and stop its tcpdump
ssh 10.2.147.221 'bash /ifs/data/Isilon_Support/pcaps/capstop.sh'
#store the pcaps in our working area so they can be sent to Dell
rsync /tmp/pcaps/* /mnt/bucket/PNI-facilities/sw/gmcgrath/work/pcaps
exit 0
## notes and references.
#killall -SIGINT tcpdump
#sleep 2
#ps auxw |egrep "capture_control|tcpdump"
#tcpdump to run on the compute node
#tcpdump -w /tmp/pcaps/$(date '+%Y-%m-%dT%H.%M.%S').$(hostname).em1.pcap -i em1 -s 640 -C 250 -W 100 -Z root
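Once an issue has been captured, the script above disarms itself via its two state files; a sketch of re-arming the monitor for the next event, using the same paths as the script:

```shell
# re-arm the monitor after a capture has been collected
rm -f /tmp/nfsissuecaptured         # let check-nfs-datacapture.sh act again
echo 0 > /tmp/nfsmonitor.semaphore  # clear the "issue found" flag
```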
#!/bin/bash
# original script this was built off of: https://gist.github.com/cinsk/840ed553905cb6e8f0ae
PATH=/bin:/usr/bin:/usr/local/bin

check-nfs () {
    #this temp file holds the pid of the orphan shell that needs to be cleaned up
    local TMPFILE=/tmp/checknfs.$$ RET=0 ORPHAN SUBSHELLPID
    #arg checks
    if [ "$#" -eq 0 ]; then
        cat<<EOF
usage: check-nfs NFS-DIRECTORY...
Check if accessing any of NFS-DIRECTORY failed
EOF
        return 1
    fi
    while [ -n "$1" ]; do
        #record the subshell's pid, then stat the path; if the NFS server is
        #hung, stat blocks and read -t35 times out with a status above 128
        read -t35 < <(echo $BASHPID >"$TMPFILE"; stat -t "$1" 2>/dev/null)
        if [ "$?" -gt 128 ]; then
            #echo "error: $1"
            ORPHAN=$(cat "$TMPFILE")
            SUBSHELLPID=$(ps --ppid "$ORPHAN" -o pid=)
            [ -n "$SUBSHELLPID" ] && kill -9 $SUBSHELLPID
            kill -9 "$ORPHAN"
            RET=1
            #mount is hung: notify and flag the event for the capture script
            logger "CHECKNFS: $1 hung; attempting to fix"
            echo 1 > /tmp/nfsmonitor.semaphore
            umount -l "$1"
            sleep 1
            mount "$1"
            if [ "$?" -eq 0 ]; then
                logger "CHECKNFS: $1 fixed"
                #only print success notification if it works
            fi
        fi
        shift
        rm -f "$TMPFILE"
    done
    return "$RET"
}
#parse the mount command's output and check every NFS mount with the check-nfs function
#(the mount point is the third whitespace-separated field of each mount line;
#NFSv4 mounts report type nfs4, so ask for both types)
while read -r _ _ mountpoint _; do
    check-nfs "$mountpoint"
done < <(mount -t nfs,nfs4)
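The driver loop above depends on the layout of `mount` output; a minimal sketch of that field split, using an illustrative sample line:

```shell
# `mount -t nfs` prints one line per mount, e.g.:
#   server:/export on /mnt/data type nfs (rw,relatime)
# so the mount point is the third whitespace-separated field.
line="nfshost:/export/data on /mnt/data type nfs (rw,relatime,vers=3)"  # sample line
read -r _ _ mountpoint _ <<< "$line"
echo "$mountpoint"  # → /mnt/data
```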