Last active
November 17, 2021 10:47
-
-
Save prasanthj/f4e5420759f4abb62a936f47731c8de9 to your computer and use it in GitHub Desktop.
Debugging hadoop/tez/llap shuffle issues
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PDSH usage
# Extra ssh options pdsh appends to every connection: quiet mode, no host-key
# prompt, no known_hosts bookkeeping, and authentication with the given key.
export PDSH_SSH_ARGS_APPEND="-q -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ~/.ssh/private.key"
# A leading '^' tells pdsh to read the target hosts from the named file. The
# argument is quoted so no shell ever treats the caret specially. With no
# remote command given, pdsh should drop into its interactive prompt.
pdsh -R ssh -w '^slaves.txt'
# parallel-ssh usage
# Print the LlapDaemon PID (first field of the jps output) on every host in
# slaves.txt. The remote command is a single double-quoted string, so the grep
# pattern is left bare: nested double quotes would end that string early.
parallel-ssh -x "-oStrictHostKeyChecking=no -i ~/.ssh/private.key" -i -h slaves.txt "sudo -u root jps | grep LlapDaemon | cut -f1 -d' '"
# Remote heap dump
# Add the following contents to jcmds.sh and use parallel-scp to copy the script to all hosts
#!/bin/bash
# For each LlapDaemon JVM on this host: record heap usage, take a live-object
# histogram, then record heap usage again. Results go to /tmp, one set of files
# per PID. The before/after file names reflect that -histo:live forces a GC.
LLAP_PIDS=$(pgrep -f LlapDaemon) || echo "jcmds.sh: no LlapDaemon process found on $(hostname)" >&2
# pgrep prints one numeric PID per line, so plain word splitting is safe here.
for LLAP_PID in $LLAP_PIDS; do
  sudo -u root jmap -heap "$LLAP_PID" > "/tmp/jmap-heap-before-gc-${LLAP_PID}.txt"
  # NOTE(review): the histogram runs as yarn while -heap runs as root; this
  # presumably matches the user that owns the daemon JVM -- confirm per cluster.
  sudo -u yarn jmap -histo:live "$LLAP_PID" > "/tmp/jmap-histo-live-${LLAP_PID}.txt"
  sudo -u root jmap -heap "$LLAP_PID" > "/tmp/jmap-heap-after-gc-${LLAP_PID}.txt"
done
# Then execute the script on every host using
parallel-ssh -x "-oStrictHostKeyChecking=no -i ~/.ssh/private.key" -i -h slaves.txt "./jcmds.sh"
# Look for TCP connection issues
# The regexes are quoted so the shell cannot glob-expand them against files in
# the current directory; grep -c prints the match count directly.
# 15551 is presumably the LLAP shuffle port -- adjust to the configured port.
dmesg | grep -c 'SYN.*15551'
# Kernel log lines mentioning SYN on any port.
dmesg | grep -c 'SYN'
# Kernel log lines mentioning (SYN) cookies.
dmesg | grep cookies
# Protocol counters that mention overflows and failed connection attempts.
netstat -s | grep "overflow"
netstat -s | grep "failed c"
# Kernel settings to avoid shuffle connection drops
# Applied to the running kernel only; nothing here is written to disk.
llap_sysctls=(
  "net.core.somaxconn=16384"                 # upper bound on the listen() backlog
  "net.core.netdev_max_backlog=16384"        # packets queued per CPU on the receive side
  "net.ipv4.ip_local_port_range=4096 60999"  # range used for outgoing (ephemeral) ports
  "net.ipv4.tcp_fin_timeout=10"              # seconds an orphaned socket stays in FIN_WAIT_2
  "vm.swappiness=0"                          # avoid swapping as far as possible
  "vm.dirty_ratio=80"                        # dirty-page percentage before writers are throttled
)
for llap_sysctl in "${llap_sysctls[@]}"; do
  sudo sysctl -w "$llap_sysctl"
done
# Persist the LLAP kernel settings in both /etc/sysctl.conf and a sysctl.d
# drop-in. tee -a appends, so running this twice duplicates the entries.
# sysctl configuration files take plain "key=value" lines without shell
# quoting, so the port range is written bare: literal double quotes would
# become part of the value and be rejected when the file is loaded.
sudo tee -a /etc/sysctl.conf /etc/sysctl.d/50-hive-llap.conf <<'EOF'
# Apache Hive LLAP config changes
net.core.somaxconn=16384
net.core.netdev_max_backlog=16384
net.ipv4.ip_local_port_range=4096 60999
net.ipv4.tcp_fin_timeout=10
vm.swappiness=0
vm.dirty_ratio=80
EOF
# Re-apply every sysctl configuration file at boot via rc.local.
# NOTE(review): if /etc/rc.local already ends with "exit 0", lines appended
# after it are never reached -- check the file after running this.
sudo tee -a /etc/rc.local <<'EOF'
# sysctl changes for LLAP are not getting effect on system restart
sysctl --system
EOF
# Ulimit changes for shuffle handler
# Raise the open-file limit ("-" sets both soft and hard) for the hive and
# yarn users. The drop-in is named z-llap.conf, presumably so that it is read
# after hive.conf and therefore wins. tee -a appends on every run.
sudo tee -a /etc/security/limits.d/z-llap.conf <<'EOF'
# Apache Hive LLAP config changes
# /etc/security/limits.d/hive.conf overrides the system defaults which has higher limits
hive - nofile 128000
yarn - nofile 128000
EOF
# Disable THP (transparent huge pages)
# Writes "never" to both sysfs knobs. This changes the running kernel only;
# these commands do not persist the setting across a reboot.
for thp_knob in enabled defrag; do
  echo never | sudo tee "/sys/kernel/mm/transparent_hugepage/${thp_knob}"
done
# Look up JMX or /proc files
# Open file-descriptor count as reported by the JMX endpoint on port 15002 of
# this host.
curl -s "http://$(hostname):15002/jmx" | grep LlapDaemonOpenFileDescriptorCount
# List the open descriptors of every LlapDaemon process and keep a copy in
# /tmp/q50-fd.txt. Looping per PID keeps the /proc path valid when pgrep
# matches more than one process (PIDs are numeric, so word splitting is safe).
for llap_pid in $(pgrep -f LlapDaemon); do
  sudo ls -l "/proc/${llap_pid}/fd"
done | tee /tmp/q50-fd.txt
# Find read errors and where they happened
# Unique values of one field from the "read error" lines in the AM log: the
# 4th ':'-separated field, then its 2nd space-separated word.
# NOTE(review): which value that is depends on the am.log line layout -- confirm.
grep "read error" am.log | cut -f4 -d':' | cut -f2 -d' ' | sort -u
# For the given task attempts, take the HISTORY lines carrying a containerId,
# keep the text after the last '=' (rev | cut | rev), trim it at the first ':'
# and de-duplicate.
grep -E "attempt_1499327402359_0003_1_02_000145_0|attempt_1499327402359_0003_1_02_000249_0" am.log \
  | grep "containerId" \
  | grep "HISTORY" \
  | rev | cut -f1 -d'=' | rev \
  | cut -f1 -d':' \
  | sort -u
# TFile parser Pig commands (entered at the Pig "grunt>" prompt, not in a shell)
# Looking for shuffle stalls (Status.WAIT log line in task logs)
grunt> register '/tmp/tez-tfile-parser-0.9.0-SNAPSHOT.jar'
grunt> a = LOAD '/app-logs/hive/logs/application_1499327402359_0004/' using org.apache.tez.tools.TFileLoader() AS (machine:chararray,key:chararray, line:chararray);
grunt> b = filter a by (line matches '.*WAIT.*');
grunt> c = limit b 100;
grunt> dump c;
# Java Flight Recorder JVM options
-XX:+UnlockCommercialFeatures
-XX:+FlightRecorder
-XX:FlightRecorderOptions=defaultrecording=true,dumponexit=true,dumponexitpath=/tmp/jmcdumps,disk=true,repository=/tmp/jmcdumps
-Dcom.sun.management.jmxremote.port=7091 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment