#!/bin/bash
set -o pipefail
NETWORK="mainnet"
REPO_DIR="$HOME/stacks-inspect"
SCRATCH_DIR="$HOME/scratch"
TIMESTAMP=$(date +%Y-%m-%d-%s) # use a simple date format year-month-day-epoch
LOG_DIR="/tmp/replay_${TIMESTAMP}"
SLICE_DIR="${SCRATCH_DIR}/slice"
TMUX_SESSION="replay"
TERM_OUT=false
UPLOAD=false ## default to not upload to s3
BRANCH="develop"
S3_BUCKET="stacks-replay" # public s3 bucket to upload results to
S3_ROOT_FOLDER="results"
## retrieve number of CORES
CORES=$(grep -c ^processor /proc/cpuinfo)
## reserve this many CORES for other processes as default
RESERVED=10
COLRED=$'\033[31m' # Red
COLGREEN=$'\033[32m' # Green
COLYELLOW=$'\033[33m' # Yellow
COLBOLD=$'\033[1m' # Bold Text
COLRESET=$'\033[0m' # reset color/formatting
install_cargo() {
## verify that cargo is installed in the expected path, not only $PATH
command -v "$HOME/.cargo/bin/cargo" >/dev/null 2>&1 || {
echo "Installing Rust via rustup"
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y || {
echo "${COLRED}Error${COLRESET} installing Rust"
exit 1
}
}
echo "Exporting $HOME/.cargo/env"
source "$HOME/.cargo/env"
return 0
}
build_stacks_inspect() {
if [ -d ${REPO_DIR} ];then
echo "Found ${COLYELLOW}${REPO_DIR}${COLRESET}. checking out ${COLGREEN}${BRANCH}${COLRESET} and resetting to ${COLBOLD}HEAD${COLRESET}"
cd ${REPO_DIR} && git checkout $BRANCH && git reset --hard HEAD || {
echo "${COLRED}Error${COLRESET} checking out ${BRANCH}"
exit 1
}
else
git clone https://github.com/stacks-network/stacks-core --single-branch --branch ${BRANCH} ${REPO_DIR} && cd ${REPO_DIR} || {
echo "${COLRED}Error${COLRESET} cloning https://github.com/stacks-network/stacks-core into ${REPO_DIR}"
exit 1
}
fi
## build stacks-inspect to: $HOME/stacks-inspect/target/release/stacks-inspect
cargo build --bin=stacks-inspect --release || {
echo "${COLRED}Error${COLRESET} building stacks-inspect binary"
exit 1
}
}
configure_replay_slices() {
if [ -d "$HOME/scratch" ]; then
echo "Deleting existing scratch dir: ${COLYELLOW}$HOME/scratch${COLRESET}"
rm -rf $HOME/scratch || {
echo "${COLRED}Error${COLRESET} deleting dir $HOME/scratch"
exit 1
}
fi
echo "Creating scratch and slice dirs"
mkdir -p ${SLICE_DIR}0 && cd ${SCRATCH_DIR} || {
echo "${COLRED}Error${COLRESET} creating dir ${SLICE_DIR}"
exit 1
}
echo "Downloading latest ${NETWORK} chainstate archive ${COLYELLOW}https://archive.hiro.so/${NETWORK}/stacks-blockchain/${NETWORK}-stacks-blockchain-latest.tar.gz${COLRESET}"
curl -L --proto '=https' --tlsv1.2 https://archive.hiro.so/${NETWORK}/stacks-blockchain/${NETWORK}-stacks-blockchain-latest.tar.gz -o ${SCRATCH_DIR}/${NETWORK}-stacks-blockchain-latest.tar.gz || {
echo "${COLRED}Error${COLRESET} downlaoding latest ${NETWORK} chainstate archive"
exit 1
}
echo "Extracting downloaded archive: ${COLYELLOW}${SCRATCH_DIR}/${NETWORK}-stacks-blockchain-latest.tar.gz${COLRESET}"
tar --strip-components=1 -xzf ${SCRATCH_DIR}/${NETWORK}-stacks-blockchain-latest.tar.gz -C ${SLICE_DIR}0 || {
echo "${COLRED}Error${COLRESET} extracting ${NETWORK} chainstate archive"
exit 1
}
echo "Moving marf database: ${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs -> ${COLYELLOW}${SCRATCH_DIR}/marf.sqlite.blobs${COLRESET}"
mv ${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs ${SCRATCH_DIR}/ || {
echo "${COLRED}Error${COLRESET} moving marf database to ${SCRATCH_DIR}/"
exit 1
}
echo "Symlinking marf database: ${SCRATCH_DIR}/marf.sqlite.blobs -> ${COLYELLOW}${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs${COLRESET}"
ln -s ${SCRATCH_DIR}/marf.sqlite.blobs ${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs || {
echo "${COLRED}Error${COLRESET} creating symlink: ${SCRATCH_DIR}/marf.sqlite.blobs -> ${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs"
exit 1
}
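## the marf.sqlite.blobs file is shared across all slices via this symlink rather than
## duplicated per slice, presumably safe because the blobs are only read during replay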
## we'll need ~53GB per slice (for 16 slices, this is 848GB)
## with the chainstate archive and linked file, total space required is roughly: 1.2 TB
## create a copy of the slice dir for each worker: <number of CORES> - <number of RESERVED CORES>
## decrement additional "1" since we already have ${SLICE_DIR}0
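## worked example with the defaults above: CORES=24, RESERVED=10 -> the loop runs i=1..13,
## yielding 14 slices total (slice0 plus 13 copies) at ~53GB each, on top of the
## downloaded archive and the single shared marf.sqlite.blobs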
for ((i=1;i<=$(expr "$CORES" - "$RESERVED" - 1);i++)); do
echo "Copying ${SLICE_DIR}0 -> ${COLYELLOW}${SLICE_DIR}${i}${COLRESET}"
cp -R "${SLICE_DIR}0" "${SLICE_DIR}${i}" || {
echo "${COLRED}Error${COLRESET} copying ${SLICE_DIR}0 -> ${SLICE_DIR}${i}"
exit 1
}
done
}
start_replay() {
## if results from a previous run exist, remove the old log dir
if [ -f "${LOG_DIR}/results.log" ];then
rm -rf "${LOG_DIR}"
fi
## create LOG_DIR to store output files
if [ ! -d "${LOG_DIR}" ]; then
mkdir -p "${LOG_DIR}"
fi
## if tmux session "replay" exists, kill it and start anew
if tmux list-windows -t ${TMUX_SESSION} &> /dev/null; then
tmux kill-session -t ${TMUX_SESSION} &> /dev/null
fi
local slice_counter=0
## create tmux session named ${TMUX_SESSION} with a window named slice0
tmux new-session -d -s ${TMUX_SESSION} -n slice${slice_counter} || {
echo "${COLRED}Error${COLRESET} creating tmux session ${COLYELLOW}${TMUX_SESSION}${COLRESET}"
exit 1
}
if [ ! -f "${SLICE_DIR}0/chainstate/vm/index.sqlite" ]; then
echo "${COLRED}Error${COLRESET}: chainstate db not found (${SLICE_DIR}0/chainstate/vm/index.sqlite)"
exit 1
fi
## get the total number of non-orphaned blocks in the db
local total_blocks=$(echo "select count(*) from staging_blocks where orphaned = 0" | sqlite3 ${SLICE_DIR}0/chainstate/vm/index.sqlite)
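## e.g. to check the count by hand:
##   sqlite3 $HOME/scratch/slice0/chainstate/vm/index.sqlite "select count(*) from staging_blocks where orphaned = 0"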
local starting_block=0 # for the block counter, start at this block
##
## 2.5 epoch block query using: $HOME/scratch/slice0/chainstate/vm/index.sqlite
## echo 'with subq AS (select index_block_hash, RANK() OVER (ORDER BY index_block_hash ASC) rnk FROM staging_blocks where orphaned = 0) SELECT index_block_hash, rnk FROM subq WHERE index_block_hash = "de441baf2232417fae38dd0590e547842aac755a1aff60fa01ccedecaf78692d";' | sqlite3 $HOME/scratch/slice0/chainstate/vm/index.sqlite
# local total_blocks=154000 ## for testing 2.5 epoch at 153106
# local starting_block=152000 ## for testing 2.5 epoch at 153106
##
local block_diff=$((total_blocks - starting_block))
local slices=$((CORES - RESERVED)) # how many slices to run
local slice_blocks=$((block_diff / slices)) # how many blocks per slice
echo "Total blocks: ${COLYELLOW}${total_blocks}${COLRESET}"
echo "Total slices: ${COLYELLOW}${slices}${COLRESET}"
echo "Blocks per slice: ${COLYELLOW}${slice_blocks}${COLRESET}"
echo "Block diff: ${COLYELLOW}$block_diff${COLRESET}"
local end_block_count=$starting_block
while [[ ${end_block_count} -lt ${total_blocks} ]]; do
local start_block_count=$end_block_count
end_block_count=$((end_block_count + slice_blocks))
if [ $end_block_count -gt $total_blocks ] || [ "$slice_counter" -eq $((slices - 1)) ]; then
end_block_count=$total_blocks
fi
if [ ${slice_counter} -gt 0 ];then
tmux new-window -t replay -d -n slice${slice_counter} || {
echo "${COLRED}Error${COLRESET} creating tmux window ${COLYELLOW}slice${slice_counter}${COLRESET}"
exit 1
}
fi
local cmd="${REPO_DIR}/target/release/stacks-inspect replay-block ${SLICE_DIR}${slice_counter} index-range $start_block_count $end_block_count 2>/dev/null"
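## e.g. for slice 3 this expands to (illustrative block range):
##   $HOME/stacks-inspect/target/release/stacks-inspect replay-block $HOME/scratch/slice3 index-range 30000 40000 2>/dev/null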
local log="| tee -a ${LOG_DIR}/slice${slice_counter}.log"
echo " Creating tmux window: ${COLGREEN}replay:slice${slice_counter}${COLRESET} :: Blocks: ${COLYELLOW}${start_block_count}-${end_block_count}${COLRESET}"
echo "Command: ${cmd}" > ${LOG_DIR}/slice${slice_counter}.log # log the command being run for the slice
echo "Replaying indexed blocks: ${start_block_count}-${end_block_count} (out of ${total_blocks})" >> ${LOG_DIR}/slice${slice_counter}.log
tmux send-keys -t ${TMUX_SESSION}:slice${slice_counter} "${cmd}${log}" Enter || {
echo "${COLRED}Error${COLRESET} sending replay command to tmux window ${COLYELLOW}slice${slice_counter}${COLRESET}"
exit 1
}
## log the exit status of stacks-inspect as the last line (PIPESTATUS[0] is the replay binary's code, not tee's)
tmux send-keys -t ${TMUX_SESSION}:slice${slice_counter} "echo \${PIPESTATUS[0]} >> ${LOG_DIR}/slice${slice_counter}.log" Enter || {
echo "${COLRED}Error${COLRESET} sending return status command to tmux window ${COLYELLOW}slice${slice_counter}${COLRESET}"
exit 1
}
slice_counter=$((slice_counter + 1))
done
}
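## to watch an individual slice while the replay runs, attach to the tmux session, e.g.:
##   tmux attach-session -t replay   # Ctrl-b n / Ctrl-b p cycles through the slice windows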
check_progress() {
# give the pids a few seconds to show up in process table
local sleep_duration=5
while [ $sleep_duration -gt 0 ]; do
${TERM_OUT} && printf "Sleeping ... \b [ ${COLYELLOW}${sleep_duration}${COLRESET} ] \033[0K\r"
sleep_duration=$((sleep_duration-1))
sleep 1
done
local progress=1
local sp="/-\|"
echo "************************************************************************"
echo "Checking Block Replay status"
# ${TERM_OUT} && echo "Tmux windows (${COLYELLOW}${TMUX_SESSION}${COLRESET}):"
# ${TERM_OUT} && tmux list-windows -t ${TMUX_SESSION} -F 'window: #{window_name} | command: #{pane_current_command}'
echo -e ' '
while true; do
local count=$(pgrep -c "stacks-inspect")
if [ $count -gt 0 ]; then
${TERM_OUT} && printf "Block replay processes are currently active [ ${COLYELLOW}${COLBOLD}${count}${COLRESET} ] ... \b${sp:progress++%${#sp}:1} \033[0K\r"
else
${TERM_OUT} && printf "\r\n"
break
fi
done
echo "************************************************************************"
}
store_results() {
## text file to store results
local results="${LOG_DIR}/results.log"
## html file to store results
local results_html="${LOG_DIR}/results.html"
## retrieve current date in UTC
local cur_date=$(date -u)
echo "Results: ${COLYELLOW}${results}${COLRESET}"
cd ${LOG_DIR}
local failed=0;
local return_code=0;
## retrieve the count of all lines with `Failed processing block`
local failure_count=$(grep -rc "Failed processing block" slice*.log | awk -F: '$NF >= 0 {x+=$NF; $NF=""} END{print x}')
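## e.g. grep -rc emits one "<file>:<count>" line per log (slice0.log:2, slice1.log:0, ...);
## awk splits on ":" and sums the final field to get the total failure count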
if [ ${failure_count} -gt 0 ]; then
echo "Failures: ${COLRED}${failure_count}${COLRESET}"
else
echo "Failures: ${COLGREEN}${failure_count}${COLRESET}"
fi
echo "Failures: ${failure_count}" > $results
## check the return codes to see if we had a panic
for file in slice*.log; do
return_code=$(tail -1 $file)
case ${return_code} in
0)
# block replay ran successfully
;;
1)
# block replay had some block failures
failed=1
;;
*)
# return code likely indicates a panic
failed=1
echo "$file return code: $return_code" >> $results # ok to continue if this write fails
;;
esac
# echo "return code: $return_code"
# if [ ${return_code} -gt "0" ];then
# echo "${COLRED}Replay Failure${COLREST} in: ${LOG_DIR}/${file}"
# failed=1
# fi
done
## Store the results as HTML:
cat <<-EOF> "${results_html}"
<body>
<style>
@import url('https://fonts.googleapis.com/css2?family=Source+Code+Pro:ital,wght@0,200..900;1,200..900&display=swap');
.container {
border: 1px outset black;
padding: 5px;
border-radius: 5px;
background-color: #eae9e8;
}
.fail {
background-color: #ffffff;
border: 1px outset black;
border-radius: 5px;
font-weight: 350;
}
.pass {
background-color: #eae9e8;
}
.result {
text-align: left;
padding-left: 10px;
padding-top: 10px;
padding-bottom: 10px;
margin: 5px;
}
body {
font-family: "Source Code Pro", monospace;
font-optical-sizing: auto;
font-style: normal;
}
</style>
<h2>${cur_date}</h2>
<hr/>
<h2>Failures: ${failure_count}</h2>
<div class="container">
EOF
## use the $failed var here: on a panic, $failure_count may show zero even though the replay was not successful
if [ ${failed} == "1" ];then
output=$(grep -r -h "Failed processing block" slice*.log)
IFS=$'\n'
for line in ${output}; do
echo " <div class=\"result fail\">${line}</div>" >> ${results_html} || {
echo "${COLRED}Error${COLRESET} writing failure to: ${$results_html}"
}
echo "${line}" >> $results || {
echo "${COLRED}Error${COLRESET} writing failure to: ${$results}"
}
done
else
echo " <div class=\"result\">Test Passed</div>" >> ${results_html}
fi
echo " </div>" >> ${results_html}
echo "</body>" >> ${results_html}
}
upload_results() {
## upload the results folder to s3 in the format of year-month-day-epoch
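## e.g. a run with TIMESTAMP 2024-08-22-1724300000 (illustrative value) lands at
##   s3://stacks-replay/results/2024-08-22-1724300000/ with results.html mirrored to s3://stacks-replay/latest.html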
local s3_folder_name=$(basename $LOG_DIR | cut -f2 -d "_")
if [ ! -d "${LOG_DIR}" ]; then
echo "${COLRED}Error${COLRESET} - logdir (${COLYELLOW}${LOG_DIR}${COLRESET}) is missing."
echo "${COLYELLOW}Skipping logs upload${COLRESET}"
else
if aws s3 ls s3://${S3_BUCKET}/${S3_ROOT_FOLDER} > /dev/null 2>&1; then
## upload LOG_DIR results folder
aws s3 cp --recursive --cache-control 'no-cache' --content-type text/plain --metadata-directive REPLACE ${LOG_DIR} s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name} || {
echo "${COLRED}Error${COLRESET} Uploading ${LOG_DIR} folder to ${COLYELLOW}s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name}${COLRESET}"
}
## re-upload LOG_DIR/results.html with metadata set to text/html
aws s3 cp --cache-control 'no-cache' --content-type text/html --metadata-directive REPLACE ${LOG_DIR}/results.html s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name}/results.html || {
echo "${COLRED}Error${COLRESET} re-uploading ${LOG_DIR}/results.html file to ${COLYELLOW}s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name}/results.html${COLRESET}"
}
## re-upload LOG_DIR/results.html in bucket root as latest.html
aws s3 cp --cache-control 'no-cache' --content-type text/html --metadata-directive REPLACE ${LOG_DIR}/results.html s3://${S3_BUCKET}/latest.html || {
echo "${COLRED}Error${COLRESET} re-uploading ${LOG_DIR}/results.html file to ${COLYELLOW}s3://${S3_BUCKET}/results/${s3_folder_name}/latest.html${COLRESET}"
}
fi
fi
}
## install missing dependencies
for cmd in curl tmux git wget tar gzip grep cargo pgrep aws; do
command -v "${cmd}" >/dev/null 2>&1 || {
case "${cmd}" in
"cargo")
install_cargo
;;
"pgrep")
package="procps"
;;
"aws")
package="awscli"
;;
*)
package="${cmd}"
;;
esac
sudo apt-get update && sudo apt-get install -y ${package} || {
echo "${COLRED}Error${COLRESET} installing $package"
exit 1
}
}
done
while [ ${#} -gt 0 ]; do
case ${1} in
-t|--terminal)
TERM_OUT=true
;;
-u|--upload)
## required if uploading results
UPLOAD=true
;;
-n|--network)
## required if not mainnet
if [ "${2}" == "" ]; then
echo "Missing required value for ${1}"
fi
NETWORK=${2}
shift
;;
## build from specific branch
-b|--branch)
if [ "${2}" == "" ]; then
echo "Missing required value for ${1}"
fi
BRANCH=${2}
shift
;;
-r|--reserved)
if [ "${2}" == "" ]; then
echo "Missing required value for ${1}"
fi
if ! [[ "$2" =~ ^[0-9]+$ ]]; then
echo "ERROR: arg ($2) is not a number." >&2
exit 1
fi
RESERVED=${2}
shift
;;
esac
shift
done
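## example invocations (filename assumed; adjust to however this script is saved):
##   ./replay.sh -t                   # print progress to the terminal
##   ./replay.sh -u -b develop -r 12  # upload results to s3, build branch develop, reserve 12 cores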
echo "Replay Started: ${COLYELLOW}$(date)${COLRESET}"
build_stacks_inspect ## comment out to skip the build when testing
configure_replay_slices ## comment out to skip slice setup when testing
start_replay
check_progress
store_results
${UPLOAD} && upload_results ## only upload results if -u arg is supplied
echo "Replay finished: $(date)"
wileyj commented Aug 22, 2024

~1.25 TB of disk is needed for 14 CPUs.
By default, 10 CPUs are reserved from the total pool of cores (e.g. with 24 cores, 14 will each run a replay task).
With 10 cores running, the replay takes roughly 12 hours.
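
For a rough pre-flight check of cores and free disk before launching, something like the sketch below works (the ~53GB-per-slice figure comes from the script's own comments; the 500GB headroom constant for the archive and marf blobs is an assumption):

```bash
# rough pre-flight check (assumes a GNU/Linux host with coreutils)
CORES=$(nproc)
SLICES=$((CORES - 10))            # default RESERVED=10
NEEDED_GB=$((SLICES * 53 + 500))  # ~53GB per slice copy + assumed archive/marf headroom
AVAIL_GB=$(df -BG --output=avail "$HOME" | tail -1 | tr -dc '0-9')
echo "slices=${SLICES} need~${NEEDED_GB}GB have=${AVAIL_GB}GB"
```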

Note: the branch used here requires cargo < 1.80, so installing 1.79 is hardcoded for now.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment