#!/bin/bash
set -o pipefail
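##
## Replays Stacks blocks in parallel tmux windows and reports the results.
## Flags (parsed at the bottom of this script):
##   -t|--terminal   print progress to the terminal
##   -u|--upload     upload results to s3 when the replay completes
##   -n|--network    network chainstate to replay (default: mainnet)
##   -b|--branch     stacks-core branch to build (default: develop)
##   -r|--reserved   number of cores to reserve for other processes (default: 10)
##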
NETWORK="mainnet" | |
REPO_DIR="$HOME/stacks-inspect" | |
SCRATCH_DIR="$HOME/scratch" | |
TIMESTAMP=$(date +%Y-%m-%d-%s) # use a simple date format year-month-day-epoch | |
LOG_DIR="/tmp/replay_${TIMESTAMP}" | |
SLICE_DIR="${SCRATCH_DIR}/slice" | |
TMUX_SESSION="replay" | |
TERM_OUT=false | |
UPLOAD=false ## default to not upload to s3 | |
BRANCH="develop" | |
S3_BUCKET="stacks-replay" # public s3 bucket to upload results to | |
S3_ROOT_FOLDER="results" | |
## retrieve number of CORES | |
CORES=$(cat /proc/cpuinfo | grep processor | wc -l) | |
## reserve this many CORES for other processes as default | |
RESERVED=10 | |
COLRED=$'\033[31m' # Red | |
COLGREEN=$'\033[32m' # Green | |
COLYELLOW=$'\033[33m' # Yellow | |
COLBOLD=$'\033[1m' # Bold Text | |
COLRESET=$'\033[0m' # reset color/formatting | |
install_cargo() {
    ## verify that cargo is installed in the expected path, not only $PATH
    command -v "$HOME/.cargo/bin/cargo" >/dev/null 2>&1 || {
        echo "Installing Rust via rustup"
        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y || {
            echo "${COLRED}Error${COLRESET} installing Rust"
            exit 1
        }
    }
    echo "Exporting $HOME/.cargo/env"
    source "$HOME/.cargo/env"
    return 0
}
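## Note: per the closing notes below, the branch under test is reported to need
## cargo < 1.80, while rustup installs the latest stable toolchain by default.
## A minimal sketch of pinning the toolchain (the exact version is taken from
## those notes and is not verified here):
##   "$HOME/.cargo/bin/rustup" toolchain install 1.79.0
##   "$HOME/.cargo/bin/rustup" default 1.79.0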
build_stacks_inspect() {
    if [ -d "${REPO_DIR}" ]; then
        echo "Found ${COLYELLOW}${REPO_DIR}${COLRESET}. Checking out ${COLGREEN}${BRANCH}${COLRESET} and resetting to ${COLBOLD}HEAD${COLRESET}"
        cd "${REPO_DIR}" && git checkout "${BRANCH}" && git reset --hard HEAD || {
            echo "${COLRED}Error${COLRESET} checking out ${BRANCH}"
            exit 1
        }
    else
        git clone https://github.com/stacks-network/stacks-core --single-branch --branch "${BRANCH}" "${REPO_DIR}" && cd "${REPO_DIR}" || {
            echo "${COLRED}Error${COLRESET} cloning https://github.com/stacks-network/stacks-core into ${REPO_DIR}"
            exit 1
        }
    fi
    ## build stacks-inspect to: $HOME/stacks-inspect/target/release/stacks-inspect
    cargo build --bin=stacks-inspect --release || {
        echo "${COLRED}Error${COLRESET} building stacks-inspect binary"
        exit 1
    }
}
configure_replay_slices() {
    if [ -d "$HOME/scratch" ]; then
        echo "Deleting existing scratch dir: ${COLYELLOW}$HOME/scratch${COLRESET}"
        rm -rf "$HOME/scratch" || {
            echo "${COLRED}Error${COLRESET} deleting dir $HOME/scratch"
            exit 1
        }
    fi
    echo "Creating scratch and slice dirs"
    mkdir -p "${SLICE_DIR}0" && cd "${SCRATCH_DIR}" || {
        echo "${COLRED}Error${COLRESET} creating dir ${SLICE_DIR}0"
        exit 1
    }
    echo "Downloading latest ${NETWORK} chainstate archive ${COLYELLOW}https://archive.hiro.so/${NETWORK}/stacks-blockchain/${NETWORK}-stacks-blockchain-latest.tar.gz${COLRESET}"
    curl -L --proto '=https' --tlsv1.2 "https://archive.hiro.so/${NETWORK}/stacks-blockchain/${NETWORK}-stacks-blockchain-latest.tar.gz" -o "${SCRATCH_DIR}/${NETWORK}-stacks-blockchain-latest.tar.gz" || {
        echo "${COLRED}Error${COLRESET} downloading latest ${NETWORK} chainstate archive"
        exit 1
    }
    echo "Extracting downloaded archive: ${COLYELLOW}${SCRATCH_DIR}/${NETWORK}-stacks-blockchain-latest.tar.gz${COLRESET}"
    tar --strip-components=1 -xzf "${SCRATCH_DIR}/${NETWORK}-stacks-blockchain-latest.tar.gz" -C "${SLICE_DIR}0" || {
        echo "${COLRED}Error${COLRESET} extracting ${NETWORK} chainstate archive"
        exit 1
    }
    echo "Moving marf database: ${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs -> ${COLYELLOW}${SCRATCH_DIR}/marf.sqlite.blobs${COLRESET}"
    mv "${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs" "${SCRATCH_DIR}/"
    echo "Symlinking marf database: ${SCRATCH_DIR}/marf.sqlite.blobs -> ${COLYELLOW}${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs${COLRESET}"
    ln -s "${SCRATCH_DIR}/marf.sqlite.blobs" "${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs" || {
        echo "${COLRED}Error${COLRESET} creating symlink: ${SCRATCH_DIR}/marf.sqlite.blobs -> ${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs"
        exit 1
    }
    ## we'll need ~53GB per slice (for 16 slices, this is 848GB)
    ## with the chainstate archive and linked file, total space required is roughly 1.2 TB
    ## create (<number of CORES> - <number of RESERVED cores>) copies of the linked db,
    ## decrementing an additional "1" since we already have ${SLICE_DIR}0
    for ((i = 1; i <= CORES - RESERVED - 1; i++)); do
        echo "Copying ${SLICE_DIR}0 -> ${COLYELLOW}${SLICE_DIR}${i}${COLRESET}"
        cp -R "${SLICE_DIR}0" "${SLICE_DIR}${i}" || {
            echo "${COLRED}Error${COLRESET} copying ${SLICE_DIR}0 -> ${SLICE_DIR}${i}"
            exit 1
        }
    done
}
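## Example sizing with the defaults above on a 24-core host: slices = 24 - 10 = 14,
## so slice0 plus 13 copies at ~53GB each is ~742GB, plus the downloaded archive and
## the shared marf.sqlite.blobs -- hence the ~1.25 TB budget noted at the end of this gist.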
start_replay() {
    ## if there are existing results, remove the log folder
    if [ -f "${LOG_DIR}/results.log" ]; then
        rm -rf "${LOG_DIR}"
    fi
    ## create LOG_DIR to store output files
    if [ ! -d "${LOG_DIR}" ]; then
        mkdir -p "${LOG_DIR}"
    fi
    ## if the tmux session "replay" exists, kill it and start anew
    if tmux list-windows -t ${TMUX_SESSION} &> /dev/null; then
        tmux kill-session -t ${TMUX_SESSION} &> /dev/null
    fi
    local slice_counter=0
    ## create a tmux session named ${TMUX_SESSION} with a window named slice0
    tmux new-session -d -s ${TMUX_SESSION} -n slice${slice_counter} || {
        echo "${COLRED}Error${COLRESET} creating tmux session ${COLYELLOW}${TMUX_SESSION}${COLRESET}"
        exit 1
    }
    if [ ! -f "${SLICE_DIR}0/chainstate/vm/index.sqlite" ]; then
        echo "${COLRED}Error${COLRESET}: chainstate db not found (${SLICE_DIR}0/chainstate/vm/index.sqlite)"
        exit 1
    fi
    ## get the total number of non-orphaned blocks in the db
    local total_blocks=$(echo "select count(*) from staging_blocks where orphaned = 0" | sqlite3 "${SLICE_DIR}0/chainstate/vm/index.sqlite")
    local starting_block=0 # for the block counter, start at this block
    ##
    ## 2.5 epoch block query using: $HOME/scratch/slice0/chainstate/vm/index.sqlite
    ## echo 'with subq AS (select index_block_hash, RANK() OVER (ORDER BY index_block_hash ASC) rnk FROM staging_blocks where orphaned = 0) SELECT index_block_hash, rnk FROM subq WHERE index_block_hash = "de441baf2232417fae38dd0590e547842aac755a1aff60fa01ccedecaf78692d";' | sqlite3 $HOME/scratch/slice0/chainstate/vm/index.sqlite
    # local total_blocks=154000 ## for testing 2.5 epoch at 153106
    # local starting_block=152000 ## for testing 2.5 epoch at 153106
    ##
    local block_diff=$((total_blocks - starting_block))
    local slices=$((CORES - RESERVED)) # how many slices to run
    local slice_blocks=$((block_diff / slices)) # how many blocks per slice
    echo "Total blocks: ${COLYELLOW}${total_blocks}${COLRESET}"
    echo "Total slices: ${COLYELLOW}${slices}${COLRESET}"
    echo "Blocks per slice: ${COLYELLOW}${slice_blocks}${COLRESET}"
    echo "Block diff: ${COLYELLOW}${block_diff}${COLRESET}"
    local end_block_count=$starting_block
    while [[ ${end_block_count} -lt ${total_blocks} ]]; do
        local start_block_count=$end_block_count
        end_block_count=$((end_block_count + slice_blocks))
        ## the last slice picks up any remainder from the integer division
        if [ "$end_block_count" -gt "$total_blocks" ] || [ "$slice_counter" -eq "$((slices - 1))" ]; then
            end_block_count=$total_blocks
        fi
        if [ ${slice_counter} -gt 0 ]; then
            tmux new-window -t ${TMUX_SESSION} -d -n slice${slice_counter} || {
                echo "${COLRED}Error${COLRESET} creating tmux window ${COLYELLOW}slice${slice_counter}${COLRESET}"
                exit 1
            }
        fi
        local cmd="${REPO_DIR}/target/release/stacks-inspect replay-block ${SLICE_DIR}${slice_counter} index-range $start_block_count $end_block_count 2>/dev/null"
        local log="| tee -a ${LOG_DIR}/slice${slice_counter}.log"
        echo " Creating tmux window: ${COLGREEN}${TMUX_SESSION}:slice${slice_counter}${COLRESET} :: Blocks: ${COLYELLOW}${start_block_count}-${end_block_count}${COLRESET}"
        echo "Command: ${cmd}" > ${LOG_DIR}/slice${slice_counter}.log # log the command being run for the slice
        echo "Replaying indexed blocks: ${start_block_count}-${end_block_count} (out of ${total_blocks})" >> ${LOG_DIR}/slice${slice_counter}.log
        tmux send-keys -t ${TMUX_SESSION}:slice${slice_counter} "${cmd}${log}" Enter || {
            echo "${COLRED}Error${COLRESET} sending replay command to tmux window ${COLYELLOW}slice${slice_counter}${COLRESET}"
            exit 1
        }
        ## log the replay's return code (PIPESTATUS[0] survives the pipe to tee) as the last line
        tmux send-keys -t ${TMUX_SESSION}:slice${slice_counter} "echo \${PIPESTATUS[0]} >> ${LOG_DIR}/slice${slice_counter}.log" Enter || {
            echo "${COLRED}Error${COLRESET} sending return status command to tmux window ${COLYELLOW}slice${slice_counter}${COLRESET}"
            exit 1
        }
        slice_counter=$((slice_counter + 1))
    done
}
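## The replay windows run detached; to watch one interactively (assuming the
## default session name above):
##   tmux attach-session -t replay
## then use Ctrl-b n / Ctrl-b p to cycle between the slice windows.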
check_progress() {
    # give the pids a few seconds to show up in the process table
    local sleep_duration=5
    while [ $sleep_duration -gt 0 ]; do
        ${TERM_OUT} && printf "Sleeping ... \b [ ${COLYELLOW}${sleep_duration}${COLRESET} ] \033[0K\r"
        sleep_duration=$((sleep_duration-1))
        sleep 1
    done
    local progress=1
    local sp="/-\|" # spinner characters
    echo "************************************************************************"
    echo "Checking Block Replay status"
    # ${TERM_OUT} && echo "Tmux windows (${COLYELLOW}${TMUX_SESSION}${COLRESET}):"
    # ${TERM_OUT} && tmux list-windows -t ${TMUX_SESSION} -F 'window: #{window_name} | command: #{pane_current_command}'
    echo -e ' '
    ## poll until no stacks-inspect processes remain
    while true; do
        local count=$(pgrep -c "stacks-inspect")
        if [ $count -gt 0 ]; then
            ${TERM_OUT} && printf "Block replay processes are currently active [ ${COLYELLOW}${COLBOLD}${count}${COLRESET} ] ... \b${sp:progress++%${#sp}:1} \033[0K\r"
            sleep 1 # avoid pegging a core while polling
        else
            ${TERM_OUT} && printf "\r\n"
            break
        fi
    done
    echo "************************************************************************"
}
store_results() {
    ## text file to store results
    local results="${LOG_DIR}/results.log"
    ## html file to store results
    local results_html="${LOG_DIR}/results.html"
    ## retrieve the current date in UTC
    local cur_date=$(date -u)
    echo "Results: ${COLYELLOW}${results}${COLRESET}"
    cd "${LOG_DIR}"
    local failed=0
    local return_code=0
    ## retrieve the count of all lines with `Failed processing block` across the slice logs
    local failure_count=$(grep -rc "Failed processing block" slice*.log | awk -F: '$NF >= 0 {x+=$NF; $NF=""} END{print x}')
    if [ ${failure_count} -gt 0 ]; then
        echo "Failures: ${COLRED}${failure_count}${COLRESET}"
    else
        echo "Failures: ${COLGREEN}${failure_count}${COLRESET}"
    fi
    echo "Failures: ${failure_count}" > $results
    ## check the return codes (logged as the last line of each slice log) to see if we had a panic
    for file in slice*.log; do
        return_code=$(tail -1 $file)
        case ${return_code} in
            0)
                # block replay ran successfully
                ;;
            1)
                # block replay had some block failures
                failed=1
                ;;
            *)
                # any other return code likely indicates a panic
                failed=1
                echo "$file return code: $return_code" >> $results # ok to continue if this write fails
                ;;
        esac
    done
    ## store the results as HTML:
    cat <<EOF > "${results_html}"
<body>
  <style>
    @import url('https://fonts.googleapis.com/css2?family=Source+Code+Pro:ital,wght@0,200..900;1,200..900&display=swap');
    .container {
      border: 1px outset black;
      padding: 5px;
      border-radius: 5px;
      background-color: #eae9e8;
    }
    .fail {
      background-color: #ffffff;
      border: 1px outset black;
      border-radius: 5px;
      font-weight: 350;
    }
    .pass {
      background-color: #eae9e8;
    }
    .result {
      text-align: left;
      padding-left: 10px;
      padding-top: 10px;
      padding-bottom: 10px;
      margin: 5px;
    }
    body {
      font-family: "Source Code Pro", monospace;
      font-optical-sizing: auto;
      font-style: normal;
    }
  </style>
  <h2>${cur_date}</h2>
  <hr/>
  <h2>Failures: ${failure_count}</h2>
  <div class="container">
EOF
    ## use the $failed var here: if there was a panic, $failure_count may show zero even though the replay was not successful
    if [ "${failed}" == "1" ]; then
        output=$(grep -r -h "Failed processing block" slice*.log)
        IFS=$'\n'
        for line in ${output}; do
            echo "    <div class=\"result fail\">${line}</div>" >> ${results_html} || {
                echo "${COLRED}Error${COLRESET} writing failure to: ${results_html}"
            }
            echo "${line}" >> $results || {
                echo "${COLRED}Error${COLRESET} writing failure to: ${results}"
            }
        done
    else
        echo "    <div class=\"result\">Test Passed</div>" >> ${results_html}
    fi
    echo "  </div>" >> ${results_html}
    echo "</body>" >> ${results_html}
}
upload_results() {
    ## upload the results folder to s3, named for the timestamp portion (year-month-day-epoch) of LOG_DIR
    local s3_folder_name=$(basename $LOG_DIR | cut -f2 -d "_")
    if [ ! -d "${LOG_DIR}" ]; then
        echo "${COLRED}Error${COLRESET} - logdir (${COLYELLOW}${LOG_DIR}${COLRESET}) is missing."
        echo "${COLYELLOW}Skipping logs upload${COLRESET}"
    else
        if aws s3 ls s3://${S3_BUCKET}/${S3_ROOT_FOLDER} > /dev/null 2>&1; then
            ## upload the LOG_DIR results folder
            aws s3 cp --recursive --cache-control 'no-cache' --content-type text/plain --metadata-directive REPLACE ${LOG_DIR} s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name} || {
                echo "${COLRED}Error${COLRESET} uploading ${LOG_DIR} folder to ${COLYELLOW}s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name}${COLRESET}"
            }
            ## re-upload LOG_DIR/results.html with its content-type set to text/html
            aws s3 cp --cache-control 'no-cache' --content-type text/html --metadata-directive REPLACE ${LOG_DIR}/results.html s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name}/results.html || {
                echo "${COLRED}Error${COLRESET} re-uploading ${LOG_DIR}/results.html file to ${COLYELLOW}s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name}/results.html${COLRESET}"
            }
            ## re-upload LOG_DIR/results.html to the bucket root as latest.html
            aws s3 cp --cache-control 'no-cache' --content-type text/html --metadata-directive REPLACE ${LOG_DIR}/results.html s3://${S3_BUCKET}/latest.html || {
                echo "${COLRED}Error${COLRESET} re-uploading ${LOG_DIR}/results.html file to ${COLYELLOW}s3://${S3_BUCKET}/latest.html${COLRESET}"
            }
        fi
    fi
}
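## Resulting bucket layout, assuming the defaults above:
##   s3://stacks-replay/results/<year-month-day-epoch>/slice*.log
##   s3://stacks-replay/results/<year-month-day-epoch>/results.log
##   s3://stacks-replay/results/<year-month-day-epoch>/results.html
##   s3://stacks-replay/latest.html  (copy of the most recent results.html)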
## install missing dependencies
for cmd in curl tmux git wget tar gzip grep cargo pgrep aws; do
    command -v "${cmd}" >/dev/null 2>&1 || {
        case "${cmd}" in
            "cargo")
                install_cargo
                continue # cargo is installed via rustup, not apt
                ;;
            "pgrep")
                package="procps"
                ;;
            "aws")
                package="awscli"
                ;;
            *)
                package="${cmd}"
                ;;
        esac
        sudo apt-get update && sudo apt-get install -y ${package} || {
            echo "${COLRED}Error${COLRESET} installing $package"
            exit 1
        }
    }
done
while [ ${#} -gt 0 ]; do
    case ${1} in
        -t|--terminal)
            TERM_OUT=true
            ;;
        -u|--upload)
            ## required if uploading results
            UPLOAD=true
            ;;
        -n|--network)
            ## required if not mainnet
            if [ "${2}" == "" ]; then
                echo "Missing required value for ${1}"
                exit 1
            fi
            NETWORK=${2}
            shift
            ;;
        ## build from a specific branch
        -b|--branch)
            if [ "${2}" == "" ]; then
                echo "Missing required value for ${1}"
                exit 1
            fi
            BRANCH=${2}
            shift
            ;;
        -r|--reserved)
            if [ "${2}" == "" ]; then
                echo "Missing required value for ${1}"
                exit 1
            fi
            if ! [[ "$2" =~ ^[0-9]+$ ]]; then
                echo "ERROR: arg ($2) is not a number." >&2
                exit 1
            fi
            RESERVED=${2}
            shift
            ;;
    esac
    shift
done
echo "Replay Started: ${COLYELLOW}$(date)${COLRESET}"
build_stacks_inspect ## disable for testing
configure_replay_slices ## disable for testing
start_replay
check_progress
store_results
${UPLOAD} && upload_results ## only upload results if the -u arg is supplied
echo "Replay finished: $(date)"
~1.25 TB of disk is needed when 14 cores each run a replay slice.
By default, 10 cores are reserved from the total pool (e.g. with 24 cores, 14 cores will run replay tasks).
Time taken with 10 cores running is ~12 hours.
Note: the branch being used here requires cargo < 1.80, so installing 1.79 is hardcoded for now.
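As a quick pre-flight check, here is a minimal sketch of the sizing arithmetic the script applies; the ~53GB-per-slice figure comes from the comments in configure_replay_slices and is an estimate, not a guarantee:

    #!/bin/bash
    ## sketch: estimate slice count and disk usage for this host
    CORES=$(grep -c ^processor /proc/cpuinfo)
    RESERVED=10 # default reservation; match your -r/--reserved value
    SLICES=$((CORES - RESERVED)) # one replay task per remaining core
    echo "replay slices: ${SLICES}"
    echo "slice copies: ~$((SLICES * 53))GB at ~53GB per slice"
    echo "plus the chainstate archive and marf.sqlite.blobs -> ~1.25TB total for 14 slices"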