To run this in the background, and detach the process from your current shell:
$ GREP_ARGS=my-query OUTPUT=datadog-s3-log-scan.txt BUCKET=my-s3-bucket ./datadog-s3-log-scan.sh >stdout 2>stderr &
$ disown
#!/bin/bash
#
# Scan Datadog log archives stored in S3 and append matching lines to a file.
#
# Lists s3://$BUCKET/datadog/logs/, then each day prefix under it, then each
# hour prefix under each day. Every archive found is downloaded into ./tmp,
# decompressed with gzip and filtered through grep; selected lines are
# appended to $OUTPUT. Progress goes to stdout, diagnostics go to stderr.
#
# Required environment variables:
#   GREP_ARGS  Arguments handed to grep. Split on whitespace, so it may carry
#              options as well as the pattern (for example: "-i error").
#              Pathname expansion is NOT applied to it.
#   OUTPUT     File that matching log lines are appended to.
#   BUCKET     Name of the S3 bucket that holds the archives.
#
# To be prompted for the settings instead, uncomment:
# read -p "What should I grep for? " GREP_ARGS
# read -p "Where should I write filtered logs to? " OUTPUT
# read -p "What is the name of your S3 bucket? " BUCKET

# A listing pipeline must report failure when `aws` fails, not only when awk does.
set -o pipefail

# Print a warning on stderr and carry on.
warn() { printf 'warning: %s\n' "$*" >&2; }

# Print an error on stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

: "${GREP_ARGS:?must be set to the grep arguments (for example: my-query)}"
: "${OUTPUT:?must be set to the file that receives the filtered logs}"
: "${BUCKET:?must be set to the name of the S3 bucket}"

command -v aws >/dev/null || die "the aws CLI was not found in PATH"

readonly TMP_DIR="tmp"
readonly LOGS_ROOT="s3://${BUCKET}/datadog/logs/"

# Split GREP_ARGS into separate words without glob expansion. `read -d ''`
# consumes the whole value (embedded newlines included) and returns non-zero
# at end of input, hence the `|| true`.
GREP_ARGV=()
IFS=$' \t\n' read -r -d '' -a GREP_ARGV <<<"$GREP_ARGS" || true
((${#GREP_ARGV[@]} > 0)) || die "GREP_ARGS contains only whitespace"

#######################################
# Print one whitespace-separated field of every line of an `aws s3 ls` listing.
# Arguments: $1 - S3 URI to list
#            $2 - 1-based field number to print
# Outputs:   the selected field, one per line (may include empty lines)
# Returns:   non-zero if either `aws` or `awk` fails
#######################################
s3_ls_field() {
  local uri=$1 field=$2
  aws s3 ls "$uri" | awk -v field="$field" '{ print $field }'
}

# Rewrite a day prefix that contains YYYYMMDD as MM/DD/YYYY for progress output.
format_day() {
  printf '%s\n' "$1" | sed -E 's/.*([0-9]{4})([0-9]{2})([0-9]{2}).*/\2\/\3\/\1/g'
}

# Rewrite an hour prefix that contains a two-digit hour as HH:00.
format_hour() {
  printf '%s\n' "$1" | sed -E 's/.*([0-9]{2}).*/\1:00/g'
}

#######################################
# Download one archive, append its matching lines to $OUTPUT and report the
# size of $OUTPUT.
# Globals:   TMP_DIR, OUTPUT, GREP_ARGV (read)
# Arguments: $1 - full S3 URI of the archive
#            $2 - archive file name, used for the local copy
# Returns:   0 always; failures are reported as warnings so the scan goes on
#######################################
scan_archive() {
  local s3_path=$1 archive=$2
  local local_path="${TMP_DIR}/${archive}"
  local -a stage_status
  local output_size

  if ! aws s3 cp "$s3_path" "$local_path" >/dev/null; then
    warn "could not download ${s3_path}; skipping it"
    rm -f -- "$local_path"
    return 0
  fi

  # Stream the decompressed data straight into grep; reading the archive from
  # stdin keeps gzip from caring about the file name suffix.
  gzip -dc <"$local_path" | grep "${GREP_ARGV[@]}" >>"$OUTPUT"
  stage_status=("${PIPESTATUS[@]}")
  rm -f -- "$local_path"

  # 141 is SIGPIPE: grep stopped reading early (e.g. -m), which is not an error.
  if ((stage_status[0] != 0 && stage_status[0] != 141)); then
    warn "could not fully decompress ${s3_path}; its matches may be incomplete"
  fi
  # grep exits 1 when nothing matched; only statuses above 1 are real errors.
  if ((stage_status[1] > 1)); then
    warn "grep failed on ${s3_path}"
  fi

  output_size=$(ls -lh -- "$OUTPUT" | awk '{ print $5 }')
  printf '%s - %s\n' "$OUTPUT" "$output_size"
  return 0
}

#######################################
# Scan every archive listed under one hour prefix.
# Arguments: $1 - day prefix, $2 - hour prefix (both exactly as listed by
#            `aws s3 ls`; they are concatenated without adding separators),
#            $3 - formatted day for progress output
#######################################
scan_hour() {
  local day=$1 hour=$2 pretty_day=$3
  local hour_uri="${LOGS_ROOT}${day}${hour}"
  local archives archive

  echo "Scanning logs from $pretty_day $(format_hour "$hour")."

  # Field 4 of an object line is the object name.
  archives=$(s3_ls_field "$hour_uri" 4) ||
    warn "listing ${hour_uri} failed or returned nothing"

  # Read the list on fd 3 so commands in the loop body keep the script's stdin.
  while IFS= read -r -u 3 archive; do
    [[ -n "$archive" ]] || continue
    scan_archive "${hour_uri}${archive}" "$archive"
  done 3<<<"$archives"
}

#######################################
# Scan every hour prefix listed under one day prefix.
# Arguments: $1 - day prefix exactly as listed by `aws s3 ls`
#######################################
scan_day() {
  local day=$1
  local pretty_day hours hour

  pretty_day=$(format_day "$day")

  # Field 2 of a prefix line ("PRE name/") is the prefix name.
  hours=$(s3_ls_field "${LOGS_ROOT}${day}" 2) ||
    warn "listing ${LOGS_ROOT}${day} failed or returned nothing"

  while IFS= read -r -u 3 hour; do
    [[ -n "$hour" ]] || continue
    scan_hour "$day" "$hour" "$pretty_day"
  done 3<<<"$hours"
}

# Entry point: print the configuration, reset ./tmp and walk every day prefix.
main() {
  local days day num_days

  echo "Configuration:"
  echo "GREP_ARGS=$GREP_ARGS"
  echo "OUTPUT=$OUTPUT"
  echo "BUCKET=$BUCKET"

  echo "Removing temporary storage directory: ./tmp"
  rm -rf -- "$TMP_DIR"
  mkdir -p -- "$TMP_DIR" || die "could not create ./${TMP_DIR}"

  days=$(s3_ls_field "$LOGS_ROOT" 2) ||
    warn "listing ${LOGS_ROOT} failed or returned nothing"

  # Count the non-empty entries; grep still prints 0 when there are none.
  num_days=$(grep -c . <<<"$days")
  echo "Scanning archived logs for $num_days days."

  while IFS= read -r -u 3 day; do
    [[ -n "$day" ]] || continue
    scan_day "$day"
  done 3<<<"$days"
}

main "$@"