Skip to content

Instantly share code, notes, and snippets.

@kris-sigur
Last active April 22, 2022 09:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kris-sigur/359344361ce4063034fb20b25714af50 to your computer and use it in GitHub Desktop.
Save kris-sigur/359344361ce4063034fb20b25714af50 to your computer and use it in GitHub Desktop.
#!/bin/bash
# URL that every batch of decompressed CDX data is POSTed to (see the curl
# calls further down). May be overridden by setting OUTBACK_INDEX_URL in the
# environment before running the script; when it is unset or empty the
# original hard-coded endpoint is used.
OUTBACK_INDEX_URL="${OUTBACK_INDEX_URL:-http://localhost:8080/lbs?badLines=skip}"
# FUNCTIONS
# containsElement NEEDLE [CANDIDATE...]
# Returns 0 when NEEDLE is string-equal to at least one CANDIDATE, 1 otherwise
# (including when no candidates are given). The comparison is literal: the
# right-hand side is quoted, so glob characters in NEEDLE are not patterns.
containsElement () {
  local needle="${1-}"
  local pos
  # Candidates are positional parameters 2..$#; ${!pos} reads parameter number pos.
  for (( pos = 2; pos <= $#; pos++ )); do
    if [[ "${!pos}" == "$needle" ]]; then
      return 0
    fi
  done
  return 1
}
# MAIN BODY
# Usage: <script> CDX_BASE_DIR ALREADY_INCLUDED
#   CDX_BASE_DIR      directory searched recursively for *.cdx.gz files
#   ALREADY_INCLUDED  text file listing, one per line, the files that have
#                     already been added to the index
if (( $# < 2 )); then
  echo "Usage: ${0##*/} CDX_BASE_DIR ALREADY_INCLUDED" >&2
  exit 2
fi
CDX_BASE_DIR=$1
TMP_BASE='/tmp/'
ALREADY_INCLUDED=$2
# Use the C locale for every tool started from here on (grep, sort, ...).
export LC_ALL=C
TMP_FILE="$TMP_BASE/tmp"

if [[ ! -d "$CDX_BASE_DIR" ]]; then
  echo "ERROR: '$CDX_BASE_DIR' is not a directory" >&2
  exit 1
fi
# grep -f fails on a missing pattern file, which would make a first run report
# zero new files. Start from an empty list instead.
if [[ ! -e "$ALREADY_INCLUDED" ]]; then
  : > "$ALREADY_INCLUDED" || exit 1
fi

# Find all the *.cdx.gz files under CDX_BASE_DIR whose path does not contain
# any line of ALREADY_INCLUDED as a fixed substring. Blank lines are dropped
# from the list first: an empty fixed-string pattern matches every path and
# would silently exclude all files.
# The results are read straight into the array, so no shared temp file in /tmp
# is needed; IFS= and -r keep whitespace and backslashes in paths intact.
cdxFiles=()
while IFS= read -r file; do
  cdxFiles+=("$file")
done < <(
  find "$CDX_BASE_DIR" -type f -regex ".*\.cdx\.gz$" \
    | grep -Fv -f <(grep -v '^$' "$ALREADY_INCLUDED")
)
fileCount=${#cdxFiles[@]}
echo "Found $fileCount gzipped and pre-sorted cdx files, not in $ALREADY_INCLUDED.";
# Nothing new to index: stop before the batching and re-sorting steps.
if (( fileCount == 0 )); then
  exit 0
fi
# submitBatch FILE...
# Decompresses all FILEs into a single stream and POSTs it to
# $OUTBACK_INDEX_URL. Only when both the decompression and the upload succeed
# are the files' basenames appended to $ALREADY_INCLUDED; otherwise nothing is
# recorded, so the files are picked up again on the next run.
# Returns 0 on success, 1 on failure.
submitBatch () {
  local batchItem
  # The subshell keeps pipefail local to this pipeline. With it, a zcat error
  # (e.g. a corrupt archive) is not hidden by curl's exit status, and --fail
  # makes curl report HTTP error responses as a non-zero exit status.
  if ! ( set -o pipefail
         zcat "$@" | curl --fail -X POST --data-binary @- "$OUTBACK_INDEX_URL" ); then
    echo "ERROR: indexing failed; no file of this batch was recorded in $ALREADY_INCLUDED" >&2
    return 1
  fi
  for batchItem in "$@"; do
    echo "Done ${batchItem##*/}"
    printf '%s\n' "${batchItem##*/}" >> "$ALREADY_INCLUDED"
  done
}

# indexInBatches FILE...
# Walks the FILEs in order, accumulating them until their combined compressed
# size exceeds $BATCH_SIZE bytes, then submits the accumulated files as one
# batch (so a batch overshoots the limit by at most its last file). Whatever
# is left over at the end is submitted as a final batch.
# Returns 1 as soon as a batch fails, 0 otherwise.
indexInBatches () {
  local -a acc=()
  local accSize=0
  local item itemSize
  for item in "$@"; do
    itemSize=$(wc -c <"$item")
    accSize=$(( accSize + itemSize ))
    acc+=("$item")
    if (( accSize > BATCH_SIZE )); then
      # Add current batch to index
      echo "Adding ${#acc[@]} files in this batch, a total of $accSize compressed bytes"
      submitBatch "${acc[@]}" || return 1
      # Cleanup for next batch
      acc=()
      accSize=0
    fi
  done
  # Finish off whatever is in accumulator
  if (( ${#acc[@]} > 0 )); then
    echo "Adding ${#acc[@]} files in final batch, a total of $accSize compressed bytes"
    submitBatch "${acc[@]}" || return 1
  fi
  return 0
}

START_TIME=$SECONDS
# Compressed-size threshold per batch, in bytes (5 * 1024 * 1024).
BATCH_SIZE=5242880
indexInBatches "${cdxFiles[@]}" || exit 1
echo "Added files in $(($SECONDS - $START_TIME)) seconds";
# Re-sort the list of already indexed files in place.
echo "Resorting $ALREADY_INCLUDED";
# sort -o may name one of its own input files as the output, so no
# intermediate file under a fixed /tmp name is needed. If sort fails the
# script stops here instead of moving a truncated result over the list.
if ! sort -o "$ALREADY_INCLUDED" -- "$ALREADY_INCLUDED"; then
  echo "ERROR: could not sort $ALREADY_INCLUDED" >&2
  exit 1
fi
echo "Done";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment