Last active
April 22, 2022 09:23
-
-
Save kris-sigur/359344361ce4063034fb20b25714af50 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Endpoint that receives the concatenated CDX lines via HTTP POST.
# May be overridden from the environment; the default is the original
# hard-coded local endpoint.
OUTBACK_INDEX_URL=${OUTBACK_INDEX_URL:-'http://localhost:8080/lbs?badLines=skip'}
# FUNCTIONS
# Tests whether a value is present in a list.
# Arguments: $1 - value to look for; $2.. - list elements
# Returns:   0 if some element equals $1 exactly (literal string comparison,
#            no glob matching), 1 otherwise (including an empty list)
containsElement () {
  local e
  for e in "${@:2}"; do
    [[ "$e" == "$1" ]] && return 0
  done
  return 1
}
# MAIN BODY
#
# Usage: <script> CDX_BASE_DIR ALREADY_INCLUDED
#   CDX_BASE_DIR      directory searched recursively for *.cdx.gz files
#   ALREADY_INCLUDED  text file with one entry per line naming files that have
#                     already been submitted; successfully submitted files are
#                     appended (by basename) and the file is re-sorted at the end
#
# Files whose path contains any line of ALREADY_INCLUDED as a substring are
# skipped. The rest are decompressed and POSTed to OUTBACK_INDEX_URL in
# batches; a batch is sent as soon as its accumulated compressed size exceeds
# BATCH_SIZE bytes.

if (( $# < 2 )); then
  printf 'Usage: %s CDX_BASE_DIR ALREADY_INCLUDED\n' "${0##*/}" >&2
  exit 2
fi
: "${OUTBACK_INDEX_URL:?OUTBACK_INDEX_URL must be set to the index endpoint}"

CDX_BASE_DIR=$1
ALREADY_INCLUDED=$2
export LC_ALL=C

if [[ ! -d "$CDX_BASE_DIR" ]]; then
  printf 'error: %s is not a directory\n' "$CDX_BASE_DIR" >&2
  exit 1
fi
if [[ ! -f "$ALREADY_INCLUDED" ]]; then
  printf 'error: %s does not exist (create an empty file for a first run)\n' \
    "$ALREADY_INCLUDED" >&2
  exit 1
fi

# Prints the paths of all *.cdx.gz files under CDX_BASE_DIR that do not match
# any line of ALREADY_INCLUDED, one per line.
listNewCdxFiles() {
  if [[ -s "$ALREADY_INCLUDED" ]]; then
    find "$CDX_BASE_DIR" -type f -regex '.*\.cdx\.gz$' \
      | grep -Fv -f "$ALREADY_INCLUDED"
  else
    # Nothing recorded yet: skip grep so the result does not depend on how a
    # given grep treats an empty pattern file.
    find "$CDX_BASE_DIR" -type f -regex '.*\.cdx\.gz$'
  fi
}

# Decompresses the given files, POSTs the concatenated output to the index and
# records their basenames in ALREADY_INCLUDED.
# Arguments: $@ - paths of gzipped CDX files forming one batch
# Returns:   0 on success; 1 if zcat or curl failed, in which case nothing is
#            recorded so the files are picked up again on the next run
submitBatch() {
  local batchItem
  local -a stageStatus
  # --fail makes curl exit non-zero on an HTTP error response, so a rejected
  # upload is not mistaken for success.
  zcat -- "$@" | curl --fail -X POST --data-binary @- "$OUTBACK_INDEX_URL"
  stageStatus=("${PIPESTATUS[@]}")
  if (( stageStatus[0] != 0 || stageStatus[1] != 0 )); then
    printf 'error: batch failed (zcat=%s, curl=%s); files not recorded\n' \
      "${stageStatus[0]}" "${stageStatus[1]}" >&2
    return 1
  fi
  for batchItem in "$@"; do
    echo "Done ${batchItem##*/}"
    printf '%s\n' "${batchItem##*/}" >> "$ALREADY_INCLUDED"
  done
}

# Collect the candidate files.
cdxFiles=()
while IFS= read -r file; do
  cdxFiles+=("$file")
done < <(listNewCdxFiles)
fileCount=${#cdxFiles[@]}

echo "Found $fileCount gzipped and pre-sorted cdx files, not in $ALREADY_INCLUDED."
if (( fileCount == 0 )); then
  exit 0
fi

START_TIME=$SECONDS
BATCH_SIZE=5242880
failedBatches=0
acc=()
accSize=0

for item in "${cdxFiles[@]}"; do
  if ! itemSize=$(wc -c <"$item"); then
    # The file disappeared or became unreadable since the find; skip it.
    printf 'warning: cannot read %s, skipping\n' "$item" >&2
    continue
  fi
  acc+=("$item")
  accSize=$(( accSize + itemSize ))
  if (( accSize > BATCH_SIZE )); then
    echo "Adding ${#acc[@]} files in this batch, a total of $accSize compressed bytes"
    submitBatch "${acc[@]}" || failedBatches=$(( failedBatches + 1 ))
    # Reset the accumulator for the next batch.
    acc=()
    accSize=0
  fi
done

# Finish off whatever is left in the accumulator.
if (( ${#acc[@]} > 0 )); then
  echo "Adding ${#acc[@]} files in final batch, a total of $accSize compressed bytes"
  submitBatch "${acc[@]}" || failedBatches=$(( failedBatches + 1 ))
fi

echo "Added files in $(( SECONDS - START_TIME )) seconds"

echo "Resorting $ALREADY_INCLUDED"
# sort -o may name one of its own input files as the output, so no temporary
# file is needed.
if ! sort -o "$ALREADY_INCLUDED" "$ALREADY_INCLUDED"; then
  printf 'error: failed to sort %s\n' "$ALREADY_INCLUDED" >&2
  exit 1
fi
echo "Done"

if (( failedBatches > 0 )); then
  printf 'error: %s batch(es) failed and will be retried on the next run\n' \
    "$failedBatches" >&2
  exit 1
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment