Created
September 16, 2020 08:36
-
-
Save mickesv/4de9528e2311064a7a93a8929820dfb3 to your computer and use it in GitHub Desktop.
Clony McCloneface done the Unix way
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Settings for the clone detector. Each one keeps its previous default but can
# now be overridden from the environment, e.g. `CHUNKSIZE=8 EXTENSION=c`.

# Number of consecutive lines that are hashed together as one chunk.
CHUNKSIZE=${CHUNKSIZE:-5}
# Directory that is searched (recursively, via find) for input files.
STARTDIR=${STARTDIR:-.}
# Only files named "*.$EXTENSION" are analysed.
EXTENSION=${EXTENSION:-txt}
# Common prefix of all intermediate files ("$TMPFILE-chunks", "$TMPFILE-sorted", ...).
# NOTE: the default is a fixed, predictable name under /tmp; point TMPFILE at a
# private directory when running on a shared machine.
TMPFILE=${TMPFILE:-/tmp/chunk-tmp}
# File that receives the final clone report.
OUTFILE=${OUTFILE:-./clony-out.txt}
# Extra arguments appended to the find invocation (word-split where used).
FINDFLAGS=${FINDFLAGS:-}
# Helper Functions
# --------------------
# Print a single line of a file.
# Arguments: $1 - 1-based line number
#            $2 - path of the file to read
# Outputs:   the requested line on stdout; nothing when the file has fewer
#            than $1 lines (the old head|tail pipeline returned the last
#            line of the file in that case).
getLine() {
  # awk stops reading as soon as the line has been printed.
  awk -v n="$1" 'NR == n { print; exit }' "$2"
}
# Replace a file with a fresh, empty one.
# Arguments: $1 - path of the file to (re)create
# The path is quoted so that names containing blanks or glob characters
# are treated as a single file.
clearFile() {
  rm -f -- "$1" && touch -- "$1"
}
# Run a command string and report how long it took.
# Arguments: $1 - label printed (followed by a blank and two tabs) before running
#            $2 - command line, executed with eval in the current shell
# Outputs:   label and the command's own output on stdout; the elapsed real
#            time is reported by the shell's `time` keyword.
# Returns:   the exit status of the evaluated command.
# NOTE: $2 is evaluated as shell code; only pass trusted, script-internal strings.
logCommand() {
  # Local, so the timing format does not leak into the rest of the script.
  local TIMEFORMAT='time %3lR'
  printf '%s \t\t' "$1"
  # Quoted: the command string reaches eval unmodified (no word splitting or
  # pathname expansion before evaluation).
  time eval "$2"
}
# Sort "file line hash" records by file name, then numerically by line number.
# Arguments: $1 - file to sort (standard input is read when no file is given)
# Outputs:   sorted records on stdout
# TODO: flagged as buggy by the original author without further detail.
#       Known limitation: fields are blank-separated, so file names that
#       contain blanks are not sorted correctly.
sortByFilename() {
  sort -k1,1 -k2,2n -- "$@"
}
# Chunkification
# --------------------
# Print one chunk of a file: up to CHUNKSIZE lines starting at a given line.
# Globals:   CHUNKSIZE (read)
# Arguments: $1 - 1-based number of the first line of the chunk
#            $2 - path of the file to read
# Outputs:   the chunk on stdout (shorter than CHUNKSIZE near the end of file)
getChunk() {
  tail -n "+$1" -- "$2" | head -n "$CHUNKSIZE"
}
# Emit one "file line md5" record for every chunk (sliding window) of a file.
# Globals:   CHUNKSIZE (read)
# Arguments: $1 - path of the file to chunk
# Outputs:   one record per window start line on stdout
# Returns:   non-zero when the file cannot be read
chunkifyFile() {
  local filename=$1
  local length windows line digest
  local -a hasher

  # `md5` is the BSD/macOS tool, `md5sum` the GNU one; use whichever exists.
  if command -v md5 > /dev/null 2>&1; then
    hasher=(md5)
  else
    hasher=(md5sum)
  fi

  length=$(wc -l < "$filename") || return
  length=$(( length ))   # strips the padding some wc implementations add

  # A window starting at line s covers s .. s+CHUNKSIZE-1, so the last full
  # window starts at length-CHUNKSIZE+1. (The previous bound, length-CHUNKSIZE,
  # never hashed the final window of a file.) Short files still get one chunk.
  windows=$(( length - CHUNKSIZE + 1 ))
  if (( windows < 1 )); then
    windows=1
  fi

  for (( line = 1; line <= windows; line++ )); do
    digest=$(getChunk "$line" "$filename" | "${hasher[@]}")
    # md5sum prints "<digest>  -"; keep only the digest itself.
    printf '%s %s %s\n' "$filename" "$line" "${digest%% *}"
  done
}
# Chunk every matching file below a directory into "$TMPFILE-chunks".
# Globals:   TMPFILE (read), FINDFLAGS (read)
# Arguments: $1 - directory to search
#            $2 - file extension (without the dot) to match
# Outputs:   writes all records produced by chunkifyFile to "$TMPFILE-chunks",
#            replacing any previous content of that file
chunkifyDirectory() {
  local file
  # FINDFLAGS is deliberately unquoted: it holds a blank-separated option list.
  # shellcheck disable=SC2086
  find "$1" -name "*.$2" $FINDFLAGS |
    while IFS= read -r file; do
      # IFS= and -r keep leading blanks and backslashes in file names intact.
      chunkifyFile "$file"
    done > "$TMPFILE-chunks"   # opened once; '>' also empties the old file
}
# Get and Expand Clones
# --------------------
# Keep only the records whose hash occurs more than once.
# Globals:   TMPFILE (read; "$TMPFILE-uniq" is used as scratch file)
# Arguments: $1 - "file line hash" records, sorted so equal hashes are adjacent
# Outputs:   every record of $1 that shares its hash with another record
getPotentialClones() {
  local singles="$TMPFILE-uniq"
  # Records whose hash (everything after the first two fields) is unique.
  uniq -u -f 2 -- "$1" > "$singles"
  # Drop exactly those records. -x matches whole lines only, so a stray empty
  # pattern line cannot match (and thereby remove) every record.
  grep -v -x -F -f "$singles" -- "$1"
  rm -f -- "$singles"
}
# Print the run of consecutive chunk records that starts at a given record.
# A run continues while the next record names the same file and its line
# number is exactly one higher than the previous record's.
# Arguments: $1 - 1-based index of the first record of the run
#            $2 - file of "file line hash" records
# Outputs:   the records of the run on stdout, the first one always included
# The whole run is extracted in one pass over the file instead of one
# head/tail pipeline per record.
expandClone() {
  awk -v start="$1" '
    NR < start { next }
    # First record that does not continue the run: stop reading.
    # Appending "" forces the file names to be compared as strings.
    NR > start && (($1 "") != file || $2 != line + 1) { exit }
    { print; file = $1 ""; line = $2 }
  ' "$2"
}
# Print the file name (first field) of the first record in a record file.
# Arguments: $1 - file of "file line hash" records
getCloneFile() {
  head -n 1 -- "$1" | cut -d ' ' -f 1
}
# Describe the line range covered by a run of chunk records.
# Globals:   CHUNKSIZE (read)
# Arguments: $1 - file of "file line hash" records forming one run
# Outputs:   "from lines <first> to <last + CHUNKSIZE>" on stdout, where first
#            and last are the line numbers of the first and last record
# NOTE(review): a chunk starting at line N covers N .. N+CHUNKSIZE-1, so the
# printed upper bound is one past the last covered line. Kept as it was;
# confirm whether an exclusive bound is intended.
getCloneLines() {
  local first last
  first=$(head -n 1 -- "$1" | cut -d ' ' -f 2)
  last=$(tail -n 1 -- "$1" | cut -d ' ' -f 2)
  printf 'from lines %s to %s\n' "$first" "$(( last + CHUNKSIZE ))"
}
# Print the number of records (lines) in a record file.
# Arguments: $1 - file to count
# Outputs:   the bare line count on stdout
# Returns:   non-zero when the file cannot be read
getCloneSize() {
  local count
  count=$(wc -l < "$1") || return
  # Arithmetic expansion strips the padding some wc implementations add.
  printf '%s\n' "$(( count ))"
}
# Print every record that shares a hash with one of the given clone records.
# Arguments: $1 - file with the records of one clone (the hash is field 3)
#            $2 - file with all candidate records to search
# Outputs:   matching records of $2 on stdout (this includes the clone's own
#            records when they are present in $2)
# The hash list is fed to grep through process substitution. The previous
# version stored it in "$TMPFILE-siblings", which is the very file the caller
# redirects this function's output into.
findSiblings() {
  grep -F -f <(cut -d ' ' -f 3 -- "$1") -- "$2"
}
# Report every run of consecutive records in a sibling file, consuming it.
# Globals:   TMPFILE (read; "$TMPFILE-singleSibling" is used as scratch file)
# Arguments: $1 - file of sibling records; it is emptied while being processed
# Outputs:   one " Sibling: <file> from lines <a> to <b>" line per run
# Returns:   non-zero when the consumed records cannot be removed
expandAllSiblings() {
  local queue=$1
  local run="$TMPFILE-singleSibling"
  local sibfile siblines consumed

  while [[ -s "$queue" ]]; do
    expandClone 1 "$queue" > "$run"
    sibfile=$(getCloneFile "$run")
    siblines=$(getCloneLines "$run")
    consumed=$(getCloneSize "$run")
    # Nothing consumed would mean the queue never shrinks: stop, do not spin.
    (( consumed > 0 )) || break
    printf ' Sibling: %s %s\n' "$sibfile" "$siblines"
    # Drop the records just reported. tail + mv replaces `sed -i backup`,
    # whose separate suffix argument is BSD-only syntax.
    tail -n "+$(( consumed + 1 ))" -- "$queue" > "$queue.rest" || return
    mv -f -- "$queue.rest" "$queue" || return
  done
}
# Report every clone in a record file together with its siblings, consuming it.
# Globals:   TMPFILE (read; "$TMPFILE-singleCloneExpanded" and
#            "$TMPFILE-cloneSiblings" are scratch files,
#            "$TMPFILE-potential-byFile" is the set searched for siblings),
#            CHUNKSIZE (read)
# Arguments: $1 - file of candidate records sorted by file and line; it is
#                 emptied while being processed
# Outputs:   "Clone in <file> of size <n>" per run, followed by whatever
#            expandAllSiblings prints for that run
# Returns:   non-zero when the consumed records cannot be removed
expandAllClones() {
  local queue=$1
  local run="$TMPFILE-singleCloneExpanded"
  # Not "$TMPFILE-siblings": the original findSiblings uses that name for its
  # own scratch data, so writing its output there made it read and overwrite
  # the same file.
  local siblings="$TMPFILE-cloneSiblings"
  local clonefile consumed

  while [[ -s "$queue" ]]; do
    expandClone 1 "$queue" > "$run"
    clonefile=$(getCloneFile "$run")
    consumed=$(getCloneSize "$run")
    # Nothing consumed would mean the queue never shrinks: stop, do not spin.
    (( consumed > 0 )) || break
    # NOTE(review): the size is reported as record count + 1 + CHUNKSIZE,
    # exactly as before; confirm that the extra +1 is intended.
    printf 'Clone in %s of size %s\n' "$clonefile" "$(( consumed + 1 + CHUNKSIZE ))"
    findSiblings "$run" "$TMPFILE-potential-byFile" > "$siblings"
    expandAllSiblings "$siblings"
    # Drop the records just reported. tail + mv replaces `sed -i backup`,
    # whose separate suffix argument is BSD-only syntax.
    tail -n "+$(( consumed + 1 ))" -- "$queue" > "$queue.rest" || return
    mv -f -- "$queue.rest" "$queue" || return
  done
}
# Overall execution
# --------------------

# Cleanup
# --------------------
# Registered up front so the intermediate files are removed on every exit
# path, including failures. ${TMPFILE:?} refuses to expand an empty prefix,
# which would otherwise turn the pattern into a bare "*".
trap 'rm -f -- "${TMPFILE:?}"*' EXIT

# The command strings are single-quoted on purpose: variables are expanded,
# properly quoted, only when logCommand evaluates them, so paths containing
# blanks survive.
logCommand "Chunkify Directory..." 'chunkifyDirectory "$STARTDIR" "$EXTENSION"'
# Sorting by hash is required for uniq. Kept as a separate step so it can be timed.
logCommand "Sorting Result..." 'sort -k 3 -- "$TMPFILE-chunks" > "$TMPFILE-sorted"'
logCommand "Getting Clones..." 'getPotentialClones "$TMPFILE-sorted" > "$TMPFILE-potential"'
logCommand "Sort by Filename..." 'sortByFilename "$TMPFILE-potential" > "$TMPFILE-potential-byFile"'

# Here would be a good place to split the input in some nice way for parallel processing.
# You would want to split between two clones -- either between two files, or
# between two non-consecutive chunks in one file.

# expandAllClones consumes its input, so it works on a copy; the untouched
# original is still needed for the sibling lookup.
cp -- "$TMPFILE-potential-byFile" "$TMPFILE-potential-byFile-toConsume"
logCommand "Expand All Clones..." 'expandAllClones "$TMPFILE-potential-byFile-toConsume" > "$OUTFILE"'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment