Skip to content

Instantly share code, notes, and snippets.

@mickesv
Created September 16, 2020 08:36
Show Gist options
  • Save mickesv/4de9528e2311064a7a93a8929820dfb3 to your computer and use it in GitHub Desktop.
Save mickesv/4de9528e2311064a7a93a8929820dfb3 to your computer and use it in GitHub Desktop.
Clony McCloneface done the Unix way
#!/usr/bin/env bash
# Settings
# --------------------
# Number of consecutive lines hashed together as one chunk.
CHUNKSIZE=5
# Directory that is searched for input files.
STARTDIR=.
# Only files named *.$EXTENSION are analysed.
EXTENSION=txt
# Common prefix of every scratch file ("$TMPFILE-chunks", "$TMPFILE-sorted", ...).
# mktemp gives each run its own unpredictable prefix, so concurrent runs do not
# overwrite each other's data and the name cannot be pre-created by another
# user. The placeholder file mktemp creates shares the prefix, so the final
# "rm -f $TMPFILE*" cleanup removes it together with the derived files.
TMPFILE=$(mktemp /tmp/chunk-tmp.XXXXXX) || {
  echo "clony: unable to create a temporary file under /tmp" >&2
  exit 1
}
# File that receives the final clone report.
OUTFILE="./clony-out.txt"
# Extra arguments handed to find (word-split on purpose where it is used).
FINDFLAGS=""
# Helper Functions
# --------------------
# Print a single line of a file.
# Arguments: $1 - 1-based line number
#            $2 - path of the file to read
# Outputs:   the requested line on stdout; nothing at all when the file has
#            fewer than $1 lines (the previous head|tail version returned the
#            file's last line in that case).
function getLine() {
  # q stops sed right after the hit, so the rest of the file is never read.
  sed -n "${1}{p;q;}" -- "$2"
}
# Replace a file with a fresh, empty one.
# Arguments: $1 - path of the file to reset (created when missing)
function clearFile() {
  # Quoted so that a path containing whitespace is treated as ONE file;
  # "--" keeps a path that starts with "-" from being read as an option.
  rm -f -- "$1"
  touch -- "$1"
}
# Run a command string and report how long it took.
# Arguments: $1 - label, printed on stdout followed by a space and two tabs
#            $2 - shell command line; it is passed to eval, so it must come
#                 from the script itself, never from untrusted input
# Outputs:   label and the command's own stdout on stdout; the elapsed real
#            time ("time <minutes>m<seconds>s") on stderr
# Returns:   exit status of the evaluated command
function logCommand () {
  # Local, so the caller's TIMEFORMAT (if any) is left untouched.
  local TIMEFORMAT='time %3lR'
  # "$2" is quoted: eval must see the command text exactly as written. Left
  # unquoted it is word-split and glob-expanded first, which collapses runs of
  # whitespace inside quoted arguments.
  printf '%s \t\t' "$1" && time eval "$2"
}
# Order chunk records ("<file> <line> <hash>") by file name, then numerically
# by line number, so consecutive chunks of one file end up next to each other.
# Arguments: $1 - path of the record file
# Outputs:   the sorted records on stdout
# TODO: fields are blank-separated, so a file name that contains a space
#       shifts the line-number key; such names are not handled here.
function sortByFilename () {
  # LC_ALL=C compares file names byte-wise: names that differ never collate as
  # equal, and the order does not depend on the user's locale.
  LC_ALL=C sort -k1,1 -k2,2n -- "$1"
}
# Chunkification
# --------------------
# Print one chunk: up to $CHUNKSIZE lines of a file, starting at a given line.
# Globals:   CHUNKSIZE (read)
# Arguments: $1 - 1-based number of the chunk's first line
#            $2 - path of the file
# Outputs:   the chunk on stdout (shorter near the end of the file, empty when
#            $1 is past the last line)
function getChunk() {
  # Path is quoted so names containing whitespace are read as a single file.
  tail -n "+$1" -- "$2" | head -n "$CHUNKSIZE"
}
# Print the MD5 digest (hex only) of stdin.
# BSD/macOS ship "md5", GNU systems ship "md5sum"; use whichever exists.
function _chunkDigest () {
  if command -v md5 > /dev/null 2>&1; then
    md5
  else
    # md5sum prints "<hash>  -"; keep just the hash.
    md5sum | cut -d ' ' -f 1
  fi
}

# Emit one record per chunk of a file, in the form "<file> <line> <hash>".
# Globals:   CHUNKSIZE (read)
# Arguments: $1 - path of the file to split into overlapping chunks
# Outputs:   one record per chunk start line on stdout. A file shorter than
#            CHUNKSIZE still yields a single record for line 1.
# Note:      records are space-separated and are parsed elsewhere in this
#            script with "cut -d ' '", so file names containing spaces are
#            emitted as-is but will not be parsed correctly downstream.
function chunkifyFile () {
  local filename=$1
  local length lastStart line
  # $(( )) strips the padding some wc implementations put around the count.
  length=$(( $(wc -l < "$filename") ))
  # A file of N lines has N - CHUNKSIZE + 1 full windows; the former
  # "N - CHUNKSIZE" silently skipped the last one.
  lastStart=$(( length - CHUNKSIZE + 1 ))
  if (( lastStart < 1 )); then
    lastStart=1
  fi
  for (( line = 1; line <= lastStart; line++ )); do
    printf '%s %s %s\n' "$filename" "$line" \
      "$(getChunk "$line" "$filename" | _chunkDigest)"
  done
}
# Chunkify every matching file below a directory into "$TMPFILE-chunks".
# Globals:   TMPFILE (read), FINDFLAGS (read)
# Arguments: $1 - directory to search
#            $2 - file extension (without the dot) to look for
# Outputs:   (re)creates "$TMPFILE-chunks" holding the chunkifyFile records of
#            all files found; the file is left empty when nothing matches.
function chunkifyDirectory () {
  local file
  # FINDFLAGS stays unquoted on purpose: it carries several find arguments.
  # IFS= and -r keep leading blanks and backslashes in file names intact.
  # The output file is opened (and truncated) once for the whole loop rather
  # than being cleared first and reopened for every input file.
  find "$1" -name "*.$2" $FINDFLAGS |
    while IFS= read -r file; do
      chunkifyFile "$file"
    done > "$TMPFILE-chunks"
}
# Get and Expand Clones
# --------------------
# Keep only the chunk records whose hash occurs more than once.
# Globals:   TMPFILE (read; "$TMPFILE-uniq" is used as scratch and removed)
# Arguments: $1 - record file ("<file> <line> <hash>") already sorted by hash,
#                 because uniq only compares neighbouring lines
# Outputs:   the records that share their hash with another record on stdout
function getPotentialClones () {
  local singles="$TMPFILE-uniq"
  # -f 2 skips file name and line number, so only the hash is compared;
  # -u keeps the records whose hash appears exactly once.
  uniq -u -f 2 -- "$1" > "$singles"
  # Drop exactly those records. -x matches whole lines only (a singleton can
  # never knock out a longer line that merely contains it) and "grep -F"
  # replaces the obsolescent fgrep. An empty pattern file matches nothing, so
  # every record is kept when there are no singletons.
  grep -v -x -F -f "$singles" -- "$1"
  rm -f -- "$singles"
}
# Print the run of consecutive chunk records that starts at a given record.
# A run continues while the next record names the same file and its line
# number is exactly one higher than the previous record's.
# Arguments: $1 - 1-based number of the record the run starts at
#            $2 - record file ("<file> <line> <hash>"), sorted by file and line
# Outputs:   the records of the run on stdout, the start record always first;
#            nothing when $1 is past the end of the file
function expandClone () {
  # One awk pass replaces the former loop, which re-read the file and forked
  # several processes per record and left its working variables behind as
  # globals. The file is fed on stdin so awk never interprets its name.
  awk -v start="$1" '
    NR < start  { next }
    NR == start { file = $1 ""; line = $2 + 0; print; next }
    # "" forces a string comparison of the names, + 0 a numeric one of lines.
    ($1 "") == file && ($2 + 0) == line + 1 { print; line = $2 + 0; next }
    { exit }
  ' < "$2"
}
# Print the file name (first space-separated field) of the first record.
# Arguments: $1 - path of a file holding "<file> <line> <hash>" records
# Outputs:   the file name on stdout
getCloneFile () {
  # "head -n 1" is the standard spelling of the obsolete "head -1"; the path
  # is quoted so it may contain whitespace.
  head -n 1 -- "$1" | cut -d ' ' -f 1
}
# Describe the source line range covered by a run of chunk records.
# Globals:   CHUNKSIZE (read)
# Arguments: $1 - path of a file holding the run's "<file> <line> <hash>"
#                 records, in line order
# Outputs:   "from lines <first> to <last>" on stdout, both ends inclusive
getCloneLines () {
  local first last
  first=$(head -n 1 -- "$1" | cut -d ' ' -f 2)
  last=$(tail -n 1 -- "$1" | cut -d ' ' -f 2)
  # The final chunk starts at $last and spans CHUNKSIZE lines, so the last
  # line it covers is last + CHUNKSIZE - 1 (the "- 1" was missing before).
  echo "from lines $first to $(( last + CHUNKSIZE - 1 ))"
}
# Print the number of records (lines) in a file.
# Arguments: $1 - path of the file
# Outputs:   the line count as a bare integer on stdout
getCloneSize () {
  # Reading from stdin keeps wc from echoing the file name, and $(( )) strips
  # the padding some wc implementations add, so no cut/echo parsing is needed
  # and paths with whitespace work.
  echo "$(( $(wc -l < "$1") ))"
}
# Print every record that shares a hash with one of a clone's records.
# Arguments: $1 - file with the clone's "<file> <line> <hash>" records
#            $2 - file with all candidate records to search
# Outputs:   the matching records of $2 on stdout, in $2's order (the clone's
#            own records are part of $2 and therefore match as well)
function findSiblings () {
  # The hash list is streamed to grep through process substitution. The
  # earlier version stored it in "$TMPFILE-siblings", which is the very file
  # the caller redirects this function's output into; that only worked
  # because grep finished reading the patterns before overwriting them.
  grep -F -f <(cut -d ' ' -f 3 -- "$1") -- "$2"
}
# Report every run of consecutive records in a sibling file, consuming it.
# Globals:   TMPFILE (read; "$TMPFILE-singleSibling" is used as scratch)
# Arguments: $1 - file with sibling records sorted by file and line; it is
#                 emptied run by run as a side effect
# Outputs:   one " Sibling: <file> <line range>" line per run on stdout
function expandAllSiblings () {
  local expanded="$TMPFILE-singleSibling"
  local sibFile sibLines sibCount
  while [ -s "$1" ]; do
    expandClone 1 "$1" > "$expanded"
    sibFile=$(getCloneFile "$expanded")
    sibLines=$(getCloneLines "$expanded")
    sibCount=$(getCloneSize "$expanded")
    # Nothing expanded means nothing would be consumed below: stop instead
    # of spinning forever on the same input.
    (( ${sibCount:-0} >= 1 )) || break
    printf ' Sibling: %s %s\n' "$sibFile" "$sibLines"
    # Drop the records just reported. tail + mv behaves the same everywhere,
    # unlike "sed -i backup", which is BSD-only syntax (GNU sed takes
    # "backup" for an input file).
    tail -n "+$(( sibCount + 1 ))" -- "$1" > "$1.rest" &&
      mv -f -- "$1.rest" "$1" || break
  done
}
# Report every clone in a record file together with all of its siblings.
# Globals:   TMPFILE (read; "$TMPFILE-singleCloneExpanded" and
#            "$TMPFILE-siblingLines" are used as scratch, and
#            "$TMPFILE-potential-byFile" is read as the full candidate list),
#            CHUNKSIZE (read)
# Arguments: $1 - file with candidate records sorted by file and line; it is
#                 emptied clone by clone as a side effect
# Outputs:   per clone a "Clone in <file> of size <lines>" line on stdout,
#            followed by the " Sibling: ..." lines from expandAllSiblings
function expandAllClones () {
  local expanded="$TMPFILE-singleCloneExpanded"
  # Deliberately NOT "$TMPFILE-siblings", so this output can never collide
  # with a scratch file of that name used while searching for siblings.
  local siblingLines="$TMPFILE-siblingLines"
  local cloneFile cloneCount
  while [ -s "$1" ]; do
    expandClone 1 "$1" > "$expanded"
    cloneFile=$(getCloneFile "$expanded")
    cloneCount=$(getCloneSize "$expanded")
    # Nothing expanded means nothing would be consumed below: stop instead
    # of spinning forever on the same input.
    (( ${cloneCount:-0} >= 1 )) || break
    # k consecutive chunks of CHUNKSIZE lines cover k + CHUNKSIZE - 1 lines
    # (previously reported two lines too large).
    echo "Clone in $cloneFile of size $(( cloneCount + CHUNKSIZE - 1 ))"
    findSiblings "$expanded" "$TMPFILE-potential-byFile" > "$siblingLines"
    expandAllSiblings "$siblingLines"
    # Drop the records just reported; tail + mv is portable, whereas
    # "sed -i backup" is BSD-only syntax.
    tail -n "+$(( cloneCount + 1 ))" -- "$1" > "$1.rest" &&
      mv -f -- "$1.rest" "$1" || break
  done
}
# Overall execution
# --------------------
# Each stage is handed to logCommand as a single-quoted command string: the
# variables are expanded, inside their double quotes, only when logCommand
# evals the string, so values containing whitespace or glob characters stay
# intact.
logCommand "Chunkify Directory..." 'chunkifyDirectory "$STARTDIR" "$EXTENSION"'
logCommand "Sorting Result..." 'sort -k 3 "$TMPFILE-chunks" > "$TMPFILE-sorted"' # required for uniq. Keep this separate to be able to time it.
logCommand "Getting Clones..." 'getPotentialClones "$TMPFILE-sorted" > "$TMPFILE-potential"'
logCommand "Sort by Filename..." 'sortByFilename "$TMPFILE-potential" > "$TMPFILE-potential-byFile"'
# Here would be a good place to split the input in some nice way for parallel processing.
# You would want to split between two clones -- either between two files, or between two non-consecutive chunks in one file.
# expandAllClones empties the file it is given, so it works on a copy while
# the by-file list itself stays available for the sibling lookups.
cp -- "$TMPFILE-potential-byFile" "$TMPFILE-potential-byFile-toConsume"
logCommand "Expand All Clones..." 'expandAllClones "$TMPFILE-potential-byFile-toConsume" > "$OUTFILE"'
# Cleanup
# --------------------
# Only the prefix is quoted; the trailing * must stay outside the quotes so it
# still globs every scratch file created above.
rm -f -- "$TMPFILE"*
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment