timvdalen/Makefile

## Makefile
# Root of the ccfinder-src checkout; the ccfx and picosel binaries are taken
# from its ubuntu32/ subdirectory.
BASE_DIR := /home/ubuntu/ccfinder-src
# Not referenced by any rule visible in this Makefile.
SCRATCH_SPACE := /media/output

CCFX := ${BASE_DIR}/ubuntu32/ccfx
PICOSEL := ${BASE_DIR}/ubuntu32/picosel

# Directory whose entries are the versions to analyse (one entry per version).
SOURCE := /media/output/source

# Scratch directories for intermediate analysis files.
TEMP := temp
TEMP_CCFX := $(TEMP)/ccfx
TEMP_METRICS := $(TEMP)/metrics
TEMP_RESULTS := $(TEMP)/results

# All assignments are simply expanded (:=) so that `ls` runs exactly once at
# parse time instead of on every expansion of VERSIONS and the lists derived
# from it.
VERSIONS := $(shell ls ${SOURCE})
# loc/<version>.csv for every version.
LOCS := $(addsuffix .csv,$(addprefix loc/,$(VERSIONS)))
# Full cross product <a>-<b> of the versions (including a >= b).
VERSIONPAIRS := $(foreach a,$(VERSIONS),$(foreach b,$(VERSIONS), $(a)-$(b)))
# coverage/<a>-<b>.csv for every pair.
COVERAGES := $(addsuffix .csv,$(addprefix coverage/,$(VERSIONPAIRS)))

# all: build the aggregated coverage table. Declared phony so that a stray
# file named "all" cannot make the goal look up to date.
.PHONY: all
all: coverages.csv

# locs.csv: concatenation of the per-version "version,lines" records.
# Only the records listed as prerequisites are concatenated, so a leftover
# loc/*.csv of a version that is no longer present in $(SOURCE) is ignored.
# /dev/null keeps cat from waiting on stdin when the prerequisite list is empty.
locs.csv: $(LOCS)
	cat /dev/null $^ > $@

# loc/<version>.csv: one "version,line-count" record, where line-count is the
# total number of lines of all *.java files below $(SOURCE)/<version>.
# The version is the pattern stem ($*); no $(eval) of a global variable is
# needed to obtain it.
loc/%.csv:
	mkdir -p $(@D)
	echo $*,`(find ${SOURCE}/$* -name '*.java' -print0 | xargs -0 cat) | wc -l` > $@

# coverages.csv: all per-pair records found under coverage/, concatenated.
# The shell glob is used instead of the prerequisite list because the
# coverage rule writes no file for pairs it does not analyse.
coverages.csv: $(COVERAGES)
	cat coverage/*.csv > $@

# tempfolders: create the scratch directories used by the coverage recipes.
# Declared phony because no file named "tempfolders" is ever produced.
.PHONY: tempfolders
tempfolders:
	mkdir -p ${TEMP_CCFX} ${TEMP_METRICS} ${TEMP_RESULTS}

# coverage/<A>-<B>.csv: one "A,B,coverage" record for the version pair A-B.
#
# The stem ($*) is split on "-" into versionA and versionB. Only pairs with
# versionA < versionB (integer comparison) are analysed; for any other pair the
# recipe does nothing and writes no file.
#
# Steps for an analysed pair (every intermediate file is named after the stem,
# so parallel jobs do not collide):
#   1. ccfx d over both source trees      -> $(TEMP_CCFX)/<stem>.ccfxd
#   2. ccfx m -c                          -> $(TEMP_METRICS)/clone-<stem>.tsv
#   3. picosel: CID where RNR > 0.5       -> $(TEMP_RESULTS)/ids-<stem>.txt
#   4. ccfx s -ci <ids> (filter step)     -> $(TEMP_CCFX)/filtered-<stem>.ccfxd
#   5. ccfx m -w                          -> $(TEMP_RESULTS)/clone-<stem>.tsv
#   6. coverage = sum of column 4 of that tsv (first row skipped)
#                 / (locA + locB), computed by bc with scale=3
#
# Notes:
#   - tempfolders is an order-only prerequisite: the directories must exist,
#     but the always-run tempfolders target must not force a rebuild.
#   - set -e aborts at the first failing tool, before $@ is written; the
#     intermediate files are then left in place for inspection.
#   - The locs.csv lookup is anchored ("^<version>,") so that version 1 does
#     not also match the records of versions 10, 11, ...
#   - "CLOC+0" makes awk print 0 instead of an empty string when the tsv has
#     no data rows.
#   - Shell variables are written with $$ so that the shell, not make,
#     expands them.
coverage/%.csv: locs.csv | tempfolders
	set -e ; \
	versionA='$(word 1,$(subst -, ,$*))' ; \
	versionB='$(word 2,$(subst -, ,$*))' ; \
	if [ "$$versionA" -lt "$$versionB" ] ; then \
	  mkdir -p $(@D) ; \
	  ${CCFX} d java -w f-w-g+ -b 50 -dn "${SOURCE}/$$versionA" -is -dn "${SOURCE}/$$versionB" -o ${TEMP_CCFX}/$* ; \
	  ${CCFX} m ${TEMP_CCFX}/$*.ccfxd -c -o ${TEMP_METRICS}/clone-$*.tsv ; \
	  ${PICOSEL} -o ${TEMP_RESULTS}/ids-$*.txt from ${TEMP_METRICS}/clone-$*.tsv select CID where RNR .gt. 0.5 ; \
	  ${CCFX} s ${TEMP_CCFX}/$*.ccfxd -o ${TEMP_CCFX}/filtered-$* -ci ${TEMP_RESULTS}/ids-$*.txt ; \
	  ${CCFX} m ${TEMP_CCFX}/filtered-$*.ccfxd -w -o ${TEMP_RESULTS}/clone-$*.tsv ; \
	  locA=`grep "^$$versionA," locs.csv | cut -f2 -d,` ; \
	  locB=`grep "^$$versionB," locs.csv | cut -f2 -d,` ; \
	  cloc=`awk 'NR>1{CLOC+=$$4} END {print CLOC+0}' ${TEMP_RESULTS}/clone-$*.tsv` ; \
	  coverage=`echo "scale=3; $$cloc/($$locA + $$locB)" | bc -l` ; \
	  if [ -z "$$coverage" ] ; then \
	    echo "$@: could not compute coverage (cloc='$$cloc' locA='$$locA' locB='$$locB')" >&2 ; \
	    exit 1 ; \
	  fi ; \
	  echo "$$versionA,$$versionB,$$coverage" > $@ ; \
	  rm -f ${TEMP_CCFX}/$*.ccfxd ${TEMP_CCFX}/filtered-$*.ccfxd ; \
	  rm -f ${TEMP_METRICS}/clone-$*.tsv ${TEMP_RESULTS}/clone-$*.tsv ; \
	  rm -f ${TEMP_RESULTS}/ids-$*.txt ; \
	fi

# clean: remove everything the rules above generate. Declared phony so a file
# named "clean" cannot disable it. The scratch directory is taken from $(TEMP)
# so an overridden TEMP is cleaned too; the guard refuses to run with an empty
# TEMP, which would otherwise expand to "rm -rf /".
.PHONY: clean
clean:
	$(if $(strip $(TEMP)),,$(error TEMP must not be empty))
	rm -rf loc/ coverage/ $(TEMP)/
	rm -f locs.csv coverages.csv


## process.rb
require 'csv'
require 'set'

# Returns the coverage recorded for a pair of versions, as a Float.
#
# The pair is first ordered so that the version with the smaller integer value
# (String#to_i) comes first. +data+ is an array of rows whose first two cells
# are the versions and whose third cell is the coverage. Returns 0.0 when no
# row matches the ordered pair.
def lookup_coverage(versionA, versionB, data)
  lower, upper = versionA.to_i > versionB.to_i ? [versionB, versionA] : [versionA, versionB]

  match = data.find { |record| lower == record[0] && upper == record[1] }
  match ? match[2].to_f : 0.to_f
end

# Convert results.csv (rows of versionA,versionB,coverage) into processed.csv,
# a lower-triangular matrix of coverage values normalized by the largest
# coverage in the input.
#
# Output layout for versions v0..v(n-1), sorted by integer value:
#   row j (0 <= j < n): [vj, cell(1), ..., cell(n)]
#     cell(i) is the normalized coverage of (v(i-1), vj) when i <= j,
#     and 0.0 otherwise (diagonal and upper triangle)
#   last row:           ["", v0, ..., v(n-1)]   (column labels)
data = CSV.read('results.csv')

# script.sh starts results.csv with this header line; drop it so that it is
# not mistaken for a version pair.
data.shift if data.first == %w[VersionA VersionB coverage]

# Largest coverage in the input (-1.0 when there are no rows). A non-positive
# maximum would turn every normalized value into NaN/Infinity, so fall back to
# a scale of 1.0 in that case.
max = data.map { |row| row[2].to_f }.push(-1.0).max
scale = max > 0 ? max : 1.0

# Unique versions ordered by integer value, so that "10" sorts after "2";
# the string itself is the tie-breaker. Plain Array methods are used because
# SortedSet is no longer provided by 'set' since Ruby 3.0.
versions_a = data.flat_map { |row| row.first(2) }.compact.uniq.sort_by { |v| [v.to_i, v] }

# Start writing the matrix
matrix = versions_a.each_with_index.map do |row_version, j|
  cells = (1..versions_a.size).map do |i|
    if i > j
      # Column i belongs to versions_a[i-1]; i > j is the diagonal and above.
      0.0
    else
      # Lower triangle, including the pair of adjacent versions (i == j).
      (lookup_coverage(versions_a[i - 1], row_version, data) / scale).round(3).to_s
    end
  end
  [row_version, *cells]
end
# Bottom row: empty corner cell followed by the column labels.
matrix << ['', *versions_a]

CSV.open("processed.csv", "wb") do |csv|
  matrix.each { |row| csv << row }
end

## script.sh
#!/bin/bash
# Root of the ccfinder-src checkout; the tool binaries are taken from ubuntu32/.
BASE_DIR=/home/ubuntu/ccfinder-src

CCFX="$BASE_DIR/ubuntu32/ccfx"
PICOSEL="$BASE_DIR/ubuntu32/picosel"

# Directory whose entries are the versions to compare.
SOURCE="$BASE_DIR/source/source"

# Working directories for intermediate and result files.
OUTPUT=output
TEMP_CCFX="$OUTPUT/ccfx"
TEMP_METRICS="$OUTPUT/metrics"
TEMP_RESULTS="$OUTPUT/results"

# One word per entry of $SOURCE.
VERSIONS=$(ls "$SOURCE")

# Book keeping: make sure the working directories exist
mkdir -p "$TEMP_CCFX" "$TEMP_METRICS" "$TEMP_RESULTS"

# Get the sum of the file lengths per version: version_loc[<version>] is the
# total number of lines of all *.java files below $SOURCE/<version>.
# An associative array is used so that the version name is a plain string key;
# with an indexed array (declare -a) bash evaluates the subscript as an
# arithmetic expression, which fails for names such as "08" or "1.2".
declare -A version_loc

for version in $VERSIONS
do
  version_loc[$version]=$( (find "$SOURCE/$version" -name '*.java' -print0 | xargs -0 cat) | wc -l )
done

# Execute analysis per version pair.
# results.csv gets a header line followed by one "A,B,coverage" record for
# every pair with A < B (integer comparison), where
#   coverage = sum of column 4 of the filtered clone metrics (first row skipped)
#              / (lines of A + lines of B), computed by bc with scale=3.
echo 'VersionA,VersionB,coverage' > results.csv
for versionA in $VERSIONS
do
  for versionB in $VERSIONS
  do
    # Each unordered pair is analysed once; a version is never compared with itself.
    if [ "$versionA" -ge "$versionB" ]
    then
      continue
    fi

    echo "Analyzing $versionA versus $versionB"
    pair="$versionA-$versionB"

    # Run analysis, filter results, save metrics. The chain stops at the first
    # failing tool so that no record is written from missing or partial output.
    if ! {
      "$CCFX" d java -w f-w-g+ -b 50 -dn "$SOURCE/$versionA" -is -dn "$SOURCE/$versionB" -o "$TEMP_CCFX/$pair" &&
      "$CCFX" m "$TEMP_CCFX/$pair.ccfxd" -c -o "$TEMP_METRICS/clone-$pair.tsv" &&
      "$PICOSEL" -o "$TEMP_RESULTS/ids-$pair.txt" from "$TEMP_METRICS/clone-$pair.tsv" select CID where RNR .gt. 0.5 &&
      "$CCFX" s "$TEMP_CCFX/$pair.ccfxd" -o "$TEMP_CCFX/filtered-$pair" -ci "$TEMP_RESULTS/ids-$pair.txt" &&
      "$CCFX" m "$TEMP_CCFX/filtered-$pair.ccfxd" -w -o "$TEMP_RESULTS/clone-$pair.tsv"
    }
    then
      echo "Analysis of $pair failed; no record written" >&2
      continue
    fi

    # "CLOC+0" prints 0 instead of an empty string when there are no data rows,
    # which would otherwise make the bc expression a syntax error.
    loc_clone=$(awk 'NR>1{CLOC+=$4} END {print CLOC+0}' "$TEMP_RESULTS/clone-$pair.tsv")
    coverage=$(echo "scale=3; $loc_clone/(${version_loc[$versionA]} + ${version_loc[$versionB]})" | bc -l)

    echo "$versionA,$versionB,$coverage" >> results.csv

    # Clean up (the filtered metrics in $TEMP_RESULTS/clone-*.tsv are kept).
    # -f: a missing intermediate file is not an error.
    rm -f "$TEMP_CCFX/filtered-$pair.ccfxd" "$TEMP_CCFX/$pair.ccfxd"
    rm -f "$TEMP_METRICS/clone-$pair.tsv"
    rm -f "$TEMP_RESULTS/ids-$pair.txt"

    rm -f "$OUTPUT"/*.tmp
  done
done
	# Restored copy of the Makefile: pipes are real pipes again (they were
	# escaped as "\|"), and recipe lines are indented one tab below their rule.
	# Every line of this copy carries one extra leading tab.

	# Root of the ccfinder-src checkout; ccfx and picosel live in ubuntu32/.
	BASE_DIR := /home/ubuntu/ccfinder-src
	# Not referenced by any rule in this Makefile.
	SCRATCH_SPACE := /media/output

	CCFX := ${BASE_DIR}/ubuntu32/ccfx
	PICOSEL := ${BASE_DIR}/ubuntu32/picosel

	# Directory whose entries are the versions to analyse.
	SOURCE := /media/output/source

	# Scratch directories for intermediate analysis files.
	TEMP := temp
	TEMP_CCFX := $(TEMP)/ccfx
	TEMP_METRICS := $(TEMP)/metrics
	TEMP_RESULTS := $(TEMP)/results

	# Simply expanded (:=) so that `ls` runs once at parse time.
	VERSIONS := $(shell ls ${SOURCE})
	LOCS := $(addsuffix .csv,$(addprefix loc/,$(VERSIONS)))
	# Full cross product <a>-<b> of the versions (including a >= b).
	VERSIONPAIRS := $(foreach a,$(VERSIONS),$(foreach b,$(VERSIONS), $(a)-$(b)))
	COVERAGES := $(addsuffix .csv,$(addprefix coverage/,$(VERSIONPAIRS)))

	# These targets never produce a file of their own name.
	.PHONY: all tempfolders clean

	all: coverages.csv

	# Concatenate exactly the per-version records listed as prerequisites;
	# /dev/null keeps cat from reading stdin when the list is empty.
	locs.csv: $(LOCS)
		cat /dev/null $^ > $@

	# One "version,line-count" record; the version is the pattern stem ($*).
	loc/%.csv:
		mkdir -p $(@D)
		echo $*,`(find ${SOURCE}/$* -name '*.java' -print0 | xargs -0 cat) | wc -l` > $@

	# Glob instead of $^: the coverage rule writes no file for pairs it skips.
	coverages.csv: $(COVERAGES)
		cat coverage/*.csv > $@

	tempfolders:
		mkdir -p ${TEMP_CCFX} ${TEMP_METRICS} ${TEMP_RESULTS}

	# One "A,B,coverage" record for the pair named by the stem "<A>-<B>".
	# Only pairs with A < B (integer comparison) are analysed; other pairs
	# write no file. coverage = sum of column 4 of the filtered clone metrics
	# (first row skipped) / (locA + locB), computed by bc with scale=3.
	#   - tempfolders is order-only so it cannot force a rebuild.
	#   - set -e aborts at the first failing tool, before $@ is written.
	#   - The locs.csv lookup is anchored so version 1 does not match 10, 11, ...
	#   - "CLOC+0" prints 0 instead of an empty string for an empty table.
	#   - $$ passes a literal $ to the shell.
	coverage/%.csv: locs.csv | tempfolders
		set -e ; \
		versionA='$(word 1,$(subst -, ,$*))' ; \
		versionB='$(word 2,$(subst -, ,$*))' ; \
		if [ "$$versionA" -lt "$$versionB" ] ; then \
		  mkdir -p $(@D) ; \
		  ${CCFX} d java -w f-w-g+ -b 50 -dn "${SOURCE}/$$versionA" -is -dn "${SOURCE}/$$versionB" -o ${TEMP_CCFX}/$* ; \
		  ${CCFX} m ${TEMP_CCFX}/$*.ccfxd -c -o ${TEMP_METRICS}/clone-$*.tsv ; \
		  ${PICOSEL} -o ${TEMP_RESULTS}/ids-$*.txt from ${TEMP_METRICS}/clone-$*.tsv select CID where RNR .gt. 0.5 ; \
		  ${CCFX} s ${TEMP_CCFX}/$*.ccfxd -o ${TEMP_CCFX}/filtered-$* -ci ${TEMP_RESULTS}/ids-$*.txt ; \
		  ${CCFX} m ${TEMP_CCFX}/filtered-$*.ccfxd -w -o ${TEMP_RESULTS}/clone-$*.tsv ; \
		  locA=`grep "^$$versionA," locs.csv | cut -f2 -d,` ; \
		  locB=`grep "^$$versionB," locs.csv | cut -f2 -d,` ; \
		  cloc=`awk 'NR>1{CLOC+=$$4} END {print CLOC+0}' ${TEMP_RESULTS}/clone-$*.tsv` ; \
		  coverage=`echo "scale=3; $$cloc/($$locA + $$locB)" | bc -l` ; \
		  if [ -z "$$coverage" ] ; then \
		    echo "$@: could not compute coverage (cloc='$$cloc' locA='$$locA' locB='$$locB')" >&2 ; \
		    exit 1 ; \
		  fi ; \
		  echo "$$versionA,$$versionB,$$coverage" > $@ ; \
		  rm -f ${TEMP_CCFX}/$*.ccfxd ${TEMP_CCFX}/filtered-$*.ccfxd ; \
		  rm -f ${TEMP_METRICS}/clone-$*.tsv ${TEMP_RESULTS}/clone-$*.tsv ; \
		  rm -f ${TEMP_RESULTS}/ids-$*.txt ; \
		fi

	# Remove everything the rules above generate; refuses to run with an
	# empty TEMP, which would otherwise expand to "rm -rf /".
	clean:
		$(if $(strip $(TEMP)),,$(error TEMP must not be empty))
		rm -rf loc/ coverage/ $(TEMP)/
		rm -f locs.csv coverages.csv
	# Restored copy of process.rb: block parameters are written |x| again (they
	# were escaped as "\|x\|", a syntax error) and nesting is re-indented.
	# Every line of this copy carries one extra leading tab.
	require 'csv'
	require 'set'

	# Returns the coverage recorded for a pair of versions, as a Float.
	# The pair is ordered by integer value (String#to_i) before the lookup;
	# +data+ holds rows of [versionA, versionB, coverage]. Returns 0.0 when no
	# row matches.
	def lookup_coverage(versionA, versionB, data)
	  versionA, versionB = versionB, versionA if versionA.to_i > versionB.to_i

	  data.each do |row|
	    return row[2].to_f if versionA == row[0] && versionB == row[1]
	  end
	  0.to_f
	end

	# Convert results.csv into processed.csv, a lower-triangular matrix of
	# coverage values normalized by the largest coverage in the input:
	#   row j:    [vj, cell(1), ..., cell(n)], cell(i) = coverage(v(i-1), vj)
	#             for i <= j and 0.0 otherwise
	#   last row: ["", v0, ..., v(n-1)]
	data = CSV.read('results.csv')

	# script.sh starts results.csv with this header line; it is not a pair.
	data.shift if data.first == %w[VersionA VersionB coverage]

	# Largest coverage (-1.0 for empty input); a non-positive maximum would
	# produce NaN/Infinity, so normalize by 1.0 in that case.
	max = data.map { |row| row[2].to_f }.push(-1.0).max
	scale = max > 0 ? max : 1.0

	# Unique versions ordered by integer value ("10" after "2"). Plain Array
	# methods are used because SortedSet is no longer provided by 'set' since
	# Ruby 3.0.
	versions_a = data.flat_map { |row| row.first(2) }.compact.uniq.sort_by { |v| [v.to_i, v] }

	# Start writing the matrix
	matrix = versions_a.each_with_index.map do |row_version, j|
	  cells = (1..versions_a.size).map do |i|
	    if i > j
	      # Column i belongs to versions_a[i-1]; i > j is the diagonal and above.
	      0.0
	    else
	      # Lower triangle, including the pair of adjacent versions (i == j).
	      (lookup_coverage(versions_a[i - 1], row_version, data) / scale).round(3).to_s
	    end
	  end
	  [row_version, *cells]
	end
	# Bottom row: empty corner cell followed by the column labels.
	matrix << ['', *versions_a]

	CSV.open("processed.csv", "wb") do |csv|
	  matrix.each { |row| csv << row }
	end
	#!/bin/bash
	# Restored copy of script.sh: pipes are real pipes again (they were escaped
	# as "\|", which passes a literal "|" argument) and nesting is re-indented.
	# Every line of this copy carries one extra leading tab.

	# Root of the ccfinder-src checkout; the tool binaries are in ubuntu32/.
	BASE_DIR=/home/ubuntu/ccfinder-src

	CCFX="$BASE_DIR/ubuntu32/ccfx"
	PICOSEL="$BASE_DIR/ubuntu32/picosel"

	# Directory whose entries are the versions to compare.
	SOURCE="$BASE_DIR/source/source"

	OUTPUT=output
	TEMP_CCFX="$OUTPUT/ccfx"
	TEMP_METRICS="$OUTPUT/metrics"
	TEMP_RESULTS="$OUTPUT/results"

	VERSIONS=$(ls "$SOURCE")

	# Book keeping
	mkdir -p "$TEMP_CCFX" "$TEMP_METRICS" "$TEMP_RESULTS"

	# Get the sum of the file lengths per version. An associative array keeps
	# the version name a plain string key; an indexed array would evaluate it
	# as an arithmetic expression and fail for names such as "08" or "1.2".
	declare -A version_loc

	for version in $VERSIONS
	do
	  version_loc[$version]=$( (find "$SOURCE/$version" -name '*.java' -print0 | xargs -0 cat) | wc -l )
	done

	# Execute analysis per version pair: one "A,B,coverage" record for every
	# pair with A < B (integer comparison).
	echo 'VersionA,VersionB,coverage' > results.csv
	for versionA in $VERSIONS
	do
	  for versionB in $VERSIONS
	  do
	    if [ "$versionA" -ge "$versionB" ]
	    then
	      continue
	    fi

	    echo "Analyzing $versionA versus $versionB"
	    pair="$versionA-$versionB"

	    # The chain stops at the first failing tool so that no record is
	    # written from missing or partial output.
	    if ! {
	      "$CCFX" d java -w f-w-g+ -b 50 -dn "$SOURCE/$versionA" -is -dn "$SOURCE/$versionB" -o "$TEMP_CCFX/$pair" &&
	      "$CCFX" m "$TEMP_CCFX/$pair.ccfxd" -c -o "$TEMP_METRICS/clone-$pair.tsv" &&
	      "$PICOSEL" -o "$TEMP_RESULTS/ids-$pair.txt" from "$TEMP_METRICS/clone-$pair.tsv" select CID where RNR .gt. 0.5 &&
	      "$CCFX" s "$TEMP_CCFX/$pair.ccfxd" -o "$TEMP_CCFX/filtered-$pair" -ci "$TEMP_RESULTS/ids-$pair.txt" &&
	      "$CCFX" m "$TEMP_CCFX/filtered-$pair.ccfxd" -w -o "$TEMP_RESULTS/clone-$pair.tsv"
	    }
	    then
	      echo "Analysis of $pair failed; no record written" >&2
	      continue
	    fi

	    # "CLOC+0" prints 0 instead of an empty string when there are no data
	    # rows, which would otherwise make the bc expression a syntax error.
	    loc_clone=$(awk 'NR>1{CLOC+=$4} END {print CLOC+0}' "$TEMP_RESULTS/clone-$pair.tsv")
	    coverage=$(echo "scale=3; $loc_clone/(${version_loc[$versionA]} + ${version_loc[$versionB]})" | bc -l)

	    echo "$versionA,$versionB,$coverage" >> results.csv

	    # Clean up (the filtered metrics in $TEMP_RESULTS/clone-*.tsv are kept).
	    rm -f "$TEMP_CCFX/filtered-$pair.ccfxd" "$TEMP_CCFX/$pair.ccfxd"
	    rm -f "$TEMP_METRICS/clone-$pair.tsv"
	    rm -f "$TEMP_RESULTS/ids-$pair.txt"

	    rm -f "$OUTPUT"/*.tmp
	  done
	done