Created
March 17, 2014 10:26
-
-
Save timvdalen/9597034 to your computer and use it in GitHub Desktop.
Scripts for 2IS55 Assignment 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Locations of the clone-detection tool checkout and the two binaries used below.
BASE_DIR=/home/ubuntu/ccfinder-src
SCRATCH_SPACE=/media/output
CCFX=${BASE_DIR}/ubuntu32/ccfx
PICOSEL=${BASE_DIR}/ubuntu32/picosel

# Directory containing one sub-directory per version to analyse.
SOURCE=/media/output/source

# Working directories for intermediate files (relative to the current directory).
TEMP=temp
TEMP_CCFX=$(TEMP)/ccfx
TEMP_METRICS=$(TEMP)/metrics
TEMP_RESULTS=$(TEMP)/results

# Simply expanded (:=) so that `ls` runs once when the Makefile is read,
# instead of once for every expansion of a variable derived from VERSIONS.
VERSIONS:=$(shell ls ${SOURCE})

# loc/<version>.csv for every version.
LOCS=$(addsuffix .csv,$(addprefix loc/,$(VERSIONS)))

# Every ordered combination <a>-<b> of versions; the coverage rule itself
# skips the combinations where a is not numerically smaller than b.
VERSIONPAIRS=$(foreach a,$(VERSIONS),$(foreach b,$(VERSIONS), $a-$b))
COVERAGES=$(addsuffix .csv,$(addprefix coverage/,$(VERSIONPAIRS)))
# Default goal: build the combined coverage table.
# Declared phony so a stray file named `all` cannot mask it.
.PHONY: all
all: coverages.csv
# Concatenate the per-version line counts into one table.
# Only the prerequisites ($^) are read, so leftover loc/*.csv files of versions
# that are no longer present in SOURCE are not mixed in. /dev/null keeps cat
# from waiting on stdin when there are no versions at all.
locs.csv: $(LOCS)
	cat /dev/null $^ > $@
# loc/<version>.csv holds one line "<version>,<lines>", where <lines> is the
# total line count of all *.java files below ${SOURCE}/<version>.
# The pattern stem ($*) is the version, so no make variable has to be set
# from inside the recipe.
loc/%.csv:
	mkdir -p $(@D)
	echo $*,$$(find ${SOURCE}/$* -name '*.java' -print0 | xargs -0 cat | wc -l) > $@
# Concatenate all per-pair results into one table.
# The glob is used on purpose instead of $^: the coverage rule creates no file
# for pairs whose first version is not smaller than the second, so $^ would
# name files that do not exist.
coverages.csv: $(COVERAGES)
	cat coverage/*.csv > $@
# Create the working directories for intermediate files.
# Phony: the target names an action, not a file.
.PHONY: tempfolders
tempfolders:
	mkdir -p ${TEMP_CCFX} ${TEMP_METRICS} ${TEMP_RESULTS}
# coverage/<A>-<B>.csv holds one line "<A>,<B>,<coverage>" for a version pair.
#
# The stem ($*) is "<A>-<B>". The pair is only analysed when A is numerically
# smaller than B; for every other combination the recipe does nothing and no
# file is created.
#
# Steps for an analysed pair:
#   1. run ccfx on both version directories, compute clone metrics, let picosel
#      select the clone IDs whose RNR is greater than 0.5, and compute the
#      metrics again for that filtered clone set;
#   2. sum column 4 of the filtered metrics (the CLOC metric) and divide it by
#      the sum of both versions' line counts taken from locs.csv;
#   3. remove the intermediate files.
#
# The versions and line counts live in shell variables ($$name), which are
# evaluated when the recipe runs and are private to this one recipe invocation.
# The line counts are looked up by exact match on the first CSV field, so
# version 1 cannot match the row of version 10 or a row whose count contains
# a 1. "CLOC+0" makes awk print 0 instead of an empty string when there are
# no clone rows, so bc always receives a number.
#
# tempfolders is an order-only prerequisite (after the |): the directories are
# created first, but the never-existing tempfolders target does not force
# already built coverage files to be rebuilt.
coverage/%.csv: locs.csv | tempfolders
	versionA=$$(echo '$*' | cut -f1 -d-) ; \
	versionB=$$(echo '$*' | cut -f2 -d-) ; \
	if [ "$$versionA" -lt "$$versionB" ] ; then \
		mkdir -p $(@D) ; \
		${CCFX} d java -w f-w-g+ -b 50 -dn ${SOURCE}/$$versionA -is -dn ${SOURCE}/$$versionB -o ${TEMP_CCFX}/$* ; \
		${CCFX} m ${TEMP_CCFX}/$*.ccfxd -c -o ${TEMP_METRICS}/clone-$*.tsv ; \
		${PICOSEL} -o ${TEMP_RESULTS}/ids-$*.txt from ${TEMP_METRICS}/clone-$*.tsv select CID where RNR .gt. 0.5 ; \
		${CCFX} s ${TEMP_CCFX}/$*.ccfxd -o ${TEMP_CCFX}/filtered-$* -ci ${TEMP_RESULTS}/ids-$*.txt ; \
		${CCFX} m ${TEMP_CCFX}/filtered-$*.ccfxd -w -o ${TEMP_RESULTS}/clone-$*.tsv ; \
		locA=$$(awk -F, -v v="$$versionA" '($$1 "") == v { print $$2; exit }' locs.csv) ; \
		locB=$$(awk -F, -v v="$$versionB" '($$1 "") == v { print $$2; exit }' locs.csv) ; \
		cloc=$$(awk 'NR>1{CLOC+=$$4} END {print CLOC+0}' ${TEMP_RESULTS}/clone-$*.tsv) ; \
		coverage=$$(echo "scale=3; $$cloc/($$locA + $$locB)" | bc -l) ; \
		echo "$$versionA,$$versionB,$$coverage" > $@ ; \
		rm -f ${TEMP_CCFX}/$*.ccfxd ; \
		rm -f ${TEMP_CCFX}/filtered-$*.ccfxd ; \
		rm -f ${TEMP_METRICS}/clone-$*.tsv ; \
		rm -f ${TEMP_RESULTS}/clone-$*.tsv ; \
		rm -f ${TEMP_RESULTS}/ids-$*.txt ; \
	fi
# Remove every generated file and directory.
# Phony: the target names an action, not a file.
.PHONY: clean
clean:
	rm -rf loc/ coverage/
	rm -f locs.csv coverages.csv
	rm -rf $(TEMP)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'csv'
require 'set'
# Looks up the coverage recorded for a pair of versions.
#
# The pair is order-insensitive: when versionA is numerically larger than
# versionB (compared via to_i) the two are swapped before searching, so the
# smaller version is matched against the first column and the larger one
# against the second column.
#
# @param versionA [String] one version, spelled as in the data
# @param versionB [String] the other version, spelled as in the data
# @param data [Array<Array<String>>] rows of [first_version, second_version, coverage]
# @return [Float] coverage of the first matching row, or 0.0 when no row matches
def lookup_coverage(versionA, versionB, data)
  versionA, versionB = versionB, versionA if versionA.to_i > versionB.to_i
  match = data.find { |row| row[0] == versionA && row[1] == versionB }
  match ? match[2].to_f : 0.0
end
# Turns the pairwise coverage list in results.csv into a matrix and writes it
# to processed.csv.
#
# Layout of processed.csv:
#   * one row per version: the version label followed by one cell per version
#     (columns in the same order as the rows);
#   * cells below the diagonal hold the pair's coverage divided by the largest
#     coverage in the input, rounded to 3 decimals;
#   * cells on and above the diagonal hold 0.0;
#   * a final row holds an empty cell followed by the column labels.

data = CSV.read('results.csv')

# The shell script that produces results.csv writes this header line first;
# without dropping it, "VersionA" and "VersionB" would be treated as versions.
data.shift if data.first == %w[VersionA VersionB coverage]

# Largest coverage value, used to normalize every cell (0.0 for empty input).
max = data.map { |row| row[2].to_f }.max.to_f

# Unique versions in numeric order. A plain sort is used instead of SortedSet,
# which is no longer part of the set library since Ruby 3.0. The string itself
# is the tie-breaker so the order stays deterministic.
versions_a = data.flat_map { |row| [row[0], row[1]] }
                 .compact
                 .uniq
                 .sort_by { |version| [version.to_i, version] }

matrix = versions_a.each_with_index.map do |row_version, row_index|
  cells = versions_a.each_with_index.map do |column_version, column_index|
    if column_index >= row_index
      # Diagonal and upper triangle stay empty. Comparing the version indices
      # directly keeps the cell of two adjacent versions in the lower triangle.
      0.0
    else
      coverage = lookup_coverage(column_version, row_version, data)
      # Guard the division: a non-positive maximum means there is nothing to
      # normalize against.
      normalized = max.positive? ? coverage / max : 0.0
      normalized.round(3).to_s
    end
  end
  [row_version, *cells]
end

# Bottom row: column labels under an empty corner cell.
matrix << ['', *versions_a]

CSV.open('processed.csv', 'wb') do |csv|
  matrix.each { |row| csv << row }
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Computes the clone coverage for every pair of versions found in $SOURCE and
# writes one "VersionA,VersionB,coverage" line per pair to results.csv.
# Only pairs with versionA numerically smaller than versionB are analysed.
# Requires bash 4 or newer (associative array).

BASE_DIR=/home/ubuntu/ccfinder-src
CCFX=$BASE_DIR/ubuntu32/ccfx
PICOSEL=$BASE_DIR/ubuntu32/picosel
SOURCE=$BASE_DIR/source/source
OUTPUT=output
TEMP_CCFX=$OUTPUT/ccfx
TEMP_METRICS=$OUTPUT/metrics
TEMP_RESULTS=$OUTPUT/results
VERSIONS=$(ls "$SOURCE")

# Book keeping
mkdir -p "$TEMP_CCFX" "$TEMP_METRICS" "$TEMP_RESULTS"

# Get the sum of the file lengths per version.
# Associative array: the version is used as a plain string key, so names such
# as 08 are not run through arithmetic evaluation as an indexed array would do.
declare -A version_loc
for version in $VERSIONS
do
    version_loc[$version]=$(find "$SOURCE/$version" -name '*.java' -print0 | xargs -0 cat | wc -l)
done

# Execute analysis per version pair
echo 'VersionA,VersionB,coverage' > results.csv
for versionA in $VERSIONS
do
    for versionB in $VERSIONS
    do
        if [ "$versionA" -ge "$versionB" ]
        then
            continue
        fi

        pair=$versionA-$versionB
        echo "Analyzing $versionA versus $versionB"

        # Detect clones between the two versions, compute clone metrics, keep
        # the clone IDs whose RNR is greater than 0.5, and compute the metrics
        # again for that filtered clone set.
        "$CCFX" d java -w f-w-g+ -b 50 -dn "$SOURCE/$versionA" -is -dn "$SOURCE/$versionB" -o "$TEMP_CCFX/$pair"
        "$CCFX" m "$TEMP_CCFX/$pair.ccfxd" -c -o "$TEMP_METRICS/clone-$pair.tsv"
        "$PICOSEL" -o "$TEMP_RESULTS/ids-$pair.txt" from "$TEMP_METRICS/clone-$pair.tsv" select CID where RNR .gt. 0.5
        "$CCFX" s "$TEMP_CCFX/$pair.ccfxd" -o "$TEMP_CCFX/filtered-$pair" -ci "$TEMP_RESULTS/ids-$pair.txt"
        "$CCFX" m "$TEMP_CCFX/filtered-$pair.ccfxd" -w -o "$TEMP_RESULTS/clone-$pair.tsv"

        # Sum column 4 of the filtered metrics, skipping the header line.
        # "+0" makes awk print 0 instead of an empty string when there are no
        # rows, so bc always receives a number.
        loc_clone=$(awk 'NR>1{CLOC+=$4} END {print CLOC+0}' "$TEMP_RESULTS/clone-$pair.tsv")
        coverage=$(echo "scale=3; $loc_clone/(${version_loc[$versionA]} + ${version_loc[$versionB]})" | bc -l)
        echo "$versionA,$versionB,$coverage" >> results.csv

        # Clean up (-f: a file that was never produced is not an error here)
        rm -f "$TEMP_CCFX/filtered-$pair.ccfxd" "$TEMP_CCFX/$pair.ccfxd"
        rm -f "$TEMP_METRICS/clone-$pair.tsv"
        rm -f "$TEMP_RESULTS/ids-$pair.txt"
        rm -f "$OUTPUT"/*.tmp
    done
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment