Created
August 21, 2016 00:24
-
-
Save vrthra/17643e92a812dfd3835e8e4208385dcb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sys | |
import scipy | |
import scipy.stats | |
# Select the projects to analyse from data/subjects.csv. A project is kept
# only when its `total.mutants` value is positive and its `commit.recent`
# value is greater than its `commit.initial` value.
okProjects = []
with open("data/subjects.csv") as csvfile:
    for row in csv.DictReader(csvfile):
        if int(row['total.mutants']) <= 0:
            continue
        # Convert each side before comparing. The earlier form,
        # int(recent <= initial), compared the raw CSV strings
        # lexicographically (so "9" sorted after "10") and only converted
        # the resulting boolean.
        # NOTE(review): assumes both columns hold integers -- confirm
        # against the CSV.
        if int(row['commit.recent']) <= int(row['commit.initial']):
            continue
        okProjects.append(row['project'])
# Read the per-mutant records for the selected projects and split them:
#   * rows whose `isbug` column is 'OT' only mark their method in `ot_others`;
#   * every other row is appended to `my_result` as a dict holding the
#     method's full name, a composed mutant id and the row's commit id.
ot_others = {}
my_result = []
# Build the lookup once: testing membership on the list would rescan it
# for every CSV row.
ok_project_set = set(okProjects)
with open('data/results.mutants.uniq.csv') as csvfile:
    for row in csv.DictReader(csvfile):
        project = row['project']
        if project not in ok_project_set:
            continue
        commitid = row['commitid']
        qualified = row['class'] + "." + row['method']
        # Mutant id: project,file:class.method:line:mutant:index
        mid = (project + "," +
               row["file"] + ":" +
               qualified + ":" + row['line'] + ":" +
               row['mutant'] + ":" + row['index'])
        fullname = project + ":" + qualified
        if row['isbug'] == 'OT':
            ot_others[fullname] = 1
            continue
        my_result.append({'fullname': fullname, 'mid': mid, 'commitid': commitid})
# checked: build/my.result.csv | |
#for i in my_result: | |
# print i['fullname'], i['mid'], i['commitid'] | |
# what we want: we want to get each method (given in fullname), and for | |
# each method, compute the number of bugfixes and total mutants. | |
# Group the retained rows by method full name. Each entry keeps two sets so
# that repeated mutant ids / commit ids are counted once:
#   my_methods[fullname] = {'mutants': set of mids, 'bugfix': set of commitids}
my_methods = {}
for record in my_result:
    name = record['fullname']
    entry = my_methods.get(name)
    if entry is None:
        # First time this method is seen: start with empty sets.
        entry = {'mutants': set(), 'bugfix': set()}
        my_methods[name] = entry
    entry['mutants'].add(record['mid'])
    entry['bugfix'].add(record['commitid'])
# Report how many distinct methods appear in the mutant results, then write
# their full names, one per line, for inspection.
print(" ".join(["data/results.mutants.uniq.csv", str(len(my_methods))]))
# checked: build/method.res.summary.csv
with open("fullname:results.mutants.uniq.csv", 'w') as f:
    f.writelines(name + "\n" for name in my_methods)
#print f, key, len(my_methods[key]['mutants']), len(my_methods[key]['bugfix']) | |
# remember, some of the methods are overloaded. strip them out. | |
# Load per-method line counts from processed/method.uniq.csv for the selected
# projects. Rows with an empty or zero `CountLineCode` are skipped. Each
# kept row becomes {'methodloc': <int>, 'fullname': '<projectname>:<Name>'}.
method = []
# Build the lookup once: testing membership on the list would rescan it
# for every CSV row.
ok_project_set = set(okProjects)
with open('processed/method.uniq.csv') as csvfile:
    for row in csv.DictReader(csvfile):
        if row['projectname'] not in ok_project_set:
            continue
        raw_loc = row['CountLineCode']
        if raw_loc == '':
            continue
        loc = int(raw_loc)
        if loc == 0:
            continue
        method.append({
            'methodloc': loc,
            'fullname': row['projectname'] + ":" + row['Name'],
        })
# Index the line counts by method full name. When several rows share the
# same full name, the one read last wins.
methods = {entry['fullname']: entry['methodloc'] for entry in method}
# Report how many distinct methods have a line count, then write their full
# names, one per line, for inspection.
print(" ".join(["processed/method.uniq.csv:", str(len(methods))]))
# checked: build/method.csv
with open("fullname:method.uniq.csv", 'w') as f:
    f.writelines(name + "\n" for name in methods)
# for i in methods.keys(): | |
# print i, methods[i] | |
# What do we want? We have my_methods with fullname as key, and we have
# my_result with fullname. Given that it is not the exact same
# data set, we can either pick processed/method.uniq.csv as the
# ground truth, and fill in the bug fix count from
# data/results.mutants.uniq.csv, filling in `0` for those methods
# that have no entry in `data/results.mutants.uniq.csv` (because
# it is the exact same project, and `understand` is run at epoch),
# or we can look only at the intersection of method names
# between `method.uniq.csv` and `results.mutants.uniq.csv`. The reason
# the second may be preferable is that some of the methods do not
# produce a mutation, and hence may show no commits even when there
# are commits, if those commits did not overlap with any mutants (because
# `results.mutants.uniq.csv` is primarily a mutant record db).
# On the other hand, the first is plausible, and it does not bias the
# results by leaving out the methods with zero commits.
# Choice 1. Include zero commit lines.
# - But we do not know if any commits are omitted.
# Choice 1: one entry per method that has a line count. Methods with no
# mutant record get zero bug fixes and zero mutants.
#   joined_method[fullname] = {'bugfix': int, 'mutants': int, 'loc': int}
joined_method = {}
for name, loc in methods.items():
    seen = my_methods.get(name)
    has_record = seen is not None
    joined_method[name] = {
        'bugfix': len(seen['bugfix']) if has_record else 0,
        'mutants': len(seen['mutants']) if has_record else 0,
        'loc': loc,
    }
# Choice 2. Discard zero commits. | |
# - But we discard the zero commits. | |
# Choice 2: keep only the methods that have both a line count and at least
# one retained mutant record. Methods without a record are skipped; those
# that are not listed in `ot_others` either are collected in `err`.
err = []
joined_method2 = {}
for name, loc in methods.items():
    seen = my_methods.get(name)
    if seen is None:
        if name not in ot_others:
            # does not have a mutant on a commit.
            err.append(name)
        continue
    joined_method2[name] = {
        'bugfix': len(seen['bugfix']),
        'mutants': len(seen['mutants']),
        'loc': loc,
    }
# Report how many methods matched under Choice 2 and write the unmatched
# method names, one per line, to py.data.err.
print(" ".join(["matching", str(len(joined_method2))]))
with open("py.data.err", 'w') as f:
    f.writelines(name + "\n" for name in err)
# checked: | |
# for i in joined_method.keys(): | |
# print i, joined_method[i]['loc'], joined_method[i]['bugfix'], joined_method[i]['mutants'] | |
# Choice 1 statistics: correlate the per-method bug-fix counts with the
# mutant counts and with the line counts.
print("Choice 1")
choice_rows = list(joined_method.values())
allMutantCounts = [rec['mutants'] for rec in choice_rows]
allBugs = [rec['bugfix'] for rec in choice_rows]
allLoc = [rec['loc'] for rec in choice_rows]
print("Mutant count correlations:")
# NOTE: scipy.stats.pearsonr reports the correlation coefficient r (with a
# p-value), not r squared; the "r^2" labels are kept unchanged so the output
# stays the same.
for label, correlate, other in (
        ("B~Mut Tau:", scipy.stats.kendalltau, allMutantCounts),
        ("B~Loc Tau:", scipy.stats.kendalltau, allLoc),
        ("B~Mut r^2:", scipy.stats.pearsonr, allMutantCounts),
        ("B~Loc r^2:", scipy.stats.pearsonr, allLoc)):
    print(" ".join([label, str(correlate(allBugs, other))]))
print("")
# check the R^ and Tau values using this. | |
# Write the current `joined_method` table to py.data.csv, one method per
# line. Fields are joined with plain commas; no CSV quoting is applied.
with open("py.data.csv", 'w') as f:
    f.write("key,mutants,bugfix,loc\n")
    for name, rec in joined_method.items():
        fields = [name, str(rec['mutants']), str(rec['bugfix']), str(rec['loc'])]
        f.write(",".join(fields) + "\n")
# Choice 2 statistics: from here on `joined_method` refers to the table that
# only holds methods with a mutant record; the same correlations are then
# recomputed on it.
joined_method = joined_method2
print("Choice 2")
choice_rows = list(joined_method.values())
allMutantCounts = [rec['mutants'] for rec in choice_rows]
allBugs = [rec['bugfix'] for rec in choice_rows]
allLoc = [rec['loc'] for rec in choice_rows]
print("Mutant count correlations:")
# NOTE: scipy.stats.pearsonr reports the correlation coefficient r (with a
# p-value), not r squared; the "r^2" labels are kept unchanged so the output
# stays the same.
for label, correlate, other in (
        ("B~Mut Tau:", scipy.stats.kendalltau, allMutantCounts),
        ("B~Loc Tau:", scipy.stats.kendalltau, allLoc),
        ("B~Mut r^2:", scipy.stats.pearsonr, allMutantCounts),
        ("B~Loc r^2:", scipy.stats.pearsonr, allLoc)):
    print(" ".join([label, str(correlate(allBugs, other))]))
# check the R^ and Tau values using this. | |
# Write the current `joined_method` table to py.data2.csv, one method per
# line. Fields are joined with plain commas; no CSV quoting is applied.
with open("py.data2.csv", 'w') as f:
    f.write("key,mutants,bugfix,loc\n")
    for name, rec in joined_method.items():
        fields = [name, str(rec['mutants']), str(rec['bugfix']), str(rec['loc'])]
        f.write(",".join(fields) + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment