# vrthra/checkcor-rahul.py

## checkcor-rahul.py
import csv
import sys
import scipy
import scipy.stats

# Projects from data/subjects.csv that are kept for the analysis: a project
# needs a positive `total.mutants` and a `commit.recent` value greater than
# its `commit.initial` value.
okProjects = []

with open("data/subjects.csv") as csvfile:
    pdata = csv.DictReader(csvfile)
    for row in pdata:
        if int(row['total.mutants']) <= 0:
            continue
        recent, initial = row['commit.recent'], row['commit.initial']
        try:
            # This used to read int(recent <= initial): the int() wrapped
            # the comparison instead of the operands, so the raw CSV
            # strings were compared lexicographically ("9" > "10").
            recent, initial = int(recent), int(initial)
        except ValueError:
            # Not integers: keep the previous string ordering instead of
            # aborting a run that used to load this file.
            pass
        if recent <= initial:
            continue
        okProjects.append(row['project'])

# Collected from data/results.mutants.uniq.csv, accepted projects only:
#   ot_others - fullname -> 1 for every method with a record flagged 'OT'
#   my_result - one {'fullname', 'mid', 'commitid'} dict per other record
ot_others = {}
my_result = []

with open('data/results.mutants.uniq.csv') as csvfile:
    data = csv.DictReader(csvfile)
    for row in data:
        if row['project'] in okProjects:
            commitid = row['commitid']
            # Mutant id layout: project,file:class.method:line:mutant:index
            mid = "".join([
                row['project'], ",", row["file"], ":",
                row['class'], ".", row['method'], ":", row['line'], ":",
                row['mutant'], ":", row['index'],
            ])
            # Method id layout: project:class.method
            fullname = "".join(
                [row['project'], ":", row['class'], ".", row['method']])
            if row['isbug'] != 'OT':
                my_result.append(
                    {'fullname': fullname, 'mid': mid, 'commitid': commitid})
            else:
                ot_others[fullname] = 1

# checked: build/my.result.csv
#for i in my_result:
#    print i['fullname'], i['mid'], i['commitid']

# what we want: we want to get each method (given in fullname), and for
# each method, compute the number of bugfixes and total mutants.

# Group the records by method: each fullname maps to the set of distinct
# mutant ids ('mutants') and the set of distinct commit ids ('bugfix').
my_methods = {}
for result in my_result:
    entry = my_methods.setdefault(
        result['fullname'], {'mutants': set(), 'bugfix': set()})
    entry['mutants'].add(result['mid'])
    entry['bugfix'].add(result['commitid'])

sys.stdout.write("data/results.mutants.uniq.csv %d\n" % len(my_methods))

# checked: build/method.res.summary.csv
# One method fullname per line.
with open("fullname:results.mutants.uniq.csv", 'w') as f:
    f.writelines(key + "\n" for key in my_methods)

# Line counts per method from processed/method.uniq.csv. Rows of projects
# that were not accepted, rows with an empty CountLineCode and rows whose
# CountLineCode is 0 are skipped.
# remember, some of the methods are overloaded: they share one fullname, so
# in `methods` the row read last replaces the earlier ones.
method = []
with open('processed/method.uniq.csv') as csvfile:
    for row in csv.DictReader(csvfile):
        if row['projectname'] in okProjects and row['CountLineCode'] != '':
            loc = int(row['CountLineCode'])
            if loc != 0:
                method.append({
                    'methodloc': loc,
                    'fullname': row['projectname'] + ":" + row['Name'],
                })

methods = {}
for entry in method:
    methods[entry['fullname']] = entry['methodloc']

sys.stdout.write("processed/method.uniq.csv: %d\n" % len(methods))

# checked: build/method.csv
# One method fullname per line.
with open("fullname:method.uniq.csv", 'w') as f:
    f.writelines(name + "\n" for name in methods)
# for i in methods.keys():
#     print i, methods[i]

# What do we want? We have my_methods with fullname as key, and we have
# methods (from processed/method.uniq.csv) with fullname as key. Given
# that they are not the exact same data set, we can either pick
# processed/method.uniq.csv as the ground truth and fill in the bug fix
# count from data/results.mutants.uniq.csv, using `0` for those methods
# that have no entry in `data/results.mutants.uniq.csv` (because it is
# the exact same project, and `understand` is run at epoch), or we can
# look only at the intersection of method names between
# `method.uniq.csv` and `results.mutants.uniq.csv`. The reason the
# second may be preferable is that some of the methods do not produce a
# mutant, and hence may show no commits even when there are commits, if
# no commit overlapped with any mutant (because
# `results.mutants.uniq.csv` is primarily a mutant record db).
# On the other hand, the first is plausible, and it does not bias the
# results by leaving out the zero-commit methods.


# Choice 1. Include zero commit lines.
#  - But we do not know if any commits are omitted.
# Every method with a line count gets a row; methods without mutant
# records get 0 for both 'bugfix' and 'mutants'.
joined_method = {}
for name in methods:
    seen = my_methods.get(name)
    joined_method[name] = {
        'bugfix': 0 if seen is None else len(seen['bugfix']),
        'mutants': 0 if seen is None else len(seen['mutants']),
        'loc': methods[name],
    }

# Choice 2. Discard zero commits.
#  - Only methods that have mutant records get a row.
#  - Methods that have neither mutant records nor an 'OT' record are
#    collected in `err` and written to py.data.err.
err = []
joined_method2 = {}
for name in methods:
    if name in my_methods:
        found = my_methods[name]
        joined_method2[name] = {
            'bugfix': len(found['bugfix']),
            'mutants': len(found['mutants']),
            'loc': methods[name],
        }
    elif name not in ot_others:
        # does not have a mutant on a commit.
        err.append(name)


sys.stdout.write("matching %d\n" % len(joined_method2))


with open("py.data.err", 'w') as f:
    f.writelines(name + "\n" for name in err)

# checked:
# for i in joined_method.keys():
#    print i, joined_method[i]['loc'], joined_method[i]['bugfix'], joined_method[i]['mutants']

def _report_choice(title, joined, csv_path):
    """Print the correlation summary for one joined table and dump it.

    title    -- heading printed before the figures.
    joined   -- dict mapping a method fullname to a dict with the integer
                entries 'mutants', 'bugfix' and 'loc'.
    csv_path -- file that receives a "key,mutants,bugfix,loc" header and
                one row per method; use it to check the R^ and Tau values
                outside of Python.

    Returns the (mutant counts, bugfix counts, loc) lists, all in the same
    method order.
    """
    names = list(joined.keys())
    mutant_counts = [joined[n]['mutants'] for n in names]
    bugs = [joined[n]['bugfix'] for n in names]
    loc = [joined[n]['loc'] for n in names]

    out = sys.stdout.write
    out(title + "\n")
    out("Mutant count correlations:\n")
    out("B~Mut Tau: %s\n" % (scipy.stats.kendalltau(bugs, mutant_counts),))
    out("B~Loc Tau: %s\n" % (scipy.stats.kendalltau(bugs, loc),))
    # NOTE: pearsonr returns (r, p-value), not r^2. The labels are kept
    # as they were so the output stays comparable with earlier runs.
    out("B~Mut r^2: %s\n" % (scipy.stats.pearsonr(bugs, mutant_counts),))
    out("B~Loc r^2: %s\n" % (scipy.stats.pearsonr(bugs, loc),))

    with open(csv_path, 'w') as f:
        # csv.writer quotes a key that contains a comma; a plain
        # ",".join would silently produce a row with too many columns.
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["key", "mutants", "bugfix", "loc"])
        for record in zip(names, mutant_counts, bugs, loc):
            writer.writerow(record)
    return mutant_counts, bugs, loc


allMutantCounts, allBugs, allLoc = _report_choice(
    "Choice 1", joined_method, "py.data.csv")
sys.stdout.write("\n")

joined_method = joined_method2
allMutantCounts, allBugs, allLoc = _report_choice(
    "Choice 2", joined_method, "py.data2.csv")
	# NOTE(review): second copy of the analysis script starts here.
# Everything below repeats the script found earlier in this file. This copy
# had every line indented by exactly one tab, which erased the nesting and
# made the file unparseable. The block structure is restored here. If both
# copies are kept, the analysis simply runs twice and rewrites the same
# output files; consider deleting one of them.
import csv
import sys
import scipy
import scipy.stats

# Projects kept for the analysis: positive `total.mutants` and a
# `commit.recent` value greater than `commit.initial`.
okProjects = []

with open("data/subjects.csv") as csvfile:
    pdata = csv.DictReader(csvfile)
    for row in pdata:
        if int(row['total.mutants']) <= 0:
            continue
        recent, initial = row['commit.recent'], row['commit.initial']
        try:
            # This used to read int(recent <= initial), which compared the
            # raw CSV strings lexicographically. Compare integers instead.
            recent, initial = int(recent), int(initial)
        except ValueError:
            # Not integers: keep the previous string ordering.
            pass
        if recent <= initial:
            continue
        okProjects.append(row['project'])

# ot_others: fullname -> 1 for methods with a record flagged 'OT'.
# my_result: one dict per remaining record of an accepted project.
ot_others = {}
my_result = []

with open('data/results.mutants.uniq.csv') as csvfile:
    data = csv.DictReader(csvfile)

    for row in data:
        if row['project'] not in okProjects:
            continue
        commitid = row['commitid']
        mid = row['project'] + "," + \
              row["file"] + ":" + \
              row['class'] + "." + row['method'] + ":" + row['line'] + ":" + \
              row['mutant'] + ":" + row['index']
        fullname = row['project'] + ":" + row['class'] + "." + row['method']
        if row['isbug'] == 'OT':
            ot_others[fullname] = 1
            continue
        my_result.append(
            {'fullname': fullname, 'mid': mid, 'commitid': commitid})

# checked: build/my.result.csv

# what we want: we want to get each method (given in fullname), and for
# each method, compute the number of bugfixes and total mutants.

my_methods = {}
for result in my_result:
    method = result['fullname']
    mid = result['mid']
    commitid = result['commitid']
    if method not in my_methods:
        my_methods[method] = {'mutants': set(), 'bugfix': set()}
    my_methods[method]['mutants'].add(mid)
    my_methods[method]['bugfix'].add(commitid)

sys.stdout.write("data/results.mutants.uniq.csv %d\n" % len(my_methods))

# checked: build/method.res.summary.csv
with open("fullname:results.mutants.uniq.csv", 'w') as f:
    for key in my_methods.keys():
        f.write(key + "\n")

# remember, some of the methods are overloaded: they share one fullname, so
# in `methods` the row read last replaces the earlier ones.
method = []
with open('processed/method.uniq.csv') as csvfile:
    data = csv.DictReader(csvfile)

    for row in data:
        if row['projectname'] not in okProjects:
            continue
        m = {}
        l = row['CountLineCode']
        if l == '':
            continue
        loc = int(l)
        if loc == 0:
            continue
        m['methodloc'] = loc
        m['fullname'] = row['projectname'] + ":" + row['Name']
        method.append(m)

methods = {}
for m in method:
    fullname = m['fullname']
    methods[fullname] = m['methodloc']

sys.stdout.write("processed/method.uniq.csv: %d\n" % len(methods))

# checked: build/method.csv
with open("fullname:method.uniq.csv", 'w') as f:
    for key in methods.keys():
        f.write(key + "\n")

# What do we want? We have my_methods with fullname as key, and we have
# methods (from processed/method.uniq.csv) with fullname as key. Given
# that they are not the exact same data set, we can either pick
# processed/method.uniq.csv as the ground truth and fill in the bug fix
# count from data/results.mutants.uniq.csv, using `0` for those methods
# that have no entry in `data/results.mutants.uniq.csv` (because it is
# the exact same project, and `understand` is run at epoch), or we can
# look only at the intersection of method names between
# `method.uniq.csv` and `results.mutants.uniq.csv`. The reason the
# second may be preferable is that some of the methods do not produce a
# mutant, and hence may show no commits even when there are commits, if
# no commit overlapped with any mutant (because
# `results.mutants.uniq.csv` is primarily a mutant record db).
# On the other hand, the first is plausible, and it does not bias the
# results by leaving out the zero-commit methods.


# Choice 1. Include zero commit lines.
# - But we do not know if any commits are omitted.
joined_method = {}
for m in methods.keys():
    v = {}
    if m in my_methods:
        v['bugfix'] = len(my_methods[m]['bugfix'])
        v['mutants'] = len(my_methods[m]['mutants'])
    else:
        v['bugfix'] = 0
        v['mutants'] = 0
    v['loc'] = methods[m]
    joined_method[m] = v

# Choice 2. Discard zero commits.
# - Only methods that have mutant records get a row.
err = []
joined_method2 = {}
for m in methods.keys():
    v = {}
    if m not in my_methods:
        if m not in ot_others:
            # does not have a mutant on a commit.
            err.append(m)
        continue
    v['bugfix'] = len(my_methods[m]['bugfix'])
    v['mutants'] = len(my_methods[m]['mutants'])
    v['loc'] = methods[m]
    joined_method2[m] = v


sys.stdout.write("matching %d\n" % len(joined_method2))


with open("py.data.err", 'w') as f:
    for e in err:
        f.write(e + "\n")


def _report_choice(title, joined, csv_path):
    """Print the correlation summary for one joined table and dump it.

    title    -- heading printed before the figures.
    joined   -- dict mapping a method fullname to a dict with the integer
                entries 'mutants', 'bugfix' and 'loc'.
    csv_path -- file that receives a "key,mutants,bugfix,loc" header and
                one row per method; use it to check the R^ and Tau values
                outside of Python.

    Returns the (mutant counts, bugfix counts, loc) lists, all in the same
    method order.
    """
    names = list(joined.keys())
    mutant_counts = [joined[n]['mutants'] for n in names]
    bugs = [joined[n]['bugfix'] for n in names]
    loc = [joined[n]['loc'] for n in names]

    out = sys.stdout.write
    out(title + "\n")
    out("Mutant count correlations:\n")
    out("B~Mut Tau: %s\n" % (scipy.stats.kendalltau(bugs, mutant_counts),))
    out("B~Loc Tau: %s\n" % (scipy.stats.kendalltau(bugs, loc),))
    # NOTE: pearsonr returns (r, p-value), not r^2. The labels are kept
    # as they were so the output stays comparable with earlier runs.
    out("B~Mut r^2: %s\n" % (scipy.stats.pearsonr(bugs, mutant_counts),))
    out("B~Loc r^2: %s\n" % (scipy.stats.pearsonr(bugs, loc),))

    with open(csv_path, 'w') as f:
        # csv.writer quotes a key that contains a comma; a plain
        # ",".join would silently produce a row with too many columns.
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["key", "mutants", "bugfix", "loc"])
        for record in zip(names, mutant_counts, bugs, loc):
            writer.writerow(record)
    return mutant_counts, bugs, loc


allMutantCounts, allBugs, allLoc = _report_choice(
    "Choice 1", joined_method, "py.data.csv")
sys.stdout.write("\n")

joined_method = joined_method2
allMutantCounts, allBugs, allLoc = _report_choice(
    "Choice 2", joined_method, "py.data2.csv")