wmvanvliet/checker.py

## checker.py
from __future__ import print_function

import os.path as op
import inspect
import distance
import numpy as np
from scipy.misc import comb
import progressbar as pb
import progressbar.widgets as pw

import mne

# Docstrings extracted from the python files, keyed by a descriptive prefix
docstrings = {}

def add_docstring(prefix, f):
    """Store the docstring of `f` in the global `docstrings` mapping.

    Parameters
    ----------
    prefix : str
        Key under which the docstring is stored.
    f : object
        Object whose docstring is retrieved with ``inspect.getdoc``.

    Notes
    -----
    When ``inspect.getdoc`` returns None, the placeholder string
    ``'NONE'`` is stored instead.
    """
    text = inspect.getdoc(f)
    docstrings[prefix] = 'NONE' if text is None else text

def is_interesting(node):
    """Tell whether `node` is a kind of object worth inspecting.

    Returns True when `node` is a function, generator function, method,
    class or module according to the `inspect` predicates, else False.
    """
    predicates = (
        inspect.isfunction,
        inspect.isgeneratorfunction,
        inspect.ismethod,
        inspect.isclass,
        inspect.ismodule,
    )
    return any(check(node) for check in predicates)

# Keys ("<file>:<name>") of the nodes that parse() has already visited
parsed_nodes = set()
def parse(name, node):
    """Recursively obtain docstrings for everything reachable from `node`.

    Only objects whose source file lives inside the ``mne`` package
    directory are considered. Each docstring is recorded through
    ``add_docstring`` under a key of the form ``"<name> (<file>)"``.
    Methods, and plain functions found on a class, are keyed as
    ``"<owner>.<name> (<file>)"`` so that same-named methods of different
    classes in one file do not overwrite each other.

    Parameters
    ----------
    name : str
        Name of `node`; used to qualify member names and to build the
        "already visited" key.
    node : module | class | object
        Object whose members are inspected. Classes and modules found
        among the members are parsed recursively.
    """
    # Trailing separator: a sibling directory such as "mne_x" must not be
    # mistaken for a part of the "mne" package.
    pkg_dir = op.join(op.dirname(mne.__file__), '')

    try:
        members = inspect.getmembers(node, is_interesting)
        f = inspect.getfile(node)
    except Exception:
        # inspect.getfile raises TypeError for built-ins and getmembers may
        # run arbitrary attribute code; skip anything that cannot be
        # inspected, but let KeyboardInterrupt/SystemExit propagate.
        return

    if not f.startswith(pkg_dir):
        return
    key = f + ':' + name
    if key in parsed_nodes:
        return  # already handled; also guards against import cycles
    parsed_nodes.add(key)

    # On Python 3, functions looked up on a class are plain functions, not
    # methods, so remember the owner kind to qualify them correctly.
    owner_is_class = inspect.isclass(node)

    for name_, member in members:
        try:
            member_file = inspect.getfile(member)
        except Exception:
            continue
        if not member_file.startswith(pkg_dir):
            continue

        location = ' (' + member_file + ')'
        if inspect.ismethod(member) or (owner_is_class and
                                        inspect.isfunction(member)):
            add_docstring(name + '.' + name_ + location, member)
        elif inspect.isfunction(member):
            add_docstring(name_ + location, member)
        elif inspect.isclass(member) or inspect.ismodule(member):
            add_docstring(name_ + location, member)
            parse(name_, member)

# Walk the whole mne package, filling the `docstrings` mapping
parse('mne', mne)

# Turn the mapping into a list of (prefix, docstring) pairs ordered by prefix
docstrings = sorted(docstrings.items(), key=lambda item: item[0])

# Save all the extracted docstrings, one "<prefix> <docstring>" entry each
with open('docstrings.txt', 'w') as f:
    f.writelines(prefix + ' ' + text + '\n' for prefix, text in docstrings)

# Strip every whitespace character from the docstrings
docstrings = [(prefix, ''.join(text.split())) for prefix, text in docstrings]

# Keep only the docstrings that are longer than 100 characters
docstrings = [entry for entry in docstrings if len(entry[1]) > 100]

# Progress bar with one tick per unordered pair of docstrings. The last
# widget is a text placeholder for the name of the current function.
pbar_widgets = [pw.Percentage(), '|', pw.ETA(), '|', 'current function']
pbar = pb.ProgressBar(
    maxval=comb(len(docstrings), 2),
    widgets=pbar_widgets,
)

# Pairwise comparison of all long docstrings (takes time...). Only the upper
# triangle of `scores` is filled; every other entry keeps the marker -1.
n_docstrings = len(docstrings)
scores = -np.ones((n_docstrings, n_docstrings), dtype=float)
pbar = pbar.start()
for i, (p1, d1) in enumerate(docstrings[:-1]):
    pbar.widgets[-1] = p1  # Show the current function in the progress bar
    for j, (_, d2) in enumerate(docstrings[i + 1:], i + 1):
        dist = distance.levenshtein(d1, d2, max_dist=500)
        if dist == -1:
            # Presumably -1 means max_dist was exceeded (confirm against
            # the `distance` docs); score the pair as maximally different.
            scores[i, j] = 1.0
        else:
            # Normalize by the longer docstring, capped at 500 characters
            scores[i, j] = float(dist) / min(max(len(d1), len(d2)), 500)
        pbar.update(pbar.currval + 1)
pbar.finish()

# Find candidate duplicate docstrings: pairs that received a score (>= 0)
# lower than 0.25
rows, cols = np.nonzero((scores >= 0) & (scores < 0.25))
similar_docstrings = list(zip(rows, cols))

# Print the score and both prefixes of every candidate pair
for first, second in similar_docstrings:
    print(scores[first, second], docstrings[first][0], docstrings[second][0])
	from __future__ import print_function

	import os.path as op
	import inspect
	import distance
	import numpy as np
	from scipy.misc import comb
	import progressbar as pb
	import progressbar.widgets as pw

	import mne

	# Docstrings extracted from the python files, keyed by a descriptive prefix
	docstrings = {}

	def add_docstring(prefix, f):
	    """Store the docstring of `f` in the global `docstrings` mapping.

	    Parameters
	    ----------
	    prefix : str
	        Key under which the docstring is stored.
	    f : object
	        Object whose docstring is retrieved with ``inspect.getdoc``.

	    Notes
	    -----
	    When ``inspect.getdoc`` returns None, the placeholder string
	    ``'NONE'`` is stored instead.
	    """
	    text = inspect.getdoc(f)
	    docstrings[prefix] = 'NONE' if text is None else text

	def is_interesting(node):
	    """Tell whether `node` is a kind of object worth inspecting.

	    Returns True when `node` is a function, generator function, method,
	    class or module according to the `inspect` predicates, else False.
	    """
	    predicates = (
	        inspect.isfunction,
	        inspect.isgeneratorfunction,
	        inspect.ismethod,
	        inspect.isclass,
	        inspect.ismodule,
	    )
	    return any(check(node) for check in predicates)

	# Keys ("<file>:<name>") of the nodes that parse() has already visited
	parsed_nodes = set()
	def parse(name, node):
	    """Recursively obtain docstrings for everything reachable from `node`.

	    Only objects whose source file lives inside the ``mne`` package
	    directory are considered. Each docstring is recorded through
	    ``add_docstring`` under a key of the form ``"<name> (<file>)"``.
	    Methods, and plain functions found on a class, are keyed as
	    ``"<owner>.<name> (<file>)"`` so that same-named methods of different
	    classes in one file do not overwrite each other.

	    Parameters
	    ----------
	    name : str
	        Name of `node`; used to qualify member names and to build the
	        "already visited" key.
	    node : module | class | object
	        Object whose members are inspected. Classes and modules found
	        among the members are parsed recursively.
	    """
	    # Trailing separator: a sibling directory such as "mne_x" must not be
	    # mistaken for a part of the "mne" package.
	    pkg_dir = op.join(op.dirname(mne.__file__), '')

	    try:
	        members = inspect.getmembers(node, is_interesting)
	        f = inspect.getfile(node)
	    except Exception:
	        # inspect.getfile raises TypeError for built-ins and getmembers may
	        # run arbitrary attribute code; skip anything that cannot be
	        # inspected, but let KeyboardInterrupt/SystemExit propagate.
	        return

	    if not f.startswith(pkg_dir):
	        return
	    key = f + ':' + name
	    if key in parsed_nodes:
	        return  # already handled; also guards against import cycles
	    parsed_nodes.add(key)

	    # On Python 3, functions looked up on a class are plain functions, not
	    # methods, so remember the owner kind to qualify them correctly.
	    owner_is_class = inspect.isclass(node)

	    for name_, member in members:
	        try:
	            member_file = inspect.getfile(member)
	        except Exception:
	            continue
	        if not member_file.startswith(pkg_dir):
	            continue

	        location = ' (' + member_file + ')'
	        if inspect.ismethod(member) or (owner_is_class and
	                                        inspect.isfunction(member)):
	            add_docstring(name + '.' + name_ + location, member)
	        elif inspect.isfunction(member):
	            add_docstring(name_ + location, member)
	        elif inspect.isclass(member) or inspect.ismodule(member):
	            add_docstring(name_ + location, member)
	            parse(name_, member)

	# Walk the whole mne package, filling the `docstrings` mapping
	parse('mne', mne)

	# Turn the mapping into a list of (prefix, docstring) pairs ordered by prefix
	docstrings = sorted(docstrings.items(), key=lambda item: item[0])

	# Save all the extracted docstrings, one "<prefix> <docstring>" entry each
	with open('docstrings.txt', 'w') as f:
	    f.writelines(prefix + ' ' + text + '\n' for prefix, text in docstrings)

	# Strip every whitespace character from the docstrings
	docstrings = [(prefix, ''.join(text.split())) for prefix, text in docstrings]

	# Keep only the docstrings that are longer than 100 characters
	docstrings = [entry for entry in docstrings if len(entry[1]) > 100]

	# Progress bar with one tick per unordered pair of docstrings. The last
	# widget is a text placeholder for the name of the current function.
	# The separators are plain '|' characters (not the escaped '\|').
	pbar_widgets = [pw.Percentage(), '|', pw.ETA(), '|', 'current function']
	pbar = pb.ProgressBar(
	    maxval=comb(len(docstrings), 2),
	    widgets=pbar_widgets,
	)

	# Pairwise comparison of all long docstrings (takes time...). Only the upper
	# triangle of `scores` is filled; every other entry keeps the marker -1.
	n_docstrings = len(docstrings)
	scores = -np.ones((n_docstrings, n_docstrings), dtype=float)
	pbar = pbar.start()
	for i, (p1, d1) in enumerate(docstrings[:-1]):
	    pbar.widgets[-1] = p1  # Show the current function in the progress bar
	    for j, (_, d2) in enumerate(docstrings[i + 1:], i + 1):
	        dist = distance.levenshtein(d1, d2, max_dist=500)
	        if dist == -1:
	            # Presumably -1 means max_dist was exceeded (confirm against
	            # the `distance` docs); score the pair as maximally different.
	            scores[i, j] = 1.0
	        else:
	            # Normalize by the longer docstring, capped at 500 characters
	            scores[i, j] = float(dist) / min(max(len(d1), len(d2)), 500)
	        pbar.update(pbar.currval + 1)
	pbar.finish()

	# Find candidate duplicate docstrings: pairs that received a score (>= 0)
	# lower than 0.25
	rows, cols = np.nonzero((scores >= 0) & (scores < 0.25))
	similar_docstrings = list(zip(rows, cols))

	# Print the score and both prefixes of every candidate pair
	for first, second in similar_docstrings:
	    print(scores[first, second], docstrings[first][0], docstrings[second][0])