Skip to content

Instantly share code, notes, and snippets.

@wmvanvliet
Last active August 6, 2016 17:45
Show Gist options
  • Save wmvanvliet/de8d322773cd79848e486d56d6590835 to your computer and use it in GitHub Desktop.
Save wmvanvliet/de8d322773cd79848e486d56d6590835 to your computer and use it in GitHub Desktop.
Script that extracts all docstrings from a Python module, saves them to a file, and then compiles a list of possible duplicates based on Levenshtein distance.
from __future__ import print_function
import os.path as op
import inspect
import distance
import numpy as np
from scipy.misc import comb
import progressbar as pb
import progressbar.widgets as pw
import mne
# Extract docstrings from the python files
docstrings = {}


def add_docstring(prefix, f):
    """Record the docstring of `f` in the module-level `docstrings` mapping.

    Parameters
    ----------
    prefix : str
        Key under which the docstring is stored.
    f : object
        Object whose docstring is fetched with ``inspect.getdoc``.

    Objects without a docstring are stored with the placeholder text 'NONE'.
    An existing entry for `prefix` is overwritten.
    """
    doc = inspect.getdoc(f)
    docstrings[prefix] = 'NONE' if doc is None else doc
def is_interesting(node):
    """Return True if `node` is a function, generator function, method,
    class or module, i.e. one of the object kinds this script inspects.
    """
    # Checked in order; any() stops at the first predicate that matches.
    predicates = (
        inspect.isfunction,
        inspect.isgeneratorfunction,
        inspect.ismethod,
        inspect.isclass,
        inspect.ismodule,
    )
    return any(check(node) for check in predicates)
# Keys ("<file>:<name>") of the nodes that parse() has already walked.
parsed_nodes = set()


def parse(name, node):
    """Recursively collect docstrings of everything defined inside mne.

    Parameters
    ----------
    name : str
        Name under which `node` was reached; used to build the visited key
        and to qualify the names of the node's methods.
    node : object
        Module or class whose members are inspected.

    Every function, generator function, method, class and module member
    whose source file lives inside the mne package directory is passed to
    ``add_docstring``; classes and modules are then walked recursively.
    Nodes that cannot be inspected, that live outside mne, or that were
    visited before are skipped silently.
    """
    try:
        nodes = inspect.getmembers(node, is_interesting)
        f = inspect.getfile(node)
    except Exception:
        # Best effort: builtins have no file and attribute access on exotic
        # members may raise anything. Unlike a bare ``except:`` this still
        # lets KeyboardInterrupt and SystemExit through.
        return

    # The trailing separator keeps sibling directories that merely share
    # the prefix (e.g. ".../mne_foo") from being treated as part of mne.
    package_dir = op.join(op.dirname(mne.__file__), '')
    if not f.startswith(package_dir):
        return

    visit_key = f + ':' + name
    if visit_key in parsed_nodes:
        return
    parsed_nodes.add(visit_key)

    parent_is_class = inspect.isclass(node)
    for name_, member in nodes:
        try:
            member_file = inspect.getfile(member)
        except Exception:
            continue
        if not member_file.startswith(package_dir):
            continue

        location = ' (' + member_file + ')'
        if inspect.isfunction(member) or inspect.isgeneratorfunction(member):
            # On Python 3 a method looked up on its class is a plain
            # function. Qualify it with the class name so that same-named
            # methods of different classes in one file do not overwrite
            # each other.
            if parent_is_class:
                label = name + '.' + name_
            else:
                label = name_
            add_docstring(label + location, member)
        elif inspect.ismethod(member):
            add_docstring(name + '.' + name_ + location, member)
        elif inspect.isclass(member) or inspect.ismodule(member):
            add_docstring(name_ + location, member)
            parse(name_, member)
parse('mne', mne)

# Turn the collected {label: docstring} mapping into (label, docstring)
# pairs ordered by label
docstrings = sorted(docstrings.items(), key=lambda item: item[0])

# Save all the extracted docstrings, one "<label> <docstring>" entry each
with open('docstrings.txt', 'w') as out:
    for label, text in docstrings:
        out.write(label + ' ' + text + '\n')

# Strip all whitespace from the docstrings and keep only those that are
# still longer than 100 characters afterwards
stripped = ((label, ''.join(text.split())) for label, text in docstrings)
docstrings = [(label, text) for label, text in stripped if len(text) > 100]
n_docstrings = len(docstrings)

# Construct a nice progress bar with one tick per pair of docstrings
progress_widgets = [
    pw.Percentage(),
    '|',
    pw.ETA(),
    '|',
    'current function',  # placeholder, replaced by the current label below
]
pbar = pb.ProgressBar(
    maxval=comb(n_docstrings, 2),
    widgets=progress_widgets,
)

# Compare all long docstrings (takes time...). Only the upper triangle is
# filled in; entries that are never compared keep the sentinel value -1.
scores = np.full((n_docstrings, n_docstrings), -1.0, dtype=float)
pbar = pbar.start()
for i, (label_a, text_a) in enumerate(docstrings[:-1]):
    pbar.widgets[-1] = label_a  # Note current function in the progress bar
    for j, (_, text_b) in enumerate(docstrings[i + 1:], i + 1):
        dist = distance.levenshtein(text_a, text_b, max_dist=500)
        if dist == -1:
            # NOTE(review): presumably -1 signals that max_dist was
            # exceeded -- confirm against the distance package docs.
            scores[i, j] = 1.0
        else:
            # Normalise by the longer docstring, capped at max_dist.
            # float() keeps this a true division on Python 2 as well.
            scores[i, j] = float(dist) / min(max(len(text_a), len(text_b)), 500)
        pbar.update(pbar.currval + 1)
pbar.finish()

# Find candidate duplicate docstrings: compared pairs (score >= 0) whose
# normalised distance is below 0.25
is_similar = np.logical_and(scores >= 0, scores < 0.25)
rows, cols = np.nonzero(is_similar)
similar_docstrings = list(zip(rows, cols))

# Print them
for first, second in similar_docstrings:
    print(scores[first, second], docstrings[first][0], docstrings[second][0])
@jasmainak
Copy link

Cool script, thanks for sharing :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment