jnmclarty/dry_checker.py

## dry_checker.py
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 06 07:29:38 2015

@author: Jeffrey
"""

"""
Python code clone detector,
using Abstract Syntax Trees.
"""

import ast
import collections

__author__  = 'FBV'
__license__ = 'MIT'
__version__ = '0.0.2'
__status__  = 'alpha'

class Position(ast.NodeVisitor):
    '''
    Find a clone position in the code (its line-span).
    Count child nodes.
    '''
    def init(self, clone):
        '''
        Lazy initialization.
        '''
        self.begin_line = self.end_line = clone.node.lineno
        self.node_count = 0
        self.generic_visit(clone.node)
    def visit(self, node):
        '''
        Find node's line and column span.
        '''
        if hasattr(node, 'lineno'):
            self.begin_line = min(self.begin_line, node.lineno)
            self.end_line = max(self.end_line, node.lineno)
            self.node_count += 1
            self.generic_visit(node)

class Clone(collections.namedtuple('Clone', 'node file position')):
    '''
    A set of code.
    '''
    def source(self, indent=''):
        '''
        Retrieve original source code.
        '''
        if not hasattr(self.position, 'begin_line'):
            self.position.init(self)
        lines = self.file.source[
            self.position.begin_line-1:self.position.end_line]
        return (self.position.begin_line, self.position.end_line,
                '\n'.join(indent + line.rstrip() for line in lines))

class Clones(list):
    '''
    A list of identical code snippets.
    '''
    def score(self):
        '''
        Provide a score for ordering clones while reporting.
        This sorts by number of nodes in the subtree, number
        of clones of the node, and code size.
        '''
        candidate = self[0] # Pick the first clone.
        size = len(candidate.source()[-1])
        return (candidate.position.node_count, len(self), size)

class File:
    '''
    A source file.
    '''
    def __init__(self, name, source):
        '''
        Create a file with name and source-code.
        '''
        self.name = name
        self.source = source

def digest(node):
    '''
    Return an unambiguous string representation of a sub-tree in node.
    Emulates ast.dump(node, False).
    '''
    if isinstance(node, ast.AST):
        if not hasattr(node, '_cached'):
            node._cached = '%s(%s)' % (node.__class__.__name__, ', '.join(
                digest(b) for a, b in ast.iter_fields(node)))
        return node._cached
    elif isinstance(node, list):
        return '[%s]' % ', '.join(digest(x) for x in node)
    return repr(node)

class Index(ast.NodeVisitor):
    '''
    A source code repository.
    '''
    def __init__(self, exclude):
        '''
        Create a new file indexer.
        '''
        self.nodes = collections.defaultdict(Clones)
        self.blacklist = frozenset(exclude)
    def add(self, file):
        '''
        Add a file to the index and parse it.
        '''
        source = open(file).readlines()
        tree = ast.parse(''.join(source))
        self._file = File(file, source)
        self.generic_visit(tree)
    def visit(self, node):
        '''
        Walk the Abstract Syntax Tree of a file.
        Convert each sub-tree to a string, which is used
        as a key in the clones dictionnary.
        '''
        if hasattr(node, 'lineno'):
            if node.__class__.__name__ not in self.blacklist:
                expr = digest(node)
                self.nodes[expr].append(
                    Clone(node, self._file, Position()))
        self.generic_visit(node)
    def clones(self):
        '''
        Returns a list of duplicate constructs.
        '''
        return sorted(((expr, nodes)
            for expr, nodes in self.nodes.items() if len(nodes)>1),
                key=lambda n: n[1].score(), reverse=True)

def source_obj(s):
    c = s.strip(" ").strip("\t")
    if c[:3] == 'def' and 'self' in c:
        return "method"
    elif c[:3] == 'def':
        return "function"
    elif c[:2] == 'if' or c[:4] == 'elif':
        return "logic"
    elif c[:3] == 'try':
        return "error handling"
    else:
        return "other code"

all_files = r"C:\Users\Jeffrey\Desktop\qi-commons\qi_commons\allfiles.out"

def make_aggregate_file(flist):
    with open(all_files, 'w') as outfile:
        for fname in flist:
            with open(fname) as infile:
                outfile.write(infile.read())
                outfile.write("\n")

if __name__ == '__main__':
    import argparse
    import itertools
    import os

    # Parse command-line arguments.
    args = argparse.ArgumentParser(description=__doc__)
    #args.add_argument('files',
    #    metavar='FILE', nargs='+',
    #    help='set of Python files to check for duplicate code')
    args.add_argument('--ignore', '-i',
        metavar='NODE', nargs='+', default=['Name'],
        help='skip some syntactic constructs (default: Name)')
    args.add_argument('--min', '-m',
        metavar='N', type=int, default=1,
        help='report items duplicated at least N times')
    args.add_argument('--version', '-v',
        action='version', version='%(prog)s ' + __version__)
    input = args.parse_args()

    files_search = []
    for root, subFolders, files in os.walk(r'C:\Users\Jeffrey\Desktop\qi-commons\qi_commons'):
        for fil in files:
            files_search.append(os.path.join(root, fil))

    files_search = [f for f in files_search if all((c not in f for c in ('test', 'spyder','yaml','pyc','xlsx', 'allfiles.out')))]

    #make_aggregate_file(files_search)

    #files_search.append(all_files)

    # Process files.
    sources = Index(input.ignore)
    for f in files_search:
        sources.add(f)

    results = []

    # Report clones.
    for expr, clones in sources.clones():
        if len(clones) >= input.min:
            print("+%d repetitions of: %s ->" %
                    (len(clones), expr))
            for file, group in itertools.groupby(clones,
                    lambda clone: clone.file.name):
                fname = file.split("qi_commons")[-1]
                print("  - in %s" % fname)
                for clone in group:
                    begin, end, source = clone.source(' '*8)
                    if begin == end:
                        line = 'line %d' % begin
                    else:
                        line = 'lines %d to %d' % (begin, end)
                    print('      %s:\n%s' % (line, source))
                    results.append((len(clones), expr, fname, begin, end, end - begin, len(source), source_obj(source), source))

    import pandas as pd
    df = pd.DataFrame(results, columns=['cnt', 'expression', 'file', 'begin', 'end', 'length', 'char length', 'object', 'source'])
    df.to_excel("C:\Users\Jeffrey\Desktop\qi-commons\qi_commons\dry.xlsx")
	# -- coding: utf-8 --
	"""
	Created on Sun Sep 06 07:29:38 2015

	@author: Jeffrey
	"""

	"""
	Python code clone detector,
	using Abstract Syntax Trees.
	"""

	import ast
	import collections

	__author__ = 'FBV'
	__license__ = 'MIT'
	__version__ = '0.0.2'
	__status__ = 'alpha'

	class Position(ast.NodeVisitor):
	'''
	Find a clone position in the code (its line-span).
	Count child nodes.
	'''
	def init(self, clone):
	'''
	Lazy initialization.
	'''
	self.begin_line = self.end_line = clone.node.lineno
	self.node_count = 0
	self.generic_visit(clone.node)
	def visit(self, node):
	'''
	Find node's line and column span.
	'''
	if hasattr(node, 'lineno'):
	self.begin_line = min(self.begin_line, node.lineno)
	self.end_line = max(self.end_line, node.lineno)
	self.node_count += 1
	self.generic_visit(node)

	class Clone(collections.namedtuple('Clone', 'node file position')):
	'''
	A set of code.
	'''
	def source(self, indent=''):
	'''
	Retrieve original source code.
	'''
	if not hasattr(self.position, 'begin_line'):
	self.position.init(self)
	lines = self.file.source[
	self.position.begin_line-1:self.position.end_line]
	return (self.position.begin_line, self.position.end_line,
	'\n'.join(indent + line.rstrip() for line in lines))

	class Clones(list):
	'''
	A list of identical code snippets.
	'''
	def score(self):
	'''
	Provide a score for ordering clones while reporting.
	This sorts by number of nodes in the subtree, number
	of clones of the node, and code size.
	'''
	candidate = self[0] # Pick the first clone.
	size = len(candidate.source()[-1])
	return (candidate.position.node_count, len(self), size)

	class File:
	'''
	A source file.
	'''
	def __init__(self, name, source):
	'''
	Create a file with name and source-code.
	'''
	self.name = name
	self.source = source

	def digest(node):
	'''
	Return an unambiguous string representation of a sub-tree in node.
	Emulates ast.dump(node, False).
	'''
	if isinstance(node, ast.AST):
	if not hasattr(node, '_cached'):
	node._cached = '%s(%s)' % (node.__class__.__name__, ', '.join(
	digest(b) for a, b in ast.iter_fields(node)))
	return node._cached
	elif isinstance(node, list):
	return '[%s]' % ', '.join(digest(x) for x in node)
	return repr(node)

	class Index(ast.NodeVisitor):
	'''
	A source code repository.
	'''
	def __init__(self, exclude):
	'''
	Create a new file indexer.
	'''
	self.nodes = collections.defaultdict(Clones)
	self.blacklist = frozenset(exclude)
	def add(self, file):
	'''
	Add a file to the index and parse it.
	'''
	source = open(file).readlines()
	tree = ast.parse(''.join(source))
	self._file = File(file, source)
	self.generic_visit(tree)
	def visit(self, node):
	'''
	Walk the Abstract Syntax Tree of a file.
	Convert each sub-tree to a string, which is used
	as a key in the clones dictionnary.
	'''
	if hasattr(node, 'lineno'):
	if node.__class__.__name__ not in self.blacklist:
	expr = digest(node)
	self.nodes[expr].append(
	Clone(node, self._file, Position()))
	self.generic_visit(node)
	def clones(self):
	'''
	Returns a list of duplicate constructs.
	'''
	return sorted(((expr, nodes)
	for expr, nodes in self.nodes.items() if len(nodes)>1),
	key=lambda n: n[1].score(), reverse=True)

	def source_obj(s):
	c = s.strip(" ").strip("\t")
	if c[:3] == 'def' and 'self' in c:
	return "method"
	elif c[:3] == 'def':
	return "function"
	elif c[:2] == 'if' or c[:4] == 'elif':
	return "logic"
	elif c[:3] == 'try':
	return "error handling"
	else:
	return "other code"

	all_files = r"C:\Users\Jeffrey\Desktop\qi-commons\qi_commons\allfiles.out"

	def make_aggregate_file(flist):
	with open(all_files, 'w') as outfile:
	for fname in flist:
	with open(fname) as infile:
	outfile.write(infile.read())
	outfile.write("\n")

	if __name__ == '__main__':
	import argparse
	import itertools
	import os

	# Parse command-line arguments.
	args = argparse.ArgumentParser(description=__doc__)
	#args.add_argument('files',
	# metavar='FILE', nargs='+',
	# help='set of Python files to check for duplicate code')
	args.add_argument('--ignore', '-i',
	metavar='NODE', nargs='+', default=['Name'],
	help='skip some syntactic constructs (default: Name)')
	args.add_argument('--min', '-m',
	metavar='N', type=int, default=1,
	help='report items duplicated at least N times')
	args.add_argument('--version', '-v',
	action='version', version='%(prog)s ' + __version__)
	input = args.parse_args()

	files_search = []
	for root, subFolders, files in os.walk(r'C:\Users\Jeffrey\Desktop\qi-commons\qi_commons'):
	for fil in files:
	files_search.append(os.path.join(root, fil))

	files_search = [f for f in files_search if all((c not in f for c in ('test', 'spyder','yaml','pyc','xlsx', 'allfiles.out')))]

	#make_aggregate_file(files_search)

	#files_search.append(all_files)

	# Process files.
	sources = Index(input.ignore)
	for f in files_search:
	sources.add(f)

	results = []

	# Report clones.
	for expr, clones in sources.clones():
	if len(clones) >= input.min:
	print("+%d repetitions of: %s ->" %
	(len(clones), expr))
	for file, group in itertools.groupby(clones,
	lambda clone: clone.file.name):
	fname = file.split("qi_commons")[-1]
	print(" - in %s" % fname)
	for clone in group:
	begin, end, source = clone.source(' '*8)
	if begin == end:
	line = 'line %d' % begin
	else:
	line = 'lines %d to %d' % (begin, end)
	print(' %s:\n%s' % (line, source))
	results.append((len(clones), expr, fname, begin, end, end - begin, len(source), source_obj(source), source))

	import pandas as pd
	df = pd.DataFrame(results, columns=['cnt', 'expression', 'file', 'begin', 'end', 'length', 'char length', 'object', 'source'])
	df.to_excel("C:\Users\Jeffrey\Desktop\qi-commons\qi_commons\dry.xlsx")