Skip to content

Instantly share code, notes, and snippets.

@amakukha
Created December 29, 2022 19:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amakukha/c83e77314daffe1e5c732bd0eb76485f to your computer and use it in GitHub Desktop.
Save amakukha/c83e77314daffe1e5c732bd0eb76485f to your computer and use it in GitHub Desktop.
code_duplication.py
#!/usr/bin/env python3
'''
Code duplication assessment tool. Runs in linear time.
Usage:
Put your packages into a single directory and run this script:
python3 code_duplication.py > report.txt
Then sort files by similarity:
cat report.txt | awk 'NF > 1' | sort -rn | less
'''
import os
from collections import Counter, defaultdict
SNIPPET_SIZE = 5  # how many lines of code constitute a "snippet"?
VERBOSE = False  # show matching snippets?
FILE_EXTENSIONS = (".java", ".ts", ".js", ".tsx", ".jsx", ".py", ".cpp", ".c")

snippets = defaultdict(list)  # snippet -> list of files that contain it


def scan_file(filename, index, snippet_size=SNIPPET_SIZE, verbose=VERBOSE):
    """Match one file's snippets against ``index`` and register them in it.

    A snippet is every run of ``snippet_size`` consecutive non-empty lines,
    each stripped of surrounding whitespace.

    Args:
        filename: path of the file to read.
        index: mapping of snippet text -> list of filenames already known to
            contain it; updated in place.
        snippet_size: number of lines per snippet.
        verbose: when true, print every match together with the snippet.

    Returns:
        ``(snippets_count, similar_files)``: the number of snippets in this
        file and a Counter of filename -> number of this file's snippets that
        the other file also contains.
    """
    # `with` closes the handle deterministically. errors="replace" keeps one
    # stray non-UTF-8 byte from aborting the whole report.
    with open(filename, encoding="utf-8", errors="replace") as source:
        # The "\n\t" separator in front of every line keeps line boundaries
        # unambiguous inside a joined snippet and indents verbose output.
        lines = ['\n\t' + text for text in map(str.strip, source) if text]

    similar_files = Counter()
    snippets_count = max(len(lines) - snippet_size + 1, 0)
    for start in range(snippets_count):
        snippet = ''.join(lines[start:start + snippet_size])
        owners = index.setdefault(snippet, [])
        for similar_filename in owners:
            if verbose:
                print('-', similar_filename)
                print(snippet)
            similar_files[similar_filename] += 1
        # Register each file at most once per snippet. Otherwise a snippet
        # repeated inside one file is counted several times by later files,
        # pushing their score above 100%. A file is scanned in one go, so its
        # own entry can only be the last one; this keeps the check O(1).
        if not owners or owners[-1] != filename:
            owners.append(filename)
    return snippets_count, similar_files


def report_rows(filename, snippets_count, similar_files):
    """Yield one tab-separated report row per other file sharing snippets.

    Row layout: ``\\t<matches>\\t<score %>\\t<similar file>\\t<this file>``.
    Matches of a file against itself are skipped.
    """
    for similar_filename, identical_count in similar_files.items():
        if similar_filename == filename:
            continue
        # TODO: make the formula symmetrical
        score = identical_count * 100 / snippets_count
        yield '\t{}\t{:.1f}\t{}\t{}'.format(
            identical_count, score, similar_filename, filename)


for root, _dirs, files in os.walk("."):
    for file in files:
        if not file.endswith(FILE_EXTENSIONS):
            continue
        filename = os.path.join(root, file)
        print()
        print(filename)
        snippets_count, similar_files = scan_file(filename, snippets)
        for row in report_rows(filename, snippets_count, similar_files):
            print(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment