Skip to content

Instantly share code, notes, and snippets.

@bjorndown
Created December 2, 2013 07:22
Show Gist options
  • Save bjorndown/7746200 to your computer and use it in GitHub Desktop.
Save bjorndown/7746200 to your computer and use it in GitHub Desktop.
Recursively scans a directory for text files and finds those files not mentioned by name in any file within this tree.
import os
import re
import sys
class ReferenceCollector:
    """Find files in a directory tree that no scanned file mentions by name.

    Every file found while walking the tree is registered under its base
    name, and its content is searched with ``pattern`` for references to
    other files. Files whose reference count stays at zero are reported by
    :meth:`get_unreferenced_files`.

    Entries are keyed by base name only, so two files with the same name in
    different directories share a single entry.
    """

    def __init__(self, pattern, ignored_files):
        """Create a collector.

        pattern -- regular expression (string or compiled) matching a file
            reference. If it defines groups, group 1 is taken as the
            referenced path; otherwise the whole match is used.
        ignored_files -- collection of base names that are never registered
            or counted.
        """
        # Maps base name -> {"references": <int>, "path": <directory>}.
        self.files = {}
        self.ignored_files = ignored_files
        self.PATTERN = pattern

    def run_scan(self, dir):
        """Scan given directory for unreferenced files."""
        for root, _dirs, files in os.walk(os.path.normpath(dir)):
            for filename in files:
                self.register(root, filename)
                self.scan_content_for_references(root, filename)

    def register(self, path, filename):
        """Initially register file if unknown.

        Ignored file names and names that are already known are left alone.
        """
        if filename not in self.ignored_files and filename not in self.files:
            self.files[filename] = {"references": 0, "path": path}

    def scan_content_for_references(self, path, filename):
        """Scan content of given file for references to other files.

        Each reference increments the counter of the referenced base name.
        Names not seen before are registered with the directory of the
        referencing file.
        """
        content = self._read_file(path, filename)
        for referenced_file in self._extract_file_references(content):
            # register() never creates an entry for an ignored name, so
            # counting a reference to one would raise KeyError below.
            if referenced_file in self.ignored_files:
                continue
            if referenced_file not in self.files:
                self.register(path, referenced_file)
            self.files[referenced_file]["references"] += 1

    def _extract_file_references(self, content):
        """Return the base names of all file references found in content."""
        references = []
        # finditer rather than findall: findall's result shape depends on
        # how many groups the pattern has (tuples vs. plain strings), which
        # made indexing with [0] return a single character for patterns
        # with fewer than two groups.
        for match in re.finditer(self.PATTERN, content):
            if match.re.groups:
                reference = match.group(1)
            else:
                reference = match.group(0)
            # An empty or non-participating group cannot name a file.
            if reference:
                references.append(reference)
        return self._strip_path(references)

    def _read_file(self, path, filename):
        """Return the full text content of the file ``filename`` in ``path``."""
        with open(os.path.join(path, filename), "r") as handle:
            return handle.read()

    def _strip_path(self, files):
        """Return the base name of every path in ``files``."""
        return [os.path.basename(name) for name in files]

    def get_unreferenced_files(self):
        """Return unreferenced files with fully qualified path.

        The result is a sorted list of ``path/filename`` strings for every
        entry whose reference count is zero.
        """
        return sorted(
            os.path.join(data["path"], filename)
            for filename, data in self.files.items()
            if data["references"] == 0
        )

    def get_number_of_scanned_files(self):
        """Return the number of tracked file names.

        This counts every entry in ``self.files``, which includes names that
        were only seen as references inside other files.
        """
        return len(self.files)
if __name__ == "__main__":
    # Command-line entry point: scan the directory given as the first
    # argument and print a summary followed by every unreferenced file.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: {} <directory>\n".format(sys.argv[0]))
        sys.exit(1)

    # Raw string so that \w and \. reach the regex engine untouched instead
    # of being treated as (invalid) string escape sequences.
    reference_collector = ReferenceCollector(
        pattern=r"([/\w-]+\.(xml|sql))",
        ignored_files=["ignore.xml"],
    )
    reference_collector.run_scan(sys.argv[1])

    # Compute the list once; it is needed for both the count and the listing.
    unreferenced_files = reference_collector.get_unreferenced_files()

    print("Number of scanned files: {}".format(reference_collector.get_number_of_scanned_files()))
    print("Number of unreferenced file(s): {}".format(len(unreferenced_files)))
    print("Unreferenced file(s):")
    for unreferenced_file in unreferenced_files:
        print(unreferenced_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment