glenn-jocher/analyze_python_files.py

## analyze_python_files.py
"""
Python Files Analyzer

This script analyzes Python files in a given GitHub repo, excluding specified sub-directories.
It counts and reports the following for each file:
- Total number of characters
- Total number of words
- Total number of lines
- Total number of functions (based on the 'def' keyword)
- Total number of classes (based on the 'class' keyword)

Results are printed to the console. Additionally, histograms for characters, words, and lines
are plotted and saved as a high-resolution PNG image.

Usage:
Run the script directly. By default, it clones the repository set in the '__main__' block into
'repos/' (an existing clone is reused) and analyzes all Python files in it and its sub-directories,
excluding any paths containing a pattern from the 'exclude_patterns' list (e.g., "/venv" and "/runs").

Required Libraries:
- collections
- pathlib
- subprocess
- matplotlib
- numpy
- tqdm
- re
"""


import re
import subprocess
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm


def clone_github_repo(repo_url: str, target_dir: Path = Path("repos")) -> Path:
    """
    Clone a GitHub repository into a target directory, reusing an existing clone if present.

    The local folder name is the last path component of the URL, with trailing slashes and a
    single trailing '.git' suffix removed.

    Args:
        repo_url (str): URL of the GitHub repository.
        target_dir (Path): Directory where the repo should be cloned. Created if missing.

    Returns:
        Path: Path to the cloned (or already existing) repository.

    Raises:
        subprocess.CalledProcessError: If 'git clone' exits with a non-zero status.
    """
    # exist_ok makes this a no-op when the directory is already there
    target_dir.mkdir(parents=True, exist_ok=True)

    # Extract repo name from the URL to use as a folder name. Only a trailing '.git' is removed:
    # replacing every '.git' occurrence would corrupt names such as 'user.github.io'.
    repo_name = repo_url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    repo_path = target_dir / repo_name

    if repo_path.exists():
        print(f"{repo_path} already exists. Skipping clone and using existing data.")
    else:
        print(f"Cloning {repo_url} into {repo_path}...")
        subprocess.run(["git", "clone", repo_url, str(repo_path)], check=True)

    return repo_path


def analyze_python_files(directory_path: Path, exclude_patterns: list = None) -> dict:
    """
    Analyze Python files in the directory: count characters, words, lines, functions, and classes.

    Functions and classes are counted with line-anchored regular expressions, so 'def'/'class'
    lines that appear inside strings or docstrings are counted as well.

    Args:
        directory_path (Path): Directory to search recursively for '*.py' files.
        exclude_patterns (list): Substrings; any file whose POSIX-style path contains one of them
            is skipped. Defaults to None, meaning nothing is excluded.

    Returns:
        dict: Results with file paths as keys and (chars, words, lines, functions, classes)
            tuples as values.
    """
    # 'async def' is accepted too so coroutine functions are not missed
    func_pattern = re.compile(r'^\s*(?:async\s+)?def\s+\w+\s*\(', re.MULTILINE)
    class_pattern = re.compile(r'^\s*class\s+\w+', re.MULTILINE)

    # None instead of a mutable [] default; normalise once so the filter below stays simple
    patterns = tuple(exclude_patterns or ())

    # as_posix() keeps '/'-style patterns such as "/venv" working on Windows as well;
    # is_file() skips directories or broken links that merely end in '.py'
    valid_files = [f for f in directory_path.rglob('*.py')
                   if f.is_file() and not any(pattern in f.as_posix() for pattern in patterns)]
    results = defaultdict(tuple)  # container type kept as in earlier versions of this function

    for py_file in tqdm(valid_files, desc="Analyzing"):
        content = py_file.read_text(encoding='utf-8', errors='replace')
        chars = len(content)
        words = len(content.split())
        # Count newline-terminated lines plus a final unterminated one, so a trailing newline
        # does not add a phantom line and an empty file counts as 0 lines
        lines = content.count('\n') + (1 if content and not content.endswith('\n') else 0)
        functions = len(func_pattern.findall(content))
        classes = len(class_pattern.findall(content))
        results[py_file] = (chars, words, lines, functions, classes)

    return results


def plot_histogram(data: dict) -> None:
    """
    Plot histograms for characters, words, and lines on a single figure with stats in the titles.

    The figure is saved as 'python_files_statistics.png' (300 dpi) in the current working
    directory and then shown. Nothing is plotted when `data` is empty.

    Args:
        data (dict): Dictionary with filenames as keys and stats tuples as values. Only the first
            three entries of each tuple (chars, words, lines) are used; extra entries are ignored.
    """
    if not data:
        # np.min/np.max raise ValueError on empty input, so there is nothing sensible to draw
        print("No files to plot. Skipping histogram.")
        return

    # (axis label, subplot title), in the same order as the leading entries of each stats tuple
    metrics = [('chars', 'Characters'), ('words', 'Words'), ('lines', 'Lines')]

    # Create a figure and a 1x3 grid of subplots
    fig, axes = plt.subplots(1, len(metrics), figsize=(18, 6))

    for idx, (metric, title) in enumerate(metrics):
        values = [v[idx] for v in data.values()]
        min_val, mean_val, max_val = np.min(values), np.mean(values), np.max(values)

        ax = axes[idx]
        ax.hist(values, bins=30, edgecolor='black')
        ax.set_yscale('log')
        ax.set_title(f"{title}\nMin: {min_val}, Mean: {round(mean_val, 2)}, Max: {max_val}")
        ax.set_xlabel(metric)
        ax.set_ylabel('Number of files (log scale)')

    plt.tight_layout()
    plt.savefig('python_files_statistics.png', dpi=300)
    plt.show()
    plt.close(fig)  # release the figure so repeated calls do not accumulate open figures


if __name__ == '__main__':
    repo_url = 'https://github.com/ultralytics/ultralytics'

    # Clone the GitHub repo into 'repos/' directory (an existing clone is reused)
    repo_path = clone_github_repo(repo_url)
    print(f'Analyzing Python files in {repo_path}')

    results = analyze_python_files(repo_path, ["/venv", "/runs"])
    total_files = len(results)

    # Each value is (chars, words, lines, functions, classes); sum column-wise in one pass.
    # With no files zip() yields nothing, so fall back to zeros.
    totals = [sum(column) for column in zip(*results.values())] or [0] * 5
    total_chars, total_words, total_lines, total_functions, total_classes = totals

    print(f"Files: {total_files}, Lines: {total_lines}, Chars: {total_chars}, Words: {total_words}, "
          f"Functions: {total_functions}, Classes: {total_classes}")

    # Histogram statistics (min/mean/max) are undefined without at least one file
    if results:
        plot_histogram(results)
    else:
        print("No Python files found. Skipping histogram.")
	"""
	Python Files Analyzer

	This script analyzes Python files in a given GitHub repo, excluding specified sub-directories.
	It counts and reports the following for each file:
	- Total number of characters
	- Total number of words
	- Total number of lines
	- Total number of functions (based on the 'def' keyword)
	- Total number of classes (based on the 'class' keyword)

	Results are printed to the console. Additionally, histograms for characters, words, and lines
	are plotted and saved as a high-resolution PNG image.

	Usage:
	Run the script directly. By default, it clones the repository set in the '__main__' block into
	'repos/' (an existing clone is reused) and analyzes all Python files in it and its sub-directories,
	excluding any paths containing a pattern from the 'exclude_patterns' list (e.g., "/venv" and "/runs").

	Required Libraries:
	- collections
	- pathlib
	- subprocess
	- matplotlib
	- numpy
	- tqdm
	- re
	"""


	import re
	import subprocess
	from collections import defaultdict
	from pathlib import Path

	import matplotlib.pyplot as plt
	import numpy as np
	from tqdm import tqdm


	def clone_github_repo(repo_url: str, target_dir: Path = Path("repos")) -> Path:
	    """
	    Clone a GitHub repository into a target directory, reusing an existing clone if present.

	    The local folder name is the last path component of the URL, with trailing slashes and a
	    single trailing '.git' suffix removed.

	    Args:
	        repo_url (str): URL of the GitHub repository.
	        target_dir (Path): Directory where the repo should be cloned. Created if missing.

	    Returns:
	        Path: Path to the cloned (or already existing) repository.

	    Raises:
	        subprocess.CalledProcessError: If 'git clone' exits with a non-zero status.
	    """
	    # exist_ok makes this a no-op when the directory is already there
	    target_dir.mkdir(parents=True, exist_ok=True)

	    # Extract repo name from the URL to use as a folder name. Only a trailing '.git' is removed:
	    # replacing every '.git' occurrence would corrupt names such as 'user.github.io'.
	    repo_name = repo_url.rstrip('/').split('/')[-1]
	    if repo_name.endswith('.git'):
	        repo_name = repo_name[:-len('.git')]
	    repo_path = target_dir / repo_name

	    if repo_path.exists():
	        print(f"{repo_path} already exists. Skipping clone and using existing data.")
	    else:
	        print(f"Cloning {repo_url} into {repo_path}...")
	        subprocess.run(["git", "clone", repo_url, str(repo_path)], check=True)

	    return repo_path


	def analyze_python_files(directory_path: Path, exclude_patterns: list = None) -> dict:
	    """
	    Analyze Python files in the directory: count characters, words, lines, functions, and classes.

	    Functions and classes are counted with line-anchored regular expressions, so 'def'/'class'
	    lines that appear inside strings or docstrings are counted as well.

	    Args:
	        directory_path (Path): Directory to search recursively for '*.py' files.
	        exclude_patterns (list): Substrings; any file whose POSIX-style path contains one of them
	            is skipped. Defaults to None, meaning nothing is excluded.

	    Returns:
	        dict: Results with file paths as keys and (chars, words, lines, functions, classes)
	            tuples as values.
	    """
	    # Quantifiers restored: without them only 'def' preceded by exactly one whitespace character
	    # and followed by exactly one whitespace before '(' would match. 'async def' is accepted too.
	    func_pattern = re.compile(r'^\s*(?:async\s+)?def\s+\w+\s*\(', re.MULTILINE)
	    class_pattern = re.compile(r'^\s*class\s+\w+', re.MULTILINE)

	    # None instead of a mutable [] default; normalise once so the filter below stays simple
	    patterns = tuple(exclude_patterns or ())

	    # as_posix() keeps '/'-style patterns such as "/venv" working on Windows as well;
	    # is_file() skips directories or broken links that merely end in '.py'
	    valid_files = [f for f in directory_path.rglob('*.py')
	                   if f.is_file() and not any(pattern in f.as_posix() for pattern in patterns)]
	    results = defaultdict(tuple)  # container type kept as in earlier versions of this function

	    for py_file in tqdm(valid_files, desc="Analyzing"):
	        content = py_file.read_text(encoding='utf-8', errors='replace')
	        chars = len(content)
	        words = len(content.split())
	        # Count newline-terminated lines plus a final unterminated one, so a trailing newline
	        # does not add a phantom line and an empty file counts as 0 lines
	        lines = content.count('\n') + (1 if content and not content.endswith('\n') else 0)
	        functions = len(func_pattern.findall(content))
	        classes = len(class_pattern.findall(content))
	        results[py_file] = (chars, words, lines, functions, classes)

	    return results


	def plot_histogram(data: dict) -> None:
	    """
	    Plot histograms for characters, words, and lines on a single figure with stats in the titles.

	    The figure is saved as 'python_files_statistics.png' (300 dpi) in the current working
	    directory and then shown. Nothing is plotted when `data` is empty.

	    Args:
	        data (dict): Dictionary with filenames as keys and stats tuples as values. Only the first
	            three entries of each tuple (chars, words, lines) are used; extra entries are ignored.
	    """
	    if not data:
	        # np.min/np.max raise ValueError on empty input, so there is nothing sensible to draw
	        print("No files to plot. Skipping histogram.")
	        return

	    # (axis label, subplot title), in the same order as the leading entries of each stats tuple
	    metrics = [('chars', 'Characters'), ('words', 'Words'), ('lines', 'Lines')]

	    # Create a figure and a 1x3 grid of subplots
	    fig, axes = plt.subplots(1, len(metrics), figsize=(18, 6))

	    for idx, (metric, title) in enumerate(metrics):
	        values = [v[idx] for v in data.values()]
	        min_val, mean_val, max_val = np.min(values), np.mean(values), np.max(values)

	        ax = axes[idx]
	        ax.hist(values, bins=30, edgecolor='black')
	        ax.set_yscale('log')
	        ax.set_title(f"{title}\nMin: {min_val}, Mean: {round(mean_val, 2)}, Max: {max_val}")
	        ax.set_xlabel(metric)
	        ax.set_ylabel('Number of files (log scale)')

	    plt.tight_layout()
	    plt.savefig('python_files_statistics.png', dpi=300)
	    plt.show()
	    plt.close(fig)  # release the figure so repeated calls do not accumulate open figures


	if __name__ == '__main__':
	    repo_url = 'https://github.com/ultralytics/ultralytics'

	    # Clone the GitHub repo into 'repos/' directory (an existing clone is reused)
	    repo_path = clone_github_repo(repo_url)
	    print(f'Analyzing Python files in {repo_path}')

	    results = analyze_python_files(repo_path, ["/venv", "/runs"])
	    total_files = len(results)

	    # Each value is (chars, words, lines, functions, classes); sum column-wise in one pass.
	    # With no files zip() yields nothing, so fall back to zeros.
	    totals = [sum(column) for column in zip(*results.values())] or [0] * 5
	    total_chars, total_words, total_lines, total_functions, total_classes = totals

	    print(f"Files: {total_files}, Lines: {total_lines}, Chars: {total_chars}, Words: {total_words}, "
	          f"Functions: {total_functions}, Classes: {total_classes}")

	    # Histogram statistics (min/mean/max) are undefined without at least one file
	    if results:
	        plot_histogram(results)
	    else:
	        print("No Python files found. Skipping histogram.")