Skip to content

Instantly share code, notes, and snippets.

@glenn-jocher
Last active October 8, 2023 13:14
Show Gist options
  • Save glenn-jocher/f0452b55364e6a83d24e80efd96c5b94 to your computer and use it in GitHub Desktop.
Save glenn-jocher/f0452b55364e6a83d24e80efd96c5b94 to your computer and use it in GitHub Desktop.
Analyze Python Files
"""
Python Files Analyzer
This script analyzes Python files in a given GitHub repo, excluding specified sub-directories.
It counts and reports the following for each file:
- Total number of characters
- Total number of words
- Total number of lines
- Total number of functions (based on the 'def' keyword)
- Total number of classes (based on the 'class' keyword)
Results are printed to the console. Additionally, histograms for characters, words, and lines
are plotted and saved as a high-resolution PNG image.
Usage:
Run the script directly. By default, it clones the repository given by 'repo_url' into a local
'repos/' directory (the clone is skipped if that folder already exists) and analyzes all Python
files in the cloned repository and its sub-directories, excluding any paths containing one of the
patterns in the 'exclude_patterns' list (e.g., "/venv" and "/runs").
Required Libraries:
- collections
- pathlib
- matplotlib
- numpy
- tqdm
- re
- subprocess (used to run 'git clone', so the 'git' executable must be available)
"""
import re
import subprocess
from collections import defaultdict
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
def clone_github_repo(repo_url: str, target_dir: Path = Path("repos")) -> Path:
    """
    Clone a GitHub repository into a target directory, reusing an existing clone if present.

    Args:
        repo_url (str): URL of the GitHub repository. A trailing '/' and a trailing '.git' are
            ignored when deriving the local folder name.
        target_dir (Path): Directory under which the repository folder is created.

    Returns:
        Path: Path to the cloned (or already existing) repository, i.e. target_dir / <repo name>.

    Raises:
        subprocess.CalledProcessError: If 'git clone' exits with a non-zero status.
    """
    target_dir.mkdir(parents=True, exist_ok=True)

    # Use the last URL segment as the folder name. Only a trailing '.git' is stripped:
    # removing every '.git' occurrence would mangle names such as 'user.github.io'.
    repo_name = repo_url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    repo_path = target_dir / repo_name

    if repo_path.exists():
        print(f"{repo_path} already exists. Skipping clone and using existing data.")
    else:
        print(f"Cloning {repo_url} into {repo_path}...")
        # List-form arguments (no shell) so the URL is passed to git verbatim.
        subprocess.run(["git", "clone", repo_url, str(repo_path)], check=True)
    return repo_path
def analyze_python_files(directory_path: Path, exclude_patterns: list = None) -> dict:
    """
    Analyze Python files in the directory: count characters, words, lines, functions, and classes.

    Files are discovered recursively with the '*.py' glob. A file is skipped when any exclude
    pattern occurs as a substring of its forward-slash path. Functions and classes are counted
    with line-anchored regular expressions, so matching lines inside strings or docstrings are
    counted too.

    Args:
        directory_path (Path): Directory to analyze.
        exclude_patterns (list): List of directory patterns to exclude. Defaults to no exclusions.

    Returns:
        dict: Results with file paths as keys and (chars, words, lines, functions, classes)
            tuples as values.
    """
    # None instead of a mutable [] default, which would be shared across calls.
    patterns = exclude_patterns or []

    # Also count 'async def' definitions, which the plain 'def' anchor would miss.
    func_pattern = re.compile(r'^\s*(?:async\s+)?def\s+\w+\s*\(', re.MULTILINE)
    class_pattern = re.compile(r'^\s*class\s+\w+', re.MULTILINE)

    # as_posix() keeps patterns such as "/venv" working on Windows, where str(path) uses
    # backslashes. is_file() skips directories whose name happens to end in '.py'.
    valid_files = [
        f for f in directory_path.rglob('*.py')
        if f.is_file() and not any(pattern in f.as_posix() for pattern in patterns)
    ]

    results = defaultdict(tuple)
    for py_file in tqdm(valid_files, desc="Analyzing"):
        # errors='replace' so a file with invalid UTF-8 bytes is still counted instead of raising.
        with py_file.open(encoding='utf-8', errors='replace') as f:
            content = f.read()

        chars = len(content)
        words = len(content.split())
        # Count newline-terminated lines plus a final unterminated line, if any. Splitting on
        # '\n' would report one line too many for files ending in a newline and 1 for empty files.
        lines = content.count('\n')
        if content and not content.endswith('\n'):
            lines += 1
        functions = len(func_pattern.findall(content))
        classes = len(class_pattern.findall(content))
        results[py_file] = (chars, words, lines, functions, classes)
    return results
def plot_histogram(data: dict) -> None:
    """
    Plot histograms for characters, words, and lines on a single figure with summary stats.

    The figure is saved to 'python_files_statistics.png' at 300 dpi and then shown. When 'data'
    is empty there is nothing to plot: a message is printed and no figure is created.

    Args:
        data (dict): Dictionary with filenames as keys and stats tuples as values. Only the first
            three entries of each tuple (chars, words, lines) are read.
    """
    if not data:
        # np.min / np.max raise ValueError on an empty sequence, so bail out before plotting.
        print("No files to plot. Skipping histograms.")
        return

    # The position of each metric here matches its index in the stats tuples.
    metrics = ['chars', 'words', 'lines']
    titles = ['Characters', 'Words', 'Lines']

    # Create a figure and a 1x3 grid of subplots
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    for idx, (metric, title) in enumerate(zip(metrics, titles)):
        values = [stats[idx] for stats in data.values()]
        min_val, mean_val, max_val = np.min(values), np.mean(values), np.max(values)

        ax = axes[idx]
        ax.hist(values, bins=30, edgecolor='black')
        ax.set_yscale('log')  # log-scaled counts, as stated in the y-axis label
        ax.set_title(f"{title}\nMin: {min_val}, Mean: {round(mean_val, 2)}, Max: {max_val}")
        ax.set_xlabel(metric)
        ax.set_ylabel('Number of files (log scale)')

    plt.tight_layout()
    plt.savefig('python_files_statistics.png', dpi=300)
    plt.show()
if __name__ == '__main__':
    # Script entry point: clone the repo, analyze its Python files, print totals, plot histograms.
    repo_url = 'https://github.com/ultralytics/ultralytics'

    # Clone the GitHub repo into 'repos/' directory
    repo_path = clone_github_repo(repo_url)

    print(f'Analyzing Python files in {repo_path}')
    results = analyze_python_files(repo_path, ["/venv", "/runs"])

    # Each value is a (chars, words, lines, functions, classes) tuple; sum each column.
    per_file_stats = list(results.values())
    total_files = len(results)
    total_chars = sum(v[0] for v in per_file_stats)
    total_words = sum(v[1] for v in per_file_stats)
    total_lines = sum(v[2] for v in per_file_stats)
    total_functions = sum(v[3] for v in per_file_stats)
    total_classes = sum(v[4] for v in per_file_stats)

    # Every field is comma-separated, including the break between the two string fragments.
    print(f"Files: {total_files}, Lines: {total_lines}, Chars: {total_chars}, Words: {total_words}, "
          f"Functions: {total_functions}, Classes: {total_classes}")

    plot_histogram(results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment