Last active
November 5, 2025 12:53
-
-
Save ykla/6c3df44c371d37fc3196ddf5fa87ce5f to your computer and use it in GitHub Desktop.
统计分析 git 项目
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| fix_stats.py | |
| 按年独立统计 git 提交并递归显示目录(>=5%)与重要文件(>=10%) | |
| 本版说明: | |
| - file_count[file] = 该文件在该年出现(被修改/记录)的 commit 数(在单个 commit 内对同一文件只计一次) | |
| - dir_count[dir] = 该目录下所有文件的 file_count 之和(即“其下所有文件的提交次数”) | |
| - total_files = sum(file_count.values()),用作全局百分比的分母(与 dir_count 语义一致) | |
| - 同时输出该年实际的 commit 数(commit_count) | |
| """ | |
| import sys | |
| import subprocess | |
| import collections | |
| from typing import Dict, List, Set | |
def run_git_command(cmd: List[str]) -> str:
    """Run a command and return its captured standard output.

    Args:
        cmd: Argument vector to execute, e.g. ``["git", "log", "--all"]``.

    Returns:
        The decoded standard output of the finished process.

    Exits:
        Terminates the interpreter with status 1 (after writing a message to
        stderr) when the command returns a non-zero exit status or when the
        executable cannot be found.
    """
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"错误:git 命令执行失败: {e}", file=sys.stderr)
        # The child's stderr is captured rather than inherited, so git's own
        # diagnostic has to be forwarded explicitly or the cause is lost.
        details = (e.stderr or "").strip()
        if details:
            print(details, file=sys.stderr)
        sys.exit(1)
    except FileNotFoundError:
        print("错误:未找到 git 命令", file=sys.stderr)
        sys.exit(1)
    return result.stdout
def is_git_repo() -> bool:
    """Report whether the current working directory is inside a git work tree.

    Returns:
        ``True`` when ``git rev-parse --is-inside-work-tree`` exits with
        status 0; ``False`` when it exits non-zero or when the ``git``
        executable cannot be found.
    """
    probe = ["git", "rev-parse", "--is-inside-work-tree"]
    try:
        subprocess.run(probe, capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
    return True
def get_commit_years() -> List[str]:
    """Return the distinct years that have commits, in ascending order.

    The years are read from ``git log --all`` with the author date formatted
    as ``%Y``; lines that are empty or whitespace-only are discarded.
    """
    log_args = ["git", "log", "--all", "--pretty=format:%ad", "--date=format:%Y"]
    distinct_years = {
        entry
        for entry in run_git_command(log_args).splitlines()
        if entry.strip()
    }
    return sorted(distinct_years)
def _is_commit_marker(line: str) -> bool:
    """Return True when *line* is a ``--<hash>--`` separator from the log format."""
    digest = line[2:-2]
    return (
        len(line) > 4
        and line.startswith("--")
        and line.endswith("--")
        # Requiring a hex digest keeps a path such as "--notes--" from being
        # mistaken for a commit boundary.
        and all(ch in "0123456789abcdef" for ch in digest)
    )


def _count_files_per_commit(log_output: str) -> Dict[str, int]:
    """Count, for every path in *log_output*, the number of commits listing it.

    A path that appears several times inside one commit is counted once for
    that commit.
    """
    counts: Dict[str, int] = collections.defaultdict(int)
    commit_files: Set[str] = set()

    def flush() -> None:
        for path in commit_files:
            counts[path] += 1
        commit_files.clear()

    for raw_line in log_output.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if _is_commit_marker(line):
            # A new commit starts: account for the files of the previous one.
            flush()
        else:
            commit_files.add(line)
    # The last commit has no trailing marker, so flush it explicitly.
    flush()
    return counts


def _sum_dir_counts(file_count: Dict[str, int]) -> Dict[str, int]:
    """Add each file's count to all of its ancestor directories.

    Files without any ``/`` in their path are accumulated under ``"."``.
    For ``a/b/c.txt`` the count is added to ``a`` and ``a/b``.
    """
    totals: Dict[str, int] = collections.defaultdict(int)
    for path, hits in file_count.items():
        parts = path.split('/')
        if len(parts) == 1:
            totals["."] += hits
            continue
        for depth in range(1, len(parts)):
            totals['/'.join(parts[:depth])] += hits
    return totals


def analyze_year(year: str):
    """Print commit statistics for one calendar year.

    Args:
        year: Four-digit year as a string, e.g. ``"2024"``.

    The function prints the number of commits in the year, the total number
    of (commit, file) occurrences, a directory tree and the globally most
    frequently touched files. Nothing is returned.
    """
    print("\n========================================")
    print(f"年份: {year}")
    print("----------------------------------------")

    # A bare date such as "2024-12-31" has no time of day, and git fills the
    # missing part from the current clock, which would drop commits near the
    # year boundaries depending on when the script is run. Spell the times out.
    # NOTE(review): these bounds are presumably interpreted in the local time
    # zone and matched against the committer date — confirm against git docs.
    date_range = [
        f"--since={year}-01-01 00:00:00",
        f"--until={year}-12-31 23:59:59",
    ]

    # Exact number of commits in the year.
    count_out = run_git_command(["git", "rev-list", "--all", "--count", *date_range])
    try:
        commit_count = int(count_out.strip() or "0")
    except ValueError:
        # Best effort: an unparsable count must not abort the whole report.
        commit_count = 0

    # Per-commit file listing for the year.
    output = run_git_command([
        "git", "log", "--all",
        *date_range,
        "--pretty=format:--%H--",
        "--name-only",
    ])

    # Phase 1: how many commits touched each file.
    file_count = _count_files_per_commit(output)

    # Denominator for global percentages: total (commit, file) occurrences.
    total_files = sum(file_count.values())
    if total_files == 0:
        print("该年无有效提交(或该年提交没有记录文件名)。")
        print(f"该年 commit 数: {commit_count}")
        return

    # Direct parent directory of every file ("." for files in the root).
    file_parent: Dict[str, str] = {
        path: (path.rsplit('/', 1)[0] if '/' in path else ".")
        for path in file_count
    }

    # Phase 2: roll the per-file counts up into every ancestor directory.
    dir_count = _sum_dir_counts(file_count)

    print(f"该年 commit 数: {commit_count}")
    print(f"总提交(文件项,总计所有文件被修改的次数): {total_files}\n")
    analyze_directory_structure(total_files, dir_count, file_count, file_parent)
    analyze_global_important_files(total_files, file_count)
def analyze_directory_structure(total_files: int, dir_count: Dict[str, int],
                                file_count: Dict[str, int], file_parent: Dict[str, str]):
    """Print the directory tree, starting from the top-level directories.

    Args:
        total_files: Sum of all per-file counts; denominator for the
            top-level percentages.
        dir_count: Aggregated count per directory; ``"."`` holds the files
            that live directly in the repository root.
        file_count: Count per file path.
        file_parent: Direct parent directory of every file path.

    Only top-level entries whose share of ``total_files`` is at least 5% are
    printed; for each of them the important files and the child directories
    are printed as well.
    """
    print("目录树(递归显示,占比阈值:相对于父目录 >=5%):")

    # Top-level entries: the root bucket "." and every path without a slash.
    top_dirs = sorted(
        (
            (dir_path, count)
            for dir_path, count in dir_count.items()
            if dir_path == "." or '/' not in dir_path
        ),
        key=lambda entry: entry[1],
        reverse=True,
    )

    for dir_path, count in top_dirs:
        percentage = (count / total_files) * 100 if total_files > 0 else 0.0
        if percentage < 5:
            continue
        display_name = "(root)" if dir_path == "." else dir_path
        print(f"├─ {display_name} ({count}, {percentage:.1f}%)")
        print_important_files(dir_path, count, file_count, file_parent, "")
        # "." only counts files directly in the root, so it is a sibling of
        # the other top-level directories, not their parent. Recursing into
        # it would print every top-level directory a second time with a
        # percentage relative to the root-file count (possibly above 100%).
        if dir_path != ".":
            print_child_directories(dir_path, count, dir_count, file_count, file_parent, "│ ")
def print_child_directories(parent_dir: str, parent_count: int, dir_count: Dict[str, int],
                            file_count: Dict[str, int], file_parent: Dict[str, str], prefix: str):
    """Recursively print the direct sub-directories of ``parent_dir``.

    Args:
        parent_dir: Directory whose children are listed (``"."`` for the root).
        parent_count: Aggregated count of ``parent_dir``; denominator for the
            children's percentages.
        dir_count: Aggregated count per directory.
        file_count: Count per file path.
        file_parent: Direct parent directory of every file path.
        prefix: Text printed in front of every emitted line.

    Children are listed by descending count, and only those with a share of
    at least 5% of ``parent_count`` are printed and descended into.
    """
    at_root = parent_dir == "."
    parent_prefix = "" if at_root else f"{parent_dir}/"

    def is_direct_child(candidate: str) -> bool:
        # A directory is never its own child.
        if candidate == parent_dir:
            return False
        if at_root:
            return '/' not in candidate
        if not candidate.startswith(parent_prefix):
            return False
        # Exactly one level below the parent: no further slash in the tail.
        return '/' not in candidate[len(parent_prefix):]

    ranked_children = sorted(
        ((path, hits) for path, hits in dir_count.items() if is_direct_child(path)),
        key=lambda pair: pair[1],
        reverse=True,
    )

    nested_prefix = f"{prefix}│ "
    for child_path, count in ranked_children:
        share = (count / parent_count) * 100 if parent_count > 0 else 0.0
        if share < 5:
            continue
        display_name = child_path.rpartition('/')[2]
        print(f"{prefix}├─ {display_name} ({count}, {share:.1f}%)")
        print_important_files(child_path, count, file_count, file_parent, prefix)
        print_child_directories(child_path, count, dir_count, file_count, file_parent, nested_prefix)
def print_important_files(dir_path: str, dir_count: int, file_count: Dict[str, int],
                          file_parent: Dict[str, str], prefix: str):
    """Print the files directly inside ``dir_path`` that dominate its count.

    Args:
        dir_path: Directory whose direct files are inspected.
        dir_count: Aggregated count of that directory; denominator for the
            per-file percentages.
        file_count: Count per file path.
        file_parent: Direct parent directory of every file path.
        prefix: Text printed in front of every emitted line.

    A file is listed when its share of ``dir_count`` is at least 10%.
    Nothing is printed when no file qualifies.
    """
    highlighted = []
    for path, hits in file_count.items():
        # Only files whose direct parent is this directory are considered.
        if file_parent.get(path) != dir_path:
            continue
        share = (hits / dir_count) * 100 if dir_count > 0 else 0.0
        if share >= 10:
            highlighted.append((path.rpartition('/')[2], hits, share))

    if not highlighted:
        return

    print(f"{prefix} * 重要文件:")
    for name, hits, share in highlighted:
        print(f"{prefix} • {name} ({hits}, {share:.1f}%)")
    print(prefix)
def analyze_global_important_files(total_files: int, file_count: Dict[str, int]):
    """Print every file whose share of the yearly total is at least 1%.

    Args:
        total_files: Sum of all per-file counts; denominator for the shares.
        file_count: Count per file path.

    Rows are printed by descending share as ``percent<TAB>count<TAB>path``,
    followed by a separator line.
    """
    print("\n全局单文件集中度(相对于年总数,>=1%):")

    scored = [
        (path, hits, ((hits / total_files) * 100 if total_files > 0 else 0.0))
        for path, hits in file_count.items()
    ]
    ranked = sorted(
        (row for row in scored if row[2] >= 1),
        key=lambda row: row[2],
        reverse=True,
    )

    for path, hits, share in ranked:
        print(f"{share:6.2f}%\t{hits}\t{path}")
    print("----------------------------------------")
def main():
    """Entry point: print per-year statistics for the current repository.

    Exits with status 1 when the working directory is not inside a git work
    tree. Otherwise every year that has commits is analysed in ascending
    order.
    """
    if not is_git_repo():
        print("错误:请在 git 仓库根目录运行此脚本。", file=sys.stderr)
        sys.exit(1)

    print("Counting commits and directories per year... (this may take a while on big repos)")

    commit_years = get_commit_years()
    if commit_years:
        for commit_year in commit_years:
            analyze_year(commit_year)
        print("\nDone.")
    else:
        print("未检测到任何提交年份。")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment