Skip to content

Instantly share code, notes, and snippets.

@ykla
Last active November 5, 2025 12:53
Show Gist options
  • Select an option

  • Save ykla/6c3df44c371d37fc3196ddf5fa87ce5f to your computer and use it in GitHub Desktop.

Select an option

Save ykla/6c3df44c371d37fc3196ddf5fa87ce5f to your computer and use it in GitHub Desktop.
统计分析 git 项目
#!/usr/bin/env python3
"""
fix_stats.py
按年独立统计 git 提交并递归显示目录(>=5%)与重要文件(>=10%)
本版说明:
- file_count[file] = 该文件在该年出现(被修改/记录)的 commit 数(在单个 commit 内对同一文件只计一次)
- dir_count[dir] = 该目录下所有文件的 file_count 之和(即“其下所有文件的提交次数”)
- total_files = sum(file_count.values()),用作全局百分比的分母(与 dir_count 语义一致)
- 同时输出该年实际的 commit 数(commit_count)
"""
import sys
import subprocess
import collections
from typing import Dict, List, Set
def run_git_command(cmd: List[str]) -> str:
    """Run a git command and return its standard output as text.

    Args:
        cmd: Full argument vector, including the leading ``"git"``.

    Returns:
        The captured stdout of the process.

    The process is terminated with exit status 1 (via ``sys.exit``) when the
    command returns a non-zero status or the executable cannot be found.
    """
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"错误:git 命令执行失败: {e}", file=sys.stderr)
        # stderr was captured above, so git's own diagnostic would otherwise
        # be lost; relay it so the user can see why the command failed.
        detail = (e.stderr or "").strip()
        if detail:
            print(detail, file=sys.stderr)
        sys.exit(1)
    except FileNotFoundError:
        print("错误:未找到 git 命令", file=sys.stderr)
        sys.exit(1)
    return result.stdout
def is_git_repo() -> bool:
    """Report whether ``git rev-parse --is-inside-work-tree`` succeeds here.

    Returns:
        True when the command exits with status 0; False when it exits with
        an error or the git executable cannot be launched.
    """
    probe = ["git", "rev-parse", "--is-inside-work-tree"]
    try:
        subprocess.run(probe, capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
    else:
        return True
def get_commit_years() -> List[str]:
    """Return the distinct years that contain commits, in ascending order.

    The year is taken from the committer date (``%cd``) rather than the
    author date: the per-year queries filter with ``--since``/``--until``,
    which git evaluates against the committer date, so both must use the same
    timestamp or rebased/cherry-picked commits end up in the wrong (or no)
    year.

    Returns:
        Sorted list of four-digit year strings; empty when there are no commits.
    """
    output = run_git_command([
        "git", "log", "--all", "--pretty=format:%cd", "--date=format:%Y"
    ])
    # Deduplicate and drop blank lines in one pass.
    return sorted({line.strip() for line in output.splitlines() if line.strip()})
def _is_commit_marker(line: str) -> bool:
    """Return True if *line* is a ``--<hash>--`` separator emitted by the log format."""
    if not (line.startswith('--') and line.endswith('--')):
        return False
    digest = line[2:-2]
    # Require a hex digest so a file literally named e.g. "--foo--" is not
    # mistaken for a commit boundary.
    return bool(digest) and all(ch in "0123456789abcdef" for ch in digest)


def _count_files_per_commit(log_output: str) -> Dict[str, int]:
    """Count, per path, how many commits in *log_output* list that path (once per commit)."""
    counts: Dict[str, int] = collections.defaultdict(int)
    seen_in_commit: Set[str] = set()
    for raw in log_output.splitlines():
        line = raw.strip()
        if not line:
            continue
        if _is_commit_marker(line):
            # New commit starts: reset the per-commit de-duplication set.
            seen_in_commit = set()
            continue
        if line not in seen_in_commit:
            seen_in_commit.add(line)
            counts[line] += 1
    return counts


def _aggregate_dir_counts(file_count: Dict[str, int]) -> Dict[str, int]:
    """Sum each file's count into every ancestor directory ("." for root-level files)."""
    totals: Dict[str, int] = collections.defaultdict(int)
    for path, hits in file_count.items():
        parts = path.split('/')
        if len(parts) == 1:
            totals["."] += hits
            continue
        # a/b/c.txt contributes to "a" and "a/b".
        for depth in range(1, len(parts)):
            totals['/'.join(parts[:depth])] += hits
    return totals


def analyze_year(year: str):
    """Print commit statistics for one calendar year.

    Collects, for every file touched in *year*, the number of commits that
    list it, aggregates those counts into all ancestor directories, and then
    prints the directory tree and the globally most-touched files.

    Args:
        year: Four-digit year string, e.g. ``"2023"``.
    """
    print("\n========================================")
    print(f"年份: {year}")
    print("----------------------------------------")

    # Give explicit times of day: with a bare date git fills the missing
    # time with the *current* time, which would drop commits made early on
    # Jan 1 or late on Dec 31.
    date_range = [
        f"--since={year}-01-01 00:00:00",
        f"--until={year}-12-31 23:59:59",
    ]

    # Exact number of commits in the year (best effort: falls back to 0).
    try:
        commit_count_out = run_git_command(
            ["git", "rev-list", "--all", "--count", *date_range]
        )
        commit_count = int(commit_count_out.strip() or "0")
    except (ValueError, OSError):
        commit_count = 0

    # Per-commit file listing, each commit introduced by a --<hash>-- line.
    output = run_git_command([
        "git", "log", "--all",
        *date_range,
        "--pretty=format:--%H--",
        "--name-only"
    ])

    # Phase 1: per-file commit counts.
    file_count = _count_files_per_commit(output) if output.strip() else {}

    # total_files: sum of all file occurrences, the denominator for global percentages.
    total_files = sum(file_count.values())
    if total_files == 0:
        print("该年无有效提交(或该年提交没有记录文件名)。")
        print(f"该年 commit 数: {commit_count}")
        return

    # Direct parent directory of every file ("." for root-level files),
    # used to attribute important files to a directory.
    file_parent: Dict[str, str] = {}
    for path in file_count:
        head, sep, _ = path.rpartition('/')
        file_parent[path] = head if sep else "."

    # Phase 2: roll the file counts up into every ancestor directory.
    dir_count = _aggregate_dir_counts(file_count)

    print(f"该年 commit 数: {commit_count}")
    print(f"总提交(文件项,总计所有文件被修改的次数): {total_files}\n")
    analyze_directory_structure(total_files, dir_count, file_count, file_parent)
    analyze_global_important_files(total_files, file_count)
def analyze_directory_structure(total_files: int, dir_count: Dict[str, int],
                                file_count: Dict[str, int], file_parent: Dict[str, str]):
    """Print the directory tree, starting from the top-level entries.

    Args:
        total_files: Sum of all per-file counts; denominator for top-level percentages.
        dir_count: Aggregated count per directory ("." holds root-level files only).
        file_count: Per-file commit counts.
        file_parent: Direct parent directory of each file.

    A top-level entry is shown when it accounts for at least 5% of
    *total_files*; its important files and sub-directories follow it.
    """
    print("目录树(递归显示,占比阈值:相对于父目录 >=5%):")

    # Top-level entries are the paths without a slash; that includes ".".
    top_dirs = sorted(
        ((path, total) for path, total in dir_count.items() if '/' not in path),
        key=lambda item: item[1],
        reverse=True,
    )

    for dir_path, count in top_dirs:
        percentage = (count / total_files) * 100 if total_files > 0 else 0.0
        if percentage < 5:
            continue
        display_name = "(root)" if dir_path == "." else dir_path
        print(f"├─ {display_name} ({count}, {percentage:.1f}%)")
        print_important_files(dir_path, count, file_count, file_parent, "")
        # "." only aggregates root-level files. Recursing into it would list
        # every top-level directory a second time, with percentages computed
        # against the root-file count (which can exceed 100%), so skip it.
        if dir_path != ".":
            print_child_directories(dir_path, count, dir_count, file_count, file_parent, "│ ")
def print_child_directories(parent_dir: str, parent_count: int, dir_count: Dict[str, int],
                            file_count: Dict[str, int], file_parent: Dict[str, str], prefix: str):
    """Recursively print the direct sub-directories of *parent_dir*.

    Args:
        parent_dir: Directory whose children are listed ("." means the root).
        parent_count: Aggregated count of *parent_dir*; denominator for percentages.
        dir_count: Aggregated count per directory.
        file_count: Per-file commit counts.
        file_parent: Direct parent directory of each file.
        prefix: Text printed before every line at this depth.

    A child is shown when it reaches at least 5% of *parent_count*; its
    important files and its own children are printed beneath it.
    """
    def is_direct_child(candidate: str) -> bool:
        if candidate == parent_dir:
            return False
        if parent_dir == ".":
            # For the root, every slash-free path is a direct child.
            return '/' not in candidate
        lead = f"{parent_dir}/"
        return candidate.startswith(lead) and '/' not in candidate[len(lead):]

    ranked = sorted(
        [(path, total) for path, total in dir_count.items() if is_direct_child(path)],
        key=lambda pair: pair[1],
        reverse=True,
    )

    for child_path, child_total in ranked:
        if parent_count > 0:
            share = (child_total / parent_count) * 100
        else:
            share = 0.0
        if share >= 5:
            label = child_path.rsplit('/', 1)[-1]
            print(f"{prefix}├─ {label} ({child_total}, {share:.1f}%)")
            print_important_files(child_path, child_total, file_count, file_parent, prefix)
            print_child_directories(child_path, child_total, dir_count, file_count,
                                    file_parent, f"{prefix}│ ")
def print_important_files(dir_path: str, dir_count: int, file_count: Dict[str, int],
                          file_parent: Dict[str, str], prefix: str):
    """Print the files directly inside *dir_path* that reach 10% of its count.

    Args:
        dir_path: Directory whose direct files are examined.
        dir_count: Aggregated count of the directory; denominator for percentages.
        file_count: Per-file commit counts.
        file_parent: Direct parent directory of each file.
        prefix: Text printed before every emitted line.

    Nothing is printed when no file qualifies.
    """
    notable = []
    for path, hits in file_count.items():
        if file_parent.get(path) != dir_path:
            continue
        share = (hits / dir_count) * 100 if dir_count > 0 else 0.0
        if share >= 10:
            # Only the base name is shown; the directory is given by the tree.
            notable.append((path.rsplit('/', 1)[-1], hits, share))

    if not notable:
        return

    print(f"{prefix} * 重要文件:")
    for name, hits, share in notable:
        print(f"{prefix} • {name} ({hits}, {share:.1f}%)")
    print(prefix)
def analyze_global_important_files(total_files: int, file_count: Dict[str, int]):
    """Print every file that reaches at least 1% of the year's total.

    Args:
        total_files: Sum of all per-file counts; denominator for percentages.
        file_count: Per-file commit counts.

    Rows are printed in descending percentage order as
    ``percentage<TAB>count<TAB>path``, followed by a separator line.
    """
    print("\n全局单文件集中度(相对于年总数,>=1%):")

    def share_of_total(hits: int) -> float:
        return (hits / total_files) * 100 if total_files > 0 else 0.0

    rows = [(path, hits, share_of_total(hits)) for path, hits in file_count.items()]
    ranked = sorted(
        (row for row in rows if row[2] >= 1),
        key=lambda row: row[2],
        reverse=True,
    )
    for path, hits, share in ranked:
        print(f"{share:6.2f}%\t{hits}\t{path}")
    print("----------------------------------------")
def main():
    """Entry point: check for a git work tree, then analyse each commit year in turn."""
    if not is_git_repo():
        print("错误:请在 git 仓库根目录运行此脚本。", file=sys.stderr)
        sys.exit(1)

    print("Counting commits and directories per year... (this may take a while on big repos)")

    commit_years = get_commit_years()
    if commit_years:
        for commit_year in commit_years:
            analyze_year(commit_year)
        print("\nDone.")
    else:
        print("未检测到任何提交年份。")


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment