Skip to content

Instantly share code, notes, and snippets.

@isshiki
Last active October 23, 2025 04:17
Show Gist options
  • Select an option

  • Save isshiki/0f94c9764c559fe99672d58e7c17b183 to your computer and use it in GitHub Desktop.

Select an option

Save isshiki/0f94c9764c559fe99672d58e7c17b183 to your computer and use it in GitHub Desktop.
ローカルLLMの実行可否を“ひと目判断”するための、環境情報の自動収集→要約スクリプト(macOS/Windows/Linux対応)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
llm_env_report.py
ローカルLLMの実行可否を“ひと目判断”。macOS/Windows/Linux対応(標準ライブラリのみ)
取得:
- OS/カーネル, 機種/CPU/メモリ/ディスク
- GPU要約(Metal/CUDA/ROCm/MPS推定)
- Python/主要ライブラリ(mlx/torch/coremltools/llama_cpp/vllm)
出力:
- 推奨モデル規模(7B/8B/13B…量子化目安)
- 量子化別の「推定必要メモリ(重み)」と可否判定
- 推奨バックエンド、注意点
- 量子化(Q4/Q5…)の簡易説明
使い方:
python llm_env_report.py # テキスト
python llm_env_report.py --format md # Markdown表
python llm_env_report.py --format json # JSON
python llm_env_report.py --ctx 8000 # 参考コンテキスト長(注記用)
"""
import argparse
import json
import platform
import shutil
import subprocess
from textwrap import dedent
# ---------- helpers ----------
def run(cmd, shell=False):
    """Execute *cmd* and return its stripped stdout, or "" on any failure.

    stderr is discarded; a missing binary or non-zero exit yields "".
    """
    try:
        raw = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, shell=shell)
    except Exception:
        return ""
    return raw.decode("utf-8", errors="ignore").strip()
def bytes_to_gb(n):
    """Convert a byte count (int or numeric string) to GB, rounded to 2 places.

    Returns None when *n* cannot be interpreted as an integer.
    """
    try:
        value = int(n)
    except Exception:
        return None
    return round(value / (1024 ** 3), 2)
def print_table(kv, title=None):
    """Pretty-print dict *kv* as aligned "key : value" lines.

    List values are joined with " / " (or shown as "-" when empty); an
    optional *title* line precedes the table, and a blank line follows it.
    """
    if title:
        print(title)
    width = max((len(key) for key in kv), default=0)
    for key, value in kv.items():
        if isinstance(value, list):
            value = " / ".join(map(str, value)) if value else "-"
        print(f" {key:<{width}} : {value}")
    print()
# ---------- quantization-aware estimation ----------
def estimate_weights_mem_gb(params_billion: float, quant_bits: int) -> float:
    """Rough weight-only memory footprint in GB.

    Baseline: FP16 needs about 2 GB per 1B parameters; quantization scales
    that by bits/16. Example: 7B at Q4 -> 2*7*(4/16) = 3.5 GB (weights only).
    """
    scale = quant_bits / 16.0
    return round(2.0 * params_billion * scale, 2)
def human_ok(need_gb: float, mem_total_gb: float, headroom: float = 0.75) -> bool:
    """Feasibility check: *need_gb* must fit within mem_total_gb * headroom.

    The 0.75 default leaves room for the OS and unified-memory sharing.
    """
    budget = round(mem_total_gb * headroom, 2)
    return need_gb <= budget
def quant_explanations():
    """Return the fixed explanatory notes (Japanese UI text) about quantization levels."""
    notes = (
        "Q8: 8bit量子化。精度高めだが重い(重みはFP16の約50%)。",
        "Q5: 5bit量子化。精度と軽さのバランスがよい(約31%)。",
        "Q4: 4bit量子化。軽量・高速(約25%)。精度はやや低下。",
        "※表の“推定必要メモリ(重み)”は重みだけの概算。実際の使用量はKVキャッシュ等で増加します。",
    )
    return list(notes)
def build_quant_table(mem_gb: float):
    """Build rows of (model size x quantization) weight-memory estimates.

    Each row maps a model label to "X GB <mark>" strings for Q8/Q5/Q4,
    where the mark is a fit/no-fit symbol against *mem_gb*.
    KV cache is deliberately excluded (callers print a note about that).
    """
    model_sizes = [
        ("3B", 3.0),
        ("4B", 4.0),
        ("7B", 7.0),
        ("8B", 8.0),
        ("13B", 13.0),
    ]
    rows = []
    for label, billions in model_sizes:
        row = {"モデル": label}
        for bits in (8, 5, 4):  # Q8 / Q5 / Q4
            need_gb = estimate_weights_mem_gb(billions, bits)
            fits = human_ok(need_gb, mem_gb)
            symbol = "✅" if fits else "✕"
            row[f"Q{bits} 推定必要メモリ(重み)"] = f"{need_gb} GB {symbol}"
        rows.append(row)
    return rows
def print_quant_table(entries):
    """Render the quantization rows from build_quant_table as aligned text columns."""
    headers = ["モデル", "Q8 推定必要メモリ(重み)", "Q5 推定必要メモリ(重み)", "Q4 推定必要メモリ(重み)"]
    widths = {h: len(h) for h in headers}
    # Widen each column to fit its longest cell.
    for entry in entries:
        for h in headers:
            widths[h] = max(widths[h], len(str(entry[h])))
    print(" " + " | ".join(f"{h:<{widths[h]}}" for h in headers))
    print(" " + "-+-".join("-" * widths[h] for h in headers))
    for entry in entries:
        print(" " + " | ".join(f"{str(entry[h]):<{widths[h]}}" for h in headers))
    print()
# ---------- OS specific collectors ----------
def collect_macos():
    """Collect OS/CPU/memory/GPU/disk facts on macOS via system_profiler/sysctl.

    Returns a dict with keys: os, python, machine, model_name,
    model_identifier, chip, memory_text, memory_gb, gpu_summary,
    metal_supported, disk_total_gb, disk_free_gb. Missing tools degrade
    gracefully to empty/None values (run() returns "" on failure).
    """
    import json as _json
    info = {}
    info["os"] = f"{platform.system()} {platform.release()} ({platform.version()})"
    info["python"] = platform.python_version()
    info["machine"] = platform.machine()

    def system_profiler(datatype):
        # -json output; empty dict when the tool is missing or fails.
        out = run(["/usr/sbin/system_profiler", "-json", datatype])
        return _json.loads(out) if out else {}

    def sysctl(key):
        out = run(["/usr/sbin/sysctl", "-n", key])
        return out.strip()

    hw = system_profiler("SPHardwareDataType").get("SPHardwareDataType", [{}])[0]
    info["model_name"] = hw.get("machine_name") or hw.get("model_name") or ""
    info["model_identifier"] = hw.get("machine_model") or hw.get("model_identifier") or ""
    info["chip"] = hw.get("chip_type") or hw.get("cpu_type") or ""
    info["memory_text"] = hw.get("physical_memory") or ""
    mem_bytes = sysctl("hw.memsize")
    info["memory_gb"] = bytes_to_gb(mem_bytes) if mem_bytes else None

    disp = system_profiler("SPDisplaysDataType").get("SPDisplaysDataType", [])
    info["gpu_summary"] = []
    info["metal_supported"] = None
    for d in disp:
        name = d.get("_name") or d.get("spdisplays_chipset_model") or "Apple GPU"
        vram = d.get("spdisplays_vram") or d.get("spdisplays_vram_shared") or ""
        metal = d.get("spdisplays_metal") or ""
        info["gpu_summary"].append({"name": name, "vram": vram, "metal": metal})
        if isinstance(metal, str) and metal:
            # BUGFIX: test the negative phrases first. "Not Supported" contains
            # the substring "Supported" (and 非対応 contains 対応), so the old
            # positive-first check misclassified unsupported GPUs as
            # Metal-capable.
            if "Not Supported" in metal or "非対応" in metal:
                info["metal_supported"] = False
            elif "Supported" in metal or "対応" in metal:
                info["metal_supported"] = True
    if info.get("metal_supported") is None:
        # Fall back to a chip-name heuristic: Apple Silicon always has Metal.
        chip = (info.get("chip") or "").lower()
        if any(k in chip for k in ("apple", "m1", "m2", "m3", "m4")):
            info["metal_supported"] = True
    if not info.get("gpu_summary"):
        info["gpu_summary"] = [{"name": "Apple GPU", "vram": "(shared)", "metal": "Metal: Supported (assumed)"}]

    total, used, free = shutil.disk_usage("/")
    info["disk_total_gb"] = round(total / (1024**3), 1)
    info["disk_free_gb"] = round(free / (1024**3), 1)
    return info
def collect_windows():
    """Collect OS/CPU/memory/GPU/disk facts on Windows via PowerShell CIM queries."""
    info = {}
    info["os"] = f"{platform.system()} {platform.release()} ({platform.version()})"
    info["python"] = platform.python_version()
    info["machine"] = platform.machine()
    # Total physical RAM in bytes; run() returns "" if PowerShell is unavailable.
    mem_total = run(["powershell", "-NoProfile", "-Command",
                     "Get-CimInstance Win32_ComputerSystem | Select-Object -ExpandProperty TotalPhysicalMemory"])
    info["memory_gb"] = bytes_to_gb(mem_total) if mem_total else None
    info["model_name"] = run(["powershell", "-NoProfile", "-Command",
                              "(Get-CimInstance Win32_ComputerSystem).Model"])
    info["chip"] = run(["powershell", "-NoProfile", "-Command",
                        "(Get-CimInstance Win32_Processor).Name"])
    info["gpu_summary"] = []
    # One display adapter name per line.
    gpus = run(["powershell", "-NoProfile", "-Command",
                "Get-CimInstance Win32_VideoController | Select-Object -ExpandProperty Name"])
    for line in gpus.splitlines():
        line = line.strip()
        if line:
            info["gpu_summary"].append({"name": line, "vram": "", "metal": ""})
    # nvidia-smi producing output doubles as the CUDA-capability signal.
    nvsmi = run(["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"])
    info["cuda_supported"] = bool(nvsmi)
    if nvsmi:
        for l in nvsmi.splitlines():
            parts = [x.strip() for x in l.split(",")]
            if len(parts) >= 2:
                name, mem = parts[:2]
                info["gpu_summary"].append({"name": f"NVIDIA {name}", "vram": mem, "metal": ""})
    total, used, free = shutil.disk_usage("\\")
    info["disk_total_gb"] = round(total / (1024**3), 1)
    info["disk_free_gb"] = round(free / (1024**3), 1)
    return info
def collect_linux():
    """Collect OS/CPU/memory/GPU/disk facts on Linux (procfs + vendor CLI tools)."""
    info = {}
    info["os"] = f"{platform.system()} {platform.release()} ({platform.version()})"
    info["python"] = platform.python_version()
    info["machine"] = platform.machine()
    # RAM: the MemTotal line in /proc/meminfo is reported in kB.
    mem_bytes = None
    try:
        with open("/proc/meminfo") as f:
            for line in f:
                if line.lower().startswith("memtotal"):
                    kb = int(line.split(":")[1].strip().split()[0])
                    mem_bytes = kb * 1024
                    break
    except Exception:
        pass
    info["memory_gb"] = bytes_to_gb(mem_bytes) if mem_bytes else None
    # CPU model string from the first "model name" line in /proc/cpuinfo.
    info["chip"] = run(["bash", "-lc", "grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2"]).strip()
    info["gpu_summary"] = []
    # NVIDIA: nvidia-smi producing output doubles as the CUDA-capability signal.
    nvsmi = run(["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"])
    info["cuda_supported"] = bool(nvsmi)
    if nvsmi:
        for l in nvsmi.splitlines():
            parts = [x.strip() for x in l.split(",")]
            if len(parts) >= 2:
                name, mem = parts[:2]
                info["gpu_summary"].append({"name": f"NVIDIA {name}", "vram": mem, "metal": ""})
    # AMD: heuristic — any rocm-smi output mentioning GPU/card counts as ROCm.
    rocmsmi = run(["rocm-smi"])
    info["rocm_supported"] = bool(rocmsmi and ("GPU" in rocmsmi or "card" in rocmsmi.lower()))
    if info["rocm_supported"]:
        for line in rocmsmi.splitlines():
            if "Card series" in line or "GPU" in line:
                info["gpu_summary"].append({"name": line.strip(), "vram": "", "metal": ""})
    # Last resort: list display devices via lspci.
    if not info["gpu_summary"]:
        lspci = run(["bash", "-lc", "lspci | egrep -i 'vga|3d|display'"])
        for line in lspci.splitlines():
            info["gpu_summary"].append({"name": line.strip(), "vram": "", "metal": ""})
    total, used, free = shutil.disk_usage("/")
    info["disk_total_gb"] = round(total / (1024**3), 1)
    info["disk_free_gb"] = round(free / (1024**3), 1)
    return info
# ---------- libs & capability ----------
def check_libs():
    """Report which LLM-related Python packages import cleanly, plus torch backends.

    Every value is a plain bool so the result prints and JSON-serializes cleanly.
    """
    libs = {
        "mlx": False,          # macOS only
        "torch": False,
        "torch_mps": False,    # Apple GPU backend
        "torch_cuda": False,   # NVIDIA
        "torch_rocm": False,   # AMD ROCm (rough heuristic on torch.version.cuda)
        "coremltools": False,  # mostly macOS
        "llama_cpp": False,
        "vllm": False,
    }
    # Plain importability probes.
    for key, module_name in (
        ("mlx", "mlx.core"),
        ("coremltools", "coremltools"),
        ("llama_cpp", "llama_cpp"),
        ("vllm", "vllm"),
    ):
        try:
            __import__(module_name)
            libs[key] = True
        except Exception:
            pass
    # torch and its accelerator backends, each probe guarded independently.
    try:
        import torch  # type: ignore
    except Exception:
        return libs
    libs["torch"] = True
    try:
        mps = getattr(torch.backends, "mps", None)
        libs["torch_mps"] = bool(mps and torch.backends.mps.is_available())
    except Exception:
        libs["torch_mps"] = False
    try:
        libs["torch_cuda"] = bool(torch.cuda.is_available())
    except Exception:
        libs["torch_cuda"] = False
    try:
        cuda_ver = str(getattr(torch.version, "cuda", "")).lower()
        libs["torch_rocm"] = ("rocm" in cuda_ver) or ("hip" in cuda_ver)
    except Exception:
        libs["torch_rocm"] = False
    return libs
# ---------- recommendation (mem-based) ----------
def recommend(info, libs):
    """Derive a model-size hint, backend list, and caution notes for this machine.

    *info* comes from one of the collect_* functions and *libs* from
    check_libs(). Returns a dict with keys recommended_model_scale,
    recommended_backends, and notes (all user-facing Japanese text).
    """
    osname = platform.system()
    mem = info.get("memory_gb") or 0
    free_disk = info.get("disk_free_gb") or 0

    # Model scale chosen from total RAM (quantization-aware wording).
    size_rec = "メモリ不足(<16GB):7B軽量量子化でも厳しめ。"
    for threshold, text in (
        (48, "7B〜13B余裕(Q4〜Q5)。大規模は要工夫。"),
        (32, "7B/8B(Q4〜Q5)快適、13BはQ4で軽用途。"),
        (24, "7B(Q4〜Q5)快適、8BはQ4で可、13Bは厳しめ。"),
        (16, "7B(Q4)現実的。8Bは設定次第。13Bは非推奨。"),
    ):
        if mem >= threshold:
            size_rec = text
            break

    # Backend candidates per OS; llama.cpp is always the portable fallback.
    backends = []
    if osname == "Darwin":
        if libs.get("mlx"):
            backends.append("MLX(Apple Silicon)")
        if libs.get("torch_mps"):
            backends.append("PyTorch MPS")
        backends.append("llama.cpp (Metal)")
    elif osname == "Windows":
        if libs.get("torch_cuda"):
            backends.append("PyTorch CUDA")
        backends.append("llama.cpp (CUDA対応ビルド)")
    elif osname == "Linux":
        if libs.get("torch_cuda"):
            backends.append("PyTorch CUDA")
        if libs.get("torch_rocm"):
            backends.append("PyTorch ROCm")
        backends.append("llama.cpp (CUDA/ROCm/CPU)")
    if libs.get("vllm"):
        backends.append("vLLM(GPU前提)")
    backend_rec = " / ".join(backends) if backends else "CPU fallback(遅い)"

    # Cautionary notes.
    notes = []
    if free_disk < 15:
        notes.append("ディスク空きが少なめ(<15GB)。モデル配置に注意。")
    if osname == "Darwin" and info.get("metal_supported") is False:
        notes.append("Metal非対応表示。GPU加速不可の可能性。")
    accel_flags = (libs.get("mlx"), libs.get("torch_mps"), libs.get("torch_cuda"), libs.get("torch_rocm"))
    if not any(accel_flags):
        notes.append("GPU/加速基盤が確認できず。まずは対応ライブラリの導入を。")
    if osname == "Windows" and not info.get("cuda_supported") and not libs.get("torch_cuda"):
        notes.append("WindowsでCUDAが見つかりません。NVIDIA + ドライバ + CUDA Toolkit を確認。")
    if osname == "Linux" and not (info.get("cuda_supported") or info.get("rocm_supported")) and not (libs.get("torch_cuda") or libs.get("torch_rocm")):
        notes.append("LinuxでGPU加速が未検出。NVIDIA(ドライバ/CUDA)またはAMD(ROCm)を確認。")
    try:
        major_minor = tuple(int(x) for x in platform.python_version().split(".")[:2])
        if major_minor >= (3, 13):
            notes.append("Python 3.13: 一部ライブラリ未対応。3.12または3.11推奨。")
    except Exception:
        pass

    return {
        "recommended_model_scale": size_rec,
        "recommended_backends": backend_rec,
        "notes": notes or ["特になし"],
    }
# ---------- formatting ----------
def format_gpu_lines(info):
    """Flatten gpu_summary dicts into display strings.

    On macOS, when Metal support was detected but no line mentions it,
    a "Metal: Supported" marker is appended (or synthesized).
    """
    lines = []
    for gpu in info.get("gpu_summary", []):
        fields = [(gpu.get(key, "") or "").strip() for key in ("name", "vram", "metal")]
        fields = [f for f in fields if f]
        lines.append(" / ".join(fields) if fields else "(unknown GPU)")
    if platform.system() == "Darwin" and info.get("metal_supported") is True:
        if not any("Metal" in entry for entry in lines):
            if lines:
                lines[0] = (lines[0] + " / Metal: Supported").strip()
            else:
                lines = ["Apple GPU / Metal: Supported"]
    return lines or ["-"]
def to_markdown(hw, libs, rec, quant_rows, ctx):
    """Render hardware, library, recommendation, and quantization sections as Markdown."""
    lines = []

    def emit_kv_section(title, mapping, join_lists=True):
        # Each section is a two-column Markdown table.
        lines.extend([title, "|項目|値|", "|---|---|"])
        for key, value in mapping.items():
            if join_lists and isinstance(value, list):
                value = " / ".join(map(str, value))
            lines.append(f"| {key} | {value} |")

    emit_kv_section("### OS / ハードウェア", hw)
    lines.append("")
    emit_kv_section("### Python / ライブラリ", libs, join_lists=False)
    lines.append("")
    emit_kv_section("### 推奨(自動判定)", rec)
    # Quantization table.
    lines.extend(["", "### 量子化別の推定必要メモリ(重みのみ)", "|モデル|Q8|Q5|Q4|", "|---|---|---|---|"])
    for entry in quant_rows:
        lines.append(
            f"| {entry['モデル']} | {entry['Q8 推定必要メモリ(重み)']} | {entry['Q5 推定必要メモリ(重み)']} | {entry['Q4 推定必要メモリ(重み)']} |"
        )
    lines.extend(["", f"**注**: 数値は重みのみの概算。実際はKVキャッシュ等で増加。コンテキスト長(参考): ~{ctx} tokens。", ""])
    lines.append("**量子化メモ**:")
    for note in quant_explanations():
        lines.append(f"- {note}")
    return "\n".join(lines)
# ---------- main ----------
def main():
    """CLI entry point: collect environment info and print a text/Markdown/JSON report."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--format", choices=["text","md","json"], default="text",
                        help="出力形式(text/md/json)")
    parser.add_argument("--ctx", type=int, default=8000,
                        help="参考コンテキスト長(注記用の表示のみ。既定: 8000)")
    args = parser.parse_args()
    # Dispatch to the OS-specific collector; anything non-Darwin/Windows is treated as Linux.
    osname = platform.system()
    if osname == "Darwin":
        info = collect_macos()
    elif osname == "Windows":
        info = collect_windows()
    else:
        info = collect_linux()
    libs = check_libs()
    rec = recommend(info, libs)
    gpu_lines = format_gpu_lines(info)
    # Display dictionaries — the Japanese keys are the user-facing labels.
    hw = {
        "OS": info.get("os"),
        "モデル名/機種": info.get("model_name") or info.get("model_identifier") or "-",
        "チップ/CPU": info.get("chip") or "-",
        "メモリ(GB)": info.get("memory_gb"),
        "GPU要約": gpu_lines,
        "ディスク空き(GB)": info.get("disk_free_gb"),
        "ディスク総量(GB)": info.get("disk_total_gb"),
    }
    libshow = {
        "Python": platform.python_version(),
        "mlx": libs.get("mlx"),
        "torch": libs.get("torch"),
        "torch MPS": libs.get("torch_mps"),
        "torch CUDA": libs.get("torch_cuda"),
        "torch ROCm": libs.get("torch_rocm"),
        "coremltools": libs.get("coremltools"),
        "llama_cpp": libs.get("llama_cpp"),
        "vllm": libs.get("vllm"),
    }
    recshow = {
        "推奨モデル規模": rec["recommended_model_scale"],
        "推奨バックエンド": rec["recommended_backends"],
        "注意点": rec["notes"],
    }
    # Quantization table (weight-only estimate, based on total RAM).
    mem_total = info.get("memory_gb") or 0.0
    quant_rows = build_quant_table(mem_total)
    if args.format == "json":
        payload = {
            "hardware": hw,
            "libs": libshow,
            "recommendation": recshow,
            "quant_table": quant_rows,
            "quant_notes": quant_explanations(),
            "context_hint_tokens": args.ctx,
        }
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return
    if args.format == "md":
        print(to_markdown(hw, libshow, recshow, quant_rows, args.ctx))
        return
    # text (default)
    print("\n=== OS / ハードウェア ==="); print_table(hw)
    print("=== Python / ライブラリ ==="); print_table(libshow)
    print("=== 推奨(自動判定) ==="); print_table(recshow)
    print("=== 量子化別の推定必要メモリ(重みのみ) ===")
    print_quant_table(quant_rows)
    print("量子化メモ:")
    for line in quant_explanations():
        print(" - " + line)
    print(f"\n注: 上表の数値は“重みのみ”の概算です。実際の使用メモリは KVキャッシュ等で増加します。"
          f" コンテキスト長(参考): ~{args.ctx} tokens")
    hint = dedent("""
    ヒント:
    - まずは 7B(Q4/Q5)で推論確認 → 8B / 13B へ段階的に。
    - Mac: MLX / llama.cpp(Metal)。Windows/Linux: NVIDIAはCUDA、AMDはROCm、CPUならllama.cpp。
    - モデル配置はサイズ+30%の余白を目安に。長文はKVキャッシュでメモリ増。
    - PyTorch: MPS/CUDA/ROCm が True になっているか確認。
    """).strip()
    print("\n" + hint)


if __name__ == "__main__":
    main()
# 出力例:
#
# === OS / ハードウェア ===
# OS : Darwin 24.6.0 (Darwin Kernel Version 24.6.0: Mon Jul 14 11:30:40 PDT 2025; root:xnu-11417.140.69~1/RELEASE_ARM64_T8132)
# モデル名/機種 : MacBook Pro
# チップ/CPU : Apple M4
# メモリ(GB) : 16.0
# GPU要約 : Apple M4 / Metal: Supported
# ディスク空き(GB) : 284.1
# ディスク総量(GB) : 460.4
#
# === Python / ライブラリ ===
# Python : 3.13.5
# mlx : False
# torch : False
# torch MPS : False
# torch CUDA : False
# torch ROCm : False
# coremltools : False
# llama_cpp : False
# vllm : False
#
# === 推奨(自動判定) ===
# 推奨モデル規模 : 7B(Q4)現実的。8Bは設定次第。13Bは非推奨。
# 推奨バックエンド : llama.cpp (Metal)
# 注意点 : GPU/加速基盤が確認できず。まずは対応ライブラリの導入を。 / Python 3.13: 一部ライブラリ未対応。3.12または3.11推奨。
#
# === 量子化別の推定必要メモリ(重みのみ) ===
# モデル | Q8 推定必要メモリ(重み) | Q5 推定必要メモリ(重み) | Q4 推定必要メモリ(重み)
# ----+----------------+----------------+---------------
# 3B | 3.0 GB ✅ | 1.88 GB ✅ | 1.5 GB ✅
# 4B | 4.0 GB ✅ | 2.5 GB ✅ | 2.0 GB ✅
# 7B | 7.0 GB ✅ | 4.38 GB ✅ | 3.5 GB ✅
# 8B | 8.0 GB ✅ | 5.0 GB ✅ | 4.0 GB ✅
# 13B | 13.0 GB ✕ | 8.12 GB ✅ | 6.5 GB ✅
#
# 量子化メモ:
# - Q8: 8bit量子化。精度高めだが重い(重みはFP16の約50%)。
# - Q5: 5bit量子化。精度と軽さのバランスがよい(約31%)。
# - Q4: 4bit量子化。軽量・高速(約25%)。精度はやや低下。
# - ※表の“推定必要メモリ(重み)”は重みだけの概算。実際の使用量はKVキャッシュ等で増加します。
#
# 注: 上表の数値は“重みのみ”の概算です。実際の使用メモリは KVキャッシュ等で増加します。 コンテキスト長(参考): ~8000 tokens
#
# ヒント:
# - まずは 7B(Q4/Q5)で推論確認 → 8B / 13B へ段階的に。
# - Mac: MLX / llama.cpp(Metal)。Windows/Linux: NVIDIAはCUDA、AMDはROCm、CPUならllama.cpp。
# - モデル配置はサイズ+30%の余白を目安に。長文はKVキャッシュでメモリ増。
# - PyTorch: MPS/CUDA/ROCm が True になっているか確認。
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment