Last active
October 23, 2025 04:17
-
-
Save isshiki/0f94c9764c559fe99672d58e7c17b183 to your computer and use it in GitHub Desktop.
ローカルLLMの実行可否を“ひと目判断”するための、環境情報の自動収集→要約スクリプト(macOS/Windows/Linux対応)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| """ | |
| llm_env_report.py | |
| ローカルLLMの実行可否を“ひと目判断”。macOS/Windows/Linux対応(標準ライブラリのみ) | |
| 取得: | |
| - OS/カーネル, 機種/CPU/メモリ/ディスク | |
| - GPU要約(Metal/CUDA/ROCm/MPS推定) | |
| - Python/主要ライブラリ(mlx/torch/coremltools/llama_cpp/vllm) | |
| 出力: | |
| - 推奨モデル規模(7B/8B/13B…量子化目安) | |
| - 量子化別の「推定必要メモリ(重み)」と可否判定 | |
| - 推奨バックエンド、注意点 | |
| - 量子化(Q4/Q5…)の簡易説明 | |
| 使い方: | |
| python llm_env_report.py # テキスト | |
| python llm_env_report.py --format md # Markdown表 | |
| python llm_env_report.py --format json # JSON | |
| python llm_env_report.py --ctx 8000 # 参考コンテキスト長(注記用) | |
| """ | |
| import argparse | |
| import json | |
| import platform | |
| import shutil | |
| import subprocess | |
| from textwrap import dedent | |
| # ---------- helpers ---------- | |
def run(cmd, shell=False):
    """Execute *cmd*, returning its stdout as stripped text; "" on any failure.

    stderr is discarded; decoding errors are ignored so odd locales never raise.
    """
    try:
        raw = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, shell=shell)
    except Exception:
        return ""
    return raw.decode("utf-8", errors="ignore").strip()
def bytes_to_gb(n):
    """Convert a byte count (int or numeric string) to GB rounded to 2 dp.

    Returns None when *n* cannot be interpreted as an integer.
    """
    try:
        gb = int(n) / (1 << 30)
    except Exception:
        return None
    return round(gb, 2)
| def print_table(kv, title=None): | |
| if title: | |
| print(title) | |
| w = max((len(k) for k in kv.keys()), default=0) | |
| for k, v in kv.items(): | |
| if isinstance(v, list): | |
| v = " / ".join(map(str, v)) if v else "-" | |
| print(f" {k:<{w}} : {v}") | |
| print() | |
| # ---------- quantization-aware estimation ---------- | |
def estimate_weights_mem_gb(params_billion: float, quant_bits: int) -> float:
    """Rough weights-only memory footprint in GB.

    Baseline: FP16 costs ~2 GB per 1B parameters; quantization scales that
    by (bits / 16).  Example: 7B at Q4 -> 2 * 7 * (4/16) = 3.5 GB (weights only).
    """
    fp16_gb = 2.0 * params_billion
    return round(fp16_gb * (quant_bits / 16.0), 2)
def human_ok(need_gb: float, mem_total_gb: float, headroom: float = 0.75) -> bool:
    """Feasibility check: does *need_gb* fit within the usable memory budget?

    The budget is total memory scaled by *headroom* (default 0.75) to leave
    room for the OS and unified-memory sharing.
    """
    budget = round(mem_total_gb * headroom, 2)
    return need_gb <= budget
def quant_explanations():
    """Return the fixed human-readable notes on quantization levels (Japanese)."""
    notes = (
        "Q8: 8bit量子化。精度高めだが重い(重みはFP16の約50%)。",
        "Q5: 5bit量子化。精度と軽さのバランスがよい(約31%)。",
        "Q4: 4bit量子化。軽量・高速(約25%)。精度はやや低下。",
        "※表の“推定必要メモリ(重み)”は重みだけの概算。実際の使用量はKVキャッシュ等で増加します。",
    )
    return list(notes)
def build_quant_table(mem_gb: float):
    """Build rows of (model size x quantization) weight-memory estimates.

    Each cell shows the weights-only estimate plus a feasibility mark
    against *mem_gb*.  KV cache is deliberately NOT included; callers
    note that separately.
    """
    model_sizes = [("3B", 3.0), ("4B", 4.0), ("7B", 7.0), ("8B", 8.0), ("13B", 13.0)]
    quant_bits = (8, 5, 4)  # Q8 / Q5 / Q4
    rows = []
    for label, billions in model_sizes:
        row = {"モデル": label}
        for bits in quant_bits:
            need_gb = estimate_weights_mem_gb(billions, bits)
            verdict = "✅" if human_ok(need_gb, mem_gb) else "✕"
            row[f"Q{bits} 推定必要メモリ(重み)"] = f"{need_gb} GB {verdict}"
        rows.append(row)
    return rows
| def print_quant_table(entries): | |
| # シンプル表(テキスト) | |
| headers = ["モデル", "Q8 推定必要メモリ(重み)", "Q5 推定必要メモリ(重み)", "Q4 推定必要メモリ(重み)"] | |
| colw = {h: len(h) for h in headers} | |
| for e in entries: | |
| for h in headers: | |
| colw[h] = max(colw[h], len(str(e[h]))) | |
| # header | |
| line = " " + " | ".join(f"{h:<{colw[h]}}" for h in headers) | |
| sep = " " + "-+-".join("-"*colw[h] for h in headers) | |
| print(line); print(sep) | |
| for e in entries: | |
| print(" " + " | ".join(f"{str(e[h]):<{colw[h]}}" for h in headers)) | |
| print() | |
| # ---------- OS specific collectors ---------- | |
def collect_macos():
    """Collect macOS hardware/GPU/disk facts via system_profiler and sysctl.

    Returns a dict with os/python/machine identifiers, model names, chip,
    total memory in GB, a per-display GPU summary, a Metal-support flag
    (True / False / None = unknown), and root-volume disk totals.
    """
    import json as _json
    info = {}
    info["os"] = f"{platform.system()} {platform.release()} ({platform.version()})"
    info["python"] = platform.python_version()
    info["machine"] = platform.machine()

    def system_profiler(datatype):
        # -json output; empty dict when the tool is missing/fails (run() -> "").
        out = run(["/usr/sbin/system_profiler", "-json", datatype])
        return _json.loads(out) if out else {}

    def sysctl(key):
        out = run(["/usr/sbin/sysctl", "-n", key])
        return out.strip()

    hw = system_profiler("SPHardwareDataType").get("SPHardwareDataType", [{}])[0]
    # Key names differ across macOS versions; try both spellings of each.
    info["model_name"] = hw.get("machine_name") or hw.get("model_name") or ""
    info["model_identifier"] = hw.get("machine_model") or hw.get("model_identifier") or ""
    info["chip"] = hw.get("chip_type") or hw.get("cpu_type") or ""
    info["memory_text"] = hw.get("physical_memory") or ""
    mem_bytes = sysctl("hw.memsize")
    info["memory_gb"] = bytes_to_gb(mem_bytes) if mem_bytes else None
    disp = system_profiler("SPDisplaysDataType").get("SPDisplaysDataType", [])
    info["gpu_summary"] = []
    info["metal_supported"] = None  # None = could not determine
    for d in disp:
        name = d.get("_name") or d.get("spdisplays_chipset_model") or "Apple GPU"
        vram = d.get("spdisplays_vram") or d.get("spdisplays_vram_shared") or ""
        metal = d.get("spdisplays_metal") or ""
        info["gpu_summary"].append({"name": name, "vram": vram, "metal": metal})
        if isinstance(metal, str) and metal:
            # BUGFIX: test the negative strings first — "Not Supported"
            # contains "Supported", so the original order misclassified
            # English "Not Supported" displays as Metal-capable.
            if "Not Supported" in metal or "非対応" in metal:
                info["metal_supported"] = False
            elif "Supported" in metal or "対応" in metal:
                info["metal_supported"] = True
    if info.get("metal_supported") is None:
        # Fallback heuristic: Apple Silicon chips always support Metal.
        chip = (info.get("chip") or "").lower()
        if any(k in chip for k in ("apple", "m1", "m2", "m3", "m4")):
            info["metal_supported"] = True
    if not info.get("gpu_summary"):
        info["gpu_summary"] = [{"name": "Apple GPU", "vram": "(shared)", "metal": "Metal: Supported (assumed)"}]
    total, used, free = shutil.disk_usage("/")
    info["disk_total_gb"] = round(total / (1024**3), 1)
    info["disk_free_gb"] = round(free / (1024**3), 1)
    return info
def collect_windows():
    """Collect Windows hardware/GPU/disk facts via PowerShell CIM queries and nvidia-smi."""
    info = {
        "os": f"{platform.system()} {platform.release()} ({platform.version()})",
        "python": platform.python_version(),
        "machine": platform.machine(),
    }

    def ps(expr):
        # One PowerShell expression, profile scripts suppressed.
        return run(["powershell", "-NoProfile", "-Command", expr])

    mem_total = ps("Get-CimInstance Win32_ComputerSystem | Select-Object -ExpandProperty TotalPhysicalMemory")
    info["memory_gb"] = bytes_to_gb(mem_total) if mem_total else None
    info["model_name"] = ps("(Get-CimInstance Win32_ComputerSystem).Model")
    info["chip"] = ps("(Get-CimInstance Win32_Processor).Name")
    info["gpu_summary"] = []
    gpu_names = ps("Get-CimInstance Win32_VideoController | Select-Object -ExpandProperty Name")
    for name in gpu_names.splitlines():
        name = name.strip()
        if name:
            info["gpu_summary"].append({"name": name, "vram": "", "metal": ""})
    # nvidia-smi presence doubles as the CUDA-availability probe.
    nvsmi = run(["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"])
    info["cuda_supported"] = bool(nvsmi)
    for row in nvsmi.splitlines():
        fields = [f.strip() for f in row.split(",")]
        if len(fields) >= 2:
            info["gpu_summary"].append({"name": f"NVIDIA {fields[0]}", "vram": fields[1], "metal": ""})
    total, used, free = shutil.disk_usage("\\")
    info["disk_total_gb"] = round(total / (1024**3), 1)
    info["disk_free_gb"] = round(free / (1024**3), 1)
    return info
def collect_linux():
    """Collect Linux hardware/GPU/disk facts from /proc, nvidia-smi, rocm-smi, lspci."""
    info = {
        "os": f"{platform.system()} {platform.release()} ({platform.version()})",
        "python": platform.python_version(),
        "machine": platform.machine(),
    }
    mem_bytes = None
    try:
        with open("/proc/meminfo") as fh:
            for row in fh:
                if row.lower().startswith("memtotal"):
                    kib = int(row.split(":")[1].strip().split()[0])
                    mem_bytes = kib * 1024  # MemTotal is reported in kB
                    break
    except Exception:
        pass
    info["memory_gb"] = bytes_to_gb(mem_bytes) if mem_bytes else None
    info["chip"] = run(["bash", "-lc", "grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2"]).strip()
    info["gpu_summary"] = []
    # NVIDIA: nvidia-smi presence doubles as the CUDA-availability probe.
    nvsmi = run(["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"])
    info["cuda_supported"] = bool(nvsmi)
    for row in nvsmi.splitlines():
        fields = [f.strip() for f in row.split(",")]
        if len(fields) >= 2:
            info["gpu_summary"].append({"name": f"NVIDIA {fields[0]}", "vram": fields[1], "metal": ""})
    # AMD: heuristic match on rocm-smi's banner text.
    rocmsmi = run(["rocm-smi"])
    info["rocm_supported"] = bool(rocmsmi and ("GPU" in rocmsmi or "card" in rocmsmi.lower()))
    if info["rocm_supported"]:
        for row in rocmsmi.splitlines():
            if "Card series" in row or "GPU" in row:
                info["gpu_summary"].append({"name": row.strip(), "vram": "", "metal": ""})
    if not info["gpu_summary"]:
        # Last resort: list any VGA/3D/display device from lspci.
        lspci = run(["bash", "-lc", "lspci | egrep -i 'vga|3d|display'"])
        for row in lspci.splitlines():
            info["gpu_summary"].append({"name": row.strip(), "vram": "", "metal": ""})
    total, used, free = shutil.disk_usage("/")
    info["disk_total_gb"] = round(total / (1024**3), 1)
    info["disk_free_gb"] = round(free / (1024**3), 1)
    return info
| # ---------- libs & capability ---------- | |
def check_libs():
    """Probe optional LLM-related Python libraries without hard dependencies.

    Returns a dict of boolean feature flags; every probe failure degrades
    to False, so this is safe to call on any machine.
    """
    libs = {
        "mlx": False,          # macOS only
        "torch": False,
        "torch_mps": False,    # Apple
        "torch_cuda": False,   # NVIDIA
        "torch_rocm": False,   # AMD ROCm
        "coremltools": False,  # mostly macOS
        "llama_cpp": False,
        "vllm": False,
    }
    try:
        import mlx.core as mx  # type: ignore  # noqa: F401
        libs["mlx"] = True
    except Exception:
        pass
    try:
        import torch  # type: ignore
        libs["torch"] = True
        try:
            libs["torch_mps"] = bool(getattr(torch.backends, "mps", None) and torch.backends.mps.is_available())
        except Exception:
            libs["torch_mps"] = False
        try:
            libs["torch_cuda"] = bool(torch.cuda.is_available())
        except Exception:
            libs["torch_cuda"] = False
        try:
            # BUGFIX: ROCm wheels expose torch.version.hip while
            # torch.version.cuda is None there, so scanning only the cuda
            # string for "rocm"/"hip" could never detect a ROCm build.
            hip = getattr(torch.version, "hip", None)
            cuda_ver = str(getattr(torch.version, "cuda", "")).lower()
            libs["torch_rocm"] = bool(hip) or ("rocm" in cuda_ver) or ("hip" in cuda_ver)
        except Exception:
            libs["torch_rocm"] = False
    except Exception:
        pass
    try:
        import coremltools  # type: ignore  # noqa: F401
        libs["coremltools"] = True
    except Exception:
        pass
    try:
        import llama_cpp  # type: ignore  # noqa: F401
        libs["llama_cpp"] = True
    except Exception:
        pass
    try:
        import vllm  # type: ignore  # noqa: F401
        libs["vllm"] = True
    except Exception:
        pass
    return libs
| # ---------- recommendation (mem-based) ---------- | |
def recommend(info, libs):
    """Derive model-scale, backend, and caution notes from *info* + *libs* flags.

    Returns a dict with keys recommended_model_scale, recommended_backends,
    and notes (always a non-empty list).
    """
    osname = platform.system()
    mem = info.get("memory_gb") or 0
    free_disk = info.get("disk_free_gb") or 0
    # Model-scale advice by total memory (quantization-aware wording).
    thresholds = [
        (48, "7B〜13B余裕(Q4〜Q5)。大規模は要工夫。"),
        (32, "7B/8B(Q4〜Q5)快適、13BはQ4で軽用途。"),
        (24, "7B(Q4〜Q5)快適、8BはQ4で可、13Bは厳しめ。"),
        (16, "7B(Q4)現実的。8Bは設定次第。13Bは非推奨。"),
    ]
    size_rec = "メモリ不足(<16GB):7B軽量量子化でも厳しめ。"
    for floor, text in thresholds:
        if mem >= floor:
            size_rec = text
            break
    # Backend recommendation per OS.
    backends = []
    if osname == "Darwin":
        if libs.get("mlx"):
            backends.append("MLX(Apple Silicon)")
        if libs.get("torch_mps"):
            backends.append("PyTorch MPS")
        backends.append("llama.cpp (Metal)")
    elif osname == "Windows":
        if libs.get("torch_cuda"):
            backends.append("PyTorch CUDA")
        backends.append("llama.cpp (CUDA対応ビルド)")
    elif osname == "Linux":
        if libs.get("torch_cuda"):
            backends.append("PyTorch CUDA")
        if libs.get("torch_rocm"):
            backends.append("PyTorch ROCm")
        backends.append("llama.cpp (CUDA/ROCm/CPU)")
    if libs.get("vllm"):
        backends.append("vLLM(GPU前提)")
    backend_rec = " / ".join(backends) if backends else "CPU fallback(遅い)"
    # Cautionary notes.
    notes = []
    if free_disk < 15:
        notes.append("ディスク空きが少なめ(<15GB)。モデル配置に注意。")
    if osname == "Darwin" and info.get("metal_supported") is False:
        notes.append("Metal非対応表示。GPU加速不可の可能性。")
    accel = (libs.get("mlx"), libs.get("torch_mps"), libs.get("torch_cuda"), libs.get("torch_rocm"))
    if not any(accel):
        notes.append("GPU/加速基盤が確認できず。まずは対応ライブラリの導入を。")
    if osname == "Windows" and not info.get("cuda_supported") and not libs.get("torch_cuda"):
        notes.append("WindowsでCUDAが見つかりません。NVIDIA + ドライバ + CUDA Toolkit を確認。")
    if osname == "Linux" and not (info.get("cuda_supported") or info.get("rocm_supported")) and not (libs.get("torch_cuda") or libs.get("torch_rocm")):
        notes.append("LinuxでGPU加速が未検出。NVIDIA(ドライバ/CUDA)またはAMD(ROCm)を確認。")
    try:
        major_minor = tuple(int(p) for p in platform.python_version().split(".")[:2])
        if major_minor >= (3, 13):
            notes.append("Python 3.13: 一部ライブラリ未対応。3.12または3.11推奨。")
    except Exception:
        pass
    return {
        "recommended_model_scale": size_rec,
        "recommended_backends": backend_rec,
        "notes": notes or ["特になし"],
    }
| # ---------- formatting ---------- | |
def format_gpu_lines(info):
    """Flatten gpu_summary dicts into display strings.

    On macOS with confirmed Metal support, guarantees at least one line
    mentions Metal.  Returns ["-"] when nothing is known.
    """
    lines = []
    for gpu in info.get("gpu_summary", []):
        fields = [
            (gpu.get("name", "") or "").strip(),
            (gpu.get("vram", "") or "").strip(),
            (gpu.get("metal", "") or "").strip(),
        ]
        fields = [f for f in fields if f]
        lines.append(" / ".join(fields) if fields else "(unknown GPU)")
    if platform.system() == "Darwin" and info.get("metal_supported") is True:
        if not any("Metal" in entry for entry in lines):
            if lines:
                lines[0] = (lines[0] + " / Metal: Supported").strip()
            else:
                lines = ["Apple GPU / Metal: Supported"]
    return lines or ["-"]
def to_markdown(hw, libs, rec, quant_rows, ctx):
    """Render the full report as GitHub-flavored Markdown tables."""
    def row(key, val):
        return f"| {key} | {val} |"

    md = ["### OS / ハードウェア", "|項目|値|", "|---|---|"]
    for key, val in hw.items():
        if isinstance(val, list):
            val = " / ".join(map(str, val))
        md.append(row(key, val))
    md += ["", "### Python / ライブラリ", "|項目|値|", "|---|---|"]
    md.extend(row(key, val) for key, val in libs.items())
    md += ["", "### 推奨(自動判定)", "|項目|値|", "|---|---|"]
    for key, val in rec.items():
        if isinstance(val, list):
            val = " / ".join(map(str, val))
        md.append(row(key, val))
    # Quantization table (weights-only estimates).
    md += ["", "### 量子化別の推定必要メモリ(重みのみ)", "|モデル|Q8|Q5|Q4|", "|---|---|---|---|"]
    for entry in quant_rows:
        md.append(
            f"| {entry['モデル']} | {entry['Q8 推定必要メモリ(重み)']} | "
            f"{entry['Q5 推定必要メモリ(重み)']} | {entry['Q4 推定必要メモリ(重み)']} |"
        )
    md += ["", f"**注**: 数値は重みのみの概算。実際はKVキャッシュ等で増加。コンテキスト長(参考): ~{ctx} tokens。", ""]
    md += ["**量子化メモ**:"]
    md.extend(f"- {note}" for note in quant_explanations())
    return "\n".join(md)
| # ---------- main ---------- | |
def main():
    """CLI entry point: collect per-OS environment info and print a report.

    Dispatches to the platform-specific collector, probes libraries,
    derives recommendations, then emits the report as plain text (default),
    a Markdown document (--format md), or JSON (--format json).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--format", choices=["text","md","json"], default="text",
                        help="出力形式(text/md/json)")
    parser.add_argument("--ctx", type=int, default=8000,
                        help="参考コンテキスト長(注記用の表示のみ。既定: 8000)")
    args = parser.parse_args()
    # Pick the collector for the current OS (Linux is the fallback).
    osname = platform.system()
    if osname == "Darwin":
        info = collect_macos()
    elif osname == "Windows":
        info = collect_windows()
    else:
        info = collect_linux()
    libs = check_libs()
    rec = recommend(info, libs)
    gpu_lines = format_gpu_lines(info)
    # Remap collector keys to the human-facing (Japanese) labels.
    hw = {
        "OS": info.get("os"),
        "モデル名/機種": info.get("model_name") or info.get("model_identifier") or "-",
        "チップ/CPU": info.get("chip") or "-",
        "メモリ(GB)": info.get("memory_gb"),
        "GPU要約": gpu_lines,
        "ディスク空き(GB)": info.get("disk_free_gb"),
        "ディスク総量(GB)": info.get("disk_total_gb"),
    }
    libshow = {
        "Python": platform.python_version(),
        "mlx": libs.get("mlx"),
        "torch": libs.get("torch"),
        "torch MPS": libs.get("torch_mps"),
        "torch CUDA": libs.get("torch_cuda"),
        "torch ROCm": libs.get("torch_rocm"),
        "coremltools": libs.get("coremltools"),
        "llama_cpp": libs.get("llama_cpp"),
        "vllm": libs.get("vllm"),
    }
    recshow = {
        "推奨モデル規模": rec["recommended_model_scale"],
        "推奨バックエンド": rec["recommended_backends"],
        "注意点": rec["notes"],
    }
    # Quantization table (weights-only estimate, keyed to total memory).
    mem_total = info.get("memory_gb") or 0.0
    quant_rows = build_quant_table(mem_total)
    if args.format == "json":
        payload = {
            "hardware": hw,
            "libs": libshow,
            "recommendation": recshow,
            "quant_table": quant_rows,
            "quant_notes": quant_explanations(),
            "context_hint_tokens": args.ctx,
        }
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return
    if args.format == "md":
        print(to_markdown(hw, libshow, recshow, quant_rows, args.ctx))
        return
    # text (default)
    print("\n=== OS / ハードウェア ==="); print_table(hw)
    print("=== Python / ライブラリ ==="); print_table(libshow)
    print("=== 推奨(自動判定) ==="); print_table(recshow)
    print("=== 量子化別の推定必要メモリ(重みのみ) ===")
    print_quant_table(quant_rows)
    print("量子化メモ:")
    for line in quant_explanations():
        print(" - " + line)
    print(f"\n注: 上表の数値は“重みのみ”の概算です。実際の使用メモリは KVキャッシュ等で増加します。"
          f" コンテキスト長(参考): ~{args.ctx} tokens")
    hint = dedent("""
    ヒント:
    - まずは 7B(Q4/Q5)で推論確認 → 8B / 13B へ段階的に。
    - Mac: MLX / llama.cpp(Metal)。Windows/Linux: NVIDIAはCUDA、AMDはROCm、CPUならllama.cpp。
    - モデル配置はサイズ+30%の余白を目安に。長文はKVキャッシュでメモリ増。
    - PyTorch: MPS/CUDA/ROCm が True になっているか確認。
    """).strip()
    print("\n" + hint)
| if __name__ == "__main__": | |
| main() | |
| # 出力例: | |
| # | |
| # === OS / ハードウェア === | |
| # OS : Darwin 24.6.0 (Darwin Kernel Version 24.6.0: Mon Jul 14 11:30:40 PDT 2025; root:xnu-11417.140.69~1/RELEASE_ARM64_T8132) | |
| # モデル名/機種 : MacBook Pro | |
| # チップ/CPU : Apple M4 | |
| # メモリ(GB) : 16.0 | |
| # GPU要約 : Apple M4 / Metal: Supported | |
| # ディスク空き(GB) : 284.1 | |
| # ディスク総量(GB) : 460.4 | |
| # | |
| # === Python / ライブラリ === | |
| # Python : 3.13.5 | |
| # mlx : False | |
| # torch : False | |
| # torch MPS : False | |
| # torch CUDA : False | |
| # torch ROCm : False | |
| # coremltools : False | |
| # llama_cpp : False | |
| # vllm : False | |
| # | |
| # === 推奨(自動判定) === | |
| # 推奨モデル規模 : 7B(Q4)現実的。8Bは設定次第。13Bは非推奨。 | |
| # 推奨バックエンド : llama.cpp (Metal) | |
| # 注意点 : GPU/加速基盤が確認できず。まずは対応ライブラリの導入を。 / Python 3.13: 一部ライブラリ未対応。3.12または3.11推奨。 | |
| # | |
| # === 量子化別の推定必要メモリ(重みのみ) === | |
| # モデル | Q8 推定必要メモリ(重み) | Q5 推定必要メモリ(重み) | Q4 推定必要メモリ(重み) | |
| # ----+----------------+----------------+--------------- | |
| # 3B | 3.0 GB ✅ | 1.88 GB ✅ | 1.5 GB ✅ | |
| # 4B | 4.0 GB ✅ | 2.5 GB ✅ | 2.0 GB ✅ | |
| # 7B | 7.0 GB ✅ | 4.38 GB ✅ | 3.5 GB ✅ | |
| # 8B | 8.0 GB ✅ | 5.0 GB ✅ | 4.0 GB ✅ | |
| # 13B | 13.0 GB ✕ | 8.12 GB ✅ | 6.5 GB ✅ | |
| # | |
| # 量子化メモ: | |
| # - Q8: 8bit量子化。精度高めだが重い(重みはFP16の約50%)。 | |
| # - Q5: 5bit量子化。精度と軽さのバランスがよい(約31%)。 | |
| # - Q4: 4bit量子化。軽量・高速(約25%)。精度はやや低下。 | |
| # - ※表の“推定必要メモリ(重み)”は重みだけの概算。実際の使用量はKVキャッシュ等で増加します。 | |
| # | |
| # 注: 上表の数値は“重みのみ”の概算です。実際の使用メモリは KVキャッシュ等で増加します。 コンテキスト長(参考): ~8000 tokens | |
| # | |
| # ヒント: | |
| # - まずは 7B(Q4/Q5)で推論確認 → 8B / 13B へ段階的に。 | |
| # - Mac: MLX / llama.cpp(Metal)。Windows/Linux: NVIDIAはCUDA、AMDはROCm、CPUならllama.cpp。 | |
| # - モデル配置はサイズ+30%の余白を目安に。長文はKVキャッシュでメモリ増。 | |
| # - PyTorch: MPS/CUDA/ROCm が True になっているか確認。 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment