Skip to content

Instantly share code, notes, and snippets.

@badlogic
Created December 1, 2025 01:22
Show Gist options
  • Select an option

  • Save badlogic/f45e8f6e481e5ab7d3a50659da84edaa to your computer and use it in GitHub Desktop.

Select an option

Save badlogic/f45e8f6e481e5ab7d3a50659da84edaa to your computer and use it in GitHub Desktop.
{
"id": "d8036258-e780-411b-8cd9-484757d9ab6d",
"started_at": "2025-11-30T00:09:52.550637",
"finished_at": "2025-12-01T01:49:55.650761",
"n_total_trials": 445,
"stats": {
"n_trials": 445,
"n_errors": 71,
"evals": {
"pi__claude-opus-4-5__terminal-bench": {
"n_trials": 428,
"n_errors": 71,
"metrics": [
{
"mean": 0.4786516853932584
}
],
"reward_stats": {
"reward": {
"1.0": [
"bn-fit-modify__MBkiuCY",
"break-filter-js-from-html__NKPWcqM",
"build-cython-ext__r3EepTj",
"build-pmars__nYiCDhG",
"circuit-fibsqrt__zdDega6",
"cobol-modernization__SyJLppq",
"code-from-image__UAy3EnG",
"compile-compcert__bqLjQtv",
"constraints-scheduling__XvUAjmJ",
"crack-7z-hash__hvLXTwG",
"custom-memory-heap-crash__A3g4egb",
"distribution-search__mZRddUK",
"financial-document-processor__GTASLij",
"fix-code-vulnerability__tZVzvsv",
"fix-git__f9kZSWD",
"fix-ocaml-gc__soJdYgx",
"git-leak-recovery__V9vb6JL",
"git-multibranch__fCc8DJS",
"headless-terminal__gpzuiaP",
"hf-model-inference__z4Sv2pS",
"kv-store-grpc__rx2NU3N",
"large-scale-text-editing__CtjcYi9",
"largest-eigenval__osvZs3b",
"llm-inference-batching-scheduler__mPm2L3K",
"merge-diff-arc-agi-task__By7FWKV",
"modernize-scientific-stack__zUdU2Bp",
"multi-source-data-merger__A8mzDEn",
"nginx-request-logging__Za7RKSs",
"openssl-selfsigned-cert__js5FXt5",
"overfull-hbox__K79ycE3",
"portfolio-optimization__XsUhW3e",
"prove-plus-comm__prcL7VX",
"pypi-server__Csty9e3",
"pytorch-model-cli__ZTPLofA",
"regex-log__DHwNpzo",
"sanitize-git-repo__E9v6CtH",
"sqlite-db-truncate__yQcfcV6",
"sqlite-with-gcov__KmaSPyr",
"torch-pipeline-parallelism__dNDjASi",
"tune-mjcf__ww444sp",
"vulnerable-secret__8nSTUmB",
"write-compressor__qyjnsFX",
"adaptive-rejection-sampler__vPHShtA",
"bn-fit-modify__rvMX7pQ",
"break-filter-js-from-html__oJJXPmT",
"build-cython-ext__9wFk3SV",
"build-pmars__NSpuViU",
"build-pov-ray__FxcqLzs",
"circuit-fibsqrt__KA3DzfC",
"cobol-modernization__4H7K9J3",
"code-from-image__s7jsz4b",
"compile-compcert__9VCrrXy",
"constraints-scheduling__72CTS6a",
"count-dataset-tokens__hnVUwaV",
"crack-7z-hash__TQMLUvv",
"custom-memory-heap-crash__sk4dJMb",
"distribution-search__TPdTvep",
"dna-assembly__UYJm5Pg",
"feal-differential-cryptanalysis__ecANauE",
"financial-document-processor__rkYWUDq",
"fix-code-vulnerability__zMjb4B2",
"fix-git__yHstGWF",
"fix-ocaml-gc__L3QNYHT",
"git-leak-recovery__Y9QN6uN",
"git-multibranch__u7YFC5A",
"hf-model-inference__K8iCKMU",
"kv-store-grpc__WJYgYc9",
"large-scale-text-editing__KEdkZ7d",
"largest-eigenval__jSdX4yy",
"multi-source-data-merger__aDwsVC3",
"nginx-request-logging__gnKXgh3",
"openssl-selfsigned-cert__dZdhTPA",
"portfolio-optimization__RBd8pXf",
"prove-plus-comm__LfiVaTr",
"pypi-server__kU344jA",
"pytorch-model-cli__BatW8ct",
"regex-log__Nmx5qqD",
"rstan-to-pystan__RcLMvum",
"sanitize-git-repo__T7E2b6g",
"sqlite-db-truncate__vgxDqg6",
"sqlite-with-gcov__DBUkWDt",
"torch-pipeline-parallelism__LNRoiJP",
"tune-mjcf__f7LBBtA",
"vulnerable-secret__4QHS9P7",
"break-filter-js-from-html__5CiEyav",
"build-pov-ray__YudXho9",
"cancel-async-tasks__TVF6RgU",
"circuit-fibsqrt__kE4eCxm",
"cobol-modernization__rTmpYqu",
"code-from-image__2Tj8YNc",
"compile-compcert__qDCkPew",
"constraints-scheduling__WHzgnNz",
"count-dataset-tokens__UHGP2Cu",
"crack-7z-hash__X6Fauhj",
"custom-memory-heap-crash__KENnkKb",
"distribution-search__WiUhcYP",
"dna-assembly__PWQyTGX",
"extract-moves-from-video__YfuDEff",
"feal-differential-cryptanalysis__5zbRtQi",
"financial-document-processor__sQpX3CK",
"fix-code-vulnerability__pNAeWt2",
"fix-git__oGxauC8",
"fix-ocaml-gc__cWJwYEz",
"git-leak-recovery__vW57n9H",
"git-multibranch__faFxLM4",
"hf-model-inference__hkottYY",
"kv-store-grpc__sSkivHd",
"large-scale-text-editing__GW8QsPr",
"largest-eigenval__6LVVnda",
"llm-inference-batching-scheduler__vjaZc2R",
"log-summary-date-ranges__xzwcoWJ",
"mcmc-sampling-stan__ucRsMxF",
"merge-diff-arc-agi-task__EYj5QX8",
"modernize-scientific-stack__Kx9jY7C",
"multi-source-data-merger__zfgaY6y",
"nginx-request-logging__iS8geM9",
"overfull-hbox__4eTEPj4",
"path-tracing__DmeZBcd",
"path-tracing-reverse__8jeT9rM",
"portfolio-optimization__mrr64dj",
"prove-plus-comm__TckCCQp",
"pypi-server__BiX2DCT",
"pytorch-model-cli__44SLziD",
"qemu-startup__i4dTc6j",
"query-optimize__SDSSciQ",
"sanitize-git-repo__KGFJJvX",
"schemelike-metacircular-eval__7nnyw22",
"sqlite-db-truncate__bxHcvTv",
"sqlite-with-gcov__X3nQSqW",
"tune-mjcf__fs6waBt",
"vulnerable-secret__Kmfnthk",
"bn-fit-modify__6yp8tZm",
"break-filter-js-from-html__UiKX8j6",
"build-cython-ext__NHYPJy4",
"build-pmars__PN5NivV",
"build-pov-ray__NRmrcKP",
"circuit-fibsqrt__HkPbAjj",
"cobol-modernization__TC5BvPS",
"code-from-image__w9Puw6t",
"constraints-scheduling__qb4N6e5",
"crack-7z-hash__iRMVUxL",
"custom-memory-heap-crash__2ArxUrS",
"distribution-search__iSwnEGH",
"financial-document-processor__fPJo7DH",
"fix-code-vulnerability__QbJ3Esc",
"fix-git__5tNo5JR",
"git-leak-recovery__AhB8gyN",
"git-multibranch__WBP8jTY",
"hf-model-inference__e4WeDKM",
"kv-store-grpc__s8M5fmM",
"large-scale-text-editing__eeU7HVH",
"largest-eigenval__KrCt8yn",
"llm-inference-batching-scheduler__QdKvNSB",
"mcmc-sampling-stan__ndSTHhe",
"merge-diff-arc-agi-task__wjd5yW3",
"modernize-scientific-stack__RYwUQ9X",
"multi-source-data-merger__gdMvDT6",
"nginx-request-logging__8vkFQbU",
"openssl-selfsigned-cert__mi5o5PR",
"password-recovery__i2i8cyv",
"path-tracing-reverse__JjEP2sD",
"portfolio-optimization__3uxwRGz",
"prove-plus-comm__szgSP3a",
"pypi-server__mkP6iqd",
"pytorch-model-cli__6yTwpkX",
"qemu-startup__2xycEkr",
"regex-log__8z78MtE",
"reshard-c4-data__q3fJ6Hs",
"sanitize-git-repo__dS3pkpc",
"sqlite-db-truncate__2dJfiyQ",
"sqlite-with-gcov__JigvzPJ",
"tune-mjcf__ZpkY33M",
"vulnerable-secret__L8q4Sfk",
"winning-avg-corewars__P6WdiN7",
"bn-fit-modify__gaeEwSd",
"break-filter-js-from-html__LwJNGQu",
"build-pov-ray__fSyGr8K",
"circuit-fibsqrt__3RGvnoL",
"cobol-modernization__tTLg9nb",
"code-from-image__ncADcaZ",
"compile-compcert__wEyTfsc",
"constraints-scheduling__7L8mtPa",
"custom-memory-heap-crash__eGtysvy",
"distribution-search__JZia6AN",
"dna-assembly__K2JCQVC",
"extract-elf__PjvTndN",
"feal-differential-cryptanalysis__T3ft4X6",
"financial-document-processor__aQV9nq5",
"fix-code-vulnerability__AFiY9xV",
"fix-git__zPAZFKP",
"git-leak-recovery__thQPWgZ",
"git-multibranch__yTEcKrW",
"hf-model-inference__vb6mJWe",
"kv-store-grpc__VZTf37C",
"large-scale-text-editing__5H7MJJF",
"largest-eigenval__AFm22Gd",
"llm-inference-batching-scheduler__KZ43NJJ",
"mcmc-sampling-stan__yRiJve6",
"modernize-scientific-stack__7MTPC5T",
"multi-source-data-merger__9Arydun",
"path-tracing__pHYyTRi",
"path-tracing-reverse__Ceshgwy",
"portfolio-optimization__Tio2KFZ",
"prove-plus-comm__NkX4nzd",
"pypi-server__A8rmMww",
"pytorch-model-cli__63jGHVk",
"regex-log__zEAfKKn",
"reshard-c4-data__amErBqn",
"rstan-to-pystan__g8YdHGw",
"sqlite-db-truncate__okMRMd8",
"sqlite-with-gcov__rYF59Rz",
"vulnerable-secret__3Nw6FWs",
"winning-avg-corewars__yqo5fTi"
],
"0.0": [
"build-pov-ray__o9tHuSr",
"caffe-cifar-10__QkTYLxA",
"cancel-async-tasks__DtLG5CY",
"chess-best-move__N4KXGvJ",
"configure-git-webserver__hzkzZaN",
"count-dataset-tokens__G9npEFC",
"db-wal-recovery__hDRwmwy",
"dna-assembly__gXcCTUQ",
"dna-insert__iALfBYX",
"extract-elf__cqomSEn",
"extract-moves-from-video__PfZ54KV",
"feal-differential-cryptanalysis__pXFv6n5",
"feal-linear-cryptanalysis__CQxkkR4",
"filter-js-from-html__ULEH3SD",
"gcode-to-text__WpPngF3",
"gpt2-codegolf__sF76gxr",
"log-summary-date-ranges__vVqXALR",
"mailman__zXczAgR",
"make-doom-for-mips__7w9D7qw",
"make-mips-interpreter__ofg3GHo",
"mcmc-sampling-stan__gSvDhFg",
"model-extraction-relu-logits__SSh3MRD",
"mteb-leaderboard__yg3oPSH",
"mteb-retrieve__5a3CPVZ",
"password-recovery__r7hhYF2",
"path-tracing__tUpEqTM",
"path-tracing-reverse__oveAvR9",
"polyglot-c-py__ZvHpgt3",
"polyglot-rust-c__GsgiUyn",
"protein-assembly__SjmUTb3",
"pytorch-model-recovery__G6EYrLc",
"qemu-alpine-ssh__i9Zh3Yv",
"qemu-startup__ijvb5u8",
"query-optimize__d4QNaJa",
"raman-fitting__UKmVp2V",
"regex-chess__vvczYiW",
"reshard-c4-data__unsHJp7",
"rstan-to-pystan__7sybQEx",
"sam-cell-seg__8eoAf2k",
"schemelike-metacircular-eval__K3gbGSR",
"sparql-university__pLCpJLy",
"torch-tensor-parallelism__yX3qYUv",
"train-fasttext__FxFN8xe",
"video-processing__HxQVmQV",
"winning-avg-corewars__g3TCiAG",
"caffe-cifar-10__9DUPUKe",
"cancel-async-tasks__Rx3L6o9",
"chess-best-move__XhXJZyq",
"configure-git-webserver__vMhQKgP",
"db-wal-recovery__73uBbhV",
"dna-insert__RktkDgE",
"extract-elf__kv69hrw",
"extract-moves-from-video__KxVvbHD",
"feal-linear-cryptanalysis__KqwF6Zs",
"gcode-to-text__ZkFFW5V",
"gpt2-codegolf__KwErDnk",
"headless-terminal__LwYzWLp",
"llm-inference-batching-scheduler__Kh2kvpF",
"mailman__wQXN5Zi",
"make-mips-interpreter__uEakwZC",
"mcmc-sampling-stan__TGtbCiD",
"merge-diff-arc-agi-task__8VvDU2r",
"model-extraction-relu-logits__ZXBwowc",
"modernize-scientific-stack__WMsDDsS",
"mteb-leaderboard__zjBJ25d",
"mteb-retrieve__qxTPyH4",
"overfull-hbox__f2Y3xL3",
"password-recovery__hbESBCA",
"path-tracing__vpeZNjV",
"path-tracing-reverse__soQusX4",
"polyglot-c-py__cCNDeJC",
"polyglot-rust-c__yShzTmW",
"protein-assembly__gFSxHLC",
"pytorch-model-recovery__F9EHUJP",
"qemu-alpine-ssh__ZRVuZje",
"qemu-startup__iqWpvcy",
"query-optimize__boqPrNK",
"raman-fitting__gTJNvdM",
"regex-chess__qQEVKDs",
"reshard-c4-data__SFcfxgi",
"sam-cell-seg__5FCrMYs",
"schemelike-metacircular-eval__tXdR3h4",
"sparql-university__TFp7hcq",
"torch-tensor-parallelism__iJvvPDf",
"train-fasttext__XnuaSAV",
"video-processing__oxGE5gg",
"winning-avg-corewars__ui4uJwL",
"write-compressor__Dsijp56",
"adaptive-rejection-sampler__Tev6iTx",
"build-cython-ext__MCp2Lkj",
"build-pmars__Hq2spqk",
"caffe-cifar-10__3FYw7Qu",
"chess-best-move__nL9hcCi",
"configure-git-webserver__cUQNzhc",
"db-wal-recovery__YkE8vVN",
"dna-insert__RKfZDzd",
"extract-elf__BgpE7RV",
"feal-linear-cryptanalysis__qj4vLqi",
"filter-js-from-html__vHNhjup",
"gcode-to-text__GpjDJXr",
"gpt2-codegolf__BoDKBGQ",
"headless-terminal__dVuLpb3",
"install-windows-3.11__UvYss3X",
"mailman__fcB3ueA",
"make-doom-for-mips__auWiyNP",
"make-mips-interpreter__wdBD6bB",
"mteb-leaderboard__p2xae7N",
"mteb-retrieve__SHHbvos",
"openssl-selfsigned-cert__VmcsNFo",
"password-recovery__DPCZRPX",
"polyglot-c-py__cqDX2Yf",
"polyglot-rust-c__36RrgQe",
"protein-assembly__yLtq7NV",
"pytorch-model-recovery__qBgW34t",
"qemu-alpine-ssh__3FSdvee",
"raman-fitting__uUoJNej",
"regex-chess__wQBJonx",
"regex-log__7eUcrs3",
"reshard-c4-data__srnXbqK",
"rstan-to-pystan__fB9wnqk",
"sam-cell-seg__Shcb86Q",
"sparql-university__V2ySAbv",
"torch-pipeline-parallelism__V9RN2P4",
"torch-tensor-parallelism__Xqr5zgP",
"train-fasttext__tshxnGg",
"video-processing__S9s7sqR",
"write-compressor__3dSNvSo",
"adaptive-rejection-sampler__sYabhE2",
"caffe-cifar-10__6sRisnu",
"cancel-async-tasks__VJGi4b6",
"chess-best-move__omXBhHu",
"compile-compcert__eduaFmG",
"configure-git-webserver__H53KQ5N",
"count-dataset-tokens__mQBMWFV",
"db-wal-recovery__SDU9Dd7",
"dna-assembly__GdXKBfo",
"dna-insert__QvbxaZ5",
"extract-elf__x4Ea2yY",
"extract-moves-from-video__SkrjNSv",
"feal-linear-cryptanalysis__bmbugmc",
"filter-js-from-html__cSZVsYw",
"gcode-to-text__ibPKzxD",
"gpt2-codegolf__nxnNpas",
"headless-terminal__b4HyvFY",
"log-summary-date-ranges__DffBnau",
"mailman__Bk5pUXc",
"make-doom-for-mips__Hvi7REk",
"make-mips-interpreter__3mKyPbx",
"model-extraction-relu-logits__kCBMta6",
"mteb-leaderboard__DVL6yxi",
"mteb-retrieve__7drb5Nx",
"overfull-hbox__kmvFHaA",
"path-tracing__PsLD4FU",
"polyglot-c-py__JXHAWrr",
"polyglot-rust-c__Zc5bNqq",
"protein-assembly__rfx77zi",
"pytorch-model-recovery__9N97raS",
"qemu-alpine-ssh__Exn9o3N",
"query-optimize__sKu8WUh",
"raman-fitting__QBYy47n",
"regex-chess__RodjnJz",
"rstan-to-pystan__U84tZqo",
"sam-cell-seg__atJDiTH",
"schemelike-metacircular-eval__XDkFEjv",
"sparql-university__yhHWgCX",
"torch-pipeline-parallelism__tUUjrTq",
"torch-tensor-parallelism__rVHvaPJ",
"train-fasttext__HHSrRXk",
"video-processing__qpe8TLB",
"write-compressor__s6S5X4u",
"adaptive-rejection-sampler__GENisuN",
"build-cython-ext__MxC5wL4",
"build-pmars__pjZSyQ3",
"caffe-cifar-10__DQNT38d",
"cancel-async-tasks__xYwJ6tE",
"chess-best-move__EfGAxKh",
"configure-git-webserver__sW7yyro",
"count-dataset-tokens__zAR7saE",
"crack-7z-hash__rsGDsQy",
"db-wal-recovery__XKG58kf",
"dna-insert__W3zngyD",
"extract-moves-from-video__mLEjSYz",
"feal-linear-cryptanalysis__TY3MpvJ",
"filter-js-from-html__BQ83rMx",
"fix-ocaml-gc__ofktq8s",
"gcode-to-text__X35fq38",
"gpt2-codegolf__ezyv6XT",
"headless-terminal__ZcEQz58",
"log-summary-date-ranges__acvYidc",
"mailman__tuRvHjr",
"make-doom-for-mips__w9x4MGo",
"make-mips-interpreter__K6iG7eR",
"merge-diff-arc-agi-task__fVC8wBt",
"model-extraction-relu-logits__X3VcZbw",
"openssl-selfsigned-cert__o6eJGHV",
"overfull-hbox__zGtbHDz",
"password-recovery__BxD3QjX",
"polyglot-c-py__CsiSub3",
"polyglot-rust-c__wiU5jee",
"protein-assembly__xpbedUt",
"pytorch-model-recovery__9Vev45C",
"qemu-alpine-ssh__MRtVzG4",
"qemu-startup__8waAtd2",
"query-optimize__afk2QJR",
"raman-fitting__LMobiwf",
"regex-chess__oCv65Pp",
"sam-cell-seg__LZVhF63",
"sanitize-git-repo__fABqrep",
"sparql-university__6ct5PZJ",
"torch-pipeline-parallelism__DxBnQ4c",
"torch-tensor-parallelism__FADqNHL",
"train-fasttext__JUgtsJd",
"tune-mjcf__5nefMZJ",
"video-processing__yJYk4jw",
"write-compressor__ZhkGQnY"
]
}
},
"exception_stats": {
"AddTestsDirError": [
"adaptive-rejection-sampler__6mjcAQo"
],
"AgentTimeoutError": [
"caffe-cifar-10__QkTYLxA",
"feal-linear-cryptanalysis__CQxkkR4",
"gpt2-codegolf__sF76gxr",
"kv-store-grpc__rx2NU3N",
"mailman__zXczAgR",
"make-doom-for-mips__7w9D7qw",
"path-tracing__tUpEqTM",
"qemu-alpine-ssh__i9Zh3Yv",
"qemu-startup__ijvb5u8",
"schemelike-metacircular-eval__K3gbGSR",
"tune-mjcf__ww444sp",
"feal-linear-cryptanalysis__KqwF6Zs",
"gpt2-codegolf__KwErDnk",
"kv-store-grpc__WJYgYc9",
"mailman__wQXN5Zi",
"path-tracing__vpeZNjV",
"qemu-alpine-ssh__ZRVuZje",
"qemu-startup__iqWpvcy",
"torch-pipeline-parallelism__LNRoiJP",
"caffe-cifar-10__3FYw7Qu",
"db-wal-recovery__YkE8vVN",
"feal-linear-cryptanalysis__qj4vLqi",
"gpt2-codegolf__BoDKBGQ",
"install-windows-3.11__UvYss3X",
"kv-store-grpc__sSkivHd",
"mailman__fcB3ueA",
"make-doom-for-mips__auWiyNP",
"model-extraction-relu-logits__iYi27VM",
"path-tracing__DmeZBcd",
"qemu-alpine-ssh__3FSdvee",
"qemu-startup__i4dTc6j",
"rstan-to-pystan__fB9wnqk",
"write-compressor__3dSNvSo",
"feal-differential-cryptanalysis__L2dfxyG",
"feal-linear-cryptanalysis__bmbugmc",
"gpt2-codegolf__nxnNpas",
"kv-store-grpc__s8M5fmM",
"mailman__Bk5pUXc",
"make-doom-for-mips__Hvi7REk",
"make-mips-interpreter__3mKyPbx",
"path-tracing__PsLD4FU",
"rstan-to-pystan__U84tZqo",
"schemelike-metacircular-eval__XDkFEjv",
"write-compressor__s6S5X4u",
"caffe-cifar-10__DQNT38d",
"crack-7z-hash__rsGDsQy",
"feal-linear-cryptanalysis__TY3MpvJ",
"gpt2-codegolf__ezyv6XT",
"kv-store-grpc__VZTf37C",
"mailman__tuRvHjr",
"make-doom-for-mips__w9x4MGo",
"path-tracing__pHYyTRi",
"qemu-alpine-ssh__MRtVzG4",
"qemu-startup__8waAtd2",
"rstan-to-pystan__g8YdHGw",
"schemelike-metacircular-eval__Z24kUFk",
"write-compressor__ZhkGQnY"
],
"RewardFileNotFoundError": [
"install-windows-3.11__AyRDhpm",
"install-windows-3.11__ks9sg7m",
"winning-avg-corewars__aEh3KEL",
"fix-ocaml-gc__aZq7GAL",
"install-windows-3.11__8rwvEQG",
"install-windows-3.11__LQqGqEV"
],
"VerifierTimeoutError": [
"filter-js-from-html__dMJz4B6"
],
"RuntimeError": [
"log-summary-date-ranges__rStXsyk",
"make-doom-for-mips__heSEYto",
"bn-fit-modify__aSTjVAW",
"mteb-leaderboard__4XZTxxy",
"mteb-retrieve__s8XVFsc",
"nginx-request-logging__f73eZtK"
]
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment