1 hari lalu · 3b9e8951df
--- a/benchmarks/results/benchmark_20260310T170013.md
+++ b/benchmarks/results/benchmark_20260310T170013.md
@@ -0,0 +1,78 @@
 
				+# Benchmark Results - 20260310T170013
			
 
				+
			
 
				+## Model Selection (6-slot / 2-socket)
			
 
				+| Slot | Socket | Role | Model | Composite Score |
			
 
				+|------|--------|------|-------|----------------|
			
 
				+| 1 | Node 1 (port 11434) | General (locked) | llama3.2:3b | 0.814 |
			
 
				+| 2 | Node 1 (port 11434) | General (locked) | llama3.1:8b | 0.623 |
			
 
				+| 5 | Node 1 (port 11434) | General (rotate) | gemma3:12b-it-q4_K_M | 0.481 |
			
 
				+| 3 | Node 0 (port 11435) | Coding (locked) | deepseek-coder-v2:16b | 0.723 |
			
 
				+| 4 | Node 0 (port 11435) | Coding (locked) | qwen2.5-coder:7b | 0.655 |
			
 
				+| 6 | Node 0 (port 11435) | Coding (rotate) | qwen2.5-coder:14B | 0.57 |
			
 
				+
			
 
				+## Detailed Metrics
			
 
				+### codellama:34b
			
 
				+- **Category**: coding
			
 
				+- **Coding Quality**: 0.833
			
 
				+- **General Quality**: 0.586
			
 
				+- **Avg Tokens/sec**: 3.2
			
 
				+- **Latency (ms)**: 4235.4
			
 
				+- **Coding Composite**: 0.437
			
 
				+- **General Composite**: 0.326
			
 
				+### deepseek-coder-v2:16b
			
 
				+- **Category**: coding
			
 
				+- **Coding Quality**: 0.833
			
 
				+- **General Quality**: 0.885
			
 
				+- **Avg Tokens/sec**: 23.5
			
 
				+- **Latency (ms)**: 1568.5
			
 
				+- **Coding Composite**: 0.723
			
 
				+- **General Composite**: 0.746
			
 
				+### qwen2.5-coder:14B
			
 
				+- **Category**: coding
			
 
				+- **Coding Quality**: 0.85
			
 
				+- **General Quality**: 0.931
			
 
				+- **Avg Tokens/sec**: 6.6
			
 
				+- **Latency (ms)**: 2229.7
			
 
				+- **Coding Composite**: 0.57
			
 
				+- **General Composite**: 0.607
			
 
				+### llama3.1:8b
			
 
				+- **Category**: general
			
 
				+- **Coding Quality**: 0.823
			
 
				+- **General Quality**: 0.877
			
 
				+- **Avg Tokens/sec**: 11.8
			
 
				+- **Latency (ms)**: 2202.0
			
 
				+- **Coding Composite**: 0.599
			
 
				+- **General Composite**: 0.623
			
 
				+### qwen2.5-coder:7b
			
 
				+- **Category**: coding
			
 
				+- **Coding Quality**: 0.85
			
 
				+- **General Quality**: 0.91
			
 
				+- **Avg Tokens/sec**: 12.5
			
 
				+- **Latency (ms)**: 1431.0
			
 
				+- **Coding Composite**: 0.655
			
 
				+- **General Composite**: 0.682
			
 
				+### gemma3:12b-it-q4_K_M
			
 
				+- **Category**: general
			
 
				+- **Coding Quality**: 0.873
			
 
				+- **General Quality**: 0.966
			
 
				+- **Avg Tokens/sec**: 6.1
			
 
				+- **Latency (ms)**: 5941.9
			
 
				+- **Coding Composite**: 0.439
			
 
				+- **General Composite**: 0.481
			
 
				+### llama3.2:3b
			
 
				+- **Category**: general
			
 
				+- **Coding Quality**: 0.89
			
 
				+- **General Quality**: 0.954
			
 
				+- **Avg Tokens/sec**: 23.0
			
 
				+- **Latency (ms)**: 754.8
			
 
				+- **Coding Composite**: 0.786
			
 
				+- **General Composite**: 0.814
			
 
				+
			
 
				+## Scoring Formula
			
 
				+- Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
			
 
				+- Speed normalized against 40 tok/sec ceiling (hardware-observed max)
			
 
				+- Coding quality (per-prompt):
			
 
				+  code_gen: has_def×0.20 + has_return×0.20 + has_docstring×0.15 + has_type_hint×0.15 + has_code_block×0.10 + has_assert×0.08 + has_test_def×0.07 + has_import×0.05
			
 
				+  debug:    has_def×0.30 + has_return×0.30 + has_code_block×0.25 + has_assert×0.15
			
 
				+  refactor: has_def×0.25 + has_return×0.25 + has_code_block×0.20 + has_type_hint×0.15 + has_import×0.15
			
 
				+- Category: override dict → quality delta (coding_avg - general_avg >= 0.1) → name pattern (coder/codestral/codellama/starcoder) → general
			
--- a/benchmarks/results/benchmark_review_20260310.md
+++ b/benchmarks/results/benchmark_review_20260310.md
@@ -0,0 +1,433 @@
 
				+# Ticket Summary — Post-Change Benchmark Review: num_predict 300 → 500
			
 
				+
			
 
				+## Description
			
 
				+
			
 
				+After resolving the dual NUMA/CPUAffinity performance regression (2026-03-10), two
			
 
				+post-fix benchmark runs were executed to validate the effect of raising
			
 
				+`benchmark_num_predict` from 300 to 500. This document captures the four-run history,
			
 
				+before/after comparison, full Run 4 model results, and findings on system tuning state.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Acceptance Criteria
			
 
				+
			
 
				+- [x] Run 3 (num_predict=300) and Run 4 (num_predict=500) compared on common models
			
 
				+- [x] All tuning variables reviewed and declared optimal or requiring action
			
 
				+- [x] Any model-identity anomalies flagged for follow-up
			
 
				+- [x] MEMORY.md updated with current variable values
			
 
				+- [x] This ticket summary written to `benchmarks/results/`
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Work Implemented
			
 
				+
			
 
				+### Run History
			
 
				+
			
 
				+| Run | Timestamp | Condition | Result |
			
 
				+|-----|-----------|-----------|--------|
			
 
				+| 1 | 20260309T080551 | Broken NUMA (membind + CPUAffinity) | quality=0, tok/sec≈0.0–0.1 |
			
 
				+| 2 | 20260309T174604 | Broken NUMA (same bug) | quality=0, tok/sec=0.1 |
			
 
				+| 3 | 20260310T094843 | Post-NUMA-fix, num_predict=300, 4 models | quality=0.78–0.97, tok/sec=6.5–22.8 |
			
 
				+| 4 | 20260310T110632 | Post-NUMA-fix, num_predict=500, 9 models | quality=0.83–0.97, tok/sec=3.2–25.0 |
			
 
				+
			
 
				+### Before vs. After (Runs 3 → 4, common models)
			
 
				+
			
 
				+| Model | coding_quality @ 300 | coding_quality @ 500 | Delta |
			
 
				+|-------|---------------------|---------------------|-------|
			
 
				+| deepseek-coder-v2:latest | 0.783 | 0.833 | +0.050 |
			
 
				+| qwen2.5-coder:7b | 0.800 | 0.850 | +0.050 |
			
 
				+| llama3.2:3b | 0.850 | 0.890 | +0.040 |
			
 
				+| gemma3:12b-it-q4_K_M | 0.850 | 0.873 | +0.023 |
			
 
				+
			
 
				+### Full Run 4 Results (num_predict=500, 9 models)
			
 
				+
			
 
				+| Model | tok/sec | coding_q | general_q | latency_ms | coding_composite | general_composite | category |
			
 
				+|-------|---------|----------|-----------|------------|-----------------|------------------|----------|
			
 
				+| deepseek-coder-v2:16b | 24.5 | 0.833 | 0.885 | 1415.1 | 0.738 | 0.762 | coding |
			
 
				+| deepseek-coder-v2:latest | 25.0 | 0.833 | 0.885 | 1543.2 | 0.735 | 0.758 | coding |
			
 
				+| qwen2.5-coder:latest | 12.8 | 0.850 | 0.910 | 1228.2 | 0.667 | 0.694 | coding |
			
 
				+| qwen2.5-coder:7b | 12.7 | 0.850 | 0.910 | 1231.9 | 0.666 | 0.693 | coding |
			
 
				+| qwen2.5-coder:14B | 6.6 | 0.850 | 0.931 | 2195.9 | 0.572 | 0.609 | coding |
			
 
				+| codellama:34b | 3.2 | 0.833 | 0.586 | 4244.1 | 0.437 | 0.326 | coding |
			
 
				+| llama3.2:3b | 22.3 | 0.890 | 0.954 | 644.2 | 0.785 | 0.814 | general |
			
 
				+| llama3.1:8b | 11.8 | 0.823 | 0.877 | 2249.3 | 0.596 | 0.621 | general |
			
 
				+| gemma3:12b-it-q4_K_M | 6.4 | 0.873 | 0.966 | 6355.8 | 0.441 | 0.483 | general |
			
 
				+
			
 
				+### Current Slot Assignments (model_selection.json)
			
 
				+
			
 
				+| Slot | Socket | Role | Model | Composite |
			
 
				+|------|--------|------|-------|-----------|
			
 
				+| 1 | Node 1 (port 11434) | General (locked) | llama3.2:3b | 0.814 |
			
 
				+| 2 | Node 1 (port 11434) | General (locked) | llama3.1:8b | 0.621 |
			
 
				+| 3 | Node 0 (port 11435) | Coding (locked) | deepseek-coder-v2:16b | 0.738 |
			
 
				+| 4 | Node 0 (port 11435) | Coding (locked) | deepseek-coder-v2:latest | 0.735 |
			
 
				+| 5 | Node 1 (port 11434) | General (rotate) | gemma3:12b-it-q4_K_M | 0.483 |
			
 
				+| 6 | Node 0 (port 11435) | Coding (rotate) | qwen2.5-coder:latest | 0.667 |
			
 
				+
			
 
				+### Tuning Variable Status
			
 
				+
			
 
				+| Variable | Value | Status |
			
 
				+|----------|-------|--------|
			
 
				+| `benchmark_num_predict` | 500 | Optimal — rubric ceiling is now the binding constraint |
			
 
				+| `benchmark_large_timeout` | 480s | Adequate — 6–20x margin at current 3–25 tok/sec speeds |
			
 
				+| `benchmark_toks_norm_ceiling` | 40 | Correct — fastest model at 62.5% of ceiling |
			
 
				+| `benchmark_coding_threshold` | 0.10 | Correct — name-pattern fallback handling remaining cases |
			
 
				+| Scoring weights | 0.45/0.30/0.25 | Appropriate for interactive serving platform |
			
 
				+
			
 
				+### Findings
			
 
				+
			
 
				+**Finding 1 — num_predict=500 confirmed correct.** Every model improved on coding_quality
			
 
				+(+0.023 to +0.050). No timeouts observed. The rubric ceiling is now the binding constraint;
			
 
				+further increases (700+) would yield at most +0.02 additional improvement.
			
 
				+
			
 
				+**Finding 2 — Coding quality inversion narrowed (expected, not a bug).** Coding specialists
			
 
				+score lower on coding than general quality because general prompts don't require `assert`,
			
 
				+`test_def`, or `type_hint` (the hardest scoring markers). The gap halved from ~−0.110 to
			
 
				+~−0.052 vs. Run 3, confirming truncation was part of the cause. Name-pattern fallback
			
 
				+continues to correctly classify these models.
			
 
				+
			
 
				+**Finding 3 — deepseek-coder-v2:16b and :latest may be the same weights (ACTION REQUIRED).**
			
 
				+Both share identical quality scores (0.833/0.885) and nearly identical throughput (24.5 vs.
			
 
				+25.0 tok/sec). In Ollama, `:latest` typically resolves to the same weights as the default
			
 
				+variant. If confirmed identical, slots 3 and 4 hold duplicate models — zero benefit, wasted
			
 
				+VRAM. See Testing Needed for verification steps.
			
 
				+
			
 
				+**Finding 4 — qwen2.5-coder:latest and :7b are near-identical (informational).** Composites
			
 
				+of 0.667 vs. 0.666. Lower impact since only one is active in slot 6 at a time.
			
 
				+
			
 
				+**Finding 5 — llama3.2:3b outperforms coding specialists on coding composite (informational).**
			
 
				+coding_composite=0.785 beats all dedicated coding models. Mathematically correct: speed
			
 
				+(22.3 tok/sec) and latency (644ms) dominate. Correctly classified general because
			
 
				+general_composite (0.814) > coding_composite (0.785), delta < 0.10 threshold.
			
 
				+
			
 
				+**Finding 6 — codellama:34b correctly excluded.** 3.2 tok/sec, general_quality=0.586 falls
			
 
				+below min_quality_score=0.6. Scoring system worked as designed.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Testing Needed
			
 
				+
			
 
				+### Finding 3 — Verify deepseek-coder-v2:16b vs :latest digest
			
 
				+
			
 
				+Run on `ai_server`:
			
 
				+
			
 
				+```bash
			
 
				+ollama show deepseek-coder-v2:16b --modelfile | grep FROM
			
 
				+ollama show deepseek-coder-v2:latest --modelfile | grep FROM
			
 
				+```
			
 
				+
			
 
				+**If digests match (same weights):** update `model_selection.json` slot4_coding manually
			
 
				+(or remove one deepseek variant and re-run `03_benchmark.yml`) to redirect slot 4 to
			
 
				+`qwen2.5-coder:14B` (composite=0.572) or another diverse candidate for model diversity.
			
 
				+
			
 
				+**If digests differ (different weights):** no action — the pipeline is working as designed.
			
 
				+
			
 
				+### Regression check after any slot4 change
			
 
				+
			
 
				+If slot4 is redirected, run:
			
 
				+
			
 
				+```bash
			
 
				+ansible-playbook playbooks/04_models.yml -K -e @local.yml
			
 
				+```
			
 
				+
			
 
				+Confirm both warmup services start cleanly:
			
 
				+
			
 
				+```bash
			
 
				+systemctl status ollama-warmup.service ollama-warmup-node0.service
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Addendum — Run 5 Review (post deepseek:latest removal)
			
 
				+
			
 
				+## Run History (all five runs)
			
 
				+
			
 
				+| Run | Timestamp | Condition | Models | Result |
			
 
				+|-----|-----------|-----------|--------|--------|
			
 
				+| 1 | 20260309T080551 | Broken NUMA (membind + CPUAffinity) | — | quality=0, tok/sec≈0.0–0.1 |
			
 
				+| 2 | 20260309T174604 | Broken NUMA (same bug) | — | quality=0, tok/sec=0.1 |
			
 
				+| 3 | 20260310T094843 | Post-NUMA-fix, num_predict=300 | 4 | quality=0.78–0.97, tok/sec=6.5–22.8 |
			
 
				+| 4 | 20260310T110632 | num_predict=500, deepseek:latest present | 9 | quality=0.83–0.97, tok/sec=3.2–25.0 |
			
 
				+| 5 | 20260310T122818 | num_predict=500, deepseek:latest removed | 8 | quality=0.83–0.97, tok/sec=3.2–24.5 |
			
 
				+
			
 
				+## Run 4 → Run 5 Comparison (all common models)
			
 
				+
			
 
				+| Model | R4 tok/sec | R5 tok/sec | R4 coding_comp | R5 coding_comp | Delta |
			
 
				+|-------|-----------|-----------|----------------|----------------|-------|
			
 
				+| deepseek-coder-v2:16b | 24.5 | 24.1 | 0.738 | 0.727 | −0.011 (noise) |
			
 
				+| qwen2.5-coder:latest | 12.8 | 12.4 | 0.667 | 0.671 | +0.004 (noise) |
			
 
				+| qwen2.5-coder:7b | 12.7 | 12.6 | 0.666 | 0.674 | +0.008 (noise) |
			
 
				+| qwen2.5-coder:14B | 6.6 | 6.6 | 0.572 | 0.573 | +0.001 (noise) |
			
 
				+| llama3.2:3b | 22.3 | 24.5 | 0.785 | 0.806 | +0.021 (notable) |
			
 
				+| llama3.1:8b | 11.8 | 11.9 | 0.596 | 0.600 | +0.004 (noise) |
			
 
				+| gemma3:12b-it-q4_K_M | 6.4 | 6.2 | 0.441 | 0.439 | −0.002 (noise) |
			
 
				+| codellama:34b | 3.2 | 3.2 | 0.437 | 0.436 | −0.001 (noise) |
			
 
				+
			
 
				+Quality scores (coding_quality, general_quality) are **identical** across both runs —
			
 
				+confirming rubric scores are stable and deterministic at num_predict=500.
			
 
				+
			
 
				+## Run 5 Slot Assignments (model_selection.json)
			
 
				+
			
 
				+| Slot | Socket | Role | Model | Composite |
			
 
				+|------|--------|------|-------|-----------|
			
 
				+| 1 | Node 1 (port 11434) | General (locked) | llama3.2:3b | 0.835 |
			
 
				+| 2 | Node 1 (port 11434) | General (locked) | llama3.1:8b | 0.624 |
			
 
				+| 5 | Node 1 (port 11434) | General (rotate) | gemma3:12b-it-q4_K_M | 0.481 |
			
 
				+| 3 | Node 0 (port 11435) | Coding (locked) | deepseek-coder-v2:16b | 0.727 |
			
 
				+| 4 | Node 0 (port 11435) | Coding (locked) | qwen2.5-coder:7b | 0.674 |
			
 
				+| 6 | Node 0 (port 11435) | Coding (rotate) | qwen2.5-coder:latest | 0.671 |
			
 
				+
			
 
				+Note: slot4 is `qwen2.5-coder:7b` — the pipeline correctly ranked it #2 coding (0.674),
			
 
				+superseding the manual `qwen2.5-coder:14B` edit made earlier this session.
			
 
				+
			
 
				+## Findings
			
 
				+
			
 
				+**Finding 1 — System is stable; tuning parameters remain optimal (no action).** All quality
			
 
				+scores are identical between Run 4 and Run 5. Speed and latency deltas are within normal
			
 
				+run-to-run variance (±0.4 tok/sec, ±200ms TTFT). No tuning changes needed.
			
 
				+
			
 
				+| Variable | Value | Status |
			
 
				+|----------|-------|--------|
			
 
				+| `benchmark_num_predict` | 500 | Optimal — rubric ceiling is binding constraint |
			
 
				+| `benchmark_large_timeout` | 480s | Adequate — 6–20x margin at 3–25 tok/sec |
			
 
				+| `benchmark_toks_norm_ceiling` | 40 | Correct — fastest model at 61% of ceiling |
			
 
				+| `benchmark_coding_threshold` | 0.10 | Correct — name-pattern fallback working |
			
 
				+| Scoring weights | 0.45/0.30/0.25 | Appropriate for interactive serving |
			
 
				+
			
 
				+**Finding 2 — llama3.2:3b improved after deepseek:latest removal (informational).**
			
 
				+tok/sec: 22.3 → 24.5 (+2.2), general_composite: 0.814 → 0.835 (+0.021). Likely cause:
			
 
				+removing one large model reduced memory pressure / NUMA contention during warmup. The 3b
			
 
				+model benefits most as it runs fastest and competes most for memory bandwidth.
			
 
				+
			
 
				+**Finding 3 — qwen2.5-coder:7b and :latest confirmed duplicate weights (RESOLVED).**
			
 
				+Run 5 slot4=`:7b` (0.674) and slot6=`:latest` (0.671) showed identical quality scores
			
 
				+(coding=0.850, general=0.910) and nearly identical throughput (~12.4–12.8 tok/sec) across
			
 
				+both runs — same pattern as the deepseek duplicate. Verified on ai_server:
			
 
				+
			
 
				+```
			
 
				+qwen2.5-coder:7b    → sha256-60e05f2100071479f596b964f89f510f057ce397ea22f2833a0cfe029bfc2463
			
 
				+qwen2.5-coder:latest → sha256-60e05f2100071479f596b964f89f510f057ce397ea22f2833a0cfe029bfc2463
			
 
				+```
			
 
				+
			
 
				+Digests match. `qwen2.5-coder:latest` removed. Next step: re-run `03_benchmark.yml` (Run 6)
			
 
				+to promote `qwen2.5-coder:14B` to slot6_rotate, achieving genuine speed/quality diversity
			
 
				+on Node 0:
			
 
				+- slot3: deepseek-coder-v2:16b — fast+deep (24 tok/sec, 16B)
			
 
				+- slot4: qwen2.5-coder:7b — fast+light (12.6 tok/sec, 7B)
			
 
				+- slot6: qwen2.5-coder:14B — slower+richer quality (6.6 tok/sec, 14B)
			
 
				+
			
 
				+**Finding 4 — gemma3:12b latency_score=0 is persistent (informational, no action).**
			
 
				+TTFT consistently 6.1–6.4 seconds, above the 5000ms floor → latency_score=0 every run.
			
 
				+Hardware-limited (large quant loading time on Node 1), not a tuning issue. The model
			
 
				+correctly holds slot5_general_rotate on the strength of general_quality=0.966. The latency
			
 
				+penalty is working as intended.
			
 
				+
			
 
				+**Finding 5 — codellama:34b remains correctly excluded (informational, no action).**
			
 
				+composite=0.436, general_quality=0.586 — below both min_composite_score=0.55 and
			
 
				+min_quality_score=0.6 every run. Pipeline working as designed.
			
 
				+
			
 
				+## Next Action
			
 
				+
			
 
				+Run 6: re-benchmark after `qwen2.5-coder:latest` removal to promote `qwen2.5-coder:14B`
			
 
				+to slot6_rotate and achieve model diversity on Node 0.
			
 
				+
			
 
				+```bash
			
 
				+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml && \
			
 
				+ansible-playbook playbooks/04_models.yml -K -e @local.yml
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Addendum — Run 6 Review (post qwen2.5-coder:latest removal)
			
 
				+
			
 
				+## Run History (all six runs)
			
 
				+
			
 
				+| Run | Timestamp | Condition | Models | Result |
			
 
				+|-----|-----------|-----------|--------|--------|
			
 
				+| 1 | 20260309T080551 | Broken NUMA (membind + CPUAffinity) | — | quality=0, tok/sec≈0.0–0.1 |
			
 
				+| 2 | 20260309T174604 | Broken NUMA (same bug) | — | quality=0, tok/sec=0.1 |
			
 
				+| 3 | 20260310T094843 | Post-NUMA-fix, num_predict=300 | 4 | quality=0.78–0.97, tok/sec=6.5–22.8 |
			
 
				+| 4 | 20260310T110632 | num_predict=500, deepseek:latest present | 9 | quality=0.83–0.97, tok/sec=3.2–25.0 |
			
 
				+| 5 | 20260310T122818 | num_predict=500, deepseek:latest removed | 8 | quality=0.83–0.97, tok/sec=3.2–24.5 |
			
 
				+| 6 | 20260310T160815 | num_predict=500, qwen2.5-coder:latest removed | 8 | quality=0.83–0.97, tok/sec=3.2–24.2 |
			
 
				+
			
 
				+## Full Run 6 Results
			
 
				+
			
 
				+| Model | tok/sec | coding_q | general_q | latency_ms | coding_comp | general_comp | category |
			
 
				+|-------|---------|----------|-----------|------------|-------------|--------------|----------|
			
 
				+| deepseek-coder-v2:16b | 24.2 | 0.833 | 0.885 | 1383.8 | 0.737 | 0.760 | coding |
			
 
				+| deepseek-coder-v2:latest | 24.1 | 0.833 | 0.885 | 1411.4 | 0.735 | 0.759 | coding |
			
 
				+| qwen2.5-coder:7b | 12.6 | 0.850 | 0.910 | 1210.0 | 0.666 | 0.693 | coding |
			
 
				+| qwen2.5-coder:14B | 6.6 | 0.850 | 0.931 | 2181.0 | 0.573 | 0.609 | coding |
			
 
				+| codellama:34b | 3.2 | 0.833 | 0.586 | 4336.2 | 0.432 | 0.321 | coding |
			
 
				+| llama3.2:3b | 24.2 | 0.890 | 0.954 | 581.0 | 0.803 | 0.832 | general |
			
 
				+| llama3.1:8b | 11.8 | 0.823 | 0.877 | 2183.4 | 0.600 | 0.624 | general |
			
 
				+| gemma3:12b-it-q4_K_M | 6.2 | 0.873 | 0.966 | 5540.1 | 0.440 | 0.482 | general |
			
 
				+
			
 
				+## Run 5 → Run 6 Comparison (all common models)
			
 
				+
			
 
				+| Model | R5 tok/sec | R6 tok/sec | R5 coding_comp | R6 coding_comp | Delta |
			
 
				+|-------|-----------|-----------|----------------|----------------|-------|
			
 
				+| deepseek-coder-v2:16b | 24.1 | 24.2 | 0.727 | 0.737 | +0.010 (noise) |
			
 
				+| qwen2.5-coder:7b | 12.6 | 12.6 | 0.674 | 0.666 | −0.008 (noise) |
			
 
				+| qwen2.5-coder:14B | 6.6 | 6.6 | 0.573 | 0.573 | 0.000 |
			
 
				+| llama3.2:3b | 24.5 | 24.2 | 0.806 | 0.803 | −0.003 (noise) |
			
 
				+| llama3.1:8b | 11.9 | 11.8 | 0.600 | 0.600 | 0.000 |
			
 
				+| gemma3:12b-it-q4_K_M | 6.2 | 6.2 | 0.439 | 0.440 | +0.001 (noise) |
			
 
				+| codellama:34b | 3.2 | 3.2 | 0.436 | 0.432 | −0.004 (noise) |
			
 
				+
			
 
				+Quality scores are **identical** across all common models. All composites within run-to-run
			
 
				+noise (≤ ±0.010). Rubric confirmed deterministic across 6 runs.
			
 
				+
			
 
				+## Run 6 Slot Assignments (model_selection.json — current state)
			
 
				+
			
 
				+| Slot | Socket | Role | Model | Composite |
			
 
				+|------|--------|------|-------|-----------|
			
 
				+| 1 | Node 1 (port 11434) | General (locked) | llama3.2:3b | 0.832 |
			
 
				+| 2 | Node 1 (port 11434) | General (locked) | llama3.1:8b | 0.624 |
			
 
				+| 5 | Node 1 (port 11434) | General (rotate) | gemma3:12b-it-q4_K_M | 0.482 |
			
 
				+| 3 | Node 0 (port 11435) | Coding (locked) | deepseek-coder-v2:16b | 0.737 |
			
 
				+| 4 | Node 0 (port 11435) | Coding (locked) | deepseek-coder-v2:latest | 0.735 ← REGRESSION |
			
 
				+| 6 | Node 0 (port 11435) | Coding (rotate) | qwen2.5-coder:7b | 0.666 |
			
 
				+
			
 
				+## Findings
			
 
				+
			
 
				+**Finding 1 — deepseek-coder-v2:latest re-appeared in slot4 (REGRESSION, now fixed).**
			
 
				+Previously confirmed duplicate of `:16b` and removed after Run 4. Re-appeared in Run 6
			
 
				+because `group_vars/all.yml` contained two pull sources:
			
 
				+
			
 
				+1. `baseline_models` (line 121): `"deepseek-coder-v2"` — untagged, Ollama resolves to
			
 
				+   `:latest`, re-pulling the duplicate on every benchmark run.
			
 
				+2. `candidate_models`: explicit `"deepseek-coder-v2:latest"` entry — unconditionally pulls
			
 
				+   `:latest` as a testable model.
			
 
				+
			
 
				+**Fix applied to `inventory/group_vars/all.yml`:**
			
 
				+- `baseline_models`: changed `"deepseek-coder-v2"` → `"deepseek-coder-v2:16b"` (explicit tag)
			
 
				+- `candidate_models`: removed the `deepseek-coder-v2:latest` entry entirely
			
 
				+
			
 
				+**Also required on ai_server:** `ollama rm deepseek-coder-v2:latest`
			
 
				+
			
 
				+**Finding 2 — All scores and tuning variables remain stable (no action).** Every delta vs
			
 
				+Run 5 is within noise (≤ ±0.010 composite, quality scores identical). The rubric is
			
 
				+confirmed deterministic across 6 runs.
			
 
				+
			
 
				+| Variable | Value | Status |
			
 
				+|----------|-------|--------|
			
 
				+| `benchmark_num_predict` | 500 | Optimal |
			
 
				+| `benchmark_large_timeout` | 480s | Adequate |
			
 
				+| `benchmark_toks_norm_ceiling` | 40 | Correct |
			
 
				+| `benchmark_coding_threshold` | 0.10 | Correct |
			
 
				+
			
 
				+**Finding 3 — qwen2.5-coder:14B not yet in slot6 (consequence of Finding 1).** With
			
 
				+deepseek:latest occupying slot4, the coding rank yields:
			
 
				+  #1 deepseek:16b (0.737) → slot3, #2 deepseek:latest (0.735) → slot4,
			
 
				+  #3 qwen:7b (0.666) → slot6, #4 qwen:14B (0.573) → excluded.
			
 
				+After deepseek:latest is permanently removed, Run 7 expected layout:
			
 
				+  slot3=deepseek:16b, slot4=qwen:7b, slot6=qwen:14B.
			
 
				+
			
 
				+**Finding 4 — gemma3:12b TTFT=5540ms (informational, no action).** Persistently above
			
 
				+5000ms floor → latency_score=0 every run. Hardware-limited, not a tuning issue.
			
 
				+Correctly holds slot5_general_rotate on general_quality=0.966.
			
 
				+
			
 
				+**Finding 5 — codellama:34b correctly excluded again (informational, no action).**
			
 
				+composite=0.432, general_quality=0.586 — below both thresholds. Pipeline working as designed.
			
 
				+
			
 
				+## Next Action
			
 
				+
			
 
				+1. Remove duplicate from ai_server: `ollama rm deepseek-coder-v2:latest`
			
 
				+2. Run 7 (clean benchmark):
			
 
				+
			
 
				+```bash
			
 
				+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml && \
			
 
				+ansible-playbook playbooks/04_models.yml -K -e @local.yml
			
 
				+```
			
 
				+
			
 
				+Expected Run 7: slot4=`qwen2.5-coder:7b`, slot6=`qwen2.5-coder:14B`,
			
 
				+`deepseek-coder-v2:latest` absent from `all_metrics`.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Addendum — Run 7 Review (target Node 0 layout achieved, session closed)
			
 
				+
			
 
				+## Run History (all seven runs)
			
 
				+
			
 
				+| Run | Timestamp | Condition | Models | Result |
			
 
				+|-----|-----------|-----------|--------|--------|
			
 
				+| 1 | 20260309T080551 | Broken NUMA (membind + CPUAffinity) | — | quality=0, tok/sec≈0.0–0.1 |
			
 
				+| 2 | 20260309T174604 | Broken NUMA (same bug) | — | quality=0, tok/sec=0.1 |
			
 
				+| 3 | 20260310T094843 | Post-NUMA-fix, num_predict=300 | 4 | quality=0.78–0.97, tok/sec=6.5–22.8 |
			
 
				+| 4 | 20260310T110632 | num_predict=500, deepseek:latest present | 9 | quality=0.83–0.97, tok/sec=3.2–25.0 |
			
 
				+| 5 | 20260310T122818 | num_predict=500, deepseek:latest removed | 8 | quality=0.83–0.97, tok/sec=3.2–24.5 |
			
 
				+| 6 | 20260310T160815 | num_predict=500, qwen2.5-coder:latest removed | 8 | quality=0.83–0.97, tok/sec=3.2–24.2 |
			
 
				+| 7 | 20260310T170013 | group_vars fix applied, deepseek:latest absent | 7 | quality=0.83–0.97, tok/sec=3.2–23.5 |
			
 
				+
			
 
				+## Full Run 7 Results
			
 
				+
			
 
				+| Model | tok/sec | coding_q | general_q | latency_ms | coding_comp | general_comp | category |
			
 
				+|-------|---------|----------|-----------|------------|-------------|--------------|----------|
			
 
				+| deepseek-coder-v2:16b | 23.5 | 0.833 | 0.885 | 1568.5 | 0.723 | 0.746 | coding |
			
 
				+| qwen2.5-coder:7b | 12.5 | 0.850 | 0.910 | 1431.0 | 0.655 | 0.682 | coding |
			
 
				+| qwen2.5-coder:14B | 6.6 | 0.850 | 0.931 | 2229.7 | 0.570 | 0.607 | coding |
			
 
				+| codellama:34b | 3.2 | 0.833 | 0.586 | 4235.4 | 0.437 | 0.326 | coding |
			
 
				+| llama3.2:3b | 23.0 | 0.890 | 0.954 | 754.8 | 0.786 | 0.814 | general |
			
 
				+| llama3.1:8b | 11.8 | 0.823 | 0.877 | 2202.0 | 0.599 | 0.623 | general |
			
 
				+| gemma3:12b-it-q4_K_M | 6.1 | 0.873 | 0.966 | 5941.9 | 0.439 | 0.481 | general |
			
 
				+
			
 
				+`deepseek-coder-v2:latest` **absent** from `all_metrics` — group_vars fix verified working.
			
 
				+
			
 
				+## Run 6 → Run 7 Comparison (all common models)
			
 
				+
			
 
				+| Model | R6 tok/sec | R7 tok/sec | R6 coding_comp | R7 coding_comp | Delta |
			
 
				+|-------|-----------|-----------|----------------|----------------|-------|
			
 
				+| deepseek-coder-v2:16b | 24.2 | 23.5 | 0.737 | 0.723 | −0.014 (noise) |
			
 
				+| qwen2.5-coder:7b | 12.6 | 12.5 | 0.666 | 0.655 | −0.011 (noise) |
			
 
				+| qwen2.5-coder:14B | 6.6 | 6.6 | 0.573 | 0.570 | −0.003 (noise) |
			
 
				+| llama3.2:3b | 24.2 | 23.0 | 0.803 | 0.786 | −0.017 (noise) |
			
 
				+| llama3.1:8b | 11.8 | 11.8 | 0.600 | 0.599 | −0.001 (noise) |
			
 
				+| gemma3:12b-it-q4_K_M | 6.2 | 6.1 | 0.440 | 0.439 | −0.001 (noise) |
			
 
				+| codellama:34b | 3.2 | 3.2 | 0.432 | 0.437 | +0.005 (noise) |
			
 
				+
			
 
				+Quality scores are **identical** across all common models. All composites within run-to-run
			
 
				+noise (≤ ±0.017). Rubric confirmed deterministic across 7 runs.
			
 
				+
			
 
				+## Run 7 Slot Assignments (final, confirmed clean)
			
 
				+
			
 
				+| Slot | Socket | Role | Model | Composite |
			
 
				+|------|--------|------|-------|-----------|
			
 
				+| 1 | Node 1 (port 11434) | General (locked) | llama3.2:3b | 0.814 |
			
 
				+| 2 | Node 1 (port 11434) | General (locked) | llama3.1:8b | 0.623 |
			
 
				+| 5 | Node 1 (port 11434) | General (rotate) | gemma3:12b-it-q4_K_M | 0.481 |
			
 
				+| 3 | Node 0 (port 11435) | Coding (locked) | deepseek-coder-v2:16b | 0.723 |
			
 
				+| 4 | Node 0 (port 11435) | Coding (locked) | qwen2.5-coder:7b | 0.655 ✅ |
			
 
				+| 6 | Node 0 (port 11435) | Coding (rotate) | qwen2.5-coder:14B | 0.570 ✅ |
			
 
				+
			
 
				+## Findings
			
 
				+
			
 
				+**Finding 1 — Target Node 0 diversity layout achieved (RESOLVED).** Run 7 confirms the
			
 
				+intended three-tier Node 0 layout:
			
 
				+- slot3: deepseek-coder-v2:16b — deep specialist (23.5 tok/sec, 16B params)
			
 
				+- slot4: qwen2.5-coder:7b — fast+light (12.5 tok/sec, 7B params)
			
 
				+- slot6: qwen2.5-coder:14B — slower+richer (6.6 tok/sec, 14B params)
			
 
				+
			
 
				+All three are genuinely distinct models with different speed/quality tradeoffs.
			
 
				+
			
 
				+**Finding 2 — group_vars fix verified working (RESOLVED).** `deepseek-coder-v2:latest` is
			
 
				+absent from `all_metrics`. Explicit `:16b` tag in `baseline_models` prevents Ollama from
			
 
				+resolving to `:latest` on subsequent runs. The fix is durable — re-running `03_benchmark.yml`
			
 
				+will not re-introduce the duplicate.
			
 
				+
			
 
				+**Finding 3 — All scores and tuning variables stable (no action).** Every delta vs Run 6 is
			
 
				+within noise (≤ ±0.017 composite, quality scores identical). The pipeline is confirmed
			
 
				+deterministic and stable.
			
 
				+
			
 
				+| Variable | Value | Status |
			
 
				+|----------|-------|--------|
			
 
				+| `benchmark_num_predict` | 500 | Optimal |
			
 
				+| `benchmark_large_timeout` | 480s | Adequate |
			
 
				+| `benchmark_toks_norm_ceiling` | 40 | Correct |
			
 
				+| `benchmark_coding_threshold` | 0.10 | Correct |
			
 
				+
			
 
				+**Finding 4 — Benchmark pipeline declared stable. Session closed.** Seven runs over two
			
 
				+days confirmed: NUMA fix correct, scoring rubric deterministic, duplicate-model detection
			
 
				+pattern documented, group_vars idempotent. No further benchmark runs or tuning changes are
			
 
				+needed unless new models are added to `candidate_models`.
			
--- a/benchmarks/results/model_selection.json
+++ b/benchmarks/results/model_selection.json
@@ -1,92 +1,196 @@
 
				 {
			
 
				     "all_metrics": {
			
 
				-        "llama3.2:3b": {
			
 
				-            "avg_tok_per_sec": 0.1,
			
 
				+        "codellama:34b": {
			
 
				+            "avg_tok_per_sec": 3.2,
			
 
				+            "category": "coding",
			
 
				+            "coding_composite": 0.437,
			
 
				+            "coding_quality": 0.833,
			
 
				+            "general_composite": 0.326,
			
 
				+            "general_quality": 0.586,
			
 
				+            "latency_ms": 4235.4,
			
 
				+            "latency_score": 0.153,
			
 
				+            "toks_norm": 0.08
			
 
				+        },
			
 
				+        "deepseek-coder-v2:16b": {
			
 
				+            "avg_tok_per_sec": 23.5,
			
 
				+            "category": "coding",
			
 
				+            "coding_composite": 0.723,
			
 
				+            "coding_quality": 0.833,
			
 
				+            "general_composite": 0.746,
			
 
				+            "general_quality": 0.885,
			
 
				+            "latency_ms": 1568.5,
			
 
				+            "latency_score": 0.686,
			
 
				+            "toks_norm": 0.586
			
 
				+        },
			
 
				+        "gemma3:12b-it-q4_K_M": {
			
 
				+            "avg_tok_per_sec": 6.1,
			
 
				             "category": "general",
			
 
				-            "coding_composite": 0.413,
			
 
				-            "coding_quality": 0.917,
			
 
				-            "general_composite": 0.45,
			
 
				-            "general_quality": 1.0,
			
 
				-            "latency_ms": 9999,
			
 
				+            "coding_composite": 0.439,
			
 
				+            "coding_quality": 0.873,
			
 
				+            "general_composite": 0.481,
			
 
				+            "general_quality": 0.966,
			
 
				+            "latency_ms": 5941.9,
			
 
				             "latency_score": 0,
			
 
				-            "toks_norm": 0.002
			
 
				+            "toks_norm": 0.153
			
 
				         },
			
 
				-        "mistral-nemo:latest": {
			
 
				-            "avg_tok_per_sec": 0.1,
			
 
				+        "llama3.1:8b": {
			
 
				+            "avg_tok_per_sec": 11.8,
			
 
				             "category": "general",
			
 
				-            "coding_composite": 0.383,
			
 
				+            "coding_composite": 0.599,
			
 
				+            "coding_quality": 0.823,
			
 
				+            "general_composite": 0.623,
			
 
				+            "general_quality": 0.877,
			
 
				+            "latency_ms": 2202.0,
			
 
				+            "latency_score": 0.56,
			
 
				+            "toks_norm": 0.294
			
 
				+        },
			
 
				+        "llama3.2:3b": {
			
 
				+            "avg_tok_per_sec": 23.0,
			
 
				+            "category": "general",
			
 
				+            "coding_composite": 0.786,
			
 
				+            "coding_quality": 0.89,
			
 
				+            "general_composite": 0.814,
			
 
				+            "general_quality": 0.954,
			
 
				+            "latency_ms": 754.8,
			
 
				+            "latency_score": 0.849,
			
 
				+            "toks_norm": 0.576
			
 
				+        },
			
 
				+        "qwen2.5-coder:14B": {
			
 
				+            "avg_tok_per_sec": 6.6,
			
 
				+            "category": "coding",
			
 
				+            "coding_composite": 0.57,
			
 
				             "coding_quality": 0.85,
			
 
				-            "general_composite": 0.45,
			
 
				-            "general_quality": 1.0,
			
 
				-            "latency_ms": 9999,
			
 
				-            "latency_score": 0,
			
 
				-            "toks_norm": 0.001
			
 
				+            "general_composite": 0.607,
			
 
				+            "general_quality": 0.931,
			
 
				+            "latency_ms": 2229.7,
			
 
				+            "latency_score": 0.554,
			
 
				+            "toks_norm": 0.164
			
 
				         },
			
 
				         "qwen2.5-coder:7b": {
			
 
				-            "avg_tok_per_sec": 0.1,
			
 
				+            "avg_tok_per_sec": 12.5,
			
 
				             "category": "coding",
			
 
				-            "coding_composite": 0.371,
			
 
				-            "coding_quality": 0.823,
			
 
				-            "general_composite": 0.383,
			
 
				-            "general_quality": 0.85,
			
 
				-            "latency_ms": 9999,
			
 
				-            "latency_score": 0,
			
 
				-            "toks_norm": 0.001
			
 
				+            "coding_composite": 0.655,
			
 
				+            "coding_quality": 0.85,
			
 
				+            "general_composite": 0.682,
			
 
				+            "general_quality": 0.91,
			
 
				+            "latency_ms": 1431.0,
			
 
				+            "latency_score": 0.714,
			
 
				+            "toks_norm": 0.312
			
 
				         }
			
 
				     },
			
 
				     "coding_ranking": [
			
 
				         {
			
 
				-            "composite": 0.371,
			
 
				+            "composite": 0.723,
			
 
				             "metrics": {
			
 
				-                "avg_tok_per_sec": 0.1,
			
 
				+                "avg_tok_per_sec": 23.5,
			
 
				                 "category": "coding",
			
 
				-                "coding_composite": 0.371,
			
 
				-                "coding_quality": 0.823,
			
 
				-                "general_composite": 0.383,
			
 
				-                "general_quality": 0.85,
			
 
				-                "latency_ms": 9999,
			
 
				-                "latency_score": 0,
			
 
				-                "toks_norm": 0.001
			
 
				+                "coding_composite": 0.723,
			
 
				+                "coding_quality": 0.833,
			
 
				+                "general_composite": 0.746,
			
 
				+                "general_quality": 0.885,
			
 
				+                "latency_ms": 1568.5,
			
 
				+                "latency_score": 0.686,
			
 
				+                "toks_norm": 0.586
			
 
				+            },
			
 
				+            "name": "deepseek-coder-v2:16b"
			
 
				+        },
			
 
				+        {
			
 
				+            "composite": 0.655,
			
 
				+            "metrics": {
			
 
				+                "avg_tok_per_sec": 12.5,
			
 
				+                "category": "coding",
			
 
				+                "coding_composite": 0.655,
			
 
				+                "coding_quality": 0.85,
			
 
				+                "general_composite": 0.682,
			
 
				+                "general_quality": 0.91,
			
 
				+                "latency_ms": 1431.0,
			
 
				+                "latency_score": 0.714,
			
 
				+                "toks_norm": 0.312
			
 
				             },
			
 
				             "name": "qwen2.5-coder:7b"
			
 
				+        },
			
 
				+        {
			
 
				+            "composite": 0.57,
			
 
				+            "metrics": {
			
 
				+                "avg_tok_per_sec": 6.6,
			
 
				+                "category": "coding",
			
 
				+                "coding_composite": 0.57,
			
 
				+                "coding_quality": 0.85,
			
 
				+                "general_composite": 0.607,
			
 
				+                "general_quality": 0.931,
			
 
				+                "latency_ms": 2229.7,
			
 
				+                "latency_score": 0.554,
			
 
				+                "toks_norm": 0.164
			
 
				+            },
			
 
				+            "name": "qwen2.5-coder:14B"
			
 
				+        },
			
 
				+        {
			
 
				+            "composite": 0.437,
			
 
				+            "metrics": {
			
 
				+                "avg_tok_per_sec": 3.2,
			
 
				+                "category": "coding",
			
 
				+                "coding_composite": 0.437,
			
 
				+                "coding_quality": 0.833,
			
 
				+                "general_composite": 0.326,
			
 
				+                "general_quality": 0.586,
			
 
				+                "latency_ms": 4235.4,
			
 
				+                "latency_score": 0.153,
			
 
				+                "toks_norm": 0.08
			
 
				+            },
			
 
				+            "name": "codellama:34b"
			
 
				         }
			
 
				     ],
			
 
				     "general_ranking": [
			
 
				         {
			
 
				-            "composite": 0.45,
			
 
				+            "composite": 0.814,
			
 
				             "metrics": {
			
 
				-                "avg_tok_per_sec": 0.1,
			
 
				+                "avg_tok_per_sec": 23.0,
			
 
				                 "category": "general",
			
 
				-                "coding_composite": 0.413,
			
 
				-                "coding_quality": 0.917,
			
 
				-                "general_composite": 0.45,
			
 
				-                "general_quality": 1.0,
			
 
				-                "latency_ms": 9999,
			
 
				-                "latency_score": 0,
			
 
				-                "toks_norm": 0.002
			
 
				+                "coding_composite": 0.786,
			
 
				+                "coding_quality": 0.89,
			
 
				+                "general_composite": 0.814,
			
 
				+                "general_quality": 0.954,
			
 
				+                "latency_ms": 754.8,
			
 
				+                "latency_score": 0.849,
			
 
				+                "toks_norm": 0.576
			
 
				             },
			
 
				             "name": "llama3.2:3b"
			
 
				         },
			
 
				         {
			
 
				-            "composite": 0.45,
			
 
				+            "composite": 0.623,
			
 
				             "metrics": {
			
 
				-                "avg_tok_per_sec": 0.1,
			
 
				+                "avg_tok_per_sec": 11.8,
			
 
				                 "category": "general",
			
 
				-                "coding_composite": 0.383,
			
 
				-                "coding_quality": 0.85,
			
 
				-                "general_composite": 0.45,
			
 
				-                "general_quality": 1.0,
			
 
				-                "latency_ms": 9999,
			
 
				+                "coding_composite": 0.599,
			
 
				+                "coding_quality": 0.823,
			
 
				+                "general_composite": 0.623,
			
 
				+                "general_quality": 0.877,
			
 
				+                "latency_ms": 2202.0,
			
 
				+                "latency_score": 0.56,
			
 
				+                "toks_norm": 0.294
			
 
				+            },
			
 
				+            "name": "llama3.1:8b"
			
 
				+        },
			
 
				+        {
			
 
				+            "composite": 0.481,
			
 
				+            "metrics": {
			
 
				+                "avg_tok_per_sec": 6.1,
			
 
				+                "category": "general",
			
 
				+                "coding_composite": 0.439,
			
 
				+                "coding_quality": 0.873,
			
 
				+                "general_composite": 0.481,
			
 
				+                "general_quality": 0.966,
			
 
				+                "latency_ms": 5941.9,
			
 
				                 "latency_score": 0,
			
 
				-                "toks_norm": 0.001
			
 
				+                "toks_norm": 0.153
			
 
				             },
			
 
				-            "name": "mistral-nemo:latest"
			
 
				+            "name": "gemma3:12b-it-q4_K_M"
			
 
				         }
			
 
				     ],
			
 
				     "slot1_general": "llama3.2:3b",
			
 
				-    "slot2_general": "mistral-nemo:latest",
			
 
				-    "slot3_coding": "qwen2.5-coder:7b",
			
 
				+    "slot2_general": "llama3.1:8b",
			
 
				+    "slot3_coding": "deepseek-coder-v2:16b",
			
 
				     "slot4_coding": "qwen2.5-coder:7b",
			
 
				-    "slot5_general_rotate": "none",
			
 
				-    "slot6_coding_rotate": "none"
			
 
				+    "slot5_general_rotate": "gemma3:12b-it-q4_K_M",
			
 
				+    "slot6_coding_rotate": "qwen2.5-coder:14B"
			
 
				 }
			
--- a/inventory/group_vars/all.yml
+++ b/inventory/group_vars/all.yml
@@ -107,7 +107,7 @@ benchmark_load_timeout: 180      # seconds — warm-up "Hi" prompt per model bef
 
				 benchmark_small_timeout: 90      # seconds per request, small models (<10 GB)
			
 
				 benchmark_medium_timeout: 240    # seconds per request, medium models (10–15 GB)
			
 
				 benchmark_large_timeout: 480     # seconds per request, large models (>15 GB)
			
 
				-benchmark_num_predict: 300       # cap output tokens per prompt; bounds worst-case at 0.1 tok/sec to ~3000s, at 10 tok/sec to ~30s
			
 
				+benchmark_num_predict: 500       # cap output tokens; allows full coding responses (def+return+docstring+assert); worst-case: 6.5 tok/s→77s, 22 tok/s→23s
			
 
				 
			
 
				 # Explicit category overrides applied before heuristics. Keys are model names as
			
 
				 # returned by `ollama list`. Valid values: 'coding' or 'general'.
			
@@ -118,7 +118,7 @@ model_category_overrides: {}
 
				 # These are the minimum set needed to populate all 4 slots with meaningful candidates.
			
 
				 baseline_models:
			
 
				   - "llama3.2:3b"
			
 
				-  - "deepseek-coder-v2"
			
 
				+  - "deepseek-coder-v2:16b"
			
 
				   - "qwen2.5-coder:7b"
			
 
				   - "llama3.1:8b"
			
 
				 
			
@@ -129,11 +129,6 @@ candidate_models:
 
				     expected_tokens_sec: 4.5
			
 
				     reason: "Larger qwen2.5-coder for higher quality"
			
 
				     category: coding
			
 
				-  - name: "deepseek-coder-v2:latest"
			
 
				-    size_gb: 9
			
 
				-    expected_tokens_sec: 8.0
			
 
				-    reason: "DeepSeek Coder V2 full model"
			
 
				-    category: coding
			
 
				   - name: "codegemma:7b-instruct-q5_K_M"
			
 
				     size_gb: 5.5
			
 
				     expected_tokens_sec: 12.0