
Add three-pass benchmark with size-aware tier routing

Split benchmark into small (<10 GB), medium (10-15 GB), and large (>15 GB)
tiers with per-tier timeouts (300s / 900s / 1200s). Add benchmark_size_overhead_factor
(default 1.2) to correct for ollama list reporting compressed disk size rather
than runtime RAM — fixes misclassification of models like deepseek-coder-v2
that show ~9 GB on disk but use 11 GB at runtime.

Update benchmarks/README.md with Three-Pass Execution section and full
configuration table. Update README.md with tier override examples.
Shaun Arman, 5 days ago
Commit
55d412f85d

+ 22 - 15
CLAUDE.md

@@ -6,22 +6,26 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 ```bash
 # Full deployment
-ansible-playbook playbooks/site.yml
+ansible-playbook playbooks/site.yml -K -e @local.yml
 
 # Run a single playbook
-ansible-playbook playbooks/03_benchmark.yml
+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml
 
 # Run with tags (each playbook defines granular tags)
-ansible-playbook playbooks/site.yml --tags ollama,docker
+ansible-playbook playbooks/site.yml --tags ollama,docker -K -e @local.yml
 
 # Benchmark and update warm-up slots in one shot
-ansible-playbook playbooks/03_benchmark.yml && ansible-playbook playbooks/04_models.yml
+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml && \
+ansible-playbook playbooks/04_models.yml -K -e @local.yml
 
-# Override slot 4 with a specific model
-ansible-playbook playbooks/04_models.yml -e "slot4_model=qwen2.5-coder:7b"
+# Rotate general slot (Node 1, port 11434)
+ansible-playbook playbooks/04_models.yml -K -e @local.yml -e "slot5_model=mistral:latest"
+
+# Rotate coding slot (Node 0, port 11435)
+ansible-playbook playbooks/04_models.yml -K -e @local.yml -e "slot6_model=llama3.1:70b"
 
 # Run against a subset of hosts
-ansible-playbook playbooks/09_nginx.yml --limit nginx_proxy
+ansible-playbook playbooks/09_nginx.yml --limit nginx_proxy -K -e @local.yml
 
 # Lint playbooks
 ansible-lint playbooks/
@@ -30,7 +34,7 @@ ansible-lint playbooks/
 ansible-galaxy collection install -r requirements.yml
 
 # Check mode (dry run)
-ansible-playbook playbooks/site.yml --check --diff
+ansible-playbook playbooks/site.yml --check --diff -K -e @local.yml
 ```
 
 ## Required Local Configuration
@@ -87,17 +91,20 @@ All credentials live exclusively in Vault under `secret/data/{{ vault_project_sl
 
 **Composite score formula:**
 ```
-composite = (quality × 0.45) + (tokens_per_sec / 30, capped at 1.0) × 0.30 + (1 - ttft_ms/5000, floored at 0) × 0.25
+composite = (quality × 0.45) + (tokens_per_sec / ceiling, capped at 1.0) × 0.30 + (1 - ttft_ms/5000, floored at 0) × 0.25
 ```
+`benchmark_toks_norm_ceiling` defaults to 40 (dual-socket target).
+
+**Slot classification:** if `coding_composite - general_composite >= 0.10` (configurable via `benchmark_coding_threshold`), model goes to a coding slot; otherwise general.
 
-**Slot classification:** if `coding_composite - general_composite >= 0.15` (configurable via `benchmark_coding_threshold`), model goes to a coding slot; otherwise general.
+**6 warm-up slots across two NUMA instances:**
+- Node 1 (port 11434): slots 1–2 locked general + slot 5 rotatable general
+- Node 0 (port 11435): slots 3–4 locked coding + slot 6 rotatable coding
+- Slots 5/6 rotatable via `-e slot5_model=<name>` / `-e slot6_model=<name>` without re-benchmarking
 
-**4 warm-up slots always hot in RAM:**
-- Slots 1–2: top general-purpose models by composite score
-- Slots 3–4: top coding models by composite score
-- Slot 4 is user-rotatable via `-e slot4_model=<name>` without re-benchmarking
+`04_models.yml` creates Modelfiles (`coder-128k`, `coder-32k`, `coder-rotate`, `llama-family`, `gemma-family`) and two warmup services: `ollama-warmup.service` (Node 1) and `ollama-warmup-node0.service` (Node 0).
 
-`04_models.yml` creates named Ollama Modelfiles (`coder-128k`, `coder-32k`, `llama-family`, `gemma-family`) and a `ollama-warmup.service` systemd one-shot that pre-loads all 4 slots after Ollama starts.
+**Benchmark alias filter:** `benchmark_skip_aliases` in `group_vars/all.yml` lists the Modelfile aliases — the benchmark playbook excludes these from the test loop to prevent 32k-token KV-cache allocations from stalling the run.
 
 ### Key Variables
 

+ 45 - 15
README.md

@@ -23,7 +23,7 @@ bot access -- all driven by a single `ansible-playbook deploy_ai.yml` command.
           ┌───────────────▼┐    ┌────▼──────────────────────┐
           │ coredns_host   │    │ ai_server                 │
           │ 192.168.1.29   │    │ 192.168.1.100             │
-          │                │    │                            
+          │                │    │                           │
           │ - CoreDNS      │    │ - Ollama (LLM inference)  │
           └────────────────┘    │ - Open WebUI              │
                                 │ - Keycloak (SSO/OIDC)     │
@@ -292,11 +292,13 @@ The benchmark playbook automatically selects the best coding models and keeps th
 Check the current slot assignments in `benchmarks/results/model_selection.json`:
 
 ```bash
-cat benchmarks/results/model_selection.json | python3 -m json.tool | grep slot
+python3 -m json.tool benchmarks/results/model_selection.json | grep slot
 ```
 
-Slots 3 and 4 are always coding-classified models. Use the `slot3_coding` model for
-primary work and `slot4_coding` for a lighter/faster alternative.
+Slots 3, 4, and 6 are coding-classified models, all running on the Node 0 instance at port 11435.
+Use `slot3_coding` (the highest-scoring coding model) as your primary model. Connect coding
+tools directly to `https://ollama-api.<domain>` (proxied from port 11434, Node 1) or to
+Open WebUI which load-balances across both instances.
 
 ## Day-2 Operations
 
@@ -343,6 +345,13 @@ ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml \
   -e "benchmark_models=qwen2.5-coder:14b-instruct-q4_K_M,codestral:22b-v0.1-q4_K_M"
 ```
 
+**Override tier boundaries or timeouts (see [benchmarks/README.md](benchmarks/README.md#three-pass-execution)):**
+
+```bash
+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml \
+  -e "benchmark_small_max_gb=8 benchmark_medium_max_gb=20"
+```
+
 **Pull recommended models if scores are below threshold:**
 
 ```bash
@@ -355,10 +364,20 @@ ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml -e "pull_if_better=
 ansible-playbook playbooks/04_models.yml -K -e @local.yml
 ```
 
-**Rotate slot 4 to a specific model:**
+**Rotate slot 5 (general) or slot 6 (coding) to a specific model:**
 
 ```bash
-ansible-playbook playbooks/04_models.yml -K -e @local.yml -e "slot4_model=deepseek-r1:14b"
+# Swap general rotate slot
+ansible-playbook playbooks/04_models.yml -K -e @local.yml -e "slot5_model=mistral:latest"
+
+# Swap coding rotate slot
+ansible-playbook playbooks/04_models.yml -K -e @local.yml -e "slot6_model=llama3.1:70b"
+
+# Both at once
+ansible-playbook playbooks/04_models.yml -K -e @local.yml -e "slot5_model=mistral:latest" -e "slot6_model=command-r:35b"
+
+# Reset both rotate slots back to benchmark recommendations
+ansible-playbook playbooks/04_models.yml -K -e @local.yml
 ```
 
 **Redeploy Keycloak only:**
@@ -393,16 +412,25 @@ ansible-playbook playbooks/11_vault_oidc.yml -K -e @local.yml
 
 ## Model Slot System
 
-Four models are kept warm in RAM at all times (`OLLAMA_MAX_LOADED_MODELS=4`, `OLLAMA_KEEP_ALIVE=-1`). Slots are filled by the benchmark playbook — no model names are hardcoded.
+Six models are kept warm across two Ollama instances (`OLLAMA_MAX_LOADED_MODELS=3` each, `OLLAMA_KEEP_ALIVE=-1`). Slots are filled automatically by the benchmark playbook — no model names are hardcoded.
+
+```
+NUMA Node 1 — ollama.service     — port 11434  (general models)
+NUMA Node 0 — ollama-node0.service — port 11435 (coding models)
+```
+
+| Slot | Instance | Port  | Role                       | Selection                   | Rotation                      |
+|------|----------|-------|----------------------------|-----------------------------|-------------------------------|
+| 1    | Node 1   | 11434 | General primary (locked)   | Top general composite score | Replaced only by re-benchmark |
+| 2    | Node 1   | 11434 | General secondary (locked) | 2nd general composite score | Replaced only by re-benchmark |
+| 5    | Node 1   | 11434 | General rotate             | 3rd general composite score | `-e slot5_model=<name>`       |
+| 3    | Node 0   | 11435 | Coding primary (locked)    | Top coding composite score  | Replaced only by re-benchmark |
+| 4    | Node 0   | 11435 | Coding secondary (locked)  | 2nd coding composite score  | Replaced only by re-benchmark |
+| 6    | Node 0   | 11435 | Coding rotate              | 3rd coding composite score  | `-e slot6_model=<name>`       |
 
-| Slot | Role                      | Selection                     | Rotation                              |
-|------|---------------------------|-------------------------------|---------------------------------------|
-| 1    | General-purpose primary   | Top general composite score   | Replaced if score < threshold         |
-| 2    | General-purpose secondary | 2nd general composite score   | Replaced if score < threshold         |
-| 3    | Coding primary            | Top coding composite score    | Locked; replaced only by re-benchmark |
-| 4    | Coding secondary          | 2nd coding composite score    | Rotatable: `-e slot4_model=<name>`    |
+**Classification rule:** a model is classified `coding` if its coding composite score exceeds its general composite score by ≥ 0.10; otherwise `general`.
 
-**Classification rule:** a model is classified `coding` if its coding composite score exceeds its general composite score by ≥ 0.15; otherwise `general`.
+**Modelfile aliases** (`coder-128k`, `coder-32k`, `coder-rotate`, `llama-family`, `gemma-family`) are excluded from benchmarking to prevent KV-cache allocation stalls.
 
 ## Verification Steps
 
@@ -416,8 +444,10 @@ After a full `deploy_ai.yml` run, verify the deployment (substitute your actual
 6. **Qdrant health** -- `curl -s http://<ai_server_ip>:6333/healthz` returns OK
 7. **CoreDNS resolution** -- `dig @<coredns_host_ip> vault.example.com` returns `<nginx_proxy_ip>`
 8. **NGINX configs** -- `ssh <nginx_proxy_ip> 'sudo nginx -t'` passes
-9. **OpenClaw** -- send a message to the Telegram bot, confirm response
+9. **OpenClaw** -- send a message to the Telegram bot, confirm a response from the `slot1_general` model
 10. **Benchmark report** -- check `benchmarks/results/benchmark_<timestamp>.md` for latest results
+11. **Node 0 Ollama** -- `curl -s -H "Authorization: Bearer <key>" http://<ai_server_ip>:11435/api/tags` returns model list
+12. **Both warmup services** -- `systemctl status ollama-warmup ollama-warmup-node0` both show `active (exited)`
 
 ## Role Reference
 

+ 117 - 66
benchmarks/README.md

@@ -3,133 +3,184 @@
 ## Overview
 
 Dynamic benchmark system for all installed Ollama models. Runs a suite of coding and
-general-purpose tests against every model currently available on the Ollama server,
-scores each model on a composite metric, and assigns models to the 4-slot system
-based on results.
+general-purpose tests against every model on the Ollama server, scores each model on a
+composite metric, and assigns models to the 6-slot dual-socket system based on results.
+
+Modelfile aliases (`coder-128k`, `coder-32k`, `coder-rotate`, `llama-family`,
+`gemma-family`) are automatically excluded from benchmarking — they share weights with
+real models and their large context window parameters would stall every run with
+285-second KV-cache allocations.
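The exclusion itself is a simple base-name filter. A minimal Python sketch (illustrative only; the playbook applies `benchmark_skip_aliases` inside Ansible, and the function name here is hypothetical):

```python
# Aliases to exclude from the benchmark loop (default benchmark_skip_aliases).
SKIP_ALIASES = {"coder-128k", "coder-32k", "coder-rotate",
                "llama-family", "gemma-family"}

def benchmark_candidates(installed: list[str]) -> list[str]:
    # `ollama list` reports aliases with a :latest tag, so compare base names.
    return [m for m in installed if m.split(":")[0] not in SKIP_ALIASES]

benchmark_candidates(["coder-128k:latest", "qwen2.5-coder:7b", "gemma-family:latest"])
# keeps only qwen2.5-coder:7b
```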
 
 ## How to Run
 
 **Benchmark all installed models:**
 
 ```bash
-ansible-playbook playbooks/05_benchmark.yml
+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml
 ```
 
 **Benchmark specific models only:**
 
 ```bash
-ansible-playbook playbooks/05_benchmark.yml -e '{"benchmark_specific_models":["qwen2.5-coder:14b","deepseek-coder-v2:16b"]}'
+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml \
+  -e "benchmark_models=qwen2.5-coder:14b,deepseek-coder-v2:16b"
 ```
 
-**Benchmark with automatic model pulling if a better model is found:**
+**Benchmark and immediately push 6-slot warm-up selections:**
 
 ```bash
-ansible-playbook playbooks/05_benchmark.yml -e pull_if_better=true
+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml && \
+ansible-playbook playbooks/04_models.yml -K -e @local.yml
+```
+
+## Three-Pass Execution
+
+Models are split into three size tiers before benchmarking. Each tier gets its own
+per-request timeout to avoid small models waiting behind 70 B giants:
+
+| Tier   | RAM threshold | Timeout | Description                       |
+|--------|---------------|---------|-----------------------------------|
+| Small  | < 10 GB       | 300 s   | 7 B and under — fast path         |
+| Medium | 10–15 GB      | 900 s   | 16 B lite / 12 B — standard wait  |
+| Large  | > 15 GB       | 1200 s  | 34 B+ — 20-minute ceiling         |
+
+**Size source vs runtime RAM:** `ollama list` reports on-disk (compressed) sizes, which
+are smaller than actual runtime RAM usage (model weights + KV cache + overhead). A
+`benchmark_size_overhead_factor` (default `1.2`) is applied when computing tier
+boundaries: the disk-size cutoffs are divided by the factor before comparison. For
+example, with default settings a 9 GB on-disk model is treated as ~10.8 GB at runtime
+and falls in the medium tier rather than small.
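The routing rule above can be sketched as a few lines of Python (a hypothetical illustration with the default values; function and parameter names are not the playbook's actual implementation):

```python
def pick_tier(disk_size_gb: float,
              small_max_gb: float = 10.0,
              medium_max_gb: float = 15.0,
              overhead_factor: float = 1.2) -> tuple[str, int]:
    """Return (tier, timeout_seconds) for a model's on-disk size."""
    # Dividing the cutoffs by the overhead factor is equivalent to
    # multiplying the disk size by it (estimating runtime RAM).
    if disk_size_gb < small_max_gb / overhead_factor:
        return ("small", 300)
    if disk_size_gb < medium_max_gb / overhead_factor:
        return ("medium", 900)
    return ("large", 1200)

# deepseek-coder-v2 shows ~9 GB on disk but uses ~11 GB at runtime:
pick_tier(9.0)  # ('medium', 900), not small
```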
+
+**Override tier boundaries:**
+
+```bash
+# Adjust where small/medium boundary sits
+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml \
+  -e "benchmark_small_max_gb=8 benchmark_medium_max_gb=20"
+
+# Tune the overhead factor if your models load larger/smaller than expected
+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml \
+  -e "benchmark_size_overhead_factor=1.25"
+
+# Override timeouts only
+ansible-playbook playbooks/03_benchmark.yml -K -e @local.yml \
+  -e "benchmark_medium_timeout=600 benchmark_large_timeout=1800"
 ```
 
 ## Test Suites
 
 ### Coding Tests
 
-| Test       | Prompt                                                         | What Is Measured              |
-|------------|----------------------------------------------------------------|-------------------------------|
-| `code_gen` | "Write a Python function that implements binary search on a sorted list. Include type hints and docstring." | Correctness (def + return present), code structure, tokens/sec |
-| `debug`    | "Find and fix the bug in this Python code: `def factorial(n): return n * factorial(n)`. Explain the issue." | Identifies base case bug, explanation quality, tokens/sec |
-| `refactor` | "Refactor this code to use list comprehension: `result = []; for i in range(10): if i % 2 == 0: result.append(i*i)`" | Produces list comprehension, conciseness, tokens/sec |
+| Test       | Prompt                                                                     | What Is Measured                                   |
+|------------|----------------------------------------------------------------------------|----------------------------------------------------|
+| `code_gen` | Write a Python merge sort with type hints, docstring, and 3 unit tests     | `def`, `return`, `"""`, `->`, `assert`, `def test_`, `import` |
+| `debug`    | Find and fix 3 bugs in a given Python function                             | `def`, `return`, code block, `assert`              |
+| `refactor` | Refactor a loop for readability and performance                            | `def`, `return`, code block, type hint, `import`   |
 
 ### General Tests
 
-| Test        | Prompt                                                        | What Is Measured              |
-|-------------|---------------------------------------------------------------|-------------------------------|
-| `explain`   | "Explain the concept of recursion to a beginner programmer. Use a simple analogy." | Clarity, analogy presence, length adequacy, tokens/sec |
-| `creative`  | "Write a short poem about artificial intelligence."           | Creativity (line count, poetic structure), tokens/sec |
-| `reasoning` | "A farmer has 17 sheep. All but 9 die. How many are left? Explain your reasoning step by step." | Correct answer (9), step-by-step reasoning, tokens/sec |
+| Test        | Prompt                                                     | What Is Measured                                     |
+|-------------|------------------------------------------------------------|------------------------------------------------------|
+| `explain`   | Explain how Python's GIL works and when it matters         | Response length, paragraph structure, list formatting |
+| `creative`  | Suggest 5 fun family activities for a rainy weekend        | Response length, paragraph structure, list formatting |
+| `reasoning` | Apple arithmetic word problem                              | Response length, paragraph structure, list formatting |
 
 ### Latency Test
 
-| Test      | Prompt | What Is Measured           |
-|-----------|--------|----------------------------|
-| `latency` | "Hi"   | Time to first token (TTFT) |
+| Test      | Prompt | What Is Measured                                   |
+|-----------|--------|----------------------------------------------------|
+| `latency` | "Hi"   | Total response time (eval + prompt eval), used as TTFT proxy |
 
 ## Scoring
 
-### Metrics Collected from Ollama API
-
-- **tokens/sec** -- generation throughput from `/api/generate` response
-- **TTFT** (time to first token) -- measured from request start to first streamed token
-- **Quality heuristics** -- regex and length checks specific to each test type
-
 ### Composite Score Formula
 
 For each category (coding, general), a composite score is calculated:
 
 ```
-composite = (quality * 0.45) + (tokens_per_sec_normalized * 0.30) + (latency_score * 0.25)
+composite = (quality * 0.45) + (tokens_per_sec / ceiling, capped 1.0) * 0.30
+          + (1 - ttft_ms / 5000, floored 0) * 0.25
 ```
 
 Where:
-- `quality` is 0.0-1.0 based on heuristic checks for the test type
-- `tokens_per_sec_normalized` is the model's tokens/sec divided by the fastest model's tokens/sec
-- `latency_score` is 1.0 - (model_ttft / slowest_ttft)
+- `quality` — 0.0–1.0 from heuristic checks per test type (see CLAUDE.md for weights)
+- `tokens_per_sec` — averaged across all test responses; normalized against `benchmark_toks_norm_ceiling` (default 40)
+- `ttft_ms` — latency test response time in milliseconds
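For concreteness, the formula above translates directly into Python (an illustrative sketch, not the playbook's code; names are assumptions):

```python
def composite(quality: float, tokens_per_sec: float, ttft_ms: float,
              ceiling: float = 40.0) -> float:
    """Composite score per the formula above, with default weights."""
    speed_score = min(tokens_per_sec / ceiling, 1.0)    # capped at 1.0
    latency_score = max(1.0 - ttft_ms / 5000.0, 0.0)    # floored at 0
    return quality * 0.45 + speed_score * 0.30 + latency_score * 0.25

# Example: quality 0.8, 20 tok/sec, 500 ms TTFT:
# 0.8*0.45 + 0.5*0.30 + 0.9*0.25 = 0.36 + 0.15 + 0.225 = 0.735
```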
 
 ### Classification Rule
 
-A model is classified as a **coding** model if:
+A model is classified as **coding** if:
 
 ```
-coding_composite - general_composite >= 0.15
+coding_composite - general_composite >= benchmark_coding_threshold   # default 0.10
 ```
 
-Otherwise it is classified as **general**.
+Name-pattern heuristics (`coder`, `codestral`, `codellama`, `starcoder`) apply as a
+tiebreaker. Category can also be forced with `model_category_overrides` in `group_vars/all.yml`.
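The cascade (threshold check first, name pattern as tiebreaker) can be sketched as follows; this is a hypothetical illustration that omits the `model_category_overrides` step, and the names are not the playbook's actual code:

```python
import re

# Name patterns listed above, used only when the composite delta is below threshold.
CODING_PATTERN = re.compile(r"coder|codestral|codellama|starcoder")

def classify(name: str, coding_composite: float, general_composite: float,
             threshold: float = 0.10) -> str:
    if coding_composite - general_composite >= threshold:
        return "coding"
    if CODING_PATTERN.search(name):
        return "coding"   # tiebreaker on model name
    return "general"

classify("qwen2.5-coder:14b", 0.60, 0.58)  # 'coding' via name pattern
classify("llama3.1:8b", 0.61, 0.74)        # 'general'
```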
 
 ## Thresholds and Configuration
 
-All thresholds are configurable via `group_vars/all.yml`:
-
-| Key                            | Default | Description                                    |
-|--------------------------------|---------|------------------------------------------------|
-| `benchmark_min_tokens_per_sec` | 10      | Minimum tokens/sec to pass a model             |
-| `benchmark_max_ttft_ms`        | 5000    | Maximum time to first token in milliseconds    |
-| `benchmark_quality_weight`     | 0.45    | Weight of quality score in composite            |
-| `benchmark_speed_weight`       | 0.30    | Weight of tokens/sec in composite               |
-| `benchmark_latency_weight`     | 0.25    | Weight of latency score in composite            |
-| `benchmark_coding_threshold`   | 0.15    | Minimum coding-general delta for coding classification |
+All thresholds are configurable in `inventory/group_vars/all.yml`:
+
+| Key                                        | Default   | Description                                                            |
+|--------------------------------------------|-----------|------------------------------------------------------------------------|
+| `benchmark_thresholds.min_tokens_per_sec`  | 5.0       | Minimum tok/sec to be slot-eligible                                    |
+| `benchmark_thresholds.min_quality_score`   | 0.6       | Minimum quality score to be slot-eligible                              |
+| `benchmark_thresholds.min_composite_score` | 0.55      | Minimum composite to avoid threshold warning                           |
+| `benchmark_toks_norm_ceiling`              | 40        | tok/sec ceiling for normalization (dual-socket target)                 |
+| `benchmark_coding_threshold`               | 0.10      | coding-general composite delta for classification                      |
+| `benchmark_small_max_gb`                   | 10        | Runtime RAM upper bound for small pass (GB)                            |
+| `benchmark_medium_max_gb`                  | 15        | Runtime RAM upper bound for medium pass (GB)                           |
+| `benchmark_size_overhead_factor`           | 1.2       | Multiplier applied to `ollama list` disk sizes to estimate runtime RAM |
+| `benchmark_small_timeout`                  | 300       | Per-request timeout for small models (seconds)                         |
+| `benchmark_medium_timeout`                 | 900       | Per-request timeout for medium models (seconds)                        |
+| `benchmark_large_timeout`                  | 1200      | Per-request timeout for large models (seconds)                         |
+| `benchmark_skip_aliases`                   | see below | Modelfile aliases excluded from benchmark loop                         |
+
+Default `benchmark_skip_aliases`:
+```yaml
+- coder-128k
+- coder-32k
+- coder-rotate
+- llama-family
+- gemma-family
+```
 
 ## Output Format
 
 ### Benchmark Report
 
-Each run produces `benchmarks/benchmark_<timestamp>.md` with a results table:
+Each run produces `benchmarks/results/benchmark_<timestamp>.md`. The slot table now
+covers all 6 slots across both NUMA instances:
 
 ```
-| Model                  | Coding Composite | General Composite | Classification | Tokens/sec | TTFT (ms) |
-|------------------------|------------------|-------------------|----------------|------------|-----------|
-| qwen2.5-coder:14b      | 0.82             | 0.65              | coding         | 38.2       | 420       |
-| deepseek-coder-v2:16b  | 0.78             | 0.63              | coding         | 35.1       | 510       |
-| llama3.1:8b            | 0.61             | 0.74              | general        | 52.3       | 280       |
-| mistral:7b             | 0.58             | 0.71              | general        | 55.8       | 250       |
+| Slot | Socket              | Role            | Model                     | Composite |
+|------|---------------------|-----------------|---------------------------|-----------|
+| 1    | Node 1 (port 11434) | General (locked)| llama3.1:8b               | 0.74      |
+| 2    | Node 1 (port 11434) | General (locked)| mistral:latest            | 0.71      |
+| 5    | Node 1 (port 11434) | General (rotate)| llama3.2:3b               | 0.63      |
+| 3    | Node 0 (port 11435) | Coding (locked) | deepseek-coder-v2:16b     | 0.82      |
+| 4    | Node 0 (port 11435) | Coding (locked) | qwen2.5-coder:7b          | 0.78      |
+| 6    | Node 0 (port 11435) | Coding (rotate) | codegemma:7b              | 0.69      |
 ```
 
-### Model Selection File
+### model_selection.json
 
-Results are also written to `model_selection.json`:
+Results are written to `benchmarks/results/model_selection.json`:
 
 ```json
 {
-  "timestamp": "2025-01-15T10:30:00Z",
-  "slot1_coding": "qwen2.5-coder:14b",
-  "slot2_general": "llama3.1:8b",
-  "slot3_backup": "deepseek-coder-v2:16b",
-  "slot4_experimental": null,
-  "results": { ... }
+  "slot1_general": "llama3.1:8b",
+  "slot2_general": "mistral:latest",
+  "slot5_general_rotate": "llama3.2:3b",
+  "slot3_coding": "deepseek-coder-v2:16b",
+  "slot4_coding": "qwen2.5-coder:7b",
+  "slot6_coding_rotate": "codegemma:7b",
+  "general_ranking": [...],
+  "coding_ranking": [...],
+  "all_metrics": { ... }
 }
 ```
 
-## Slot Selection
-
-Slots are assigned from benchmark results as follows:
-
-1. **Slot 1 (Primary Coding)** -- model with the highest `coding_composite` score
-2. **Slot 2 (Primary General)** -- model with the highest `general_composite` score
-3. **Slot 3 (Secondary / Backup)** -- next-best model by overall average composite
-4. **Slot 4 (Experimental)** -- not assigned by benchmarks; set manually via `-e slot4_model=<name>`
+This file is read by `04_models.yml` to decide what to pull and warm up. It is committed
+to the repo so slot selections survive a clean checkout.
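Outside Ansible, the file is plain JSON and easy to inspect; a quick sketch against a trimmed sample of the layout shown above (the real file also carries `general_ranking`, `coding_ranking`, and `all_metrics`):

```python
import json

sample = """{
  "slot1_general": "llama3.1:8b",
  "slot3_coding": "deepseek-coder-v2:16b",
  "general_ranking": []
}"""
selection = json.loads(sample)

# Pull out just the slot assignments, as 04_models.yml would.
slots = {k: v for k, v in selection.items() if k.startswith("slot")}
print(slots)
```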

+ 17 - 6
benchmarks/results/benchmark_20260307T170059.md

@@ -1,15 +1,20 @@
 # Benchmark Results - 20260307T170059
 
 ## Model Selection
-| Slot | Role | Model | Composite Score |
-|------|------|-------|----------------|
-| 1 | General (Primary) | llama3.2:3b | 0.967 |
-| 2 | General (Secondary) | llama3.2:3b | 0.967 |
-| 3 | Coding (Primary) | deepseek-coder-v2 | 0.738 |
-| 4 | Coding (Secondary) | qwen2.5-coder:7b | 0.63 |
+
+
+| Slot | Role                | Model             | Composite Score |
+| ---- | ------------------- | ----------------- | --------------- |
+| 1    | General (Primary)   | llama3.2:3b       | 0.967           |
+| 2    | General (Secondary) | llama3.2:3b       | 0.967           |
+| 3    | Coding (Primary)    | deepseek-coder-v2 | 0.738           |
+| 4    | Coding (Secondary)  | qwen2.5-coder:7b  | 0.63            |
+
 
 ## Detailed Metrics
+
 ### deepseek-coder-v2
+
 - **Category**: coding
 - **Coding Quality**: 0.667
 - **General Quality**: 0.918
@@ -17,7 +22,9 @@
 - **Latency (ms)**: 1744.5
 - **Coding Composite**: 0.738
 - **General Composite**: 0.852
+
 ### qwen2.5-coder:7b
+
 - **Category**: coding
 - **Coding Quality**: 0.64
 - **General Quality**: 0.922
@@ -25,7 +32,9 @@
 - **Latency (ms)**: 1211.5
 - **Coding Composite**: 0.63
 - **General Composite**: 0.757
+
 ### llama3.2:3b
+
 - **Category**: general
 - **Coding Quality**: 0.607
 - **General Quality**: 0.991
@@ -35,7 +44,9 @@
 - **General Composite**: 0.967
 
 ## Scoring Formula
+
 - Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
 - Speed normalized against 22 tok/sec ceiling (hardware-observed max)
 - Coding quality: has_def×0.20 + has_return×0.20 + has_docstring×0.15 + has_type_hint×0.15 + has_code_block×0.10 + has_assert×0.08 + has_test_def×0.07 + has_import×0.05
 - Category: override dict → quality delta (coding_avg - general_avg >= 0.1) → name pattern (coder/codestral/codellama/starcoder) → general
+

+ 92 - 0
benchmarks/results/benchmark_20260307T184212.md

@@ -0,0 +1,92 @@
+# Benchmark Results - 20260307T184212
+
+## Model Selection
+| Slot | Role | Model | Composite Score |
+|------|------|-------|----------------|
+| 1 | General (Primary) | llama3.2:3b | 0.001 |
+| 2 | General (Secondary) | gemma-family:latest | 0.0 |
+| 3 | Coding (Primary) | coder-128k:latest | 0.001 |
+| 4 | Coding (Secondary) | coder-32k:latest | 0.001 |
+
+## Detailed Metrics
+### gemma-family:latest
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+### llama-family:latest
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+### coder-128k:latest
+- **Category**: coding
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 285394.5
+- **Coding Composite**: 0.001
+- **General Composite**: 0.001
+### coder-32k:latest
+- **Category**: coding
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.1
+- **Latency (ms)**: 142328.6
+- **Coding Composite**: 0.001
+- **General Composite**: 0.001
+### llama3.1:8b
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+### deepseek-coder-v2:latest
+- **Category**: coding
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+### qwen2.5-coder:7b
+- **Category**: coding
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.1
+- **Latency (ms)**: 143942.9
+- **Coding Composite**: 0.001
+- **General Composite**: 0.001
+### gemma3:12b-it-q4_K_M
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+### llama3.2:3b
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.1
+- **Latency (ms)**: 139756.5
+- **Coding Composite**: 0.001
+- **General Composite**: 0.001
+
+## Scoring Formula
+- Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
+- Speed normalized against 22 tok/sec ceiling (hardware-observed max)
+- Coding quality (per-prompt):
+  code_gen: has_def×0.20 + has_return×0.20 + has_docstring×0.15 + has_type_hint×0.15 + has_code_block×0.10 + has_assert×0.08 + has_test_def×0.07 + has_import×0.05
+  debug:    has_def×0.30 + has_return×0.30 + has_code_block×0.25 + has_assert×0.15
+  refactor: has_def×0.25 + has_return×0.25 + has_code_block×0.20 + has_type_hint×0.15 + has_import×0.15
+- Category: override dict → quality delta (coding_avg - general_avg >= 0.1) → name pattern (coder/codestral/codellama/starcoder) → general

+ 147 - 0
benchmarks/results/benchmark_20260308T003605.md

@@ -0,0 +1,147 @@
+# Benchmark Results - 20260308T003605
+
+## Model Selection (6-slot / 2-socket)
+
+
+| Slot | Socket              | Role             | Model                 | Composite Score |
+| ---- | ------------------- | ---------------- | --------------------- | --------------- |
+| 1    | Node 1 (port 11434) | General (locked) | llama3.2:3b           | 0.001           |
+| 2    | Node 1 (port 11434) | General (locked) | command-r:35b         | 0.0             |
+| 5    | Node 1 (port 11434) | General (rotate) | llama3.1:70b          | 0.0             |
+| 3    | Node 0 (port 11435) | Coding (locked)  | codellama:34b         | 0.0             |
+| 4    | Node 0 (port 11435) | Coding (locked)  | deepseek-coder-v2:16b | 0.0             |
+| 6    | Node 0 (port 11435) | Coding (rotate)  | qwen2.5-coder:14b     | 0.0             |
+
+
+## Detailed Metrics
+
+### codellama:34b
+
+- **Category**: coding
+- **Coding Quality**: 0
+- **General Quality**: 0.008
+- **Avg Tokens/sec**: 0.1
+- **Latency (ms)**: 221414.9
+- **Coding Composite**: 0.0
+- **General Composite**: 0.004
+
+### deepseek-coder-v2:16b
+
+- **Category**: coding
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+
+### qwen2.5-coder:14b
+
+- **Category**: coding
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 239690.0
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+
+### deepseek-coder-v2:latest
+
+- **Category**: coding
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+
+### command-r:35b
+
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.1
+- **Latency (ms)**: 169971.8
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+
+### llama3.1:70b
+
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+
+### mistral-nemo:latest
+
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+
+### mistral:latest
+
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+
+### llama3.1:8b
+
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+
+### qwen2.5-coder:7b
+
+- **Category**: coding
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+
+### gemma3:12b-it-q4_K_M
+
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.0
+- **Latency (ms)**: 9999
+- **Coding Composite**: 0.0
+- **General Composite**: 0.0
+
+### llama3.2:3b
+
+- **Category**: general
+- **Coding Quality**: 0
+- **General Quality**: 0
+- **Avg Tokens/sec**: 0.1
+- **Latency (ms)**: 130127.2
+- **Coding Composite**: 0.001
+- **General Composite**: 0.001
+
+## Scoring Formula
+
+- Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
+- Speed normalized against 40 tok/sec ceiling (hardware-observed max)
+- Coding quality (per-prompt):
+code_gen: has_def×0.20 + has_return×0.20 + has_docstring×0.15 + has_type_hint×0.15 + has_code_block×0.10 + has_assert×0.08 + has_test_def×0.07 + has_import×0.05
+debug:    has_def×0.30 + has_return×0.30 + has_code_block×0.25 + has_assert×0.15
+refactor: has_def×0.25 + has_return×0.25 + has_code_block×0.20 + has_type_hint×0.15 + has_import×0.15
+- Category: override dict → quality delta (coding_avg - general_avg >= 0.1) → name pattern (coder/codestral/codellama/starcoder) → general
+
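The scoring formula above can be sketched in Python. This is a minimal sketch, not the playbook's Jinja implementation: `latency_score` is taken as an input because its normalization is not shown in this section, and the usage comment reproduces the pre-change single-socket ceiling of 22 tok/sec for comparison.

```python
def composite(quality: float, tok_per_sec: float, latency_score: float,
              ceiling: float = 40.0) -> float:
    """Composite = quality*0.45 + normalized speed*0.30 + latency score*0.25."""
    toks_norm = min(tok_per_sec / ceiling, 1.0)  # speed capped at the hardware ceiling
    return quality * 0.45 + toks_norm * 0.30 + latency_score * 0.25

# llama3.2:3b under the old ceiling of 22 tok/sec: quality 0.979,
# 22.5 tok/sec (capped to 1.0), latency_score 0.884 -> roughly 0.96
old_score = composite(0.979, 22.5, 0.884, ceiling=22.0)
```

Under the new ceiling of 40, the same 22.5 tok/sec normalizes to 0.5625 instead of being capped at 1.0, which lowers the composite to roughly 0.83; that sensitivity is why the ceiling change in `all.yml` matters for slot selection.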

+ 24 - 101
benchmarks/results/model_selection.json

@@ -1,116 +1,39 @@
 {
     "all_metrics": {
-        "deepseek-coder-v2:latest": {
-            "avg_tok_per_sec": 21.6,
+        "mistral-nemo:latest:mistral:latest:llama3.1:8b:qwen2.5-coder:7b:gemma3:12b-it-q4_K_M:llama3.2:3b": {
+            "avg_tok_per_sec": 0.0,
             "category": "coding",
-            "coding_composite": 0.764,
-            "coding_quality": 0.657,
-            "general_composite": 0.867,
-            "general_quality": 0.886,
-            "latency_ms": 1510.5,
-            "latency_score": 0.698,
-            "toks_norm": 0.982
-        },
-        "gemma3:12b-it-q4_K_M": {
-            "avg_tok_per_sec": 5.6,
-            "category": "general",
-            "coding_composite": 0.416,
-            "coding_quality": 0.757,
-            "general_composite": 0.495,
-            "general_quality": 0.931,
-            "latency_ms": 5975.8,
+            "coding_composite": 0.0,
+            "coding_quality": 0,
+            "general_composite": 0.0,
+            "general_quality": 0,
+            "latency_ms": 9999,
             "latency_score": 0,
-            "toks_norm": 0.253
-        },
-        "llama3.2:3b": {
-            "avg_tok_per_sec": 22.5,
-            "category": "general",
-            "coding_composite": 0.846,
-            "coding_quality": 0.723,
-            "general_composite": 0.961,
-            "general_quality": 0.979,
-            "latency_ms": 580.7,
-            "latency_score": 0.884,
-            "toks_norm": 1.0
-        },
-        "qwen2.5-coder:7b": {
-            "avg_tok_per_sec": 12.3,
-            "category": "coding",
-            "coding_composite": 0.664,
-            "coding_quality": 0.683,
-            "general_composite": 0.756,
-            "general_quality": 0.888,
-            "latency_ms": 1222.4,
-            "latency_score": 0.756,
-            "toks_norm": 0.56
+            "toks_norm": 0.0
         }
     },
     "coding_ranking": [
         {
-            "composite": 0.764,
+            "composite": 0.0,
             "metrics": {
-                "avg_tok_per_sec": 21.6,
+                "avg_tok_per_sec": 0.0,
                 "category": "coding",
-                "coding_composite": 0.764,
-                "coding_quality": 0.657,
-                "general_composite": 0.867,
-                "general_quality": 0.886,
-                "latency_ms": 1510.5,
-                "latency_score": 0.698,
-                "toks_norm": 0.982
-            },
-            "name": "deepseek-coder-v2:latest"
-        },
-        {
-            "composite": 0.664,
-            "metrics": {
-                "avg_tok_per_sec": 12.3,
-                "category": "coding",
-                "coding_composite": 0.664,
-                "coding_quality": 0.683,
-                "general_composite": 0.756,
-                "general_quality": 0.888,
-                "latency_ms": 1222.4,
-                "latency_score": 0.756,
-                "toks_norm": 0.56
-            },
-            "name": "qwen2.5-coder:7b"
-        }
-    ],
-    "general_ranking": [
-        {
-            "composite": 0.961,
-            "metrics": {
-                "avg_tok_per_sec": 22.5,
-                "category": "general",
-                "coding_composite": 0.846,
-                "coding_quality": 0.723,
-                "general_composite": 0.961,
-                "general_quality": 0.979,
-                "latency_ms": 580.7,
-                "latency_score": 0.884,
-                "toks_norm": 1.0
-            },
-            "name": "llama3.2:3b"
-        },
-        {
-            "composite": 0.495,
-            "metrics": {
-                "avg_tok_per_sec": 5.6,
-                "category": "general",
-                "coding_composite": 0.416,
-                "coding_quality": 0.757,
-                "general_composite": 0.495,
-                "general_quality": 0.931,
-                "latency_ms": 5975.8,
+                "coding_composite": 0.0,
+                "coding_quality": 0,
+                "general_composite": 0.0,
+                "general_quality": 0,
+                "latency_ms": 9999,
                 "latency_score": 0,
-                "toks_norm": 0.253
+                "toks_norm": 0.0
             },
-            "name": "gemma3:12b-it-q4_K_M"
+            "name": "mistral-nemo:latest:mistral:latest:llama3.1:8b:qwen2.5-coder:7b:gemma3:12b-it-q4_K_M:llama3.2:3b"
         }
     ],
-    "slot1_general": "llama3.2:3b",
-    "slot2_general": "gemma3:12b-it-q4_K_M",
-    "slot3_coding": "deepseek-coder-v2:latest",
-    "slot4_coding": "qwen2.5-coder:7b"
+    "general_ranking": [],
+    "slot1_general": "none",
+    "slot2_general": "none",
+    "slot3_coding": "mistral-nemo:latest:mistral:latest:llama3.1:8b:qwen2.5-coder:7b:gemma3:12b-it-q4_K_M:llama3.2:3b",
+    "slot4_coding": "mistral-nemo:latest:mistral:latest:llama3.1:8b:qwen2.5-coder:7b:gemma3:12b-it-q4_K_M:llama3.2:3b",
+    "slot5_general_rotate": "none",
+    "slot6_coding_rotate": "none"
 }

+ 23 - 4
inventory/group_vars/all.yml

@@ -51,6 +51,7 @@ vault_approle_name: "ai-services"
 # Service ports
 keycloak_port: 8180
 ollama_port: 11434
+ollama_node0_port: 11435
 qdrant_http_port: 6333
 qdrant_grpc_port: 6334
 
@@ -58,17 +59,19 @@ qdrant_grpc_port: 6334
 ollama_host: "0.0.0.0:11434"
 ollama_num_threads: 14
 ollama_num_parallel: 2
-ollama_max_loaded_models: 4
+ollama_max_loaded_models: 3   # 3 per socket (6 total across both NUMA instances)
 ollama_keep_alive: "-1"
 ollama_flash_attention: "1"
 
 # NUMA/CPU affinity - Dell M630, 2x E5-2690v4
 # CPUs are interleaved: odd = socket 1 (NUMA node 1), even = socket 0.
 # Physical cores on node 1: 1,3,...,27 (14 cores). HT siblings: 29,31,...,55.
+# Physical cores on node 0: 0,2,...,26 (14 cores). HT siblings: 28,30,...,54.
 # Pinning to physical cores only eliminates HT contention on the memory bus.
 # NUMA node 1 has ~120 GB free RAM vs node 0's ~75 GB.
 ollama_numa_node: "1"
 ollama_cpu_affinity: "1 3 5 7 9 11 13 15 17 19 21 23 25 27"
+ollama_node0_cpu_affinity: "0 2 4 6 8 10 12 14 16 18 20 22 24 26"
 ollama_binary_path: /usr/bin/ollama
 
 # Keycloak configuration
@@ -85,9 +88,25 @@ benchmark_thresholds:
   min_quality_score: 0.6
   min_composite_score: 0.55
 
-benchmark_toks_norm_ceiling: 22     # Observed hardware max on Dell M630 (22.5 tok/sec measured)
+benchmark_toks_norm_ceiling: 40     # Conservative dual-socket estimate (was 22 single-socket)
 benchmark_coding_threshold: 0.10    # Delta to classify a model as coding-specialized
 
+# Modelfile aliases created by 04_models.yml — excluded from benchmark to prevent
+# 32k-token KV cache allocations stalling the run with 285-second response times.
+benchmark_skip_aliases:
+  - "coder-128k"
+  - "coder-32k"
+  - "coder-rotate"
+  - "llama-family"
+  - "gemma-family"
+
+benchmark_small_max_gb: 10    # upper size boundary for small pass (< 10 GB), based on runtime RAM
+benchmark_medium_max_gb: 15   # upper size boundary for medium pass (10–15 GB), based on runtime RAM
+benchmark_size_overhead_factor: 1.2  # ollama list shows disk size; multiply by this to estimate runtime RAM
+benchmark_small_timeout: 300  # seconds per request, small models
+benchmark_medium_timeout: 900 # seconds per request, medium models (15 min)
+benchmark_large_timeout: 1200 # seconds per request, large models (20 min)
+
 # Explicit category overrides applied before heuristics. Keys are model names as
 # returned by `ollama list`. Valid values: 'coding' or 'general'.
 # Example: { "deepseek-coder-v2": "coding", "qwen2.5-coder:7b": "coding" }
@@ -124,8 +143,8 @@ candidate_models:
     reason: "StarCoder2 coding specialist"
     category: coding
 
-# OpenClaw default model
-openclaw_model: "llama3.2:3b"
+# OpenClaw default model — overridden dynamically by 08_openclaw.yml from slot1_general
+openclaw_model: "deepseek-coder-v2:16b-lite-instruct-q4_K_M"
 
 # AWS Bedrock (OpenAI-compatible API via Open WebUI)
 # Pass bearer_token on first run: -e "bedrock_bearer_token=<value>"
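The size-aware routing configured above can be sketched as follows. The playbook compares disk size against each threshold divided by the overhead factor; this sketch multiplies disk size by the factor instead, which is algebraically equivalent and easier to read. Threshold and factor values are the defaults from `all.yml`.

```python
def tier(disk_gb: float, factor: float = 1.2,
         small_max: float = 10.0, medium_max: float = 15.0) -> str:
    """Route a model to a benchmark pass by estimated runtime RAM."""
    ram_gb = disk_gb * factor  # ollama list reports compressed disk size
    if ram_gb <= small_max:
        return "small"    # 300 s per-request timeout
    if ram_gb <= medium_max:
        return "medium"   # 900 s
    return "large"        # 1200 s

# deepseek-coder-v2 shows ~8.9 GB on disk -> ~10.7 GB estimated RAM -> medium,
# which is the misclassification the overhead factor was added to fix.
```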

+ 42 - 0
playbooks/02_infrastructure.yml

@@ -155,6 +155,42 @@
       tags:
         - ollama
 
+    - name: "Ollama | Deploy ollama-node0 systemd unit"
+      ansible.builtin.template:
+        src: "{{ playbook_dir }}/../templates/ollama/ollama-node0.service.j2"
+        dest: /etc/systemd/system/ollama-node0.service
+        mode: "0644"
+        owner: root
+        group: root
+      notify:
+        - Reload systemd and start ollama-node0
+      tags:
+        - ollama
+
+    - name: "Ollama | Enable and start ollama-node0"
+      ansible.builtin.systemd:
+        name: ollama-node0
+        enabled: true
+        state: started
+        daemon_reload: true
+      tags:
+        - ollama
+
+    - name: "Ollama | Wait for ollama-node0 API to be ready"
+      ansible.builtin.uri:
+        url: "http://localhost:{{ ollama_node0_port }}/api/tags"
+        method: GET
+        headers:
+          Authorization: "Bearer {{ ollama_api_key }}"
+        status_code: 200
+        timeout: 10
+      register: ollama_node0_ready
+      retries: 24
+      delay: 5
+      until: ollama_node0_ready.status == 200
+      tags:
+        - ollama
+
     # ── OS-level kernel tuning for dedicated inference server ────────────────
     - name: "OS Tune | Apply sysctl settings for inference workload"
       ansible.posix.sysctl:
@@ -261,6 +297,12 @@
         state: restarted
         daemon_reload: true
 
+    - name: Reload systemd and start ollama-node0
+      ansible.builtin.systemd:
+        name: ollama-node0
+        state: started
+        daemon_reload: true
+
     - name: Reload systemd daemon
       ansible.builtin.systemd:
         daemon_reload: true
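The readiness wait above retries 24 times with a 5-second delay, bounding startup to roughly a two-minute window. A sketch of the same poll loop, with the probe injected so it can be exercised without a live instance (in practice the probe would be an HTTP GET to `/api/tags` with the bearer token):

```python
import time

def wait_ready(probe, retries: int = 24, delay: float = 5.0) -> bool:
    """Poll probe() until it returns True, mirroring the uri retries/delay task."""
    for attempt in range(retries):
        if probe():
            return True
        if attempt < retries - 1:
            time.sleep(delay)  # back off between attempts
    return False
```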

+ 128 - 21
playbooks/03_benchmark.yml

@@ -100,27 +100,69 @@
       tags:
         - benchmark-discover
 
-    - name: "Benchmark | Set models_to_benchmark to all installed models"
+    - name: "Benchmark | Parse model sizes from ollama list"
       ansible.builtin.set_fact:
-        models_to_benchmark: "{{ installed_models }}"
+        _benchmark_sizes_json: |
+          {% set ns = namespace(d={}) %}
+          {% for line in ollama_list_output.stdout_lines[1:] %}
+          {%   set p = line.split() %}
+          {%   if p | length >= 4 %}
+          {%     set gb = (p[2] | float) if (p[3] | upper == 'GB') else ((p[2] | float) / 1024) %}
+          {%     set _ = ns.d.update({p[0]: gb}) %}
+          {%   endif %}
+          {% endfor %}
+          {{ ns.d | to_json }}
+      tags:
+        - benchmark-discover
+
+    - name: "Benchmark | Partition models into small, medium, and large passes"
+      ansible.builtin.set_fact:
+        _small_models:  "{{ _alias_filtered | select('in', _small_ok)  | list }}"
+        _medium_models: "{{ _alias_filtered | select('in', _medium_ok) | list }}"
+        _large_models:  "{{ _alias_filtered | reject('in', _small_ok)  | reject('in', _medium_ok) | list }}"
+        models_to_benchmark: "{{ _alias_filtered | list }}"
+      vars:
+        _sizes:     "{{ _benchmark_sizes_json | from_json }}"
+        _small_cut:  "{{ (benchmark_small_max_gb  | float) / (benchmark_size_overhead_factor | float) }}"
+        _medium_cut: "{{ (benchmark_medium_max_gb | float) / (benchmark_size_overhead_factor | float) }}"
+        _small_ok:  "{{ _sizes | dict2items | selectattr('value', 'le', _small_cut  | float) | map(attribute='key') | list }}"
+        _medium_ok: "{{ _sizes | dict2items | selectattr('value', 'gt', _small_cut  | float)
+                                            | selectattr('value', 'le', _medium_cut | float)
+                                            | map(attribute='key') | list }}"
+        _alias_filtered: "{{ installed_models | reject('match', '^(' ~ benchmark_skip_aliases | join('|') ~ ')(:|$)') | list }}"
       when: benchmark_models | default('') | length == 0
       tags:
         - benchmark-discover
 
     - name: "Benchmark | Set models_to_benchmark to specified subset"
       ansible.builtin.set_fact:
-        models_to_benchmark: "{{ benchmark_models.split(',') | map('trim') | list }}"
+        models_to_benchmark: "{{ _specified }}"
+        _small_models:  "{{ _specified | select('in', _small_ok)  | list }}"
+        _medium_models: "{{ _specified | select('in', _medium_ok) | list }}"
+        _large_models:  "{{ _specified | reject('in', _small_ok)  | reject('in', _medium_ok) | list }}"
+      vars:
+        _specified: "{{ benchmark_models.split(',') | map('trim') | list }}"
+        _sizes:     "{{ _benchmark_sizes_json | from_json }}"
+        _small_cut:  "{{ (benchmark_small_max_gb  | float) / (benchmark_size_overhead_factor | float) }}"
+        _medium_cut: "{{ (benchmark_medium_max_gb | float) / (benchmark_size_overhead_factor | float) }}"
+        _small_ok:  "{{ _sizes | dict2items | selectattr('value', 'le', _small_cut  | float) | map(attribute='key') | list }}"
+        _medium_ok: "{{ _sizes | dict2items | selectattr('value', 'gt', _small_cut  | float)
+                                            | selectattr('value', 'le', _medium_cut | float)
+                                            | map(attribute='key') | list }}"
       when: benchmark_models | default('') | length > 0
       tags:
         - benchmark-discover
 
     - name: "Benchmark | Display models to benchmark"
       ansible.builtin.debug:
-        msg: "Will benchmark the following models: {{ models_to_benchmark }}"
+        msg:
+          - "Small  pass (timeout {{ benchmark_small_timeout }}s,  ≤{{ benchmark_small_max_gb }}GB):  {{ _small_models }}"
+          - "Medium pass (timeout {{ benchmark_medium_timeout }}s, {{ benchmark_small_max_gb }}–{{ benchmark_medium_max_gb }}GB): {{ _medium_models }}"
+          - "Large  pass (timeout {{ benchmark_large_timeout }}s, >{{ benchmark_medium_max_gb }}GB): {{ _large_models }}"
       tags:
         - benchmark-discover
 
-    - name: "Benchmark | Run test prompts against each model"
+    - name: "Benchmark | Run test prompts against small models"
       ansible.builtin.uri:
         url: "{{ ollama_api_url }}/api/generate"
         method: POST
@@ -131,16 +173,70 @@
           stream: false
         headers:
           Authorization: "Bearer {{ ollama_api_key }}"
-        timeout: 300
+        timeout: "{{ benchmark_small_timeout }}"
         status_code: 200
-      loop: "{{ models_to_benchmark | product(test_prompts.keys() | list) | list }}"
+      loop: "{{ _small_models | product(test_prompts.keys() | list) | list }}"
       loop_control:
         label: "{{ item.0 }} / {{ item.1 }}"
-      register: benchmark_raw_results
+      register: _bench_small
       failed_when: false
       tags:
         - benchmark-run
 
+    - name: "Benchmark | Run test prompts against medium models"
+      ansible.builtin.uri:
+        url: "{{ ollama_api_url }}/api/generate"
+        method: POST
+        body_format: json
+        body:
+          model: "{{ item.0 }}"
+          prompt: "{{ test_prompts[item.1].prompt }}"
+          stream: false
+        headers:
+          Authorization: "Bearer {{ ollama_api_key }}"
+        timeout: "{{ benchmark_medium_timeout }}"
+        status_code: 200
+      loop: "{{ _medium_models | product(test_prompts.keys() | list) | list }}"
+      loop_control:
+        label: "{{ item.0 }} / {{ item.1 }}"
+      register: _bench_medium
+      failed_when: false
+      when: _medium_models | length > 0
+      tags:
+        - benchmark-run
+
+    - name: "Benchmark | Run test prompts against large models"
+      ansible.builtin.uri:
+        url: "{{ ollama_api_url }}/api/generate"
+        method: POST
+        body_format: json
+        body:
+          model: "{{ item.0 }}"
+          prompt: "{{ test_prompts[item.1].prompt }}"
+          stream: false
+        headers:
+          Authorization: "Bearer {{ ollama_api_key }}"
+        timeout: "{{ benchmark_large_timeout }}"
+        status_code: 200
+      loop: "{{ _large_models | product(test_prompts.keys() | list) | list }}"
+      loop_control:
+        label: "{{ item.0 }} / {{ item.1 }}"
+      register: _bench_large
+      failed_when: false
+      when: _large_models | length > 0
+      tags:
+        - benchmark-run
+
+    - name: "Benchmark | Merge small, medium, and large model results"
+      ansible.builtin.set_fact:
+        benchmark_raw_results:
+          results: >-
+            {{ (_bench_small.results  | default([]))
+             + (_bench_medium.results | default([]))
+             + (_bench_large.results  | default([])) }}
+      tags:
+        - benchmark-run
+
     - name: "Benchmark | Compute per-model metrics"
       ansible.builtin.set_fact:
         model_metrics: |
@@ -239,9 +335,14 @@
           {% set coding_sorted = coding_models | sort(attribute='composite', reverse=true) %}
           {% set slot1 = general_sorted[0].name if general_sorted | length > 0 else 'none' %}
           {% set slot2 = general_sorted[1].name if general_sorted | length > 1 else (general_sorted[0].name if general_sorted | length > 0 else 'none') %}
+          {% set slot5 = general_sorted[2].name if general_sorted | length > 2 else 'none' %}
           {% set slot3 = coding_sorted[0].name if coding_sorted | length > 0 else (general_sorted[0].name if general_sorted | length > 0 else 'none') %}
           {% set slot4 = coding_sorted[1].name if coding_sorted | length > 1 else (coding_sorted[0].name if coding_sorted | length > 0 else 'none') %}
-          {{ {'slot1_general': slot1, 'slot2_general': slot2, 'slot3_coding': slot3, 'slot4_coding': slot4, 'all_metrics': parsed_metrics, 'general_ranking': general_sorted, 'coding_ranking': coding_sorted} | to_json }}
+          {% set slot6 = coding_sorted[2].name if coding_sorted | length > 2 else 'none' %}
+          {{ {'slot1_general': slot1, 'slot2_general': slot2, 'slot5_general_rotate': slot5,
+              'slot3_coding': slot3, 'slot4_coding': slot4, 'slot6_coding_rotate': slot6,
+              'all_metrics': parsed_metrics, 'general_ranking': general_sorted,
+              'coding_ranking': coding_sorted} | to_json }}
       tags:
         - benchmark-select
 
@@ -255,12 +356,16 @@
       ansible.builtin.debug:
         msg:
           - "============================================="
-          - "  MODEL SELECTION RESULTS"
+          - "  MODEL SELECTION RESULTS  (6-slot / 2-socket)"
           - "============================================="
-          - "  Slot 1 (General Primary):  {{ selection.slot1_general }}"
-          - "  Slot 2 (General Secondary): {{ selection.slot2_general }}"
-          - "  Slot 3 (Coding Primary):   {{ selection.slot3_coding }}"
-          - "  Slot 4 (Coding Secondary): {{ selection.slot4_coding }}"
+          - "  Node 1 — General (port 11434)"
+          - "  Slot 1 (locked):   {{ selection.slot1_general }}"
+          - "  Slot 2 (locked):   {{ selection.slot2_general }}"
+          - "  Slot 5 (rotate):   {{ selection.slot5_general_rotate }}"
+          - "  Node 0 — Coding (port 11435)"
+          - "  Slot 3 (locked):   {{ selection.slot3_coding }}"
+          - "  Slot 4 (locked):   {{ selection.slot4_coding }}"
+          - "  Slot 6 (rotate):   {{ selection.slot6_coding_rotate }}"
           - "============================================="
       tags:
         - benchmark-select
@@ -276,13 +381,15 @@
         content: |
           # Benchmark Results - {{ benchmark_timestamp }}
 
-          ## Model Selection
-          | Slot | Role | Model | Composite Score |
-          |------|------|-------|----------------|
-          | 1 | General (Primary) | {{ selection.slot1_general }} | {{ parsed_metrics[selection.slot1_general].general_composite | default('N/A') }} |
-          | 2 | General (Secondary) | {{ selection.slot2_general }} | {{ parsed_metrics[selection.slot2_general].general_composite | default('N/A') }} |
-          | 3 | Coding (Primary) | {{ selection.slot3_coding }} | {{ parsed_metrics[selection.slot3_coding].coding_composite | default('N/A') }} |
-          | 4 | Coding (Secondary) | {{ selection.slot4_coding }} | {{ parsed_metrics[selection.slot4_coding].coding_composite | default('N/A') }} |
+          ## Model Selection (6-slot / 2-socket)
+          | Slot | Socket | Role | Model | Composite Score |
+          |------|--------|------|-------|----------------|
+          | 1 | Node 1 (port 11434) | General (locked) | {{ selection.slot1_general }} | {{ parsed_metrics[selection.slot1_general].general_composite | default('N/A') }} |
+          | 2 | Node 1 (port 11434) | General (locked) | {{ selection.slot2_general }} | {{ parsed_metrics[selection.slot2_general].general_composite | default('N/A') }} |
+          | 5 | Node 1 (port 11434) | General (rotate) | {{ selection.slot5_general_rotate }} | {{ parsed_metrics[selection.slot5_general_rotate].general_composite | default('N/A') }} |
+          | 3 | Node 0 (port 11435) | Coding (locked) | {{ selection.slot3_coding }} | {{ parsed_metrics[selection.slot3_coding].coding_composite | default('N/A') }} |
+          | 4 | Node 0 (port 11435) | Coding (locked) | {{ selection.slot4_coding }} | {{ parsed_metrics[selection.slot4_coding].coding_composite | default('N/A') }} |
+          | 6 | Node 0 (port 11435) | Coding (rotate) | {{ selection.slot6_coding_rotate }} | {{ parsed_metrics[selection.slot6_coding_rotate].coding_composite | default('N/A') }} |
 
           ## Detailed Metrics
           {% for model, metrics in parsed_metrics.items() %}
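The Jinja block near the top of this playbook parses `ollama list` output into a name-to-gigabytes map. A Python equivalent of that parse, assuming the stock column layout where the SIZE column splits into a number token and a unit token (the model IDs and dates in the sample are illustrative only):

```python
def parse_sizes(stdout_lines: list[str]) -> dict[str, float]:
    """Map model name -> size in GB from `ollama list` output."""
    sizes = {}
    for line in stdout_lines[1:]:          # skip the header row
        parts = line.split()
        if len(parts) >= 4:
            value, unit = float(parts[2]), parts[3].upper()
            # Anything not labelled GB is treated as MB, as in the Jinja version
            sizes[parts[0]] = value if unit == "GB" else value / 1024
    return sizes

sample = [
    "NAME             ID            SIZE    MODIFIED",
    "llama3.2:3b      a80c4f17acd5  2.0 GB  2 days ago",
    "tinyllama:1.1b   2644915ede35  637 MB  3 weeks ago",
]
```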

+ 74 - 7
playbooks/04_models.yml

@@ -11,7 +11,9 @@
   vars:
     model_selection_file: "{{ playbook_dir }}/../benchmarks/results/model_selection.json"
     modelfiles_dir: /mnt/ai_data/ollama_models/modelfiles
-    slot4_model: ""
+    slot4_model: ""   # legacy override kept for backwards compatibility
+    slot5_model: ""   # overrides slot5_general_rotate
+    slot6_model: ""   # overrides slot6_coding_rotate
     ollama_api_key: "{{ lookup('community.hashi_vault.hashi_vault', vault_secret_prefix ~ '/ollama:api_key token=' ~ lookup('ansible.builtin.file', vault_token_file) ~ ' url=' ~ vault_url) }}"
 
   tasks:
@@ -38,13 +40,31 @@
       tags:
         - models-load
 
+    - name: "Models | Apply slot5 override if provided"
+      ansible.builtin.set_fact:
+        model_selection: "{{ model_selection | combine({'slot5_general_rotate': slot5_model}) }}"
+      when: slot5_model | length > 0
+      tags:
+        - models-load
+
+    - name: "Models | Apply slot6 override if provided"
+      ansible.builtin.set_fact:
+        model_selection: "{{ model_selection | combine({'slot6_coding_rotate': slot6_model}) }}"
+      when: slot6_model | length > 0
+      tags:
+        - models-load
+
     - name: "Models | Display selected models"
       ansible.builtin.debug:
         msg:
-          - "Slot 1 (General Primary):   {{ model_selection.slot1_general }}"
-          - "Slot 2 (General Secondary):  {{ model_selection.slot2_general }}"
-          - "Slot 3 (Coding Primary):    {{ model_selection.slot3_coding }}"
-          - "Slot 4 (Coding Secondary):  {{ model_selection.slot4_coding }}"
+          - "=== Node 1 — General (port 11434) ==="
+          - "Slot 1 (locked):  {{ model_selection.slot1_general }}"
+          - "Slot 2 (locked):  {{ model_selection.slot2_general }}"
+          - "Slot 5 (rotate):  {{ model_selection.slot5_general_rotate | default('none') }}"
+          - "=== Node 0 — Coding (port 11435) ==="
+          - "Slot 3 (locked):  {{ model_selection.slot3_coding }}"
+          - "Slot 4 (locked):  {{ model_selection.slot4_coding }}"
+          - "Slot 6 (rotate):  {{ model_selection.slot6_coding_rotate | default('none') }}"
       tags:
         - models-load
 
@@ -72,8 +92,10 @@
       loop:
         - "{{ model_selection.slot1_general }}"
         - "{{ model_selection.slot2_general }}"
+        - "{{ model_selection.slot5_general_rotate | default('none') }}"
         - "{{ model_selection.slot3_coding }}"
         - "{{ model_selection.slot4_coding }}"
+        - "{{ model_selection.slot6_coding_rotate | default('none') }}"
       when:
         - item | length > 0
         - item != 'none'
@@ -130,6 +152,20 @@
       tags:
         - models-modelfile
 
+    - name: "Models | Template coder-rotate Modelfile"
+      ansible.builtin.copy:
+        content: |
+          FROM {{ model_selection.slot6_coding_rotate }}
+          PARAMETER num_ctx 32768
+          SYSTEM You are an expert coding assistant. You write clean, efficient, well-documented code. Always include type hints and follow best practices.
+        dest: "{{ modelfiles_dir }}/Modelfile.coder-rotate"
+        mode: "0644"
+      when:
+        - model_selection.slot6_coding_rotate | default('') | length > 0
+        - model_selection.slot6_coding_rotate | default('none') != 'none'
+      tags:
+        - models-modelfile
+
     - name: "Models | Template llama-family Modelfile"
       ansible.builtin.copy:
         content: |
@@ -156,8 +192,9 @@
     - name: "Models | Register custom models with Ollama"
       ansible.builtin.command: "ollama create {{ item.name }} -f {{ modelfiles_dir }}/{{ item.file }}"
       loop:
-        - { name: "coder-128k", file: "Modelfile.coder-128k" }
-        - { name: "coder-32k",  file: "Modelfile.coder-32k",  slot: "{{ model_selection.slot4_coding }}" }
+        - { name: "coder-128k",   file: "Modelfile.coder-128k" }
+        - { name: "coder-32k",    file: "Modelfile.coder-32k",    slot: "{{ model_selection.slot4_coding }}" }
+        - { name: "coder-rotate", file: "Modelfile.coder-rotate", slot: "{{ model_selection.slot6_coding_rotate | default('none') }}" }
         - { name: "llama-family", file: "Modelfile.llama-family" }
         - { name: "gemma-family", file: "Modelfile.gemma-family" }
       when: item.slot is not defined or (item.slot | length > 0 and item.slot != 'none')
@@ -201,3 +238,33 @@
         state: started
       tags:
         - models-warmup
+
+    # ── Node0 warmup service ─────────────────────────────────────────
+    - name: "Models | Template node0 warmup script"
+      ansible.builtin.template:
+        src: "{{ playbook_dir }}/../templates/ollama/warmup-node0.sh.j2"
+        dest: /usr/local/bin/ollama-warmup-node0.sh
+        mode: "0755"
+        owner: root
+        group: root
+      tags:
+        - models-warmup
+
+    - name: "Models | Template node0 warmup systemd service"
+      ansible.builtin.template:
+        src: "{{ playbook_dir }}/../templates/systemd/ollama-warmup-node0.service.j2"
+        dest: /etc/systemd/system/ollama-warmup-node0.service
+        mode: "0644"
+        owner: root
+        group: root
+      tags:
+        - models-warmup
+
+    - name: "Models | Enable and start node0 warmup service"
+      ansible.builtin.systemd:
+        name: ollama-warmup-node0
+        enabled: true
+        state: started
+        daemon_reload: true
+      tags:
+        - models-warmup

+ 1 - 1
playbooks/07_openwebui.yml

@@ -90,7 +90,7 @@
         _openwebui_env: >-
           {{
             {
-              'OLLAMA_BASE_URL': 'http://host.docker.internal:11434',
+              'OLLAMA_BASE_URLS': 'http://host.docker.internal:11434;http://host.docker.internal:11435',
               'OLLAMA_API_KEY': ollama_api_key,
               'WEBUI_SECRET_KEY': openwebui_secret_key,
               'WEBUI_AUTH': 'true',

+ 21 - 0
playbooks/08_openclaw.yml

@@ -77,6 +77,27 @@
       tags:
         - openclaw-config
 
+    - name: "OpenClaw | Load model selection for model assignment"
+      ansible.builtin.slurp:
+        src: "{{ playbook_dir }}/../benchmarks/results/model_selection.json"
+      delegate_to: localhost
+      become: false
+      register: _model_sel_raw
+      ignore_errors: true
+      when: not skip_openclaw
+      tags:
+        - openclaw-config
+
+    - name: "OpenClaw | Set openclaw_model from benchmark slot 1 (best general)"
+      ansible.builtin.set_fact:
+        openclaw_model: "{{ (_model_sel_raw.content | b64decode | from_json).slot1_general }}"
+      when:
+        - not skip_openclaw
+        - _model_sel_raw is not failed
+        - _model_sel_raw.content is defined
+      tags:
+        - openclaw-config
+
     # ── Install Python dependencies ───────────────────────────────────
     - name: "OpenClaw | Install Python dependencies"
       ansible.builtin.pip:

+ 75 - 45
roles/models/README.md

@@ -2,79 +2,109 @@
 
 ## Purpose
 
-Manage the Ollama model lifecycle -- pulling models, creating custom Modelfile
-configurations, and running a warm-up service to ensure models are loaded into GPU
-memory at boot time.
+Manage the Ollama model lifecycle: pulling models, creating custom Modelfile
+configurations, and running warm-up services to ensure models are loaded into RAM
+at boot time across both NUMA instances.
 
-## Slot System
+## 6-Slot System
 
-| Slot | Role               | Selection Method                         |
-|------|--------------------|------------------------------------------|
-| 1    | Primary Coding     | Highest coding composite from benchmarks |
-| 2    | Primary General    | Highest general composite from benchmarks|
-| 3    | Secondary / Backup | Next-best overall average composite      |
-| 4    | Experimental       | Manual override via `-e slot4_model=<name>` |
+| Slot | Instance      | Port  | Role             | Selection                      | Rotation                    |
+|------|---------------|-------|------------------|--------------------------------|-----------------------------|
+| 1    | Node 1        | 11434 | General (locked) | Top general composite          | Re-benchmark only           |
+| 2    | Node 1        | 11434 | General (locked) | 2nd general composite          | Re-benchmark only           |
+| 5    | Node 1        | 11434 | General (rotate) | 3rd general composite          | `-e slot5_model=<name>`     |
+| 3    | Node 0        | 11435 | Coding (locked)  | Top coding composite           | Re-benchmark only           |
+| 4    | Node 0        | 11435 | Coding (locked)  | 2nd coding composite           | Re-benchmark only           |
+| 6    | Node 0        | 11435 | Coding (rotate)  | 3rd coding composite           | `-e slot6_model=<name>`     |
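The slot layout in the table can be sketched as a small routine. This is a hypothetical illustration only — the `assign_slots` helper and its padding rule are not code from the playbooks — assuming the rankings come from the benchmark's `general_ranking` and `coding_ranking` lists:

```python
# Hypothetical sketch of the 6-slot layout: the top three general-composite
# models fill slots 1/2/5 (Node 1), the top three coding-composite models
# fill slots 3/4/6 (Node 0). Rotate slots fall back to "none" when a
# ranking has fewer than three entries.

def assign_slots(general_ranking, coding_ranking):
    """Map ranked model lists onto the 6-slot layout; pad short lists with 'none'."""
    def top3(ranking):
        return (list(ranking) + ["none"] * 3)[:3]

    g, c = top3(general_ranking), top3(coding_ranking)
    return {
        "slot1_general": g[0],
        "slot2_general": g[1],
        "slot5_general_rotate": g[2],
        "slot3_coding": c[0],
        "slot4_coding": c[1],
        "slot6_coding_rotate": c[2],
    }

sel = assign_slots(
    ["llama3.1:8b", "mistral:latest", "llama3.2:3b"],
    ["deepseek-coder-v2:16b", "qwen2.5-coder:7b", "codegemma:7b"],
)
```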
 
 ## Slot Rotation
 
-To override slot 4 with a specific model at runtime:
+Rotate the general slot on Node 1 (port 11434):
 
 ```bash
-ansible-playbook playbooks/03_ollama.yml -e slot4_model=mistral:7b
+ansible-playbook playbooks/04_models.yml -K -e @local.yml -e "slot5_model=mistral:latest"
 ```
 
-Slots 1-3 are automatically assigned based on the latest benchmark results in
-`model_selection.json`. Slot 4 is always user-controlled.
+Rotate the coding slot on Node 0 (port 11435):
+
+```bash
+ansible-playbook playbooks/04_models.yml -K -e @local.yml -e "slot6_model=llama3.1:70b"
+```
+
+Both at once:
+
+```bash
+ansible-playbook playbooks/04_models.yml -K -e @local.yml \
+  -e "slot5_model=mistral:latest" -e "slot6_model=command-r:35b"
+```
+
+Reset both rotate slots back to benchmark recommendations:
+
+```bash
+ansible-playbook playbooks/04_models.yml -K -e @local.yml
+```
 
 ## Modelfile Configurations
 
-Custom Modelfile variants are created for fine-tuned context windows and use cases:
+Custom Modelfile variants are created for fine-tuned context windows:
+
+| Custom Model   | Base Slot           | Context | Port  | Use Case                       |
+|----------------|---------------------|---------|-------|--------------------------------|
+| `coder-128k`   | slot3_coding        | 32768   | 11435 | Primary coding (large context) |
+| `coder-32k`    | slot4_coding        | 32768   | 11435 | Secondary coding               |
+| `coder-rotate` | slot6_coding_rotate | 32768   | 11435 | Rotatable coding model         |
+| `llama-family` | llama3.2:3b         | 8192    | 11434 | Family-safe general assistant  |
+| `gemma-family` | llama3.1:8b         | 8192    | 11434 | Family-safe general assistant  |
+
+**These aliases are excluded from benchmarking** via `benchmark_skip_aliases` — their
+32k-token context allocations stall the benchmark loop with ~285-second responses.
+
+## Warm-up Services
+
+Two oneshot systemd services pre-load models after their respective Ollama instances start:
 
-| Custom Model          | Base Model           | Context Window | Use Case                    |
-|-----------------------|----------------------|----------------|-----------------------------|
-| `coding-primary`     | (slot 1 model)       | 32768          | Code generation and debugging |
-| `general-primary`    | (slot 2 model)       | 16384          | General conversation and reasoning |
-| `backup`             | (slot 3 model)       | 16384          | Fallback for either category |
-| `experimental`       | (slot 4 model)       | 8192           | Testing new models           |
+| Service                      | Warms               | Instance            |
+|------------------------------|---------------------|---------------------|
+| `ollama-warmup.service`      | slots 1, 2, 5       | Node 1 (port 11434) |
+| `ollama-warmup-node0.service`| slots 3, 4, 6       | Node 0 (port 11435) |
 
-## Warm-up Service
+`OLLAMA_KEEP_ALIVE=-1` keeps models pinned once loaded. The warmup services only
+need to run once after boot; subsequent requests hit already-loaded models immediately.
 
-The role deploys `ollama-warmup.service`, a oneshot systemd service that runs after
-`ollama.service` starts.
+Check warmup status:
 
-**Why it is needed:** Even though `OLLAMA_KEEP_ALIVE=-1` keeps models loaded in GPU
-memory indefinitely once loaded, Ollama does not automatically load models on
-startup. The warm-up service sends a minimal inference request to each slot model,
-triggering the initial load into GPU memory. Without this, the first user request
-to each model would experience a long delay while the model is loaded.
+```bash
+systemctl status ollama-warmup ollama-warmup-node0
+```
 
-The warm-up service:
+Re-run warmup manually (e.g. after rotating a slot):
 
-1. Waits for Ollama API to be healthy
-2. Sends a short prompt to each configured slot model
-3. Exits after all models are loaded
+```bash
+systemctl restart ollama-warmup          # Node 1 general models
+systemctl restart ollama-warmup-node0    # Node 0 coding models
+```
 
 ## model_selection.json
 
-The model selection file is read by this role to determine which models to assign to
-each slot. Schema:
+`playbooks/04_models.yml` reads `benchmarks/results/model_selection.json`:
 
 ```json
 {
-  "timestamp": "2025-01-15T10:30:00Z",
-  "slot1_coding": "qwen2.5-coder:14b",
-  "slot2_general": "llama3.1:8b",
-  "slot3_backup": "deepseek-coder-v2:16b",
-  "slot4_experimental": null
+  "slot1_general": "llama3.1:8b",
+  "slot2_general": "mistral:latest",
+  "slot5_general_rotate": "llama3.2:3b",
+  "slot3_coding": "deepseek-coder-v2:16b",
+  "slot4_coding": "qwen2.5-coder:7b",
+  "slot6_coding_rotate": "codegemma:7b",
+  "general_ranking": [...],
+  "coding_ranking": [...],
+  "all_metrics": { ... }
 }
 ```
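A minimal sketch of how the warm-up split described earlier could consume this file. Field names follow the schema above; the `warm_list` helper is illustrative, not code from the repo:

```python
import json

# Illustrative parsing of model_selection.json: split slot assignments into
# the two per-instance warm-up lists, skipping an unset rotate slot the same
# way the warm-up script templates do.

doc = json.loads("""{
  "slot1_general": "llama3.1:8b",
  "slot2_general": "mistral:latest",
  "slot5_general_rotate": "llama3.2:3b",
  "slot3_coding": "deepseek-coder-v2:16b",
  "slot4_coding": "qwen2.5-coder:7b",
  "slot6_coding_rotate": "none"
}""")

def warm_list(sel, locked_keys, rotate_key):
    models = [sel[k] for k in locked_keys]
    extra = sel.get(rotate_key, "")
    if extra and extra != "none":   # empty or "none" rotate slots are skipped
        models.append(extra)
    return models

node1 = warm_list(doc, ["slot1_general", "slot2_general"], "slot5_general_rotate")
node0 = warm_list(doc, ["slot3_coding", "slot4_coding"], "slot6_coding_rotate")
```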
 
-If `model_selection.json` does not exist (first run before benchmarks), the role
-falls back to default models defined in `group_vars/all.yml`.
-
 ## Tags
 
 ```bash
-ansible-playbook playbooks/site.yml --tags models
-ansible-playbook playbooks/site.yml --tags warmup
+ansible-playbook playbooks/site.yml --tags models -K -e @local.yml
+ansible-playbook playbooks/site.yml --tags models-warmup -K -e @local.yml
 ```

+ 52 - 41
roles/ollama/README.md

@@ -2,69 +2,80 @@
 
 ## Purpose
 
-Install, configure, and maintain the Ollama inference server on the AI server host.
+Install, configure, and maintain the Ollama inference servers on the AI server host.
+Two instances run simultaneously, one per NUMA node, to utilize both CPU sockets
+on the Dell M630 (2× E5-2690v4).
 
-## Installation
+## Instances
 
-Ollama is installed using the official install script, which places the binary at
-`/usr/local/bin/ollama` and creates a systemd service. The script handles both fresh
-installs and upgrades.
+| Service                | Port  | NUMA Node | CPUs (physical only) | RAM binding | Purpose          |
+|------------------------|-------|-----------|----------------------|-------------|------------------|
+| `ollama.service`       | 11434 | Node 1    | 1 3 5 … 27 (odd)     | `--membind=1` | General models |
+| `ollama-node0.service` | 11435 | Node 0    | 0 2 4 … 26 (even)    | `--membind=0` | Coding models  |
 
-## Environment Variables
+Both instances share the same model storage directory (`/mnt/ai_data/ollama_models`)
+and Ollama API key. Weights are loaded once into the NUMA node's memory; they are not
+duplicated between instances.
 
-Configuration is applied via a systemd drop-in override file at
-`/etc/systemd/system/ollama.service.d/override.conf`.
+## Configuration
 
-| Variable                  | Value              | Description                                      |
-|---------------------------|--------------------|--------------------------------------------------|
-| `OLLAMA_HOST`             | `0.0.0.0:11434`   | Listen on all interfaces, port 11434             |
-| `OLLAMA_MODELS`           | `/mnt/ai_data/ollama/models` | Model storage directory                |
-| `OLLAMA_KEEP_ALIVE`       | `-1`               | Keep models loaded in GPU memory indefinitely    |
-| `OLLAMA_NUM_PARALLEL`     | `4`                | Number of parallel inference requests            |
-| `OLLAMA_MAX_LOADED_MODELS`| `4`                | Maximum models loaded in GPU memory at once      |
-| `OLLAMA_API_KEY`          | (from Vault)       | API key for authentication                       |
-| `OLLAMA_FLASH_ATTENTION`  | `1`                | Enable Flash Attention for performance           |
-| `OLLAMA_CONTEXT_LENGTH`   | `32768`            | Default context window size                      |
+### Node 1 — systemd override
 
-## Override.conf Approach
+Applied via `/etc/systemd/system/ollama.service.d/override.conf` (templated from
+`templates/ollama/override.conf.j2`):
 
-Rather than modifying the upstream systemd unit file (which would be overwritten on
-upgrades), this role uses a systemd drop-in directory:
+| Variable                   | Value                        | Description                                      |
+|----------------------------|------------------------------|--------------------------------------------------|
+| `OLLAMA_API_KEY`           | (from Vault)                 | Shared key for all API requests                  |
+| `OLLAMA_HOST`              | `0.0.0.0:11434`              | Listen on all interfaces, port 11434             |
+| `OLLAMA_MODELS`            | `/mnt/ai_data/ollama_models` | Shared model storage                             |
+| `OLLAMA_KEEP_ALIVE`        | `-1`                         | Never unload models from RAM                     |
+| `OLLAMA_FLASH_ATTENTION`   | `1`                          | Fused softmax — ~20% less memory bandwidth       |
+| `OLLAMA_NUM_THREADS`       | `14`                         | Physical cores on NUMA node 1 only               |
+| `OLLAMA_NUM_PARALLEL`      | `2`                          | Concurrent inference streams per instance        |
+| `OLLAMA_MAX_LOADED_MODELS` | `3`                          | 3 models warm per instance (6 total)             |
+| `CPUAffinity`              | `1 3 5 … 27`                 | Odd CPUs = socket 1 physical cores               |
+| `ExecStart`                | `numactl --membind=1 ollama serve` | Pin memory allocations to Node 1 RAM        |
 
-```
-/etc/systemd/system/ollama.service.d/override.conf
-```
+### Node 0 — standalone systemd unit
+
+Deployed to `/etc/systemd/system/ollama-node0.service` (from
+`templates/ollama/ollama-node0.service.j2`). Uses the same variables but with:
 
-This ensures environment variables survive Ollama upgrades while keeping the
-upstream service file intact.
+| Variable   | Value           |
+|------------|-----------------|
+| `OLLAMA_HOST` | `0.0.0.0:11435` |
+| `CPUAffinity` | `0 2 4 … 26` |
+| `ExecStart`   | `numactl --membind=0 ollama serve` |
 
-## Why OLLAMA_API_KEY
+## NUMA Rationale
 
-Without an API key, anyone with network access to port 11434 can use the Ollama API
-to run inference, pull models, or delete models. Setting `OLLAMA_API_KEY` requires
-all API requests to include an `Authorization: Bearer <key>` header, preventing
-unauthenticated access.
+On the M630 with dual E5-2690v4:
+- **Node 1** (odd CPUs) has ~120 GB free RAM — assigned general models (larger)
+- **Node 0** (even CPUs) has ~75 GB free RAM — assigned coding models
+
+Without `numactl --membind`, the OS allocates model weights and KV cache across both
+nodes, causing cross-socket memory traffic (~40 GB/s vs ~68–75 GB/s local).
+`CPUAffinity` alone sets the scheduler; `numactl` sets the memory policy.
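The affinity lists above can be generated rather than typed out. A hypothetical helper, assuming the common enumeration on this platform where even CPU IDs land on socket 0 and odd IDs on socket 1, with physical cores 0-27 and hyperthread siblings 28-55 excluded:

```python
# Illustrative generator for the CPUAffinity values: even CPUs -> NUMA node 0
# (coding instance), odd CPUs -> NUMA node 1 (general instance). Only the 28
# physical cores are listed; hyperthread siblings are deliberately left out.

def physical_affinity(numa_node, physical_cpu_count=28):
    start = 0 if numa_node == 0 else 1
    return " ".join(str(cpu) for cpu in range(start, physical_cpu_count, 2))

node0_affinity = physical_affinity(0)  # "0 2 4 ... 26"
node1_affinity = physical_affinity(1)  # "1 3 5 ... 27"
```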
 
 ## OLLAMA_FLASH_ATTENTION
 
-Flash Attention is a GPU memory optimization that reduces memory usage and increases
-throughput for transformer inference. Setting `OLLAMA_FLASH_ATTENTION=1` enables
-this optimization for all models. This is a newer addition to Ollama and provides
-measurable performance improvements.
+Enables fused softmax kernel — reduces attention memory bandwidth by ~20% and improves
+throughput at all context lengths on AVX2 (E5-2690v4). Note: `OLLAMA_KV_CACHE_TYPE`
+is intentionally **not** set — q8_0 dequantization overhead regressed throughput on
+this CPU despite the bandwidth savings.
 
 ## Upgrade Procedure
 
-To upgrade Ollama to the latest version:
-
 ```bash
-ansible-playbook playbooks/03_ollama.yml
+ansible-playbook playbooks/02_infrastructure.yml -K -e @local.yml --tags ollama
 ```
 
-The official install script detects the existing installation and performs an
-in-place upgrade. The service is restarted after the upgrade.
+The official install script detects the existing installation and performs an in-place
+upgrade. Both `ollama.service` and `ollama-node0.service` are restarted.
 
 ## Tags
 
 ```bash
-ansible-playbook playbooks/site.yml --tags ollama
+ansible-playbook playbooks/site.yml --tags ollama -K -e @local.yml
 ```

+ 31 - 25
roles/openclaw/README.md

@@ -3,56 +3,62 @@
 ## Purpose
 
 Deploy OpenClaw, a Telegram bot that provides access to Ollama models via Telegram
-messaging.
+messaging. Always uses the best warm general-purpose model (`slot1_general` from the
+last benchmark run).
 
 ## Prerequisites
 
 - A Telegram bot token obtained from [@BotFather](https://t.me/BotFather)
 - The token must be stored in Vault at `{{ vault_secret_prefix }}/openclaw:telegram_token`
+- `benchmarks/results/model_selection.json` must exist (produced by `03_benchmark.yml`)
 
-## Installation
+## Model Selection
 
-1. Node.js 20 is installed on the target host
-2. OpenClaw is installed globally via `npm install -g openclaw`
-3. A systemd service (`openclaw.service`) is created for process management
+`08_openclaw.yml` reads `benchmarks/results/model_selection.json` at deploy time and
+sets `openclaw_model` to `slot1_general` — the highest-scoring general model that is
+always warm on the Node 1 instance (port 11434). This ensures the bot always uses the
+best available model without requiring manual updates after a benchmark run.
 
-## Configuration
+The fallback value (used when `model_selection.json` is absent) is set in
+`inventory/group_vars/all.yml` under `openclaw_model`.
 
-Config file location: `/mnt/ai_data/openclaw/config.yml`
+## Ollama Endpoint
 
-The configuration includes:
+OpenClaw connects to `localhost:11434` — the Node 1 general instance. Coding models on
+port 11435 are not accessible to the bot; they are reserved for IDE and API integrations.
 
-- Ollama API endpoint and authentication
-- Telegram bot token (read from Vault)
-- Default model selection
-- Allowed user IDs (if access control is needed)
+## Installation
 
-## Service
+1. Python 3 dependencies (`python-telegram-bot`, `requests`, `pyyaml`) are installed via `pip3`
+2. The bot script is deployed to `/mnt/ai_data/openclaw/bot.py`
+3. Config is templated to `/mnt/ai_data/openclaw/config.yml`
+4. A systemd service (`openclaw.service`) manages the process
 
-```
-/etc/systemd/system/openclaw.service
-```
+## Configuration
+
+Config file location: `/mnt/ai_data/openclaw/config.yml`
 
-The service runs as a systemd unit, automatically starting on boot and restarting
-on failure.
+The configuration includes:
+- Ollama API endpoint (`http://localhost:11434`) and API key (from Vault)
+- Telegram bot token (from Vault)
+- Model name (from `slot1_general`)
 
 ## Vault Integration
 
-The Telegram bot token is stored in Vault:
-
 - **Path:** `{{ vault_secret_prefix }}/openclaw`
 - **Key:** `telegram_token`
 
-The role reads the token from Vault at deploy time and writes it to the config file.
+The Telegram token is read from Vault at deploy time and written to the config file.
 
 ## Skipping Installation
 
-If no Telegram bot token is configured (the Vault secret is empty or absent),
-the OpenClaw installation is skipped entirely during `site.yml`. This allows
-running the full playbook without a Telegram bot token if the feature is not needed.
+If no Telegram bot token is configured (Vault secret absent or empty), the entire
+OpenClaw installation is skipped. This allows running `site.yml` without a Telegram
+bot token.
 
 ## Tags
 
 ```bash
-ansible-playbook playbooks/site.yml --tags openclaw
+ansible-playbook playbooks/site.yml --tags openclaw -K -e @local.yml
+ansible-playbook playbooks/08_openclaw.yml -K -e @local.yml
 ```

+ 36 - 39
roles/openwebui/README.md

@@ -2,30 +2,41 @@
 
 ## Purpose
 
-Deploy Open WebUI with full Ollama integration, RAG support via Qdrant, and SSO via
-Keycloak OIDC.
+Deploy Open WebUI with full Ollama integration across both NUMA instances, RAG support
+via Qdrant, and SSO via Keycloak OIDC.
+
+## Ollama Backend Configuration
+
+Open WebUI connects to **both** Ollama instances simultaneously via `OLLAMA_BASE_URLS`.
+It load-balances requests across them and presents models from both as a single unified
+list.
+
+| Instance      | Port  | Models              |
+|---------------|-------|---------------------|
+| Node 1        | 11434 | General (slots 1, 2, 5) |
+| Node 0        | 11435 | Coding (slots 3, 4, 6)  |
 
 ## Environment Variables
 
-| Variable                      | Value                                                        | Source      |
-|-------------------------------|--------------------------------------------------------------|-------------|
-| `OLLAMA_BASE_URL`             | `http://host.docker.internal:11434`                         | Hardcoded   |
-| `OLLAMA_API_KEY`              | (Ollama API key)                                             | Vault       |
-| `WEBUI_SECRET_KEY`            | (session signing key)                                        | Vault       |
-| `VECTOR_DB`                   | `qdrant`                                                     | Hardcoded   |
-| `QDRANT_URI`                  | `http://host.docker.internal:6333`                          | Hardcoded   |
-| `ENABLE_RAG_WEB_SEARCH`      | `true`                                                       | Hardcoded   |
-| `OAUTH_CLIENT_ID`            | `open-webui`                                                 | Hardcoded   |
-| `OAUTH_CLIENT_SECRET`        | (OIDC client secret)                                         | Vault       |
-| `OPENID_PROVIDER_URL`        | `https://idm.<domain>/realms/<keycloak_realm>/.well-known/openid-configuration` | Vault (keycloak_oidc_url) |
-| `OAUTH_PROVIDER_NAME`        | `{{ platform_name }}`                                        | group_vars  |
-| `ENABLE_OAUTH_SIGNUP`        | `true`                                                       | Hardcoded   |
-| `DEFAULT_USER_ROLE`          | `user`                                                       | Hardcoded   |
-| `WEBUI_NAME`                 | `{{ platform_name }}`                                        | group_vars  |
-| `ENABLE_OAUTH_ROLE_MANAGEMENT` | `true`                                                     | Hardcoded   |
-| `OAUTH_ROLES_CLAIM`          | `realm_access.roles`                                         | Hardcoded   |
-| `OAUTH_ALLOWED_ROLES`        | `ai-user,ai-admin`                                           | Hardcoded   |
-| `OAUTH_ADMIN_ROLES`          | `ai-admin`                                                   | Hardcoded   |
+| Variable                      | Value                                                                                     | Source      |
+|-------------------------------|-------------------------------------------------------------------------------------------|-------------|
+| `OLLAMA_BASE_URLS`            | `http://host.docker.internal:11434;http://host.docker.internal:11435`                    | Hardcoded   |
+| `OLLAMA_API_KEY`              | (Ollama API key)                                                                          | Vault       |
+| `RAG_OLLAMA_BASE_URL`         | `http://host.docker.internal:11434`                                                       | Hardcoded   |
+| `WEBUI_SECRET_KEY`            | (session signing key)                                                                     | Vault       |
+| `VECTOR_DB`                   | `qdrant`                                                                                  | Hardcoded   |
+| `QDRANT_URI`                  | `http://host.docker.internal:6333`                                                        | Hardcoded   |
+| `OAUTH_CLIENT_ID`             | `open-webui`                                                                              | Hardcoded   |
+| `OAUTH_CLIENT_SECRET`         | (OIDC client secret)                                                                      | Vault       |
+| `OPENID_PROVIDER_URL`         | `https://idm.<domain>/realms/<keycloak_realm>/.well-known/openid-configuration`           | Vault       |
+| `OAUTH_PROVIDER_NAME`         | `{{ platform_name }}`                                                                     | group_vars  |
+| `ENABLE_OAUTH_SIGNUP`         | `true`                                                                                    | Hardcoded   |
+| `ENABLE_OAUTH_ROLE_MANAGEMENT`| `true`                                                                                    | Hardcoded   |
+| `OAUTH_ROLES_CLAIM`           | `realm_access.roles`                                                                      | Hardcoded   |
+| `OAUTH_ALLOWED_ROLES`         | `ai-user,ai-admin`                                                                        | Hardcoded   |
+| `OAUTH_ADMIN_ROLES`           | `ai-admin`                                                                                | Hardcoded   |
+| `DEFAULT_MODELS`              | `llama-family`                                                                            | Hardcoded   |
+| `WEBUI_NAME`                  | `{{ platform_name }}`                                                                     | group_vars  |
 
 ## OIDC Setup
 
@@ -38,22 +49,12 @@ Open WebUI uses Keycloak as its OIDC provider:
 ## RAG
 
 - **Vector DB:** Qdrant at `http://host.docker.internal:6333`
-- **Web search:** enabled via `ENABLE_RAG_WEB_SEARCH=true`
-- Users can upload documents through the Open WebUI interface for RAG-augmented
-  conversations
-
-## Model Access
-
-Open WebUI connects to Ollama at `http://host.docker.internal:11434` (the Docker
-host network). The `OLLAMA_API_KEY` environment variable authenticates API requests
-to the Ollama server.
+- `RAG_OLLAMA_BASE_URL` is pinned to port 11434 (Node 1) for embedding requests —
+  keeping RAG on a single stable endpoint avoids split-brain embedding indices
+- Users can upload documents through the Open WebUI interface for RAG-augmented conversations
 
 ## SSO
 
-Users see a "Sign in with {{ platform_name }}" button on the login page. Clicking it
-redirects to the Keycloak login page for the `{{ keycloak_realm }}` realm. After
-authentication, users are redirected back to Open WebUI.
-
 Access is restricted by Keycloak realm role:
 
 | Keycloak role | Open WebUI access      |
@@ -62,12 +63,8 @@ Access is restricted by Keycloak realm role:
 | `ai-admin`    | ✅ Admin               |
 | *(none)*      | ❌ Login blocked       |
 
-New users who authenticate via SSO are automatically created. Their Open WebUI role
-is set based on `OAUTH_ADMIN_ROLES` — users with `ai-admin` get admin access,
-all others get standard user access.
-
 ## Tags
 
 ```bash
-ansible-playbook playbooks/site.yml --tags openwebui
+ansible-playbook playbooks/site.yml --tags openwebui -K -e @local.yml
 ```

+ 26 - 0
templates/ollama/ollama-node0.service.j2

@@ -0,0 +1,26 @@
+[Unit]
+Description=Ollama Service — NUMA Node 0 (Coding Models)
+After=network-online.target ollama.service
+Wants=network-online.target
+
+[Service]
+ExecStart=/usr/bin/numactl --membind=0 {{ ollama_binary_path }} serve
+Environment="OLLAMA_API_KEY={{ ollama_api_key }}"
+Environment="OLLAMA_HOST=0.0.0.0:{{ ollama_node0_port }}"
+Environment="OLLAMA_MODELS={{ ollama_models_path }}"
+Environment="OLLAMA_KEEP_ALIVE={{ ollama_keep_alive }}"
+Environment="OLLAMA_FLASH_ATTENTION={{ ollama_flash_attention }}"
+Environment="OLLAMA_NUM_THREADS={{ ollama_num_threads }}"
+Environment="OLLAMA_NUM_PARALLEL={{ ollama_num_parallel }}"
+Environment="OLLAMA_MAX_LOADED_MODELS={{ ollama_max_loaded_models }}"
+CPUAffinity={{ ollama_node0_cpu_affinity }}
+LimitMEMLOCK=infinity
+LimitNOFILE=65535
+OOMScoreAdjust=-500
+Restart=always
+RestartSec=3
+User=ollama
+Group=ollama
+
+[Install]
+WantedBy=multi-user.target

+ 1 - 7
templates/ollama/override.conf.j2

@@ -9,12 +9,6 @@ Environment="OLLAMA_KEEP_ALIVE=-1"
 # Flash attention: fused softmax, ~20% less memory bandwidth, faster on AVX2
 Environment="OLLAMA_FLASH_ATTENTION=1"
 
-# KV cache quantization: q8_0 halves KV cache memory vs fp16.
-# Attention reads dominate memory bandwidth at long contexts; smaller KV =
-# fewer bytes transferred per token generated. q8_0 over q4_0: negligible
-# quality loss vs significant noise at long contexts with q4_0.
-Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
-
 # Threads: 14 physical cores on NUMA node 1 only (no hyperthreads).
 # LLM inference is memory-bandwidth-bound; HT siblings share the same memory
 # pipeline and add scheduling overhead without adding bandwidth.
@@ -24,7 +18,7 @@ Environment="OLLAMA_NUM_THREADS={{ ollama_num_threads }}"
 # Keeps per-request throughput high for interactive/single-user workloads.
 Environment="OLLAMA_NUM_PARALLEL={{ ollama_num_parallel }}"
 
-# Keep 4 models warm in RAM (KEEP_ALIVE=-1 means never unload)
+# Keep 3 models warm in RAM per instance (KEEP_ALIVE=-1 means never unload; 6 total across both sockets)
 Environment="OLLAMA_MAX_LOADED_MODELS={{ ollama_max_loaded_models }}"
 
 # ── NUMA / CPU binding ────────────────────────────────────────────────────

+ 28 - 0
templates/ollama/warmup-node0.sh.j2

@@ -0,0 +1,28 @@
+#!/bin/bash
+# Ollama Node 0 model warm-up script (coding models, port {{ ollama_node0_port }})
+# Sends a 1-token generation to each slot model to pin them in RAM
+
+set -e
+
+OLLAMA_URL="http://localhost:{{ ollama_node0_port }}"
+API_KEY="{{ ollama_api_key }}"
+
+warmup_model() {
+    local model="$1"
+    echo "[warmup-node0] Loading: $model"
+    curl -sf -X POST "${OLLAMA_URL}/api/generate" \
+        -H "Authorization: Bearer ${API_KEY}" \
+        -H "Content-Type: application/json" \
+        -d "{\"model\":\"${model}\",\"prompt\":\"Hi\",\"stream\":false,\"options\":{\"num_predict\":1}}" \
+        > /dev/null || echo "[warmup-node0] Warning: failed to warm up ${model}"
+    echo "[warmup-node0] Done: $model"
+}
+
+warmup_model "{{ model_selection.slot3_coding }}"
+warmup_model "{{ model_selection.slot4_coding }}"
+{% if model_selection.slot6_coding_rotate | default('') | length > 0
+      and model_selection.slot6_coding_rotate | default('none') != 'none' %}
+warmup_model "{{ model_selection.slot6_coding_rotate }}"
+{% endif %}
+
+echo "[warmup-node0] All Node 0 coding models warmed up."

+ 4 - 4
templates/ollama/warmup.sh.j2

@@ -20,9 +20,9 @@ warmup_model() {
 
 warmup_model "{{ model_selection.slot1_general }}"
 warmup_model "{{ model_selection.slot2_general }}"
-warmup_model "{{ model_selection.slot3_coding }}"
-{% if model_selection.slot4_coding | length > 0 and model_selection.slot4_coding != 'none' %}
-warmup_model "{{ model_selection.slot4_coding }}"
+{% if model_selection.slot5_general_rotate | default('') | length > 0
+      and model_selection.slot5_general_rotate | default('none') != 'none' %}
+warmup_model "{{ model_selection.slot5_general_rotate }}"
 {% endif %}
 
-echo "[warmup] All models warmed up."
+echo "[warmup] All Node 1 general models warmed up."

+ 14 - 0
templates/systemd/ollama-warmup-node0.service.j2

@@ -0,0 +1,14 @@
+[Unit]
+Description=Ollama Model Warm-Up — Node 0 (Coding)
+After=ollama-node0.service
+Requires=ollama-node0.service
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/usr/local/bin/ollama-warmup-node0.sh
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=multi-user.target