Tune Ollama performance: fix HT regression, add OS tuning, KV cache quant

- Remove stale play-level vars from 02_infrastructure.yml that overrode
  group_vars/all.yml and silently re-enabled 28 HT threads + full HT
  affinity list on every site.yml run; correct values (14 physical cores,
  physical-only CPU affinity) now flow exclusively from group_vars
- Add os-tune task block: sysctl (numa_balancing=0, swappiness=1,
  overcommit_memory=1), THP=madvise, CPU governor=performance; each
  setting persisted via /etc/sysctl.d/ or a oneshot systemd service
- Add OLLAMA_KV_CACHE_TYPE=q8_0 to override.conf.j2; halves KV cache
  size (and thus memory bandwidth per token) vs fp16 with negligible
  quality loss
- Promote ollama thread/affinity config to group_vars with corrected
  physical-core-only values; add ollama_binary_path var
- Refine benchmark scoring: per-prompt quality weights for debug and
  refactor prompts; update the toks_norm_ceiling comment to the observed
  22.5 tok/sec max
- Add baseline_models group var; use it in 04_models.yml instead of
  hardcoded list; fix gemma-family Modelfile to use llama3.1:8b
- Add optional AWS Bedrock OpenAI-compatible API integration to
  07_openwebui.yml; token stored/retrieved from Vault, conditionally
  wired into Open WebUI container env
- Commit latest benchmark runs and updated model_selection.json
  (gemma3:12b added to general pool, slot2_general populated)
Shaun Arman committed 5 days ago
commit fb7c5f1061
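The KV-cache claim in the message above can be sanity-checked with back-of-envelope arithmetic. This is a sketch only: the layer/head/context figures below are assumed for a llama3.1:8b-class model with grouped-query attention, not taken from this commit.

```python
# Back-of-envelope KV-cache sizing. Values are ASSUMED for a
# llama3.1:8b-class model: 32 layers, 8 KV heads, head_dim 128.
def kv_cache_bytes(layers, kv_heads, head_dim, ctx, bytes_per_elem):
    # 2x for the K and V tensors in every layer
    return 2 * layers * kv_heads * head_dim * ctx * bytes_per_elem

fp16 = kv_cache_bytes(32, 8, 128, 8192, 2)  # fp16: 2 bytes/element
q8_0 = kv_cache_bytes(32, 8, 128, 8192, 1)  # q8_0: ~1 byte/element
                                            # (plus small per-block scale overhead)

print(fp16 // 2**20, "MiB fp16")  # 1024 MiB fp16
print(q8_0 // 2**20, "MiB q8_0")  # 512 MiB q8_0
```

Since attention reads the whole cache every generated token, halving its size also roughly halves that component of memory traffic, which is what the bullet claims.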

+ 17 - 0
benchmarks/results/benchmark_20260307T174357.md

@@ -0,0 +1,17 @@
+# Benchmark Results - 20260307T174357
+
+## Model Selection
+| Slot | Role | Model | Composite Score |
+|------|------|-------|----------------|
+| 1 | General (Primary) | none | N/A |
+| 2 | General (Secondary) | none | N/A |
+| 3 | Coding (Primary) | none | N/A |
+| 4 | Coding (Secondary) | none | N/A |
+
+## Detailed Metrics
+
+## Scoring Formula
+- Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
+- Speed normalized against 22 tok/sec ceiling (hardware-observed max)
+- Coding quality: has_def×0.20 + has_return×0.20 + has_docstring×0.15 + has_type_hint×0.15 + has_code_block×0.10 + has_assert×0.08 + has_test_def×0.07 + has_import×0.05
+- Category: override dict → quality delta (coding_avg - general_avg >= 0.1) → name pattern (coder/codestral/codellama/starcoder) → general

+ 62 - 0
benchmarks/results/benchmark_20260307T175402.md

@@ -0,0 +1,62 @@
+# Benchmark Results - 20260307T175402
+
+## Model Selection
+
+
+| Slot | Role                | Model                    | Composite Score |
+| ---- | ------------------- | ------------------------ | --------------- |
+| 1    | General (Primary)   | llama3.2:3b              | 0.961           |
+| 2    | General (Secondary) | gemma3:12b-it-q4_K_M     | 0.495           |
+| 3    | Coding (Primary)    | deepseek-coder-v2:latest | 0.764           |
+| 4    | Coding (Secondary)  | qwen2.5-coder:7b         | 0.664           |
+
+
+## Detailed Metrics
+
+### deepseek-coder-v2:latest
+
+- **Category**: coding
+- **Coding Quality**: 0.657
+- **General Quality**: 0.886
+- **Avg Tokens/sec**: 21.6
+- **Latency (ms)**: 1510.5
+- **Coding Composite**: 0.764
+- **General Composite**: 0.867
+
+### qwen2.5-coder:7b
+
+- **Category**: coding
+- **Coding Quality**: 0.683
+- **General Quality**: 0.888
+- **Avg Tokens/sec**: 12.3
+- **Latency (ms)**: 1222.4
+- **Coding Composite**: 0.664
+- **General Composite**: 0.756
+
+### gemma3:12b-it-q4_K_M
+
+- **Category**: general
+- **Coding Quality**: 0.757
+- **General Quality**: 0.931
+- **Avg Tokens/sec**: 5.6
+- **Latency (ms)**: 5975.8
+- **Coding Composite**: 0.416
+- **General Composite**: 0.495
+
+### llama3.2:3b
+
+- **Category**: general
+- **Coding Quality**: 0.723
+- **General Quality**: 0.979
+- **Avg Tokens/sec**: 22.5
+- **Latency (ms)**: 580.7
+- **Coding Composite**: 0.846
+- **General Composite**: 0.961
+
+## Scoring Formula
+
+- Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
+- Speed normalized against 22 tok/sec ceiling (hardware-observed max)
+- Coding quality: has_def×0.20 + has_return×0.20 + has_docstring×0.15 + has_type_hint×0.15 + has_code_block×0.10 + has_assert×0.08 + has_test_def×0.07 + has_import×0.05
+- Category: override dict → quality delta (coding_avg - general_avg >= 0.1) → name pattern (coder/codestral/codellama/starcoder) → general
+
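The composite formula in these reports can be sketched as below. The latency_score derivation is not part of this diff, so it is taken as an input; the sample figures are qwen2.5-coder:7b's coding metrics from the run above.

```python
# Sketch of the benchmark composite score. latency_score computation
# is not shown in this commit, so it is an input here.
TOKS_CEILING = 22.0  # hardware-observed max (benchmark_toks_norm_ceiling)

def composite(quality, tok_per_sec, latency_score):
    toks_norm = min(tok_per_sec / TOKS_CEILING, 1.0)  # capped at 1.0
    return quality * 0.45 + toks_norm * 0.30 + latency_score * 0.25

# qwen2.5-coder:7b coding figures from the detailed metrics above:
print(round(composite(0.683, 12.3, 0.756), 3))  # 0.664
```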

+ 77 - 51
benchmarks/results/model_selection.json

@@ -1,90 +1,116 @@
 {
     "all_metrics": {
-        "deepseek-coder-v2": {
-            "avg_tok_per_sec": 20.2,
+        "deepseek-coder-v2:latest": {
+            "avg_tok_per_sec": 21.6,
             "category": "coding",
-            "coding_composite": 0.738,
-            "coding_quality": 0.667,
-            "general_composite": 0.852,
-            "general_quality": 0.918,
-            "latency_ms": 1744.5,
-            "latency_score": 0.651,
-            "toks_norm": 0.919
+            "coding_composite": 0.764,
+            "coding_quality": 0.657,
+            "general_composite": 0.867,
+            "general_quality": 0.886,
+            "latency_ms": 1510.5,
+            "latency_score": 0.698,
+            "toks_norm": 0.982
+        },
+        "gemma3:12b-it-q4_K_M": {
+            "avg_tok_per_sec": 5.6,
+            "category": "general",
+            "coding_composite": 0.416,
+            "coding_quality": 0.757,
+            "general_composite": 0.495,
+            "general_quality": 0.931,
+            "latency_ms": 5975.8,
+            "latency_score": 0,
+            "toks_norm": 0.253
         },
         "llama3.2:3b": {
             "avg_tok_per_sec": 22.5,
             "category": "general",
-            "coding_composite": 0.794,
-            "coding_quality": 0.607,
-            "general_composite": 0.967,
-            "general_quality": 0.991,
-            "latency_ms": 576.1,
-            "latency_score": 0.885,
+            "coding_composite": 0.846,
+            "coding_quality": 0.723,
+            "general_composite": 0.961,
+            "general_quality": 0.979,
+            "latency_ms": 580.7,
+            "latency_score": 0.884,
             "toks_norm": 1.0
         },
         "qwen2.5-coder:7b": {
-            "avg_tok_per_sec": 11.2,
+            "avg_tok_per_sec": 12.3,
             "category": "coding",
-            "coding_composite": 0.63,
-            "coding_quality": 0.64,
-            "general_composite": 0.757,
-            "general_quality": 0.922,
-            "latency_ms": 1211.5,
-            "latency_score": 0.758,
-            "toks_norm": 0.509
+            "coding_composite": 0.664,
+            "coding_quality": 0.683,
+            "general_composite": 0.756,
+            "general_quality": 0.888,
+            "latency_ms": 1222.4,
+            "latency_score": 0.756,
+            "toks_norm": 0.56
         }
     },
     "coding_ranking": [
         {
-            "composite": 0.738,
+            "composite": 0.764,
             "metrics": {
-                "avg_tok_per_sec": 20.2,
+                "avg_tok_per_sec": 21.6,
                 "category": "coding",
-                "coding_composite": 0.738,
-                "coding_quality": 0.667,
-                "general_composite": 0.852,
-                "general_quality": 0.918,
-                "latency_ms": 1744.5,
-                "latency_score": 0.651,
-                "toks_norm": 0.919
+                "coding_composite": 0.764,
+                "coding_quality": 0.657,
+                "general_composite": 0.867,
+                "general_quality": 0.886,
+                "latency_ms": 1510.5,
+                "latency_score": 0.698,
+                "toks_norm": 0.982
             },
-            "name": "deepseek-coder-v2"
+            "name": "deepseek-coder-v2:latest"
         },
         {
-            "composite": 0.63,
+            "composite": 0.664,
             "metrics": {
-                "avg_tok_per_sec": 11.2,
+                "avg_tok_per_sec": 12.3,
                 "category": "coding",
-                "coding_composite": 0.63,
-                "coding_quality": 0.64,
-                "general_composite": 0.757,
-                "general_quality": 0.922,
-                "latency_ms": 1211.5,
-                "latency_score": 0.758,
-                "toks_norm": 0.509
+                "coding_composite": 0.664,
+                "coding_quality": 0.683,
+                "general_composite": 0.756,
+                "general_quality": 0.888,
+                "latency_ms": 1222.4,
+                "latency_score": 0.756,
+                "toks_norm": 0.56
             },
             "name": "qwen2.5-coder:7b"
         }
     ],
     "general_ranking": [
         {
-            "composite": 0.967,
+            "composite": 0.961,
             "metrics": {
                 "avg_tok_per_sec": 22.5,
                 "category": "general",
-                "coding_composite": 0.794,
-                "coding_quality": 0.607,
-                "general_composite": 0.967,
-                "general_quality": 0.991,
-                "latency_ms": 576.1,
-                "latency_score": 0.885,
+                "coding_composite": 0.846,
+                "coding_quality": 0.723,
+                "general_composite": 0.961,
+                "general_quality": 0.979,
+                "latency_ms": 580.7,
+                "latency_score": 0.884,
                 "toks_norm": 1.0
             },
             "name": "llama3.2:3b"
+        },
+        {
+            "composite": 0.495,
+            "metrics": {
+                "avg_tok_per_sec": 5.6,
+                "category": "general",
+                "coding_composite": 0.416,
+                "coding_quality": 0.757,
+                "general_composite": 0.495,
+                "general_quality": 0.931,
+                "latency_ms": 5975.8,
+                "latency_score": 0,
+                "toks_norm": 0.253
+            },
+            "name": "gemma3:12b-it-q4_K_M"
         }
     ],
     "slot1_general": "llama3.2:3b",
-    "slot2_general": "llama3.2:3b",
-    "slot3_coding": "deepseek-coder-v2",
+    "slot2_general": "gemma3:12b-it-q4_K_M",
+    "slot3_coding": "deepseek-coder-v2:latest",
     "slot4_coding": "qwen2.5-coder:7b"
 }
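The slot assignments in the JSON above (and the previous run's duplicated slot2) are consistent with a take-top-two-with-fallback rule. A sketch of that assumed logic, since the selection code itself is not in this diff:

```python
# ASSUMED slot-assignment logic implied by model_selection.json:
# top two of each ranking, falling back to the top entry when a
# ranking has fewer than two candidates.
def assign_slots(general_ranking, coding_ranking):
    def pick(ranking, idx):
        return ranking[min(idx, len(ranking) - 1)]["name"]
    return {
        "slot1_general": pick(general_ranking, 0),
        "slot2_general": pick(general_ranking, 1),
        "slot3_coding": pick(coding_ranking, 0),
        "slot4_coding": pick(coding_ranking, 1),
    }

general = [{"name": "llama3.2:3b"}, {"name": "gemma3:12b-it-q4_K_M"}]
coding = [{"name": "deepseek-coder-v2:latest"}, {"name": "qwen2.5-coder:7b"}]
print(assign_slots(general, coding)["slot2_general"])  # gemma3:12b-it-q4_K_M
```

The fallback also explains the old file, where slot2_general repeated llama3.2:3b when only one general model had been benchmarked.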

+ 22 - 5
inventory/group_vars/all.yml

@@ -56,16 +56,20 @@ qdrant_grpc_port: 6334
 
 # Ollama configuration
 ollama_host: "0.0.0.0:11434"
-ollama_num_threads: 28
-ollama_num_parallel: 4
+ollama_num_threads: 14
+ollama_num_parallel: 2
 ollama_max_loaded_models: 4
 ollama_keep_alive: "-1"
 ollama_flash_attention: "1"
 
 # NUMA/CPU affinity - Dell M630, 2x E5-2690v4
-# NUMA node 1 (odd CPUs) has ~120 GB free RAM vs node 0's ~75 GB
+# CPUs are interleaved: odd = socket 1 (NUMA node 1), even = socket 0.
+# Physical cores on node 1: 1,3,...,27 (14 cores). HT siblings: 29,31,...,55.
+# Pinning to physical cores only eliminates HT contention on the memory bus.
+# NUMA node 1 has ~120 GB free RAM vs node 0's ~75 GB.
 ollama_numa_node: "1"
-ollama_cpu_affinity: "1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51 53 55"
+ollama_cpu_affinity: "1 3 5 7 9 11 13 15 17 19 21 23 25 27"
+ollama_binary_path: /usr/bin/ollama
 
 # Keycloak configuration
 keycloak_realm: "{{ vault_project_slug }}"
@@ -81,7 +85,7 @@ benchmark_thresholds:
   min_quality_score: 0.6
   min_composite_score: 0.55
 
-benchmark_toks_norm_ceiling: 22     # Observed hardware max on Dell M630 (21.8 tok/sec measured)
+benchmark_toks_norm_ceiling: 22     # Observed hardware max on Dell M630 (22.5 tok/sec measured)
 benchmark_coding_threshold: 0.10    # Delta to classify a model as coding-specialized
 
 # Explicit category overrides applied before heuristics. Keys are model names as
@@ -89,6 +93,14 @@ benchmark_coding_threshold: 0.10    # Delta to classify a model as coding-specia
 # Example: { "deepseek-coder-v2": "coding", "qwen2.5-coder:7b": "coding" }
 model_category_overrides: {}
 
+# Baseline models — always pulled before benchmarking regardless of model_selection.json.
+# These are the minimum set needed to populate all 4 slots with meaningful candidates.
+baseline_models:
+  - "llama3.2:3b"
+  - "deepseek-coder-v2"
+  - "qwen2.5-coder:7b"
+  - "llama3.1:8b"
+
 # Candidate models to recommend/pull if benchmark scores are below threshold
 candidate_models:
   - name: "qwen2.5-coder:32b-instruct-q4_K_M"
@@ -115,6 +127,11 @@ candidate_models:
 # OpenClaw default model
 openclaw_model: "llama3.2:3b"
 
+# AWS Bedrock (OpenAI-compatible API via Open WebUI)
+# Pass bearer_token on first run: -e "bedrock_bearer_token=<value>"
+# To rotate: re-run with the new token value.
+bedrock_aws_region: "us-east-1"
+
 # NGINX SSL certificate paths (on nginx_proxy)
 nginx_ssl_cert: "/etc/nginx/ssl/{{ domain }}.crt"
 nginx_ssl_key: "/etc/nginx/ssl/{{ domain }}.key"
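The corrected affinity string follows mechanically from the interleaved topology described in the comment block above (odd CPUs 1..27 are node 1's 14 physical cores; 29..55 are their HT siblings):

```python
# Derive the physical-core-only affinity list for NUMA node 1 on the
# interleaved E5-2690v4 layout: odd CPUs 1..27, HT siblings excluded.
node1_physical = list(range(1, 28, 2))  # 14 physical cores
ollama_cpu_affinity = " ".join(map(str, node1_physical))
print(ollama_cpu_affinity)  # 1 3 5 7 9 11 13 15 17 19 21 23 25 27
```

On a live host, `lscpu -e=CPU,NODE,CORE` is a quick way to confirm the CPU-to-node mapping before trusting a hardcoded list like this.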

+ 104 - 6
playbooks/02_infrastructure.yml

@@ -11,12 +11,6 @@
   vars:
     vault_token_file: "{{ playbook_dir }}/../vault/.vault-token"
     vault_url: "http://{{ ai_server_ip }}:{{ vault_port }}"
-    ollama_num_threads: 28
-    ollama_num_parallel: 4
-    ollama_max_loaded_models: 4
-    ollama_keep_alive: "-1"
-    ollama_numa_node: "1"
-    ollama_cpu_affinity: "1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51 53 55"
 
   pre_tasks:
     - name: "Infrastructure | Install Python Docker SDK prerequisites"
@@ -24,6 +18,7 @@
         name:
           - python3-pip
           - python3-requests
+          - numactl
         state: present
       tags: always
 
@@ -160,9 +155,112 @@
       tags:
         - ollama
 
+    # ── OS-level kernel tuning for dedicated inference server ────────────────
+    - name: "OS Tune | Apply sysctl settings for inference workload"
+      ansible.posix.sysctl:
+        name: "{{ item.name }}"
+        value: "{{ item.value }}"
+        sysctl_file: /etc/sysctl.d/99-ollama-perf.conf
+        reload: true
+        state: present
+      loop:
+        # Disable auto-NUMA migration — fights explicit numactl --membind=1 by
+        # moving KV-cache pages mid-inference to a different NUMA node.
+        - { name: kernel.numa_balancing, value: "0" }
+        # Near-zero swappiness: prevents model weights being paged out under
+        # memory pressure (complements LimitMEMLOCK=infinity in the unit file).
+        - { name: vm.swappiness, value: "1" }
+        # Required for mlock to succeed without reservation failures.
+        - { name: vm.overcommit_memory, value: "1" }
+      tags:
+        - os-tune
+
+    - name: "OS Tune | Set Transparent Huge Pages to madvise (immediate)"
+      ansible.builtin.shell:
+        cmd: echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
+      changed_when: true
+      tags:
+        - os-tune
+
+    - name: "OS Tune | Install THP madvise persistence service"
+      ansible.builtin.copy:
+        dest: /etc/systemd/system/thp-madvise.service
+        mode: "0644"
+        owner: root
+        group: root
+        content: |
+          [Unit]
+          Description=Set Transparent Huge Pages to madvise
+          After=local-fs.target
+
+          [Service]
+          Type=oneshot
+          ExecStart=/bin/sh -c 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'
+          RemainAfterExit=yes
+
+          [Install]
+          WantedBy=multi-user.target
+      notify:
+        - Reload systemd daemon
+      tags:
+        - os-tune
+
+    - name: "OS Tune | Enable THP madvise persistence service"
+      ansible.builtin.systemd:
+        name: thp-madvise.service
+        enabled: true
+        daemon_reload: false
+      tags:
+        - os-tune
+
+    - name: "OS Tune | Set CPU governor to performance (immediate)"
+      ansible.builtin.shell:
+        cmd: |
+          for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
+            [ -f "$gov" ] && echo performance > "$gov"
+          done
+      changed_when: true
+      tags:
+        - os-tune
+
+    - name: "OS Tune | Install CPU performance governor persistence service"
+      ansible.builtin.copy:
+        dest: /etc/systemd/system/cpu-performance.service
+        mode: "0644"
+        owner: root
+        group: root
+        content: |
+          [Unit]
+          Description=Set CPU scaling governor to performance
+          After=local-fs.target
+
+          [Service]
+          Type=oneshot
+          ExecStart=/bin/sh -c 'for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do [ -f "$gov" ] && echo performance > "$gov"; done'
+          RemainAfterExit=yes
+
+          [Install]
+          WantedBy=multi-user.target
+      notify:
+        - Reload systemd daemon
+      tags:
+        - os-tune
+
+    - name: "OS Tune | Enable CPU performance governor persistence service"
+      ansible.builtin.systemd:
+        name: cpu-performance.service
+        enabled: true
+        daemon_reload: false
+      tags:
+        - os-tune
+
   handlers:
     - name: Reload systemd and restart ollama
       ansible.builtin.systemd:
         name: ollama
         state: restarted
         daemon_reload: true
+
+    - name: Reload systemd daemon
+      ansible.builtin.systemd:
+        daemon_reload: true
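One way to verify the os-tune tasks took effect: the THP sysfs file reports all modes with the active one in brackets (e.g. `always [madvise] never`). A minimal parser for that format, as a sketch:

```python
# Parse the bracketed-active-mode format used by
# /sys/kernel/mm/transparent_hugepage/enabled, e.g. "always [madvise] never".
def active_thp_mode(contents: str) -> str:
    return contents.split("[", 1)[1].split("]", 1)[0]

print(active_thp_mode("always [madvise] never"))  # madvise
```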

+ 11 - 2
playbooks/03_benchmark.yml

@@ -172,7 +172,13 @@
           {%         set has_type_hint = 1 if ' -> ' in response_text else 0 %}
           {%         set has_code_block = 1 if '```' in response_text else 0 %}
           {%         set has_import = 1 if ('import ' in response_text or 'from ' in response_text) else 0 %}
-          {%         set quality = (has_def * 0.20 + has_return * 0.20 + has_docstring * 0.15 + has_type_hint * 0.15 + has_code_block * 0.10 + has_assert * 0.08 + has_test_def * 0.07 + has_import * 0.05) %}
+          {%         if test_name == 'code_gen' %}
+          {%           set quality = (has_def * 0.20 + has_return * 0.20 + has_docstring * 0.15 + has_type_hint * 0.15 + has_code_block * 0.10 + has_assert * 0.08 + has_test_def * 0.07 + has_import * 0.05) %}
+          {%         elif test_name == 'debug' %}
+          {%           set quality = (has_def * 0.30 + has_return * 0.30 + has_code_block * 0.25 + has_assert * 0.15) %}
+          {%         else %}
+          {%           set quality = (has_def * 0.25 + has_return * 0.25 + has_code_block * 0.20 + has_type_hint * 0.15 + has_import * 0.15) %}
+          {%         endif %}
           {%         set ns2.coding_quality = ns2.coding_quality + quality %}
           {%         set ns2.coding_count = ns2.coding_count + 1 %}
           {%       elif test_name in ['explain', 'creative', 'reasoning'] %}
@@ -293,7 +299,10 @@
           ## Scoring Formula
           - Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
           - Speed normalized against {{ benchmark_toks_norm_ceiling }} tok/sec ceiling (hardware-observed max)
-          - Coding quality: has_def×0.20 + has_return×0.20 + has_docstring×0.15 + has_type_hint×0.15 + has_code_block×0.10 + has_assert×0.08 + has_test_def×0.07 + has_import×0.05
+          - Coding quality (per-prompt):
+            code_gen: has_def×0.20 + has_return×0.20 + has_docstring×0.15 + has_type_hint×0.15 + has_code_block×0.10 + has_assert×0.08 + has_test_def×0.07 + has_import×0.05
+            debug:    has_def×0.30 + has_return×0.30 + has_code_block×0.25 + has_assert×0.15
+            refactor: has_def×0.25 + has_return×0.25 + has_code_block×0.20 + has_type_hint×0.15 + has_import×0.15
           - Category: override dict → quality delta (coding_avg - general_avg >= {{ benchmark_coding_threshold }}) → name pattern (coder/codestral/codellama/starcoder) → general
         dest: "{{ benchmark_results_dir }}/benchmark_{{ benchmark_timestamp }}.md"
         mode: "0644"
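The per-prompt weight tables templated above can be sketched in plain Python; feature flags are 0/1 and each prompt's weights sum to 1.0, so a fully-featured response scores 1.0:

```python
# Per-prompt coding-quality weights from the benchmark playbook.
WEIGHTS = {
    "code_gen": {"has_def": 0.20, "has_return": 0.20, "has_docstring": 0.15,
                 "has_type_hint": 0.15, "has_code_block": 0.10,
                 "has_assert": 0.08, "has_test_def": 0.07, "has_import": 0.05},
    "debug":    {"has_def": 0.30, "has_return": 0.30,
                 "has_code_block": 0.25, "has_assert": 0.15},
    "refactor": {"has_def": 0.25, "has_return": 0.25, "has_code_block": 0.20,
                 "has_type_hint": 0.15, "has_import": 0.15},
}

def quality(test_name, features):
    # features maps flag name -> 0/1; missing flags count as 0
    return sum(w * features.get(k, 0) for k, w in WEIGHTS[test_name].items())

# a debug response with a fenced function, a return, and an assert
print(round(quality("debug", {"has_def": 1, "has_return": 1,
                              "has_code_block": 1, "has_assert": 1}), 3))  # 1.0
```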

+ 5 - 7
playbooks/04_models.yml

@@ -84,11 +84,9 @@
       tags:
         - models-pull
 
-    - name: "Models | Pull fixed models if not already present"
+    - name: "Models | Pull baseline models if not already present"
       ansible.builtin.command: "ollama pull {{ item }}"
-      loop:
-        - "llama3.2:3b"
-        - "gemma3:12b-it-q4_K_M"
+      loop: "{{ baseline_models }}"
       when: item not in installed_model_names
       changed_when: true
       loop_control:
@@ -111,7 +109,7 @@
       ansible.builtin.copy:
         content: |
           FROM {{ model_selection.slot3_coding }}
-          PARAMETER num_ctx 131072
+          PARAMETER num_ctx 32768
           SYSTEM You are an expert coding assistant. You write clean, efficient, well-documented code. Always include type hints and follow best practices.
         dest: "{{ modelfiles_dir }}/Modelfile.coder-128k"
         mode: "0644"
@@ -146,8 +144,8 @@
     - name: "Models | Template gemma-family Modelfile"
       ansible.builtin.copy:
         content: |
-          FROM gemma3:12b-it-q4_K_M
-          PARAMETER num_ctx 32768
+          FROM llama3.1:8b
+          PARAMETER num_ctx 8192
           SYSTEM You are a helpful, friendly family assistant. Provide safe, age-appropriate responses suitable for all family members.
         dest: "{{ modelfiles_dir }}/Modelfile.gemma-family"
         mode: "0644"

+ 66 - 24
playbooks/07_openwebui.yml

@@ -40,6 +40,33 @@
       tags:
         - openwebui-secrets
 
+    - name: "Open WebUI | Store Bedrock bearer token in Vault"
+      ansible.builtin.uri:
+        url: "{{ vault_url }}/v1/{{ vault_secret_prefix }}/bedrock"
+        method: POST
+        headers:
+          X-Vault-Token: "{{ lookup('ansible.builtin.file', vault_token_file) }}"
+        body_format: json
+        body:
+          data:
+            bearer_token: "{{ bedrock_bearer_token }}"
+        status_code: [200, 204]
+      when: bedrock_bearer_token is defined and bedrock_bearer_token | length > 0
+      tags:
+        - openwebui-secrets
+
+    - name: "Open WebUI | Retrieve Bedrock bearer token from Vault"
+      block:
+        - name: "Open WebUI | Fetch Bedrock bearer token"
+          ansible.builtin.set_fact:
+            _bedrock_token: "{{ lookup('community.hashi_vault.hashi_vault', vault_secret_prefix ~ '/bedrock:bearer_token token=' ~ lookup('ansible.builtin.file', vault_token_file) ~ ' url=' ~ vault_url) }}"
+      rescue:
+        - name: "Open WebUI | Bedrock not configured, skipping"
+          ansible.builtin.set_fact:
+            _bedrock_token: ""
+      tags:
+        - openwebui-secrets
+
     # ── Container deployment ─────────────────────────────────────────
     - name: "Open WebUI | Stop and remove existing container"
       community.docker.docker_container:
@@ -58,6 +85,44 @@
       tags:
         - openwebui-deploy
 
+    - name: "Open WebUI | Build container environment"
+      ansible.builtin.set_fact:
+        _openwebui_env: >-
+          {{
+            {
+              'OLLAMA_BASE_URL': 'http://host.docker.internal:11434',
+              'OLLAMA_API_KEY': ollama_api_key,
+              'WEBUI_SECRET_KEY': openwebui_secret_key,
+              'WEBUI_AUTH': 'true',
+              'ENABLE_OAUTH_SIGNUP': 'true',
+              'OAUTH_PROVIDER_NAME': platform_name,
+              'OAUTH_CLIENT_ID': 'open-webui',
+              'OAUTH_CLIENT_SECRET': keycloak_client_secret,
+              'OPENID_PROVIDER_URL': keycloak_oidc_url ~ '/.well-known/openid-configuration',
+              'OAUTH_SCOPES': 'openid email profile',
+              'ENABLE_OAUTH_ROLE_MANAGEMENT': 'true',
+              'OAUTH_ROLES_CLAIM': 'realm_access.roles',
+              'OAUTH_ALLOWED_ROLES': 'ai-user,ai-admin',
+              'OAUTH_ADMIN_ROLES': 'ai-admin',
+              'ENABLE_RAG_WEB_SEARCH': 'false',
+              'RAG_EMBEDDING_ENGINE': 'ollama',
+              'RAG_EMBEDDING_MODEL': 'nomic-embed-text',
+              'RAG_OLLAMA_BASE_URL': 'http://host.docker.internal:11434',
+              'VECTOR_DB': 'qdrant',
+              'QDRANT_URI': 'http://host.docker.internal:6333',
+              'ENABLE_ADMIN_EXPORT': 'true',
+              'DEFAULT_MODELS': 'llama-family',
+              'WEBUI_NAME': platform_name,
+            } | combine(
+              {
+                'OPENAI_API_BASE_URL': 'https://bedrock-runtime.' ~ bedrock_aws_region ~ '.amazonaws.com/v1',
+                'OPENAI_API_KEY': _bedrock_token,
+              } if _bedrock_token | default('') | length > 0 else {}
+            )
+          }}
+      tags:
+        - openwebui-deploy
+
     - name: "Open WebUI | Run Open WebUI container"
       community.docker.docker_container:
         name: "{{ openwebui_container_name }}"
@@ -70,30 +135,7 @@
           host.docker.internal: host-gateway
         volumes:
           - "{{ openwebui_data_dir }}:/app/backend/data"
-        env:
-          OLLAMA_BASE_URL: "http://host.docker.internal:11434"
-          OLLAMA_API_KEY: "{{ ollama_api_key }}"
-          WEBUI_SECRET_KEY: "{{ openwebui_secret_key }}"
-          WEBUI_AUTH: "true"
-          ENABLE_OAUTH_SIGNUP: "true"
-          OAUTH_PROVIDER_NAME: "{{ platform_name }}"
-          OAUTH_CLIENT_ID: "open-webui"
-          OAUTH_CLIENT_SECRET: "{{ keycloak_client_secret }}"
-          OPENID_PROVIDER_URL: "{{ keycloak_oidc_url }}/.well-known/openid-configuration"
-          OAUTH_SCOPES: "openid email profile"
-          ENABLE_OAUTH_ROLE_MANAGEMENT: "true"
-          OAUTH_ROLES_CLAIM: "realm_access.roles"
-          OAUTH_ALLOWED_ROLES: "ai-user,ai-admin"
-          OAUTH_ADMIN_ROLES: "ai-admin"
-          ENABLE_RAG_WEB_SEARCH: "false"
-          RAG_EMBEDDING_ENGINE: "ollama"
-          RAG_EMBEDDING_MODEL: "nomic-embed-text"
-          RAG_OLLAMA_BASE_URL: "http://host.docker.internal:11434"
-          VECTOR_DB: "qdrant"
-          QDRANT_URI: "http://host.docker.internal:6333"
-          ENABLE_ADMIN_EXPORT: "true"
-          DEFAULT_MODELS: "llama-family"
-          WEBUI_NAME: "{{ platform_name }}"
+        env: "{{ _openwebui_env }}"
       tags:
         - openwebui-deploy
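The `combine()` pattern above reduces to a conditional dict merge: the Bedrock keys join the container env only when a token came back from Vault. A Python sketch of the same logic (function and parameter names here are illustrative, not from the playbook):

```python
# Conditional env merge mirroring the Jinja combine() expression:
# Bedrock keys are added only when a non-empty token is present.
def build_env(base, bedrock_token, region="us-east-1"):
    extra = {
        "OPENAI_API_BASE_URL": f"https://bedrock-runtime.{region}.amazonaws.com/v1",
        "OPENAI_API_KEY": bedrock_token,
    } if bedrock_token else {}
    return {**base, **extra}

env = build_env({"WEBUI_AUTH": "true"}, "")  # Vault rescue path: empty token
print("OPENAI_API_KEY" in env)  # False
```

With an empty token the container env is byte-for-byte the pre-Bedrock configuration, which is what makes the integration safely optional.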
 

+ 19 - 7
templates/ollama/override.conf.j2

@@ -9,21 +9,33 @@ Environment="OLLAMA_KEEP_ALIVE=-1"
 # Flash attention: fused softmax, ~20% less memory bandwidth, faster on AVX2
 Environment="OLLAMA_FLASH_ATTENTION=1"
 
-# Threads: 28 logical CPUs on NUMA node 1 (14 physical cores × 2 HT)
-# Covers all threads on the socket so no cross-socket migrations occur
+# KV cache quantization: q8_0 halves KV cache memory vs fp16.
+# Attention reads dominate memory bandwidth at long contexts; smaller KV =
+# fewer bytes transferred per token generated. q8_0 over q4_0: negligible
+# quality loss vs significant noise at long contexts with q4_0.
+Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
+
+# Threads: 14 physical cores on NUMA node 1 only (no hyperthreads).
+# LLM inference is memory-bandwidth-bound; HT siblings share the same memory
+# pipeline and add scheduling overhead without adding bandwidth.
 Environment="OLLAMA_NUM_THREADS={{ ollama_num_threads }}"
 
-# Parallel inference streams — 4 simultaneous requests, 7 threads each
+# Parallel inference streams — 2 simultaneous requests, 7 threads each.
+# Keeps per-request throughput high for interactive/single-user workloads.
 Environment="OLLAMA_NUM_PARALLEL={{ ollama_num_parallel }}"
 
 # Keep 4 models warm in RAM (KEEP_ALIVE=-1 means never unload)
 Environment="OLLAMA_MAX_LOADED_MODELS={{ ollama_max_loaded_models }}"
 
 # ── NUMA / CPU binding ────────────────────────────────────────────────────
-# Pin all Ollama threads to NUMA node 1 CPUs (odd: 1,3,5,...,55).
-# Node 1 has ~120 GB free RAM vs node 0's ~75 GB.
-# CPUAffinity prevents cross-NUMA thread migration; Linux will naturally
-# allocate memory from the local node when all threads are on that node.
+# ExecStart override: numactl --membind=1 guarantees model weights and KV
+# cache are allocated from NUMA node 1 RAM (120 GB free). CPUAffinity alone
+# does not set the memory policy; numactl makes it explicit.
+ExecStart=
+ExecStart=/usr/bin/numactl --membind=1 {{ ollama_binary_path }} serve
+
+# Restrict scheduler to physical cores on node 1 only (odd CPUs 1–27).
+# Omitting HT siblings (29–55) prevents cross-HT contention on the memory bus.
 CPUAffinity={{ ollama_cpu_affinity }}
 
 # ── Memory hardening ───────────────────────────────────────────────────────