Kaynağa Gözat

Replace prescreen with batched warm-before-benchmark pipeline

Each batch of 6 models loads into RAM first (3 per NUMA node, via their
respective ports), confirms HTTP 200, then benchmarks only the warmed
models. Cold-start disk I/O no longer corrupts benchmark scores.

- benchmark_prescreen_timeout → benchmark_load_timeout (300s)
- _bench_tier_batch.yml: new include, handles one batch end-to-end
  (load node1 → load node0 → filter eligible → bench node1 → bench
  node0 → accumulate into bench_all_results/all_eligible_models)
- Compute task iterates all_eligible_models, not models_to_benchmark,
  so models that failed to load are excluded from scoring automatically
- Sort models largest-first before batching so heaviest models land
  on node1 (the consistently higher-capacity inference socket)
- node0 health check added to benchmark-setup tag
Shaun Arman 4 gün önce
ebeveyn
işleme
7c5d10799a

+ 4 - 3
inventory/group_vars/all.yml

@@ -103,9 +103,10 @@ benchmark_skip_aliases:
 benchmark_small_max_gb: 10    # upper size boundary for small pass (< 10 GB), based on runtime RAM
 benchmark_medium_max_gb: 15   # upper size boundary for medium pass (10–15 GB), based on runtime RAM
 benchmark_size_overhead_factor: 1.2  # ollama list shows disk size; multiply by this to estimate runtime RAM
-benchmark_small_timeout: 300  # seconds per request, small models
-benchmark_medium_timeout: 900 # seconds per request, medium models (15 min)
-benchmark_large_timeout: 1200 # seconds per request, large models (20 min)
+benchmark_load_timeout: 300      # seconds — warm-up "Hi" prompt per model before benchmarking
+benchmark_small_timeout: 300     # seconds per request, small models
+benchmark_medium_timeout: 900    # seconds per request, medium models (15 min)
+benchmark_large_timeout: 1200    # seconds per request, large models (20 min)
 
 # Explicit category overrides applied before heuristics. Keys are model names as
 # returned by `ollama list`. Valid values: 'coding' or 'general'.

+ 90 - 70
playbooks/03_benchmark.yml

@@ -84,6 +84,31 @@
       tags:
         - benchmark-discover
 
+    - name: "Benchmark | Stop warmup services for clean benchmark run"
+      ansible.builtin.systemd:
+        name: "{{ item }}"
+        state: stopped
+      loop:
+        - ollama-warmup.service
+        - ollama-warmup-node0.service
+      failed_when: false
+      become: true
+      tags:
+        - benchmark-setup
+
+    - name: "Benchmark | Wait for node0 Ollama API to be ready"
+      ansible.builtin.uri:
+        url: "http://localhost:{{ ollama_node0_port }}/api/tags"
+        method: GET
+        status_code: 200
+        timeout: 10
+      register: ollama_node0_ready
+      retries: 24
+      delay: 5
+      until: ollama_node0_ready.status == 200
+      tags:
+        - benchmark-setup
+
     - name: "Benchmark | Discover installed models"
       ansible.builtin.command: ollama list
       changed_when: false
@@ -153,87 +178,70 @@
       tags:
         - benchmark-discover
 
+    - name: "Benchmark | Initialize batch accumulator facts"
+      ansible.builtin.set_fact:
+        bench_all_results: []
+        all_eligible_models: []
+      tags:
+        - benchmark-discover
+
+    - name: "Benchmark | Build per-model benchmark timeout map"
+      ansible.builtin.set_fact:
+        _benchmark_timeout_map_json: |
+          {% set ns = namespace(d={}) %}
+          {% for m in models_to_benchmark %}
+          {%   if m in _small_models %}
+          {%     set _ = ns.d.update({m: benchmark_small_timeout | int}) %}
+          {%   elif m in _medium_models %}
+          {%     set _ = ns.d.update({m: benchmark_medium_timeout | int}) %}
+          {%   else %}
+          {%     set _ = ns.d.update({m: benchmark_large_timeout | int}) %}
+          {%   endif %}
+          {% endfor %}
+          {{ ns.d | to_json }}
+      tags:
+        - benchmark-discover
+
+    - name: "Benchmark | Parse benchmark timeout map"
+      ansible.builtin.set_fact:
+        _benchmark_timeout_map: "{{ _benchmark_timeout_map_json | from_json }}"
+      tags:
+        - benchmark-discover
+
+    - name: "Benchmark | Sort models largest-first so heaviest models land on node1 (120 GB)"
+      ansible.builtin.set_fact:
+        models_to_benchmark: >-
+          {{ (_large_models + _medium_models + _small_models)
+             | select('in', models_to_benchmark) | list }}
+      tags:
+        - benchmark-discover
+
     - name: "Benchmark | Display models to benchmark"
       ansible.builtin.debug:
         msg:
           - "Small  pass (timeout {{ benchmark_small_timeout }}s,  ≤{{ benchmark_small_max_gb }}GB):  {{ _small_models }}"
           - "Medium pass (timeout {{ benchmark_medium_timeout }}s, {{ benchmark_small_max_gb }}–{{ benchmark_medium_max_gb }}GB): {{ _medium_models }}"
           - "Large  pass (timeout {{ benchmark_large_timeout }}s, >{{ benchmark_medium_max_gb }}GB): {{ _large_models }}"
+          - "Load timeout (warm-up 'Hi' prompt): {{ benchmark_load_timeout }}s"
+          - "Total: {{ models_to_benchmark | length }} models, {{ (models_to_benchmark | batch(6) | list) | length }} batch(es) of ≤6"
       tags:
         - benchmark-discover
 
-    - name: "Benchmark | Run test prompts against small models"
-      ansible.builtin.uri:
-        url: "{{ ollama_api_url }}/api/generate"
-        method: POST
-        body_format: json
-        body:
-          model: "{{ item.0 }}"
-          prompt: "{{ test_prompts[item.1].prompt }}"
-          stream: false
-        headers:
-          Authorization: "Bearer {{ ollama_api_key }}"
-        timeout: "{{ benchmark_small_timeout }}"
-        status_code: 200
-      loop: "{{ _small_models | product(test_prompts.keys() | list) | list }}"
-      loop_control:
-        label: "{{ item.0 }} / {{ item.1 }}"
-      register: _bench_small
-      failed_when: false
-      tags:
-        - benchmark-run
-
-    - name: "Benchmark | Run test prompts against medium models"
-      ansible.builtin.uri:
-        url: "{{ ollama_api_url }}/api/generate"
-        method: POST
-        body_format: json
-        body:
-          model: "{{ item.0 }}"
-          prompt: "{{ test_prompts[item.1].prompt }}"
-          stream: false
-        headers:
-          Authorization: "Bearer {{ ollama_api_key }}"
-        timeout: "{{ benchmark_medium_timeout }}"
-        status_code: 200
-      loop: "{{ _medium_models | product(test_prompts.keys() | list) | list }}"
-      loop_control:
-        label: "{{ item.0 }} / {{ item.1 }}"
-      register: _bench_medium
-      failed_when: false
-      when: _medium_models | length > 0
-      tags:
-        - benchmark-run
-
-    - name: "Benchmark | Run test prompts against large models"
-      ansible.builtin.uri:
-        url: "{{ ollama_api_url }}/api/generate"
-        method: POST
-        body_format: json
-        body:
-          model: "{{ item.0 }}"
-          prompt: "{{ test_prompts[item.1].prompt }}"
-          stream: false
-        headers:
-          Authorization: "Bearer {{ ollama_api_key }}"
-        timeout: "{{ benchmark_large_timeout }}"
-        status_code: 200
-      loop: "{{ _large_models | product(test_prompts.keys() | list) | list }}"
+    - name: "Benchmark | Process batch {{ _loop_idx + 1 }} of {{ models_to_benchmark | batch(6) | list | length }}"
+      ansible.builtin.include_tasks: _bench_tier_batch.yml
+      vars:
+        _batch_node1: "{{ item[:3] }}"
+        _batch_node0: "{{ item[3:] }}"
+      loop: "{{ models_to_benchmark | batch(6) | list }}"
       loop_control:
-        label: "{{ item.0 }} / {{ item.1 }}"
-      register: _bench_large
-      failed_when: false
-      when: _large_models | length > 0
+        label: "batch {{ _loop_idx + 1 }}: node1={{ item[:3] }} node0={{ item[3:] }}"
+        index_var: _loop_idx
       tags:
         - benchmark-run
 
-    - name: "Benchmark | Merge small, medium, and large model results"
-      ansible.builtin.set_fact:
-        benchmark_raw_results:
-          results: >-
-            {{ (_bench_small.results  | default([]))
-             + (_bench_medium.results | default([]))
-             + (_bench_large.results  | default([])) }}
+    - name: "Benchmark | Display models that failed to load"
+      ansible.builtin.debug:
+        msg: "Load failures (excluded from scoring): {{ models_to_benchmark | reject('in', all_eligible_models) | list }}"
       tags:
         - benchmark-run
 
@@ -241,9 +249,9 @@
       ansible.builtin.set_fact:
         model_metrics: |
           {% set ns = namespace(results={}) %}
-          {% for model in models_to_benchmark %}
+          {% for model in all_eligible_models %}
           {%   set ns2 = namespace(coding_quality=0, coding_count=0, general_quality=0, general_count=0, total_toks=0, total_eval_time=0, ttft_sum=0, ttft_count=0, latency_ns=0) %}
-          {%   for result in benchmark_raw_results.results %}
+          {%   for result in bench_all_results %}
           {%     if result.item[0] == model and result.status == 200 %}
           {%       set test_name = result.item[1] %}
           {%       set resp = result.json | default({}) %}
@@ -449,3 +457,15 @@
       changed_when: true
       tags:
         - benchmark-pull
+
+    - name: "Benchmark | Restart warmup services after benchmark"
+      ansible.builtin.systemd:
+        name: "{{ item }}"
+        state: restarted
+      loop:
+        - ollama-warmup.service
+        - ollama-warmup-node0.service
+      failed_when: false
+      become: true
+      tags:
+        - benchmark-cleanup

+ 109 - 0
playbooks/_bench_tier_batch.yml

@@ -0,0 +1,109 @@
+---
+# playbooks/_bench_tier_batch.yml
+# Included by 03_benchmark.yml once per batch of up to 6 models.
+#
+# Expected vars (passed via include_tasks vars:):
+#   _batch_node1  — list of 0–3 model names for port 11434
+#   _batch_node0  — list of 0–3 model names for port 11435
+#
+# Mutates host facts (accumulated across batches):
+#   bench_all_results    — list of uri result dicts
+#   all_eligible_models  — list of model names that passed load
+
+# ── Load models into RAM ──────────────────────────────────────────────────────
+
+- name: "Benchmark | Load node1 models into RAM"
+  ansible.builtin.uri:
+    url: "http://localhost:11434/api/generate"
+    method: POST
+    body_format: json
+    body:
+      model: "{{ item }}"
+      prompt: "Hi"
+      stream: false
+    headers:
+      Authorization: "Bearer {{ ollama_api_key }}"
+    timeout: "{{ benchmark_load_timeout }}"
+    status_code: 200
+  loop: "{{ _batch_node1 }}"
+  loop_control:
+    label: "node1 load: {{ item }}"
+  register: _load_node1
+  failed_when: false
+
+- name: "Benchmark | Load node0 models into RAM"
+  ansible.builtin.uri:
+    url: "http://localhost:{{ ollama_node0_port }}/api/generate"
+    method: POST
+    body_format: json
+    body:
+      model: "{{ item }}"
+      prompt: "Hi"
+      stream: false
+    headers:
+      Authorization: "Bearer {{ ollama_api_key }}"
+    timeout: "{{ benchmark_load_timeout }}"
+    status_code: 200
+  loop: "{{ _batch_node0 }}"
+  loop_control:
+    label: "node0 load: {{ item }}"
+  register: _load_node0
+  failed_when: false
+
+# ── Identify successfully loaded models ───────────────────────────────────────
+
+- name: "Benchmark | Identify loaded models in batch"
+  ansible.builtin.set_fact:
+    _eligible_node1: "{{ _load_node1.results | selectattr('status', 'equalto', 200) | map(attribute='item') | list }}"
+    _eligible_node0: "{{ _load_node0.results | selectattr('status', 'equalto', 200) | map(attribute='item') | list }}"
+
+# ── Benchmark loaded models ───────────────────────────────────────────────────
+
+- name: "Benchmark | Run test prompts against node1 models"
+  ansible.builtin.uri:
+    url: "http://localhost:11434/api/generate"
+    method: POST
+    body_format: json
+    body:
+      model: "{{ item.0 }}"
+      prompt: "{{ test_prompts[item.1].prompt }}"
+      stream: false
+    headers:
+      Authorization: "Bearer {{ ollama_api_key }}"
+    timeout: "{{ _benchmark_timeout_map[item.0] | default(benchmark_large_timeout) }}"
+    status_code: 200
+  loop: "{{ _eligible_node1 | product(test_prompts.keys() | list) | list }}"
+  loop_control:
+    label: "{{ item.0 }} / {{ item.1 }}"
+  register: _bench_node1
+  failed_when: false
+
+- name: "Benchmark | Run test prompts against node0 models"
+  ansible.builtin.uri:
+    url: "http://localhost:{{ ollama_node0_port }}/api/generate"
+    method: POST
+    body_format: json
+    body:
+      model: "{{ item.0 }}"
+      prompt: "{{ test_prompts[item.1].prompt }}"
+      stream: false
+    headers:
+      Authorization: "Bearer {{ ollama_api_key }}"
+    timeout: "{{ _benchmark_timeout_map[item.0] | default(benchmark_large_timeout) }}"
+    status_code: 200
+  loop: "{{ _eligible_node0 | product(test_prompts.keys() | list) | list }}"
+  loop_control:
+    label: "{{ item.0 }} / {{ item.1 }}"
+  register: _bench_node0
+  failed_when: false
+
+# ── Accumulate results into play-scoped facts ─────────────────────────────────
+
+- name: "Benchmark | Accumulate batch results"
+  ansible.builtin.set_fact:
+    bench_all_results: >-
+      {{ bench_all_results
+         + (_bench_node1.results | default([]))
+         + (_bench_node0.results | default([])) }}
+    all_eligible_models: >-
+      {{ all_eligible_models + _eligible_node1 + _eligible_node0 }}