Parcourir la source

Fix async_status item attribute bug in bench pipeline

async_status collect tasks expose the original async job via the
loop_var (_async_job), not at the top level. So result.item is
absent — item lives at result._async_job.item.

Fix eligible-model detection to chain map(_async_job)|map(item).
Replace single accumulate task with per-node loops that use
  item | combine({'item': item._async_job.item})
to restore item=[model, prompt] before the compute task reads it.

Also add loop_var: _batch to the outer batch loop in 03_benchmark.yml
to eliminate the Ansible loop-variable collision warning.

Regenerate model_selection.json with proper per-model entries (the
previous run produced a malformed concatenated-key entry due to this bug).
Shaun Arman il y a 4 jours
Parent
commit
b41a468344

+ 73 - 20
benchmarks/results/model_selection.json

@@ -1,39 +1,92 @@
 {
     "all_metrics": {
-        "mistral-nemo:latest:mistral:latest:llama3.1:8b:qwen2.5-coder:7b:gemma3:12b-it-q4_K_M:llama3.2:3b": {
-            "avg_tok_per_sec": 0.0,
+        "llama3.2:3b": {
+            "avg_tok_per_sec": 0.1,
+            "category": "general",
+            "coding_composite": 0.413,
+            "coding_quality": 0.917,
+            "general_composite": 0.45,
+            "general_quality": 1.0,
+            "latency_ms": 9999,
+            "latency_score": 0,
+            "toks_norm": 0.002
+        },
+        "mistral-nemo:latest": {
+            "avg_tok_per_sec": 0.1,
+            "category": "general",
+            "coding_composite": 0.383,
+            "coding_quality": 0.85,
+            "general_composite": 0.45,
+            "general_quality": 1.0,
+            "latency_ms": 9999,
+            "latency_score": 0,
+            "toks_norm": 0.001
+        },
+        "qwen2.5-coder:7b": {
+            "avg_tok_per_sec": 0.1,
             "category": "coding",
-            "coding_composite": 0.0,
-            "coding_quality": 0,
-            "general_composite": 0.0,
-            "general_quality": 0,
+            "coding_composite": 0.371,
+            "coding_quality": 0.823,
+            "general_composite": 0.383,
+            "general_quality": 0.85,
             "latency_ms": 9999,
             "latency_score": 0,
-            "toks_norm": 0.0
+            "toks_norm": 0.001
         }
     },
     "coding_ranking": [
         {
-            "composite": 0.0,
+            "composite": 0.371,
             "metrics": {
-                "avg_tok_per_sec": 0.0,
+                "avg_tok_per_sec": 0.1,
                 "category": "coding",
-                "coding_composite": 0.0,
-                "coding_quality": 0,
-                "general_composite": 0.0,
-                "general_quality": 0,
+                "coding_composite": 0.371,
+                "coding_quality": 0.823,
+                "general_composite": 0.383,
+                "general_quality": 0.85,
+                "latency_ms": 9999,
+                "latency_score": 0,
+                "toks_norm": 0.001
+            },
+            "name": "qwen2.5-coder:7b"
+        }
+    ],
+    "general_ranking": [
+        {
+            "composite": 0.45,
+            "metrics": {
+                "avg_tok_per_sec": 0.1,
+                "category": "general",
+                "coding_composite": 0.413,
+                "coding_quality": 0.917,
+                "general_composite": 0.45,
+                "general_quality": 1.0,
+                "latency_ms": 9999,
+                "latency_score": 0,
+                "toks_norm": 0.002
+            },
+            "name": "llama3.2:3b"
+        },
+        {
+            "composite": 0.45,
+            "metrics": {
+                "avg_tok_per_sec": 0.1,
+                "category": "general",
+                "coding_composite": 0.383,
+                "coding_quality": 0.85,
+                "general_composite": 0.45,
+                "general_quality": 1.0,
                 "latency_ms": 9999,
                 "latency_score": 0,
-                "toks_norm": 0.0
+                "toks_norm": 0.001
             },
-            "name": "mistral-nemo:latest:mistral:latest:llama3.1:8b:qwen2.5-coder:7b:gemma3:12b-it-q4_K_M:llama3.2:3b"
+            "name": "mistral-nemo:latest"
         }
     ],
-    "general_ranking": [],
-    "slot1_general": "none",
-    "slot2_general": "none",
-    "slot3_coding": "mistral-nemo:latest:mistral:latest:llama3.1:8b:qwen2.5-coder:7b:gemma3:12b-it-q4_K_M:llama3.2:3b",
-    "slot4_coding": "mistral-nemo:latest:mistral:latest:llama3.1:8b:qwen2.5-coder:7b:gemma3:12b-it-q4_K_M:llama3.2:3b",
+    "slot1_general": "llama3.2:3b",
+    "slot2_general": "mistral-nemo:latest",
+    "slot3_coding": "qwen2.5-coder:7b",
+    "slot4_coding": "qwen2.5-coder:7b",
     "slot5_general_rotate": "none",
     "slot6_coding_rotate": "none"
 }

+ 4 - 3
playbooks/03_benchmark.yml

@@ -230,11 +230,12 @@
     - name: "Benchmark | Process batch {{ _loop_idx + 1 }} of {{ models_to_benchmark | batch(6) | list | length }}"
       ansible.builtin.include_tasks: _bench_tier_batch.yml
       vars:
-        _batch_node1: "{{ item[:3] }}"
-        _batch_node0: "{{ item[3:] }}"
+        _batch_node1: "{{ _batch[:3] }}"
+        _batch_node0: "{{ _batch[3:] }}"
       loop: "{{ models_to_benchmark | batch(6) | list }}"
       loop_control:
-        label: "batch {{ _loop_idx + 1 }}: node1={{ item[:3] }} node0={{ item[3:] }}"
+        loop_var: _batch
+        label: "batch {{ _loop_idx + 1 }}: node1={{ _batch[:3] }} node0={{ _batch[3:] }}"
         index_var: _loop_idx
       tags:
         - benchmark-run

+ 18 - 9
playbooks/_bench_tier_batch.yml

@@ -95,8 +95,8 @@
 
 - name: "Benchmark | Identify loaded models in batch"
   ansible.builtin.set_fact:
-    _eligible_node1: "{{ _load_node1.results | selectattr('status', 'equalto', 200) | map(attribute='item') | list }}"
-    _eligible_node0: "{{ _load_node0.results | selectattr('status', 'equalto', 200) | map(attribute='item') | list }}"
+    _eligible_node1: "{{ _load_node1.results | selectattr('status', 'equalto', 200) | map(attribute='_async_job') | map(attribute='item') | list }}"
+    _eligible_node0: "{{ _load_node0.results | selectattr('status', 'equalto', 200) | map(attribute='_async_job') | map(attribute='item') | list }}"
 
 # ── Fire benchmark prompts at both nodes concurrently ─────────────────────────
 # uri timeout = large_timeout × 15 (18 000 s) covers the full queue-drain
@@ -174,11 +174,20 @@
 
 # ── Accumulate results into play-scoped facts ─────────────────────────────────
 
-- name: "Benchmark | Accumulate batch results"
+- name: "Benchmark | Accumulate node1 results"
   ansible.builtin.set_fact:
-    bench_all_results: >-
-      {{ bench_all_results
-         + (_bench_node1.results | default([]))
-         + (_bench_node0.results | default([])) }}
-    all_eligible_models: >-
-      {{ all_eligible_models + _eligible_node1 + _eligible_node0 }}
+    bench_all_results: "{{ bench_all_results + [item | combine({'item': item._async_job.item | default([])})] }}"
+  loop: "{{ _bench_node1.results | default([]) }}"
+  loop_control:
+    label: "{{ (item._async_job.item | default(['?', '?']))[0] }} / {{ (item._async_job.item | default(['?', '?']))[1] }}"
+
+- name: "Benchmark | Accumulate node0 results"
+  ansible.builtin.set_fact:
+    bench_all_results: "{{ bench_all_results + [item | combine({'item': item._async_job.item | default([])})] }}"
+  loop: "{{ _bench_node0.results | default([]) }}"
+  loop_control:
+    label: "{{ (item._async_job.item | default(['?', '?']))[0] }} / {{ (item._async_job.item | default(['?', '?']))[1] }}"
+
+- name: "Benchmark | Accumulate eligible models"
+  ansible.builtin.set_fact:
+    all_eligible_models: "{{ all_eligible_models + _eligible_node1 + _eligible_node0 }}"