View source

Fix queue contamination and latency=9999 in concurrent benchmark

Benchmark phase was firing all 21 requests (3 models × 7 prompts) at once
via async uri. With OLLAMA_NUM_PARALLEL=2, the other 19 requests queue and
eval_duration includes queue-wait time, producing tok/sec ≈ 0.04–0.08.

Switch benchmark fire+collect to synchronous uri (node1 fully drains before
node0). One request per slot = clean eval_duration = accurate tok/sec.

Latency test was measuring eval_duration + prompt_eval_duration. The load
phase pre-populates the KV cache so both can be ~0 for the "Hi" prompt,
yielding latency_ns = 0 → latency_ms = 9999. Switch to total_duration
(true wall-clock end-to-end, always non-zero for a completed request).

Expected: avg_tok_per_sec 5–25, latency_ms 300–6000, composites 0.50–0.85.
Shaun Arman, 4 days ago
parent
commit
d9a991f1b8

+ 1 - 1
playbooks/03_benchmark.yml

@@ -265,7 +265,7 @@
           {%       set ns2.ttft_sum = ns2.ttft_sum + prompt_eval_duration %}
           {%       set ns2.ttft_count = ns2.ttft_count + 1 %}
           {%       if test_name == 'latency' %}
-          {%         set ns2.latency_ns = eval_duration + prompt_eval_duration %}
+          {%         set ns2.latency_ns = resp.total_duration | default(0) | int %}
           {%       endif %}
           {%       set resp_len = response_text | length %}
           {%       if test_name in ['code_gen', 'debug', 'refactor'] %}

+ 20 - 50
playbooks/_bench_tier_batch.yml

@@ -10,14 +10,13 @@
 #   bench_all_results    — list of uri result dicts
 #   all_eligible_models  — list of model names that passed load
 #
-# Concurrency design (load and benchmark phases both use async):
-#   Load:      node1 and node0 warm-up "Hi" prompts fire simultaneously.
+# Concurrency design:
+#   Load:      node1 and node0 warm-up "Hi" prompts fire simultaneously (async).
 #              Within each node Ollama still loads one model at a time,
 #              but both nodes drain their queues in parallel.
-#   Benchmark: same pattern — both nodes' prompt queues drain together.
-#   uri timeout covers the full queue-drain wait (see inline comments).
-#   loop_var: _async_job on collect tasks preserves the uri module's
-#   item=[model, ...] field so downstream tasks need no structural changes.
+#   Benchmark: sequential (synchronous uri), one request at a time per node.
+#              Node1 drains fully, then node0. No queue contamination; each
+#              request gets a full idle inference slot and clean eval_duration.
 
 # ── Load models into RAM (both nodes concurrently) ────────────────────────────
 # 3 models per node, sequential within each node → last model waits for 2
@@ -98,13 +97,12 @@
     _eligible_node1: "{{ _load_node1.results | selectattr('status', 'equalto', 200) | map(attribute='_async_job') | map(attribute='item') | list }}"
     _eligible_node0: "{{ _load_node0.results | selectattr('status', 'equalto', 200) | map(attribute='_async_job') | map(attribute='item') | list }}"
 
-# ── Fire benchmark prompts at both nodes concurrently ─────────────────────────
-# uri timeout = large_timeout × 15 (18 000 s) covers the full queue-drain
-# wait: worst case 21 queued requests, 2 parallel, 1 200 s each →
-# max queue wait ≈ 12 000 s < 18 000 s. async = ×18, collect retries
-# 730 × 30 s = 21 900 s of headroom.
+# ── Fire benchmark prompts sequentially (one request at a time per node) ──────
+# Sequential firing ensures each request hits an idle Ollama inference slot:
+# no queue contamination, full CPU budget per request, clean eval_duration.
+# Node1 then node0 run back-to-back; concurrent load phase above is unchanged.
 
-- name: "Benchmark | Fire test prompts at node1 (async)"
+- name: "Benchmark | Fire test prompts at node1"
   ansible.builtin.uri:
     url: "http://localhost:11434/api/generate"
     method: POST
@@ -115,17 +113,15 @@
       stream: false
     headers:
       Authorization: "Bearer {{ ollama_api_key }}"
-    timeout: "{{ (benchmark_large_timeout | int) * 15 }}"
+    timeout: "{{ (benchmark_large_timeout | int) }}"
     status_code: 200
   loop: "{{ _eligible_node1 | product(test_prompts.keys() | list) | list }}"
   loop_control:
     label: "{{ item.0 }} / {{ item.1 }}"
-  async: "{{ (benchmark_large_timeout | int) * 18 }}"
-  poll: 0
-  register: _bench_node1_jobs
+  register: _bench_node1
   failed_when: false
 
-- name: "Benchmark | Fire test prompts at node0 (async)"
+- name: "Benchmark | Fire test prompts at node0"
   ansible.builtin.uri:
     url: "http://localhost:{{ ollama_node0_port }}/api/generate"
     method: POST
@@ -136,57 +132,31 @@
       stream: false
     headers:
       Authorization: "Bearer {{ ollama_api_key }}"
-    timeout: "{{ (benchmark_large_timeout | int) * 15 }}"
+    timeout: "{{ (benchmark_large_timeout | int) }}"
     status_code: 200
   loop: "{{ _eligible_node0 | product(test_prompts.keys() | list) | list }}"
   loop_control:
     label: "{{ item.0 }} / {{ item.1 }}"
-  async: "{{ (benchmark_large_timeout | int) * 18 }}"
-  poll: 0
-  register: _bench_node0_jobs
-  failed_when: false
-
-- name: "Benchmark | Collect node1 results"
-  ansible.builtin.async_status:
-    jid: "{{ _async_job.ansible_job_id }}"
-  loop: "{{ _bench_node1_jobs.results | default([]) }}"
-  loop_control:
-    loop_var: _async_job
-    label: "{{ _async_job.item[0] | default('?') }} / {{ _async_job.item[1] | default('?') }}"
-  register: _bench_node1
-  until: _bench_node1.finished
-  retries: 730
-  delay: 30
-  failed_when: false
-
-- name: "Benchmark | Collect node0 results"
-  ansible.builtin.async_status:
-    jid: "{{ _async_job.ansible_job_id }}"
-  loop: "{{ _bench_node0_jobs.results | default([]) }}"
-  loop_control:
-    loop_var: _async_job
-    label: "{{ _async_job.item[0] | default('?') }} / {{ _async_job.item[1] | default('?') }}"
   register: _bench_node0
-  until: _bench_node0.finished
-  retries: 730
-  delay: 30
   failed_when: false
 
 # ── Accumulate results into play-scoped facts ─────────────────────────────────
+# Synchronous uri populates result.item = [model, prompt_key] at top level —
+# no _async_job indirection needed; compute task in 03_benchmark.yml unchanged.
 
 - name: "Benchmark | Accumulate node1 results"
   ansible.builtin.set_fact:
-    bench_all_results: "{{ bench_all_results + [item | combine({'item': item._async_job.item | default([])})] }}"
+    bench_all_results: "{{ bench_all_results + [item] }}"
   loop: "{{ _bench_node1.results | default([]) }}"
   loop_control:
-    label: "{{ (item._async_job.item | default(['?', '?']))[0] }} / {{ (item._async_job.item | default(['?', '?']))[1] }}"
+    label: "{{ (item.item | default(['?', '?']))[0] }} / {{ (item.item | default(['?', '?']))[1] }}"
 
 - name: "Benchmark | Accumulate node0 results"
   ansible.builtin.set_fact:
-    bench_all_results: "{{ bench_all_results + [item | combine({'item': item._async_job.item | default([])})] }}"
+    bench_all_results: "{{ bench_all_results + [item] }}"
   loop: "{{ _bench_node0.results | default([]) }}"
   loop_control:
-    label: "{{ (item._async_job.item | default(['?', '?']))[0] }} / {{ (item._async_job.item | default(['?', '?']))[1] }}"
+    label: "{{ (item.item | default(['?', '?']))[0] }} / {{ (item.item | default(['?', '?']))[1] }}"
 
 - name: "Benchmark | Accumulate eligible models"
   ansible.builtin.set_fact: