Просмотр исходного кода

Parallelize load phase: fire both node warm-ups concurrently

Apply the same async/collect pattern to the load tasks so node1 and
node0 warm-up requests are in flight simultaneously. Saves roughly
half the load phase wall time (typically 10–15 min per run).

uri timeout = load_timeout × 4 covers up to 3 sequential model loads
per node (last model waits ≤ 2 × load_timeout in queue). async = ×5,
collect retries = (load_timeout × 5 / 15) + 5 with 15 s poll interval.
Shaun Arman 4 дня назад
Родитель
Commit
e4e09ba434
1 изменённый файл: 50 добавлено, 27 удалено
  1. 50 27
      playbooks/_bench_tier_batch.yml

+ 50 - 27
playbooks/_bench_tier_batch.yml

@@ -10,18 +10,20 @@
 #   bench_all_results    — list of uri result dicts
 #   all_eligible_models  — list of model names that passed load
 #
-# Concurrency design:
-#   Loads are sequential (disk I/O: node1 then node0).
-#   Benchmarks are concurrent: all node1 prompts are fired into Ollama's
-#   queue (async, poll:0), then all node0 prompts are fired immediately
-#   after. Both nodes drain their queues simultaneously.
-#   OLLAMA_SCHED_SPREAD=false (default) ensures each node processes its
-#   models one at a time in queue order, so scores are not cross-model
-#   contaminated. uri timeout accounts for full queue-drain wait time.
+# Concurrency design (load and benchmark phases both use async):
+#   Load:      node1 and node0 warm-up "Hi" prompts fire simultaneously.
+#              Within each node Ollama still loads one model at a time,
+#              but both nodes drain their queues in parallel.
+#   Benchmark: same pattern — both nodes' prompt queues drain together.
+#   uri timeout covers the full queue-drain wait (see inline comments).
+#   loop_var: _async_job on collect tasks preserves the uri module's
+#   item=[model, ...] field so downstream tasks need no structural changes.
 
-# ── Load models into RAM ──────────────────────────────────────────────────────
+# ── Load models into RAM (both nodes concurrently) ────────────────────────────
+# 3 models per node, sequential within each node → last model waits for 2
+# ahead: max load wait ≤ 2 × load_timeout. Use load_timeout × 4 for margin.
 
-- name: "Benchmark | Load node1 models into RAM"
+- name: "Benchmark | Load node1 models into RAM (async)"
   ansible.builtin.uri:
     url: "http://localhost:11434/api/generate"
     method: POST
@@ -32,16 +34,17 @@
       stream: false
     headers:
       Authorization: "Bearer {{ ollama_api_key }}"
-    timeout: "{{ benchmark_load_timeout }}"
+    timeout: "{{ (benchmark_load_timeout | int) * 4 }}"
     status_code: 200
   loop: "{{ _batch_node1 }}"
   loop_control:
-    loop_var: item
     label: "node1 load: {{ item }}"
-  register: _load_node1
+  async: "{{ (benchmark_load_timeout | int) * 5 }}"
+  poll: 0
+  register: _load_node1_jobs
   failed_when: false
 
-- name: "Benchmark | Load node0 models into RAM"
+- name: "Benchmark | Load node0 models into RAM (async)"
   ansible.builtin.uri:
     url: "http://localhost:{{ ollama_node0_port }}/api/generate"
     method: POST
@@ -52,13 +55,40 @@
       stream: false
     headers:
       Authorization: "Bearer {{ ollama_api_key }}"
-    timeout: "{{ benchmark_load_timeout }}"
+    timeout: "{{ (benchmark_load_timeout | int) * 4 }}"
     status_code: 200
   loop: "{{ _batch_node0 }}"
   loop_control:
-    loop_var: item
     label: "node0 load: {{ item }}"
+  async: "{{ (benchmark_load_timeout | int) * 5 }}"
+  poll: 0
+  register: _load_node0_jobs
+  failed_when: false
+
+- name: "Benchmark | Collect node1 load results"
+  ansible.builtin.async_status:
+    jid: "{{ _async_job.ansible_job_id }}"
+  loop: "{{ _load_node1_jobs.results | default([]) }}"
+  loop_control:
+    loop_var: _async_job
+    label: "node1 load: {{ _async_job.item | default('?') }}"
+  register: _load_node1
+  until: _load_node1.finished
+  retries: "{{ ((benchmark_load_timeout | int) * 5 / 15) | int + 5 }}"
+  delay: 15
+  failed_when: false
+
+- name: "Benchmark | Collect node0 load results"
+  ansible.builtin.async_status:
+    jid: "{{ _async_job.ansible_job_id }}"
+  loop: "{{ _load_node0_jobs.results | default([]) }}"
+  loop_control:
+    loop_var: _async_job
+    label: "node0 load: {{ _async_job.item | default('?') }}"
   register: _load_node0
+  until: _load_node0.finished
+  retries: "{{ ((benchmark_load_timeout | int) * 5 / 15) | int + 5 }}"
+  delay: 15
   failed_when: false
 
 # ── Identify successfully loaded models ───────────────────────────────────────
@@ -69,11 +99,10 @@
     _eligible_node0: "{{ _load_node0.results | selectattr('status', 'equalto', 200) | map(attribute='item') | list }}"
 
 # ── Fire benchmark prompts at both nodes concurrently ─────────────────────────
-# uri timeout = large_timeout × 15 covers the full queue-drain wait:
-#   worst case 3 models × 7 prompts = 21 requests, 2 parallel →
-#   last request waits ≤ 10 × large_timeout = 12 000 s < 18 000 s (15×).
-# async timeout must exceed uri timeout; use 18× for headroom.
-# Collect retries × delay must exceed async timeout: 730 × 30 = 21 900 s.
+# uri timeout = large_timeout × 15 (18 000 s) covers the full queue-drain
+# wait: worst case 21 queued requests, 2 parallel, 1 200 s each →
+# max queue wait ≈ 12 000 s < 18 000 s. async = ×18, collect retries
+# 730 × 30 s = 21 900 s of headroom.
 
 - name: "Benchmark | Fire test prompts at node1 (async)"
   ansible.builtin.uri:
@@ -117,12 +146,6 @@
   register: _bench_node0_jobs
   failed_when: false
 
-# ── Collect results (both queues are draining in parallel during these tasks) ──
-# loop_var: _async_job prevents Ansible from overwriting the `item` field in
-# each collected result. The uri module stores item=[model, prompt] in the
-# async job file; async_status surfaces it directly so the compute task can
-# use result.item[0] / result.item[1] without any structural changes.
-
 - name: "Benchmark | Collect node1 results"
   ansible.builtin.async_status:
     jid: "{{ _async_job.ansible_job_id }}"