Просмотр исходного кода

Parallelize load phase: fire both node warm-ups concurrently

Apply the same async/collect pattern to the load tasks so node1 and
node0 warm-up requests are in flight simultaneously. Saves roughly
half the load phase wall time (typically 10–15 min per run).

uri timeout = load_timeout × 4 covers up to 3 sequential model loads
per node (last model waits ≤ 2 × load_timeout in queue). async = ×5,
collect retries = (load_timeout × 5 / 15) + 5 with 15 s poll interval.
Shaun Arman 4 дня назад
Родитель
Commit
e4e09ba434
1 изменённый файл: 50 добавлено, 27 удалено
  1. 50 27
      playbooks/_bench_tier_batch.yml

+ 50 - 27
playbooks/_bench_tier_batch.yml

@@ -10,18 +10,20 @@
 #   bench_all_results    — list of uri result dicts
 #   all_eligible_models  — list of model names that passed load
 #
-# Concurrency design:
-#   Loads are sequential (disk I/O: node1 then node0).
-#   Benchmarks are concurrent: all node1 prompts are fired into Ollama's
-#   queue (async, poll:0), then all node0 prompts are fired immediately
-#   after. Both nodes drain their queues simultaneously.
-#   OLLAMA_SCHED_SPREAD=false (default) ensures each node processes its
-#   models one at a time in queue order, so scores are not cross-model
-#   contaminated. uri timeout accounts for full queue-drain wait time.
+# Concurrency design (load and benchmark phases both use async):
+#   Load:      node1 and node0 warm-up "Hi" prompts fire simultaneously.
+#              Within each node Ollama still loads one model at a time,
+#              but both nodes drain their queues in parallel.
+#   Benchmark: same pattern — both nodes' prompt queues drain together.
+#   uri timeout covers the full queue-drain wait (see inline comments).
+#   loop_var: _async_job on collect tasks preserves the uri module's
+#   item=[model, ...] field so downstream tasks need no structural changes.
 
-# ── Load models into RAM ──────────────────────────────────────────────────────
+# ── Load models into RAM (both nodes concurrently) ────────────────────────────
+# 3 models per node, sequential within each node → last model waits for 2
+# ahead: max load wait ≤ 2 × load_timeout. Use load_timeout × 4 for margin.
 
-- name: "Benchmark | Load node1 models into RAM"
+- name: "Benchmark | Load node1 models into RAM (async)"
   ansible.builtin.uri:
     url: "http://localhost:11434/api/generate"
     method: POST
@@ -32,16 +34,17 @@
       stream: false
     headers:
       Authorization: "Bearer {{ ollama_api_key }}"
-    timeout: "{{ benchmark_load_timeout }}"
+    timeout: "{{ (benchmark_load_timeout | int) * 4 }}"
     status_code: 200
   loop: "{{ _batch_node1 }}"
   loop_control:
-    loop_var: item
     label: "node1 load: {{ item }}"
-  register: _load_node1
+  async: "{{ (benchmark_load_timeout | int) * 5 }}"
+  poll: 0
+  register: _load_node1_jobs
   failed_when: false
 
-- name: "Benchmark | Load node0 models into RAM"
+- name: "Benchmark | Load node0 models into RAM (async)"
   ansible.builtin.uri:
     url: "http://localhost:{{ ollama_node0_port }}/api/generate"
     method: POST
@@ -52,13 +55,40 @@
       stream: false
     headers:
       Authorization: "Bearer {{ ollama_api_key }}"
-    timeout: "{{ benchmark_load_timeout }}"
+    timeout: "{{ (benchmark_load_timeout | int) * 4 }}"
     status_code: 200
   loop: "{{ _batch_node0 }}"
   loop_control:
-    loop_var: item
     label: "node0 load: {{ item }}"
+  async: "{{ (benchmark_load_timeout | int) * 5 }}"
+  poll: 0
+  register: _load_node0_jobs
+  failed_when: false
+
+- name: "Benchmark | Collect node1 load results"
+  ansible.builtin.async_status:
+    jid: "{{ _async_job.ansible_job_id }}"
+  loop: "{{ _load_node1_jobs.results | default([]) }}"
+  loop_control:
+    loop_var: _async_job
+    label: "node1 load: {{ _async_job.item | default('?') }}"
+  register: _load_node1
+  until: _load_node1.finished
+  retries: "{{ ((benchmark_load_timeout | int) * 5 / 15) | int + 5 }}"
+  delay: 15
+  failed_when: false
+
+- name: "Benchmark | Collect node0 load results"
+  ansible.builtin.async_status:
+    jid: "{{ _async_job.ansible_job_id }}"
+  loop: "{{ _load_node0_jobs.results | default([]) }}"
+  loop_control:
+    loop_var: _async_job
+    label: "node0 load: {{ _async_job.item | default('?') }}"
   register: _load_node0
+  until: _load_node0.finished
+  retries: "{{ ((benchmark_load_timeout | int) * 5 / 15) | int + 5 }}"
+  delay: 15
   failed_when: false
 
 # ── Identify successfully loaded models ───────────────────────────────────────
@@ -69,11 +99,10 @@
     _eligible_node0: "{{ _load_node0.results | selectattr('status', 'equalto', 200) | map(attribute='item') | list }}"
 
 # ── Fire benchmark prompts at both nodes concurrently ─────────────────────────
-# uri timeout = large_timeout × 15 covers the full queue-drain wait:
-#   worst case 3 models × 7 prompts = 21 requests, 2 parallel →
-#   last request waits ≤ 10 × large_timeout = 12 000 s < 18 000 s (15×).
-# async timeout must exceed uri timeout; use 18× for headroom.
-# Collect retries × delay must exceed async timeout: 730 × 30 = 21 900 s.
+# uri timeout = large_timeout × 15 (18 000 s) covers the full queue-drain
+# wait: worst case 21 queued requests, 2 parallel, 1 200 s each →
+# max queue wait ≈ 12 000 s < 18 000 s. async = ×18, collect retries
+# 730 × 30 s = 21 900 s of headroom.
 
 - name: "Benchmark | Fire test prompts at node1 (async)"
   ansible.builtin.uri:
@@ -117,12 +146,6 @@
   register: _bench_node0_jobs
   failed_when: false
 
-# ── Collect results (both queues are draining in parallel during these tasks) ──
-# loop_var: _async_job prevents Ansible from overwriting the `item` field in
-# each collected result. The uri module stores item=[model, prompt] in the
-# async job file; async_status surfaces it directly so the compute task can
-# use result.item[0] / result.item[1] without any structural changes.
-
 - name: "Benchmark | Collect node1 results"
   ansible.builtin.async_status:
     jid: "{{ _async_job.ansible_job_id }}"