---
# playbooks/_bench_tier_batch.yml
# Included by 03_benchmark.yml once per batch of up to 6 models.
#
# Expected vars (passed via include_tasks vars:):
#   _batch_node1 — list of 0–3 model names for port 11434
#   _batch_node0 — list of 0–3 model names for port 11435
#
# Mutates host facts (accumulated across batches):
#   bench_all_results   — list of uri result dicts
#   all_eligible_models — list of model names that passed load
#
# Concurrency design:
#   Load:      node1 and node0 warm-up "Hi" prompts fire simultaneously (async).
#              Within each node Ollama still loads one model at a time,
#              but both nodes drain their queues in parallel.
#   Benchmark: sequential (synchronous uri), one request at a time per node.
#              Node1 drains fully, then node0. No queue contamination; each
#              request gets a full idle inference slot and clean eval_duration.

# ── Load models into RAM (both nodes concurrently) ────────────────────────────
# 3 models per node, sequential within each node → last model waits for 2
# ahead: max load wait ≤ 2 × load_timeout. Use load_timeout × 4 for margin.
# Fire-and-forget (poll: 0) warm-up request per model; completion is
# collected later via async_status. failed_when: false so one bad model
# does not abort the batch.
- name: "Benchmark | Load node1 models into RAM (async)"
  ansible.builtin.uri:
    url: "http://localhost:11434/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item }}"
      prompt: "Hi"
      stream: false
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    # ×4: up to 2 models queued ahead within the node, plus margin.
    timeout: "{{ (benchmark_load_timeout | int) * 4 }}"
    status_code: 200
  loop: "{{ _batch_node1 }}"
  loop_control:
    label: "node1 load: {{ item }}"
  # Async window slightly larger than the uri timeout so the job can
  # finish (or time out) on its own before the job record expires.
  async: "{{ (benchmark_load_timeout | int) * 5 }}"
  poll: 0
  register: _load_node1_jobs
  failed_when: false
# Same warm-up pattern as node1, aimed at node0's port; both nodes drain
# their load queues in parallel.
- name: "Benchmark | Load node0 models into RAM (async)"
  ansible.builtin.uri:
    url: "http://localhost:{{ ollama_node0_port }}/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item }}"
      prompt: "Hi"
      stream: false
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    # ×4: up to 2 models queued ahead within the node, plus margin.
    timeout: "{{ (benchmark_load_timeout | int) * 4 }}"
    status_code: 200
  loop: "{{ _batch_node0 }}"
  loop_control:
    label: "node0 load: {{ item }}"
  # Async window slightly larger than the uri timeout (see node1 task).
  async: "{{ (benchmark_load_timeout | int) * 5 }}"
  poll: 0
  register: _load_node0_jobs
  failed_when: false
# Poll each async warm-up job until it finishes. retries × delay covers
# the full async window (×5 timeout) with headroom; failed_when: false
# keeps load failures from aborting the batch — they are filtered out in
# the "Identify loaded models" step instead.
- name: "Benchmark | Collect node1 load results"
  ansible.builtin.async_status:
    jid: "{{ _async_job.ansible_job_id }}"
  loop: "{{ _load_node1_jobs.results | default([]) }}"
  loop_control:
    loop_var: _async_job
    label: "node1 load: {{ _async_job.item | default('?') }}"
  register: _load_node1
  until: _load_node1.finished
  retries: "{{ ((benchmark_load_timeout | int) * 5 / 15) | int + 5 }}"
  delay: 15
  failed_when: false
# Mirror of the node1 collection task for node0's warm-up jobs.
- name: "Benchmark | Collect node0 load results"
  ansible.builtin.async_status:
    jid: "{{ _async_job.ansible_job_id }}"
  loop: "{{ _load_node0_jobs.results | default([]) }}"
  loop_control:
    loop_var: _async_job
    label: "node0 load: {{ _async_job.item | default('?') }}"
  register: _load_node0
  until: _load_node0.finished
  retries: "{{ ((benchmark_load_timeout | int) * 5 / 15) | int + 5 }}"
  delay: 15
  failed_when: false
# ── Identify successfully loaded models ───────────────────────────────────────
# A model is eligible iff its warm-up request returned HTTP 200. Each
# async_status result carries the originating loop item under the
# loop_var name (_async_job.item = model name). default([]) guards the
# empty-batch case where the register never ran a loop.
- name: "Benchmark | Identify loaded models in batch"
  ansible.builtin.set_fact:
    _eligible_node1: "{{ _load_node1.results | default([]) | selectattr('status', 'equalto', 200) | map(attribute='_async_job') | map(attribute='item') | list }}"
    _eligible_node0: "{{ _load_node0.results | default([]) | selectattr('status', 'equalto', 200) | map(attribute='_async_job') | map(attribute='item') | list }}"
# ── Fire benchmark prompts sequentially (one request at a time per node) ──────
# Sequential firing ensures each request hits an idle Ollama inference slot:
# no queue contamination, full CPU budget per request, clean eval_duration.
# Node1 then node0 run back-to-back; concurrent load phase above is unchanged.
- name: "Benchmark | Fire test prompts at node1"
  ansible.builtin.uri:
    url: "http://localhost:11434/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item.0 }}"
      prompt: "{{ test_prompts[item.1].prompt }}"
      stream: false
      # temperature 0 + fixed seed → deterministic, comparable runs.
      options:
        num_predict: "{{ benchmark_num_predict | int }}"
        temperature: 0
        seed: 42
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    timeout: "{{ (benchmark_large_timeout | int) }}"
    status_code: 200
  # Cartesian product: every eligible model × every test prompt key.
  loop: "{{ _eligible_node1 | product(test_prompts.keys() | list) | list }}"
  loop_control:
    label: "{{ item.0 }} / {{ item.1 }}"
  register: _bench_node1
  failed_when: false
# Mirror of the node1 benchmark task; runs only after node1 has fully
# drained, so each node benchmarks on an otherwise idle host.
- name: "Benchmark | Fire test prompts at node0"
  ansible.builtin.uri:
    url: "http://localhost:{{ ollama_node0_port }}/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item.0 }}"
      prompt: "{{ test_prompts[item.1].prompt }}"
      stream: false
      # temperature 0 + fixed seed → deterministic, comparable runs.
      options:
        num_predict: "{{ benchmark_num_predict | int }}"
        temperature: 0
        seed: 42
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    timeout: "{{ (benchmark_large_timeout | int) }}"
    status_code: 200
  # Cartesian product: every eligible model × every test prompt key.
  loop: "{{ _eligible_node0 | product(test_prompts.keys() | list) | list }}"
  loop_control:
    label: "{{ item.0 }} / {{ item.1 }}"
  register: _bench_node0
  failed_when: false
# ── Accumulate results into play-scoped facts ─────────────────────────────────
# Synchronous uri populates result.item = [model, prompt_key] at top level —
# no _async_job indirection needed; compute task in 03_benchmark.yml unchanged.
# default([]) lets this include also run standalone before the caller has
# seeded bench_all_results.
- name: "Benchmark | Accumulate node1 results"
  ansible.builtin.set_fact:
    bench_all_results: "{{ (bench_all_results | default([])) + [item] }}"
  loop: "{{ _bench_node1.results | default([]) }}"
  loop_control:
    label: "{{ (item.item | default(['?', '?']))[0] }} / {{ (item.item | default(['?', '?']))[1] }}"
# Mirror of the node1 accumulation task for node0's benchmark results.
- name: "Benchmark | Accumulate node0 results"
  ansible.builtin.set_fact:
    bench_all_results: "{{ (bench_all_results | default([])) + [item] }}"
  loop: "{{ _bench_node0.results | default([]) }}"
  loop_control:
    label: "{{ (item.item | default(['?', '?']))[0] }} / {{ (item.item | default(['?', '?']))[1] }}"
# Append this batch's eligible models to the play-scoped running list.
# default([]) guards first use if the caller did not pre-seed the fact.
- name: "Benchmark | Accumulate eligible models"
  ansible.builtin.set_fact:
    all_eligible_models: "{{ (all_eligible_models | default([])) + _eligible_node1 + _eligible_node0 }}"