sarman
/
tftsr_ai


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
							---
# playbooks/_bench_tier_batch.yml
# Included by 03_benchmark.yml once per batch of up to 6 models.
#
# Expected vars (passed via include_tasks vars:):
#   _batch_node1  — list of 0–3 model names for port 11434
#   _batch_node0  — list of 0–3 model names for port 11435
#
# Mutates host facts (accumulated across batches):
#   bench_all_results    — list of uri result dicts
#   all_eligible_models  — list of model names that passed load
#
# Concurrency design:
#   Load:      node1 and node0 warm-up "Hi" prompts fire simultaneously (async).
#              Within each node Ollama still loads one model at a time,
#              but both nodes drain their queues in parallel.
#   Benchmark: sequential (synchronous uri), one request at a time per node.
#              Node1 drains fully, then node0. No queue contamination; each
#              request gets a full idle inference slot and clean eval_duration.

# ── Load models into RAM (both nodes concurrently) ────────────────────────────
# 3 models per node, sequential within each node → last model waits for 2
# ahead: max load wait ≤ 2 × load_timeout. Use load_timeout × 4 for margin.

- name: "Benchmark | Load node1 models into RAM (async)"
  ansible.builtin.uri:
    url: "http://localhost:11434/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item }}"
      prompt: "Hi"
      stream: false
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    timeout: "{{ (benchmark_load_timeout | int) * 4 }}"
    status_code: 200
  loop: "{{ _batch_node1 }}"
  loop_control:
    label: "node1 load: {{ item }}"
  async: "{{ (benchmark_load_timeout | int) * 5 }}"
  poll: 0
  register: _load_node1_jobs
  failed_when: false

- name: "Benchmark | Load node0 models into RAM (async)"
  ansible.builtin.uri:
    url: "http://localhost:{{ ollama_node0_port }}/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item }}"
      prompt: "Hi"
      stream: false
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    timeout: "{{ (benchmark_load_timeout | int) * 4 }}"
    status_code: 200
  loop: "{{ _batch_node0 }}"
  loop_control:
    label: "node0 load: {{ item }}"
  async: "{{ (benchmark_load_timeout | int) * 5 }}"
  poll: 0
  register: _load_node0_jobs
  failed_when: false

- name: "Benchmark | Collect node1 load results"
  ansible.builtin.async_status:
    jid: "{{ _async_job.ansible_job_id }}"
  loop: "{{ _load_node1_jobs.results | default([]) }}"
  loop_control:
    loop_var: _async_job
    label: "node1 load: {{ _async_job.item | default('?') }}"
  register: _load_node1
  until: _load_node1.finished
  retries: "{{ ((benchmark_load_timeout | int) * 5 / 15) | int + 5 }}"
  delay: 15
  failed_when: false

- name: "Benchmark | Collect node0 load results"
  ansible.builtin.async_status:
    jid: "{{ _async_job.ansible_job_id }}"
  loop: "{{ _load_node0_jobs.results | default([]) }}"
  loop_control:
    loop_var: _async_job
    label: "node0 load: {{ _async_job.item | default('?') }}"
  register: _load_node0
  until: _load_node0.finished
  retries: "{{ ((benchmark_load_timeout | int) * 5 / 15) | int + 5 }}"
  delay: 15
  failed_when: false

# ── Identify successfully loaded models ───────────────────────────────────────

- name: "Benchmark | Identify loaded models in batch"
  ansible.builtin.set_fact:
    _eligible_node1: "{{ _load_node1.results | selectattr('status', 'equalto', 200) | map(attribute='_async_job') | map(attribute='item') | list }}"
    _eligible_node0: "{{ _load_node0.results | selectattr('status', 'equalto', 200) | map(attribute='_async_job') | map(attribute='item') | list }}"

# ── Fire benchmark prompts sequentially (one request at a time per node) ──────
# Sequential firing ensures each request hits an idle Ollama inference slot:
# no queue contamination, full CPU budget per request, clean eval_duration.
# Node1 then node0 run back-to-back; concurrent load phase above is unchanged.

- name: "Benchmark | Fire test prompts at node1"
  ansible.builtin.uri:
    url: "http://localhost:11434/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item.0 }}"
      prompt: "{{ test_prompts[item.1].prompt }}"
      stream: false
      options:
        num_predict: "{{ benchmark_num_predict | int }}"
        temperature: 0
        seed: 42
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    timeout: "{{ (benchmark_large_timeout | int) }}"
    status_code: 200
  loop: "{{ _eligible_node1 | product(test_prompts.keys() | list) | list }}"
  loop_control:
    label: "{{ item.0 }} / {{ item.1 }}"
  register: _bench_node1
  failed_when: false

- name: "Benchmark | Fire test prompts at node0"
  ansible.builtin.uri:
    url: "http://localhost:{{ ollama_node0_port }}/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item.0 }}"
      prompt: "{{ test_prompts[item.1].prompt }}"
      stream: false
      options:
        num_predict: "{{ benchmark_num_predict | int }}"
        temperature: 0
        seed: 42
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    timeout: "{{ (benchmark_large_timeout | int) }}"
    status_code: 200
  loop: "{{ _eligible_node0 | product(test_prompts.keys() | list) | list }}"
  loop_control:
    label: "{{ item.0 }} / {{ item.1 }}"
  register: _bench_node0
  failed_when: false

# ── Accumulate results into play-scoped facts ─────────────────────────────────
# Synchronous uri populates result.item = [model, prompt_key] at top level —
# no _async_job indirection needed; compute task in 03_benchmark.yml unchanged.

- name: "Benchmark | Accumulate node1 results"
  ansible.builtin.set_fact:
    bench_all_results: "{{ bench_all_results + [item] }}"
  loop: "{{ _bench_node1.results | default([]) }}"
  loop_control:
    label: "{{ (item.item | default(['?', '?']))[0] }} / {{ (item.item | default(['?', '?']))[1] }}"

- name: "Benchmark | Accumulate node0 results"
  ansible.builtin.set_fact:
    bench_all_results: "{{ bench_all_results + [item] }}"
  loop: "{{ _bench_node0.results | default([]) }}"
  loop_control:
    label: "{{ (item.item | default(['?', '?']))[0] }} / {{ (item.item | default(['?', '?']))[1] }}"

- name: "Benchmark | Accumulate eligible models"
  ansible.builtin.set_fact:
    all_eligible_models: "{{ all_eligible_models + _eligible_node1 + _eligible_node0 }}"