---
# playbooks/_bench_tier_batch.yml
# Included by 03_benchmark.yml once per batch of up to 6 models.
#
# Expected vars (passed via include_tasks vars:):
#   _batch_node1 — list of 0–3 model names for port 11434
#   _batch_node0 — list of 0–3 model names for port 11435
#
# Mutates host facts (accumulated across batches):
#   bench_all_results   — list of uri result dicts
#   all_eligible_models — list of model names that passed load
#
# Concurrency design:
#   Load:      node1 and node0 warm-up "Hi" prompts fire simultaneously (async).
#              Within each node Ollama still loads one model at a time,
#              but both nodes drain their queues in parallel.
#   Benchmark: sequential (synchronous uri), one request at a time per node.
#              Node1 drains fully, then node0. No queue contamination; each
#              request gets a full idle inference slot and clean eval_duration.

# ── Load models into RAM (both nodes concurrently) ────────────────────────────
# 3 models per node, sequential within each node → last model waits for 2
# ahead: max load wait ≤ 2 × load_timeout. Use load_timeout × 4 for margin.
- name: "Benchmark | Load node1 models into RAM (async)"
  # Fire one warm-up "Hi" generation per model so Ollama loads the weights.
  # poll: 0 launches every request without waiting — both nodes warm up
  # concurrently; results are reaped by the async_status task below.
  # NOTE(review): node1's port is hardcoded (11434) while node0 uses
  # {{ ollama_node0_port }} — presumably an ollama_node1_port var exists; confirm.
  ansible.builtin.uri:
    url: "http://localhost:11434/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item }}"
      prompt: "Hi"
      stream: false
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    # Per-request HTTP timeout; ×4 covers up to 2 models queued ahead plus margin.
    timeout: "{{ (benchmark_load_timeout | int) * 4 }}"
    status_code: 200
  loop: "{{ _batch_node1 }}"
  loop_control:
    label: "node1 load: {{ item }}"
  # Async job lifetime must exceed the uri timeout, hence ×5 > ×4.
  async: "{{ (benchmark_load_timeout | int) * 5 }}"
  poll: 0
  register: _load_node1_jobs
  # Load failures are tolerated here; eligibility is decided from the
  # collected statuses later, so a bad model must not abort the batch.
  failed_when: false
- name: "Benchmark | Load node0 models into RAM (async)"
  # Mirror of the node1 load task, targeting the node0 port. Launched
  # immediately after node1's async batch, so both nodes warm up in parallel.
  ansible.builtin.uri:
    url: "http://localhost:{{ ollama_node0_port }}/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item }}"
      prompt: "Hi"
      stream: false
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    # Per-request HTTP timeout; ×4 covers up to 2 models queued ahead plus margin.
    timeout: "{{ (benchmark_load_timeout | int) * 4 }}"
    status_code: 200
  loop: "{{ _batch_node0 }}"
  loop_control:
    label: "node0 load: {{ item }}"
  # Async job lifetime must exceed the uri timeout, hence ×5 > ×4.
  async: "{{ (benchmark_load_timeout | int) * 5 }}"
  poll: 0
  register: _load_node0_jobs
  # Tolerate load failures; eligibility is filtered from statuses later.
  failed_when: false
- name: "Benchmark | Collect node1 load results"
  # Poll each fired async job until it reports finished. Each loop result
  # carries the uri module's return (e.g. status) merged in once done, plus
  # the original loop item under the loop_var name (_async_job.item = model).
  ansible.builtin.async_status:
    jid: "{{ _async_job.ansible_job_id }}"
  loop: "{{ _load_node1_jobs.results | default([]) }}"
  loop_control:
    # Custom loop_var avoids clobbering `item` inside the registered results.
    loop_var: _async_job
    label: "node1 load: {{ _async_job.item | default('?') }}"
  register: _load_node1
  until: _load_node1.finished
  # retries × delay ≈ async lifetime (load_timeout × 5), plus 5 spare polls.
  retries: "{{ ((benchmark_load_timeout | int) * 5 / 15) | int + 5 }}"
  delay: 15
  # A job that never finishes must not abort the batch; it is simply
  # filtered out of the eligible set below.
  failed_when: false
- name: "Benchmark | Collect node0 load results"
  # Mirror of the node1 collection task for node0's async load jobs.
  ansible.builtin.async_status:
    jid: "{{ _async_job.ansible_job_id }}"
  loop: "{{ _load_node0_jobs.results | default([]) }}"
  loop_control:
    # Custom loop_var avoids clobbering `item` inside the registered results.
    loop_var: _async_job
    label: "node0 load: {{ _async_job.item | default('?') }}"
  register: _load_node0
  until: _load_node0.finished
  # retries × delay ≈ async lifetime (load_timeout × 5), plus 5 spare polls.
  retries: "{{ ((benchmark_load_timeout | int) * 5 / 15) | int + 5 }}"
  delay: 15
  # A job that never finishes must not abort the batch; it is filtered out
  # of the eligible set below.
  failed_when: false
  87. # ── Identify successfully loaded models ───────────────────────────────────────
  88. - name: "Benchmark | Identify loaded models in batch"
  89. ansible.builtin.set_fact:
  90. _eligible_node1: "{{ _load_node1.results | selectattr('status', 'equalto', 200) | map(attribute='_async_job') | map(attribute='item') | list }}"
  91. _eligible_node0: "{{ _load_node0.results | selectattr('status', 'equalto', 200) | map(attribute='_async_job') | map(attribute='item') | list }}"
# ── Fire benchmark prompts sequentially (one request at a time per node) ──────
# Sequential firing ensures each request hits an idle Ollama inference slot:
# no queue contamination, full CPU budget per request, clean eval_duration.
# Node1 then node0 run back-to-back; concurrent load phase above is unchanged.
- name: "Benchmark | Fire test prompts at node1"
  ansible.builtin.uri:
    url: "http://localhost:11434/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item.0 }}"
      prompt: "{{ test_prompts[item.1].prompt }}"
      stream: false
      options:
        # Deterministic decoding: capped token budget, greedy (temp 0),
        # fixed seed — keeps eval_duration comparable across runs.
        num_predict: "{{ benchmark_num_predict | int }}"
        temperature: 0
        seed: 42
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    timeout: "{{ (benchmark_large_timeout | int) }}"
    status_code: 200
  # Cartesian product: every eligible model × every prompt key.
  # item.0 = model name, item.1 = test_prompts key.
  loop: "{{ _eligible_node1 | product(test_prompts.keys() | list) | list }}"
  loop_control:
    label: "{{ item.0 }} / {{ item.1 }}"
  register: _bench_node1
  # Individual prompt failures are recorded in the results, not fatal.
  failed_when: false
- name: "Benchmark | Fire test prompts at node0"
  # Mirror of the node1 benchmark task; runs only after node1 has fully
  # drained, so each request still gets an idle inference slot.
  ansible.builtin.uri:
    url: "http://localhost:{{ ollama_node0_port }}/api/generate"
    method: POST
    body_format: json
    body:
      model: "{{ item.0 }}"
      prompt: "{{ test_prompts[item.1].prompt }}"
      stream: false
      options:
        # Deterministic decoding: capped token budget, greedy (temp 0),
        # fixed seed — keeps eval_duration comparable across runs.
        num_predict: "{{ benchmark_num_predict | int }}"
        temperature: 0
        seed: 42
    headers:
      Authorization: "Bearer {{ ollama_api_key }}"
    timeout: "{{ (benchmark_large_timeout | int) }}"
    status_code: 200
  # Cartesian product: every eligible model × every prompt key.
  # item.0 = model name, item.1 = test_prompts key.
  loop: "{{ _eligible_node0 | product(test_prompts.keys() | list) | list }}"
  loop_control:
    label: "{{ item.0 }} / {{ item.1 }}"
  register: _bench_node0
  # Individual prompt failures are recorded in the results, not fatal.
  failed_when: false
  140. # ── Accumulate results into play-scoped facts ─────────────────────────────────
  141. # Synchronous uri populates result.item = [model, prompt_key] at top level —
  142. # no _async_job indirection needed; compute task in 03_benchmark.yml unchanged.
  143. - name: "Benchmark | Accumulate node1 results"
  144. ansible.builtin.set_fact:
  145. bench_all_results: "{{ bench_all_results + [item] }}"
  146. loop: "{{ _bench_node1.results | default([]) }}"
  147. loop_control:
  148. label: "{{ (item.item | default(['?', '?']))[0] }} / {{ (item.item | default(['?', '?']))[1] }}"
  149. - name: "Benchmark | Accumulate node0 results"
  150. ansible.builtin.set_fact:
  151. bench_all_results: "{{ bench_all_results + [item] }}"
  152. loop: "{{ _bench_node0.results | default([]) }}"
  153. loop_control:
  154. label: "{{ (item.item | default(['?', '?']))[0] }} / {{ (item.item | default(['?', '?']))[1] }}"
- name: "Benchmark | Accumulate eligible models"
  # Extend the play-scoped fact with this batch's models that passed load
  # on either node; consumed by 03_benchmark.yml after all batches run.
  ansible.builtin.set_fact:
    all_eligible_models: "{{ all_eligible_models + _eligible_node1 + _eligible_node0 }}"