---
# playbooks/03_benchmark.yml
# Benchmark installed Ollama models and select optimal models for each slot.
#
# NOTE(review): threshold/timeout knobs (benchmark_thresholds,
# benchmark_small_max_gb, benchmark_*_timeout, ollama_node0_port, ...) are
# expected from inventory/group vars — confirm they are defined there.
- name: "Benchmark | Evaluate Ollama models"
  hosts: ai_server
  become: false
  gather_facts: true
  tags:
    - benchmark
  vars:
    # Optional comma-separated subset of models to benchmark; empty = all.
    benchmark_models: ""
    # When true, pull a recommended model if nothing installed meets the bar.
    pull_if_better: false
    min_composite_score: "{{ benchmark_thresholds.min_composite_score }}"
    ollama_api_url: "http://localhost:11434"
    # API key resolved from Vault at runtime; never stored in the repo.
    ollama_api_key: "{{ lookup('community.hashi_vault.hashi_vault', vault_secret_prefix ~ '/ollama:api_key token=' ~ lookup('ansible.builtin.file', vault_token_file) ~ ' url=' ~ vault_url) }}"
    # Reports are written on the control node, next to the playbooks tree.
    benchmark_results_dir: "{{ playbook_dir }}/../benchmarks/results"
  17. test_prompts:
  18. code_gen:
  19. prompt: "Write a Python merge sort with type hints, docstring, and 3 unit tests"
  20. category: coding
  21. weight: 1.0
  22. debug:
  23. prompt: >-
  24. Here is a Python function with 3 bugs. Find and fix all bugs:
  25. def calculate_average(numbers):
  26. total = 0
  27. for n in numbers:
  28. total =+ n
  29. average = total / len(numbers
  30. return averege
  31. category: coding
  32. weight: 1.0
  33. refactor:
  34. prompt: >-
  35. Refactor this for readability and performance:
  36. def f(l):
  37. r=[]
  38. for i in range(len(l)):
  39. if l[i]%2==0:
  40. r.append(l[i]*2)
  41. return r
  42. category: coding
  43. weight: 1.0
  44. explain:
  45. prompt: "Explain how Python's GIL works and when it matters"
  46. category: general
  47. weight: 1.0
  48. creative:
  49. prompt: "Suggest 5 fun family activities for a rainy weekend"
  50. category: general
  51. weight: 1.0
  52. reasoning:
  53. prompt: "I have 3 apples. I give away half. Then I get 4 more. How many do I have?"
  54. category: general
  55. weight: 1.0
  56. latency:
  57. prompt: "Hi"
  58. category: latency
  59. weight: 0.5
  tasks:
    # Results land on the control node, so create the directory there.
    - name: "Benchmark | Ensure results directory exists on control node"
      ansible.builtin.file:
        path: "{{ benchmark_results_dir }}"
        state: directory
        mode: "0755"
      delegate_to: localhost
      tags:
        - benchmark-setup
  69. - name: "Benchmark | Wait for Ollama API to be ready"
  70. ansible.builtin.uri:
  71. url: "http://localhost:11434/api/tags"
  72. method: GET
  73. status_code: 200
  74. timeout: 10
  75. register: ollama_ready
  76. retries: 24
  77. delay: 5
  78. until: ollama_ready.status == 200
  79. tags:
  80. - benchmark-discover
  81. - name: "Benchmark | Stop warmup services for clean benchmark run"
  82. ansible.builtin.systemd:
  83. name: "{{ item }}"
  84. state: stopped
  85. loop:
  86. - ollama-warmup.service
  87. - ollama-warmup-node0.service
  88. failed_when: false
  89. become: true
  90. tags:
  91. - benchmark-setup
  92. - name: "Benchmark | Wait for node0 Ollama API to be ready"
  93. ansible.builtin.uri:
  94. url: "http://localhost:{{ ollama_node0_port }}/api/tags"
  95. method: GET
  96. status_code: 200
  97. timeout: 10
  98. register: ollama_node0_ready
  99. retries: 24
  100. delay: 5
  101. until: ollama_node0_ready.status == 200
  102. tags:
  103. - benchmark-setup
  104. - name: "Benchmark | Discover installed models"
  105. ansible.builtin.command: ollama list
  106. changed_when: false
  107. register: ollama_list_output
  108. retries: 6
  109. delay: 10
  110. until: ollama_list_output.rc == 0
  111. tags:
  112. - benchmark-discover
  113. - name: "Benchmark | Parse model names from ollama list"
  114. ansible.builtin.set_fact:
  115. installed_models: "{{ ollama_list_output.stdout_lines[1:] | map('split') | map('first') | list }}"
  116. tags:
  117. - benchmark-discover
  118. - name: "Benchmark | Parse model sizes from ollama list"
  119. ansible.builtin.set_fact:
  120. _benchmark_sizes_json: |
  121. {% set ns = namespace(d={}) %}
  122. {% for line in ollama_list_output.stdout_lines[1:] %}
  123. {% set p = line.split() %}
  124. {% if p | length >= 4 %}
  125. {% set gb = (p[2] | float) if (p[3] | upper == 'GB') else ((p[2] | float) / 1024) %}
  126. {% set _ = ns.d.update({p[0]: gb}) %}
  127. {% endif %}
  128. {% endfor %}
  129. {{ ns.d | to_json }}
  130. tags:
  131. - benchmark-discover
  132. - name: "Benchmark | Partition models into small, medium, and large passes"
  133. ansible.builtin.set_fact:
  134. _small_models: "{{ _alias_filtered | select('in', _small_ok) | list }}"
  135. _medium_models: "{{ _alias_filtered | select('in', _medium_ok) | list }}"
  136. _large_models: "{{ _alias_filtered | reject('in', _small_ok) | reject('in', _medium_ok) | list }}"
  137. models_to_benchmark: "{{ _alias_filtered | list }}"
  138. vars:
  139. _sizes: "{{ _benchmark_sizes_json | from_json }}"
  140. _small_cut: "{{ (benchmark_small_max_gb | float) / (benchmark_size_overhead_factor | float) }}"
  141. _medium_cut: "{{ (benchmark_medium_max_gb | float) / (benchmark_size_overhead_factor | float) }}"
  142. _small_ok: "{{ _sizes | dict2items | selectattr('value', 'le', _small_cut | float) | map(attribute='key') | list }}"
  143. _medium_ok: "{{ _sizes | dict2items | selectattr('value', 'gt', _small_cut | float)
  144. | selectattr('value', 'le', _medium_cut | float)
  145. | map(attribute='key') | list }}"
  146. _alias_filtered: "{{ installed_models | reject('match', '^(' ~ benchmark_skip_aliases | join('|') ~ ')(:|$)') | list }}"
  147. when: benchmark_models | default('') | length == 0
  148. tags:
  149. - benchmark-discover
  150. - name: "Benchmark | Set models_to_benchmark to specified subset"
  151. ansible.builtin.set_fact:
  152. models_to_benchmark: "{{ _specified }}"
  153. _small_models: "{{ _specified | select('in', _small_ok) | list }}"
  154. _medium_models: "{{ _specified | select('in', _medium_ok) | list }}"
  155. _large_models: "{{ _specified | reject('in', _small_ok) | reject('in', _medium_ok) | list }}"
  156. vars:
  157. _specified: "{{ benchmark_models.split(',') | map('trim') | list }}"
  158. _sizes: "{{ _benchmark_sizes_json | from_json }}"
  159. _small_cut: "{{ (benchmark_small_max_gb | float) / (benchmark_size_overhead_factor | float) }}"
  160. _medium_cut: "{{ (benchmark_medium_max_gb | float) / (benchmark_size_overhead_factor | float) }}"
  161. _small_ok: "{{ _sizes | dict2items | selectattr('value', 'le', _small_cut | float) | map(attribute='key') | list }}"
  162. _medium_ok: "{{ _sizes | dict2items | selectattr('value', 'gt', _small_cut | float)
  163. | selectattr('value', 'le', _medium_cut | float)
  164. | map(attribute='key') | list }}"
  165. when: benchmark_models | default('') | length > 0
  166. tags:
  167. - benchmark-discover
  168. - name: "Benchmark | Initialize batch accumulator facts"
  169. ansible.builtin.set_fact:
  170. bench_all_results: []
  171. all_eligible_models: []
  172. tags:
  173. - benchmark-discover
  174. - name: "Benchmark | Build per-model benchmark timeout map"
  175. ansible.builtin.set_fact:
  176. _benchmark_timeout_map_json: |
  177. {% set ns = namespace(d={}) %}
  178. {% for m in models_to_benchmark %}
  179. {% if m in _small_models %}
  180. {% set _ = ns.d.update({m: benchmark_small_timeout | int}) %}
  181. {% elif m in _medium_models %}
  182. {% set _ = ns.d.update({m: benchmark_medium_timeout | int}) %}
  183. {% else %}
  184. {% set _ = ns.d.update({m: benchmark_large_timeout | int}) %}
  185. {% endif %}
  186. {% endfor %}
  187. {{ ns.d | to_json }}
  188. tags:
  189. - benchmark-discover
  190. - name: "Benchmark | Parse benchmark timeout map"
  191. ansible.builtin.set_fact:
  192. _benchmark_timeout_map: "{{ _benchmark_timeout_map_json | from_json }}"
  193. tags:
  194. - benchmark-discover
  195. - name: "Benchmark | Sort models largest-first so heaviest models land on node1 (120 GB)"
  196. ansible.builtin.set_fact:
  197. models_to_benchmark: >-
  198. {{ (_large_models + _medium_models + _small_models)
  199. | select('in', models_to_benchmark) | list }}
  200. tags:
  201. - benchmark-discover
  202. - name: "Benchmark | Display models to benchmark"
  203. ansible.builtin.debug:
  204. msg:
  205. - "Small pass (timeout {{ benchmark_small_timeout }}s, ≤{{ benchmark_small_max_gb }}GB): {{ _small_models }}"
  206. - "Medium pass (timeout {{ benchmark_medium_timeout }}s, {{ benchmark_small_max_gb }}–{{ benchmark_medium_max_gb }}GB): {{ _medium_models }}"
  207. - "Large pass (timeout {{ benchmark_large_timeout }}s, >{{ benchmark_medium_max_gb }}GB): {{ _large_models }}"
  208. - "Load timeout (warm-up 'Hi' prompt): {{ benchmark_load_timeout }}s"
  209. - "Total: {{ models_to_benchmark | length }} models, {{ (models_to_benchmark | batch(6) | list) | length }} batch(es) of ≤6"
  210. tags:
  211. - benchmark-discover
  212. - name: "Benchmark | Process batch {{ _loop_idx + 1 }} of {{ models_to_benchmark | batch(6) | list | length }}"
  213. ansible.builtin.include_tasks: _bench_tier_batch.yml
  214. vars:
  215. _batch_node1: "{{ _batch[:3] }}"
  216. _batch_node0: "{{ _batch[3:] }}"
  217. loop: "{{ models_to_benchmark | batch(6) | list }}"
  218. loop_control:
  219. loop_var: _batch
  220. label: "batch {{ _loop_idx + 1 }}: node1={{ _batch[:3] }} node0={{ _batch[3:] }}"
  221. index_var: _loop_idx
  222. tags:
  223. - benchmark-run
  224. - name: "Benchmark | Display models that failed to load"
  225. ansible.builtin.debug:
  226. msg: "Load failures (excluded from scoring): {{ models_to_benchmark | reject('in', all_eligible_models) | list }}"
  227. tags:
  228. - benchmark-run
  229. - name: "Benchmark | Compute per-model metrics"
  230. ansible.builtin.set_fact:
  231. model_metrics: |
  232. {% set ns = namespace(results={}) %}
  233. {% for model in all_eligible_models %}
  234. {% set ns2 = namespace(coding_quality=0, coding_count=0, general_quality=0, general_count=0, total_toks=0, total_eval_time=0, ttft_sum=0, ttft_count=0, latency_ns=0) %}
  235. {% for result in bench_all_results %}
  236. {% if result.item[0] == model and result.status == 200 %}
  237. {% set test_name = result.item[1] %}
  238. {% set resp = result.json | default({}) %}
  239. {% set eval_count = resp.eval_count | default(0) | int %}
  240. {% set eval_duration = resp.eval_duration | default(1) | int %}
  241. {% set prompt_eval_duration = resp.prompt_eval_duration | default(0) | int %}
  242. {% set response_text = resp.response | default('') %}
  243. {% set tok_per_sec = (eval_count / (eval_duration / 1000000000.0)) if eval_duration > 0 else 0 %}
  244. {% set ns2.total_toks = ns2.total_toks + tok_per_sec %}
  245. {% set ns2.ttft_sum = ns2.ttft_sum + prompt_eval_duration %}
  246. {% set ns2.ttft_count = ns2.ttft_count + 1 %}
  247. {% if test_name == 'latency' %}
  248. {% set ns2.latency_ns = ((resp.total_duration | default(0) | int) - (resp.load_duration | default(0) | int)) | abs %}
  249. {% endif %}
  250. {% set resp_len = response_text | length %}
  251. {% if test_name in ['code_gen', 'debug', 'refactor'] %}
  252. {% set has_def = 1 if 'def ' in response_text else 0 %}
  253. {% set has_return = 1 if 'return' in response_text else 0 %}
  254. {% set has_assert = 1 if 'assert ' in response_text else 0 %}
  255. {% set has_test_def = 1 if 'def test_' in response_text else 0 %}
  256. {% set has_docstring = 1 if '"""' in response_text else 0 %}
  257. {% set has_type_hint = 1 if ' -> ' in response_text else 0 %}
  258. {% set has_code_block = 1 if '```' in response_text else 0 %}
  259. {% set has_import = 1 if ('import ' in response_text or 'from ' in response_text) else 0 %}
  260. {% if test_name == 'code_gen' %}
  261. {% set quality = (has_def * 0.20 + has_return * 0.20 + has_docstring * 0.15 + has_type_hint * 0.15 + has_code_block * 0.10 + has_assert * 0.08 + has_test_def * 0.07 + has_import * 0.05) %}
  262. {% elif test_name == 'debug' %}
  263. {% set quality = (has_def * 0.30 + has_return * 0.30 + has_code_block * 0.25 + has_assert * 0.15) %}
  264. {% else %}
  265. {% set quality = (has_def * 0.25 + has_return * 0.25 + has_code_block * 0.20 + has_type_hint * 0.15 + has_import * 0.15) %}
  266. {% endif %}
  267. {% set ns2.coding_quality = ns2.coding_quality + quality %}
  268. {% set ns2.coding_count = ns2.coding_count + 1 %}
  269. {% elif test_name in ['explain', 'creative', 'reasoning'] %}
  270. {% set length_score = [resp_len / 800.0, 1.0] | min %}
  271. {% set has_structure = 1 if ('\n' in response_text and resp_len > 100) else 0 %}
  272. {% set has_list = 1 if ('\n- ' in response_text or '\n* ' in response_text or '\n1.' in response_text) else 0 %}
  273. {% set has_detail = 1 if '\n\n' in response_text else 0 %}
  274. {% set quality = (length_score * 0.35 + has_structure * 0.40 + has_list * 0.15 + has_detail * 0.10) %}
  275. {% set ns2.general_quality = ns2.general_quality + quality %}
  276. {% set ns2.general_count = ns2.general_count + 1 %}
  277. {% endif %}
  278. {% endif %}
  279. {% endfor %}
  280. {% set coding_avg = (ns2.coding_quality / ns2.coding_count) if ns2.coding_count > 0 else 0 %}
  281. {% set general_avg = (ns2.general_quality / ns2.general_count) if ns2.general_count > 0 else 0 %}
  282. {% set test_count = (ns2.ttft_count) if ns2.ttft_count > 0 else 1 %}
  283. {% set avg_toks = ns2.total_toks / test_count %}
  284. {% set toks_norm = [avg_toks / benchmark_toks_norm_ceiling, 1.0] | min %}
  285. {% set latency_ms = ns2.latency_ns / 1000000.0 if ns2.latency_ns > 0 else 9999 %}
  286. {% set latency_score = [1.0 - (latency_ms / 5000.0), 0] | max %}
  287. {% set coding_composite = coding_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
  288. {% set general_composite = general_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
  289. {% set _override = (model_category_overrides | default({}))[model] | default('') %}
  290. {% if _override in ['coding', 'general'] %}
  291. {% set category = _override %}
  292. {% elif (coding_avg - general_avg) >= benchmark_coding_threshold %}
  293. {% set category = 'coding' %}
  294. {% elif 'coder' in model | lower or 'codestral' in model | lower or 'codellama' in model | lower or 'starcoder' in model | lower %}
  295. {% set category = 'coding' %}
  296. {% else %}
  297. {% set category = 'general' %}
  298. {% endif %}
  299. {% set _ = ns.results.update({model: {'coding_quality': coding_avg | round(3), 'general_quality': general_avg | round(3), 'avg_tok_per_sec': avg_toks | round(1), 'toks_norm': toks_norm | round(3), 'latency_ms': latency_ms | round(1), 'latency_score': latency_score | round(3), 'coding_composite': coding_composite | round(3), 'general_composite': general_composite | round(3), 'category': category}}) %}
  300. {% endfor %}
  301. {{ ns.results | to_json }}
  302. tags:
  303. - benchmark-compute
  304. - name: "Benchmark | Parse model metrics"
  305. ansible.builtin.set_fact:
  306. parsed_metrics: "{{ model_metrics | from_json }}"
  307. tags:
  308. - benchmark-compute
  309. - name: "Benchmark | Rank models and select slots"
  310. ansible.builtin.set_fact:
  311. model_selection: |
  312. {% set general_models = [] %}
  313. {% set coding_models = [] %}
  314. {% for model, metrics in parsed_metrics.items() %}
  315. {% if metrics.category == 'general' %}
  316. {% set _ = general_models.append({'name': model, 'composite': metrics.general_composite, 'metrics': metrics}) %}
  317. {% else %}
  318. {% set _ = coding_models.append({'name': model, 'composite': metrics.coding_composite, 'metrics': metrics}) %}
  319. {% endif %}
  320. {% endfor %}
  321. {% set general_sorted = general_models | sort(attribute='composite', reverse=true) %}
  322. {% set coding_sorted = coding_models | sort(attribute='composite', reverse=true) %}
  323. {% set slot1 = general_sorted[0].name if general_sorted | length > 0 else 'none' %}
  324. {% set slot2 = general_sorted[1].name if general_sorted | length > 1 else (general_sorted[0].name if general_sorted | length > 0 else 'none') %}
  325. {% set slot5 = general_sorted[2].name if general_sorted | length > 2 else 'none' %}
  326. {% set slot3 = coding_sorted[0].name if coding_sorted | length > 0 else (general_sorted[0].name if general_sorted | length > 0 else 'none') %}
  327. {% set slot4 = coding_sorted[1].name if coding_sorted | length > 1 else (coding_sorted[0].name if coding_sorted | length > 0 else 'none') %}
  328. {% set slot6 = coding_sorted[2].name if coding_sorted | length > 2 else 'none' %}
  329. {{ {'slot1_general': slot1, 'slot2_general': slot2, 'slot5_general_rotate': slot5,
  330. 'slot3_coding': slot3, 'slot4_coding': slot4, 'slot6_coding_rotate': slot6,
  331. 'all_metrics': parsed_metrics, 'general_ranking': general_sorted,
  332. 'coding_ranking': coding_sorted} | to_json }}
  333. tags:
  334. - benchmark-select
  335. - name: "Benchmark | Parse model selection"
  336. ansible.builtin.set_fact:
  337. selection: "{{ model_selection | from_json }}"
  338. tags:
  339. - benchmark-select
  340. - name: "Benchmark | Display model selection results"
  341. ansible.builtin.debug:
  342. msg:
  343. - "============================================="
  344. - " MODEL SELECTION RESULTS (6-slot / 2-socket)"
  345. - "============================================="
  346. - " Node 1 — General (port 11434)"
  347. - " Slot 1 (locked): {{ selection.slot1_general }}"
  348. - " Slot 2 (locked): {{ selection.slot2_general }}"
  349. - " Slot 5 (rotate): {{ selection.slot5_general_rotate }}"
  350. - " Node 0 — Coding (port 11435)"
  351. - " Slot 3 (locked): {{ selection.slot3_coding }}"
  352. - " Slot 4 (locked): {{ selection.slot4_coding }}"
  353. - " Slot 6 (rotate): {{ selection.slot6_coding_rotate }}"
  354. - "============================================="
  355. tags:
  356. - benchmark-select
  357. - name: "Benchmark | Generate timestamp"
  358. ansible.builtin.set_fact:
  359. benchmark_timestamp: "{{ ansible_date_time.iso8601_basic_short }}"
  360. tags:
  361. - benchmark-report
  362. - name: "Benchmark | Save benchmark results markdown"
  363. ansible.builtin.copy:
  364. content: |
  365. # Benchmark Results - {{ benchmark_timestamp }}
  366. ## Model Selection (6-slot / 2-socket)
  367. | Slot | Socket | Role | Model | Composite Score |
  368. |------|--------|------|-------|----------------|
  369. | 1 | Node 1 (port 11434) | General (locked) | {{ selection.slot1_general }} | {{ (parsed_metrics[selection.slot1_general].general_composite if selection.slot1_general in parsed_metrics else 'N/A') }} |
  370. | 2 | Node 1 (port 11434) | General (locked) | {{ selection.slot2_general }} | {{ (parsed_metrics[selection.slot2_general].general_composite if selection.slot2_general in parsed_metrics else 'N/A') }} |
  371. | 5 | Node 1 (port 11434) | General (rotate) | {{ selection.slot5_general_rotate }} | {{ (parsed_metrics[selection.slot5_general_rotate].general_composite if selection.slot5_general_rotate in parsed_metrics else 'N/A') }} |
  372. | 3 | Node 0 (port 11435) | Coding (locked) | {{ selection.slot3_coding }} | {{ (parsed_metrics[selection.slot3_coding].coding_composite if selection.slot3_coding in parsed_metrics else 'N/A') }} |
  373. | 4 | Node 0 (port 11435) | Coding (locked) | {{ selection.slot4_coding }} | {{ (parsed_metrics[selection.slot4_coding].coding_composite if selection.slot4_coding in parsed_metrics else 'N/A') }} |
  374. | 6 | Node 0 (port 11435) | Coding (rotate) | {{ selection.slot6_coding_rotate }} | {{ (parsed_metrics[selection.slot6_coding_rotate].coding_composite if selection.slot6_coding_rotate in parsed_metrics else 'N/A') }} |
  375. ## Detailed Metrics
  376. {% for model, metrics in parsed_metrics.items() %}
  377. ### {{ model }}
  378. - **Category**: {{ metrics.category }}
  379. - **Coding Quality**: {{ metrics.coding_quality }}
  380. - **General Quality**: {{ metrics.general_quality }}
  381. - **Avg Tokens/sec**: {{ metrics.avg_tok_per_sec }}
  382. - **Latency (ms)**: {{ metrics.latency_ms }}
  383. - **Coding Composite**: {{ metrics.coding_composite }}
  384. - **General Composite**: {{ metrics.general_composite }}
  385. {% endfor %}
  386. ## Scoring Formula
  387. - Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
  388. - Speed normalized against {{ benchmark_toks_norm_ceiling }} tok/sec ceiling (hardware-observed max)
  389. - Coding quality (per-prompt):
  390. code_gen: has_def×0.20 + has_return×0.20 + has_docstring×0.15 + has_type_hint×0.15 + has_code_block×0.10 + has_assert×0.08 + has_test_def×0.07 + has_import×0.05
  391. debug: has_def×0.30 + has_return×0.30 + has_code_block×0.25 + has_assert×0.15
  392. refactor: has_def×0.25 + has_return×0.25 + has_code_block×0.20 + has_type_hint×0.15 + has_import×0.15
  393. - Category: override dict → quality delta (coding_avg - general_avg >= {{ benchmark_coding_threshold }}) → name pattern (coder/codestral/codellama/starcoder) → general
  394. dest: "{{ benchmark_results_dir }}/benchmark_{{ benchmark_timestamp }}.md"
  395. mode: "0644"
  396. delegate_to: localhost
  397. tags:
  398. - benchmark-report
  399. - name: "Benchmark | Save model_selection.json"
  400. ansible.builtin.copy:
  401. content: "{{ selection | to_nice_json }}"
  402. dest: "{{ benchmark_results_dir }}/model_selection.json"
  403. mode: "0644"
  404. delegate_to: localhost
  405. tags:
  406. - benchmark-report
  407. - name: "Benchmark | Check minimum composite scores"
  408. ansible.builtin.debug:
  409. msg: >-
  410. WARNING: Best composite score for {{ item.key }} models is below threshold
  411. ({{ min_composite_score }}). Consider pulling additional models.
  412. Recommended candidates: qwen2.5-coder:14b, deepseek-coder-v2:16b, codellama:34b
  413. when: >-
  414. (item.value.coding_composite < min_composite_score | float) and
  415. (item.value.general_composite < min_composite_score | float)
  416. loop: "{{ parsed_metrics | dict2items }}"
  417. loop_control:
  418. label: "{{ item.key }}"
  419. tags:
  420. - benchmark-report
  421. - name: "Benchmark | Pull recommended model if pull_if_better is true"
  422. ansible.builtin.command: "ollama pull qwen2.5-coder:14b"
  423. when:
  424. - pull_if_better | bool
  425. - parsed_metrics.values() | map(attribute='coding_composite') | max < min_composite_score | float
  426. changed_when: true
  427. tags:
  428. - benchmark-pull
  429. - name: "Benchmark | Restart warmup services after benchmark"
  430. ansible.builtin.systemd:
  431. name: "{{ item }}"
  432. state: restarted
  433. loop:
  434. - ollama-warmup.service
  435. - ollama-warmup-node0.service
  436. failed_when: false
  437. become: true
  438. tags:
  439. - benchmark-cleanup