# 03_benchmark.yml (14 KB)
  1. ---
  2. # playbooks/03_benchmark.yml
  3. # Benchmark installed Ollama models and select optimal models for each slot
  4. - name: "Benchmark | Evaluate Ollama models"
  5. hosts: ai_server
  6. become: false
  7. gather_facts: true
  8. tags:
  9. - benchmark
  10. vars:
  11. benchmark_models: ""
  12. pull_if_better: false
  13. min_composite_score: 0.50
  14. ollama_api_url: "http://localhost:11434"
  15. benchmark_results_dir: "{{ playbook_dir }}/../benchmarks/results"
  16. test_prompts:
  17. code_gen:
  18. prompt: "Write a Python merge sort with type hints, docstring, and 3 unit tests"
  19. category: coding
  20. weight: 1.0
  21. debug:
  22. prompt: >-
  23. Here is a Python function with 3 bugs. Find and fix all bugs:
  24. def calculate_average(numbers):
  25. total = 0
  26. for n in numbers:
  27. total =+ n
  28. average = total / len(numbers
  29. return averege
  30. category: coding
  31. weight: 1.0
  32. refactor:
  33. prompt: >-
  34. Refactor this for readability and performance:
  35. def f(l):
  36. r=[]
  37. for i in range(len(l)):
  38. if l[i]%2==0:
  39. r.append(l[i]*2)
  40. return r
  41. category: coding
  42. weight: 1.0
  43. explain:
  44. prompt: "Explain how Python's GIL works and when it matters"
  45. category: general
  46. weight: 1.0
  47. creative:
  48. prompt: "Suggest 5 fun family activities for a rainy weekend"
  49. category: general
  50. weight: 1.0
  51. reasoning:
  52. prompt: "I have 3 apples. I give away half. Then I get 4 more. How many do I have?"
  53. category: general
  54. weight: 1.0
  55. latency:
  56. prompt: "Hi"
  57. category: latency
  58. weight: 0.5
  tasks:
    - name: "Benchmark | Ensure results directory exists on control node"
      ansible.builtin.file:
        path: "{{ benchmark_results_dir }}"
        state: directory
        mode: "0755"
      delegate_to: localhost
      tags:
        - benchmark-setup

    - name: "Benchmark | Wait for Ollama API to be ready"
      # Poll for up to 24 * 5s = 2 minutes.
      ansible.builtin.uri:
        # Use the configurable API URL for consistency with the generate task
        # (was hard-coded to http://localhost:11434).
        url: "{{ ollama_api_url }}/api/tags"
        method: GET
        status_code: 200
        timeout: 10
      register: ollama_ready
      retries: 24
      delay: 5
      until: ollama_ready.status == 200
      tags:
        - benchmark-discover

    - name: "Benchmark | Discover installed models"
      ansible.builtin.command: ollama list
      changed_when: false
      register: ollama_list_output
      retries: 6
      delay: 10
      until: ollama_list_output.rc == 0
      tags:
        - benchmark-discover

    - name: "Benchmark | Parse model names from ollama list"
      # Skip the header row; the first whitespace-separated token of each
      # remaining line is the model name (NAME column).
      ansible.builtin.set_fact:
        installed_models: "{{ ollama_list_output.stdout_lines[1:] | map('split') | map('first') | list }}"
      tags:
        - benchmark-discover

    - name: "Benchmark | Set models_to_benchmark to all installed models"
      ansible.builtin.set_fact:
        models_to_benchmark: "{{ installed_models }}"
      when: benchmark_models | default('') | length == 0
      tags:
        - benchmark-discover

    - name: "Benchmark | Set models_to_benchmark to specified subset"
      ansible.builtin.set_fact:
        models_to_benchmark: "{{ benchmark_models.split(',') | map('trim') | list }}"
      when: benchmark_models | default('') | length > 0
      tags:
        - benchmark-discover

    - name: "Benchmark | Display models to benchmark"
      ansible.builtin.debug:
        msg: "Will benchmark the following models: {{ models_to_benchmark }}"
      tags:
        - benchmark-discover
  111. - name: "Benchmark | Run test prompts against each model"
  112. ansible.builtin.uri:
  113. url: "{{ ollama_api_url }}/api/generate"
  114. method: POST
  115. body_format: json
  116. body:
  117. model: "{{ item.0 }}"
  118. prompt: "{{ test_prompts[item.1].prompt }}"
  119. stream: false
  120. timeout: 300
  121. status_code: 200
  122. loop: "{{ models_to_benchmark | product(test_prompts.keys() | list) | list }}"
  123. loop_control:
  124. label: "{{ item.0 }} / {{ item.1 }}"
  125. register: benchmark_raw_results
  126. failed_when: false
  127. tags:
  128. - benchmark-run
  129. - name: "Benchmark | Compute per-model metrics"
  130. ansible.builtin.set_fact:
  131. model_metrics: |
  132. {% set ns = namespace(results={}) %}
  133. {% for model in models_to_benchmark %}
  134. {% set ns2 = namespace(coding_quality=0, coding_count=0, general_quality=0, general_count=0, total_toks=0, total_eval_time=0, ttft_sum=0, ttft_count=0, latency_ns=0) %}
  135. {% for result in benchmark_raw_results.results %}
  136. {% if result.item[0] == model and result.status == 200 %}
  137. {% set test_name = result.item[1] %}
  138. {% set resp = result.json | default({}) %}
  139. {% set eval_count = resp.eval_count | default(0) | int %}
  140. {% set eval_duration = resp.eval_duration | default(1) | int %}
  141. {% set prompt_eval_duration = resp.prompt_eval_duration | default(0) | int %}
  142. {% set response_text = resp.response | default('') %}
  143. {% set tok_per_sec = (eval_count / (eval_duration / 1000000000.0)) if eval_duration > 0 else 0 %}
  144. {% set ns2.total_toks = ns2.total_toks + tok_per_sec %}
  145. {% set ns2.ttft_sum = ns2.ttft_sum + prompt_eval_duration %}
  146. {% set ns2.ttft_count = ns2.ttft_count + 1 %}
  147. {% if test_name == 'latency' %}
  148. {% set ns2.latency_ns = eval_duration + prompt_eval_duration %}
  149. {% endif %}
  150. {% set resp_len = response_text | length %}
  151. {% if test_name in ['code_gen', 'debug', 'refactor'] %}
  152. {% set has_def = 1 if 'def ' in response_text else 0 %}
  153. {% set has_return = 1 if 'return' in response_text else 0 %}
  154. {% set length_score = [resp_len / 1500.0, 1.0] | min %}
  155. {% set quality = (has_def * 0.3 + has_return * 0.3 + length_score * 0.4) %}
  156. {% set ns2.coding_quality = ns2.coding_quality + quality %}
  157. {% set ns2.coding_count = ns2.coding_count + 1 %}
  158. {% elif test_name in ['explain', 'creative', 'reasoning'] %}
  159. {% set length_score = [resp_len / 800.0, 1.0] | min %}
  160. {% set has_structure = 1 if ('\n' in response_text and resp_len > 100) else 0 %}
  161. {% set quality = (length_score * 0.6 + has_structure * 0.4) %}
  162. {% set ns2.general_quality = ns2.general_quality + quality %}
  163. {% set ns2.general_count = ns2.general_count + 1 %}
  164. {% endif %}
  165. {% endif %}
  166. {% endfor %}
  167. {% set coding_avg = (ns2.coding_quality / ns2.coding_count) if ns2.coding_count > 0 else 0 %}
  168. {% set general_avg = (ns2.general_quality / ns2.general_count) if ns2.general_count > 0 else 0 %}
  169. {% set test_count = (ns2.ttft_count) if ns2.ttft_count > 0 else 1 %}
  170. {% set avg_toks = ns2.total_toks / test_count %}
  171. {% set toks_norm = [avg_toks / 100.0, 1.0] | min %}
  172. {% set latency_ms = ns2.latency_ns / 1000000.0 if ns2.latency_ns > 0 else 9999 %}
  173. {% set latency_score = [1.0 - (latency_ms / 5000.0), 0] | max %}
  174. {% set coding_composite = coding_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
  175. {% set general_composite = general_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
  176. {% set category = 'coding' if (coding_composite - general_composite) >= 0.15 else 'general' %}
  177. {% set _ = ns.results.update({model: {'coding_quality': coding_avg | round(3), 'general_quality': general_avg | round(3), 'avg_tok_per_sec': avg_toks | round(1), 'toks_norm': toks_norm | round(3), 'latency_ms': latency_ms | round(1), 'latency_score': latency_score | round(3), 'coding_composite': coding_composite | round(3), 'general_composite': general_composite | round(3), 'category': category}}) %}
  178. {% endfor %}
  179. {{ ns.results | to_json }}
  180. tags:
  181. - benchmark-compute
  182. - name: "Benchmark | Parse model metrics"
  183. ansible.builtin.set_fact:
  184. parsed_metrics: "{{ model_metrics | from_json }}"
  185. tags:
  186. - benchmark-compute
  187. - name: "Benchmark | Rank models and select slots"
  188. ansible.builtin.set_fact:
  189. model_selection: |
  190. {% set general_models = [] %}
  191. {% set coding_models = [] %}
  192. {% for model, metrics in parsed_metrics.items() %}
  193. {% if metrics.category == 'general' %}
  194. {% set _ = general_models.append({'name': model, 'composite': metrics.general_composite, 'metrics': metrics}) %}
  195. {% else %}
  196. {% set _ = coding_models.append({'name': model, 'composite': metrics.coding_composite, 'metrics': metrics}) %}
  197. {% endif %}
  198. {% endfor %}
  199. {% set general_sorted = general_models | sort(attribute='composite', reverse=true) %}
  200. {% set coding_sorted = coding_models | sort(attribute='composite', reverse=true) %}
  201. {% set slot1 = general_sorted[0].name if general_sorted | length > 0 else 'none' %}
  202. {% set slot2 = general_sorted[1].name if general_sorted | length > 1 else (general_sorted[0].name if general_sorted | length > 0 else 'none') %}
  203. {% set slot3 = coding_sorted[0].name if coding_sorted | length > 0 else (general_sorted[0].name if general_sorted | length > 0 else 'none') %}
  204. {% set slot4 = coding_sorted[1].name if coding_sorted | length > 1 else (coding_sorted[0].name if coding_sorted | length > 0 else 'none') %}
  205. {{ {'slot1_general': slot1, 'slot2_general': slot2, 'slot3_coding': slot3, 'slot4_coding': slot4, 'all_metrics': parsed_metrics, 'general_ranking': general_sorted, 'coding_ranking': coding_sorted} | to_json }}
  206. tags:
  207. - benchmark-select
  208. - name: "Benchmark | Parse model selection"
  209. ansible.builtin.set_fact:
  210. selection: "{{ model_selection | from_json }}"
  211. tags:
  212. - benchmark-select
  213. - name: "Benchmark | Display model selection results"
  214. ansible.builtin.debug:
  215. msg:
  216. - "============================================="
  217. - " MODEL SELECTION RESULTS"
  218. - "============================================="
  219. - " Slot 1 (General Primary): {{ selection.slot1_general }}"
  220. - " Slot 2 (General Secondary): {{ selection.slot2_general }}"
  221. - " Slot 3 (Coding Primary): {{ selection.slot3_coding }}"
  222. - " Slot 4 (Coding Secondary): {{ selection.slot4_coding }}"
  223. - "============================================="
  224. tags:
  225. - benchmark-select
  226. - name: "Benchmark | Generate timestamp"
  227. ansible.builtin.set_fact:
  228. benchmark_timestamp: "{{ ansible_date_time.iso8601_basic_short }}"
  229. tags:
  230. - benchmark-report
  231. - name: "Benchmark | Save benchmark results markdown"
  232. ansible.builtin.copy:
  233. content: |
  234. # Benchmark Results - {{ benchmark_timestamp }}
  235. ## Model Selection
  236. | Slot | Role | Model | Composite Score |
  237. |------|------|-------|----------------|
  238. | 1 | General (Primary) | {{ selection.slot1_general }} | {{ parsed_metrics[selection.slot1_general].general_composite | default('N/A') }} |
  239. | 2 | General (Secondary) | {{ selection.slot2_general }} | {{ parsed_metrics[selection.slot2_general].general_composite | default('N/A') }} |
  240. | 3 | Coding (Primary) | {{ selection.slot3_coding }} | {{ parsed_metrics[selection.slot3_coding].coding_composite | default('N/A') }} |
  241. | 4 | Coding (Secondary) | {{ selection.slot4_coding }} | {{ parsed_metrics[selection.slot4_coding].coding_composite | default('N/A') }} |
  242. ## Detailed Metrics
  243. {% for model, metrics in parsed_metrics.items() %}
  244. ### {{ model }}
  245. - **Category**: {{ metrics.category }}
  246. - **Coding Quality**: {{ metrics.coding_quality }}
  247. - **General Quality**: {{ metrics.general_quality }}
  248. - **Avg Tokens/sec**: {{ metrics.avg_tok_per_sec }}
  249. - **Latency (ms)**: {{ metrics.latency_ms }}
  250. - **Coding Composite**: {{ metrics.coding_composite }}
  251. - **General Composite**: {{ metrics.general_composite }}
  252. {% endfor %}
  253. ## Scoring Formula
  254. - Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
  255. - Category: coding if (coding_composite - general_composite) >= 0.15, else general
  256. dest: "{{ benchmark_results_dir }}/benchmark_{{ benchmark_timestamp }}.md"
  257. mode: "0644"
  258. delegate_to: localhost
  259. tags:
  260. - benchmark-report
  261. - name: "Benchmark | Save model_selection.json"
  262. ansible.builtin.copy:
  263. content: "{{ selection | to_nice_json }}"
  264. dest: "{{ benchmark_results_dir }}/model_selection.json"
  265. mode: "0644"
  266. delegate_to: localhost
  267. tags:
  268. - benchmark-report
  269. - name: "Benchmark | Check minimum composite scores"
  270. ansible.builtin.debug:
  271. msg: >-
  272. WARNING: Best composite score for {{ item.key }} models is below threshold
  273. ({{ min_composite_score }}). Consider pulling additional models.
  274. Recommended candidates: qwen2.5-coder:14b, deepseek-coder-v2:16b, codellama:34b
  275. when: >-
  276. (item.value.coding_composite < min_composite_score) and
  277. (item.value.general_composite < min_composite_score)
  278. loop: "{{ parsed_metrics | dict2items }}"
  279. loop_control:
  280. label: "{{ item.key }}"
  281. tags:
  282. - benchmark-report
  283. - name: "Benchmark | Pull recommended model if pull_if_better is true"
  284. ansible.builtin.command: "ollama pull qwen2.5-coder:14b"
  285. when:
  286. - pull_if_better | bool
  287. - parsed_metrics.values() | map(attribute='coding_composite') | max < min_composite_score
  288. changed_when: true
  289. tags:
  290. - benchmark-pull