|
|
@@ -11,8 +11,9 @@
|
|
|
vars:
|
|
|
benchmark_models: ""
|
|
|
pull_if_better: false
|
|
|
- min_composite_score: 0.50
|
|
|
+ min_composite_score: "{{ benchmark_thresholds.min_composite_score }}"
|
|
|
ollama_api_url: "http://localhost:11434"
|
|
|
+ ollama_api_key: "{{ lookup('community.hashi_vault.hashi_vault', vault_secret_prefix ~ '/ollama:api_key token=' ~ lookup('ansible.builtin.file', vault_token_file) ~ ' url=' ~ vault_url) }}"
|
|
|
benchmark_results_dir: "{{ playbook_dir }}/../benchmarks/results"
|
|
|
test_prompts:
|
|
|
code_gen:
|
|
|
@@ -128,6 +129,8 @@
|
|
|
model: "{{ item.0 }}"
|
|
|
prompt: "{{ test_prompts[item.1].prompt }}"
|
|
|
stream: false
|
|
|
+ headers:
|
|
|
+ Authorization: "Bearer {{ ollama_api_key }}"
|
|
|
timeout: 300
|
|
|
status_code: 200
|
|
|
loop: "{{ models_to_benchmark | product(test_prompts.keys() | list) | list }}"
|
|
|
@@ -163,8 +166,11 @@
|
|
|
{% if test_name in ['code_gen', 'debug', 'refactor'] %}
|
|
|
{% set has_def = 1 if 'def ' in response_text else 0 %}
|
|
|
{% set has_return = 1 if 'return' in response_text else 0 %}
|
|
|
- {% set length_score = [resp_len / 1500.0, 1.0] | min %}
|
|
|
- {% set quality = (has_def * 0.3 + has_return * 0.3 + length_score * 0.4) %}
|
|
|
+ {% set has_assert = 1 if 'assert ' in response_text else 0 %}
|
|
|
+ {% set has_test_def = 1 if 'def test_' in response_text else 0 %}
|
|
|
+ {% set has_docstring = 1 if '"""' in response_text else 0 %}
|
|
|
+ {% set has_type_hint = 1 if ' -> ' in response_text else 0 %}
|
|
|
+ {% set quality = (has_def * 0.20 + has_return * 0.20 + has_assert * 0.15 + has_test_def * 0.15 + has_docstring * 0.15 + has_type_hint * 0.15) %}
|
|
|
{% set ns2.coding_quality = ns2.coding_quality + quality %}
|
|
|
{% set ns2.coding_count = ns2.coding_count + 1 %}
|
|
|
{% elif test_name in ['explain', 'creative', 'reasoning'] %}
|
|
|
@@ -180,12 +186,12 @@
|
|
|
{% set general_avg = (ns2.general_quality / ns2.general_count) if ns2.general_count > 0 else 0 %}
|
|
|
{% set test_count = (ns2.ttft_count) if ns2.ttft_count > 0 else 1 %}
|
|
|
{% set avg_toks = ns2.total_toks / test_count %}
|
|
|
- {% set toks_norm = [avg_toks / 100.0, 1.0] | min %}
|
|
|
+ {% set toks_norm = [avg_toks / (benchmark_toks_norm_ceiling | float), 1.0] | min %}
|
|
|
{% set latency_ms = ns2.latency_ns / 1000000.0 if ns2.latency_ns > 0 else 9999 %}
|
|
|
{% set latency_score = [1.0 - (latency_ms / 5000.0), 0] | max %}
|
|
|
{% set coding_composite = coding_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
|
|
|
{% set general_composite = general_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
|
|
|
- {% set category = 'coding' if (coding_composite - general_composite) >= 0.15 else 'general' %}
|
|
|
+ {% set category = 'coding' if (coding_composite - general_composite) >= (benchmark_coding_threshold | float) else 'general' %}
|
|
|
{% set _ = ns.results.update({model: {'coding_quality': coding_avg | round(3), 'general_quality': general_avg | round(3), 'avg_tok_per_sec': avg_toks | round(1), 'toks_norm': toks_norm | round(3), 'latency_ms': latency_ms | round(1), 'latency_score': latency_score | round(3), 'coding_composite': coding_composite | round(3), 'general_composite': general_composite | round(3), 'category': category}}) %}
|
|
|
{% endfor %}
|
|
|
{{ ns.results | to_json }}
|
|
|
@@ -273,7 +279,9 @@
|
|
|
|
|
|
## Scoring Formula
|
|
|
- Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
|
|
|
- - Category: coding if (coding_composite - general_composite) >= 0.15, else general
|
|
|
+ - Speed normalized against {{ benchmark_toks_norm_ceiling }} tok/sec ceiling
|
|
|
+ - Coding quality: has_def×0.20 + has_return×0.20 + has_assert×0.15 + has_test_def×0.15 + has_docstring×0.15 + has_type_hint×0.15
|
|
|
+ - Category: coding if (coding_composite - general_composite) >= {{ benchmark_coding_threshold }}, else general
|
|
|
dest: "{{ benchmark_results_dir }}/benchmark_{{ benchmark_timestamp }}.md"
|
|
|
mode: "0644"
|
|
|
delegate_to: localhost
|
|
|
@@ -296,8 +304,8 @@
|
|
|
({{ min_composite_score }}). Consider pulling additional models.
|
|
|
Recommended candidates: qwen2.5-coder:14b, deepseek-coder-v2:16b, codellama:34b
|
|
|
when: >-
|
|
|
- (item.value.coding_composite < min_composite_score) and
|
|
|
- (item.value.general_composite < min_composite_score)
|
|
|
+ (item.value.coding_composite < min_composite_score | float) and
|
|
|
+ (item.value.general_composite < min_composite_score | float)
|
|
|
loop: "{{ parsed_metrics | dict2items }}"
|
|
|
loop_control:
|
|
|
label: "{{ item.key }}"
|
|
|
@@ -308,7 +316,7 @@
|
|
|
ansible.builtin.command: "ollama pull qwen2.5-coder:14b"
|
|
|
when:
|
|
|
- pull_if_better | bool
|
|
|
- - parsed_metrics.values() | map(attribute='coding_composite') | max < min_composite_score
|
|
|
+ - parsed_metrics.values() | map(attribute='coding_composite') | max < min_composite_score | float
|
|
|
changed_when: true
|
|
|
tags:
|
|
|
- benchmark-pull
|