Browse source

Improve benchmark scoring and promote thresholds to group vars

- Move benchmark_toks_norm_ceiling and benchmark_coding_threshold into
  group_vars/all.yml so they can be tuned per environment without touching
  playbook code
- Fix min_composite_score to reference benchmark_thresholds instead of a
  hardcoded 0.50 default
- Add ollama_api_key Vault lookup and Authorization header to benchmark API
  calls (API key auth was silently bypassed before)
- Expand code quality scoring: add has_assert, has_test_def, has_docstring,
  has_type_hint signals alongside existing has_def/has_return
- Reference benchmark_coding_threshold variable in category classification
  and benchmark report output
- Cast min_composite_score with | float in comparisons, since the value is
  now a templated variable (a string) and Jinja2 would otherwise raise a
  string-vs-number type comparison error
Shaun Arman — 6 days ago
parent
commit
068427e60d
2 files changed, 20 insertions and 9 deletions
  1. inventory/group_vars/all.yml (+3 −0)
  2. playbooks/03_benchmark.yml (+17 −9)

+ 3 - 0
inventory/group_vars/all.yml

@@ -81,6 +81,9 @@ benchmark_thresholds:
   min_quality_score: 0.6
   min_composite_score: 0.55
 
+benchmark_toks_norm_ceiling: 30     # Realistic max tok/sec for CPU-only inference
+benchmark_coding_threshold: 0.10    # Delta to classify a model as coding-specialized
+
 # Candidate models to recommend/pull if benchmark scores are below threshold
 candidate_models:
   - name: "qwen2.5-coder:32b-instruct-q4_K_M"

+ 17 - 9
playbooks/03_benchmark.yml

@@ -11,8 +11,9 @@
   vars:
     benchmark_models: ""
     pull_if_better: false
-    min_composite_score: 0.50
+    min_composite_score: "{{ benchmark_thresholds.min_composite_score }}"
     ollama_api_url: "http://localhost:11434"
+    ollama_api_key: "{{ lookup('community.hashi_vault.hashi_vault', vault_secret_prefix ~ '/ollama:api_key token=' ~ lookup('ansible.builtin.file', vault_token_file) ~ ' url=' ~ vault_url) }}"
     benchmark_results_dir: "{{ playbook_dir }}/../benchmarks/results"
     test_prompts:
       code_gen:
@@ -128,6 +129,8 @@
           model: "{{ item.0 }}"
           prompt: "{{ test_prompts[item.1].prompt }}"
           stream: false
+        headers:
+          Authorization: "Bearer {{ ollama_api_key }}"
         timeout: 300
         status_code: 200
       loop: "{{ models_to_benchmark | product(test_prompts.keys() | list) | list }}"
@@ -163,8 +166,11 @@
           {%       if test_name in ['code_gen', 'debug', 'refactor'] %}
           {%         set has_def = 1 if 'def ' in response_text else 0 %}
           {%         set has_return = 1 if 'return' in response_text else 0 %}
-          {%         set length_score = [resp_len / 1500.0, 1.0] | min %}
-          {%         set quality = (has_def * 0.3 + has_return * 0.3 + length_score * 0.4) %}
+          {%         set has_assert = 1 if 'assert ' in response_text else 0 %}
+          {%         set has_test_def = 1 if 'def test_' in response_text else 0 %}
+          {%         set has_docstring = 1 if '"""' in response_text else 0 %}
+          {%         set has_type_hint = 1 if ' -> ' in response_text else 0 %}
+          {%         set quality = (has_def * 0.20 + has_return * 0.20 + has_assert * 0.15 + has_test_def * 0.15 + has_docstring * 0.15 + has_type_hint * 0.15) %}
           {%         set ns2.coding_quality = ns2.coding_quality + quality %}
           {%         set ns2.coding_count = ns2.coding_count + 1 %}
           {%       elif test_name in ['explain', 'creative', 'reasoning'] %}
@@ -180,12 +186,12 @@
           {%   set general_avg = (ns2.general_quality / ns2.general_count) if ns2.general_count > 0 else 0 %}
           {%   set test_count = (ns2.ttft_count) if ns2.ttft_count > 0 else 1 %}
           {%   set avg_toks = ns2.total_toks / test_count %}
-          {%   set toks_norm = [avg_toks / 100.0, 1.0] | min %}
+          {%   set toks_norm = [avg_toks / benchmark_toks_norm_ceiling, 1.0] | min %}
           {%   set latency_ms = ns2.latency_ns / 1000000.0 if ns2.latency_ns > 0 else 9999 %}
           {%   set latency_score = [1.0 - (latency_ms / 5000.0), 0] | max %}
           {%   set coding_composite = coding_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
           {%   set general_composite = general_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
-          {%   set category = 'coding' if (coding_composite - general_composite) >= 0.15 else 'general' %}
+          {%   set category = 'coding' if (coding_composite - general_composite) >= benchmark_coding_threshold else 'general' %}
           {%   set _ = ns.results.update({model: {'coding_quality': coding_avg | round(3), 'general_quality': general_avg | round(3), 'avg_tok_per_sec': avg_toks | round(1), 'toks_norm': toks_norm | round(3), 'latency_ms': latency_ms | round(1), 'latency_score': latency_score | round(3), 'coding_composite': coding_composite | round(3), 'general_composite': general_composite | round(3), 'category': category}}) %}
           {% endfor %}
           {{ ns.results | to_json }}
@@ -273,7 +279,9 @@
 
           ## Scoring Formula
           - Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
-          - Category: coding if (coding_composite - general_composite) >= 0.15, else general
+          - Speed normalized against {{ benchmark_toks_norm_ceiling }} tok/sec ceiling
+          - Coding quality: has_def×0.20 + has_return×0.20 + has_assert×0.15 + has_test_def×0.15 + has_docstring×0.15 + has_type_hint×0.15
+          - Category: coding if (coding_composite - general_composite) >= {{ benchmark_coding_threshold }}, else general
         dest: "{{ benchmark_results_dir }}/benchmark_{{ benchmark_timestamp }}.md"
         mode: "0644"
       delegate_to: localhost
@@ -296,8 +304,8 @@
           ({{ min_composite_score }}). Consider pulling additional models.
           Recommended candidates: qwen2.5-coder:14b, deepseek-coder-v2:16b, codellama:34b
       when: >-
-        (item.value.coding_composite < min_composite_score) and
-        (item.value.general_composite < min_composite_score)
+        (item.value.coding_composite < min_composite_score | float) and
+        (item.value.general_composite < min_composite_score | float)
       loop: "{{ parsed_metrics | dict2items }}"
       loop_control:
         label: "{{ item.key }}"
@@ -308,7 +316,7 @@
       ansible.builtin.command: "ollama pull qwen2.5-coder:14b"
       when:
         - pull_if_better | bool
-        - parsed_metrics.values() | map(attribute='coding_composite') | max < min_composite_score
+        - parsed_metrics.values() | map(attribute='coding_composite') | max < min_composite_score | float
       changed_when: true
       tags:
         - benchmark-pull