Browse source

Improve benchmark scoring and promote thresholds to group vars

- Move benchmark_toks_norm_ceiling and benchmark_coding_threshold into
  group_vars/all.yml so they can be tuned per environment without touching
  playbook code
- Fix min_composite_score to reference benchmark_thresholds instead of a
  hardcoded 0.50 default
- Add ollama_api_key Vault lookup and Authorization header to benchmark API
  calls (API key auth was silently bypassed before)
- Expand code quality scoring: add has_assert, has_test_def, has_docstring,
  has_type_hint signals alongside existing has_def/has_return
- Reference benchmark_coding_threshold variable in category classification
  and benchmark report output
- Cast min_composite_score with | float in comparisons, since the value is
  now a templated variable (a string) and Jinja2 would otherwise raise a
  string-vs-number type comparison error
Shaun Arman — 6 days ago
parent
commit
068427e60d
2 files changed, 20 insertions and 9 deletions
  1. inventory/group_vars/all.yml (+3 −0)
  2. playbooks/03_benchmark.yml (+17 −9)

+ 3 - 0
inventory/group_vars/all.yml

@@ -81,6 +81,9 @@ benchmark_thresholds:
   min_quality_score: 0.6
   min_composite_score: 0.55
 
+benchmark_toks_norm_ceiling: 30     # Realistic max tok/sec for CPU-only inference
+benchmark_coding_threshold: 0.10    # Delta to classify a model as coding-specialized
+
 # Candidate models to recommend/pull if benchmark scores are below threshold
 candidate_models:
   - name: "qwen2.5-coder:32b-instruct-q4_K_M"

+ 17 - 9
playbooks/03_benchmark.yml

@@ -11,8 +11,9 @@
   vars:
     benchmark_models: ""
     pull_if_better: false
-    min_composite_score: 0.50
+    min_composite_score: "{{ benchmark_thresholds.min_composite_score }}"
     ollama_api_url: "http://localhost:11434"
+    ollama_api_key: "{{ lookup('community.hashi_vault.hashi_vault', vault_secret_prefix ~ '/ollama:api_key token=' ~ lookup('ansible.builtin.file', vault_token_file) ~ ' url=' ~ vault_url) }}"
     benchmark_results_dir: "{{ playbook_dir }}/../benchmarks/results"
     test_prompts:
       code_gen:
@@ -128,6 +129,8 @@
           model: "{{ item.0 }}"
           prompt: "{{ test_prompts[item.1].prompt }}"
           stream: false
+        headers:
+          Authorization: "Bearer {{ ollama_api_key }}"
         timeout: 300
         status_code: 200
       loop: "{{ models_to_benchmark | product(test_prompts.keys() | list) | list }}"
@@ -163,8 +166,11 @@
           {%       if test_name in ['code_gen', 'debug', 'refactor'] %}
           {%         set has_def = 1 if 'def ' in response_text else 0 %}
           {%         set has_return = 1 if 'return' in response_text else 0 %}
-          {%         set length_score = [resp_len / 1500.0, 1.0] | min %}
-          {%         set quality = (has_def * 0.3 + has_return * 0.3 + length_score * 0.4) %}
+          {%         set has_assert = 1 if 'assert ' in response_text else 0 %}
+          {%         set has_test_def = 1 if 'def test_' in response_text else 0 %}
+          {%         set has_docstring = 1 if '"""' in response_text else 0 %}
+          {%         set has_type_hint = 1 if ' -> ' in response_text else 0 %}
+          {%         set quality = (has_def * 0.20 + has_return * 0.20 + has_assert * 0.15 + has_test_def * 0.15 + has_docstring * 0.15 + has_type_hint * 0.15) %}
           {%         set ns2.coding_quality = ns2.coding_quality + quality %}
           {%         set ns2.coding_count = ns2.coding_count + 1 %}
           {%       elif test_name in ['explain', 'creative', 'reasoning'] %}
@@ -180,12 +186,12 @@
           {%   set general_avg = (ns2.general_quality / ns2.general_count) if ns2.general_count > 0 else 0 %}
           {%   set test_count = (ns2.ttft_count) if ns2.ttft_count > 0 else 1 %}
           {%   set avg_toks = ns2.total_toks / test_count %}
-          {%   set toks_norm = [avg_toks / 100.0, 1.0] | min %}
+          {%   set toks_norm = [avg_toks / benchmark_toks_norm_ceiling, 1.0] | min %}
           {%   set latency_ms = ns2.latency_ns / 1000000.0 if ns2.latency_ns > 0 else 9999 %}
           {%   set latency_score = [1.0 - (latency_ms / 5000.0), 0] | max %}
           {%   set coding_composite = coding_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
           {%   set general_composite = general_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
-          {%   set category = 'coding' if (coding_composite - general_composite) >= 0.15 else 'general' %}
+          {%   set category = 'coding' if (coding_composite - general_composite) >= benchmark_coding_threshold else 'general' %}
           {%   set _ = ns.results.update({model: {'coding_quality': coding_avg | round(3), 'general_quality': general_avg | round(3), 'avg_tok_per_sec': avg_toks | round(1), 'toks_norm': toks_norm | round(3), 'latency_ms': latency_ms | round(1), 'latency_score': latency_score | round(3), 'coding_composite': coding_composite | round(3), 'general_composite': general_composite | round(3), 'category': category}}) %}
           {% endfor %}
           {{ ns.results | to_json }}
@@ -273,7 +279,9 @@
 
           ## Scoring Formula
           - Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
-          - Category: coding if (coding_composite - general_composite) >= 0.15, else general
+          - Speed normalized against {{ benchmark_toks_norm_ceiling }} tok/sec ceiling
+          - Coding quality: has_def×0.20 + has_return×0.20 + has_assert×0.15 + has_test_def×0.15 + has_docstring×0.15 + has_type_hint×0.15
+          - Category: coding if (coding_composite - general_composite) >= {{ benchmark_coding_threshold }}, else general
         dest: "{{ benchmark_results_dir }}/benchmark_{{ benchmark_timestamp }}.md"
         mode: "0644"
       delegate_to: localhost
@@ -296,8 +304,8 @@
           ({{ min_composite_score }}). Consider pulling additional models.
           Recommended candidates: qwen2.5-coder:14b, deepseek-coder-v2:16b, codellama:34b
       when: >-
-        (item.value.coding_composite < min_composite_score) and
-        (item.value.general_composite < min_composite_score)
+        (item.value.coding_composite < min_composite_score | float) and
+        (item.value.general_composite < min_composite_score | float)
       loop: "{{ parsed_metrics | dict2items }}"
       loop_control:
         label: "{{ item.key }}"
@@ -308,7 +316,7 @@
       ansible.builtin.command: "ollama pull qwen2.5-coder:14b"
       when:
         - pull_if_better | bool
-        - parsed_metrics.values() | map(attribute='coding_composite') | max < min_composite_score
+        - parsed_metrics.values() | map(attribute='coding_composite') | max < min_composite_score | float
       changed_when: true
       tags:
         - benchmark-pull