
fix(benchmark): restore timeouts, add num_predict cap and deterministic options

Addresses quality=0 regression in the 20260309T174604 benchmark run:

- Restore benchmark_large_timeout to 480s (was inflated to 1200s as a
  workaround while hardware was degraded; hardware now restored to 26 tok/sec)
- Add benchmark_num_predict: 300 to cap output tokens, bounding worst-case
  completion time and preventing runaway generation on slow hardware
- Add options block to both fire-prompt tasks (node1 and node0) with
  num_predict, temperature: 0, and seed: 42 for deterministic, bounded results
- Fix latency calculation: subtract load_duration from total_duration so cold
  model loads do not inflate the TTFT measurement
- Restore benchmark timeout variables to pre-regression values (load: 180s,
  small: 90s, medium: 240s)
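
The worst-case bound that the `num_predict` cap provides can be sketched with simple arithmetic (the token rates below are illustrative, not measured values from this run):

```python
# Sketch of the worst-case generation-time bound implied by the
# benchmark_num_predict cap. Rates are assumed examples.
NUM_PREDICT = 300  # token cap from benchmark_num_predict


def worst_case_seconds(tokens_per_sec: float) -> float:
    """Upper bound on generation time for a response capped at NUM_PREDICT tokens."""
    return NUM_PREDICT / tokens_per_sec


print(worst_case_seconds(26))   # restored hardware: ~11.5s per prompt
print(worst_case_seconds(0.1))  # pathological slow case: 3000s, but bounded
```

Without the cap, a runaway generation on degraded hardware has no upper bound at all, which is what produced the quality=0 timeouts.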

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
Shaun Arman committed 3 days ago
commit 02efb954d9

+ 5 - 4
inventory/group_vars/all.yml

@@ -103,10 +103,11 @@ benchmark_skip_aliases:
 benchmark_small_max_gb: 10    # upper size boundary for small pass (< 10 GB), based on runtime RAM
 benchmark_medium_max_gb: 15   # upper size boundary for medium pass (10–15 GB), based on runtime RAM
 benchmark_size_overhead_factor: 1.2  # ollama list shows disk size; multiply by this to estimate runtime RAM
-benchmark_load_timeout: 300      # seconds — warm-up "Hi" prompt per model before benchmarking
-benchmark_small_timeout: 300     # seconds per request, small models
-benchmark_medium_timeout: 900    # seconds per request, medium models (15 min)
-benchmark_large_timeout: 1200    # seconds per request, large models (20 min)
+benchmark_load_timeout: 180      # seconds — warm-up "Hi" prompt per model before benchmarking
+benchmark_small_timeout: 90      # seconds per request, small models (<10 GB)
+benchmark_medium_timeout: 240    # seconds per request, medium models (10–15 GB)
+benchmark_large_timeout: 480     # seconds per request, large models (>15 GB)
+benchmark_num_predict: 300       # cap output tokens per prompt; bounds worst-case at 0.1 tok/sec to ~3000s, at 10 tok/sec to ~30s
 
 # Explicit category overrides applied before heuristics. Keys are model names as
 # returned by `ollama list`. Valid values: 'coding' or 'general'.

+ 1 - 1
playbooks/03_benchmark.yml

@@ -265,7 +265,7 @@
           {%       set ns2.ttft_sum = ns2.ttft_sum + prompt_eval_duration %}
           {%       set ns2.ttft_count = ns2.ttft_count + 1 %}
           {%       if test_name == 'latency' %}
-          {%         set ns2.latency_ns = resp.total_duration | default(0) | int %}
+          {%         set ns2.latency_ns = ((resp.total_duration | default(0) | int) - (resp.load_duration | default(0) | int)) | abs %}
           {%       endif %}
           {%       set resp_len = response_text | length %}
           {%       if test_name in ['code_gen', 'debug', 'refactor'] %}
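
The corrected latency expression can be mirrored in plain Python for clarity. Ollama reports `total_duration` and `load_duration` in nanoseconds; on a cold run the model load time is included in `total_duration`, so it is subtracted out (field names below come from the Ollama response, the example values are hypothetical):

```python
# Minimal sketch of the corrected latency calculation from the Jinja
# expression above: subtract cold-load time from total request time.
def latency_ns(resp: dict) -> int:
    """Latency in nanoseconds, excluding model load time."""
    total = int(resp.get("total_duration", 0))
    load = int(resp.get("load_duration", 0))
    return abs(total - load)


# Hypothetical cold run: 6s total, of which 4s was loading the model.
resp = {"total_duration": 6_000_000_000, "load_duration": 4_000_000_000}
print(latency_ns(resp))  # 2000000000 ns, i.e. 2s of actual request time
```

The `abs` guard matches the Jinja `| abs` filter and keeps the metric non-negative if the two fields ever disagree.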

+ 8 - 0
playbooks/_bench_tier_batch.yml

@@ -111,6 +111,10 @@
       model: "{{ item.0 }}"
       prompt: "{{ test_prompts[item.1].prompt }}"
       stream: false
+      options:
+        num_predict: "{{ benchmark_num_predict | int }}"
+        temperature: 0
+        seed: 42
     headers:
       Authorization: "Bearer {{ ollama_api_key }}"
     timeout: "{{ (benchmark_large_timeout | int) }}"
@@ -130,6 +134,10 @@
       model: "{{ item.0 }}"
       prompt: "{{ test_prompts[item.1].prompt }}"
       stream: false
+      options:
+        num_predict: "{{ benchmark_num_predict | int }}"
+        temperature: 0
+        seed: 42
     headers:
       Authorization: "Bearer {{ ollama_api_key }}"
     timeout: "{{ (benchmark_large_timeout | int) }}"
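
The tasks above post to Ollama's `/api/generate` endpoint; the JSON body they produce looks roughly like this (model name and prompt are placeholders, not values from the playbook):

```python
# Hedged sketch of the request body sent by the benchmark tasks after this
# change. Model and prompt are placeholders; the options block is the point.
import json

body = {
    "model": "llama3:8b",        # placeholder, comes from item.0
    "prompt": "Write a haiku.",  # placeholder, comes from test_prompts
    "stream": False,
    "options": {
        "num_predict": 300,  # benchmark_num_predict: caps output tokens
        "temperature": 0,    # greedy decoding for deterministic output
        "seed": 42,          # fixed seed for reproducible runs
    },
}
print(json.dumps(body, indent=2))
```

Pinning `temperature: 0` and `seed: 42` makes repeated benchmark passes comparable, since the same prompt yields the same completion length and content.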