|
|
@@ -103,10 +103,11 @@ benchmark_skip_aliases:
|
|
|
benchmark_small_max_gb: 10 # upper size boundary for small pass (< 10 GB), based on runtime RAM
|
|
|
benchmark_medium_max_gb: 15 # upper size boundary for medium pass (10–15 GB), based on runtime RAM
|
|
|
benchmark_size_overhead_factor: 1.2 # ollama list shows disk size; multiply by this to estimate runtime RAM
|
|
|
-benchmark_load_timeout: 300 # seconds — warm-up "Hi" prompt per model before benchmarking
|
|
|
-benchmark_small_timeout: 300 # seconds per request, small models
|
|
|
-benchmark_medium_timeout: 900 # seconds per request, medium models (15 min)
|
|
|
-benchmark_large_timeout: 1200 # seconds per request, large models (20 min)
|
|
|
+benchmark_load_timeout: 180 # seconds — warm-up "Hi" prompt per model before benchmarking
|
|
|
+benchmark_small_timeout: 90 # seconds per request, small models (<10 GB)
|
|
|
+benchmark_medium_timeout: 240 # seconds per request, medium models (10–15 GB)
|
|
|
+benchmark_large_timeout: 480 # seconds per request, large models (>15 GB)
|
|
|
+benchmark_num_predict: 300 # cap output tokens per prompt; worst case per prompt is ~3000s at 0.1 tok/sec, ~30s at 10 tok/sec
|
|
|
|
|
|
# Explicit category overrides applied before heuristics. Keys are model names as
|
|
|
# returned by `ollama list`. Valid values: 'coding' or 'general'.
|