---
# playbooks/03_benchmark.yml
# Benchmark installed Ollama models and select optimal models for each slot.
#
# Pipeline: wait for the Ollama API, discover installed models, partition
# them by on-disk size into small/medium/large passes (each pass has its own
# request timeout), run the test-prompt suite against every model, compute
# per-model quality/speed/latency metrics, rank models into the six serving
# slots (3 general on node 1, 3 coding on node 0), and write markdown/JSON
# reports back to the control node.
#
# External vars expected from inventory/group_vars (not defined here):
#   benchmark_thresholds, benchmark_skip_aliases, benchmark_small_max_gb,
#   benchmark_medium_max_gb, benchmark_size_overhead_factor,
#   benchmark_small_timeout, benchmark_medium_timeout, benchmark_large_timeout,
#   benchmark_toks_norm_ceiling, benchmark_coding_threshold,
#   model_category_overrides (optional), vault_secret_prefix, vault_token_file,
#   vault_url.
- name: "Benchmark | Evaluate Ollama models"
  hosts: ai_server
  become: false
  gather_facts: true  # ansible_date_time is needed for the report timestamp
  tags:
    - benchmark
  vars:
    # Optional comma-separated subset of models; empty string = all installed.
    benchmark_models: ""
    # When true, pull a recommended coding model if no benchmarked model
    # clears min_composite_score.
    pull_if_better: false
    min_composite_score: "{{ benchmark_thresholds.min_composite_score }}"
    ollama_api_url: "http://localhost:11434"
    ollama_api_key: "{{ lookup('community.hashi_vault.hashi_vault', vault_secret_prefix ~ '/ollama:api_key token=' ~ lookup('ansible.builtin.file', vault_token_file) ~ ' url=' ~ vault_url) }}"
    benchmark_results_dir: "{{ playbook_dir }}/../benchmarks/results"
    # Prompt suite. NOTE: the buggy / ugly Python snippets below are the
    # prompts themselves (the model is asked to fix them) — do not "correct"
    # them.
    test_prompts:
      code_gen:
        prompt: "Write a Python merge sort with type hints, docstring, and 3 unit tests"
        category: coding
        weight: 1.0
      debug:
        prompt: >-
          Here is a Python function with 3 bugs. Find and fix all bugs:
            def calculate_average(numbers):
                total = 0
                for n in numbers:
                    total =+ n
                average = total / len(numbers
                return averege
        category: coding
        weight: 1.0
      refactor:
        prompt: >-
          Refactor this for readability and performance:
            def f(l):
                r=[]
                for i in range(len(l)):
                    if l[i]%2==0:
                        r.append(l[i]*2)
                return r
        category: coding
        weight: 1.0
      explain:
        prompt: "Explain how Python's GIL works and when it matters"
        category: general
        weight: 1.0
      creative:
        prompt: "Suggest 5 fun family activities for a rainy weekend"
        category: general
        weight: 1.0
      reasoning:
        prompt: "I have 3 apples. I give away half. Then I get 4 more. How many do I have?"
        category: general
        weight: 1.0
      latency:
        prompt: "Hi"
        category: latency
        weight: 0.5

  tasks:
    - name: "Benchmark | Ensure results directory exists on control node"
      ansible.builtin.file:
        path: "{{ benchmark_results_dir }}"
        state: directory
        mode: "0755"
      delegate_to: localhost
      tags:
        - benchmark-setup

    - name: "Benchmark | Wait for Ollama API to be ready"
      ansible.builtin.uri:
        # Use the configurable endpoint (was hard-coded to localhost) and the
        # same auth header as every other API call in this play.
        url: "{{ ollama_api_url }}/api/tags"
        method: GET
        headers:
          Authorization: "Bearer {{ ollama_api_key }}"
        status_code: 200
        timeout: 10
      register: ollama_ready
      retries: 24
      delay: 5
      # default(0) keeps the retry loop alive if an attempt fails without
      # recording a status field.
      until: ollama_ready.status | default(0) == 200
      tags:
        - benchmark-discover

    - name: "Benchmark | Discover installed models"
      ansible.builtin.command: ollama list
      changed_when: false
      register: ollama_list_output
      retries: 6
      delay: 10
      until: ollama_list_output.rc == 0
      tags:
        - benchmark-discover

    # `ollama list` output: header row, then "NAME  ID  SIZE  UNIT  MODIFIED"
    # columns; the first whitespace-separated field is the model name.
    - name: "Benchmark | Parse model names from ollama list"
      ansible.builtin.set_fact:
        installed_models: "{{ ollama_list_output.stdout_lines[1:] | map('split') | map('first') | list }}"
      tags:
        - benchmark-discover

    # Build {model_name: size_in_GB} as JSON (set_fact templates render to
    # strings, so the dict is serialized here and from_json'd where used).
    - name: "Benchmark | Parse model sizes from ollama list"
      ansible.builtin.set_fact:
        _benchmark_sizes_json: |
          {% set ns = namespace(d={}) %}
          {% for line in ollama_list_output.stdout_lines[1:] %}
          {% set p = line.split() %}
          {% if p | length >= 4 %}
          {% set gb = (p[2] | float) if (p[3] | upper == 'GB') else ((p[2] | float) / 1024) %}
          {% set _ = ns.d.update({p[0]: gb}) %}
          {% endif %}
          {% endfor %}
          {{ ns.d | to_json }}
      tags:
        - benchmark-discover

    # Size cutoffs are divided by the overhead factor so a model that would
    # exceed the pass budget once loaded lands in the next-larger pass.
    - name: "Benchmark | Partition models into small, medium, and large passes"
      ansible.builtin.set_fact:
        _small_models: "{{ _alias_filtered | select('in', _small_ok) | list }}"
        _medium_models: "{{ _alias_filtered | select('in', _medium_ok) | list }}"
        _large_models: "{{ _alias_filtered | reject('in', _small_ok) | reject('in', _medium_ok) | list }}"
        models_to_benchmark: "{{ _alias_filtered | list }}"
      vars:
        _sizes: "{{ _benchmark_sizes_json | from_json }}"
        _small_cut: "{{ (benchmark_small_max_gb | float) / (benchmark_size_overhead_factor | float) }}"
        _medium_cut: "{{ (benchmark_medium_max_gb | float) / (benchmark_size_overhead_factor | float) }}"
        _small_ok: "{{ _sizes | dict2items | selectattr('value', 'le', _small_cut | float) | map(attribute='key') | list }}"
        _medium_ok: "{{ _sizes | dict2items | selectattr('value', 'gt', _small_cut | float)
          | selectattr('value', 'le', _medium_cut | float)
          | map(attribute='key') | list }}"
        _alias_filtered: "{{ installed_models | reject('match', '^(' ~ benchmark_skip_aliases | join('|') ~ ')(:|$)') | list }}"
      when: benchmark_models | default('') | length == 0
      tags:
        - benchmark-discover

    - name: "Benchmark | Set models_to_benchmark to specified subset"
      ansible.builtin.set_fact:
        models_to_benchmark: "{{ _specified }}"
        _small_models: "{{ _specified | select('in', _small_ok) | list }}"
        _medium_models: "{{ _specified | select('in', _medium_ok) | list }}"
        _large_models: "{{ _specified | reject('in', _small_ok) | reject('in', _medium_ok) | list }}"
      vars:
        _specified: "{{ benchmark_models.split(',') | map('trim') | list }}"
        _sizes: "{{ _benchmark_sizes_json | from_json }}"
        _small_cut: "{{ (benchmark_small_max_gb | float) / (benchmark_size_overhead_factor | float) }}"
        _medium_cut: "{{ (benchmark_medium_max_gb | float) / (benchmark_size_overhead_factor | float) }}"
        _small_ok: "{{ _sizes | dict2items | selectattr('value', 'le', _small_cut | float) | map(attribute='key') | list }}"
        _medium_ok: "{{ _sizes | dict2items | selectattr('value', 'gt', _small_cut | float)
          | selectattr('value', 'le', _medium_cut | float)
          | map(attribute='key') | list }}"
      when: benchmark_models | default('') | length > 0
      tags:
        - benchmark-discover

    - name: "Benchmark | Display models to benchmark"
      ansible.builtin.debug:
        msg:
          - "Small pass (timeout {{ benchmark_small_timeout }}s, ≤{{ benchmark_small_max_gb }}GB): {{ _small_models }}"
          - "Medium pass (timeout {{ benchmark_medium_timeout }}s, {{ benchmark_small_max_gb }}–{{ benchmark_medium_max_gb }}GB): {{ _medium_models }}"
          - "Large pass (timeout {{ benchmark_large_timeout }}s, >{{ benchmark_medium_max_gb }}GB): {{ _large_models }}"
      tags:
        - benchmark-discover

    # The three run passes are identical except for the model list and the
    # per-pass timeout. failed_when: false lets slow/broken models record a
    # non-200 result instead of aborting the play; scoring skips them.
    - name: "Benchmark | Run test prompts against small models"
      ansible.builtin.uri:
        url: "{{ ollama_api_url }}/api/generate"
        method: POST
        body_format: json
        body:
          model: "{{ item.0 }}"
          prompt: "{{ test_prompts[item.1].prompt }}"
          stream: false
        headers:
          Authorization: "Bearer {{ ollama_api_key }}"
        timeout: "{{ benchmark_small_timeout }}"
        status_code: 200
      loop: "{{ _small_models | product(test_prompts.keys() | list) | list }}"
      loop_control:
        label: "{{ item.0 }} / {{ item.1 }}"
      register: _bench_small
      failed_when: false
      # Guard added for consistency with the medium/large passes.
      when: _small_models | length > 0
      tags:
        - benchmark-run

    - name: "Benchmark | Run test prompts against medium models"
      ansible.builtin.uri:
        url: "{{ ollama_api_url }}/api/generate"
        method: POST
        body_format: json
        body:
          model: "{{ item.0 }}"
          prompt: "{{ test_prompts[item.1].prompt }}"
          stream: false
        headers:
          Authorization: "Bearer {{ ollama_api_key }}"
        timeout: "{{ benchmark_medium_timeout }}"
        status_code: 200
      loop: "{{ _medium_models | product(test_prompts.keys() | list) | list }}"
      loop_control:
        label: "{{ item.0 }} / {{ item.1 }}"
      register: _bench_medium
      failed_when: false
      when: _medium_models | length > 0
      tags:
        - benchmark-run

    - name: "Benchmark | Run test prompts against large models"
      ansible.builtin.uri:
        url: "{{ ollama_api_url }}/api/generate"
        method: POST
        body_format: json
        body:
          model: "{{ item.0 }}"
          prompt: "{{ test_prompts[item.1].prompt }}"
          stream: false
        headers:
          Authorization: "Bearer {{ ollama_api_key }}"
        timeout: "{{ benchmark_large_timeout }}"
        status_code: 200
      loop: "{{ _large_models | product(test_prompts.keys() | list) | list }}"
      loop_control:
        label: "{{ item.0 }} / {{ item.1 }}"
      register: _bench_large
      failed_when: false
      when: _large_models | length > 0
      tags:
        - benchmark-run

    # default([]) also covers passes skipped by their `when` guard.
    - name: "Benchmark | Merge small, medium, and large model results"
      ansible.builtin.set_fact:
        benchmark_raw_results:
          results: >-
            {{ (_bench_small.results | default([]))
            + (_bench_medium.results | default([]))
            + (_bench_large.results | default([])) }}
      tags:
        - benchmark-run

    # Heuristic scoring: structural markers (def/return/docstring/...) proxy
    # coding quality; length/structure proxy general quality. Composite =
    # quality*0.45 + normalized tok/s*0.30 + latency score*0.25. Rendered as
    # JSON because set_fact templates always yield strings.
    - name: "Benchmark | Compute per-model metrics"
      ansible.builtin.set_fact:
        model_metrics: |
          {% set ns = namespace(results={}) %}
          {% for model in models_to_benchmark %}
          {% set ns2 = namespace(coding_quality=0, coding_count=0, general_quality=0, general_count=0, total_toks=0, total_eval_time=0, ttft_sum=0, ttft_count=0, latency_ns=0) %}
          {% for result in benchmark_raw_results.results %}
          {% if result.item[0] == model and result.status == 200 %}
          {% set test_name = result.item[1] %}
          {% set resp = result.json | default({}) %}
          {% set eval_count = resp.eval_count | default(0) | int %}
          {% set eval_duration = resp.eval_duration | default(1) | int %}
          {% set prompt_eval_duration = resp.prompt_eval_duration | default(0) | int %}
          {% set response_text = resp.response | default('') %}
          {% set tok_per_sec = (eval_count / (eval_duration / 1000000000.0)) if eval_duration > 0 else 0 %}
          {% set ns2.total_toks = ns2.total_toks + tok_per_sec %}
          {% set ns2.ttft_sum = ns2.ttft_sum + prompt_eval_duration %}
          {% set ns2.ttft_count = ns2.ttft_count + 1 %}
          {% if test_name == 'latency' %}
          {% set ns2.latency_ns = eval_duration + prompt_eval_duration %}
          {% endif %}
          {% set resp_len = response_text | length %}
          {% if test_name in ['code_gen', 'debug', 'refactor'] %}
          {% set has_def = 1 if 'def ' in response_text else 0 %}
          {% set has_return = 1 if 'return' in response_text else 0 %}
          {% set has_assert = 1 if 'assert ' in response_text else 0 %}
          {% set has_test_def = 1 if 'def test_' in response_text else 0 %}
          {% set has_docstring = 1 if '"""' in response_text else 0 %}
          {% set has_type_hint = 1 if ' -> ' in response_text else 0 %}
          {% set has_code_block = 1 if '```' in response_text else 0 %}
          {% set has_import = 1 if ('import ' in response_text or 'from ' in response_text) else 0 %}
          {% if test_name == 'code_gen' %}
          {% set quality = (has_def * 0.20 + has_return * 0.20 + has_docstring * 0.15 + has_type_hint * 0.15 + has_code_block * 0.10 + has_assert * 0.08 + has_test_def * 0.07 + has_import * 0.05) %}
          {% elif test_name == 'debug' %}
          {% set quality = (has_def * 0.30 + has_return * 0.30 + has_code_block * 0.25 + has_assert * 0.15) %}
          {% else %}
          {% set quality = (has_def * 0.25 + has_return * 0.25 + has_code_block * 0.20 + has_type_hint * 0.15 + has_import * 0.15) %}
          {% endif %}
          {% set ns2.coding_quality = ns2.coding_quality + quality %}
          {% set ns2.coding_count = ns2.coding_count + 1 %}
          {% elif test_name in ['explain', 'creative', 'reasoning'] %}
          {% set length_score = [resp_len / 800.0, 1.0] | min %}
          {% set has_structure = 1 if ('\n' in response_text and resp_len > 100) else 0 %}
          {% set has_list = 1 if ('\n- ' in response_text or '\n* ' in response_text or '\n1.' in response_text) else 0 %}
          {% set has_detail = 1 if '\n\n' in response_text else 0 %}
          {% set quality = (length_score * 0.35 + has_structure * 0.40 + has_list * 0.15 + has_detail * 0.10) %}
          {% set ns2.general_quality = ns2.general_quality + quality %}
          {% set ns2.general_count = ns2.general_count + 1 %}
          {% endif %}
          {% endif %}
          {% endfor %}
          {% set coding_avg = (ns2.coding_quality / ns2.coding_count) if ns2.coding_count > 0 else 0 %}
          {% set general_avg = (ns2.general_quality / ns2.general_count) if ns2.general_count > 0 else 0 %}
          {% set test_count = (ns2.ttft_count) if ns2.ttft_count > 0 else 1 %}
          {% set avg_toks = ns2.total_toks / test_count %}
          {# float cast: externally supplied vars may arrive as strings #}
          {% set toks_norm = [avg_toks / (benchmark_toks_norm_ceiling | float), 1.0] | min %}
          {% set latency_ms = ns2.latency_ns / 1000000.0 if ns2.latency_ns > 0 else 9999 %}
          {% set latency_score = [1.0 - (latency_ms / 5000.0), 0] | max %}
          {% set coding_composite = coding_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
          {% set general_composite = general_avg * 0.45 + toks_norm * 0.30 + latency_score * 0.25 %}
          {% set _override = (model_category_overrides | default({}))[model] | default('') %}
          {% if _override in ['coding', 'general'] %}
          {% set category = _override %}
          {% elif (coding_avg - general_avg) >= (benchmark_coding_threshold | float) %}
          {% set category = 'coding' %}
          {% elif 'coder' in model | lower or 'codestral' in model | lower or 'codellama' in model | lower or 'starcoder' in model | lower %}
          {% set category = 'coding' %}
          {% else %}
          {% set category = 'general' %}
          {% endif %}
          {% set _ = ns.results.update({model: {'coding_quality': coding_avg | round(3), 'general_quality': general_avg | round(3), 'avg_tok_per_sec': avg_toks | round(1), 'toks_norm': toks_norm | round(3), 'latency_ms': latency_ms | round(1), 'latency_score': latency_score | round(3), 'coding_composite': coding_composite | round(3), 'general_composite': general_composite | round(3), 'category': category}}) %}
          {% endfor %}
          {{ ns.results | to_json }}
      tags:
        - benchmark-compute

    - name: "Benchmark | Parse model metrics"
      ansible.builtin.set_fact:
        parsed_metrics: "{{ model_metrics | from_json }}"
      tags:
        - benchmark-compute

    # Slots 1/2/5 = best general models, slots 3/4/6 = best coding models;
    # lower-ranked slots fall back to the best available model (or 'none').
    - name: "Benchmark | Rank models and select slots"
      ansible.builtin.set_fact:
        model_selection: |
          {% set general_models = [] %}
          {% set coding_models = [] %}
          {% for model, metrics in parsed_metrics.items() %}
          {% if metrics.category == 'general' %}
          {% set _ = general_models.append({'name': model, 'composite': metrics.general_composite, 'metrics': metrics}) %}
          {% else %}
          {% set _ = coding_models.append({'name': model, 'composite': metrics.coding_composite, 'metrics': metrics}) %}
          {% endif %}
          {% endfor %}
          {% set general_sorted = general_models | sort(attribute='composite', reverse=true) %}
          {% set coding_sorted = coding_models | sort(attribute='composite', reverse=true) %}
          {% set slot1 = general_sorted[0].name if general_sorted | length > 0 else 'none' %}
          {% set slot2 = general_sorted[1].name if general_sorted | length > 1 else (general_sorted[0].name if general_sorted | length > 0 else 'none') %}
          {% set slot5 = general_sorted[2].name if general_sorted | length > 2 else 'none' %}
          {% set slot3 = coding_sorted[0].name if coding_sorted | length > 0 else (general_sorted[0].name if general_sorted | length > 0 else 'none') %}
          {% set slot4 = coding_sorted[1].name if coding_sorted | length > 1 else (coding_sorted[0].name if coding_sorted | length > 0 else 'none') %}
          {% set slot6 = coding_sorted[2].name if coding_sorted | length > 2 else 'none' %}
          {{ {'slot1_general': slot1, 'slot2_general': slot2, 'slot5_general_rotate': slot5,
          'slot3_coding': slot3, 'slot4_coding': slot4, 'slot6_coding_rotate': slot6,
          'all_metrics': parsed_metrics, 'general_ranking': general_sorted,
          'coding_ranking': coding_sorted} | to_json }}
      tags:
        - benchmark-select

    - name: "Benchmark | Parse model selection"
      ansible.builtin.set_fact:
        selection: "{{ model_selection | from_json }}"
      tags:
        - benchmark-select

    - name: "Benchmark | Display model selection results"
      ansible.builtin.debug:
        msg:
          - "============================================="
          - " MODEL SELECTION RESULTS (6-slot / 2-socket)"
          - "============================================="
          - " Node 1 — General (port 11434)"
          - "   Slot 1 (locked): {{ selection.slot1_general }}"
          - "   Slot 2 (locked): {{ selection.slot2_general }}"
          - "   Slot 5 (rotate): {{ selection.slot5_general_rotate }}"
          - " Node 0 — Coding (port 11435)"
          - "   Slot 3 (locked): {{ selection.slot3_coding }}"
          - "   Slot 4 (locked): {{ selection.slot4_coding }}"
          - "   Slot 6 (rotate): {{ selection.slot6_coding_rotate }}"
          - "============================================="
      tags:
        - benchmark-select

    - name: "Benchmark | Generate timestamp"
      ansible.builtin.set_fact:
        benchmark_timestamp: "{{ ansible_date_time.iso8601_basic_short }}"
      tags:
        - benchmark-report

    - name: "Benchmark | Save benchmark results markdown"
      ansible.builtin.copy:
        content: |
          # Benchmark Results - {{ benchmark_timestamp }}

          ## Model Selection (6-slot / 2-socket)

          | Slot | Socket | Role | Model | Composite Score |
          |------|--------|------|-------|----------------|
          | 1 | Node 1 (port 11434) | General (locked) | {{ selection.slot1_general }} | {{ parsed_metrics[selection.slot1_general].general_composite | default('N/A') }} |
          | 2 | Node 1 (port 11434) | General (locked) | {{ selection.slot2_general }} | {{ parsed_metrics[selection.slot2_general].general_composite | default('N/A') }} |
          | 5 | Node 1 (port 11434) | General (rotate) | {{ selection.slot5_general_rotate }} | {{ parsed_metrics[selection.slot5_general_rotate].general_composite | default('N/A') }} |
          | 3 | Node 0 (port 11435) | Coding (locked) | {{ selection.slot3_coding }} | {{ parsed_metrics[selection.slot3_coding].coding_composite | default('N/A') }} |
          | 4 | Node 0 (port 11435) | Coding (locked) | {{ selection.slot4_coding }} | {{ parsed_metrics[selection.slot4_coding].coding_composite | default('N/A') }} |
          | 6 | Node 0 (port 11435) | Coding (rotate) | {{ selection.slot6_coding_rotate }} | {{ parsed_metrics[selection.slot6_coding_rotate].coding_composite | default('N/A') }} |

          ## Detailed Metrics

          {% for model, metrics in parsed_metrics.items() %}
          ### {{ model }}
          - **Category**: {{ metrics.category }}
          - **Coding Quality**: {{ metrics.coding_quality }}
          - **General Quality**: {{ metrics.general_quality }}
          - **Avg Tokens/sec**: {{ metrics.avg_tok_per_sec }}
          - **Latency (ms)**: {{ metrics.latency_ms }}
          - **Coding Composite**: {{ metrics.coding_composite }}
          - **General Composite**: {{ metrics.general_composite }}
          {% endfor %}

          ## Scoring Formula

          - Composite = quality * 0.45 + token_speed_normalized * 0.30 + latency_score * 0.25
          - Speed normalized against {{ benchmark_toks_norm_ceiling }} tok/sec ceiling (hardware-observed max)
          - Coding quality (per-prompt):
            code_gen: has_def×0.20 + has_return×0.20 + has_docstring×0.15 + has_type_hint×0.15 + has_code_block×0.10 + has_assert×0.08 + has_test_def×0.07 + has_import×0.05
            debug: has_def×0.30 + has_return×0.30 + has_code_block×0.25 + has_assert×0.15
            refactor: has_def×0.25 + has_return×0.25 + has_code_block×0.20 + has_type_hint×0.15 + has_import×0.15
          - Category: override dict → quality delta (coding_avg - general_avg >= {{ benchmark_coding_threshold }}) → name pattern (coder/codestral/codellama/starcoder) → general
        dest: "{{ benchmark_results_dir }}/benchmark_{{ benchmark_timestamp }}.md"
        mode: "0644"
      delegate_to: localhost
      tags:
        - benchmark-report

    - name: "Benchmark | Save model_selection.json"
      ansible.builtin.copy:
        content: "{{ selection | to_nice_json }}"
        dest: "{{ benchmark_results_dir }}/model_selection.json"
        mode: "0644"
      delegate_to: localhost
      tags:
        - benchmark-report

    - name: "Benchmark | Check minimum composite scores"
      ansible.builtin.debug:
        msg: >-
          WARNING: Best composite score for {{ item.key }} models is below threshold
          ({{ min_composite_score }}). Consider pulling additional models.
          Recommended candidates: qwen2.5-coder:14b, deepseek-coder-v2:16b, codellama:34b
      when: >-
        (item.value.coding_composite < min_composite_score | float) and
        (item.value.general_composite < min_composite_score | float)
      loop: "{{ parsed_metrics | dict2items }}"
      loop_control:
        label: "{{ item.key }}"
      tags:
        - benchmark-report

    - name: "Benchmark | Pull recommended model if pull_if_better is true"
      ansible.builtin.command: "ollama pull qwen2.5-coder:14b"
      when:
        - pull_if_better | bool
        # Guard: `max` raises on an empty sequence when nothing was benchmarked.
        - parsed_metrics | length > 0
        - parsed_metrics.values() | map(attribute='coding_composite') | max < min_composite_score | float
      changed_when: true
      tags:
        - benchmark-pull