---
# ============================================================
# Global Variables — AI Platform Ansible Automation
# ============================================================
# Domain and networking
domain: example.com
ai_server_ip: 192.168.1.100
nginx_proxy_ip: 192.168.1.30
coredns_host_ip: 192.168.1.29
# SSH user for all managed hosts (override per-host in host_vars if needed)
ansible_user: admin
# Platform identity — used for Keycloak realm, Vault paths, UI display names
platform_name: "AI Platform"
vault_project_slug: "ai-platform"
# Service URLs
vault_url: "https://vault.{{ domain }}"
keycloak_url: "https://idm.{{ domain }}"
openwebui_url: "https://ollama-ui.{{ domain }}"
ollama_api_url: "https://ollama-api.{{ domain }}"
# Storage paths on ai_server
ai_data_root: /mnt/ai_data
ollama_models_path: "{{ ai_data_root }}/ollama_models"
keycloak_data_path: "{{ ai_data_root }}/keycloak"
qdrant_data_path: "{{ ai_data_root }}/qdrant"
openwebui_data_path: "{{ ai_data_root }}/open-webui"
openclaw_data_path: "{{ ai_data_root }}/openclaw"
benchmark_results_path: "{{ ai_data_root }}/benchmarks"
# Storage paths on coredns_host
vault_config_path: /docker_mounts/vault/config
vault_data_path: /docker_mounts/vault/data
vault_scripts_path: /docker_mounts/vault
coredns_zone_file: "/docker_mounts/coredns/{{ domain }}.db"
# Local control-node paths (gitignored)
vault_token_file: "{{ playbook_dir }}/../vault/.vault-token"
vault_init_file: "{{ playbook_dir }}/../vault/.vault-init.json"
# Vault configuration
vault_port: 8202
vault_api_addr: "https://vault.{{ domain }}"
vault_secret_prefix: "secret/data/{{ vault_project_slug }}"
vault_secret_meta_prefix: "secret/metadata/{{ vault_project_slug }}"
vault_approle_name: "ai-services"
  42. # Service ports
  43. keycloak_port: 8180
  44. ollama_port: 11434
  45. ollama_node0_port: 11435
  46. qdrant_http_port: 6333
  47. qdrant_grpc_port: 6334
  48. # Ollama configuration
  49. ollama_host: "0.0.0.0:11434"
  50. ollama_num_threads: 14
  51. ollama_num_parallel: 2
  52. ollama_max_loaded_models: 3 # 3 per socket (6 total across both NUMA instances)
  53. ollama_keep_alive: "-1"
  54. ollama_flash_attention: "1"
  55. # NUMA/CPU affinity - Dell M630, 2x E5-2690v4
  56. # CPUs are interleaved: odd = socket 1 (NUMA node 1), even = socket 0.
  57. # Physical cores on node 1: 1,3,...,27 (14 cores). HT siblings: 29,31,...,55.
  58. # Physical cores on node 0: 0,2,...,26 (14 cores). HT siblings: 28,30,...,54.
  59. # Pinning to physical cores only eliminates HT contention on the memory bus.
  60. # NUMA node 1 has ~120 GB free RAM vs node 0's ~75 GB.
  61. ollama_numa_node: "1"
  62. ollama_cpu_affinity: "1 3 5 7 9 11 13 15 17 19 21 23 25 27"
  63. ollama_node0_cpu_affinity: "0 2 4 6 8 10 12 14 16 18 20 22 24 26"
  64. ollama_binary_path: /usr/bin/ollama
  65. # Keycloak configuration
  66. keycloak_realm: "{{ vault_project_slug }}"
  67. keycloak_realm_display: "{{ platform_name }}"
  68. keycloak_client_id: open-webui
  69. keycloak_redirect_uri: "https://ollama-ui.{{ domain }}/*"
  70. keycloak_oidc_url: "https://idm.{{ domain }}/realms/{{ keycloak_realm }}"
  71. keycloak_realm_admin_user: "{{ vault_project_slug }}-admin"
  72. # Benchmark thresholds
  73. benchmark_thresholds:
  74. min_tokens_per_sec: 5.0
  75. min_quality_score: 0.6
  76. min_composite_score: 0.55
  77. benchmark_toks_norm_ceiling: 40 # Conservative dual-socket estimate (was 22 single-socket)
  78. benchmark_coding_threshold: 0.10 # Delta to classify a model as coding-specialized
  79. # Modelfile aliases created by 04_models.yml — excluded from benchmark to prevent
  80. # 32k-token KV cache allocations stalling the run with 285-second response times.
  81. benchmark_skip_aliases:
  82. - "coder-128k"
  83. - "coder-32k"
  84. - "coder-rotate"
  85. - "llama-family"
  86. - "gemma-family"
  87. benchmark_small_max_gb: 10 # upper size boundary for small pass (< 10 GB), based on runtime RAM
  88. benchmark_medium_max_gb: 15 # upper size boundary for medium pass (10–15 GB), based on runtime RAM
  89. benchmark_size_overhead_factor: 1.2 # ollama list shows disk size; multiply by this to estimate runtime RAM
  90. benchmark_load_timeout: 180 # seconds — warm-up "Hi" prompt per model before benchmarking
  91. benchmark_small_timeout: 90 # seconds per request, small models (<10 GB)
  92. benchmark_medium_timeout: 240 # seconds per request, medium models (10–15 GB)
  93. benchmark_large_timeout: 480 # seconds per request, large models (>15 GB)
  94. benchmark_num_predict: 500 # cap output tokens; allows full coding responses (def+return+docstring+assert); worst-case: 6.5 tok/s→77s, 22 tok/s→23s
  95. # Explicit category overrides applied before heuristics. Keys are model names as
  96. # returned by `ollama list`. Valid values: 'coding' or 'general'.
  97. # Example: { "deepseek-coder-v2": "coding", "qwen2.5-coder:7b": "coding" }
  98. model_category_overrides: {}
  99. # Baseline models — always pulled before benchmarking regardless of model_selection.json.
  100. # These are the minimum set needed to populate all 4 slots with meaningful candidates.
  101. baseline_models:
  102. - "llama3.2:3b"
  103. - "deepseek-coder-v2:16b"
  104. - "qwen2.5-coder:7b"
  105. - "llama3.1:8b"
  106. # Candidate models to recommend/pull if benchmark scores are below threshold
  107. candidate_models:
  108. - name: "qwen2.5-coder:32b-instruct-q4_K_M"
  109. size_gb: 20
  110. expected_tokens_sec: 4.5
  111. reason: "Larger qwen2.5-coder for higher quality"
  112. category: coding
  113. - name: "codegemma:7b-instruct-q5_K_M"
  114. size_gb: 5.5
  115. expected_tokens_sec: 12.0
  116. reason: "Fast Google coding model"
  117. category: coding
  118. - name: "starcoder2:15b-instruct-q4_K_M"
  119. size_gb: 9.5
  120. expected_tokens_sec: 7.0
  121. reason: "StarCoder2 coding specialist"
  122. category: coding
  123. # OpenClaw default model — overridden dynamically by 08_openclaw.yml from slot1_general
  124. openclaw_model: "deepseek-coder-v2:16b-lite-instruct-q4_K_M"
  125. # AWS Bedrock (OpenAI-compatible API via Open WebUI)
  126. # Pass bearer_token on first run: -e "bedrock_bearer_token=<value>"
  127. # To rotate: re-run with the new token value.
  128. bedrock_aws_region: "us-east-1"
  129. # NGINX SSL certificate paths (on nginx_proxy)
  130. nginx_ssl_cert: "/etc/nginx/ssl/{{ domain }}.crt"
  131. nginx_ssl_key: "/etc/nginx/ssl/{{ domain }}.key"