# override.conf.j2 — Ansible/Jinja2 template for a systemd drop-in
# (override.conf) applied to the Ollama service unit.
  1. [Service]
  2. # ── Ollama API & model settings ────────────────────────────────────────────
  3. Environment="OLLAMA_API_KEY={{ ollama_api_key }}"
  4. Environment="OLLAMA_HOST=0.0.0.0:11434"
  5. Environment="OLLAMA_MODELS=/mnt/ai_data/ollama_models"
  6. Environment="OLLAMA_KEEP_ALIVE=-1"
  7. # ── Inference performance ──────────────────────────────────────────────────
  8. # Flash attention: fused softmax, ~20% less memory bandwidth, faster on AVX2
  9. Environment="OLLAMA_FLASH_ATTENTION=1"
  10. # Threads: 28 logical CPUs on NUMA node 1 (14 physical cores × 2 HT)
  11. # Covers all threads on the socket so no cross-socket migrations occur
  12. Environment="OLLAMA_NUM_THREADS={{ ollama_num_threads }}"
  13. # Parallel inference streams — 4 simultaneous requests, 7 threads each
  14. Environment="OLLAMA_NUM_PARALLEL={{ ollama_num_parallel }}"
  15. # Keep 4 models warm in RAM (KEEP_ALIVE=-1 means never unload)
  16. Environment="OLLAMA_MAX_LOADED_MODELS={{ ollama_max_loaded_models }}"
  17. # ── NUMA / CPU binding ────────────────────────────────────────────────────
  18. # Pin all Ollama threads to NUMA node 1 CPUs (odd: 1,3,5,...,55).
  19. # Node 1 has ~120 GB free RAM vs node 0's ~75 GB.
  20. # CPUAffinity prevents cross-NUMA thread migration; Linux will naturally
  21. # allocate memory from the local node when all threads are on that node.
  22. CPUAffinity={{ ollama_cpu_affinity }}
  23. # ── Memory hardening ───────────────────────────────────────────────────────
  24. # Prevent model weights from being paged out under memory pressure
  25. LimitMEMLOCK=infinity
  26. # Sufficient file descriptors for parallel connections and mmap'd model files
  27. LimitNOFILE=65535
  28. # Disable OOM kill — losing a loaded model mid-inference is worse than
  29. # the kernel reclaiming other memory first
  30. OOMScoreAdjust=-500