# override.conf.j2 — systemd drop-in override for the Ollama service
# (Jinja2 template; presumably rendered by Ansible into a
# .../ollama.service.d/override.conf drop-in — confirm against the playbook)
  1. [Service]
  2. # ── Ollama API & model settings ────────────────────────────────────────────
  3. Environment="OLLAMA_API_KEY={{ ollama_api_key }}"
  4. Environment="OLLAMA_HOST=0.0.0.0:11434"
  5. Environment="OLLAMA_MODELS=/mnt/ai_data/ollama_models"
  6. Environment="OLLAMA_KEEP_ALIVE=-1"
  7. # ── Inference performance ──────────────────────────────────────────────────
  8. # Flash attention: fused softmax, ~20% less memory bandwidth, faster on AVX2
  9. Environment="OLLAMA_FLASH_ATTENTION=1"
  10. # Threads: 14 physical cores on NUMA node 1 only (no hyperthreads).
  11. # LLM inference is memory-bandwidth-bound; HT siblings share the same memory
  12. # pipeline and add scheduling overhead without adding bandwidth.
  13. Environment="OLLAMA_NUM_THREADS={{ ollama_num_threads }}"
  14. # Parallel inference streams — 2 simultaneous requests, 7 threads each.
  15. # Keeps per-request throughput high for interactive/single-user workloads.
  16. Environment="OLLAMA_NUM_PARALLEL={{ ollama_num_parallel }}"
  17. # Keep 3 models warm in RAM per instance (KEEP_ALIVE=-1 means never unload; 6 total across both sockets)
  18. Environment="OLLAMA_MAX_LOADED_MODELS={{ ollama_max_loaded_models }}"
  19. # ── NUMA / CPU binding ────────────────────────────────────────────────────
  20. # numactl --cpunodebind pins the scheduler to all logical CPUs on node 1
  21. # (14 physical + 14 HT siblings = 28 CPUs). This avoids two failure modes:
  22. #
  23. # 1. numactl --membind=1 (MPOL_BIND) suppresses khugepaged THP promotion
  24. # for the model's ~2.75 GB anonymous allocation, causing ~700k 4 KB TLB
  25. # entries and near-100% L2-STLB miss rate → 128x throughput loss.
  26. #
  27. # 2. CPUAffinity restricted to 14 physical cores only forces ~56 Go runtime
  28. # OS threads to compete with 14 GGML compute threads on 14 CPUs (5:1
  29. # oversubscription). GGML busy-wait barriers then block waiting threads
  30. # from checking in → cascading stall across ~400 ops/token → 128x loss.
  31. #
  32. # --cpunodebind (sched_setaffinity only, no set_mempolicy) gives 28 CPUs and
  33. # MPOL_DEFAULT, so allocations go to node 1 naturally and THP works freely.
  34. ExecStart=
  35. ExecStart=/usr/bin/numactl --cpunodebind={{ ollama_numa_node }} {{ ollama_binary_path }} serve
  36. # ── Memory hardening ───────────────────────────────────────────────────────
  37. # Prevent model weights from being paged out under memory pressure
  38. LimitMEMLOCK=infinity
  39. # Sufficient file descriptors for parallel connections and mmap'd model files
  40. LimitNOFILE=65535
  41. # Disable OOM kill — losing a loaded model mid-inference is worse than
  42. # the kernel reclaiming other memory first
  43. OOMScoreAdjust=-500