|
|
@@ -22,15 +22,22 @@ Environment="OLLAMA_NUM_PARALLEL={{ ollama_num_parallel }}"
|
|
|
Environment="OLLAMA_MAX_LOADED_MODELS={{ ollama_max_loaded_models }}"
|
|
|
|
|
|
# ── NUMA / CPU binding ────────────────────────────────────────────────────
|
|
|
-# ExecStart override: numactl --membind=1 guarantees model weights and KV
|
|
|
-# cache are allocated from NUMA node 1 RAM (120 GB free). CPUAffinity alone
|
|
|
-# does not set the memory policy; numactl makes it explicit.
|
|
|
+# numactl --cpunodebind pins the scheduler to all 28 logical CPUs (14 phys +
|
|
|
+# 14 HT) on the node ollama_numa_node selects. This avoids two failure modes:
|
|
|
+#
|
|
|
+# 1. numactl --membind=1 (MPOL_BIND) suppresses khugepaged THP promotion
|
|
|
+#    for the model's ~2.75 GB anonymous allocation, so it stays mapped as
|
|
|
+#    ~700k 4 KB pages — near-100% L2-STLB miss rate → 128x throughput loss.
|
|
|
+#
|
|
|
+# 2. CPUAffinity restricted to 14 physical cores only forces ~56 Go runtime
|
|
|
+# OS threads to compete with 14 GGML compute threads on 14 CPUs (5:1
|
|
|
+# oversubscription). GGML busy-wait barriers then block waiting threads
|
|
|
+# from checking in → cascading stall across ~400 ops/token → 128x loss.
|
|
|
+#
|
|
|
+# --cpunodebind (sched_setaffinity only, no set_mempolicy) gives 28 CPUs and
|
|
|
+# MPOL_DEFAULT, so allocations go to node 1 naturally and THP works freely.
|
|
|
ExecStart=
|
|
|
-ExecStart=/usr/bin/numactl --membind=1 {{ ollama_binary_path }} serve
|
|
|
-
|
|
|
-# Restrict scheduler to physical cores on node 1 only (odd CPUs 1–27).
|
|
|
-# Omitting HT siblings (29–55) prevents cross-HT contention on the memory bus.
|
|
|
-CPUAffinity={{ ollama_cpu_affinity }}
|
|
|
+ExecStart=/usr/bin/numactl --cpunodebind={{ ollama_numa_node }} {{ ollama_binary_path }} serve
|
|
|
|
|
|
# ── Memory hardening ───────────────────────────────────────────────────────
|
|
|
# Prevent model weights from being paged out under memory pressure
|