
fix(ollama): replace numactl --membind + CPUAffinity with --cpunodebind

Two compounding config errors caused 128x throughput regression (0.09 tok/sec):

1. numactl --membind=1 (MPOL_BIND) suppressed khugepaged THP promotion for
   the model's ~2.75 GB anonymous allocation, leaving ~700k 4 KB TLB entries
   and a near-100% L2-STLB miss rate.

2. CPUAffinity restricted to 14 physical cores forced ~56 Go runtime OS
   threads to compete with 14 GGML compute threads on 14 CPUs (5:1
   oversubscription). GGML busy-wait barriers blocked waiting threads from
   checking in, and the stall cascaded across ~400 ops/token.
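The TLB figures in point 1 follow from simple page-count arithmetic; a sketch (the ~2.75 GB size is from this commit message, rounded here to 2816 MiB for illustration):

```shell
# Page counts for the model's ~2.75 GB anonymous allocation.
bytes=$((2816 * 1024 * 1024))            # 2.75 GiB (assumed GiB rounding)
pages_4k=$((bytes / 4096))               # base 4 KB pages without THP
pages_2m=$((bytes / (2 * 1024 * 1024)))  # 2 MiB huge pages once khugepaged promotes
echo "4K pages: $pages_4k"               # ~700k entries, far beyond any STLB
echo "2M pages: $pages_2m"
```

With THP suppressed, every one of the ~700k base pages needs its own TLB entry, which is why the L2-STLB miss rate approaches 100%.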

Fix: numactl --cpunodebind=N binds the scheduler to all 28 logical CPUs of
the node (14 physical + 14 HT siblings) and leaves MPOL_DEFAULT in place, so
allocations land on the local node and THP promotion works freely.
Confirmed: throughput restored to 26 tok/sec (was 0.09 tok/sec).
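The fix can be spot-checked on a live host; a hedged sketch (the process match pattern is an assumption, all probes are guarded so the script is a no-op where the service is not running):

```shell
# Sketch: confirm binding and THP state for the running ollama process.
pid=$(pgrep -f 'numactl --cpunodebind' 2>/dev/null | head -n1 || true)
if [ -n "$pid" ]; then
  # MPOL_DEFAULT: numa_maps entries should read "default", not "bind:1"
  head -n1 "/proc/$pid/numa_maps"
  # THP promotion: AnonHugePages should be non-zero after warm-up
  grep AnonHugePages "/proc/$pid/smaps_rollup"
  # Affinity: all 28 logical CPUs of the node should be allowed
  command -v taskset >/dev/null && taskset -cp "$pid"
fi
echo "checked pid: ${pid:-none}"
```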

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
Shaun Arman, 3 days ago
commit b6ff803219

+ 2 - 2
playbooks/02_infrastructure.yml

@@ -200,8 +200,8 @@
         reload: true
         state: present
       loop:
-        # Disable auto-NUMA migration — fights explicit numactl --membind=1 by
-        # moving KV-cache pages mid-inference to a different NUMA node.
+        # Disable auto-NUMA migration — CPUAffinity pins Ollama to node 1/0
+        # physical cores; NUMA balancing could migrate pages mid-inference.
         - { name: kernel.numa_balancing, value: "0" }
         # Near-zero swappiness: prevents model weights being paged out under
         # memory pressure (complements LimitMEMLOCK=infinity in the unit file).
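The sysctl this hunk documents can be verified on the host after the playbook runs; a minimal probe (the /proc path is the standard knob, but it is absent on non-NUMA kernels):

```shell
# Read the live auto-NUMA balancing value; degrade gracefully if absent.
v=$(cat /proc/sys/kernel/numa_balancing 2>/dev/null || echo "absent")
echo "kernel.numa_balancing=$v"   # expect 0 on the inference host
```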

+ 1 - 1
templates/ollama/ollama-node0.service.j2

@@ -4,7 +4,7 @@ After=network-online.target ollama.service
 Wants=network-online.target
 
 [Service]
-ExecStart=/usr/bin/numactl --membind=0 {{ ollama_binary_path }} serve
+ExecStart=/usr/bin/numactl --cpunodebind=0 {{ ollama_binary_path }} serve
 Environment="OLLAMA_API_KEY={{ ollama_api_key }}"
 Environment="OLLAMA_HOST=0.0.0.0:{{ ollama_node0_port }}"
 Environment="OLLAMA_MODELS={{ ollama_models_path }}"

+ 15 - 8
templates/ollama/override.conf.j2

@@ -22,15 +22,22 @@ Environment="OLLAMA_NUM_PARALLEL={{ ollama_num_parallel }}"
 Environment="OLLAMA_MAX_LOADED_MODELS={{ ollama_max_loaded_models }}"
 
 # ── NUMA / CPU binding ────────────────────────────────────────────────────
-# ExecStart override: numactl --membind=1 guarantees model weights and KV
-# cache are allocated from NUMA node 1 RAM (120 GB free). CPUAffinity alone
-# does not set the memory policy; numactl makes it explicit.
+# numactl --cpunodebind pins the scheduler to all logical CPUs on node 1
+# (14 physical + 14 HT siblings = 28 CPUs). This avoids two failure modes:
+#
+#  1. numactl --membind=1 (MPOL_BIND) suppresses khugepaged THP promotion
+#     for the model's ~2.75 GB anonymous allocation, causing ~700k 4 KB TLB
+#     entries and near-100% L2-STLB miss rate → 128x throughput loss.
+#
+#  2. CPUAffinity restricted to 14 physical cores only forces ~56 Go runtime
+#     OS threads to compete with 14 GGML compute threads on 14 CPUs (5:1
+#     oversubscription). GGML busy-wait barriers then block waiting threads
+#     from checking in → cascading stall across ~400 ops/token → 128x loss.
+#
+# --cpunodebind (sched_setaffinity only, no set_mempolicy) gives 28 CPUs and
+# MPOL_DEFAULT, so allocations go to node 1 naturally and THP works freely.
 ExecStart=
-ExecStart=/usr/bin/numactl --membind=1 {{ ollama_binary_path }} serve
-
-# Restrict scheduler to physical cores on node 1 only (odd CPUs 1–27).
-# Omitting HT siblings (29–55) prevents cross-HT contention on the memory bus.
-CPUAffinity={{ ollama_cpu_affinity }}
+ExecStart=/usr/bin/numactl --cpunodebind={{ ollama_numa_node }} {{ ollama_binary_path }} serve
 
 # ── Memory hardening ───────────────────────────────────────────────────────
 # Prevent model weights from being paged out under memory pressure