# override.conf.j2 — systemd drop-in override for the Ollama service
# (Jinja2 template; presumably rendered by Ansible into a
# .../ollama.service.d/override.conf drop-in — confirm against the playbook)
  1. [Service]
  2. # ── Ollama API & model settings ────────────────────────────────────────────
  3. Environment="OLLAMA_API_KEY={{ ollama_api_key }}"
  4. Environment="OLLAMA_HOST=0.0.0.0:11434"
  5. Environment="OLLAMA_MODELS=/mnt/ai_data/ollama_models"
  6. Environment="OLLAMA_KEEP_ALIVE=-1"
  7. # ── Inference performance ──────────────────────────────────────────────────
  8. # Flash attention: fused softmax, ~20% less memory bandwidth, faster on AVX2
  9. Environment="OLLAMA_FLASH_ATTENTION=1"
  10. # Threads: 14 physical cores on NUMA node 1 only (no hyperthreads).
  11. # LLM inference is memory-bandwidth-bound; HT siblings share the same memory
  12. # pipeline and add scheduling overhead without adding bandwidth.
  13. Environment="OLLAMA_NUM_THREADS={{ ollama_num_threads }}"
  14. # Parallel inference streams — 2 simultaneous requests, 7 threads each.
  15. # Keeps per-request throughput high for interactive/single-user workloads.
  16. Environment="OLLAMA_NUM_PARALLEL={{ ollama_num_parallel }}"
  17. # Keep 3 models warm in RAM per instance (KEEP_ALIVE=-1 means never unload; 6 total across both sockets)
  18. Environment="OLLAMA_MAX_LOADED_MODELS={{ ollama_max_loaded_models }}"
  19. # ── NUMA / CPU binding ────────────────────────────────────────────────────
  20. # numactl --cpunodebind pins the scheduler to all logical CPUs on node 1
  21. # (14 physical + 14 HT siblings = 28 CPUs). This avoids two failure modes:
  22. #
  23. # 1. numactl --membind=1 (MPOL_BIND) suppresses khugepaged THP promotion
  24. # for the model's ~2.75 GB anonymous allocation, causing ~700k 4 KB TLB
  25. # entries and near-100% L2-STLB miss rate → 128x throughput loss.
  26. #
  27. # 2. CPUAffinity restricted to 14 physical cores only forces ~56 Go runtime
  28. # OS threads to compete with 14 GGML compute threads on 14 CPUs (5:1
  29. # oversubscription). GGML busy-wait barriers then block waiting threads
  30. # from checking in → cascading stall across ~400 ops/token → 128x loss.
  31. #
  32. # --cpunodebind (sched_setaffinity only, no set_mempolicy) gives 28 CPUs and
  33. # MPOL_DEFAULT, so allocations go to node 1 naturally and THP works freely.
  34. ExecStart=
  35. ExecStart=/usr/bin/numactl --cpunodebind={{ ollama_numa_node }} {{ ollama_binary_path }} serve
  36. # ── Memory hardening ───────────────────────────────────────────────────────
  37. # Prevent model weights from being paged out under memory pressure
  38. LimitMEMLOCK=infinity
  39. # Sufficient file descriptors for parallel connections and mmap'd model files
  40. LimitNOFILE=65535
  41. # Disable OOM kill — losing a loaded model mid-inference is worse than
  42. # the kernel reclaiming other memory first
  43. OOMScoreAdjust=-500