override.conf.j2

[Service]
# ── Ollama API & model settings ────────────────────────────────────────────
Environment="OLLAMA_API_KEY={{ ollama_api_key }}"
Environment="OLLAMA_HOST=0.0.0.0:11434"
Environment="OLLAMA_MODELS=/mnt/ai_data/ollama_models"
Environment="OLLAMA_KEEP_ALIVE=-1"
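# Sanity check (run on the host after deploy, not part of this unit):
# `ollama ps` (or GET /api/ps on port 11434) lists loaded models and confirms
# KEEP_ALIVE=-1 is pinning them in memory.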
# ── Inference performance ──────────────────────────────────────────────────
# Flash attention: fused softmax, ~20% less memory bandwidth, faster on AVX2
Environment="OLLAMA_FLASH_ATTENTION=1"
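# Illustrative host check for the AVX2 claim above (not part of this unit):
#   grep -m1 -o avx2 /proc/cpuinfo   # prints "avx2" if the CPU supports it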
# Threads: 14 physical cores on NUMA node 1 only (no hyperthreads).
# LLM inference is memory-bandwidth-bound; HT siblings share the same memory
# pipeline and add scheduling overhead without adding bandwidth.
Environment="OLLAMA_NUM_THREADS={{ ollama_num_threads }}"
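# To confirm the core/node layout on a given host (illustrative):
#   lscpu --extended=CPU,NODE,CORE   # shows HT siblings and NUMA placement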
# Parallel inference streams: 2 simultaneous requests, 7 threads each.
# Keeps per-request throughput high for interactive/single-user workloads.
Environment="OLLAMA_NUM_PARALLEL={{ ollama_num_parallel }}"
# Keep 3 models warm in RAM per instance (KEEP_ALIVE=-1 means never unload;
# 6 total across both sockets).
Environment="OLLAMA_MAX_LOADED_MODELS={{ ollama_max_loaded_models }}"
# ── NUMA / CPU binding ────────────────────────────────────────────────────
# ExecStart override: numactl --membind=1 guarantees model weights and KV
# cache are allocated from NUMA node 1 RAM (120 GB free). CPUAffinity alone
# does not set the memory policy; numactl makes it explicit.
# The empty ExecStart= clears the packaged unit's command before replacing it.
ExecStart=
ExecStart=/usr/bin/numactl --membind=1 {{ ollama_binary_path }} serve
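# Post-start check (assumes the unit is named ollama.service; adjust if not):
#   numastat -p "$(systemctl show -p MainPID --value ollama)"
# Nearly all of the process's resident memory should fall in the Node 1 column.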
# Restrict scheduler to physical cores on node 1 only (odd CPUs 1–27).
# Omitting HT siblings (29–55) prevents cross-HT contention on the memory bus.
CPUAffinity={{ ollama_cpu_affinity }}
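# Verify the effective mask after a restart (same service-name assumption):
#   taskset -cp "$(systemctl show -p MainPID --value ollama)"
# It should list exactly the odd CPUs 1-27.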
# ── Memory hardening ───────────────────────────────────────────────────────
# Prevent model weights from being paged out under memory pressure
LimitMEMLOCK=infinity
# Sufficient file descriptors for parallel connections and mmap'd model files
LimitNOFILE=65535
# Deprioritize OOM killing (-1000 would disable it entirely): losing a loaded
# model mid-inference is worse than the kernel reclaiming other memory first.
OOMScoreAdjust=-500
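# Confirm the score applied (pgrep pattern assumes the binary is named ollama):
#   cat /proc/"$(pgrep -xn ollama)"/oom_score_adj   # expect -500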