02_infrastructure.yml 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. ---
  2. # playbooks/02_infrastructure.yml
  3. # Install Docker and configure Ollama on ai_server
  4. - name: "Infrastructure | Docker and Ollama setup on ai_server"
  5. hosts: ai_server
  6. become: true
  7. gather_facts: true
  8. tags:
  9. - infrastructure
  10. vars:
  11. vault_token_file: "{{ playbook_dir }}/../vault/.vault-token"
  12. vault_url: "http://{{ ai_server_ip }}:{{ vault_port }}"
  pre_tasks:
  # Both prerequisite tasks carry the "always" tag, so they still run when the
  # play is invoked with a narrower --tags selection.
  - name: "Infrastructure | Install Python Docker SDK prerequisites"
    tags: always
    ansible.builtin.dnf:
      state: present
      name:
      - python3-pip
      - python3-requests
      - numactl
  # The Docker SDK for Python is installed from PyPI through pip3.
  - name: "Infrastructure | Install Python docker SDK via pip"
    tags: always
    ansible.builtin.pip:
      executable: pip3
      name: docker
      state: present
  tasks:
  # ── Docker installation ──────────────────────────────────────────
  # The stat result gates the repository download below and is kept as a
  # registered variable (docker_repo_file).
  - name: "Docker | Check if Docker CE repo is already configured"
    ansible.builtin.stat:
      path: /etc/yum.repos.d/docker-ce.repo
    register: docker_repo_file
    tags:
    - docker
  # Download the repo definition with get_url instead of shelling out to
  # `dnf config-manager --add-repo`: that invocation depends on the
  # config-manager plugin being installed and its syntax differs between dnf
  # major versions. get_url writes the same file and reports changed/ok
  # accurately without a changed_when override.
  - name: "Docker | Add Docker CE repository"
    ansible.builtin.get_url:
      url: https://download.docker.com/linux/fedora/docker-ce.repo
      dest: /etc/yum.repos.d/docker-ce.repo
      mode: "0644"
      owner: root
      group: root
    when: not docker_repo_file.stat.exists
    tags:
    - docker
  - name: "Docker | Install Docker CE packages"
    ansible.builtin.dnf:
      name:
      - docker-ce
      - docker-ce-cli
      - containerd.io
      - docker-compose-plugin
      state: present
    tags:
    - docker
  # append: true adds the docker group without removing existing memberships.
  - name: "Docker | Add {{ ansible_user }} to docker group"
    ansible.builtin.user:
      name: "{{ ansible_user }}"
      groups: docker
      append: true
    tags:
    - docker
  # NOTE(review): ansible.builtin.user creates the account when it is missing,
  # and the Ollama installer only runs later in this play. On a fresh host this
  # task would therefore create `ollama` as a regular user before the
  # installer runs. Presumably the installer is meant to create that account;
  # confirm, and consider moving this task after the Ollama install step.
  - name: "Docker | Add ollama user to docker group"
    ansible.builtin.user:
      name: ollama
      groups: docker
      append: true
    tags:
    - docker
  - name: "Docker | Start and enable docker.service"
    ansible.builtin.systemd:
      name: docker
      state: started
      enabled: true
    tags:
    - docker
  # ── Ollama installation and configuration ────────────────────────
  # Probe both candidate install locations; the results feed ollama_installed.
  - name: "Ollama | Check if ollama binary exists"
    ansible.builtin.stat:
      path: "{{ item }}"
    loop:
    - /usr/local/bin/ollama
    - /usr/bin/ollama
    register: ollama_binary_check
    tags:
    - ollama
  # True when at least one of the probed paths exists.
  - name: "Ollama | Set ollama installed fact"
    ansible.builtin.set_fact:
      ollama_installed: "{{ ollama_binary_check.results | selectattr('stat.exists', 'equalto', true) | list | length > 0 }}"
    tags:
    - ollama
  # pipefail makes a failed download fail the task; without it the pipeline's
  # status is that of `sh`, which exits 0 on empty input, so a curl error
  # would be reported as a successful install. pipefail needs bash.
  # NOTE(review): this executes a remotely fetched script as root.
  - name: "Ollama | Install Ollama"
    ansible.builtin.shell:
      cmd: |
        set -o pipefail
        curl -fsSL https://ollama.ai/install.sh | sh
      executable: /bin/bash
    when: not ollama_installed
    changed_when: true
    tags:
    - ollama
  # The token and URL are passed as keyword options rather than spliced into
  # a space-separated term string. no_log keeps the API key out of task
  # output and logs.
  - name: "Ollama | Retrieve OLLAMA_API_KEY from Vault"
    ansible.builtin.set_fact:
      ollama_api_key: >-
        {{ lookup('community.hashi_vault.hashi_vault',
                  vault_secret_prefix ~ '/ollama:api_key',
                  token=lookup('ansible.builtin.file', vault_token_file),
                  url=vault_url) }}
    no_log: true
    tags:
    - ollama
  - name: "Ollama | Create systemd override directory"
    ansible.builtin.file:
      path: /etc/systemd/system/ollama.service.d
      state: directory
      mode: "0755"
      owner: root
      group: root
    tags:
    - ollama
  - name: "Ollama | Template systemd override configuration"
    ansible.builtin.template:
      src: "{{ playbook_dir }}/../templates/ollama/override.conf.j2"
      dest: /etc/systemd/system/ollama.service.d/override.conf
      mode: "0644"
      owner: root
      group: root
    notify:
    - Reload systemd and restart ollama
    tags:
    - ollama
  # Run the notified reload/restart now instead of at the end of the play, so
  # the readiness probe below checks the service with the new override applied.
  - name: "Ollama | Apply pending override changes before health checks"
    ansible.builtin.meta: flush_handlers
    tags:
    - ollama
  - name: "Ollama | Ensure Ollama is running"
    ansible.builtin.systemd:
      name: ollama
      state: started
      enabled: true
    tags:
    - ollama
  # Poll for up to 24 attempts, 5 seconds apart, until the API answers 200.
  - name: "Ollama | Wait for Ollama API to be ready"
    ansible.builtin.uri:
      url: "http://localhost:11434/api/tags"
      method: GET
      status_code: 200
      timeout: 10
    register: ollama_ready
    retries: 24
    delay: 5
    until: ollama_ready.status == 200
    tags:
    - ollama
  # The template result is registered so the next task can restart the
  # service when the unit file changed.
  - name: "Ollama | Deploy ollama-node0 systemd unit"
    ansible.builtin.template:
      src: "{{ playbook_dir }}/../templates/ollama/ollama-node0.service.j2"
      dest: /etc/systemd/system/ollama-node0.service
      mode: "0644"
      owner: root
      group: root
    register: ollama_node0_unit
    notify:
    - Reload systemd and start ollama-node0
    tags:
    - ollama
  # A changed unit file must restart an already-running service. The notified
  # handler only ensures state "started", which is a no-op on a running unit,
  # so the restart happens here, before the readiness probe.
  - name: "Ollama | Enable and start ollama-node0"
    ansible.builtin.systemd:
      name: ollama-node0
      enabled: true
      state: "{{ 'restarted' if ollama_node0_unit is changed else 'started' }}"
      daemon_reload: true
    tags:
    - ollama
  # Same polling budget as the default instance; this endpoint is queried
  # with the API key retrieved from Vault.
  - name: "Ollama | Wait for ollama-node0 API to be ready"
    ansible.builtin.uri:
      url: "http://localhost:{{ ollama_node0_port }}/api/tags"
      method: GET
      headers:
        Authorization: "Bearer {{ ollama_api_key }}"
      status_code: 200
      timeout: 10
    register: ollama_node0_ready
    retries: 24
    delay: 5
    until: ollama_node0_ready.status == 200
    tags:
    - ollama
  # ── OS-level kernel tuning for dedicated inference server ────────────────
  - name: "OS Tune | Apply sysctl settings for inference workload"
    ansible.posix.sysctl:
      name: "{{ item.name }}"
      value: "{{ item.value }}"
      sysctl_file: /etc/sysctl.d/99-ollama-perf.conf
      reload: true
      state: present
    loop:
    # Disable auto-NUMA migration — fights explicit numactl --membind=1 by
    # moving KV-cache pages mid-inference to a different NUMA node.
    - name: kernel.numa_balancing
      value: "0"
    # Near-zero swappiness: prevents model weights being paged out under
    # memory pressure (complements LimitMEMLOCK=infinity in the unit file).
    - name: vm.swappiness
      value: "1"
    # Required for mlock to succeed without reservation failures.
    - name: vm.overcommit_memory
      value: "1"
    tags:
    - os-tune
  # Only write (and only report "changed") when madvise is not already the
  # active mode; the active mode is the bracketed word in the sysfs file.
  # set -e keeps a failed write fatal.
  - name: "OS Tune | Set Transparent Huge Pages to madvise (immediate)"
    ansible.builtin.shell:
      cmd: |
        set -e
        thp=/sys/kernel/mm/transparent_hugepage/enabled
        if ! grep -q '\[madvise\]' "$thp"; then
          echo madvise > "$thp"
          echo changed
        fi
    register: thp_madvise_result
    changed_when: "'changed' in thp_madvise_result.stdout"
    tags:
    - os-tune
  # Oneshot unit that reapplies the THP setting at boot.
  - name: "OS Tune | Install THP madvise persistence service"
    ansible.builtin.copy:
      dest: /etc/systemd/system/thp-madvise.service
      mode: "0644"
      owner: root
      group: root
      content: |
        [Unit]
        Description=Set Transparent Huge Pages to madvise
        After=local-fs.target
        [Service]
        Type=oneshot
        ExecStart=/bin/sh -c 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'
        RemainAfterExit=yes
        [Install]
        WantedBy=multi-user.target
    notify:
    - Reload systemd daemon
    tags:
    - os-tune
  # The notified reload handler only runs at the end of the play, so reload
  # here to make systemd read the unit file just written before enabling it.
  - name: "OS Tune | Enable THP madvise persistence service"
    ansible.builtin.systemd:
      name: thp-madvise.service
      enabled: true
      daemon_reload: true
    tags:
    - os-tune
  # When the glob matches nothing (no cpufreq driver) it stays a literal
  # string. The previous `[ -f "$gov" ] && echo ...` form then left the loop
  # with exit status 1 and failed the task; such entries are now skipped.
  # "changed" is reported only when a governor was actually rewritten.
  - name: "OS Tune | Set CPU governor to performance (immediate)"
    ansible.builtin.shell:
      cmd: |
        set -e
        for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
          [ -f "$gov" ] || continue
          if [ "$(cat "$gov")" != performance ]; then
            echo performance > "$gov"
            echo changed
          fi
        done
    register: cpu_governor_result
    changed_when: "'changed' in cpu_governor_result.stdout"
    tags:
    - os-tune
  # Oneshot unit that reapplies the governor at boot. The if/fi form keeps
  # the command's exit status 0 when no scaling_governor file exists, so the
  # unit does not fail on hosts without cpufreq.
  - name: "OS Tune | Install CPU performance governor persistence service"
    ansible.builtin.copy:
      dest: /etc/systemd/system/cpu-performance.service
      mode: "0644"
      owner: root
      group: root
      content: |
        [Unit]
        Description=Set CPU scaling governor to performance
        After=local-fs.target
        [Service]
        Type=oneshot
        ExecStart=/bin/sh -c 'for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do if [ -f "$gov" ]; then echo performance > "$gov"; fi; done'
        RemainAfterExit=yes
        [Install]
        WantedBy=multi-user.target
    notify:
    - Reload systemd daemon
    tags:
    - os-tune
  # Reload before enabling, for the same reason as the THP service above.
  - name: "OS Tune | Enable CPU performance governor persistence service"
    ansible.builtin.systemd:
      name: cpu-performance.service
      enabled: true
      daemon_reload: true
    tags:
    - os-tune
  handlers:
  # Notified by the ollama override.conf template task: reloads unit files
  # and restarts the ollama service.
  - name: "Reload systemd and restart ollama"
    ansible.builtin.systemd:
      daemon_reload: true
      name: ollama
      state: restarted
  # Notified by the ollama-node0 unit template task: reloads unit files and
  # ensures the service is started (a running service is not restarted).
  - name: "Reload systemd and start ollama-node0"
    ansible.builtin.systemd:
      daemon_reload: true
      name: ollama-node0
      state: started
  # Notified by the tasks that install the THP and CPU governor persistence
  # units: reloads unit files without touching any service.
  - name: "Reload systemd daemon"
    ansible.builtin.systemd:
      daemon_reload: true