---
# playbooks/00_preflight.yml
# Pre-flight checks: control-node dependencies, SSH/sudo connectivity, disk space, Ollama API health
  # Play: install Python dependencies on the control node (localhost, local
  # connection, no fact gathering).
  - name: "Pre-flight | Control node dependencies"
    hosts: localhost
    connection: local
    gather_facts: false
    vars:
      # Lookup plugins execute inside the Python process that runs
      # ansible-playbook, so hvac has to be installed for that interpreter.
      # Implicit localhost already defaults to it; pinning it here also covers
      # inventories that define localhost explicitly with another interpreter.
      ansible_python_interpreter: "{{ ansible_playbook_python }}"
    tags:
      - preflight
      - local-deps
    tasks:
      # Installs hvac with pip for the interpreter selected above, without
      # privilege escalation.
      - name: "Preflight | Ensure hvac Python library is installed (required for Vault lookups)"
        ansible.builtin.pip:
          name: hvac
          state: present
        become: false
  # Play: on every inventory host, gather facts and check that privilege
  # escalation (become) yields the root user.
  - name: "Pre-flight | Verify SSH and sudo access on all hosts"
    hosts: all
    become: true
    gather_facts: true
    tags:
      - preflight
      - connectivity
    tasks:
      # Read-only probe: records the effective user name, never reports "changed".
      - name: "Preflight | Confirm sudo privileges"
        ansible.builtin.command:
          cmd: whoami
        register: whoami_result
        changed_when: false

      # Fails the host when the escalated user is anything other than root.
      - name: "Preflight | Assert running as root via sudo"
        ansible.builtin.assert:
          that:
            - whoami_result.stdout == "root"
          success_msg: "sudo access confirmed on {{ inventory_hostname }}"
          fail_msg: "sudo escalation failed on {{ inventory_hostname }}"
  # Play: require at least 500 GB available at /mnt/ai_data on ai_server.
  - name: "Pre-flight | Disk space checks on ai_server"
    hosts: ai_server
    become: true
    gather_facts: false
    tags:
      - preflight
      - disk
    tasks:
      # Read-only df call limited to the "avail" column, sized with -BG.
      - name: "Preflight | Get /mnt/ai_data mount info"
        ansible.builtin.command:
          cmd: df --output=avail -BG /mnt/ai_data
        register: ai_data_disk
        changed_when: false

      # Takes the last line of df output and keeps only its digits.
      - name: "Preflight | Parse available disk space on /mnt/ai_data"
        ansible.builtin.set_fact:
          ai_data_avail_gb: "{{ ai_data_disk.stdout_lines[-1] | regex_replace('[^0-9]', '') | int }}"

      # Stops the play for this host when less than 500 is reported.
      - name: "Preflight | Assert /mnt/ai_data has >= 500 GB free"
        ansible.builtin.assert:
          that:
            - ai_data_avail_gb | int >= 500
          success_msg: "/mnt/ai_data has {{ ai_data_avail_gb }} GB free (>= 500 GB required)"
          fail_msg: >-
            CRITICAL: /mnt/ai_data on ai_server has only
            {{ ai_data_avail_gb }} GB free. At least 500 GB is required
            for AI models and application data.
  # Play: require at least 10 GB available on / for coredns_host.
  - name: "Pre-flight | Disk space checks on coredns_host"
    hosts: coredns_host
    become: true
    gather_facts: false
    tags:
      - preflight
      - disk
    tasks:
      # Read-only df call limited to the "avail" column, sized with -BG.
      - name: "Preflight | Get / mount info on coredns_host"
        ansible.builtin.command:
          cmd: df --output=avail -BG /
        register: root_disk
        changed_when: false

      # Takes the last line of df output and keeps only its digits.
      - name: "Preflight | Parse available disk space on /"
        ansible.builtin.set_fact:
          root_avail_gb: "{{ root_disk.stdout_lines[-1] | regex_replace('[^0-9]', '') | int }}"

      # Stops the play for this host when less than 10 is reported.
      - name: "Preflight | Assert / has >= 10 GB free on coredns_host"
        ansible.builtin.assert:
          that:
            - root_avail_gb | int >= 10
          success_msg: "/ has {{ root_avail_gb }} GB free (>= 10 GB required)"
          fail_msg: >-
            CRITICAL: / on coredns_host has only {{ root_avail_gb }} GB
            free. At least 10 GB is required.
  # Play: confirm the Ollama HTTP API answers on ai_server, then list its models.
  # The request is issued on ai_server itself, so "localhost" in the URL refers
  # to that host.
  - name: "Pre-flight | Ollama API health check on ai_server"
    hosts: ai_server
    gather_facts: false
    become: false
    tags:
      - preflight
      - ollama
    tasks:
      # Polls GET /api/tags: up to 12 attempts, 5 s apart, 10 s timeout each.
      - name: "Preflight | Check Ollama API is responding"
        ansible.builtin.uri:
          url: "http://localhost:11434/api/tags"
          method: GET
          return_content: true
          status_code: 200
          timeout: 10
        register: ollama_health
        retries: 12
        delay: 5
        until: ollama_health.status is defined and ollama_health.status == 200
        # Individual attempts are never reported as failed...
        failed_when: false
        # ...but Ansible still marks the task failed once `until` retries are
        # exhausted, which would stop this host before the assert below can
        # print its explanatory message. Ignore that failure here and let the
        # assert make the final decision. (Unreachable hosts are not affected
        # by ignore_errors.)
        ignore_errors: true

      # Final verdict. `default(-1)` keeps the condition evaluable when the
      # probe returned no status at all, so fail_msg is shown instead of an
      # undefined-variable error.
      - name: "Preflight | Assert Ollama API is healthy"
        ansible.builtin.assert:
          that:
            - (ollama_health.status | default(-1)) == 200
          fail_msg: >-
            CRITICAL: Ollama API is not responding on ai_server at http://localhost:11434/api/tags.
            HTTP status: {{ ollama_health.status | default('unreachable') }}.
            Ensure Ollama is installed and running before proceeding.
          success_msg: "Ollama API is healthy and responding on ai_server"

      # Informational only: prints the `name` of each entry under `models` in
      # the JSON response (empty list when the key is absent).
      - name: "Preflight | Display available Ollama models"
        ansible.builtin.debug:
          msg: "Ollama models available: {{ (ollama_health.json.models | default([])) | map(attribute='name') | list }}"
        when: (ollama_health.status | default(-1)) == 200