headless_lms_server/domain/system_health/health_check.rs

use super::kubernetes::{get_deployments, get_events, get_pod_disruption_budgets, get_pods};
use super::{
    DeploymentInfo, EventInfo, HealthStatus, PodDisruptionBudgetInfo, PodInfo, SystemHealthStatus,
};
use anyhow::Result;
use chrono::{DateTime, Duration, Utc};
use sqlx::{Executor, PgPool};
use tracing::warn;
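
/// Decides whether a Kubernetes event should be treated as critical. Routine
/// lifecycle reasons (scheduling, image pulls, container create/start/kill)
/// are ignored; everything else is matched against failure-related keywords
/// in the event's reason or message.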
pub fn is_critical_event(event: &EventInfo) -> bool {
    let reason = event.reason.as_deref().unwrap_or("").to_lowercase();
    let message = event.message.as_deref().unwrap_or("").to_lowercase();

    let ignored_reasons = ["scheduled", "pulled", "created", "started", "killing"];

    if ignored_reasons.iter().any(|r| reason.contains(r)) {
        return false;
    }

    let critical_reasons = [
        "failed",
        "backoff",
        "crashloop",
        "imagepullbackoff",
        "errimagepull",
        "invalid",
    ];

    critical_reasons
        .iter()
        .any(|r| reason.contains(r) || message.contains(r))
}
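
/// Returns true when the event's first or last timestamp is within the past
/// hour. Timestamps are expected in the form produced by the kubernetes
/// module, e.g. "2024-01-01 12:00:00 UTC".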
pub fn is_recent_event(event: &EventInfo) -> bool {
    let one_hour_ago = Utc::now() - Duration::hours(1);

    let check_timestamp = |ts_str: &str| -> bool {
        // chrono's `%Z` is formatting-only and yields no offset when parsing,
        // so normalize the " UTC" suffix to a numeric offset and parse with `%z`.
        if let Ok(parsed) =
            DateTime::parse_from_str(&ts_str.replace(" UTC", " +0000"), "%Y-%m-%d %H:%M:%S %z")
        {
            parsed.with_timezone(&Utc) > one_hour_ago
        } else {
            false
        }
    };

    if let Some(ts) = &event.last_timestamp {
        if check_timestamp(ts) {
            return true;
        }
    }

    if let Some(ts) = &event.first_timestamp {
        if check_timestamp(ts) {
            return true;
        }
    }

    false
}
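
/// Checks whether a pod is selected by a deployment's label selector. An
/// empty selector matches nothing rather than everything.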
fn pod_matches_deployment(pod: &PodInfo, deployment: &DeploymentInfo) -> bool {
    if deployment.selector_labels.is_empty() {
        return false;
    }
    deployment
        .selector_labels
        .iter()
        .all(|(k, v)| pod.labels.get(k) == Some(v))
}
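
/// Counts the pods that belong to `deployment` and are in the given phase
/// (e.g. "Pending" or "Running").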
fn count_deployment_pods_by_phase(
    pods: &[PodInfo],
    deployment: &DeploymentInfo,
    phase: &str,
) -> usize {
    pods.iter()
        .filter(|p| p.phase == phase && pod_matches_deployment(p, deployment))
        .count()
}
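
/// Returns the first Pod Disruption Budget whose selector labels are all
/// present on the deployment's selector, if any. As above, an empty PDB
/// selector matches nothing.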
fn is_deployment_covered_by_pdb<'a>(
    deployment: &DeploymentInfo,
    pdbs: &'a [PodDisruptionBudgetInfo],
) -> Option<&'a PodDisruptionBudgetInfo> {
    pdbs.iter().find(|pdb| {
        if pdb.selector_labels.is_empty() {
            return false;
        }
        pdb.selector_labels
            .iter()
            .all(|(k, v)| deployment.selector_labels.get(k) == Some(v))
    })
}
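
/// Convenience wrapper that collapses the detailed report into a single
/// healthy/unhealthy boolean.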
pub async fn check_system_health(ns: &str, pool: Option<&PgPool>) -> Result<bool> {
    let health = check_system_health_detailed(ns, pool).await?;
    Ok(health.status == HealthStatus::Healthy)
}
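
/// Builds a health report for a namespace from its pods, deployments, recent
/// events, and Pod Disruption Budgets, plus an optional database connectivity
/// probe. Failing to reach the cluster yields `HealthStatus::Error`, while a
/// missing PDB listing only degrades an otherwise healthy result to
/// `HealthStatus::Warning`.
///
/// A minimal usage sketch (assumes an async context with access to the
/// cluster; passing `None` skips the database probe):
///
/// ```ignore
/// let health = check_system_health_detailed("default", None).await?;
/// for issue in &health.issues {
///     println!("{issue}");
/// }
/// ```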
pub async fn check_system_health_detailed(
    ns: &str,
    pool: Option<&PgPool>,
) -> Result<SystemHealthStatus> {
    let pods = match get_pods(ns).await {
        Ok(p) => p,
        Err(e) => {
            return Ok(SystemHealthStatus {
                status: HealthStatus::Error,
                issues: vec![format!("Failed to fetch pods: {}", e)],
            });
        }
    };
    let deployments = match get_deployments(ns).await {
        Ok(d) => d,
        Err(e) => {
            return Ok(SystemHealthStatus {
                status: HealthStatus::Error,
                issues: vec![format!("Failed to fetch deployments: {}", e)],
            });
        }
    };
    let events = match get_events(ns).await {
        Ok(e) => e,
        Err(e) => {
            return Ok(SystemHealthStatus {
                status: HealthStatus::Error,
                issues: vec![format!("Failed to fetch events: {}", e)],
            });
        }
    };
    let (pdbs, mut pdb_issues) = match get_pod_disruption_budgets(ns).await {
        Ok(pdbs) => (pdbs, Vec::new()),
        Err(e) => {
            warn!(
                namespace = ns,
                operation = "get_pod_disruption_budgets",
                error = %e,
                "Failed to fetch Pod Disruption Budgets"
            );
            (
                Vec::new(),
                vec![format!(
                    "Pod Disruption Budget check unavailable (namespace: {}, error: {})",
                    ns, e
                )],
            )
        }
    };
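
    // Bucket pods and deployments up front; every status decision below is
    // derived from these classifications.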
    let active_pods: Vec<_> = pods.iter().filter(|p| p.phase != "Succeeded").collect();
    let failed_pods: Vec<_> = pods.iter().filter(|p| p.phase == "Failed").collect();
    let crashed_pods: Vec<_> = pods
        .iter()
        .filter(|p| p.phase == "Running" && p.ready == Some(false))
        .collect();
    let pending_pods: Vec<_> = pods.iter().filter(|p| p.phase == "Pending").collect();

    let active_deployments: Vec<_> = deployments.iter().filter(|d| d.replicas > 0).collect();

    let critical_deployments: Vec<_> = active_deployments
        .iter()
        .filter(|d| d.ready_replicas == 0 && d.replicas > 0)
        .collect();
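
    // A deployment is degraded when it is missing ready replicas and either
    // its PDB has no disruptions left to spend or it has no PDB at all and
    // nothing is ready.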
    let degraded_deployments: Vec<_> = active_deployments
        .iter()
        .filter(|d| {
            if d.ready_replicas >= d.replicas {
                return false;
            }
            match is_deployment_covered_by_pdb(d, &pdbs) {
                Some(pdb) => pdb.disruptions_allowed <= 0,
                None => d.ready_replicas == 0,
            }
        })
        .collect();

    let recent_errors: Vec<_> = events
        .iter()
        .filter(|e| {
            e.type_.as_deref() == Some("Error") && is_recent_event(e) && is_critical_event(e)
        })
        .collect();

    let recent_warnings: Vec<_> = events
        .iter()
        .filter(|e| {
            e.type_.as_deref() == Some("Warning") && is_recent_event(e) && is_critical_event(e)
        })
        .collect();

    let mut status = HealthStatus::Healthy;
    let mut issues = Vec::new();
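
    // Optional database probe: acquire a connection from the pool and run a
    // trivial query.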
    if let Some(pool) = pool {
        match pool.acquire().await {
            Ok(mut conn) => {
                if let Err(e) = conn.execute("SELECT 1").await {
                    status = HealthStatus::Error;
                    issues.push(format!("Database connectivity check failed: {}", e));
                }
            }
            Err(e) => {
                status = HealthStatus::Error;
                issues.push(format!("Database connection pool check failed: {}", e));
            }
        }
    }
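
    // A failed PDB listing alone downgrades an otherwise healthy status to a
    // warning; the fetch error is surfaced alongside the other issues.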
    if !pdb_issues.is_empty() && status == HealthStatus::Healthy {
        status = HealthStatus::Warning;
    }
    issues.append(&mut pdb_issues);
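
    // Pod-level failures always escalate the overall status to an error.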
    if !failed_pods.is_empty() {
        status = HealthStatus::Error;
        issues.push(format!("{} failed pod(s)", failed_pods.len()));
    }

    if !crashed_pods.is_empty() {
        status = HealthStatus::Error;
        issues.push(format!("{} crashed pod(s)", crashed_pods.len()));
    }
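
    // Zero ready replicas is an error unless a PDB still allows disruptions
    // or the deployment looks like a rollout in progress (pods both pending
    // and running at once).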
    if !critical_deployments.is_empty() {
        let has_unprotected_critical =
            critical_deployments
                .iter()
                .any(|d| match is_deployment_covered_by_pdb(d, &pdbs) {
                    Some(pdb) if pdb.disruptions_allowed > 0 => false,
                    _ => {
                        let pending_count = count_deployment_pods_by_phase(&pods, d, "Pending");
                        let running_count = count_deployment_pods_by_phase(&pods, d, "Running");
                        !(pending_count > 0 && running_count > 0)
                    }
                });

        if has_unprotected_critical {
            status = HealthStatus::Error;
            issues.push(format!(
                "{} deployment(s) completely down",
                critical_deployments.len()
            ));
        }
    }
    if !recent_errors.is_empty() {
        status = HealthStatus::Error;
        issues.push(format!("{} recent error(s)", recent_errors.len()));
    }

    if status == HealthStatus::Error && !degraded_deployments.is_empty() {
        issues.push(format!(
            "{} deployment(s) degraded",
            degraded_deployments.len()
        ));
    }
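
    // Warning-level checks: these only run when nothing above escalated the
    // status to an error.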
    if status != HealthStatus::Error {
        let has_actual_failures = !failed_pods.is_empty() || !crashed_pods.is_empty();
        let has_only_pending_pods = !pending_pods.is_empty() && !has_actual_failures;

        if !degraded_deployments.is_empty() {
            status = HealthStatus::Warning;
            issues.push(format!(
                "{} deployment(s) degraded",
                degraded_deployments.len()
            ));
        }
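
        // The unhealthy criteria are identical to `degraded_deployments`
        // above, so reuse that list instead of re-filtering.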
        if !degraded_deployments.is_empty() && has_only_pending_pods {
            status = HealthStatus::Warning;
            issues.push(format!(
                "{} unhealthy deployment(s)",
                degraded_deployments.len()
            ));
        }
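
        // Tolerate a few pending pods: warn at one pending pod on small
        // clusters (three or fewer active pods), otherwise at 10% of active
        // pods. The threshold is clamped to at least 1 so that an empty
        // pending list never trips the check.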
        let pending_threshold = if active_pods.len() <= 3 {
            1
        } else {
            ((active_pods.len() as f64 * 0.1) as usize).max(1)
        };
        if pending_pods.len() >= pending_threshold {
            status = HealthStatus::Warning;
            issues.push(format!("{} pending pod(s)", pending_pods.len()));
        }

        if !recent_warnings.is_empty() {
            status = HealthStatus::Warning;
            issues.push(format!("{} recent warning(s)", recent_warnings.len()));
        }
    }

    Ok(SystemHealthStatus { status, issues })
}