headless_lms_server/domain/system_health/health_check.rs

//! System health checking logic.
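//!
//! Gathers pods, deployments, events and Pod Disruption Budgets for a
//! Kubernetes namespace, optionally probes database connectivity, and reduces
//! the findings to a [`SystemHealthStatus`] of healthy, warning or error.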
use super::kubernetes::{get_deployments, get_events, get_pod_disruption_budgets, get_pods};
use super::{
    DeploymentInfo, EventInfo, HealthStatus, PodDisruptionBudgetInfo, PodInfo, SystemHealthStatus,
};
use anyhow::Result;
use chrono::{DateTime, Duration, Utc};
use sqlx::{Executor, PgPool};
use tracing::warn;
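/// Decides whether a Kubernetes event should be treated as critical.
///
/// Routine lifecycle reasons (scheduled, pulled, created, started, killing)
/// are ignored; everything else is matched against failure-related keywords
/// in the event reason or message.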
pub fn is_critical_event(event: &EventInfo) -> bool {
    let reason = event.reason.as_deref().unwrap_or("").to_lowercase();
    let message = event.message.as_deref().unwrap_or("").to_lowercase();

    let ignored_reasons = ["scheduled", "pulled", "created", "started", "killing"];

    if ignored_reasons.iter().any(|r| reason.contains(r)) {
        return false;
    }

    let critical_reasons = [
        "failed",
        "backoff",
        "crashloop",
        "imagepullbackoff",
        "errimagepull",
        "invalid",
    ];

    critical_reasons
        .iter()
        .any(|r| reason.contains(r) || message.contains(r))
}
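/// Returns `true` when the event's last (or first) timestamp is within the
/// past hour. Timestamps that cannot be parsed are treated as not recent.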
pub fn is_recent_event(event: &EventInfo) -> bool {
    let one_hour_ago = Utc::now() - Duration::hours(1);

    // Timestamps are expected in the `DateTime<Utc>` display format, e.g.
    // "2024-01-01 12:00:00 UTC". Chrono's `%Z` specifier is formatting-only,
    // so map the " UTC" suffix to a numeric offset that `%z` can parse.
    let check_timestamp = |ts_str: &str| -> bool {
        if let Ok(parsed) =
            DateTime::parse_from_str(&ts_str.replace(" UTC", " +0000"), "%Y-%m-%d %H:%M:%S %z")
        {
            parsed.with_timezone(&Utc) > one_hour_ago
        } else {
            false
        }
    };

    if let Some(ts) = &event.last_timestamp {
        if check_timestamp(ts) {
            return true;
        }
    }

    if let Some(ts) = &event.first_timestamp {
        if check_timestamp(ts) {
            return true;
        }
    }

    false
}
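/// A pod belongs to a deployment when the deployment has selector labels and
/// every one of them is present on the pod with the same value.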
fn pod_matches_deployment(pod: &PodInfo, deployment: &DeploymentInfo) -> bool {
    if deployment.selector_labels.is_empty() {
        return false;
    }
    deployment
        .selector_labels
        .iter()
        .all(|(k, v)| pod.labels.get(k) == Some(v))
}

fn count_deployment_pods_by_phase(
    pods: &[PodInfo],
    deployment: &DeploymentInfo,
    phase: &str,
) -> usize {
    pods.iter()
        .filter(|p| p.phase == phase && pod_matches_deployment(p, deployment))
        .count()
}
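/// Finds a Pod Disruption Budget whose (non-empty) selector labels are a
/// subset of the deployment's selector labels.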
fn is_deployment_covered_by_pdb<'a>(
    deployment: &DeploymentInfo,
    pdbs: &'a [PodDisruptionBudgetInfo],
) -> Option<&'a PodDisruptionBudgetInfo> {
    pdbs.iter().find(|pdb| {
        if pdb.selector_labels.is_empty() {
            return false;
        }
        pdb.selector_labels
            .iter()
            .all(|(k, v)| deployment.selector_labels.get(k) == Some(v))
    })
}
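/// Convenience wrapper around [`check_system_health_detailed`] that only
/// reports whether the system is fully healthy.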
pub async fn check_system_health(ns: &str, pool: Option<&PgPool>) -> Result<bool> {
    let health = check_system_health_detailed(ns, pool).await?;
    Ok(health.status == HealthStatus::Healthy)
}
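/// Inspects the Kubernetes namespace `ns` (and, when a pool is given, the
/// database) and returns an overall status together with a list of issues.
///
/// A minimal usage sketch, assuming an async caller; the `"default"`
/// namespace is only illustrative:
///
/// ```ignore
/// let health = check_system_health_detailed("default", None).await?;
/// if health.status != HealthStatus::Healthy {
///     for issue in &health.issues {
///         tracing::warn!("system health issue: {}", issue);
///     }
/// }
/// ```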
pub async fn check_system_health_detailed(
    ns: &str,
    pool: Option<&PgPool>,
) -> Result<SystemHealthStatus> {
    let pods = match get_pods(ns).await {
        Ok(p) => p,
        Err(e) => {
            return Ok(SystemHealthStatus {
                status: HealthStatus::Error,
                issues: vec![format!("Failed to fetch pods: {}", e)],
            });
        }
    };
    let deployments = match get_deployments(ns).await {
        Ok(d) => d,
        Err(e) => {
            return Ok(SystemHealthStatus {
                status: HealthStatus::Error,
                issues: vec![format!("Failed to fetch deployments: {}", e)],
            });
        }
    };
    let events = match get_events(ns).await {
        Ok(e) => e,
        Err(e) => {
            return Ok(SystemHealthStatus {
                status: HealthStatus::Error,
                issues: vec![format!("Failed to fetch events: {}", e)],
            });
        }
    };
    let (pdbs, mut pdb_issues) = match get_pod_disruption_budgets(ns).await {
        Ok(pdbs) => (pdbs, Vec::new()),
        Err(e) => {
            warn!(
                namespace = ns,
                operation = "get_pod_disruption_budgets",
                error = %e,
                "Failed to fetch Pod Disruption Budgets"
            );
            (
                Vec::new(),
                vec![format!(
                    "Pod Disruption Budget check unavailable (namespace: {}, error: {})",
                    ns, e
                )],
            )
        }
    };
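    // Classify pods, deployments and recent events before deciding severity.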
    let active_pods: Vec<_> = pods.iter().filter(|p| p.phase != "Succeeded").collect();
    let failed_pods: Vec<_> = pods.iter().filter(|p| p.phase == "Failed").collect();
    let crashed_pods: Vec<_> = pods
        .iter()
        .filter(|p| p.phase == "Running" && p.ready == Some(false))
        .collect();
    let pending_pods: Vec<_> = pods.iter().filter(|p| p.phase == "Pending").collect();

    let active_deployments: Vec<_> = deployments.iter().filter(|d| d.replicas > 0).collect();

    let critical_deployments: Vec<_> = active_deployments
        .iter()
        .filter(|d| d.ready_replicas == 0 && d.replicas > 0)
        .collect();

    let degraded_deployments: Vec<_> = active_deployments
        .iter()
        .filter(|d| {
            if d.ready_replicas >= d.replicas {
                return false;
            }
            match is_deployment_covered_by_pdb(d, &pdbs) {
                Some(pdb) => pdb.disruptions_allowed <= 0 && d.ready_replicas < d.replicas,
                None => d.ready_replicas == 0,
            }
        })
        .collect();

    let recent_errors: Vec<_> = events
        .iter()
        .filter(|e| {
            e.type_.as_deref() == Some("Error") && is_recent_event(e) && is_critical_event(e)
        })
        .collect();

    let recent_warnings: Vec<_> = events
        .iter()
        .filter(|e| {
            e.type_.as_deref() == Some("Warning") && is_recent_event(e) && is_critical_event(e)
        })
        .collect();

    let mut status = HealthStatus::Healthy;
    let mut issues = Vec::new();
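    // Optional database probe: acquire a connection from the pool and run a
    // trivial query to confirm connectivity.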
    if let Some(pool) = pool {
        match pool.acquire().await {
            Ok(mut conn) => {
                if let Err(e) = conn.execute("SELECT 1").await {
                    status = HealthStatus::Error;
                    issues.push(format!("Database connectivity check failed: {}", e));
                }
            }
            Err(e) => {
                status = HealthStatus::Error;
                issues.push(format!("Database connection pool check failed: {}", e));
            }
        }
    }

    if !pdb_issues.is_empty() && status == HealthStatus::Healthy {
        status = HealthStatus::Warning;
    }
    issues.append(&mut pdb_issues);

    if !failed_pods.is_empty() {
        status = HealthStatus::Error;
        issues.push(format!("{} failed pod(s)", failed_pods.len()));
    }

    if !crashed_pods.is_empty() {
        status = HealthStatus::Error;
        issues.push(format!("{} crashed pod(s)", crashed_pods.len()));
    }
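    // A fully-down deployment is an error unless it is covered by a PDB that
    // still allows disruptions, or it looks mid-rollout (some of its pods
    // Pending while others are Running).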
    if !critical_deployments.is_empty() {
        let has_unprotected_critical =
            critical_deployments
                .iter()
                .any(|d| match is_deployment_covered_by_pdb(d, &pdbs) {
                    Some(pdb) if pdb.disruptions_allowed > 0 => false,
                    _ => {
                        let pending_count = count_deployment_pods_by_phase(&pods, d, "Pending");
                        let running_count = count_deployment_pods_by_phase(&pods, d, "Running");
                        !(pending_count > 0 && running_count > 0)
                    }
                });

        if has_unprotected_critical {
            status = HealthStatus::Error;
            issues.push(format!(
                "{} deployment(s) completely down",
                critical_deployments.len()
            ));
        }
    }

    if !recent_errors.is_empty() {
        status = HealthStatus::Error;
        issues.push(format!("{} recent error(s)", recent_errors.len()));
    }

    if !degraded_deployments.is_empty() && status == HealthStatus::Error {
        issues.push(format!(
            "{} deployment(s) degraded",
            degraded_deployments.len()
        ));
    }
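    // Warning-level checks only run when nothing above escalated to an error.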
    if status != HealthStatus::Error {
        let has_actual_failures = !failed_pods.is_empty() || !crashed_pods.is_empty();
        let has_only_pending_pods = !pending_pods.is_empty() && !has_actual_failures;

        if !degraded_deployments.is_empty() {
            status = HealthStatus::Warning;
            issues.push(format!(
                "{} deployment(s) degraded",
                degraded_deployments.len()
            ));
        }

        // The "unhealthy" set uses the same criterion as `degraded_deployments`,
        // so reuse it; this only adds a separate issue entry when every problem
        // pod is merely Pending.
        if !degraded_deployments.is_empty() && has_only_pending_pods {
            status = HealthStatus::Warning;
            issues.push(format!(
                "{} unhealthy deployment(s)",
                degraded_deployments.len()
            ));
        }

        // Warn when at least 10% of active pods (and never fewer than one)
        // are stuck in Pending.
        let pending_threshold = if active_pods.len() <= 3 {
            1
        } else {
            ((active_pods.len() as f64 * 0.1) as usize).max(1)
        };
        if pending_pods.len() >= pending_threshold {
            status = HealthStatus::Warning;
            issues.push(format!("{} pending pod(s)", pending_pods.len()));
        }

        if !recent_warnings.is_empty() {
            status = HealthStatus::Warning;
            issues.push(format!("{} recent warning(s)", recent_warnings.len()));
        }
    }

    Ok(SystemHealthStatus { status, issues })
}