Skip to main content

rbnx/cmd/
teardown.rs

1// SPDX-License-Identifier: MulanPSL-2.0
2// Shared boot-state persistence + tear-down logic for `rbnx boot` and
3// `rbnx shutdown`. Boot writes `state.json`; shutdown reads it.
4//
5// Scope: host-side process groups only. Each child boot spawns is given
6// its own PGID via `process_group(0)`, so a single `kill -TERM -<PGID>`
7// takes down the wrapper plus everything it fork+exec'd. CLI does NOT
8// reach into container runtimes, network proxies, etc — that's the
9// individual package's job. The convention (already followed by the
10// webots nav2 start.sh) is for any package whose start body spawns
11// out-of-process children — `docker exec`, `kubectl exec`, ssh,
12// systemd-run, … — to install a `trap` that cleans them up on EXIT/
// INT/TERM. With that trap installed, our SIGTERM-to-the-PGID arrives
// at the wrapper script, the trap runs, the side-channel children die,
// and then the wrapper exits.
16//
17// TODO: add a top-level `stop:` field to package_manifest.yaml — a
18// shell snippet rbnx runs *before* SIGTERMing the PGID. Right now we
19// rely entirely on each package's `trap` discipline; an explicit stop
20// hook makes the cleanup contract part of the manifest schema and is
21// easier to write correctly than nested traps. (Mirror what `build:`
22// and `start:` already are: just a string body executed in the package
23// root.)
24//
25// Boot also doesn't currently kill its children on the `?`-error path,
// leaking atlas/pilot/executor as orphans. Persisted state + a separate
// `rbnx shutdown` command let the user (or boot's own error path)
// always reach the same teardown helper.
29
30use anyhow::{Context, Result};
31use serde::{Deserialize, Serialize};
32use std::path::{Path, PathBuf};
33use std::time::Duration;
34
/// Snapshot of one `rbnx boot` run, serialized to `state.json` so that
/// `rbnx shutdown` can later read it back and tear down what boot started.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BootState {
    /// Path of the manifest this boot belongs to.
    /// NOTE(review): presumably the manifest passed to `rbnx boot` — confirm
    /// against the code that fills this in.
    pub manifest_path: String,
    /// NOTE(review): presumably the PID of the `rbnx boot` process itself —
    /// confirm against the writer.
    pub boot_pid: u32,
    /// Boot start time in milliseconds (per the `_ms` suffix).
    /// NOTE(review): the reference clock is not visible here — presumably
    /// Unix epoch time; confirm.
    pub started_at_ms: u64,
    /// NOTE(review): presumably the address at which atlas can be reached —
    /// format is not visible in this file; confirm.
    pub atlas_endpoint: String,
    /// Records for the child processes boot spawned; this is the element
    /// type `teardown` consumes.
    pub components: Vec<ComponentRecord>,
}
43
/// Identity of one child process started by boot, as persisted in
/// `state.json` and later handed to `teardown`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentRecord {
    /// Component name; `teardown` uses it to label its `[shutdown]` log lines.
    pub name: String,
    /// "system_builtin" | "system_package" | "primitive" | "service"
    pub kind: String,
    /// PID recorded for the child.
    /// NOTE(review): presumably the PID of the directly spawned wrapper
    /// process — confirm against the code that builds this record.
    pub pid: u32,
    /// Process group id. Boot starts each child with `process_group(0)`
    /// so PGID == PID at spawn time; tear-down sends SIGTERM/SIGKILL to
    /// `-PGID` to take down the wrapper plus every direct fork+exec
    /// descendant. Side-channel children (`docker exec`, ssh, …) must
    /// be cleaned up by the package's own start-body trap — see this
    /// module's header comment.
    pub pgid: u32,
}
58
/// Returns the location of the boot-state file for a manifest directory:
/// `<manifest_dir>/rbnx-boot/state.json`.
///
/// Pure path arithmetic — nothing is created or checked on disk.
pub fn state_path(manifest_dir: &Path) -> PathBuf {
    let mut location = manifest_dir.to_path_buf();
    location.push("rbnx-boot");
    location.push("state.json");
    location
}
62
63pub fn write_state(path: &Path, state: &BootState) -> Result<()> {
64    if let Some(parent) = path.parent() {
65        std::fs::create_dir_all(parent).with_context(|| format!("create {}", parent.display()))?;
66    }
67    let text = serde_json::to_string_pretty(state)?;
68    std::fs::write(path, text).with_context(|| format!("write {}", path.display()))?;
69    Ok(())
70}
71
72pub fn read_state(path: &Path) -> Result<BootState> {
73    let raw = std::fs::read_to_string(path).with_context(|| format!("read {}", path.display()))?;
74    let state: BootState =
75        serde_json::from_str(&raw).with_context(|| format!("parse {}", path.display()))?;
76    Ok(state)
77}
78
79/// SIGTERM each PGID, wait for graceful exit, SIGKILL stragglers.
80/// Idempotent: missing PGIDs are ok.
81pub async fn teardown(components: &[ComponentRecord]) {
82    use nix::sys::signal::{Signal, killpg};
83    use nix::unistd::Pid;
84
85    // Reverse order so primitives/services die before pilot/atlas — keeps
86    // log noise about "atlas unreachable" from drowning out the real
87    // shutdown trace.
88    let ordered: Vec<&ComponentRecord> = components.iter().rev().collect();
89
90    for c in &ordered {
91        let pgid = Pid::from_raw(c.pgid as i32);
92        match killpg(pgid, Signal::SIGTERM) {
93            Ok(()) => robonix_cli::output::sub_step(&format!(
94                "[shutdown] {} TERM (pgid={})",
95                c.name, c.pgid
96            )),
97            Err(nix::errno::Errno::ESRCH) => { /* already dead */ }
98            Err(e) => {
99                robonix_cli::output::sub_step(&format!("[shutdown] {} TERM failed: {e}", c.name))
100            }
101        }
102    }
103
104    tokio::time::sleep(Duration::from_secs(3)).await;
105
106    for c in &ordered {
107        let pgid = Pid::from_raw(c.pgid as i32);
108        // SIGKILL is best-effort; ESRCH means already gone.
109        let _ = killpg(pgid, Signal::SIGKILL);
110    }
111}