rbnx/cmd/teardown.rs
1// SPDX-License-Identifier: MulanPSL-2.0
2// Shared boot-state persistence + tear-down logic for `rbnx boot` and
3// `rbnx shutdown`. Boot writes `state.json`; shutdown reads it.
4//
5// Scope: host-side process groups only. Each child boot spawns is given
6// its own PGID via `process_group(0)`, so a single `kill -TERM -<PGID>`
7// takes down the wrapper plus everything it fork+exec'd. CLI does NOT
8// reach into container runtimes, network proxies, etc — that's the
9// individual package's job. The convention (already followed by the
10// webots nav2 start.sh) is for any package whose start body spawns
11// out-of-process children — `docker exec`, `kubectl exec`, ssh,
12// systemd-run, … — to install a `trap` that cleans them up on EXIT/
// INT/TERM. With that trap in place, our SIGTERM to the PGID reaches
// the wrapper script, the trap runs, the side-channel children die,
// and then the wrapper exits.
16//
17// TODO: add a top-level `stop:` field to package_manifest.yaml — a
18// shell snippet rbnx runs *before* SIGTERMing the PGID. Right now we
19// rely entirely on each package's `trap` discipline; an explicit stop
20// hook makes the cleanup contract part of the manifest schema and is
21// easier to write correctly than nested traps. (Mirror what `build:`
22// and `start:` already are: just a string body executed in the package
23// root.)
24//
25// Boot also doesn't currently kill its children on the `?`-error path,
// leaking atlas/pilot/executor as orphans. Persisted state plus a
// separate `rbnx shutdown` command let the user (or boot's own error
// path) always reach the same teardown helper.
29
30use anyhow::{Context, Result};
31use serde::{Deserialize, Serialize};
32use std::path::{Path, PathBuf};
33use std::time::Duration;
34
35#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct BootState {
37 pub manifest_path: String,
38 pub boot_pid: u32,
39 pub started_at_ms: u64,
40 pub atlas_endpoint: String,
41 pub components: Vec<ComponentRecord>,
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
45pub struct ComponentRecord {
46 pub name: String,
47 /// "system_builtin" | "system_package" | "primitive" | "service"
48 pub kind: String,
49 pub pid: u32,
50 /// Process group id. Boot starts each child with `process_group(0)`
51 /// so PGID == PID at spawn time; tear-down sends SIGTERM/SIGKILL to
52 /// `-PGID` to take down the wrapper plus every direct fork+exec
53 /// descendant. Side-channel children (`docker exec`, ssh, …) must
54 /// be cleaned up by the package's own start-body trap — see this
55 /// module's header comment.
56 pub pgid: u32,
57}
58
/// Location of the persisted boot state for a given manifest directory:
/// `<manifest_dir>/rbnx-boot/state.json`.
pub fn state_path(manifest_dir: &Path) -> PathBuf {
    let mut location = manifest_dir.to_path_buf();
    location.push("rbnx-boot");
    location.push("state.json");
    location
}
62
63pub fn write_state(path: &Path, state: &BootState) -> Result<()> {
64 if let Some(parent) = path.parent() {
65 std::fs::create_dir_all(parent).with_context(|| format!("create {}", parent.display()))?;
66 }
67 let text = serde_json::to_string_pretty(state)?;
68 std::fs::write(path, text).with_context(|| format!("write {}", path.display()))?;
69 Ok(())
70}
71
72pub fn read_state(path: &Path) -> Result<BootState> {
73 let raw = std::fs::read_to_string(path).with_context(|| format!("read {}", path.display()))?;
74 let state: BootState =
75 serde_json::from_str(&raw).with_context(|| format!("parse {}", path.display()))?;
76 Ok(state)
77}
78
79/// SIGTERM each PGID, wait for graceful exit, SIGKILL stragglers.
80/// Idempotent: missing PGIDs are ok.
81pub async fn teardown(components: &[ComponentRecord]) {
82 use nix::sys::signal::{Signal, killpg};
83 use nix::unistd::Pid;
84
85 // Reverse order so primitives/services die before pilot/atlas — keeps
86 // log noise about "atlas unreachable" from drowning out the real
87 // shutdown trace.
88 let ordered: Vec<&ComponentRecord> = components.iter().rev().collect();
89
90 for c in &ordered {
91 let pgid = Pid::from_raw(c.pgid as i32);
92 match killpg(pgid, Signal::SIGTERM) {
93 Ok(()) => robonix_cli::output::sub_step(&format!(
94 "[shutdown] {} TERM (pgid={})",
95 c.name, c.pgid
96 )),
97 Err(nix::errno::Errno::ESRCH) => { /* already dead */ }
98 Err(e) => {
99 robonix_cli::output::sub_step(&format!("[shutdown] {} TERM failed: {e}", c.name))
100 }
101 }
102 }
103
104 tokio::time::sleep(Duration::from_secs(3)).await;
105
106 for c in &ordered {
107 let pgid = Pid::from_raw(c.pgid as i32);
108 // SIGKILL is best-effort; ESRCH means already gone.
109 let _ = killpg(pgid, Signal::SIGKILL);
110 }
111}