From fe30ecc81e4667ad37e4eb185abb528f5652fb69 Mon Sep 17 00:00:00 2001 From: rcourtman Date: Tue, 5 May 2026 09:13:03 +0100 Subject: [PATCH] Fix TrueNAS CORE agent supervisor restart Refs #1457 --- .../v6/internal/subsystems/agent-lifecycle.md | 5 + .../subsystems/deployment-installability.md | 5 + scripts/install.sh | 131 ++++++++++++++++-- scripts/installtests/install_sh_test.go | 28 ++++ 4 files changed, 159 insertions(+), 10 deletions(-) diff --git a/docs/release-control/v6/internal/subsystems/agent-lifecycle.md b/docs/release-control/v6/internal/subsystems/agent-lifecycle.md index 9a13626fd..6b09036ac 100644 --- a/docs/release-control/v6/internal/subsystems/agent-lifecycle.md +++ b/docs/release-control/v6/internal/subsystems/agent-lifecycle.md @@ -765,6 +765,11 @@ profile and assignment columns, but embedded table framing must route through ## Current State +Generated TrueNAS CORE rc.d service scripts must give `/usr/sbin/daemon -r` a +supervisor pidfile with `-P`, keep the child pid in a separate diagnostic +pidfile, and stop legacy child-pidfile installs by resolving the child back to +its daemon supervisor before replacing or restarting the agent binary. + Deploy selection and retry capacity feedback now follows the API/cloud-paid workspace-capacity terminology boundary. Lifecycle-owned deploy surfaces keep stable backend compatibility identifiers, but the user-facing confirmation, diff --git a/docs/release-control/v6/internal/subsystems/deployment-installability.md b/docs/release-control/v6/internal/subsystems/deployment-installability.md index 8f6518a67..fbd3b0a0c 100644 --- a/docs/release-control/v6/internal/subsystems/deployment-installability.md +++ b/docs/release-control/v6/internal/subsystems/deployment-installability.md @@ -334,6 +334,11 @@ server-side update execution surfaces. ## Current State +Generated TrueNAS CORE rc.d services must use `/usr/sbin/daemon -r` with a +supervisor pidfile (`-P`) separate from the child pidfile (`-p`) and must stop +older child-pidfile installs by killing the daemon supervisor before the child +so installer upgrades do not leave the old agent process running. + This subsystem now makes deployment planning, updater orchestration, and the non-shell installer/update scripts explicit inside the current self-hosted release-confidence lane instead of leaving them as implied behavior around the diff --git a/scripts/install.sh b/scripts/install.sh index d5a8c8cf7..e558936f1 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -692,6 +692,7 @@ render_freebsd_rc_agent_script() { name="pulse_agent" rcvar="pulse_agent_enable" pidfile="/var/run/\${name}.pid" +child_pidfile="/var/run/\${name}.child.pid" command="${exec_path}" command_args="${exec_args}" @@ -700,34 +701,144 @@ start_cmd="\${name}_start" stop_cmd="\${name}_stop" status_cmd="\${name}_status" +pulse_agent_pid_command() +{ + ps -o command= -p "\$1" 2>/dev/null | sed 's/^[[:space:]]*//' +} + +pulse_agent_supervisor_pid() +{ + agent_pid="\$1" + agent_command=\$(pulse_agent_pid_command "\${agent_pid}") + case "\${agent_command}" in + daemon:*) + echo "\${agent_pid}" + return 0 + ;; + esac + + parent_pid=\$(ps -o ppid= -p "\${agent_pid}" 2>/dev/null | tr -d '[:space:]') + if [ -z "\${parent_pid}" ] || [ "\${parent_pid}" = "1" ]; then + return 1 + fi + + parent_command=\$(pulse_agent_pid_command "\${parent_pid}") + case "\${parent_command}" in + daemon:*) + echo "\${parent_pid}" + return 0 + ;; + esac + + return 1 +} + pulse_agent_start() { if checkyesno \${rcvar}; then + if [ -f \${pidfile} ]; then + existing_pid=\$(cat \${pidfile} 2>/dev/null) + if [ -n "\${existing_pid}" ] && kill -0 "\${existing_pid}" 2>/dev/null; then + echo "\${name} is already running as pid \${existing_pid}." + return 0 + fi + fi + + rm -f \${pidfile} \${child_pidfile} echo "Starting \${name}." ${ssl_cert_line} - /usr/sbin/daemon -r -p \${pidfile} -f \${command} \${command_args} + /usr/sbin/daemon -r -P \${pidfile} -p \${child_pidfile} -f "\${command}" \${command_args} fi } pulse_agent_stop() { + supervisor_pid="" + child_pid="" + stopped=0 + + if [ -f \${child_pidfile} ]; then + child_pid=\$(cat \${child_pidfile} 2>/dev/null) + fi + if [ -f \${pidfile} ]; then - echo "Stopping \${name}." - kill \$(cat \${pidfile}) 2>/dev/null - rm -f \${pidfile} - else + primary_pid=\$(cat \${pidfile} 2>/dev/null) + if [ -n "\${primary_pid}" ] && kill -0 "\${primary_pid}" 2>/dev/null; then + detected_supervisor=\$(pulse_agent_supervisor_pid "\${primary_pid}" 2>/dev/null || true) + if [ -n "\${detected_supervisor}" ]; then + supervisor_pid="\${detected_supervisor}" + if [ "\${detected_supervisor}" != "\${primary_pid}" ] && [ -z "\${child_pid}" ]; then + child_pid="\${primary_pid}" + fi + else + supervisor_pid="\${primary_pid}" + fi + fi + fi + + if [ -n "\${supervisor_pid}" ] && kill -0 "\${supervisor_pid}" 2>/dev/null; then + echo "Stopping \${name} supervisor." + kill "\${supervisor_pid}" 2>/dev/null || true + sleep 1 + if kill -0 "\${supervisor_pid}" 2>/dev/null; then + kill -KILL "\${supervisor_pid}" 2>/dev/null || true + fi + stopped=1 + fi + + if [ -n "\${child_pid}" ] && kill -0 "\${child_pid}" 2>/dev/null; then + echo "Stopping \${name} child." + kill "\${child_pid}" 2>/dev/null || true + sleep 1 + if kill -0 "\${child_pid}" 2>/dev/null; then + kill -KILL "\${child_pid}" 2>/dev/null || true + fi + stopped=1 + fi + + rm -f \${pidfile} \${child_pidfile} + + if [ "\${stopped}" -eq 0 ]; then echo "\${name} is not running." fi } pulse_agent_status() { - if [ -f \${pidfile} ] && kill -0 \$(cat \${pidfile}) 2>/dev/null; then - echo "\${name} is running as pid \$(cat \${pidfile})." - else - echo "\${name} is not running." - return 1 + if [ -f \${pidfile} ]; then + primary_pid=\$(cat \${pidfile} 2>/dev/null) + if [ -n "\${primary_pid}" ] && kill -0 "\${primary_pid}" 2>/dev/null; then + child_status="" + if [ -f \${child_pidfile} ]; then + child_pid=\$(cat \${child_pidfile} 2>/dev/null) + if [ -n "\${child_pid}" ] && kill -0 "\${child_pid}" 2>/dev/null; then + child_status=" with child pid \${child_pid}" + fi + fi + echo "\${name} is running as supervisor pid \${primary_pid}\${child_status}." + return 0 + fi fi + + if [ -f \${child_pidfile} ]; then + child_pid=\$(cat \${child_pidfile} 2>/dev/null) + if [ -n "\${child_pid}" ] && kill -0 "\${child_pid}" 2>/dev/null; then + echo "\${name} is running as child pid \${child_pid}." + return 0 + fi + fi + + if [ -f \${pidfile} ]; then + legacy_pid=\$(cat \${pidfile} 2>/dev/null) + legacy_supervisor=\$(pulse_agent_supervisor_pid "\${legacy_pid}" 2>/dev/null || true) + if [ -n "\${legacy_supervisor}" ] && kill -0 "\${legacy_supervisor}" 2>/dev/null; then + echo "\${name} is running as legacy child pid \${legacy_pid} supervised by pid \${legacy_supervisor}." + return 0 + fi + fi + + echo "\${name} is not running." + return 1 } load_rc_config \$name diff --git a/scripts/installtests/install_sh_test.go b/scripts/installtests/install_sh_test.go index 7b6049517..2740ecd00 100644 --- a/scripts/installtests/install_sh_test.go +++ b/scripts/installtests/install_sh_test.go @@ -519,6 +519,34 @@ func TestInstallSHUsesSharedServiceRenderers(t *testing.T) { } } +func TestInstallSHFreeBSDRendererUsesDaemonSupervisorPidfile(t *testing.T) { + content, err := os.ReadFile(repoFile("scripts", "install.sh")) + if err != nil { + t.Fatalf("read install.sh: %v", err) + } + + script := string(content) + required := []string{ + `child_pidfile="/var/run/\${name}.child.pid"`, + `pulse_agent_supervisor_pid()`, + `parent_pid=\$(ps -o ppid= -p "\${agent_pid}" 2>/dev/null | tr -d '[:space:]')`, + `daemon:*)`, + `/usr/sbin/daemon -r -P \${pidfile} -p \${child_pidfile} -f "\${command}" \${command_args}`, + `kill -KILL "\${supervisor_pid}" 2>/dev/null || true`, + `rm -f \${pidfile} \${child_pidfile}`, + `legacy child pid \${legacy_pid} supervised by pid \${legacy_supervisor}`, + } + for _, needle := range required { + if !strings.Contains(script, needle) { + t.Fatalf("install.sh missing FreeBSD daemon supervisor contract: %s", needle) + } + } + + if strings.Contains(script, `/usr/sbin/daemon -r -p \${pidfile}`) { + t.Fatal("install.sh still writes the child pid to the service pidfile under daemon -r") + } +} + func TestInstallSHUsesCanonicalCompletionHelper(t *testing.T) { content, err := os.ReadFile(repoFile("scripts", "install.sh")) if err != nil {