Fix TrueNAS CORE agent supervisor restart

Refs #1457
This commit is contained in:
rcourtman 2026-05-05 09:13:03 +01:00
parent 04fb02defc
commit fe30ecc81e
4 changed files with 159 additions and 10 deletions

View file

@ -765,6 +765,11 @@ profile and assignment columns, but embedded table framing must route through
## Current State
Generated TrueNAS CORE rc.d service scripts must give `/usr/sbin/daemon -r` a
supervisor pidfile with `-P`, keep the child pid in a separate diagnostic
pidfile, and stop legacy child-pidfile installs by resolving the child back to
its daemon supervisor before replacing or restarting the agent binary.
Deploy selection and retry capacity feedback now follows the API/cloud-paid
workspace-capacity terminology boundary. Lifecycle-owned deploy surfaces keep
stable backend compatibility identifiers, but the user-facing confirmation,

View file

@ -334,6 +334,11 @@ server-side update execution surfaces.
## Current State
Generated TrueNAS CORE rc.d services must use `/usr/sbin/daemon -r` with a
supervisor pidfile (`-P`) separate from the child pidfile (`-p`) and must stop
older child-pidfile installs by killing the daemon supervisor before the child
so installer upgrades do not leave the old agent process running.
This subsystem now makes deployment planning, updater orchestration, and the
non-shell installer/update scripts explicit inside the current self-hosted
release-confidence lane instead of leaving them as implied behavior around the

View file

@ -692,6 +692,7 @@ render_freebsd_rc_agent_script() {
name="pulse_agent"
rcvar="pulse_agent_enable"
pidfile="/var/run/\${name}.pid"
child_pidfile="/var/run/\${name}.child.pid"
command="${exec_path}"
command_args="${exec_args}"
@ -700,34 +701,144 @@ start_cmd="\${name}_start"
stop_cmd="\${name}_stop"
status_cmd="\${name}_status"
pulse_agent_pid_command()
{
ps -o command= -p "\$1" 2>/dev/null | sed 's/^[[:space:]]*//'
}
pulse_agent_supervisor_pid()
{
agent_pid="\$1"
agent_command=\$(pulse_agent_pid_command "\${agent_pid}")
case "\${agent_command}" in
daemon:*)
echo "\${agent_pid}"
return 0
;;
esac
parent_pid=\$(ps -o ppid= -p "\${agent_pid}" 2>/dev/null | tr -d '[:space:]')
if [ -z "\${parent_pid}" ] || [ "\${parent_pid}" = "1" ]; then
return 1
fi
parent_command=\$(pulse_agent_pid_command "\${parent_pid}")
case "\${parent_command}" in
daemon:*)
echo "\${parent_pid}"
return 0
;;
esac
return 1
}
pulse_agent_start()
{
if checkyesno \${rcvar}; then
if [ -f \${pidfile} ]; then
existing_pid=\$(cat \${pidfile} 2>/dev/null)
if [ -n "\${existing_pid}" ] && kill -0 "\${existing_pid}" 2>/dev/null; then
echo "\${name} is already running as pid \${existing_pid}."
return 0
fi
fi
rm -f \${pidfile} \${child_pidfile}
echo "Starting \${name}."
${ssl_cert_line}
/usr/sbin/daemon -r -p \${pidfile} -f \${command} \${command_args}
/usr/sbin/daemon -r -P \${pidfile} -p \${child_pidfile} -f "\${command}" \${command_args}
fi
}
pulse_agent_stop()
{
supervisor_pid=""
child_pid=""
stopped=0
if [ -f \${child_pidfile} ]; then
child_pid=\$(cat \${child_pidfile} 2>/dev/null)
fi
if [ -f \${pidfile} ]; then
echo "Stopping \${name}."
kill \$(cat \${pidfile}) 2>/dev/null
rm -f \${pidfile}
else
primary_pid=\$(cat \${pidfile} 2>/dev/null)
if [ -n "\${primary_pid}" ] && kill -0 "\${primary_pid}" 2>/dev/null; then
detected_supervisor=\$(pulse_agent_supervisor_pid "\${primary_pid}" 2>/dev/null || true)
if [ -n "\${detected_supervisor}" ]; then
supervisor_pid="\${detected_supervisor}"
if [ "\${detected_supervisor}" != "\${primary_pid}" ] && [ -z "\${child_pid}" ]; then
child_pid="\${primary_pid}"
fi
else
supervisor_pid="\${primary_pid}"
fi
fi
fi
if [ -n "\${supervisor_pid}" ] && kill -0 "\${supervisor_pid}" 2>/dev/null; then
echo "Stopping \${name} supervisor."
kill "\${supervisor_pid}" 2>/dev/null || true
sleep 1
if kill -0 "\${supervisor_pid}" 2>/dev/null; then
kill -KILL "\${supervisor_pid}" 2>/dev/null || true
fi
stopped=1
fi
if [ -n "\${child_pid}" ] && kill -0 "\${child_pid}" 2>/dev/null; then
echo "Stopping \${name} child."
kill "\${child_pid}" 2>/dev/null || true
sleep 1
if kill -0 "\${child_pid}" 2>/dev/null; then
kill -KILL "\${child_pid}" 2>/dev/null || true
fi
stopped=1
fi
rm -f \${pidfile} \${child_pidfile}
if [ "\${stopped}" -eq 0 ]; then
echo "\${name} is not running."
fi
}
pulse_agent_status()
{
if [ -f \${pidfile} ] && kill -0 \$(cat \${pidfile}) 2>/dev/null; then
echo "\${name} is running as pid \$(cat \${pidfile})."
else
echo "\${name} is not running."
return 1
if [ -f \${pidfile} ]; then
primary_pid=\$(cat \${pidfile} 2>/dev/null)
if [ -n "\${primary_pid}" ] && kill -0 "\${primary_pid}" 2>/dev/null; then
child_status=""
if [ -f \${child_pidfile} ]; then
child_pid=\$(cat \${child_pidfile} 2>/dev/null)
if [ -n "\${child_pid}" ] && kill -0 "\${child_pid}" 2>/dev/null; then
child_status=" with child pid \${child_pid}"
fi
fi
echo "\${name} is running as supervisor pid \${primary_pid}\${child_status}."
return 0
fi
fi
if [ -f \${child_pidfile} ]; then
child_pid=\$(cat \${child_pidfile} 2>/dev/null)
if [ -n "\${child_pid}" ] && kill -0 "\${child_pid}" 2>/dev/null; then
echo "\${name} is running as child pid \${child_pid}."
return 0
fi
fi
if [ -f \${pidfile} ]; then
legacy_pid=\$(cat \${pidfile} 2>/dev/null)
legacy_supervisor=\$(pulse_agent_supervisor_pid "\${legacy_pid}" 2>/dev/null || true)
if [ -n "\${legacy_supervisor}" ] && kill -0 "\${legacy_supervisor}" 2>/dev/null; then
echo "\${name} is running as legacy child pid \${legacy_pid} supervised by pid \${legacy_supervisor}."
return 0
fi
fi
echo "\${name} is not running."
return 1
}
load_rc_config \$name

View file

@ -519,6 +519,34 @@ func TestInstallSHUsesSharedServiceRenderers(t *testing.T) {
}
}
func TestInstallSHFreeBSDRendererUsesDaemonSupervisorPidfile(t *testing.T) {
content, err := os.ReadFile(repoFile("scripts", "install.sh"))
if err != nil {
t.Fatalf("read install.sh: %v", err)
}
script := string(content)
required := []string{
`child_pidfile="/var/run/\${name}.child.pid"`,
`pulse_agent_supervisor_pid()`,
`parent_pid=\$(ps -o ppid= -p "\${agent_pid}" 2>/dev/null | tr -d '[:space:]')`,
`daemon:*)`,
`/usr/sbin/daemon -r -P \${pidfile} -p \${child_pidfile} -f "\${command}" \${command_args}`,
`kill -KILL "\${supervisor_pid}" 2>/dev/null || true`,
`rm -f \${pidfile} \${child_pidfile}`,
`legacy child pid \${legacy_pid} supervised by pid \${legacy_supervisor}`,
}
for _, needle := range required {
if !strings.Contains(script, needle) {
t.Fatalf("install.sh missing FreeBSD daemon supervisor contract: %s", needle)
}
}
if strings.Contains(script, `/usr/sbin/daemon -r -p \${pidfile}`) {
t.Fatal("install.sh still writes the child pid to the service pidfile under daemon -r")
}
}
func TestInstallSHUsesCanonicalCompletionHelper(t *testing.T) {
content, err := os.ReadFile(repoFile("scripts", "install.sh"))
if err != nil {