Recover missing hosted tenant runtimes

This commit is contained in:
rcourtman 2026-04-24 00:09:52 +01:00
parent 4dfe69fe6f
commit 9dbaaa7efe
5 changed files with 155 additions and 5 deletions

View file

@ -172,6 +172,9 @@ func newTenantRuntimeReconcileCmd() *cobra.Command {
result.ActiveImageID,
result.ReconciledOnly,
)
if result.RestoredMissing {
fmt.Printf("restored_missing=%t\n", result.RestoredMissing)
}
if result.PreviousContainerID != "" {
fmt.Printf("previous_container_id=%s\n", result.PreviousContainerID)
}

View file

@ -141,6 +141,12 @@ cloud-specific enforcement rules.
through the control-plane Docker manager so tenant runtime logging cannot
fill the live Pulse Cloud host independently of tenant data quotas.
10. `internal/cloudcp/tenant_runtime_rollout.go` shared with `deployment-installability`: hosted tenant runtime rollout is both a Pulse Cloud runtime contract boundary and a deployment-installability release-rollout boundary.
Hosted tenant runtime reconciliation must treat a registered tenant with
preserved tenant data but no live Docker runtime as a recoverable managed
state, not as a terminal skip. The control-plane-owned reconcile path must
recreate the canonical tenant container, health-check it, and persist the
new runtime identity before hosted billing/auth surfaces are considered
coherent.
The real `pulse-pro` license-server legacy checkout issuance, recurring
renewals, manual issue, and legacy exchange flows are part of that same

View file

@ -192,6 +192,11 @@ server-side update execution surfaces.
`cmd/pulse-control-plane/main.go`, `internal/cloudcp/docker/manager.go`,
`internal/cloudcp/docker/labels.go`, and
`internal/cloudcp/tenant_runtime_rollout.go`
The batch reconcile command must be restorative as well as corrective:
when a tenant registry row and tenant data remain but the canonical or
recorded Docker container is missing, dry-run must classify the tenant for
mutation and the live command must recreate the container, prove health, and
rewrite the registry runtime identity through the same control-plane path.
10. Add or change the canonical hosted staging smoke operator path through `scripts/run_hosted_staging_smoke.sh`, `tests/integration/scripts/bootstrap-hosted-mobile-onboarding.mjs`, `tests/integration/scripts/hosted-mobile-token-runtime.mjs`, `tests/integration/scripts/hosted-tenant-runtime.mjs`, and `tests/integration/scripts/relay-mobile-token-helper.go`
## Forbidden Paths

View file

@ -43,6 +43,7 @@ type TenantRuntimeRolloutResult struct {
ActiveImageID string
BackupContainerName string
ReconciledOnly bool
RestoredMissing bool
}
type TenantRuntimeContractReconcilePlanOptions struct {
@ -98,6 +99,7 @@ type tenantRuntimeRolloutService struct {
registry tenantRuntimeRolloutRegistry
docker tenantRuntimeRolloutDocker
tenantsDir string
defaultImage string
synchronizer tenantRuntimeRolloutSynchronizer
now func() time.Time
sleep func(time.Duration)
@ -105,6 +107,8 @@ type tenantRuntimeRolloutService struct {
healthPoll time.Duration
}
// errTenantRuntimeMissing is the sentinel wrapped into errors returned when a
// tenant's runtime container cannot be found. Callers match it with errors.Is
// to treat the tenant as recoverable (recreate the runtime) rather than as a
// hard failure.
var errTenantRuntimeMissing = errors.New("tenant runtime container missing")
// RolloutTenantRuntime executes the canonical hosted tenant runtime rollout
// path using the control plane's registry and Docker manager.
func RolloutTenantRuntime(ctx context.Context, cfg *CPConfig, opts TenantRuntimeRolloutOptions) (*TenantRuntimeRolloutResult, error) {
@ -183,6 +187,7 @@ func newTenantRuntimeRolloutServiceFromConfig(
registry: reg,
docker: dockerMgr,
tenantsDir: cfg.TenantsDir(),
defaultImage: strings.TrimSpace(image),
synchronizer: filesystemTenantRuntimeSynchronizer{},
now: func() time.Time { return time.Now().UTC() },
sleep: time.Sleep,
@ -203,6 +208,9 @@ func (s *tenantRuntimeRolloutService) Rollout(ctx context.Context, opts TenantRu
return nil, fmt.Errorf("tenant id is required")
}
image := strings.TrimSpace(opts.Image)
if image == "" {
image = strings.TrimSpace(s.defaultImage)
}
if image == "" {
return nil, fmt.Errorf("image is required")
}
@ -227,6 +235,9 @@ func (s *tenantRuntimeRolloutService) Rollout(ctx context.Context, opts TenantRu
live, err := s.resolveLiveContainer(ctx, tenant)
if err != nil {
if errors.Is(err, errTenantRuntimeMissing) {
return s.recreateMissingRuntime(ctx, tenant, image, healthTimeout, healthPoll)
}
return nil, err
}
if strings.TrimSpace(tenant.ContainerID) != "" && tenant.ContainerID != live.ID {
@ -369,6 +380,59 @@ func (s *tenantRuntimeRolloutService) Rollout(ctx context.Context, opts TenantRu
}, nil
}
// recreateMissingRuntime restores a tenant whose runtime container could not
// be resolved by creating a fresh container over the tenant's existing data
// directory (s.tenantsDir/<tenant id>).
//
// The sequence is: create and start the container, wait for it to pass health
// checks (removing it again if it does not), re-resolve the live container,
// require that it carries the canonical tenant container name, and persist the
// new runtime state as healthy.
//
// Parameters:
//   - tenant: registry record to restore; must be non-nil with a non-blank ID.
//   - image: image reference used for error context only in this function.
//   - healthTimeout, healthPoll: forwarded unchanged to waitForHealth.
//
// On success the result has RestoredMissing set, the Active* fields taken from
// the re-resolved container, and PreviousContainerID holding the container ID
// that was recorded on the tenant when this function was entered.
//
// NOTE(review): image is not passed to CreateAndStart, so the image actually
// used is whatever the Docker manager is configured with — confirm it matches
// the image named in the error message.
func (s *tenantRuntimeRolloutService) recreateMissingRuntime(
	ctx context.Context,
	tenant *registry.Tenant,
	image string,
	healthTimeout time.Duration,
	healthPoll time.Duration,
) (*TenantRuntimeRolloutResult, error) {
	if tenant == nil {
		return nil, fmt.Errorf("tenant is nil")
	}
	tenantID := strings.TrimSpace(tenant.ID)
	if tenantID == "" {
		return nil, fmt.Errorf("tenant id is required")
	}

	// Capture the recorded container ID before anything can touch the tenant:
	// persistTenantRuntimeState below receives this same pointer and may
	// rewrite tenant.ContainerID, which would otherwise make the result report
	// the new container as the "previous" one.
	previousContainerID := strings.TrimSpace(tenant.ContainerID)

	tenantDataDir := filepath.Join(s.tenantsDir, tenantID)
	newContainerID, err := s.docker.CreateAndStart(ctx, tenantID, tenantDataDir)
	if err != nil {
		return nil, fmt.Errorf("recreate missing tenant runtime for %s using image %s: %w", tenantID, image, err)
	}

	healthy, err := s.waitForHealth(ctx, newContainerID, healthTimeout, healthPoll)
	if err != nil || !healthy {
		if err == nil {
			err = fmt.Errorf("tenant runtime %s failed health checks", newContainerID)
		}
		// Do not leave an unhealthy container behind; report a cleanup
		// failure alongside the original health error.
		if removeErr := s.docker.Remove(ctx, newContainerID); removeErr != nil {
			return nil, fmt.Errorf("%w; cleanup failed for recreated container %s: %v", err, newContainerID, removeErr)
		}
		return nil, err
	}

	newInfo, err := s.resolveLiveContainer(ctx, tenant)
	if err != nil {
		return nil, fmt.Errorf("inspect recreated tenant runtime %s: %w", tenantID, err)
	}
	if newInfo.Name != tenantRuntimeContainerName(tenantID) {
		return nil, fmt.Errorf("recreated tenant runtime is not using canonical container name %s", tenantRuntimeContainerName(tenantID))
	}
	if err := s.persistTenantRuntimeState(tenant, newInfo, true); err != nil {
		return nil, err
	}

	return &TenantRuntimeRolloutResult{
		TenantID:            tenantID,
		PreviousContainerID: previousContainerID,
		ActiveContainerID:   newInfo.ID,
		ActiveImageRef:      newInfo.ImageRef,
		ActiveImageID:       newInfo.ImageID,
		RestoredMissing:     true,
	}, nil
}
func tenantRuntimeMatchesContract(
live *cpDocker.RuntimeContainerInfo,
canonicalName string,
@ -404,8 +468,17 @@ func (s *tenantRuntimeRolloutService) PlanContractReconcile(
}
live, err := s.resolveLiveContainer(ctx, tenant)
if err != nil {
item.Action = tenantRuntimeContractActionSkip
item.Reason = err.Error()
if errors.Is(err, errTenantRuntimeMissing) {
desiredRouting := s.docker.DesiredRuntimeRouting(tenant.ID)
item.ImageRef = strings.TrimSpace(s.defaultImage)
item.DesiredRouteHost = desiredRouting.Host
item.DesiredPublicURL = desiredRouting.PublicURL
item.Action = tenantRuntimeContractActionRollout
item.Reason = "tenant runtime container is missing; recreate from existing tenant data"
} else {
item.Action = tenantRuntimeContractActionSkip
item.Reason = err.Error()
}
plan.Tenants = append(plan.Tenants, item)
continue
}
@ -506,10 +579,13 @@ func (s *tenantRuntimeRolloutService) resolveLiveContainer(ctx context.Context,
containerID := strings.TrimSpace(tenant.ContainerID)
if containerID == "" {
return nil, fmt.Errorf("tenant %s has no canonical runtime container and no registry container_id", tenant.ID)
return nil, fmt.Errorf("%w: tenant %s has no canonical runtime container and no registry container_id", errTenantRuntimeMissing, tenant.ID)
}
info, err = s.docker.Inspect(ctx, containerID)
if err != nil {
if cpDocker.IsNotFound(err) {
return nil, fmt.Errorf("%w: inspect tenant container %s: %w", errTenantRuntimeMissing, containerID, err)
}
return nil, fmt.Errorf("inspect tenant container %s: %w", containerID, err)
}
return info, nil

View file

@ -314,6 +314,59 @@ func TestTenantRuntimeRollout_RecreatesSameImageWhenRoutingContractDrifts(t *tes
}
}
// TestTenantRuntimeRollout_RecreatesMissingRuntimeFromTenantData drives
// Rollout for a tenant whose registry row still records a container ID while
// the fake Docker manager has only a queued create result. It expects the
// rollout to report RestoredMissing (not ReconciledOnly), take no snapshots,
// create exactly one container over the tenant's data directory, and persist
// the recreated container's ID, image digest, and healthy status.
func TestTenantRuntimeRollout_RecreatesMissingRuntimeFromTenantData(t *testing.T) {
	missingTenant := &registry.Tenant{ID: "t-MISSING01", ContainerID: "removed-container"}
	fakeRegistry := &fakeTenantRuntimeRolloutRegistry{tenant: missingTenant}
	fakeDocker := newFakeTenantRuntimeRolloutDocker()
	desired := fakeDocker.DesiredRuntimeRouting(missingTenant.ID)

	// The container the fake Docker manager hands back from the create call.
	recreated := &cpDocker.RuntimeContainerInfo{
		ID:        "recreated-container",
		Name:      tenantRuntimeContainerName(missingTenant.ID),
		ImageRef:  "pulse-runtime:stable",
		ImageID:   "sha256:stable",
		Running:   true,
		RouteHost: desired.Host,
		PublicURL: desired.PublicURL,
	}
	fakeDocker.queueCreate(recreated, nil)
	fakeDocker.health["recreated-container"] = []bool{true}

	synchronizer := &fakeTenantRuntimeRolloutSynchronizer{}
	fakeClock := newFakeTenantRuntimeRolloutClock()
	svc := newTestTenantRuntimeRolloutService(fakeRegistry, fakeDocker, synchronizer, fakeClock)

	rolloutOpts := TenantRuntimeRolloutOptions{
		TenantID: missingTenant.ID,
		Image:    "pulse-runtime:stable",
	}
	result, err := svc.Rollout(context.Background(), rolloutOpts)
	if err != nil {
		t.Fatalf("Rollout() error = %v", err)
	}

	// Result flags: this is a restore, not a reconcile-only pass.
	if restored := result.RestoredMissing; !restored {
		t.Fatalf("RestoredMissing = false, want true")
	}
	if reconciledOnly := result.ReconciledOnly; reconciledOnly {
		t.Fatalf("ReconciledOnly = true, want false")
	}

	// Side effects on the synchronizer and Docker fakes.
	if n := len(synchronizer.snapshots); n != 0 {
		t.Fatalf("snapshot count = %d, want 0", n)
	}
	if n := len(fakeDocker.createCalls); n != 1 {
		t.Fatalf("create call count = %d, want 1", n)
	}
	if got := fakeDocker.createCalls[0].tenantDataDir; got != filepath.Join(tTempDirForRolloutService(), missingTenant.ID) {
		t.Fatalf("created tenant data dir = %q", got)
	}

	// Registry state persisted for the recreated runtime.
	if got := fakeRegistry.updatedTenant.ContainerID; got != "recreated-container" {
		t.Fatalf("updated tenant container id = %q, want recreated-container", got)
	}
	if got := fakeRegistry.updatedTenant.CurrentImageDigest; got != "sha256:stable" {
		t.Fatalf("updated tenant image digest = %q, want sha256:stable", got)
	}
	if ok := fakeRegistry.updatedTenant.HealthCheckOK; !ok {
		t.Fatalf("updated tenant health_check_ok = false, want true")
	}
}
func TestTenantRuntimeContractReconcilePlan_AllTenantsClassifiesNoopRolloutAndSkip(t *testing.T) {
tenantNoop := &registry.Tenant{ID: "t-NOOP12345", ContainerID: "noop-live"}
tenantDrift := &registry.Tenant{ID: "t-Drift1234", ContainerID: "drift-live"}
@ -366,8 +419,14 @@ func TestTenantRuntimeContractReconcilePlan_AllTenantsClassifiesNoopRolloutAndSk
if got[tenantDrift.ID].DesiredRouteHost != routingDrift.Host {
t.Fatalf("drift tenant desired route host = %q, want %q", got[tenantDrift.ID].DesiredRouteHost, routingDrift.Host)
}
if got[tenantMissing.ID].Action != tenantRuntimeContractActionSkip {
t.Fatalf("missing tenant action = %q, want %q", got[tenantMissing.ID].Action, tenantRuntimeContractActionSkip)
if got[tenantMissing.ID].Action != tenantRuntimeContractActionRollout {
t.Fatalf("missing tenant action = %q, want %q", got[tenantMissing.ID].Action, tenantRuntimeContractActionRollout)
}
if got[tenantMissing.ID].ImageRef != "pulse-runtime:stable" {
t.Fatalf("missing tenant image ref = %q, want pulse-runtime:stable", got[tenantMissing.ID].ImageRef)
}
if got[tenantMissing.ID].DesiredRouteHost == "" {
t.Fatalf("missing tenant desired route host is empty")
}
}
@ -416,6 +475,7 @@ func newTestTenantRuntimeRolloutService(
registry: reg,
docker: docker,
tenantsDir: tTempDirForRolloutService(),
defaultImage: "pulse-runtime:stable",
synchronizer: sync,
now: clock.Now,
sleep: clock.Sleep,