mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 03:20:11 +00:00
Recover missing hosted tenant runtimes
This commit is contained in:
parent
4dfe69fe6f
commit
9dbaaa7efe
5 changed files with 155 additions and 5 deletions
|
|
@ -172,6 +172,9 @@ func newTenantRuntimeReconcileCmd() *cobra.Command {
|
|||
result.ActiveImageID,
|
||||
result.ReconciledOnly,
|
||||
)
|
||||
if result.RestoredMissing {
|
||||
fmt.Printf("restored_missing=%t\n", result.RestoredMissing)
|
||||
}
|
||||
if result.PreviousContainerID != "" {
|
||||
fmt.Printf("previous_container_id=%s\n", result.PreviousContainerID)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -141,6 +141,12 @@ cloud-specific enforcement rules.
|
|||
through the control-plane Docker manager so tenant runtime logging cannot
|
||||
fill the live Pulse Cloud host independently of tenant data quotas.
|
||||
10. `internal/cloudcp/tenant_runtime_rollout.go` shared with `deployment-installability`: hosted tenant runtime rollout is both a Pulse Cloud runtime contract boundary and a deployment-installability release-rollout boundary.
|
||||
Hosted tenant runtime reconciliation must treat a registered tenant with
|
||||
preserved tenant data but no live Docker runtime as a recoverable managed
|
||||
state, not as a terminal skip. The control-plane-owned reconcile path must
|
||||
recreate the canonical tenant container, health-check it, and persist the
|
||||
new runtime identity before hosted billing/auth surfaces are considered
|
||||
coherent.
|
||||
|
||||
The real `pulse-pro` license-server legacy checkout issuance, recurring
|
||||
renewals, manual issue, and legacy exchange flows are part of that same
|
||||
|
|
|
|||
|
|
@ -192,6 +192,11 @@ server-side update execution surfaces.
|
|||
`cmd/pulse-control-plane/main.go`, `internal/cloudcp/docker/manager.go`,
|
||||
`internal/cloudcp/docker/labels.go`, and
|
||||
`internal/cloudcp/tenant_runtime_rollout.go`
|
||||
The batch reconcile command must be restorative as well as corrective:
|
||||
when a tenant registry row and tenant data remain but the canonical or
|
||||
recorded Docker container is missing, dry-run must classify the tenant for
|
||||
mutation and the live command must recreate the container, prove health, and
|
||||
rewrite the registry runtime identity through the same control-plane path.
|
||||
10. Add or change the canonical hosted staging smoke operator path through `scripts/run_hosted_staging_smoke.sh`, `tests/integration/scripts/bootstrap-hosted-mobile-onboarding.mjs`, `tests/integration/scripts/hosted-mobile-token-runtime.mjs`, `tests/integration/scripts/hosted-tenant-runtime.mjs`, and `tests/integration/scripts/relay-mobile-token-helper.go`
|
||||
|
||||
## Forbidden Paths
|
||||
|
|
|
|||
|
|
@ -43,6 +43,7 @@ type TenantRuntimeRolloutResult struct {
|
|||
ActiveImageID string
|
||||
BackupContainerName string
|
||||
ReconciledOnly bool
|
||||
RestoredMissing bool
|
||||
}
|
||||
|
||||
type TenantRuntimeContractReconcilePlanOptions struct {
|
||||
|
|
@ -98,6 +99,7 @@ type tenantRuntimeRolloutService struct {
|
|||
registry tenantRuntimeRolloutRegistry
|
||||
docker tenantRuntimeRolloutDocker
|
||||
tenantsDir string
|
||||
defaultImage string
|
||||
synchronizer tenantRuntimeRolloutSynchronizer
|
||||
now func() time.Time
|
||||
sleep func(time.Duration)
|
||||
|
|
@ -105,6 +107,8 @@ type tenantRuntimeRolloutService struct {
|
|||
healthPoll time.Duration
|
||||
}
|
||||
|
||||
// errTenantRuntimeMissing is the sentinel wrapped by resolveLiveContainer when
// a tenant has no registry container_id or when inspecting the recorded
// container reports not-found. Callers match it with errors.Is to route the
// tenant into the recreate path instead of treating the error as terminal.
var errTenantRuntimeMissing = errors.New("tenant runtime container missing")
|
||||
|
||||
// RolloutTenantRuntime executes the canonical hosted tenant runtime rollout
|
||||
// path using the control plane's registry and Docker manager.
|
||||
func RolloutTenantRuntime(ctx context.Context, cfg *CPConfig, opts TenantRuntimeRolloutOptions) (*TenantRuntimeRolloutResult, error) {
|
||||
|
|
@ -183,6 +187,7 @@ func newTenantRuntimeRolloutServiceFromConfig(
|
|||
registry: reg,
|
||||
docker: dockerMgr,
|
||||
tenantsDir: cfg.TenantsDir(),
|
||||
defaultImage: strings.TrimSpace(image),
|
||||
synchronizer: filesystemTenantRuntimeSynchronizer{},
|
||||
now: func() time.Time { return time.Now().UTC() },
|
||||
sleep: time.Sleep,
|
||||
|
|
@ -203,6 +208,9 @@ func (s *tenantRuntimeRolloutService) Rollout(ctx context.Context, opts TenantRu
|
|||
return nil, fmt.Errorf("tenant id is required")
|
||||
}
|
||||
image := strings.TrimSpace(opts.Image)
|
||||
if image == "" {
|
||||
image = strings.TrimSpace(s.defaultImage)
|
||||
}
|
||||
if image == "" {
|
||||
return nil, fmt.Errorf("image is required")
|
||||
}
|
||||
|
|
@ -227,6 +235,9 @@ func (s *tenantRuntimeRolloutService) Rollout(ctx context.Context, opts TenantRu
|
|||
|
||||
live, err := s.resolveLiveContainer(ctx, tenant)
|
||||
if err != nil {
|
||||
if errors.Is(err, errTenantRuntimeMissing) {
|
||||
return s.recreateMissingRuntime(ctx, tenant, image, healthTimeout, healthPoll)
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
if strings.TrimSpace(tenant.ContainerID) != "" && tenant.ContainerID != live.ID {
|
||||
|
|
@ -369,6 +380,59 @@ func (s *tenantRuntimeRolloutService) Rollout(ctx context.Context, opts TenantRu
|
|||
}, nil
|
||||
}
|
||||
|
||||
func (s *tenantRuntimeRolloutService) recreateMissingRuntime(
|
||||
ctx context.Context,
|
||||
tenant *registry.Tenant,
|
||||
image string,
|
||||
healthTimeout time.Duration,
|
||||
healthPoll time.Duration,
|
||||
) (*TenantRuntimeRolloutResult, error) {
|
||||
if tenant == nil {
|
||||
return nil, fmt.Errorf("tenant is nil")
|
||||
}
|
||||
tenantID := strings.TrimSpace(tenant.ID)
|
||||
if tenantID == "" {
|
||||
return nil, fmt.Errorf("tenant id is required")
|
||||
}
|
||||
|
||||
tenantDataDir := filepath.Join(s.tenantsDir, tenantID)
|
||||
newContainerID, err := s.docker.CreateAndStart(ctx, tenantID, tenantDataDir)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("recreate missing tenant runtime for %s using image %s: %w", tenantID, image, err)
|
||||
}
|
||||
|
||||
healthy, err := s.waitForHealth(ctx, newContainerID, healthTimeout, healthPoll)
|
||||
if err != nil || !healthy {
|
||||
if err == nil {
|
||||
err = fmt.Errorf("tenant runtime %s failed health checks", newContainerID)
|
||||
}
|
||||
if removeErr := s.docker.Remove(ctx, newContainerID); removeErr != nil {
|
||||
return nil, fmt.Errorf("%w; cleanup failed for recreated container %s: %v", err, newContainerID, removeErr)
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
newInfo, err := s.resolveLiveContainer(ctx, tenant)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("inspect recreated tenant runtime %s: %w", tenantID, err)
|
||||
}
|
||||
if newInfo.Name != tenantRuntimeContainerName(tenantID) {
|
||||
return nil, fmt.Errorf("recreated tenant runtime is not using canonical container name %s", tenantRuntimeContainerName(tenantID))
|
||||
}
|
||||
if err := s.persistTenantRuntimeState(tenant, newInfo, true); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &TenantRuntimeRolloutResult{
|
||||
TenantID: tenantID,
|
||||
PreviousContainerID: strings.TrimSpace(tenant.ContainerID),
|
||||
ActiveContainerID: newInfo.ID,
|
||||
ActiveImageRef: newInfo.ImageRef,
|
||||
ActiveImageID: newInfo.ImageID,
|
||||
RestoredMissing: true,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func tenantRuntimeMatchesContract(
|
||||
live *cpDocker.RuntimeContainerInfo,
|
||||
canonicalName string,
|
||||
|
|
@ -404,8 +468,17 @@ func (s *tenantRuntimeRolloutService) PlanContractReconcile(
|
|||
}
|
||||
live, err := s.resolveLiveContainer(ctx, tenant)
|
||||
if err != nil {
|
||||
item.Action = tenantRuntimeContractActionSkip
|
||||
item.Reason = err.Error()
|
||||
if errors.Is(err, errTenantRuntimeMissing) {
|
||||
desiredRouting := s.docker.DesiredRuntimeRouting(tenant.ID)
|
||||
item.ImageRef = strings.TrimSpace(s.defaultImage)
|
||||
item.DesiredRouteHost = desiredRouting.Host
|
||||
item.DesiredPublicURL = desiredRouting.PublicURL
|
||||
item.Action = tenantRuntimeContractActionRollout
|
||||
item.Reason = "tenant runtime container is missing; recreate from existing tenant data"
|
||||
} else {
|
||||
item.Action = tenantRuntimeContractActionSkip
|
||||
item.Reason = err.Error()
|
||||
}
|
||||
plan.Tenants = append(plan.Tenants, item)
|
||||
continue
|
||||
}
|
||||
|
|
@ -506,10 +579,13 @@ func (s *tenantRuntimeRolloutService) resolveLiveContainer(ctx context.Context,
|
|||
|
||||
containerID := strings.TrimSpace(tenant.ContainerID)
|
||||
if containerID == "" {
|
||||
return nil, fmt.Errorf("tenant %s has no canonical runtime container and no registry container_id", tenant.ID)
|
||||
return nil, fmt.Errorf("%w: tenant %s has no canonical runtime container and no registry container_id", errTenantRuntimeMissing, tenant.ID)
|
||||
}
|
||||
info, err = s.docker.Inspect(ctx, containerID)
|
||||
if err != nil {
|
||||
if cpDocker.IsNotFound(err) {
|
||||
return nil, fmt.Errorf("%w: inspect tenant container %s: %w", errTenantRuntimeMissing, containerID, err)
|
||||
}
|
||||
return nil, fmt.Errorf("inspect tenant container %s: %w", containerID, err)
|
||||
}
|
||||
return info, nil
|
||||
|
|
|
|||
|
|
@ -314,6 +314,59 @@ func TestTenantRuntimeRollout_RecreatesSameImageWhenRoutingContractDrifts(t *tes
|
|||
}
|
||||
}
|
||||
|
||||
func TestTenantRuntimeRollout_RecreatesMissingRuntimeFromTenantData(t *testing.T) {
|
||||
tenant := ®istry.Tenant{ID: "t-MISSING01", ContainerID: "removed-container"}
|
||||
reg := &fakeTenantRuntimeRolloutRegistry{tenant: tenant}
|
||||
docker := newFakeTenantRuntimeRolloutDocker()
|
||||
routing := docker.DesiredRuntimeRouting(tenant.ID)
|
||||
docker.queueCreate(&cpDocker.RuntimeContainerInfo{
|
||||
ID: "recreated-container",
|
||||
Name: tenantRuntimeContainerName(tenant.ID),
|
||||
ImageRef: "pulse-runtime:stable",
|
||||
ImageID: "sha256:stable",
|
||||
Running: true,
|
||||
RouteHost: routing.Host,
|
||||
PublicURL: routing.PublicURL,
|
||||
}, nil)
|
||||
docker.health["recreated-container"] = []bool{true}
|
||||
sync := &fakeTenantRuntimeRolloutSynchronizer{}
|
||||
clock := newFakeTenantRuntimeRolloutClock()
|
||||
|
||||
service := newTestTenantRuntimeRolloutService(reg, docker, sync, clock)
|
||||
result, err := service.Rollout(context.Background(), TenantRuntimeRolloutOptions{
|
||||
TenantID: tenant.ID,
|
||||
Image: "pulse-runtime:stable",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Rollout() error = %v", err)
|
||||
}
|
||||
|
||||
if !result.RestoredMissing {
|
||||
t.Fatalf("RestoredMissing = false, want true")
|
||||
}
|
||||
if result.ReconciledOnly {
|
||||
t.Fatalf("ReconciledOnly = true, want false")
|
||||
}
|
||||
if len(sync.snapshots) != 0 {
|
||||
t.Fatalf("snapshot count = %d, want 0", len(sync.snapshots))
|
||||
}
|
||||
if len(docker.createCalls) != 1 {
|
||||
t.Fatalf("create call count = %d, want 1", len(docker.createCalls))
|
||||
}
|
||||
if docker.createCalls[0].tenantDataDir != filepath.Join(tTempDirForRolloutService(), tenant.ID) {
|
||||
t.Fatalf("created tenant data dir = %q", docker.createCalls[0].tenantDataDir)
|
||||
}
|
||||
if got := reg.updatedTenant.ContainerID; got != "recreated-container" {
|
||||
t.Fatalf("updated tenant container id = %q, want recreated-container", got)
|
||||
}
|
||||
if got := reg.updatedTenant.CurrentImageDigest; got != "sha256:stable" {
|
||||
t.Fatalf("updated tenant image digest = %q, want sha256:stable", got)
|
||||
}
|
||||
if !reg.updatedTenant.HealthCheckOK {
|
||||
t.Fatalf("updated tenant health_check_ok = false, want true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTenantRuntimeContractReconcilePlan_AllTenantsClassifiesNoopRolloutAndSkip(t *testing.T) {
|
||||
tenantNoop := &registry.Tenant{ID: "t-NOOP12345", ContainerID: "noop-live"}
|
||||
tenantDrift := &registry.Tenant{ID: "t-Drift1234", ContainerID: "drift-live"}
|
||||
|
|
@ -366,8 +419,14 @@ func TestTenantRuntimeContractReconcilePlan_AllTenantsClassifiesNoopRolloutAndSk
|
|||
if got[tenantDrift.ID].DesiredRouteHost != routingDrift.Host {
|
||||
t.Fatalf("drift tenant desired route host = %q, want %q", got[tenantDrift.ID].DesiredRouteHost, routingDrift.Host)
|
||||
}
|
||||
if got[tenantMissing.ID].Action != tenantRuntimeContractActionSkip {
|
||||
t.Fatalf("missing tenant action = %q, want %q", got[tenantMissing.ID].Action, tenantRuntimeContractActionSkip)
|
||||
if got[tenantMissing.ID].Action != tenantRuntimeContractActionRollout {
|
||||
t.Fatalf("missing tenant action = %q, want %q", got[tenantMissing.ID].Action, tenantRuntimeContractActionRollout)
|
||||
}
|
||||
if got[tenantMissing.ID].ImageRef != "pulse-runtime:stable" {
|
||||
t.Fatalf("missing tenant image ref = %q, want pulse-runtime:stable", got[tenantMissing.ID].ImageRef)
|
||||
}
|
||||
if got[tenantMissing.ID].DesiredRouteHost == "" {
|
||||
t.Fatalf("missing tenant desired route host is empty")
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -416,6 +475,7 @@ func newTestTenantRuntimeRolloutService(
|
|||
registry: reg,
|
||||
docker: docker,
|
||||
tenantsDir: tTempDirForRolloutService(),
|
||||
defaultImage: "pulse-runtime:stable",
|
||||
synchronizer: sync,
|
||||
now: clock.Now,
|
||||
sleep: clock.Sleep,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue