Tighten Pulse Cloud residue audit

This commit is contained in:
rcourtman 2026-04-24 12:22:13 +01:00
parent ca9f2ceb85
commit 5fd6456302
10 changed files with 269 additions and 16 deletions

View file

@ -337,6 +337,22 @@ func printCloudAuditReport(report *cloudcp.CloudAuditReport) {
tenant.Age.Round(time.Second),
)
}
fmt.Printf("proof_account_stale_count=%d\n", len(report.StaleProofAccounts))
for _, account := range report.StaleProofAccounts {
fmt.Printf("proof_account_stale=%s kind=%s age=%s\n",
account.AccountID,
account.Kind,
account.Age.Round(time.Second),
)
}
fmt.Printf("hosted_paid_orphan_entitlement_count=%d\n", len(report.OrphanPaidHostedEntitlements))
for _, entitlement := range report.OrphanPaidHostedEntitlements {
fmt.Printf("hosted_paid_orphan_entitlement=%s tenant_id=%s kind=%s\n",
entitlement.EntitlementID,
entitlement.TenantID,
entitlement.Kind,
)
}
for _, container := range report.ManagedRuntimeContainers {
if container.State == "running" && (container.HealthStatus == "" || container.HealthStatus == "none" || container.HealthStatus == "healthy") {
continue

View file

@ -145,6 +145,9 @@ cloud-specific enforcement rules.
filesystem, tenant data, Docker runtime store, and Docker build-cache
thresholds are part of the Cloud paid readiness contract rather than an
operator-only cleanup script.
The same cloud audit contract must fail on stale proof/canary account rows
and paid hosted entitlements whose tenant rows are missing, because either
residue can recreate or mask hosted runtime state after a cleanup.
10. `internal/cloudcp/tenant_runtime_rollout.go` shared with `deployment-installability`: hosted tenant runtime rollout is both a Pulse Cloud runtime contract boundary and a deployment-installability release-rollout boundary.
Hosted tenant runtime reconciliation must treat a registered tenant with
preserved tenant data but no live Docker runtime as a recoverable managed

View file

@ -669,8 +669,9 @@ That deployment boundary also owns hosted storage admission: production
control-plane deployments must mount host root and Docker runtime storage
read-only for inspection, expose explicit root/data/Docker/build-cache
thresholds, and provide `pulse-control-plane cloud audit` as the operator proof
for tenant counts, unhealthy managed containers, disk pressure, stale proof
tenants/accounts, and orphan paid hosted entitlements before GA or rollout
evidence is accepted.
That same verification contract also applies before Playwright attaches: if a
managed hot-dev session is already running when the verify lock is active, the
integration launcher must restart that session instead of silently attaching to

View file

@ -20,19 +20,35 @@ type ProofTenantAuditItem struct {
Age time.Duration
}
// ProofAccountAuditItem describes one account that the cloud audit reports as
// stale proof/canary residue.
type ProofAccountAuditItem struct {
// AccountID is the registry account ID with surrounding whitespace trimmed.
AccountID string
// Kind is the account kind copied from the registry account.
Kind registry.AccountKind
// CreatedAt is the account creation time converted to UTC.
CreatedAt time.Time
// Age is the time elapsed between CreatedAt and the audit's reference time.
Age time.Duration
}
// HostedEntitlementAuditItem describes one hosted entitlement that the cloud
// audit reports, currently paid entitlements whose tenant row is missing.
type HostedEntitlementAuditItem struct {
// EntitlementID is the entitlement ID with surrounding whitespace trimmed.
EntitlementID string
// TenantID is the trimmed tenant ID the entitlement refers to.
TenantID string
// Kind is the entitlement kind copied from the registry row.
Kind registry.HostedEntitlementKind
// IssuedAt is the entitlement issue time converted to UTC.
IssuedAt time.Time
}
// CloudAuditReport is the result of a Pulse Cloud audit run. It aggregates the
// registry, storage, and Docker runtime findings that AuditCloud collects and
// that the CLI prints as key=value lines.
//
// Each field is declared exactly once; Go rejects a struct that repeats a
// field name.
type CloudAuditReport struct {
	// OK is the overall verdict.
	// NOTE(review): presumably cleared when a failure is recorded; the code
	// that maintains it is not visible in this section — confirm.
	OK bool
	// Failures holds the human-readable failure messages recorded during the
	// audit (for example stale proof tenants/accounts and orphan paid
	// entitlements).
	Failures []string
	// Storage is the storage guardrail report; it is populated outside the
	// code visible here.
	Storage *StorageGuardrailReport
	// TenantCounts counts non-nil registry tenants per tenant state.
	TenantCounts map[registry.TenantState]int
	// TenantTotal is the number of tenant rows returned by the registry.
	TenantTotal int
	// RegistryUnhealthyActive counts tenants in the active state whose
	// registry health check flag is false.
	RegistryUnhealthyActive int
	// DockerManagedTotal, DockerManagedRunning, and DockerManagedUnhealthy
	// summarize managed runtime containers.
	// NOTE(review): how they are derived is not visible in this section.
	DockerManagedTotal     int
	DockerManagedRunning   int
	DockerManagedUnhealthy int
	// DockerUnavailable is a textual Docker availability note.
	// NOTE(review): presumably empty when Docker is reachable — confirm.
	DockerUnavailable string
	// StaleProofTenants lists proof/canary tenants older than the configured
	// maximum age.
	StaleProofTenants []ProofTenantAuditItem
	// StaleProofAccounts lists proof/canary accounts older than the configured
	// maximum age.
	StaleProofAccounts []ProofAccountAuditItem
	// OrphanPaidHostedEntitlements lists paid hosted entitlements that
	// reference a tenant ID absent from the registry.
	OrphanPaidHostedEntitlements []HostedEntitlementAuditItem
	// ManagedRuntimeContainers lists the managed runtime containers that were
	// inspected.
	ManagedRuntimeContainers []cpDocker.RuntimeContainerSummary
}
func AuditCloud(ctx context.Context, cfg *CPConfig) (*CloudAuditReport, error) {
@ -54,11 +70,13 @@ func AuditCloud(ctx context.Context, cfg *CPConfig) (*CloudAuditReport, error) {
if err != nil {
return nil, fmt.Errorf("list tenants: %w", err)
}
tenantIDs := make(map[string]struct{}, len(tenants))
report.TenantTotal = len(tenants)
for _, tenant := range tenants {
if tenant == nil {
continue
}
tenantIDs[strings.TrimSpace(tenant.ID)] = struct{}{}
report.TenantCounts[tenant.State]++
if tenant.State == registry.TenantStateActive && !tenant.HealthCheckOK {
report.RegistryUnhealthyActive++
@ -72,6 +90,28 @@ func AuditCloud(ctx context.Context, cfg *CPConfig) (*CloudAuditReport, error) {
report.addFailure(fmt.Sprintf("%d proof/canary tenants are older than %s", len(report.StaleProofTenants), cfg.ProofTenantMaxAge))
}
entitlements, err := reg.ListHostedEntitlements()
if err != nil {
return nil, fmt.Errorf("list hosted entitlements: %w", err)
}
report.OrphanPaidHostedEntitlements = findOrphanPaidHostedEntitlements(entitlements, tenantIDs)
if len(report.OrphanPaidHostedEntitlements) > 0 {
report.addFailure(fmt.Sprintf("%d paid hosted entitlements reference missing tenants", len(report.OrphanPaidHostedEntitlements)))
}
accounts, err := reg.ListAccounts()
if err != nil {
return nil, fmt.Errorf("list accounts: %w", err)
}
stripeAccounts, err := reg.ListStripeAccounts()
if err != nil {
return nil, fmt.Errorf("list stripe accounts: %w", err)
}
report.StaleProofAccounts = findStaleProofAccounts(accounts, stripeAccounts, cfg.ProofTenantMatchers, cfg.ProofTenantMaxAge, time.Now().UTC())
if len(report.StaleProofAccounts) > 0 {
report.addFailure(fmt.Sprintf("%d proof/canary accounts are older than %s", len(report.StaleProofAccounts), cfg.ProofTenantMaxAge))
}
dockerMgr, err := cpDocker.NewManager(cpDocker.ManagerConfig{
Image: cfg.PulseImage,
Network: cfg.DockerNetwork,
@ -176,6 +216,77 @@ func findStaleProofTenants(tenants []*registry.Tenant, matchers []string, maxAge
return items
}
// findOrphanPaidHostedEntitlements reports every paid hosted entitlement whose
// tenant ID is not a key of tenantIDs.
//
// Nil entries, non-paid entitlements, and entitlements whose trimmed tenant ID
// is empty are ignored. Lookups use the whitespace-trimmed tenant ID. The
// result is never nil and is ordered by IssuedAt ascending, with EntitlementID
// breaking ties.
func findOrphanPaidHostedEntitlements(entitlements []*registry.HostedEntitlement, tenantIDs map[string]struct{}) []HostedEntitlementAuditItem {
	orphans := []HostedEntitlementAuditItem{}
	for _, candidate := range entitlements {
		if candidate == nil {
			continue
		}
		if candidate.Kind != registry.HostedEntitlementKindPaid {
			continue
		}
		ownerID := strings.TrimSpace(candidate.TenantID)
		_, known := tenantIDs[ownerID]
		if ownerID == "" || known {
			continue
		}
		orphans = append(orphans, HostedEntitlementAuditItem{
			EntitlementID: strings.TrimSpace(candidate.ID),
			TenantID:      ownerID,
			Kind:          candidate.Kind,
			IssuedAt:      candidate.IssuedAt.UTC(),
		})
	}

	// Oldest first; the ID keeps the order deterministic for equal timestamps.
	sort.Slice(orphans, func(a, b int) bool {
		left, right := orphans[a], orphans[b]
		if !left.IssuedAt.Equal(right.IssuedAt) {
			return left.IssuedAt.Before(right.IssuedAt)
		}
		return left.EntitlementID < right.EntitlementID
	})
	return orphans
}
// findStaleProofAccounts returns the accounts that match one of the
// proof/canary matchers and were created at or before now-maxAge.
//
// Stripe rows are grouped by their trimmed AccountID and handed to
// matchesProofAccount together with the account, so an account can match via
// its Stripe identifiers as well as its own fields. Accounts that are nil or
// have a zero CreatedAt are skipped.
//
// A non-positive maxAge disables the check and yields nil. Otherwise the
// result is non-nil and ordered by CreatedAt ascending, with AccountID breaking
// ties.
func findStaleProofAccounts(accounts []*registry.Account, stripeAccounts []*registry.StripeAccount, matchers []string, maxAge time.Duration, now time.Time) []ProofAccountAuditItem {
	if maxAge <= 0 {
		return nil
	}

	stripeByAccount := make(map[string][]*registry.StripeAccount, len(stripeAccounts))
	for _, stripeAccount := range stripeAccounts {
		if stripeAccount == nil {
			continue
		}
		accountID := strings.TrimSpace(stripeAccount.AccountID)
		if accountID == "" {
			continue
		}
		stripeByAccount[accountID] = append(stripeByAccount[accountID], stripeAccount)
	}

	cutoff := now.Add(-maxAge)
	items := make([]ProofAccountAuditItem, 0)
	for _, account := range accounts {
		if account == nil || account.CreatedAt.IsZero() || account.CreatedAt.After(cutoff) {
			continue
		}
		// The Stripe index is keyed by trimmed IDs, so normalize the account ID
		// the same way before the lookup; otherwise stray whitespace would hide
		// the account's Stripe rows from the matcher.
		accountID := strings.TrimSpace(account.ID)
		if !matchesProofAccount(account, stripeByAccount[accountID], matchers) {
			continue
		}
		createdAt := account.CreatedAt.UTC()
		items = append(items, ProofAccountAuditItem{
			AccountID: accountID,
			Kind:      account.Kind,
			CreatedAt: createdAt,
			Age:       now.Sub(createdAt),
		})
	}

	// Oldest first; the ID keeps the order deterministic for equal timestamps.
	sort.Slice(items, func(i, j int) bool {
		if items[i].CreatedAt.Equal(items[j].CreatedAt) {
			return items[i].AccountID < items[j].AccountID
		}
		return items[i].CreatedAt.Before(items[j].CreatedAt)
	})
	return items
}
func matchesProofTenant(tenant *registry.Tenant, matchers []string) bool {
if tenant == nil {
return false
@ -202,3 +313,37 @@ func matchesProofTenant(tenant *registry.Tenant, matchers []string) bool {
}
return false
}
// matchesProofAccount reports whether any non-blank matcher occurs,
// case-insensitively, in the account's searchable text.
//
// The searchable text is the space-joined account ID, kind, and display name
// followed by the customer ID, subscription ID, workspaces sub-item ID, plan
// version, and subscription state of every non-nil Stripe row supplied.
// Matchers are trimmed before comparison; blank matchers never match. A nil
// account never matches.
func matchesProofAccount(account *registry.Account, stripeAccounts []*registry.StripeAccount, matchers []string) bool {
	if account == nil {
		return false
	}

	fields := make([]string, 0, 3+5*len(stripeAccounts))
	fields = append(fields, account.ID, string(account.Kind), account.DisplayName)
	for _, linked := range stripeAccounts {
		if linked != nil {
			fields = append(fields,
				linked.StripeCustomerID,
				linked.StripeSubscriptionID,
				linked.StripeSubItemWorkspacesID,
				linked.PlanVersion,
				linked.SubscriptionState,
			)
		}
	}
	searchable := strings.ToLower(strings.Join(fields, " "))

	for _, raw := range matchers {
		needle := strings.ToLower(strings.TrimSpace(raw))
		if needle != "" && strings.Contains(searchable, needle) {
			return true
		}
	}
	return false
}

View file

@ -175,7 +175,7 @@ func LoadConfig() (*CPConfig, error) {
StorageMinDockerAvailableBytes: storageMinDockerAvailable,
StorageMaxDockerBuildCacheBytes: storageMaxDockerBuildCache,
ProofTenantMaxAge: proofTenantMaxAge,
ProofTenantMatchers: parseCSVEnv("CP_PROOF_TENANT_MATCHERS", "proof,canary,rehearsal"),
ProofTenantMatchers: parseCSVEnv("CP_PROOF_TENANT_MATCHERS", "proof,canary,rehearsal,msp_prod,ownerseed,owner_seed"),
StripeWebhookSecret: strings.TrimSpace(os.Getenv("STRIPE_WEBHOOK_SECRET")),
StripeAPIKey: strings.TrimSpace(os.Getenv("STRIPE_API_KEY")),
PublicCloudSignupEnabled: envOrDefaultBool("CP_PUBLIC_CLOUD_SIGNUP_ENABLED", false),

View file

@ -165,6 +165,9 @@ func TestLoadConfig_EnablesStorageGuardrailsByDefaultInProduction(t *testing.T)
if cfg.StorageDockerPath != "/var/lib/docker" {
t.Fatalf("StorageDockerPath = %q, want /var/lib/docker", cfg.StorageDockerPath)
}
if got := strings.Join(cfg.ProofTenantMatchers, ","); got != "proof,canary,rehearsal,msp_prod,ownerseed,owner_seed" {
t.Fatalf("ProofTenantMatchers = %q", got)
}
}
func TestLoadConfig_InvalidStorageByteSize(t *testing.T) {

View file

@ -1251,6 +1251,34 @@ func (r *TenantRegistry) GetHostedEntitlementByTrialRequestID(requestID string)
return loadHostedEntitlement(row)
}
// ListHostedEntitlements returns every row of the hosted_entitlements table,
// newest issued_at first.
//
// Each row is decoded with loadHostedEntitlement; a decode error aborts the
// listing and is returned as-is, and nil records are skipped. Query and
// iteration errors are wrapped with context. The returned slice is nil when no
// rows were collected.
func (r *TenantRegistry) ListHostedEntitlements() ([]*HostedEntitlement, error) {
	const listQuery = `
SELECT id, kind, tenant_id, trial_request_id, org_id, email, return_url, instance_token, instance_host,
trial_started_at, refresh_token, activation_token, issued_at, activation_issued_at, last_refreshed_at, redeemed_at, revoked_at
FROM hosted_entitlements
ORDER BY issued_at DESC`

	rows, err := r.db.Query(listQuery)
	if err != nil {
		return nil, fmt.Errorf("list hosted entitlements: %w", err)
	}
	defer rows.Close()

	var entitlements []*HostedEntitlement
	for rows.Next() {
		entitlement, loadErr := loadHostedEntitlement(rows)
		if loadErr != nil {
			return nil, loadErr
		}
		if entitlement == nil {
			continue
		}
		entitlements = append(entitlements, entitlement)
	}
	// rows.Next returning false can hide an iteration failure; surface it.
	if iterErr := rows.Err(); iterErr != nil {
		return nil, fmt.Errorf("iterate hosted entitlements: %w", iterErr)
	}
	return entitlements, nil
}
// MarkHostedEntitlementRefreshed records the last successful hosted entitlement refresh time.
func (r *TenantRegistry) MarkHostedEntitlementRefreshed(id string, refreshedAt time.Time) error {
id = strings.TrimSpace(id)

View file

@ -1102,6 +1102,17 @@ func TestHostedEntitlementLookupAndIssue(t *testing.T) {
if stored != "etr_paid_three" {
t.Fatalf("stored token after revoke = %q, want %q", stored, "etr_paid_three")
}
listed, err := reg.ListHostedEntitlements()
if err != nil {
t.Fatalf("ListHostedEntitlements: %v", err)
}
if len(listed) != 1 {
t.Fatalf("len(ListHostedEntitlements) = %d, want 1", len(listed))
}
if listed[0].ID != paidHostedEntitlementID(tenant.ID) || listed[0].RefreshToken != "etr_paid_three" {
t.Fatalf("listed entitlement = %#v, want current paid entitlement", listed[0])
}
}
func TestTenantRegistryCanonicalizesTenantPlanVersion(t *testing.T) {

View file

@ -119,3 +119,45 @@ func TestFindStaleProofTenantsUsesConfiguredMatchersAndAge(t *testing.T) {
t.Fatalf("stale[0].TenantID = %q, want t-OLDPROOF", stale[0].TenantID)
}
}
// TestFindOrphanPaidHostedEntitlementsFlagsMissingTenants checks that only the
// paid entitlement whose tenant is absent from the known-tenant set is
// reported; the paid entitlement with a known tenant and the tenant-less trial
// entitlement must be ignored.
func TestFindOrphanPaidHostedEntitlementsFlagsMissingTenants(t *testing.T) {
	base := time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC)
	knownTenants := map[string]struct{}{"t-ACTIVE": {}}
	input := []*registry.HostedEntitlement{
		{ID: "paid:t-ACTIVE", Kind: registry.HostedEntitlementKindPaid, TenantID: "t-ACTIVE", IssuedAt: base},
		{ID: "paid:t-MISSING", Kind: registry.HostedEntitlementKindPaid, TenantID: "t-MISSING", IssuedAt: base.Add(-time.Hour)},
		{ID: "trial:req", Kind: registry.HostedEntitlementKindTrial, TenantID: "", IssuedAt: base},
	}

	got := findOrphanPaidHostedEntitlements(input, knownTenants)

	if n := len(got); n != 1 {
		t.Fatalf("len(orphaned) = %d, want 1 (%v)", n, got)
	}
	if id := got[0].EntitlementID; id != "paid:t-MISSING" {
		t.Fatalf("orphaned[0].EntitlementID = %q, want paid:t-MISSING", id)
	}
}
// TestFindStaleProofAccountsUsesAccountAndStripeMatchers checks that an old
// account is reported when a matcher hits either its own fields or one of its
// Stripe identifiers, that a fresh matching account and an old non-matching
// account are ignored, and that equally old results are ordered by account ID.
func TestFindStaleProofAccountsUsesAccountAndStripeMatchers(t *testing.T) {
	now := time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC)
	var (
		createdOld   = now.Add(-48 * time.Hour)
		createdFresh = now.Add(-1 * time.Hour)
	)

	accounts := []*registry.Account{
		{ID: "a_rehearsal_old", Kind: registry.AccountKindMSP, DisplayName: "Production Rehearsal", CreatedAt: createdOld},
		{ID: "a_customer", Kind: registry.AccountKindIndividual, DisplayName: "Customer", CreatedAt: createdOld},
		{ID: "a_stripe_old", Kind: registry.AccountKindMSP, DisplayName: "Pulse", CreatedAt: createdOld},
		{ID: "a_rehearsal_fresh", Kind: registry.AccountKindMSP, DisplayName: "Canary", CreatedAt: createdFresh},
	}
	stripeAccounts := []*registry.StripeAccount{
		{AccountID: "a_stripe_old", StripeCustomerID: "cus_msp_rehearsal_123", PlanVersion: "msp_starter"},
	}
	matchers := []string{"canary", "rehearsal"}

	stale := findStaleProofAccounts(accounts, stripeAccounts, matchers, 24*time.Hour, now)

	if n := len(stale); n != 2 {
		t.Fatalf("len(stale) = %d, want 2 (%v)", n, stale)
	}
	if !(stale[0].AccountID == "a_rehearsal_old" && stale[1].AccountID == "a_stripe_old") {
		t.Fatalf("stale account order = %#v, want rehearsal then stripe", stale)
	}
}

View file

@ -408,7 +408,8 @@ func TestTenantRuntimeRollout_AdmissionFailureStopsMissingRuntimeRestore(t *test
tenant := &registry.Tenant{ID: "t-ADMIT02", ContainerID: "removed-container"}
reg := &fakeTenantRuntimeRolloutRegistry{tenant: tenant}
docker := newFakeTenantRuntimeRolloutDocker()
service := newTestTenantRuntimeRolloutService(reg, docker, &fakeTenantRuntimeRolloutSynchronizer{}, newFakeTenantRuntimeRolloutClock())
sync := &fakeTenantRuntimeRolloutSynchronizer{}
service := newTestTenantRuntimeRolloutService(reg, docker, sync, newFakeTenantRuntimeRolloutClock())
service.admissionCheck = func(context.Context) error {
return errors.New("storage pressure")
}
@ -423,6 +424,9 @@ func TestTenantRuntimeRollout_AdmissionFailureStopsMissingRuntimeRestore(t *test
if len(docker.createCalls) != 0 {
t.Fatalf("create call count = %d, want 0", len(docker.createCalls))
}
if len(sync.restores) != 0 {
t.Fatalf("restore count = %d, want 0", len(sync.restores))
}
}
func TestTenantRuntimeContractReconcilePlan_AllTenantsClassifiesNoopRolloutAndSkip(t *testing.T) {