From 429af793baeb4f5dcb4de1ff47d111590ef8c41d Mon Sep 17 00:00:00 2001
From: Rafael Benevides
Date: Wed, 29 Apr 2026 10:58:23 -0300
Subject: [PATCH] HYPERFLEET-856 - feat: add deletion observability metrics and alerts

Add Prometheus metrics to track the lifecycle of resource deletion:

- pending_deletion_total counter: tracks resources entering Pending Deletion
- pending_deletion_duration_seconds histogram: measures time from soft-delete to hard-delete
- pending_deletion_stuck gauge (collector): reports resources stuck beyond the configured threshold

Includes PrometheusRule alerts (warning fires after ~1h total, critical after
~2.5h total), partial indexes on deleted_time for efficient collector queries,
and integration tests for the full metrics pipeline.
---
 charts/templates/prometheusrule.yaml          |  40 +++
 charts/values.yaml                            |  14 +
 cmd/hyperfleet-api/servecmd/cmd.go            |  10 +
 .../server/metrics_middleware.go              |   2 +
 docs/metrics.md                               |  83 ++++++
 pkg/config/flags.go                           |   2 +
 pkg/config/loader.go                          |   6 +
 pkg/config/metrics.go                         |  11 +
 .../202604290001_add_deleted_time_indexes.go  |  25 ++
 pkg/db/migrations/migration_structs.go        |   1 +
 pkg/metrics/deletion.go                       | 172 +++++++++++
 pkg/metrics/deletion_test.go                  | 267 ++++++++++++++++++
 pkg/services/cluster.go                       |  17 +-
 pkg/services/node_pool.go                     |   3 +
 test/integration/deletion_metrics_test.go     |  93 ++++++
 15 files changed, 741 insertions(+), 5 deletions(-)
 create mode 100644 charts/templates/prometheusrule.yaml
 create mode 100644 pkg/db/migrations/202604290001_add_deleted_time_indexes.go
 create mode 100644 pkg/metrics/deletion.go
 create mode 100644 pkg/metrics/deletion_test.go
 create mode 100644 test/integration/deletion_metrics_test.go

diff --git a/charts/templates/prometheusrule.yaml b/charts/templates/prometheusrule.yaml
new file mode 100644
index 00000000..000c44d6
--- /dev/null
+++ b/charts/templates/prometheusrule.yaml
@@ -0,0 +1,40 @@
+{{- if .Values.prometheusRule.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ include "hyperfleet-api.fullname" . }}
+  namespace: {{ .Values.prometheusRule.namespace | default .Release.Namespace }}
+  labels:
+    {{- include "hyperfleet-api.labels" . | nindent 4 }}
+    {{- with .Values.prometheusRule.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  groups:
+    - name: hyperfleet-api-deletion
+      rules:
+        - alert: HyperFleetResourceDeletionStuckWarning
+          expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0
+          for: {{ .Values.prometheusRule.rules.deletionStuck.for | default "30m" }}
+          labels:
+            severity: warning
+          annotations:
+            summary: "HyperFleet resources stuck in Pending Deletion state"
+            description: >-
+              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
+              Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }}
+              (stuck threshold) + {{ .Values.prometheusRule.rules.deletionStuck.for | default "30m" }} (alert delay).
+ runbook_url: {{ .Values.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }} + - alert: HyperFleetResourceDeletionStuckCritical + expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0 + for: {{ .Values.prometheusRule.rules.deletionTimeout.for | default "2h" }} + labels: + severity: critical + annotations: + summary: "HyperFleet resources timed out in Pending Deletion state" + description: >- + {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in + Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }} + (stuck threshold) + {{ .Values.prometheusRule.rules.deletionTimeout.for | default "2h" }} (alert delay). Immediate investigation required. + runbook_url: {{ .Values.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }} +{{- end }} diff --git a/charts/values.yaml b/charts/values.yaml index de8f9139..b5a95a04 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -126,6 +126,7 @@ config: enabled: false label_metrics_inclusion_duration: 168h + deletion_stuck_threshold: 30m # Health check configuration health: @@ -243,6 +244,19 @@ database: size: 1Gi storageClass: "" +# PrometheusRule for alerting +prometheusRule: + enabled: false + labels: {} + namespace: "" + rules: + deletionStuck: + for: "30m" + runbookUrl: "" + deletionTimeout: + for: "2h" + runbookUrl: "" + # ServiceMonitor for Prometheus Operator serviceMonitor: enabled: false diff --git a/cmd/hyperfleet-api/servecmd/cmd.go b/cmd/hyperfleet-api/servecmd/cmd.go index 61beb0ef..040bcc54 100755 --- a/cmd/hyperfleet-api/servecmd/cmd.go +++ b/cmd/hyperfleet-api/servecmd/cmd.go @@ -18,6 +18,7 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_session" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/health" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/telemetry" ) @@ -129,6 +130,15 @@ func runServe(cmd *cobra.Command, args []string) { "masking_enabled", environments.Environment().Config.Logging.Masking.Enabled, ).Info("Logger initialized") + if sf := environments.Environment().Database.SessionFactory; sf != nil { + if err := metrics.RegisterCollector( + sf.DirectDB(), + environments.Environment().Config.Metrics.DeletionStuckThreshold, + ); err != nil { + logger.WithError(ctx, err).Error("Failed to register pending deletion collector") + } + } + apiServer := server.NewAPIServer(tracingEnabled) go apiServer.Start() diff --git a/cmd/hyperfleet-api/server/metrics_middleware.go b/cmd/hyperfleet-api/server/metrics_middleware.go index dbfe2697..54ac053a 100755 --- a/cmd/hyperfleet-api/server/metrics_middleware.go +++ b/cmd/hyperfleet-api/server/metrics_middleware.go @@ -62,6 +62,7 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_metrics" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" ) // MetricsMiddleware creates a new handler that collects metrics for the requests processed by the @@ -112,6 +113,7 @@ func ResetMetricCollectors() { requestCountMetric.Reset() requestDurationMetric.Reset() db_metrics.ResetMetrics() + metrics.ResetMetrics() buildInfoMetric.Reset() buildInfoMetric.With(prometheus.Labels{ metricsComponentLabel: metricsComponentValue, diff --git a/docs/metrics.md b/docs/metrics.md index e6607e0b..82d00736 100644 --- a/docs/metrics.md +++ 
b/docs/metrics.md
@@ -99,6 +99,69 @@ hyperfleet_api_request_duration_seconds_sum{component="api",version="abc123",cod
 hyperfleet_api_request_duration_seconds_count{component="api",version="abc123",code="200",method="GET",path="/api/hyperfleet/v1/clusters"} 1523
 ```
 
+### Deletion Observability Metrics
+
+These metrics track resources in the Pending Deletion state (`deleted_time` set, awaiting hard-delete once all adapters report `Finalized=True`).
+
+#### `hyperfleet_api_resource_pending_deletion_total`
+
+**Type:** Counter
+
+**Description:** Total number of resources that entered the Pending Deletion state (`deleted_time` set).
+
+**Labels:**
+
+| Label | Description | Example Values |
+|-------|-------------|----------------|
+| `resource_type` | Type of resource | `cluster`, `nodepool` |
+| `component` | Component name | `api` |
+| `version` | Application version | `abc123` |
+
+**Example output:**
+
+```text
+hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="cluster",version="abc123"} 42
+hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="nodepool",version="abc123"} 156
+```
+
+#### `hyperfleet_api_resource_pending_deletion_duration_seconds`
+
+**Type:** Histogram
+
+**Description:** Duration from pending deletion (`deleted_time` set) to hard-delete completion in seconds. Observed when a resource is hard-deleted after all adapters report `Finalized=True`.
+
+**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`
+
+**Buckets:** `1s`, `5s`, `10s`, `30s`, `60s`, `120s`, `300s`, `600s`, `1800s`, `3600s`
+
+**Note:** This metric is only populated once the hard-delete flow is active. See the [hard-delete design](https://github.com/openshift-hyperfleet/architecture/blob/main/hyperfleet/components/api-service/hard-delete-design.md) for details.
+
+#### `hyperfleet_api_resource_pending_deletion_stuck`
+
+**Type:** Gauge (Collector)
+
+**Description:** Number of resources in Pending Deletion state beyond the stuck threshold (default 30 minutes). This gauge is computed on each Prometheus scrape by querying the database for resources whose `deleted_time` is older than the threshold.
+
+**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`
+
+**Configuration:** The stuck threshold is configurable via `--metrics-deletion-stuck-threshold` (default `30m`).
+
+**Example output:**
+
+```text
+hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="cluster",version="abc123"} 2
+hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="nodepool",version="abc123"} 0
+```
+
+### Deletion Alerts
+
+Two alerts are available via the PrometheusRule (requires `prometheusRule.enabled=true` in Helm values). Total time before an alert fires is roughly the stuck threshold plus the alert's `for` delay:
+
+| Alert | Severity | Condition | Description |
+|-------|----------|-----------|-------------|
+| `HyperFleetResourceDeletionStuckWarning` | Warning | `resource_pending_deletion_stuck > 0` for 30m | Resources stuck in Pending Deletion for ~1 hour total (30m threshold + 30m delay) |
+| `HyperFleetResourceDeletionStuckCritical` | Critical | `resource_pending_deletion_stuck > 0` for 2h | Resources stuck in Pending Deletion for ~2.5 hours total (30m threshold + 2h delay) |
+
 ## Go Runtime Metrics
 
 The following metrics are automatically exposed by the Prometheus Go client library.
@@ -255,6 +318,26 @@ rate(process_cpu_seconds_total[5m]) process_open_fds / process_max_fds * 100 ``` +### Deletion Observability + +```promql +# Resources entering Pending Deletion state per minute +sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_total[5m])) * 60 + +# Resources currently stuck in Pending Deletion state +hyperfleet_api_resource_pending_deletion_stuck + +# Stuck resources by type +sum by (resource_type) (hyperfleet_api_resource_pending_deletion_stuck) + +# Average pending deletion duration (once hard-delete is active) +sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_sum[5m])) / +sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_count[5m])) + +# P99 pending deletion duration +histogram_quantile(0.99, sum by (le, resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_bucket[5m]))) +``` + ### Common Investigation Queries ```promql diff --git a/pkg/config/flags.go b/pkg/config/flags.go index d7a76004..794ce807 100644 --- a/pkg/config/flags.go +++ b/pkg/config/flags.go @@ -79,6 +79,8 @@ func AddMetricsFlags(cmd *cobra.Command) { cmd.Flags().String("metrics-tls-key-file", defaults.TLS.KeyFile, "Path to TLS key file for metrics") cmd.Flags().Duration("metrics-label-metrics-inclusion-duration", defaults.LabelMetricsInclusionDuration, "Duration for cluster telemetry label inclusion") + cmd.Flags().Duration("metrics-deletion-stuck-threshold", defaults.DeletionStuckThreshold, + "Duration after which a pending deletion resource is considered stuck") } // AddHealthFlags adds health check configuration flags following standard naming diff --git a/pkg/config/loader.go b/pkg/config/loader.go index f2b4066a..d768d438 100644 --- a/pkg/config/loader.go +++ b/pkg/config/loader.go @@ -185,6 +185,9 @@ func (l *ConfigLoader) validateConfig(config *ApplicationConfig) error { if valErr := config.Metrics.TLS.Validate(); valErr != nil { return fmt.Errorf("metrics TLS validation failed: %w", valErr) } + if valErr := config.Metrics.Validate(); valErr != nil { + return fmt.Errorf("metrics config validation failed: %w", valErr) + } return nil } @@ -345,6 +348,7 @@ func (l *ConfigLoader) bindAllEnvVars() { l.bindEnv("metrics.port") l.bindEnv("metrics.tls.enabled") l.bindEnv("metrics.label_metrics_inclusion_duration") + l.bindEnv("metrics.deletion_stuck_threshold") // Health config l.bindEnv("health.host") @@ -411,6 +415,8 @@ func (l *ConfigLoader) bindFlags(cmd *cobra.Command) { l.bindPFlag("metrics.tls.key_file", cmd.Flags().Lookup("metrics-tls-key-file")) l.bindPFlag("metrics.label_metrics_inclusion_duration", cmd.Flags().Lookup("metrics-label-metrics-inclusion-duration")) + l.bindPFlag("metrics.deletion_stuck_threshold", + cmd.Flags().Lookup("metrics-deletion-stuck-threshold")) // Health flags: --health-* -> health.* l.bindPFlag("health.host", cmd.Flags().Lookup("health-host")) diff --git a/pkg/config/metrics.go b/pkg/config/metrics.go index 79114fb2..1ee67ea4 100755 --- a/pkg/config/metrics.go +++ b/pkg/config/metrics.go @@ -1,6 +1,7 @@ package config import ( + "fmt" "net" "strconv" "time" @@ -13,6 +14,7 @@ type MetricsConfig struct { TLS TLSConfig `mapstructure:"tls" json:"tls" validate:"required"` Port int `mapstructure:"port" json:"port" validate:"required,min=1,max=65535"` LabelMetricsInclusionDuration time.Duration `mapstructure:"label_metrics_inclusion_duration" json:"label_metrics_inclusion_duration" validate:"required"` //nolint:lll + DeletionStuckThreshold time.Duration 
`mapstructure:"deletion_stuck_threshold" json:"deletion_stuck_threshold" validate:"required"` //nolint:lll } // NewMetricsConfig returns default MetricsConfig values @@ -25,9 +27,18 @@ func NewMetricsConfig() *MetricsConfig { Enabled: false, }, LabelMetricsInclusionDuration: 168 * time.Hour, // 7 days + DeletionStuckThreshold: 30 * time.Minute, } } +// Validate validates MetricsConfig fields that struct tags cannot enforce +func (m *MetricsConfig) Validate() error { + if m.DeletionStuckThreshold <= 0 { + return fmt.Errorf("DeletionStuckThreshold must be positive, got %v", m.DeletionStuckThreshold) + } + return nil +} + // ============================================================ // Convenience Accessor Methods // ============================================================ diff --git a/pkg/db/migrations/202604290001_add_deleted_time_indexes.go b/pkg/db/migrations/202604290001_add_deleted_time_indexes.go new file mode 100644 index 00000000..140ccf6a --- /dev/null +++ b/pkg/db/migrations/202604290001_add_deleted_time_indexes.go @@ -0,0 +1,25 @@ +package migrations + +import ( + "gorm.io/gorm" + + "github.com/go-gormigrate/gormigrate/v2" +) + +func addDeletedTimeIndexes() *gormigrate.Migration { + return &gormigrate.Migration{ + ID: "202604290001", + Migrate: func(tx *gorm.DB) error { + // Partial indexes for metrics collector queries: + // SELECT COUNT(*) FROM clusters WHERE deleted_time IS NOT NULL AND deleted_time < $1 + // SELECT COUNT(*) FROM node_pools WHERE deleted_time IS NOT NULL AND deleted_time < $1 + if err := tx.Exec("CREATE INDEX IF NOT EXISTS idx_clusters_deleted_time ON clusters(deleted_time) WHERE deleted_time IS NOT NULL;").Error; err != nil { //nolint:lll + return err + } + if err := tx.Exec("CREATE INDEX IF NOT EXISTS idx_node_pools_deleted_time ON node_pools(deleted_time) WHERE deleted_time IS NOT NULL;").Error; err != nil { //nolint:lll + return err + } + return nil + }, + } +} diff --git a/pkg/db/migrations/migration_structs.go b/pkg/db/migrations/migration_structs.go index d6bff2e1..f402c8c0 100755 --- a/pkg/db/migrations/migration_structs.go +++ b/pkg/db/migrations/migration_structs.go @@ -35,6 +35,7 @@ var MigrationList = []*gormigrate.Migration{ addSoftDeleteSchema(), addNodePoolOwnerDeletedIndex(), addReconciledIndex(), + addDeletedTimeIndexes(), } // Model represents the base model struct. All entities will have this struct embedded. diff --git a/pkg/metrics/deletion.go b/pkg/metrics/deletion.go new file mode 100644 index 00000000..40f0d790 --- /dev/null +++ b/pkg/metrics/deletion.go @@ -0,0 +1,172 @@ +/* +Copyright (c) 2026 Red Hat, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package metrics
+
+import (
+	"context"
+	"database/sql"
+	"log/slog"
+	"sync"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/api"
+)
+
+const metricsSubsystem = "hyperfleet_api"
+
+const (
+	labelResourceType = "resource_type"
+	labelComponent    = "component"
+	labelVersion      = "version"
+)
+
+const componentValue = "api"
+
+var deletionLabels = []string{labelResourceType, labelComponent, labelVersion}
+
+var pendingDeletionDurationBuckets = []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600}
+
+var pendingDeletionTotal = prometheus.NewCounterVec(
+	prometheus.CounterOpts{
+		Subsystem: metricsSubsystem,
+		Name:      "resource_pending_deletion_total",
+		Help:      "Total number of resources that entered the Pending Deletion state (deleted_time set).",
+	},
+	deletionLabels,
+)
+
+var pendingDeletionDuration = prometheus.NewHistogramVec(
+	prometheus.HistogramOpts{
+		Subsystem: metricsSubsystem,
+		Name:      "resource_pending_deletion_duration_seconds",
+		Help:      "Duration from pending deletion (deleted_time set) to hard-delete completion in seconds.",
+		Buckets:   pendingDeletionDurationBuckets,
+	},
+	deletionLabels,
+)
+
+var registerOnce sync.Once
+
+// RegisterMetrics registers the deletion metrics with the default Prometheus
+// registry. It is safe to call multiple times.
+func RegisterMetrics() {
+	registerOnce.Do(func() {
+		prometheus.MustRegister(pendingDeletionTotal)
+		prometheus.MustRegister(pendingDeletionDuration)
+	})
+}
+
+func init() {
+	RegisterMetrics()
+}
+
+// RecordPendingDeletion increments the pending deletion counter for the given
+// resource type.
+func RecordPendingDeletion(resourceType string) {
+	pendingDeletionTotal.With(prometheus.Labels{
+		labelResourceType: resourceType,
+		labelComponent:    componentValue,
+		labelVersion:      api.Version,
+	}).Inc()
+}
+
+// ObservePendingDeletionDuration records how long a resource spent in Pending
+// Deletion, measured from deletedAt to now.
+func ObservePendingDeletionDuration(resourceType string, deletedAt time.Time) {
+	duration := time.Since(deletedAt).Seconds()
+	pendingDeletionDuration.With(prometheus.Labels{
+		labelResourceType: resourceType,
+		labelComponent:    componentValue,
+		labelVersion:      api.Version,
+	}).Observe(duration)
+}
+
+// ResetMetrics resets all deletion metric vectors.
+func ResetMetrics() {
+	pendingDeletionTotal.Reset()
+	pendingDeletionDuration.Reset()
+}
+
+const defaultQueryTimeout = 5 * time.Second
+
+// PendingDeletionCollector implements prometheus.Collector to report the number of
+// resources stuck in Pending Deletion state beyond a configurable threshold.
+// It queries the database on each Prometheus scrape.
+type PendingDeletionCollector struct {
+	stuckDesc      *prometheus.Desc
+	db             *sql.DB
+	stuckThreshold time.Duration
+	queryTimeout   time.Duration
+}
+
+// NewPendingDeletionCollector builds a collector that counts resources whose
+// deleted_time is older than stuckThreshold at scrape time.
+func NewPendingDeletionCollector(db *sql.DB, stuckThreshold time.Duration) *PendingDeletionCollector {
+	return &PendingDeletionCollector{
+		db:             db,
+		stuckThreshold: stuckThreshold,
+		queryTimeout:   defaultQueryTimeout,
+		stuckDesc: prometheus.NewDesc(
+			metricsSubsystem+"_resource_pending_deletion_stuck",
+			"Number of resources in Pending Deletion state beyond the stuck threshold.",
+			[]string{labelResourceType},
+			prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version},
+		),
+	}
+}
+
+// Describe implements prometheus.Collector.
+func (c *PendingDeletionCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- c.stuckDesc
+}
+
+// stuckQueries maps resource types to their pre-built SQL queries.
+// Table names are compile-time constants; no user input in SQL strings.
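+// Each table listed here is backed by a partial index on deleted_time
+// (migration 202604290001), which keeps these scrape-time COUNT queries cheap.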
+var stuckQueries = []struct {
+	query        string
+	resourceType string
+}{
+	{"SELECT COUNT(*) FROM clusters WHERE deleted_time IS NOT NULL AND deleted_time < $1", "cluster"},
+	{"SELECT COUNT(*) FROM node_pools WHERE deleted_time IS NOT NULL AND deleted_time < $1", "nodepool"},
+}
+
+// Collect implements prometheus.Collector. It runs one COUNT query per
+// resource type and emits a gauge sample for each; a failed query is logged
+// and skipped so one bad table cannot block the whole scrape.
+func (c *PendingDeletionCollector) Collect(ch chan<- prometheus.Metric) {
+	if c == nil || c.db == nil {
+		return
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), c.queryTimeout)
+	defer cancel()
+
+	threshold := time.Now().UTC().Add(-c.stuckThreshold)
+
+	for _, q := range stuckQueries {
+		var count int64
+		row := c.db.QueryRowContext(ctx, q.query, threshold) //nolint:gosec // table names are compile-time constants
+		if err := row.Scan(&count); err != nil {
+			slog.Error("Failed to query pending deletion resources",
+				"resource_type", q.resourceType,
+				"error", err,
+			)
+			continue
+		}
+
+		ch <- prometheus.MustNewConstMetric(
+			c.stuckDesc,
+			prometheus.GaugeValue,
+			float64(count),
+			q.resourceType,
+		)
+	}
+}
+
+// RegisterCollector registers a PendingDeletionCollector with the default
+// Prometheus registry.
+func RegisterCollector(db *sql.DB, stuckThreshold time.Duration) error {
+	collector := NewPendingDeletionCollector(db, stuckThreshold)
+	return prometheus.Register(collector)
+}
diff --git a/pkg/metrics/deletion_test.go b/pkg/metrics/deletion_test.go
new file mode 100644
index 00000000..2fac3ca3
--- /dev/null
+++ b/pkg/metrics/deletion_test.go
@@ -0,0 +1,267 @@
+/*
+Copyright (c) 2026 Red Hat, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"testing"
+	"time"
+
+	.
"github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" +) + +const ( + testPendingDeletionTotalMetric = "hyperfleet_api_resource_pending_deletion_total" + testPendingDeletionDurationMetric = "hyperfleet_api_resource_pending_deletion_duration_seconds" + testPendingDeletionStuckMetric = "hyperfleet_api_resource_pending_deletion_stuck" + testResourceCluster = "cluster" + testResourceNodepool = "nodepool" +) + +func TestMetricsSubsystem(t *testing.T) { + RegisterTestingT(t) + Expect(metricsSubsystem).To(Equal("hyperfleet_api")) +} + +func TestPendingDeletionTotalIsRegistered(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + RecordPendingDeletion(testResourceCluster) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionTotalMetric { + found = true + Expect(mf.GetType()).To(Equal(dto.MetricType_COUNTER)) + break + } + } + Expect(found).To(BeTrue(), testPendingDeletionTotalMetric+" metric should be registered") +} + +func TestRecordPendingDeletion_IncrementsCounter(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + RecordPendingDeletion(testResourceCluster) + RecordPendingDeletion(testResourceCluster) + RecordPendingDeletion(testResourceNodepool) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var clusterCount, nodepoolCount float64 + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionTotalMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster { + clusterCount = metric.GetCounter().GetValue() + } + if labels["resource_type"] == testResourceNodepool { + nodepoolCount = metric.GetCounter().GetValue() + } + } + break + } + } + Expect(clusterCount).To(Equal(2.0)) + Expect(nodepoolCount).To(Equal(1.0)) +} + +func TestRecordPendingDeletion_Labels(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + RecordPendingDeletion(testResourceCluster) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionTotalMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster { + found = true + Expect(labels["component"]).To(Equal("api")) + Expect(labels["version"]).To(Equal(api.Version)) + } + } + break + } + } + Expect(found).To(BeTrue(), "pending deletion total metric with expected labels should exist") +} + +func TestPendingDeletionDurationIsRegistered(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-5*time.Second)) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionDurationMetric { + found = true + Expect(mf.GetType()).To(Equal(dto.MetricType_HISTOGRAM)) + break + } + } + Expect(found).To(BeTrue(), testPendingDeletionDurationMetric+" metric should be registered") +} + +func TestObservePendingDeletionDuration_RecordsValue(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + deletedAt := time.Now().Add(-10 * time.Second) + ObservePendingDeletionDuration(testResourceCluster, 
deletedAt) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionDurationMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster { + found = true + Expect(metric.GetHistogram().GetSampleCount()).To(BeEquivalentTo(1)) + Expect(metric.GetHistogram().GetSampleSum()).To(BeNumerically(">=", 10.0)) + } + } + break + } + } + Expect(found).To(BeTrue(), "pending deletion duration metric with cluster label should exist") +} + +func TestPendingDeletionDurationBuckets(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + expectedBuckets := []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600} + + ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-1*time.Second)) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionDurationMetric { + found = true + for _, metric := range mf.GetMetric() { + histogram := metric.GetHistogram() + buckets := histogram.GetBucket() + Expect(buckets).To(HaveLen(len(expectedBuckets))) + for i, b := range buckets { + Expect(b.GetUpperBound()).To(Equal(expectedBuckets[i])) + } + } + break + } + } + Expect(found).To(BeTrue(), testPendingDeletionDurationMetric+" metric should be registered") +} + +func TestResetMetrics_ClearsAllDeletionMetrics(t *testing.T) { + RegisterTestingT(t) + + RecordPendingDeletion(testResourceCluster) + ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-5*time.Second)) + + ResetMetrics() + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionTotalMetric { + Expect(mf.GetMetric()).To(BeEmpty(), "pending deletion total should be empty after reset") + } + if mf.GetName() == testPendingDeletionDurationMetric { + Expect(mf.GetMetric()).To(BeEmpty(), "pending deletion duration should be empty after reset") + } + } +} + +func TestPendingDeletionCollector_Describe(t *testing.T) { + RegisterTestingT(t) + + collector := NewPendingDeletionCollector(nil, 30*time.Minute) + ch := make(chan *prometheus.Desc, 10) + collector.Describe(ch) + close(ch) + + var descs []*prometheus.Desc + for desc := range ch { + descs = append(descs, desc) + } + + Expect(descs).To(HaveLen(1)) + Expect(descs[0].String()).To(ContainSubstring("resource_pending_deletion_stuck")) +} + +func TestPendingDeletionCollector_CollectWithNilDB(t *testing.T) { + RegisterTestingT(t) + + collector := NewPendingDeletionCollector(nil, 30*time.Minute) + ch := make(chan prometheus.Metric, 10) + collector.Collect(ch) + close(ch) + + var collectedMetrics []prometheus.Metric + for m := range ch { + collectedMetrics = append(collectedMetrics, m) + } + + Expect(collectedMetrics).To(BeEmpty()) +} + +func TestStuckDescriptor(t *testing.T) { + RegisterTestingT(t) + + collector := NewPendingDeletionCollector(nil, 30*time.Minute) + descStr := collector.stuckDesc.String() + + Expect(descStr).To(ContainSubstring("hyperfleet_api_resource_pending_deletion_stuck")) + Expect(descStr).To(ContainSubstring("resource_type")) + Expect(descStr).To(ContainSubstring("component")) + Expect(descStr).To(ContainSubstring("version")) +} + +func labelsToMap(metric *dto.Metric) map[string]string { + labels := make(map[string]string) + for _, lp := range 
metric.GetLabel() {
+		labels[lp.GetName()] = lp.GetValue()
+	}
+	return labels
+}
diff --git a/pkg/services/cluster.go b/pkg/services/cluster.go
index 856c87aa..cac1a3b9 100644
--- a/pkg/services/cluster.go
+++ b/pkg/services/cluster.go
@@ -13,6 +13,7 @@ import (
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
 	"gorm.io/gorm"
 )
 
@@ -128,20 +129,26 @@ func (s *sqlClusterService) SoftDelete(ctx context.Context, id string) (*api.Clu
 		return nil, handleSoftDeleteError("Cluster", saveErr)
 	}
 
+	metrics.RecordPendingDeletion("cluster")
+
 	if cascadeErr := s.nodePoolDao.SoftDeleteByOwner(ctx, id, t, deletedBy); cascadeErr != nil {
 		return nil, handleSoftDeleteError("NodePool", cascadeErr)
 	}
 
+	nodePools, err := s.nodePoolDao.FindSoftDeletedByOwner(ctx, id) // fetched once; reused below for the status batch update
+	if err != nil {
+		return nil, errors.GeneralError("Failed to fetch cascade-deleted nodepools: %s", err)
+	}
+
+	for range nodePools { // one increment per soft-deleted nodepool under this owner
+		metrics.RecordPendingDeletion("nodepool")
+	}
+
 	cluster, svcErr := s.UpdateClusterStatusFromAdapters(ctx, cluster.ID)
 	if svcErr != nil {
 		return nil, svcErr
 	}
 
-	// Update status for all cascade-deleted nodepools so their Ready condition reflects the generation bump.
-	nodePools, err := s.nodePoolDao.FindSoftDeletedByOwner(ctx, id)
-	if err != nil {
-		return nil, errors.GeneralError("Failed to fetch cascade-deleted nodepools: %s", err)
-	}
+
 	if svcErr := batchUpdateNodePoolStatusesFromAdapters(
 		ctx,
 		nodePools,
diff --git a/pkg/services/node_pool.go b/pkg/services/node_pool.go
index 14438591..218f53fa 100644
--- a/pkg/services/node_pool.go
+++ b/pkg/services/node_pool.go
@@ -12,6 +12,7 @@ import (
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
 	"gorm.io/gorm"
 )
 
@@ -119,6 +120,8 @@ func (s *sqlNodePoolService) SoftDelete(ctx context.Context, id string) (*api.No
 		return nil, handleSoftDeleteError("NodePool", err)
 	}
 
+	metrics.RecordPendingDeletion("nodepool")
+
 	nodePool, svcErr := s.UpdateNodePoolStatusFromAdapters(ctx, nodePool.ID)
 	if svcErr != nil {
 		return nil, svcErr
diff --git a/test/integration/deletion_metrics_test.go b/test/integration/deletion_metrics_test.go
new file mode 100644
index 00000000..49a8db7c
--- /dev/null
+++ b/test/integration/deletion_metrics_test.go
@@ -0,0 +1,93 @@
+package integration
+
+import (
+	"net/http"
+	"testing"
+	"time"
+
+	.
"github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api/openapi" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" + "github.com/openshift-hyperfleet/hyperfleet-api/test" +) + +func TestPendingDeletionCollector_Integration(t *testing.T) { + t.Run("given soft-deleted resources older than threshold, collector reports them as stuck", func(t *testing.T) { + RegisterTestingT(t) + h, client := test.RegisterIntegration(t) + account := h.NewRandAccount() + ctx := h.NewAuthenticatedContext(account) + + cluster, err := h.Factories.NewClusters(h.NewID()) + Expect(err).NotTo(HaveOccurred()) + npResp, err := client.CreateNodePoolWithResponse( + ctx, cluster.ID, openapi.CreateNodePoolJSONRequestBody(newNodePoolInput("stuck-np")), + test.WithAuthToken(ctx), + ) + Expect(err).NotTo(HaveOccurred()) + Expect(npResp.StatusCode()).To(Equal(http.StatusCreated)) + + _, err = client.DeleteClusterByIdWithResponse(ctx, cluster.ID, test.WithAuthToken(ctx)) + Expect(err).NotTo(HaveOccurred()) + + // Backdate deleted_time to 1 hour ago so resources exceed the 30m threshold + db := h.DBFactory.New(ctx) + pastTime := time.Now().UTC().Add(-1 * time.Hour) + Expect(db.Exec("UPDATE clusters SET deleted_time = ? WHERE id = ?", pastTime, cluster.ID).Error).NotTo(HaveOccurred()) + err = db.Exec("UPDATE node_pools SET deleted_time = ? WHERE owner_id = ?", pastTime, cluster.ID).Error + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewPendingDeletionCollector(rawDB, 30*time.Minute) + + collected := collectStuckMetrics(t, collector) + + Expect(collected["cluster"]).To(BeNumerically(">=", 1), "should report at least 1 stuck cluster") + Expect(collected["nodepool"]).To(BeNumerically(">=", 1), "should report at least 1 stuck nodepool") + }) + + t.Run("given soft-deleted resources within threshold, collector reports zero stuck", func(t *testing.T) { + RegisterTestingT(t) + h, client := test.RegisterIntegration(t) + account := h.NewRandAccount() + ctx := h.NewAuthenticatedContext(account) + + cluster, err := h.Factories.NewClusters(h.NewID()) + Expect(err).NotTo(HaveOccurred()) + + _, err = client.DeleteClusterByIdWithResponse(ctx, cluster.ID, test.WithAuthToken(ctx)) + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + // Use a very long threshold so the just-deleted cluster is NOT stuck + collector := metrics.NewPendingDeletionCollector(rawDB, 24*time.Hour) + + collected := collectStuckMetrics(t, collector) + + Expect(collected["cluster"]).To(Equal(0.0), "recently deleted cluster should not be stuck") + }) +} + +func collectStuckMetrics(t *testing.T, collector *metrics.PendingDeletionCollector) map[string]float64 { + t.Helper() + RegisterTestingT(t) + + ch := make(chan prometheus.Metric, 10) + collector.Collect(ch) + close(ch) + + result := make(map[string]float64) + for m := range ch { + pb := &dto.Metric{} + Expect(m.Write(pb)).To(Succeed()) + for _, lp := range pb.GetLabel() { + if lp.GetName() == "resource_type" { + result[lp.GetValue()] = pb.GetGauge().GetValue() + } + } + } + return result +}