From 429af793baeb4f5dcb4de1ff47d111590ef8c41d Mon Sep 17 00:00:00 2001
From: Rafael Benevides
Date: Wed, 29 Apr 2026 10:58:23 -0300
Subject: [PATCH] HYPERFLEET-856 - feat: add deletion observability metrics and alerts

Add Prometheus metrics to track the lifecycle of resource deletion:

- pending_deletion_total counter: tracks resources entering Pending Deletion
- pending_deletion_duration_seconds histogram: measures time from soft-delete to hard-delete
- pending_deletion_stuck gauge (collector): reports resources stuck beyond the configured threshold

Includes PrometheusRule alerts (warning fires after ~1h total, critical after
~2.5h total), partial indexes on deleted_time for efficient collector queries,
and integration tests for the full metrics pipeline.
---
 charts/templates/prometheusrule.yaml          |  40 +++
 charts/values.yaml                            |  14 +
 cmd/hyperfleet-api/servecmd/cmd.go            |  10 +
 .../server/metrics_middleware.go              |   2 +
 docs/metrics.md                               |  83 ++++++
 pkg/config/flags.go                           |   2 +
 pkg/config/loader.go                          |   6 +
 pkg/config/metrics.go                         |  11 +
 .../202604290001_add_deleted_time_indexes.go  |  25 ++
 pkg/db/migrations/migration_structs.go        |   1 +
 pkg/metrics/deletion.go                       | 172 +++++++++++
 pkg/metrics/deletion_test.go                  | 267 ++++++++++++++++++
 pkg/services/cluster.go                       |  17 +-
 pkg/services/node_pool.go                     |   3 +
 test/integration/deletion_metrics_test.go     |  93 ++++++
 15 files changed, 741 insertions(+), 5 deletions(-)
 create mode 100644 charts/templates/prometheusrule.yaml
 create mode 100644 pkg/db/migrations/202604290001_add_deleted_time_indexes.go
 create mode 100644 pkg/metrics/deletion.go
 create mode 100644 pkg/metrics/deletion_test.go
 create mode 100644 test/integration/deletion_metrics_test.go

diff --git a/charts/templates/prometheusrule.yaml b/charts/templates/prometheusrule.yaml
new file mode 100644
index 00000000..000c44d6
--- /dev/null
+++ b/charts/templates/prometheusrule.yaml
@@ -0,0 +1,40 @@
+{{- if .Values.prometheusRule.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ include "hyperfleet-api.fullname" . }}
+  namespace: {{ .Values.prometheusRule.namespace | default .Release.Namespace }}
+  labels:
+    {{- include "hyperfleet-api.labels" . | nindent 4 }}
+    {{- with .Values.prometheusRule.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  groups:
+    - name: hyperfleet-api-deletion
+      rules:
+        - alert: HyperFleetResourceDeletionStuckWarning
+          expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0
+          for: {{ .Values.prometheusRule.rules.deletionStuck.for | default "30m" }}
+          labels:
+            severity: warning
+          annotations:
+            summary: "HyperFleet resources stuck in Pending Deletion state"
+            description: >-
+              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
+              Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }}
+              (stuck threshold) + {{ .Values.prometheusRule.rules.deletionStuck.for | default "30m" }} (alert delay).
+ runbook_url: {{ .Values.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }} + - alert: HyperFleetResourceDeletionStuckCritical + expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0 + for: {{ .Values.prometheusRule.rules.deletionTimeout.for | default "2h" }} + labels: + severity: critical + annotations: + summary: "HyperFleet resources timed out in Pending Deletion state" + description: >- + {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in + Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }} + (stuck threshold) + {{ .Values.prometheusRule.rules.deletionTimeout.for | default "2h" }} (alert delay). Immediate investigation required. + runbook_url: {{ .Values.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }} +{{- end }} diff --git a/charts/values.yaml b/charts/values.yaml index de8f9139..b5a95a04 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -126,6 +126,7 @@ config: enabled: false label_metrics_inclusion_duration: 168h + deletion_stuck_threshold: 30m # Health check configuration health: @@ -243,6 +244,19 @@ database: size: 1Gi storageClass: "" +# PrometheusRule for alerting +prometheusRule: + enabled: false + labels: {} + namespace: "" + rules: + deletionStuck: + for: "30m" + runbookUrl: "" + deletionTimeout: + for: "2h" + runbookUrl: "" + # ServiceMonitor for Prometheus Operator serviceMonitor: enabled: false diff --git a/cmd/hyperfleet-api/servecmd/cmd.go b/cmd/hyperfleet-api/servecmd/cmd.go index 61beb0ef..040bcc54 100755 --- a/cmd/hyperfleet-api/servecmd/cmd.go +++ b/cmd/hyperfleet-api/servecmd/cmd.go @@ -18,6 +18,7 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_session" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/health" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/telemetry" ) @@ -129,6 +130,15 @@ func runServe(cmd *cobra.Command, args []string) { "masking_enabled", environments.Environment().Config.Logging.Masking.Enabled, ).Info("Logger initialized") + if sf := environments.Environment().Database.SessionFactory; sf != nil { + if err := metrics.RegisterCollector( + sf.DirectDB(), + environments.Environment().Config.Metrics.DeletionStuckThreshold, + ); err != nil { + logger.WithError(ctx, err).Error("Failed to register pending deletion collector") + } + } + apiServer := server.NewAPIServer(tracingEnabled) go apiServer.Start() diff --git a/cmd/hyperfleet-api/server/metrics_middleware.go b/cmd/hyperfleet-api/server/metrics_middleware.go index dbfe2697..54ac053a 100755 --- a/cmd/hyperfleet-api/server/metrics_middleware.go +++ b/cmd/hyperfleet-api/server/metrics_middleware.go @@ -62,6 +62,7 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_metrics" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" ) // MetricsMiddleware creates a new handler that collects metrics for the requests processed by the @@ -112,6 +113,7 @@ func ResetMetricCollectors() { requestCountMetric.Reset() requestDurationMetric.Reset() db_metrics.ResetMetrics() + metrics.ResetMetrics() buildInfoMetric.Reset() buildInfoMetric.With(prometheus.Labels{ metricsComponentLabel: metricsComponentValue, diff --git a/docs/metrics.md b/docs/metrics.md index e6607e0b..82d00736 100644 --- a/docs/metrics.md +++ 
b/docs/metrics.md
@@ -99,6 +99,69 @@ hyperfleet_api_request_duration_seconds_sum{component="api",version="abc123",cod
 hyperfleet_api_request_duration_seconds_count{component="api",version="abc123",code="200",method="GET",path="/api/hyperfleet/v1/clusters"} 1523
 ```
 
+### Deletion Observability Metrics
+
+These metrics track resources in the Pending Deletion state (`deleted_time` set, awaiting hard-delete once all adapters report `Finalized=True`).
+
+#### `hyperfleet_api_resource_pending_deletion_total`
+
+**Type:** Counter
+
+**Description:** Total number of resources that entered the Pending Deletion state (`deleted_time` set).
+
+**Labels:**
+
+| Label | Description | Example Values |
+|-------|-------------|----------------|
+| `resource_type` | Type of resource | `cluster`, `nodepool` |
+| `component` | Component name | `api` |
+| `version` | Application version | `abc123` |
+
+**Example output:**
+
+```text
+hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="cluster",version="abc123"} 42
+hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="nodepool",version="abc123"} 156
+```
+
+#### `hyperfleet_api_resource_pending_deletion_duration_seconds`
+
+**Type:** Histogram
+
+**Description:** Duration from pending deletion (`deleted_time` set) to hard-delete completion in seconds. Observed when a resource is hard-deleted after all adapters report `Finalized=True`.
+
+**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`
+
+**Buckets:** `1s`, `5s`, `10s`, `30s`, `60s`, `120s`, `300s`, `600s`, `1800s`, `3600s`
+
+**Note:** This metric is only populated once the hard-delete flow is active. See the [hard-delete design](https://github.com/openshift-hyperfleet/architecture/blob/main/hyperfleet/components/api-service/hard-delete-design.md) for details.
+
+#### `hyperfleet_api_resource_pending_deletion_stuck`
+
+**Type:** Gauge (Collector)
+
+**Description:** Number of resources in Pending Deletion state beyond the stuck threshold (default 30 minutes). This gauge is computed on each Prometheus scrape by querying the database for resources whose `deleted_time` is older than the threshold.
+
+**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`
+
+**Configuration:** The stuck threshold is configurable via `--metrics-deletion-stuck-threshold` (default `30m`).
+
+**Example output:**
+
+```text
+hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="cluster",version="abc123"} 2
+hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="nodepool",version="abc123"} 0
+```
+
+### Deletion Alerts
+
+Two alerts are available via the PrometheusRule (requires `prometheusRule.enabled=true` in Helm values). Total time before an alert fires is roughly the stuck threshold plus the alert's `for` delay:
+
+| Alert | Severity | Condition | Description |
+|-------|----------|-----------|-------------|
+| `HyperFleetResourceDeletionStuckWarning` | Warning | `resource_pending_deletion_stuck > 0` for 30m | Resources stuck in Pending Deletion for ~1 hour total (30m threshold + 30m delay) |
+| `HyperFleetResourceDeletionStuckCritical` | Critical | `resource_pending_deletion_stuck > 0` for 2h | Resources stuck in Pending Deletion for ~2.5 hours total (30m threshold + 2h delay) |
+
 ## Go Runtime Metrics
 
 The following metrics are automatically exposed by the Prometheus Go client library.
@@ -255,6 +318,26 @@ rate(process_cpu_seconds_total[5m]) process_open_fds / process_max_fds * 100 ``` +### Deletion Observability + +```promql +# Resources entering Pending Deletion state per minute +sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_total[5m])) * 60 + +# Resources currently stuck in Pending Deletion state +hyperfleet_api_resource_pending_deletion_stuck + +# Stuck resources by type +sum by (resource_type) (hyperfleet_api_resource_pending_deletion_stuck) + +# Average pending deletion duration (once hard-delete is active) +sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_sum[5m])) / +sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_count[5m])) + +# P99 pending deletion duration +histogram_quantile(0.99, sum by (le, resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_bucket[5m]))) +``` + ### Common Investigation Queries ```promql diff --git a/pkg/config/flags.go b/pkg/config/flags.go index d7a76004..794ce807 100644 --- a/pkg/config/flags.go +++ b/pkg/config/flags.go @@ -79,6 +79,8 @@ func AddMetricsFlags(cmd *cobra.Command) { cmd.Flags().String("metrics-tls-key-file", defaults.TLS.KeyFile, "Path to TLS key file for metrics") cmd.Flags().Duration("metrics-label-metrics-inclusion-duration", defaults.LabelMetricsInclusionDuration, "Duration for cluster telemetry label inclusion") + cmd.Flags().Duration("metrics-deletion-stuck-threshold", defaults.DeletionStuckThreshold, + "Duration after which a pending deletion resource is considered stuck") } // AddHealthFlags adds health check configuration flags following standard naming diff --git a/pkg/config/loader.go b/pkg/config/loader.go index f2b4066a..d768d438 100644 --- a/pkg/config/loader.go +++ b/pkg/config/loader.go @@ -185,6 +185,9 @@ func (l *ConfigLoader) validateConfig(config *ApplicationConfig) error { if valErr := config.Metrics.TLS.Validate(); valErr != nil { return fmt.Errorf("metrics TLS validation failed: %w", valErr) } + if valErr := config.Metrics.Validate(); valErr != nil { + return fmt.Errorf("metrics config validation failed: %w", valErr) + } return nil } @@ -345,6 +348,7 @@ func (l *ConfigLoader) bindAllEnvVars() { l.bindEnv("metrics.port") l.bindEnv("metrics.tls.enabled") l.bindEnv("metrics.label_metrics_inclusion_duration") + l.bindEnv("metrics.deletion_stuck_threshold") // Health config l.bindEnv("health.host") @@ -411,6 +415,8 @@ func (l *ConfigLoader) bindFlags(cmd *cobra.Command) { l.bindPFlag("metrics.tls.key_file", cmd.Flags().Lookup("metrics-tls-key-file")) l.bindPFlag("metrics.label_metrics_inclusion_duration", cmd.Flags().Lookup("metrics-label-metrics-inclusion-duration")) + l.bindPFlag("metrics.deletion_stuck_threshold", + cmd.Flags().Lookup("metrics-deletion-stuck-threshold")) // Health flags: --health-* -> health.* l.bindPFlag("health.host", cmd.Flags().Lookup("health-host")) diff --git a/pkg/config/metrics.go b/pkg/config/metrics.go index 79114fb2..1ee67ea4 100755 --- a/pkg/config/metrics.go +++ b/pkg/config/metrics.go @@ -1,6 +1,7 @@ package config import ( + "fmt" "net" "strconv" "time" @@ -13,6 +14,7 @@ type MetricsConfig struct { TLS TLSConfig `mapstructure:"tls" json:"tls" validate:"required"` Port int `mapstructure:"port" json:"port" validate:"required,min=1,max=65535"` LabelMetricsInclusionDuration time.Duration `mapstructure:"label_metrics_inclusion_duration" json:"label_metrics_inclusion_duration" validate:"required"` //nolint:lll + DeletionStuckThreshold time.Duration 
`mapstructure:"deletion_stuck_threshold" json:"deletion_stuck_threshold" validate:"required"` //nolint:lll } // NewMetricsConfig returns default MetricsConfig values @@ -25,9 +27,18 @@ func NewMetricsConfig() *MetricsConfig { Enabled: false, }, LabelMetricsInclusionDuration: 168 * time.Hour, // 7 days + DeletionStuckThreshold: 30 * time.Minute, } } +// Validate validates MetricsConfig fields that struct tags cannot enforce +func (m *MetricsConfig) Validate() error { + if m.DeletionStuckThreshold <= 0 { + return fmt.Errorf("DeletionStuckThreshold must be positive, got %v", m.DeletionStuckThreshold) + } + return nil +} + // ============================================================ // Convenience Accessor Methods // ============================================================ diff --git a/pkg/db/migrations/202604290001_add_deleted_time_indexes.go b/pkg/db/migrations/202604290001_add_deleted_time_indexes.go new file mode 100644 index 00000000..140ccf6a --- /dev/null +++ b/pkg/db/migrations/202604290001_add_deleted_time_indexes.go @@ -0,0 +1,25 @@ +package migrations + +import ( + "gorm.io/gorm" + + "github.com/go-gormigrate/gormigrate/v2" +) + +func addDeletedTimeIndexes() *gormigrate.Migration { + return &gormigrate.Migration{ + ID: "202604290001", + Migrate: func(tx *gorm.DB) error { + // Partial indexes for metrics collector queries: + // SELECT COUNT(*) FROM clusters WHERE deleted_time IS NOT NULL AND deleted_time < $1 + // SELECT COUNT(*) FROM node_pools WHERE deleted_time IS NOT NULL AND deleted_time < $1 + if err := tx.Exec("CREATE INDEX IF NOT EXISTS idx_clusters_deleted_time ON clusters(deleted_time) WHERE deleted_time IS NOT NULL;").Error; err != nil { //nolint:lll + return err + } + if err := tx.Exec("CREATE INDEX IF NOT EXISTS idx_node_pools_deleted_time ON node_pools(deleted_time) WHERE deleted_time IS NOT NULL;").Error; err != nil { //nolint:lll + return err + } + return nil + }, + } +} diff --git a/pkg/db/migrations/migration_structs.go b/pkg/db/migrations/migration_structs.go index d6bff2e1..f402c8c0 100755 --- a/pkg/db/migrations/migration_structs.go +++ b/pkg/db/migrations/migration_structs.go @@ -35,6 +35,7 @@ var MigrationList = []*gormigrate.Migration{ addSoftDeleteSchema(), addNodePoolOwnerDeletedIndex(), addReconciledIndex(), + addDeletedTimeIndexes(), } // Model represents the base model struct. All entities will have this struct embedded. diff --git a/pkg/metrics/deletion.go b/pkg/metrics/deletion.go new file mode 100644 index 00000000..40f0d790 --- /dev/null +++ b/pkg/metrics/deletion.go @@ -0,0 +1,172 @@ +/* +Copyright (c) 2026 Red Hat, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package metrics
+
+import (
+	"context"
+	"database/sql"
+	"log/slog"
+	"sync"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/api"
+)
+
+const metricsSubsystem = "hyperfleet_api"
+
+const (
+	labelResourceType = "resource_type"
+	labelComponent    = "component"
+	labelVersion      = "version"
+)
+
+const componentValue = "api"
+
+var deletionLabels = []string{labelResourceType, labelComponent, labelVersion}
+
+var pendingDeletionDurationBuckets = []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600}
+
+var pendingDeletionTotal = prometheus.NewCounterVec(
+	prometheus.CounterOpts{
+		Subsystem: metricsSubsystem,
+		Name:      "resource_pending_deletion_total",
+		Help:      "Total number of resources that entered the Pending Deletion state (deleted_time set).",
+	},
+	deletionLabels,
+)
+
+var pendingDeletionDuration = prometheus.NewHistogramVec(
+	prometheus.HistogramOpts{
+		Subsystem: metricsSubsystem,
+		Name:      "resource_pending_deletion_duration_seconds",
+		Help:      "Duration from pending deletion (deleted_time set) to hard-delete completion in seconds.",
+		Buckets:   pendingDeletionDurationBuckets,
+	},
+	deletionLabels,
+)
+
+var registerOnce sync.Once
+
+// RegisterMetrics registers the deletion metrics with the default Prometheus
+// registry. It is safe to call multiple times.
+func RegisterMetrics() {
+	registerOnce.Do(func() {
+		prometheus.MustRegister(pendingDeletionTotal)
+		prometheus.MustRegister(pendingDeletionDuration)
+	})
+}
+
+func init() {
+	RegisterMetrics()
+}
+
+// RecordPendingDeletion increments the pending deletion counter for the given
+// resource type.
+func RecordPendingDeletion(resourceType string) {
+	pendingDeletionTotal.With(prometheus.Labels{
+		labelResourceType: resourceType,
+		labelComponent:    componentValue,
+		labelVersion:      api.Version,
+	}).Inc()
+}
+
+// ObservePendingDeletionDuration records how long a resource spent in Pending
+// Deletion, measured from deletedAt to now.
+func ObservePendingDeletionDuration(resourceType string, deletedAt time.Time) {
+	duration := time.Since(deletedAt).Seconds()
+	pendingDeletionDuration.With(prometheus.Labels{
+		labelResourceType: resourceType,
+		labelComponent:    componentValue,
+		labelVersion:      api.Version,
+	}).Observe(duration)
+}
+
+// ResetMetrics resets all deletion metric vectors.
+func ResetMetrics() {
+	pendingDeletionTotal.Reset()
+	pendingDeletionDuration.Reset()
+}
+
+const defaultQueryTimeout = 5 * time.Second
+
+// PendingDeletionCollector implements prometheus.Collector to report the number of
+// resources stuck in Pending Deletion state beyond a configurable threshold.
+// It queries the database on each Prometheus scrape.
+type PendingDeletionCollector struct {
+	stuckDesc      *prometheus.Desc
+	db             *sql.DB
+	stuckThreshold time.Duration
+	queryTimeout   time.Duration
+}
+
+// NewPendingDeletionCollector builds a collector that counts resources whose
+// deleted_time is older than stuckThreshold at scrape time.
+func NewPendingDeletionCollector(db *sql.DB, stuckThreshold time.Duration) *PendingDeletionCollector {
+	return &PendingDeletionCollector{
+		db:             db,
+		stuckThreshold: stuckThreshold,
+		queryTimeout:   defaultQueryTimeout,
+		stuckDesc: prometheus.NewDesc(
+			metricsSubsystem+"_resource_pending_deletion_stuck",
+			"Number of resources in Pending Deletion state beyond the stuck threshold.",
+			[]string{labelResourceType},
+			prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version},
+		),
+	}
+}
+
+// Describe implements prometheus.Collector.
+func (c *PendingDeletionCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- c.stuckDesc
+}
+
+// stuckQueries maps resource types to their pre-built SQL queries.
+// Table names are compile-time constants; no user input in SQL strings.
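+// Each table listed here is backed by a partial index on deleted_time
+// (migration 202604290001), which keeps these scrape-time COUNT queries cheap.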
+var stuckQueries = []struct {
+	query        string
+	resourceType string
+}{
+	{"SELECT COUNT(*) FROM clusters WHERE deleted_time IS NOT NULL AND deleted_time < $1", "cluster"},
+	{"SELECT COUNT(*) FROM node_pools WHERE deleted_time IS NOT NULL AND deleted_time < $1", "nodepool"},
+}
+
+// Collect implements prometheus.Collector. It runs one COUNT query per
+// resource type and emits a gauge sample for each; a failed query is logged
+// and skipped so one bad table cannot block the whole scrape.
+func (c *PendingDeletionCollector) Collect(ch chan<- prometheus.Metric) {
+	if c == nil || c.db == nil {
+		return
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), c.queryTimeout)
+	defer cancel()
+
+	threshold := time.Now().UTC().Add(-c.stuckThreshold)
+
+	for _, q := range stuckQueries {
+		var count int64
+		row := c.db.QueryRowContext(ctx, q.query, threshold) //nolint:gosec // table names are compile-time constants
+		if err := row.Scan(&count); err != nil {
+			slog.Error("Failed to query pending deletion resources",
+				"resource_type", q.resourceType,
+				"error", err,
+			)
+			continue
+		}
+
+		ch <- prometheus.MustNewConstMetric(
+			c.stuckDesc,
+			prometheus.GaugeValue,
+			float64(count),
+			q.resourceType,
+		)
+	}
+}
+
+// RegisterCollector registers a PendingDeletionCollector with the default
+// Prometheus registry.
+func RegisterCollector(db *sql.DB, stuckThreshold time.Duration) error {
+	collector := NewPendingDeletionCollector(db, stuckThreshold)
+	return prometheus.Register(collector)
+}
diff --git a/pkg/metrics/deletion_test.go b/pkg/metrics/deletion_test.go
new file mode 100644
index 00000000..2fac3ca3
--- /dev/null
+++ b/pkg/metrics/deletion_test.go
@@ -0,0 +1,267 @@
+/*
+Copyright (c) 2026 Red Hat, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"testing"
+	"time"
+
+	.
"github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" +) + +const ( + testPendingDeletionTotalMetric = "hyperfleet_api_resource_pending_deletion_total" + testPendingDeletionDurationMetric = "hyperfleet_api_resource_pending_deletion_duration_seconds" + testPendingDeletionStuckMetric = "hyperfleet_api_resource_pending_deletion_stuck" + testResourceCluster = "cluster" + testResourceNodepool = "nodepool" +) + +func TestMetricsSubsystem(t *testing.T) { + RegisterTestingT(t) + Expect(metricsSubsystem).To(Equal("hyperfleet_api")) +} + +func TestPendingDeletionTotalIsRegistered(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + RecordPendingDeletion(testResourceCluster) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionTotalMetric { + found = true + Expect(mf.GetType()).To(Equal(dto.MetricType_COUNTER)) + break + } + } + Expect(found).To(BeTrue(), testPendingDeletionTotalMetric+" metric should be registered") +} + +func TestRecordPendingDeletion_IncrementsCounter(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + RecordPendingDeletion(testResourceCluster) + RecordPendingDeletion(testResourceCluster) + RecordPendingDeletion(testResourceNodepool) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var clusterCount, nodepoolCount float64 + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionTotalMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster { + clusterCount = metric.GetCounter().GetValue() + } + if labels["resource_type"] == testResourceNodepool { + nodepoolCount = metric.GetCounter().GetValue() + } + } + break + } + } + Expect(clusterCount).To(Equal(2.0)) + Expect(nodepoolCount).To(Equal(1.0)) +} + +func TestRecordPendingDeletion_Labels(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + RecordPendingDeletion(testResourceCluster) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionTotalMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster { + found = true + Expect(labels["component"]).To(Equal("api")) + Expect(labels["version"]).To(Equal(api.Version)) + } + } + break + } + } + Expect(found).To(BeTrue(), "pending deletion total metric with expected labels should exist") +} + +func TestPendingDeletionDurationIsRegistered(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-5*time.Second)) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionDurationMetric { + found = true + Expect(mf.GetType()).To(Equal(dto.MetricType_HISTOGRAM)) + break + } + } + Expect(found).To(BeTrue(), testPendingDeletionDurationMetric+" metric should be registered") +} + +func TestObservePendingDeletionDuration_RecordsValue(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + deletedAt := time.Now().Add(-10 * time.Second) + ObservePendingDeletionDuration(testResourceCluster, 
deletedAt) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionDurationMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster { + found = true + Expect(metric.GetHistogram().GetSampleCount()).To(BeEquivalentTo(1)) + Expect(metric.GetHistogram().GetSampleSum()).To(BeNumerically(">=", 10.0)) + } + } + break + } + } + Expect(found).To(BeTrue(), "pending deletion duration metric with cluster label should exist") +} + +func TestPendingDeletionDurationBuckets(t *testing.T) { + RegisterTestingT(t) + ResetMetrics() + + expectedBuckets := []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600} + + ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-1*time.Second)) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionDurationMetric { + found = true + for _, metric := range mf.GetMetric() { + histogram := metric.GetHistogram() + buckets := histogram.GetBucket() + Expect(buckets).To(HaveLen(len(expectedBuckets))) + for i, b := range buckets { + Expect(b.GetUpperBound()).To(Equal(expectedBuckets[i])) + } + } + break + } + } + Expect(found).To(BeTrue(), testPendingDeletionDurationMetric+" metric should be registered") +} + +func TestResetMetrics_ClearsAllDeletionMetrics(t *testing.T) { + RegisterTestingT(t) + + RecordPendingDeletion(testResourceCluster) + ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-5*time.Second)) + + ResetMetrics() + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + for _, mf := range metricFamilies { + if mf.GetName() == testPendingDeletionTotalMetric { + Expect(mf.GetMetric()).To(BeEmpty(), "pending deletion total should be empty after reset") + } + if mf.GetName() == testPendingDeletionDurationMetric { + Expect(mf.GetMetric()).To(BeEmpty(), "pending deletion duration should be empty after reset") + } + } +} + +func TestPendingDeletionCollector_Describe(t *testing.T) { + RegisterTestingT(t) + + collector := NewPendingDeletionCollector(nil, 30*time.Minute) + ch := make(chan *prometheus.Desc, 10) + collector.Describe(ch) + close(ch) + + var descs []*prometheus.Desc + for desc := range ch { + descs = append(descs, desc) + } + + Expect(descs).To(HaveLen(1)) + Expect(descs[0].String()).To(ContainSubstring("resource_pending_deletion_stuck")) +} + +func TestPendingDeletionCollector_CollectWithNilDB(t *testing.T) { + RegisterTestingT(t) + + collector := NewPendingDeletionCollector(nil, 30*time.Minute) + ch := make(chan prometheus.Metric, 10) + collector.Collect(ch) + close(ch) + + var collectedMetrics []prometheus.Metric + for m := range ch { + collectedMetrics = append(collectedMetrics, m) + } + + Expect(collectedMetrics).To(BeEmpty()) +} + +func TestStuckDescriptor(t *testing.T) { + RegisterTestingT(t) + + collector := NewPendingDeletionCollector(nil, 30*time.Minute) + descStr := collector.stuckDesc.String() + + Expect(descStr).To(ContainSubstring("hyperfleet_api_resource_pending_deletion_stuck")) + Expect(descStr).To(ContainSubstring("resource_type")) + Expect(descStr).To(ContainSubstring("component")) + Expect(descStr).To(ContainSubstring("version")) +} + +func labelsToMap(metric *dto.Metric) map[string]string { + labels := make(map[string]string) + for _, lp := range 
metric.GetLabel() {
+		labels[lp.GetName()] = lp.GetValue()
+	}
+	return labels
+}
diff --git a/pkg/services/cluster.go b/pkg/services/cluster.go
index 856c87aa..cac1a3b9 100644
--- a/pkg/services/cluster.go
+++ b/pkg/services/cluster.go
@@ -13,6 +13,7 @@ import (
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
 	"gorm.io/gorm"
 )
 
@@ -128,20 +129,26 @@ func (s *sqlClusterService) SoftDelete(ctx context.Context, id string) (*api.Clu
 		return nil, handleSoftDeleteError("Cluster", saveErr)
 	}
 
+	metrics.RecordPendingDeletion("cluster")
+
 	if cascadeErr := s.nodePoolDao.SoftDeleteByOwner(ctx, id, t, deletedBy); cascadeErr != nil {
 		return nil, handleSoftDeleteError("NodePool", cascadeErr)
 	}
 
+	nodePools, err := s.nodePoolDao.FindSoftDeletedByOwner(ctx, id) // fetched once; reused below for the status batch update
+	if err != nil {
+		return nil, errors.GeneralError("Failed to fetch cascade-deleted nodepools: %s", err)
+	}
+
+	for range nodePools { // one increment per soft-deleted nodepool under this owner
+		metrics.RecordPendingDeletion("nodepool")
+	}
+
 	cluster, svcErr := s.UpdateClusterStatusFromAdapters(ctx, cluster.ID)
 	if svcErr != nil {
 		return nil, svcErr
 	}
 
-	// Update status for all cascade-deleted nodepools so their Ready condition reflects the generation bump.
-	nodePools, err := s.nodePoolDao.FindSoftDeletedByOwner(ctx, id)
-	if err != nil {
-		return nil, errors.GeneralError("Failed to fetch cascade-deleted nodepools: %s", err)
-	}
+
 	if svcErr := batchUpdateNodePoolStatusesFromAdapters(
 		ctx,
 		nodePools,
diff --git a/pkg/services/node_pool.go b/pkg/services/node_pool.go
index 14438591..218f53fa 100644
--- a/pkg/services/node_pool.go
+++ b/pkg/services/node_pool.go
@@ -12,6 +12,7 @@ import (
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
 	"gorm.io/gorm"
 )
 
@@ -119,6 +120,8 @@ func (s *sqlNodePoolService) SoftDelete(ctx context.Context, id string) (*api.No
 		return nil, handleSoftDeleteError("NodePool", err)
 	}
 
+	metrics.RecordPendingDeletion("nodepool")
+
 	nodePool, svcErr := s.UpdateNodePoolStatusFromAdapters(ctx, nodePool.ID)
 	if svcErr != nil {
 		return nil, svcErr
diff --git a/test/integration/deletion_metrics_test.go b/test/integration/deletion_metrics_test.go
new file mode 100644
index 00000000..49a8db7c
--- /dev/null
+++ b/test/integration/deletion_metrics_test.go
@@ -0,0 +1,93 @@
+package integration
+
+import (
+	"net/http"
+	"testing"
+	"time"
+
+	.
"github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api/openapi" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" + "github.com/openshift-hyperfleet/hyperfleet-api/test" +) + +func TestPendingDeletionCollector_Integration(t *testing.T) { + t.Run("given soft-deleted resources older than threshold, collector reports them as stuck", func(t *testing.T) { + RegisterTestingT(t) + h, client := test.RegisterIntegration(t) + account := h.NewRandAccount() + ctx := h.NewAuthenticatedContext(account) + + cluster, err := h.Factories.NewClusters(h.NewID()) + Expect(err).NotTo(HaveOccurred()) + npResp, err := client.CreateNodePoolWithResponse( + ctx, cluster.ID, openapi.CreateNodePoolJSONRequestBody(newNodePoolInput("stuck-np")), + test.WithAuthToken(ctx), + ) + Expect(err).NotTo(HaveOccurred()) + Expect(npResp.StatusCode()).To(Equal(http.StatusCreated)) + + _, err = client.DeleteClusterByIdWithResponse(ctx, cluster.ID, test.WithAuthToken(ctx)) + Expect(err).NotTo(HaveOccurred()) + + // Backdate deleted_time to 1 hour ago so resources exceed the 30m threshold + db := h.DBFactory.New(ctx) + pastTime := time.Now().UTC().Add(-1 * time.Hour) + Expect(db.Exec("UPDATE clusters SET deleted_time = ? WHERE id = ?", pastTime, cluster.ID).Error).NotTo(HaveOccurred()) + err = db.Exec("UPDATE node_pools SET deleted_time = ? WHERE owner_id = ?", pastTime, cluster.ID).Error + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewPendingDeletionCollector(rawDB, 30*time.Minute) + + collected := collectStuckMetrics(t, collector) + + Expect(collected["cluster"]).To(BeNumerically(">=", 1), "should report at least 1 stuck cluster") + Expect(collected["nodepool"]).To(BeNumerically(">=", 1), "should report at least 1 stuck nodepool") + }) + + t.Run("given soft-deleted resources within threshold, collector reports zero stuck", func(t *testing.T) { + RegisterTestingT(t) + h, client := test.RegisterIntegration(t) + account := h.NewRandAccount() + ctx := h.NewAuthenticatedContext(account) + + cluster, err := h.Factories.NewClusters(h.NewID()) + Expect(err).NotTo(HaveOccurred()) + + _, err = client.DeleteClusterByIdWithResponse(ctx, cluster.ID, test.WithAuthToken(ctx)) + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + // Use a very long threshold so the just-deleted cluster is NOT stuck + collector := metrics.NewPendingDeletionCollector(rawDB, 24*time.Hour) + + collected := collectStuckMetrics(t, collector) + + Expect(collected["cluster"]).To(Equal(0.0), "recently deleted cluster should not be stuck") + }) +} + +func collectStuckMetrics(t *testing.T, collector *metrics.PendingDeletionCollector) map[string]float64 { + t.Helper() + RegisterTestingT(t) + + ch := make(chan prometheus.Metric, 10) + collector.Collect(ch) + close(ch) + + result := make(map[string]float64) + for m := range ch { + pb := &dto.Metric{} + Expect(m.Write(pb)).To(Succeed()) + for _, lp := range pb.GetLabel() { + if lp.GetName() == "resource_type" { + result[lp.GetValue()] = pb.GetGauge().GetValue() + } + } + } + return result +}