40 changes: 40 additions & 0 deletions charts/templates/prometheusrule.yaml
@@ -0,0 +1,40 @@
{{- if .Values.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "hyperfleet-api.fullname" . }}
  namespace: {{ .Values.prometheusRule.namespace | default .Release.Namespace }}
  labels:
    {{- include "hyperfleet-api.labels" . | nindent 4 }}
    {{- with .Values.prometheusRule.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
    - name: hyperfleet-api-deletion
      rules:
        - alert: HyperFleetResourceDeletionStuckWarning
          expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0
          for: {{ .Values.prometheusRule.rules.deletionStuck.for | default "30m" }}
          labels:
            severity: warning
          annotations:
            summary: "HyperFleet resources stuck in Pending Deletion state"
            description: >-
              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
              Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }}
              (stuck threshold) + {{ .Values.prometheusRule.rules.deletionStuck.for | default "30m" }} (alert delay).
            runbook_url: {{ .Values.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }}
        - alert: HyperFleetResourceDeletionStuckCritical
          expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0
          for: {{ .Values.prometheusRule.rules.deletionTimeout.for | default "2h" }}
          labels:
            severity: critical
          annotations:
            summary: "HyperFleet resources timed out in Pending Deletion state"
            description: >-
              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
              Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }}
              (stuck threshold) + {{ .Values.prometheusRule.rules.deletionTimeout.for | default "2h" }} (alert delay). Immediate investigation required.
            runbook_url: {{ .Values.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }}
{{- end }}
14 changes: 14 additions & 0 deletions charts/values.yaml
@@ -126,6 +126,7 @@ config:
    enabled: false

    label_metrics_inclusion_duration: 168h
    deletion_stuck_threshold: 30m

  # Health check configuration
  health:
@@ -243,6 +244,19 @@ database:
    size: 1Gi
    storageClass: ""

# PrometheusRule for alerting
prometheusRule:
  enabled: false
  labels: {}
  namespace: ""
  rules:
    deletionStuck:
      for: "30m"
      runbookUrl: ""
    deletionTimeout:
      for: "2h"
      runbookUrl: ""

# ServiceMonitor for Prometheus Operator
serviceMonitor:
  enabled: false
10 changes: 10 additions & 0 deletions cmd/hyperfleet-api/servecmd/cmd.go
@@ -18,6 +18,7 @@ import (
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_session"
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/health"
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/telemetry"
)

@@ -129,6 +130,15 @@ func runServe(cmd *cobra.Command, args []string) {
"masking_enabled", environments.Environment().Config.Logging.Masking.Enabled,
).Info("Logger initialized")

if sf := environments.Environment().Database.SessionFactory; sf != nil {
if err := metrics.RegisterCollector(
sf.DirectDB(),
environments.Environment().Config.Metrics.DeletionStuckThreshold,
); err != nil {
logger.WithError(ctx, err).Error("Failed to register pending deletion collector")
}
}

apiServer := server.NewAPIServer(tracingEnabled)
go apiServer.Start()

2 changes: 2 additions & 0 deletions cmd/hyperfleet-api/server/metrics_middleware.go
@@ -62,6 +62,7 @@ import (

"github.com/openshift-hyperfleet/hyperfleet-api/pkg/api"
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_metrics"
"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
)

// MetricsMiddleware creates a new handler that collects metrics for the requests processed by the
@@ -112,6 +113,7 @@ func ResetMetricCollectors() {
	requestCountMetric.Reset()
	requestDurationMetric.Reset()
	db_metrics.ResetMetrics()
	metrics.ResetMetrics()
	buildInfoMetric.Reset()
	buildInfoMetric.With(prometheus.Labels{
		metricsComponentLabel: metricsComponentValue,
83 changes: 83 additions & 0 deletions docs/metrics.md
@@ -99,6 +99,69 @@ hyperfleet_api_request_duration_seconds_sum{component="api",version="abc123",code="200",method="GET",path="/api/hyperfleet/v1/clusters"}
hyperfleet_api_request_duration_seconds_count{component="api",version="abc123",code="200",method="GET",path="/api/hyperfleet/v1/clusters"} 1523
```

### Deletion Observability Metrics

These metrics track resources in the Pending Deletion state, i.e. resources whose `deleted_time` is set and that are awaiting hard-delete once all adapters have finalized.

#### `hyperfleet_api_resource_pending_deletion_total`

**Type:** Counter

**Description:** Total number of resources that entered the Pending Deletion state (`deleted_time` set).

**Labels:**

| Label | Description | Example Values |
|-------|-------------|----------------|
| `resource_type` | Type of resource | `cluster`, `nodepool` |
| `component` | Component name | `api` |
| `version` | Application version | `abc123` |

**Example output:**

```text
hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="cluster",version="abc123"} 42
hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="nodepool",version="abc123"} 156
```
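
For orientation, here is a minimal sketch of how a counter like this can be declared and incremented with the Prometheus Go client. The variable and helper names are illustrative, not the repository's actual `pkg/metrics` code, and the global `component`/`version` labels are omitted for brevity:

```go
package metrics

import "github.com/prometheus/client_golang/prometheus"

// Illustrative declaration; the real metric also carries the component and
// version labels shown in the example output above.
var pendingDeletionTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "hyperfleet_api_resource_pending_deletion_total",
		Help: "Total number of resources that entered the Pending Deletion state.",
	},
	[]string{"resource_type"},
)

// MarkPendingDeletion would be called at the point where a resource's
// deleted_time is set.
func MarkPendingDeletion(resourceType string) {
	pendingDeletionTotal.WithLabelValues(resourceType).Inc()
}
```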

#### `hyperfleet_api_resource_pending_deletion_duration_seconds`

**Type:** Histogram

**Description:** Duration from pending deletion (`deleted_time` set) to hard-delete completion in seconds. Observed when a resource is hard-deleted after all adapters report `Finalized=True`.

**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`

**Buckets:** `1s`, `5s`, `10s`, `30s`, `60s`, `120s`, `300s`, `600s`, `1800s`, `3600s`

**Note:** This metric is populated when the hard-delete flow is active. See the [hard-delete design](https://github.com/openshift-hyperfleet/architecture/blob/main/hyperfleet/components/api-service/hard-delete-design.md) for details.
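
A similarly hedged sketch of the histogram wiring, assuming an `ObserveHardDelete` helper invoked from the hard-delete flow once all adapters have finalized (all names illustrative):

```go
package metrics

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Bucket layout mirrors the documented buckets above.
var pendingDeletionDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "hyperfleet_api_resource_pending_deletion_duration_seconds",
		Help:    "Duration from pending deletion to hard-delete completion.",
		Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600},
	},
	[]string{"resource_type"},
)

// ObserveHardDelete records how long a resource spent in Pending Deletion,
// measured from the time deleted_time was set to the hard-delete.
func ObserveHardDelete(resourceType string, deletedTime time.Time) {
	pendingDeletionDuration.WithLabelValues(resourceType).Observe(time.Since(deletedTime).Seconds())
}
```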

#### `hyperfleet_api_resource_pending_deletion_stuck`

**Type:** Gauge (Collector)

**Description:** Number of resources that have been in the Pending Deletion state longer than the stuck threshold (default 30 minutes). This gauge is computed on each Prometheus scrape by querying the database for resources whose `deleted_time` is older than the threshold.

**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`

**Configuration:** The stuck threshold is configurable via `--metrics-deletion-stuck-threshold` (default `30m`).

**Example output:**

```text
hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="cluster",version="abc123"} 2
hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="nodepool",version="abc123"} 0
```
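
To make the scrape-time computation concrete, the following is a minimal sketch of a database-backed collector. It assumes a plain `*sql.DB` handle and the table/column names from this PR's migration; it is not the repository's actual implementation:

```go
package metrics

import (
	"database/sql"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

type pendingDeletionCollector struct {
	db        *sql.DB
	threshold time.Duration
	desc      *prometheus.Desc
}

func newPendingDeletionCollector(db *sql.DB, threshold time.Duration) *pendingDeletionCollector {
	return &pendingDeletionCollector{
		db:        db,
		threshold: threshold,
		desc: prometheus.NewDesc(
			"hyperfleet_api_resource_pending_deletion_stuck",
			"Resources in Pending Deletion state beyond the stuck threshold.",
			[]string{"resource_type"}, nil,
		),
	}
}

func (c *pendingDeletionCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.desc
}

// Collect runs one COUNT(*) per resource table on every scrape; the partial
// indexes on deleted_time keep these queries cheap. Table names are fixed
// constants, so string concatenation in the query is safe here.
func (c *pendingDeletionCollector) Collect(ch chan<- prometheus.Metric) {
	cutoff := time.Now().Add(-c.threshold)
	for table, resourceType := range map[string]string{
		"clusters":   "cluster",
		"node_pools": "nodepool",
	} {
		var stuck int64
		row := c.db.QueryRow(
			"SELECT COUNT(*) FROM "+table+" WHERE deleted_time IS NOT NULL AND deleted_time < $1",
			cutoff,
		)
		if err := row.Scan(&stuck); err != nil {
			continue // skip this table for the current scrape on query failure
		}
		ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(stuck), resourceType)
	}
}
```

Registering such a collector once at startup (e.g. `prometheus.MustRegister(newPendingDeletionCollector(db, threshold))`) mirrors the `metrics.RegisterCollector` call added in `servecmd/cmd.go` earlier in this diff.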

### Deletion Alerts

Two alerts are available via the PrometheusRule (requires `prometheusRule.enabled=true` in Helm values):

| Alert | Severity | Condition | Description |
|-------|----------|-----------|-------------|
| `HyperFleetResourceDeletionStuckWarning` | Warning | `resource_pending_deletion_stuck > 0` for 30m | Resources stuck in Pending Deletion beyond 1 hour |
| `HyperFleetResourceDeletionStuckCritical` | Critical | `resource_pending_deletion_stuck > 0` for 2h | Resources stuck in Pending Deletion beyond 2.5 hours |

## Go Runtime Metrics

The following metrics are automatically exposed by the Prometheus Go client library.
@@ -255,6 +318,26 @@ rate(process_cpu_seconds_total[5m])
process_open_fds / process_max_fds * 100
```

### Deletion Observability

```promql
# Resources entering Pending Deletion state per minute
sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_total[5m])) * 60

# Resources currently stuck in Pending Deletion state
hyperfleet_api_resource_pending_deletion_stuck

# Stuck resources by type
sum by (resource_type) (hyperfleet_api_resource_pending_deletion_stuck)

# Average pending deletion duration (once hard-delete is active)
sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_sum[5m])) /
sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_count[5m]))

# P99 pending deletion duration
histogram_quantile(0.99, sum by (le, resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_bucket[5m])))
```

### Common Investigation Queries

```promql
2 changes: 2 additions & 0 deletions pkg/config/flags.go
@@ -79,6 +79,8 @@ func AddMetricsFlags(cmd *cobra.Command) {
	cmd.Flags().String("metrics-tls-key-file", defaults.TLS.KeyFile, "Path to TLS key file for metrics")
	cmd.Flags().Duration("metrics-label-metrics-inclusion-duration", defaults.LabelMetricsInclusionDuration,
		"Duration for cluster telemetry label inclusion")
	cmd.Flags().Duration("metrics-deletion-stuck-threshold", defaults.DeletionStuckThreshold,
		"Duration after which a pending deletion resource is considered stuck")
}

// AddHealthFlags adds health check configuration flags following standard naming
6 changes: 6 additions & 0 deletions pkg/config/loader.go
@@ -185,6 +185,9 @@ func (l *ConfigLoader) validateConfig(config *ApplicationConfig) error {
	if valErr := config.Metrics.TLS.Validate(); valErr != nil {
		return fmt.Errorf("metrics TLS validation failed: %w", valErr)
	}
	if valErr := config.Metrics.Validate(); valErr != nil {
		return fmt.Errorf("metrics config validation failed: %w", valErr)
	}
	return nil
}

@@ -345,6 +348,7 @@ func (l *ConfigLoader) bindAllEnvVars() {
l.bindEnv("metrics.port")
l.bindEnv("metrics.tls.enabled")
l.bindEnv("metrics.label_metrics_inclusion_duration")
l.bindEnv("metrics.deletion_stuck_threshold")

// Health config
l.bindEnv("health.host")
@@ -411,6 +415,8 @@ func (l *ConfigLoader) bindFlags(cmd *cobra.Command) {
l.bindPFlag("metrics.tls.key_file", cmd.Flags().Lookup("metrics-tls-key-file"))
l.bindPFlag("metrics.label_metrics_inclusion_duration",
cmd.Flags().Lookup("metrics-label-metrics-inclusion-duration"))
l.bindPFlag("metrics.deletion_stuck_threshold",
cmd.Flags().Lookup("metrics-deletion-stuck-threshold"))

// Health flags: --health-* -> health.*
l.bindPFlag("health.host", cmd.Flags().Lookup("health-host"))
11 changes: 11 additions & 0 deletions pkg/config/metrics.go
@@ -1,6 +1,7 @@
package config

import (
"fmt"
"net"
"strconv"
"time"
@@ -13,6 +14,7 @@ type MetricsConfig struct {
	TLS                           TLSConfig     `mapstructure:"tls" json:"tls" validate:"required"`
	Port                          int           `mapstructure:"port" json:"port" validate:"required,min=1,max=65535"`
	LabelMetricsInclusionDuration time.Duration `mapstructure:"label_metrics_inclusion_duration" json:"label_metrics_inclusion_duration" validate:"required"` //nolint:lll
	DeletionStuckThreshold        time.Duration `mapstructure:"deletion_stuck_threshold" json:"deletion_stuck_threshold" validate:"required"` //nolint:lll
}

// NewMetricsConfig returns default MetricsConfig values
@@ -25,9 +27,18 @@ func NewMetricsConfig() *MetricsConfig {
			Enabled: false,
		},
		LabelMetricsInclusionDuration: 168 * time.Hour, // 7 days
		DeletionStuckThreshold:        30 * time.Minute,
	}
}

// Validate validates MetricsConfig fields that struct tags cannot enforce
func (m *MetricsConfig) Validate() error {
	if m.DeletionStuckThreshold <= 0 {
		return fmt.Errorf("DeletionStuckThreshold must be positive, got %v", m.DeletionStuckThreshold)
	}
	return nil
}

// ============================================================
// Convenience Accessor Methods
// ============================================================
25 changes: 25 additions & 0 deletions pkg/db/migrations/202604290001_add_deleted_time_indexes.go
@@ -0,0 +1,25 @@
package migrations

import (
	"gorm.io/gorm"

	"github.com/go-gormigrate/gormigrate/v2"
)

func addDeletedTimeIndexes() *gormigrate.Migration {
	return &gormigrate.Migration{
		ID: "202604290001",
		Migrate: func(tx *gorm.DB) error {
			// Partial indexes for metrics collector queries:
			// SELECT COUNT(*) FROM clusters WHERE deleted_time IS NOT NULL AND deleted_time < $1
			// SELECT COUNT(*) FROM node_pools WHERE deleted_time IS NOT NULL AND deleted_time < $1
			if err := tx.Exec("CREATE INDEX IF NOT EXISTS idx_clusters_deleted_time ON clusters(deleted_time) WHERE deleted_time IS NOT NULL;").Error; err != nil { //nolint:lll
				return err
			}
			if err := tx.Exec("CREATE INDEX IF NOT EXISTS idx_node_pools_deleted_time ON node_pools(deleted_time) WHERE deleted_time IS NOT NULL;").Error; err != nil { //nolint:lll
				return err
			}
			return nil
		},
	}
}
1 change: 1 addition & 0 deletions pkg/db/migrations/migration_structs.go
@@ -35,6 +35,7 @@ var MigrationList = []*gormigrate.Migration{
	addSoftDeleteSchema(),
	addNodePoolOwnerDeletedIndex(),
	addReconciledIndex(),
	addDeletedTimeIndexes(),
}

// Model represents the base model struct. All entities will have this struct embedded.