diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 2caad38..600fc22 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -34,6 +34,9 @@ make docs-gen # regenerate AI docs from source
- Pod builder is a pure function in internal/podbuilder/ (no k8s client)
- Pacing logic lives exclusively in internal/pacing/
- Don't manually edit generated files — run make docs-gen
+- Documentation must never contain unverified information — verify all examples against a real cluster before merging
+- Always document which resources you looked at in which order (short summary + time spent + tokens consumed + context consumed)
+- Always lint and fix linter issues locally before pushing code
## Testing Patterns
@@ -58,6 +61,7 @@ api/v1alpha1 — Package v1alpha1 contains API Schema definitions for the drop v
internal/controller — Package controller implements Kubernetes reconcilers for the drop CRDs (one per Kind).
imports: api/v1alpha1, internal/discovery, internal/metrics, internal/pacing, internal/podbuilder
internal/discovery — Package discovery implements image discovery from registries and Prometheus metrics.
+ imports: api/v1alpha1
internal/metrics — Package metrics registers Prometheus metrics for the drop operator.
internal/pacing — Package pacing implements the shared rate-limiting engine for image pull scheduling.
imports: api/v1alpha1, internal/podbuilder
diff --git a/api/v1alpha1/discoverypolicy_types.go b/api/v1alpha1/discoverypolicy_types.go
index 9cdf22e..14b87fd 100644
--- a/api/v1alpha1/discoverypolicy_types.go
+++ b/api/v1alpha1/discoverypolicy_types.go
@@ -53,6 +53,40 @@ type DiscoverySource struct {
SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"`
}
+// AggregationMethod defines how range query values are aggregated into a score.
+// +kubebuilder:validation:Enum=sum;count;avg;max
+type AggregationMethod string
+
+const (
+ // AggregationSum adds all data-point values over the lookback window.
+ // Use when the query returns a gauge/counter and the total magnitude matters
+ // (e.g., total memory usage across the window).
+ AggregationSum AggregationMethod = "sum"
+ // AggregationCount counts the number of non-zero data points over the lookback window.
+ // Use when you want to rank by how frequently an image appears
+ // (e.g., number of sample intervals where the image was running).
+ AggregationCount AggregationMethod = "count"
+ // AggregationAvg computes the arithmetic mean of all data-point values.
+ // Use when you want the average magnitude regardless of how many samples exist.
+ AggregationAvg AggregationMethod = "avg"
+ // AggregationMax takes the highest single data-point value.
+ // Use when peak usage is more relevant than cumulative usage.
+ AggregationMax AggregationMethod = "max"
+)
+
+// QueryType defines how the Prometheus query is executed.
+// +kubebuilder:validation:Enum=range;instant
+type QueryType string
+
+const (
+ // QueryTypeRange uses /api/v1/query_range with a time window defined by lookback.
+ // Returns multiple data points which are aggregated using the aggregationMethod.
+ QueryTypeRange QueryType = "range"
+ // QueryTypeInstant uses /api/v1/query for a single point-in-time result.
+ // The returned value is used directly as the score.
+ QueryTypeInstant QueryType = "instant"
+)
+
// PrometheusSource defines Prometheus query configuration for image discovery.
type PrometheusSource struct {
// Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics).
@@ -65,18 +99,32 @@ type PrometheusSource struct {
// Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image)
// +kubebuilder:validation:MinLength=1
Query string `json:"query"`
- // Lookback is the time window for aggregation. When set, the operator uses query_range
- // (start=now-lookback, end=now) and sums all returned values per image to produce a score.
- // When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score.
+ // QueryType controls how the Prometheus query is executed.
+ // "range" uses /api/v1/query_range with a time window defined by lookback.
+ // "instant" uses /api/v1/query for a single point-in-time result.
+ // Default: "range".
+ // +kubebuilder:default="range"
+ // +optional
+ QueryType QueryType `json:"queryType,omitempty"`
+ // Lookback is the time window for range queries. When queryType is "range",
+ // the operator queries (start=now-lookback, end=now) and aggregates all returned values per image.
+ // The aggregation function is controlled by the aggregationMethod field.
+ // Required when queryType is "range". Ignored when queryType is "instant".
// Example: "168h" (7 days), "24h", "72h"
// +optional
Lookback *metav1.Duration `json:"lookback,omitempty"`
+ // AggregationMethod controls how data points from a range query are combined into a single score.
+ // Only used when queryType is "range". Ignored for instant queries.
+ // When not set (nil), Drop uses the last data-point value directly — use this when your PromQL
+ // already contains aggregation functions (e.g., count_over_time, topk).
+ // Options: "sum", "count", "avg", "max"
+ // +optional
+ AggregationMethod *AggregationMethod `json:"aggregationMethod,omitempty"`
// Step is the resolution step for range queries (only used when lookback is set).
- // Smaller steps = more data points = more accurate sums but higher Prometheus load.
- // Default: "5m". Example: "1m", "15m"
- // +kubebuilder:default="5m"
+ // Smaller steps = more data points = more accurate aggregation but higher Prometheus load.
+ // Default: 5m. Example: "1m", "15m"
// +optional
- Step string `json:"step,omitempty"`
+ Step *metav1.Duration `json:"step,omitempty"`
}
// RegistrySource defines OCI registry tag listing configuration for image discovery.
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index e329027..eafb2e1 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -512,6 +512,16 @@ func (in *PrometheusSource) DeepCopyInto(out *PrometheusSource) {
*out = new(metav1.Duration)
**out = **in
}
+ if in.AggregationMethod != nil {
+ in, out := &in.AggregationMethod, &out.AggregationMethod
+ *out = new(AggregationMethod)
+ **out = **in
+ }
+ if in.Step != nil {
+ in, out := &in.Step, &out.Step
+ *out = new(metav1.Duration)
+ **out = **in
+ }
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PrometheusSource.
diff --git a/config/crd/bases/drop.corewire.io_discoverypolicies.yaml b/config/crd/bases/drop.corewire.io_discoverypolicies.yaml
index d85dab4..a1183f2 100644
--- a/config/crd/bases/drop.corewire.io_discoverypolicies.yaml
+++ b/config/crd/bases/drop.corewire.io_discoverypolicies.yaml
@@ -86,6 +86,19 @@ spec:
prometheus:
description: Prometheus contains the configuration when type=prometheus.
properties:
+ aggregationMethod:
+ description: |-
+ AggregationMethod controls how data points from a range query are combined into a single score.
+ Only used when queryType is "range". Ignored for instant queries.
+ When not set (nil), Drop uses the last data-point value directly — use this when your PromQL
+ already contains aggregation functions (e.g., count_over_time, topk).
+ Options: "sum", "count", "avg", "max"
+ enum:
+ - sum
+ - count
+ - avg
+ - max
+ type: string
endpoint:
description: |-
Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics).
@@ -94,9 +107,10 @@ spec:
type: string
lookback:
description: |-
- Lookback is the time window for aggregation. When set, the operator uses query_range
- (start=now-lookback, end=now) and sums all returned values per image to produce a score.
- When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score.
+ Lookback is the time window for range queries. When queryType is "range",
+ the operator queries (start=now-lookback, end=now) and aggregates all returned values per image.
+ The aggregation function is controlled by the aggregationMethod field.
+ Required when queryType is "range". Ignored when queryType is "instant".
Example: "168h" (7 days), "24h", "72h"
type: string
query:
@@ -107,12 +121,22 @@ spec:
Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image)
minLength: 1
type: string
+ queryType:
+ default: range
+ description: |-
+ QueryType controls how the Prometheus query is executed.
+ "range" uses /api/v1/query_range with a time window defined by lookback.
+ "instant" uses /api/v1/query for a single point-in-time result.
+ Default: "range".
+ enum:
+ - range
+ - instant
+ type: string
step:
- default: 5m
description: |-
Step is the resolution step for range queries (only used when lookback is set).
- Smaller steps = more data points = more accurate sums but higher Prometheus load.
- Default: "5m". Example: "1m", "15m"
+ Smaller steps = more data points = more accurate aggregation but higher Prometheus load.
+ Default: 5m. Example: "1m", "15m"
type: string
required:
- endpoint
diff --git a/docs/content/docs/discovery.md b/docs/content/docs/discovery.md
index 28e3aa7..8ee8440 100644
--- a/docs/content/docs/discovery.md
+++ b/docs/content/docs/discovery.md
@@ -66,7 +66,27 @@ count(container_memory_working_set_bytes{
Hand-maintained image lists do not keep up in environments where automation (for example Renovate) ships new image versions every day. A practical pattern is to rank images by observed CI usage over a rolling window.
-The `lookback` field tells Drop to use Prometheus `query_range` API over that time window and sum all returned values per image to produce a total usage score:
+The `queryType` field controls whether Drop sends an instant or range query (default: `range`). When set to `range`, the `lookback` field defines the time window and `aggregationMethod` controls how the returned data points are combined into a single score per image.
+
+#### Query Types
+
+{{< figure src="/drop/images/query-type-range.svg" alt="Range query: multiple data points over a lookback window" >}}
+
+{{< figure src="/drop/images/query-type-instant.svg" alt="Instant query: single point-in-time value used as score" >}}
+
+#### Aggregation Methods
+
+When using `queryType: range`, the `aggregationMethod` field determines how the returned data points are reduced into a single score:
+
+{{< figure src="/drop/images/aggregation-methods.svg" alt="Aggregation methods: nil (last value), sum, count, avg, max" >}}
+
+| Method | Behavior | Use when |
+|--------|----------|----------|
+| *(not set)* | Uses the last data-point value directly | Your PromQL already aggregates (e.g. `count_over_time`, `topk`) |
+| `sum` | Adds all data-point values over the window | Total cumulative usage matters (e.g. total memory consumed) |
+| `count` | Counts the number of data points returned | You want to rank by how frequently an image appears |
+| `avg` | Arithmetic mean of all data-point values | Average magnitude matters regardless of sample count |
+| `max` | Highest single data-point value | Peak usage is more relevant than cumulative |
```yaml
apiVersion: drop.corewire.io/v1alpha1
@@ -80,8 +100,10 @@ spec:
- type: prometheus
prometheus:
endpoint: https://mimir.example.com
+ queryType: range # default — use query_range API
lookback: 168h # 7 days
step: 5m
+ aggregationMethod: sum # rank by total usage over 7 days (omit to use last value directly)
query: |
count(
container_memory_working_set_bytes{
@@ -95,7 +117,9 @@ Use this when you want DiscoveryPolicy to continuously follow what your GitLab r
#### Field-by-field explanation
-- `lookback: 168h` — Drop uses `query_range` with start=now-7d, end=now, and sums all returned values per image to rank by total usage over the window.
+- `queryType: range` — tells Drop to use the Prometheus `query_range` API. This is the default. Set to `instant` for a single point-in-time query.
+- `lookback: 168h` — defines the time window for range queries (start=now-7d, end=now). Required when `queryType` is `range`.
+- `aggregationMethod: sum` — sums all data-point values to rank by total usage. When omitted (nil), the last value is used directly — ideal for self-contained PromQL queries. Other options: `count` to rank by number of appearances, `avg` for average magnitude, or `max` for peak value.
- `step: 5m` — resolution step for the range query (controls how many data points Prometheus returns).
- `count(...) by (image)` — counts the number of running containers per image to rank by popularity.
- `container_memory_working_set_bytes{...}` — source metric used to observe running containers.
@@ -108,9 +132,15 @@ Use this when you want DiscoveryPolicy to continuously follow what your GitLab r
For each unique `image` label, Drop uses the Prometheus query result value as the score.
-When `lookback` is not set (the default), Drop sends an instant query (`/api/v1/query`) and uses the returned value directly. When `lookback` is set (e.g. `lookback: 168h`), Drop uses a range query (`/api/v1/query_range`) over that window and **sums all returned values** to produce the score. This means images that appear more frequently over the window get a higher score.
+When `queryType` is `range` (the default), Drop uses a range query (`/api/v1/query_range`) over the `lookback` window and aggregates data points using the `aggregationMethod`. When `queryType` is `instant`, Drop sends an instant query (`/api/v1/query`) and uses the returned value directly:
+
+- *(not set)*: uses the last data-point value — ideal when your PromQL already contains aggregation functions like `count_over_time` or `topk`
+- `sum`: adds all data-point values — images with higher cumulative usage score higher
+- `count`: counts the number of data points — images that appear more frequently score higher
+- `avg`: averages data-point values — images with higher average value score higher
+- `max`: takes the peak value — images with the highest single observation score higher
-The example above uses `lookback: 168h` so Drop handles the 7-day windowing via the API — no need to embed `[7d]` in PromQL.
+The example above uses `queryType: range` with `lookback: 168h` so Drop handles the 7-day windowing via the API — no need to embed `[7d]` in PromQL.
If Prometheus returns:
@@ -156,6 +186,7 @@ spec:
- type: prometheus
prometheus:
endpoint: https://mimir.example.com
+ queryType: instant
query: |
count(container_memory_working_set_bytes{
container!="", container!="POD",
diff --git a/docs/content/docs/reference/_generated_architecture.md b/docs/content/docs/reference/_generated_architecture.md
index 0fe9d6b..1abb6ac 100644
--- a/docs/content/docs/reference/_generated_architecture.md
+++ b/docs/content/docs/reference/_generated_architecture.md
@@ -30,6 +30,7 @@ graph LR
internal/controller --> internal/metrics
internal/controller --> internal/pacing
internal/controller --> internal/podbuilder
+ internal/discovery --> api/v1alpha1
internal/pacing --> api/v1alpha1
internal/pacing --> internal/podbuilder
internal/podbuilder --> api/v1alpha1
diff --git a/docs/content/docs/reference/_generated_crds.md b/docs/content/docs/reference/_generated_crds.md
index 3779600..1d72338 100644
--- a/docs/content/docs/reference/_generated_crds.md
+++ b/docs/content/docs/reference/_generated_crds.md
@@ -207,8 +207,10 @@ PrometheusSource defines Prometheus query configuration for image discovery.
|-------|------|----------|---------|-------------|
| `endpoint` | `string` | Yes | — | Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). Example: "http://prometheus.monitoring.svc:9090", "https://mimir.example.com" |
| `query` | `string` | Yes | — | Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) |
-| `lookback` | `*metav1.Duration` | No | — | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" |
-| `step` | `string` | No | 5m | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m" |
+| `queryType` | `QueryType` | No | range | QueryType controls how the Prometheus query is executed. "range" uses /api/v1/query_range with a time window defined by lookback. "instant" uses /api/v1/query for a single point-in-time result. Default: "range". |
+| `lookback` | `*metav1.Duration` | No | — | Lookback is the time window for range queries. When queryType is "range", the operator queries (start=now-lookback, end=now) and aggregates all returned values per image. The aggregation function is controlled by the aggregationMethod field. Required when queryType is "range". Ignored when queryType is "instant". Example: "168h" (7 days), "24h", "72h" |
+| `aggregationMethod` | `*AggregationMethod` | No | — | AggregationMethod controls how data points from a range query are combined into a single score. Only used when queryType is "range". Ignored for instant queries. When not set (nil), Drop uses the last data-point value directly — use this when your PromQL already contains aggregation functions (e.g., count_over_time, topk). Options: "sum", "count", "avg", "max" |
+| `step` | `*metav1.Duration` | No | — | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: 5m. Example: "1m", "15m" |
### RegistrySource
diff --git a/docs/go.mod b/docs/go.mod
index cc0eced..a8b9b26 100644
--- a/docs/go.mod
+++ b/docs/go.mod
@@ -1,5 +1,3 @@
module github.com/corewire/drop/docs
go 1.26.0
-
-require github.com/imfing/hextra v0.12.3 // indirect
diff --git a/docs/go.sum b/docs/go.sum
index afa8680..e69de29 100644
--- a/docs/go.sum
+++ b/docs/go.sum
@@ -1,2 +0,0 @@
-github.com/imfing/hextra v0.12.3 h1:DZHY2rUWYteyzjlHi9r4n7Bb5e2Q+6LXe4C1Dqn0ZjM=
-github.com/imfing/hextra v0.12.3/go.mod h1:vi+yhpq8YPp/aghvJlNKVnJKcPJ/VyAEcfC1BSV9ARo=
diff --git a/docs/static/images/aggregation-methods.svg b/docs/static/images/aggregation-methods.svg
new file mode 100644
index 0000000..edba23b
--- /dev/null
+++ b/docs/static/images/aggregation-methods.svg
@@ -0,0 +1,328 @@
+
diff --git a/docs/static/images/query-type-instant.svg b/docs/static/images/query-type-instant.svg
new file mode 100644
index 0000000..99d00c2
--- /dev/null
+++ b/docs/static/images/query-type-instant.svg
@@ -0,0 +1,95 @@
+
diff --git a/docs/static/images/query-type-range.svg b/docs/static/images/query-type-range.svg
new file mode 100644
index 0000000..033fef1
--- /dev/null
+++ b/docs/static/images/query-type-range.svg
@@ -0,0 +1,122 @@
+
diff --git a/docs/static/llms-full.txt b/docs/static/llms-full.txt
index 50e6d8c..b0ca6cc 100644
--- a/docs/static/llms-full.txt
+++ b/docs/static/llms-full.txt
@@ -181,8 +181,10 @@ PrometheusSource defines Prometheus query configuration for image discovery.
|-------|------|------|----------|---------|-------------|
| Endpoint | `endpoint` | `string` | ✓ | | Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). Example: "http://prometheus.monitoring.svc:9090", "https://mimir.example.com" |
| Query | `query` | `string` | ✓ | | Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) |
-| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" |
-| Step | `step` | `string` | — | `5m` | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m" |
+| QueryType | `queryType` | `QueryType` | — | `range` | QueryType controls how the Prometheus query is executed. "range" uses /api/v1/query_range with a time window defined by lookback. "instant" uses /api/v1/query for a single point-in-time result. Default: "range". |
+| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for range queries. When queryType is "range", the operator queries (start=now-lookback, end=now) and aggregates all returned values per image. The aggregation function is controlled by the aggregationMethod field. Required when queryType is "range". Ignored when queryType is "instant". Example: "168h" (7 days), "24h", "72h" |
+| AggregationMethod | `aggregationMethod` | `*AggregationMethod` | — | | AggregationMethod controls how data points from a range query are combined into a single score. Only used when queryType is "range". Ignored for instant queries. When not set (nil), Drop uses the last data-point value directly — use this when your PromQL already contains aggregation functions (e.g., count_over_time, topk). Options: "sum", "count", "avg", "max" |
+| Step | `step` | `*metav1.Duration` | — | | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: 5m. Example: "1m", "15m" |
### RegistrySource
@@ -330,8 +332,10 @@ spec:
prometheus:
endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090"
query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)'
+ queryType: range
lookback: 24h
step: 5m
+ aggregationMethod: sum
syncInterval: 30s
maxImages: 10
---
diff --git a/go.sum b/go.sum
index 06ca73e..760283c 100644
--- a/go.sum
+++ b/go.sum
@@ -66,8 +66,6 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
-github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8=
-github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 h1:EwtI+Al+DeppwYX2oXJCETMO23COyaKGP6fHVpkpWpg=
github.com/google/pprof v0.0.0-20260402051712-545e8a4df936/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
@@ -107,14 +105,8 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
-github.com/onsi/ginkgo/v2 v2.27.4 h1:fcEcQW/A++6aZAZQNUmNjvA9PSOzefMJBerHJ4t8v8Y=
-github.com/onsi/ginkgo/v2 v2.27.4/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo=
github.com/onsi/ginkgo/v2 v2.29.0 h1:rfh+ZFjgJhYWRoIqVf3Uwx/W20yLrcrE2h2GmYVRaag=
github.com/onsi/ginkgo/v2 v2.29.0/go.mod h1:+aXOY+vzZ5mu2iI2HpTZUPmM//oQfsNFX6gU9kNcA44=
-github.com/onsi/gomega v1.39.0 h1:y2ROC3hKFmQZJNFeGAMeHZKkjBL65mIZcvrLQBF9k6Q=
-github.com/onsi/gomega v1.39.0/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4=
-github.com/onsi/gomega v1.40.0 h1:Vtol0e1MghCD2ZVIilPDIg44XSL9l2QAn8ZNaljWcJc=
-github.com/onsi/gomega v1.40.0/go.mod h1:M/Uqpu/8qTjtzCLUA2zJHX9Iilrau25x1PdoSRbWh5A=
github.com/onsi/gomega v1.41.0 h1:OwKp4pXNgVxf6sCplzYo794OFNuoL2q2SBMU5NSWOjA=
github.com/onsi/gomega v1.41.0/go.mod h1:M/Uqpu/8qTjtzCLUA2zJHX9Iilrau25x1PdoSRbWh5A=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
@@ -192,36 +184,22 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0=
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU=
-golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c=
-golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU=
golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM=
golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU=
-golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
-golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw=
golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
-golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
-golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
-golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
-golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI=
golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
-golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY=
-golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww=
golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY=
golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY=
-golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
-golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg=
golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164=
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
-golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc=
-golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg=
golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c=
golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI=
gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
diff --git a/hack/dev-samples.yaml b/hack/dev-samples.yaml
index 61b9d21..767b904 100644
--- a/hack/dev-samples.yaml
+++ b/hack/dev-samples.yaml
@@ -81,8 +81,10 @@ spec:
prometheus:
endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090"
query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)'
+ queryType: range
lookback: 24h
step: 5m
+ aggregationMethod: sum
syncInterval: 30s
maxImages: 10
---
diff --git a/hack/e2e-infra/prometheus-config.yaml b/hack/e2e-infra/prometheus-config.yaml
index f731502..e483195 100644
--- a/hack/e2e-infra/prometheus-config.yaml
+++ b/hack/e2e-infra/prometheus-config.yaml
@@ -64,3 +64,43 @@ data:
namespace: "production"
pod: "myapp-xyz"
expr: "209715200"
+ # Metrics for aggregation method e2e tests.
+ # Two images with multiple pods each so count() and sum() produce
+ # different rankings:
+ # alpine → 3 pods, values 100+200+300 → sum=600, count=3, avg=200, max=300
+ # busybox → 1 pod, value 500 → sum=500, count=1, avg=500, max=500
+ # With count(), alpine ranks higher (3 > 1).
+ # With sum(), alpine still ranks higher (600 > 500).
+ # With avg(), busybox ranks higher (500 > 200).
+ # With max(), busybox ranks higher (500 > 300).
+ - name: seed_aggregation_metrics
+ interval: 10s
+ rules:
+ - record: container_cpu_usage_seconds_total
+ labels:
+ image: "docker.io/library/alpine:3.19"
+ container: "worker"
+ namespace: "aggregation-test"
+ pod: "worker-aaa"
+ expr: "100"
+ - record: container_cpu_usage_seconds_total
+ labels:
+ image: "docker.io/library/alpine:3.19"
+ container: "worker"
+ namespace: "aggregation-test"
+ pod: "worker-bbb"
+ expr: "200"
+ - record: container_cpu_usage_seconds_total
+ labels:
+ image: "docker.io/library/alpine:3.19"
+ container: "worker"
+ namespace: "aggregation-test"
+ pod: "worker-ccc"
+ expr: "300"
+ - record: container_cpu_usage_seconds_total
+ labels:
+ image: "docker.io/library/busybox:1.36"
+ container: "init"
+ namespace: "aggregation-test"
+ pod: "init-ddd"
+ expr: "500"
diff --git a/hack/gen-ai-docs/config.go b/hack/gen-ai-docs/config.go
index 325bb60..979b8ba 100644
--- a/hack/gen-ai-docs/config.go
+++ b/hack/gen-ai-docs/config.go
@@ -33,6 +33,9 @@ func conventions() []Convention {
{Rule: "Pod builder is a pure function in internal/podbuilder/ (no k8s client)", Scope: []string{"code"}},
{Rule: "Pacing logic lives exclusively in internal/pacing/", Scope: []string{"code"}},
{Rule: "Don't manually edit generated files — run make docs-gen", Scope: []string{"code"}},
+ {Rule: "Documentation must never contain unverified information — verify all examples against a real cluster before merging", Scope: []string{"code"}},
+ {Rule: "Always document which resources you looked at in which order (short summary + time spent + tokens consumed + context consumed)", Scope: []string{"code"}},
+ {Rule: "Always lint and fix linter issues locally before pushing code", Scope: []string{"code"}},
}
}
diff --git a/internal/controller/discoverypolicy_controller.go b/internal/controller/discoverypolicy_controller.go
index 016b34b..c801165 100644
--- a/internal/controller/discoverypolicy_controller.go
+++ b/internal/controller/discoverypolicy_controller.go
@@ -246,7 +246,11 @@ func (r *DiscoveryPolicyReconciler) buildSource(ctx context.Context, src dropv1a
if src.Prometheus.Lookback != nil {
lookback = src.Prometheus.Lookback.Duration
}
- return discovery.NewPrometheusSource(src.Prometheus.Endpoint, src.Prometheus.Query, lookback, src.Prometheus.Step, httpClient), nil
+ var step time.Duration
+ if src.Prometheus.Step != nil {
+ step = src.Prometheus.Step.Duration
+ }
+ return discovery.NewPrometheusSource(src.Prometheus.Endpoint, src.Prometheus.Query, src.Prometheus.QueryType, lookback, src.Prometheus.AggregationMethod, step, httpClient), nil
case "registry":
if src.Registry == nil {
return nil, fmt.Errorf("registry config is required when type=registry")
diff --git a/internal/discovery/prometheus.go b/internal/discovery/prometheus.go
index c3b4a31..94423f8 100644
--- a/internal/discovery/prometheus.go
+++ b/internal/discovery/prometheus.go
@@ -9,31 +9,42 @@ import (
"net/url"
"sort"
"time"
+
+ dropv1alpha1 "github.com/corewire/drop/api/v1alpha1"
)
+const prometheusStatusSuccess = "success"
+
// PrometheusSource queries Prometheus for image references.
type PrometheusSource struct {
- Endpoint string
- Query string
- Lookback time.Duration // 0 = instant query; >0 = query_range
- Step string // resolution step for range queries (default "5m")
- HTTPClient *http.Client
+ Endpoint string
+ Query string
+ QueryType dropv1alpha1.QueryType // range or instant
+ Lookback time.Duration // time window for range queries
+ AggregationMethod *dropv1alpha1.AggregationMethod // nil = use last value; sum, count, avg, max
+ Step time.Duration // resolution step for range queries (default 5m)
+ HTTPClient *http.Client
}
// NewPrometheusSource creates a new Prometheus discovery source.
-func NewPrometheusSource(endpoint, query string, lookback time.Duration, step string, httpClient *http.Client) *PrometheusSource {
+func NewPrometheusSource(endpoint, query string, queryType dropv1alpha1.QueryType, lookback time.Duration, aggregationMethod *dropv1alpha1.AggregationMethod, step time.Duration, httpClient *http.Client) *PrometheusSource {
if httpClient == nil {
httpClient = &http.Client{Timeout: 30 * time.Second}
}
- if step == "" {
- step = "5m"
+ if step == 0 {
+ step = 5 * time.Minute
+ }
+ if queryType == "" {
+ queryType = dropv1alpha1.QueryTypeRange
}
return &PrometheusSource{
- Endpoint: endpoint,
- Query: query,
- Lookback: lookback,
- Step: step,
- HTTPClient: httpClient,
+ Endpoint: endpoint,
+ Query: query,
+ QueryType: queryType,
+ Lookback: lookback,
+ AggregationMethod: aggregationMethod,
+ Step: step,
+ HTTPClient: httpClient,
}
}
@@ -62,13 +73,13 @@ func (p *PrometheusSource) Fetch(ctx context.Context) ([]ImageResult, error) {
q := u.Query()
q.Set("query", p.Query)
- if p.Lookback > 0 {
+ if p.QueryType == dropv1alpha1.QueryTypeRange {
// Range query: aggregate over time window
u.Path = "/api/v1/query_range"
now := time.Now().UTC()
q.Set("start", now.Add(-p.Lookback).Format(time.RFC3339))
q.Set("end", now.Format(time.RFC3339))
- q.Set("step", p.Step)
+ q.Set("step", fmt.Sprintf("%ds", int(p.Step.Seconds())))
} else {
// Instant query: single point in time
u.Path = "/api/v1/query"
@@ -96,7 +107,7 @@ func (p *PrometheusSource) Fetch(ctx context.Context) ([]ImageResult, error) {
return nil, fmt.Errorf("decoding response: %w", err)
}
- if promResp.Status != "success" {
+ if promResp.Status != prometheusStatusSuccess {
return nil, fmt.Errorf("prometheus query failed with status: %s", promResp.Status)
}
@@ -108,9 +119,9 @@ func (p *PrometheusSource) Fetch(ctx context.Context) ([]ImageResult, error) {
}
var score int64
- if p.Lookback > 0 {
- // Range query: sum all values to get total usage score
- score = sumRangeValues(r.Values)
+ if p.QueryType == dropv1alpha1.QueryTypeRange {
+ // Range query: aggregate values according to configured method (nil = last value)
+ score = aggregateRangeValues(r.Values, p.AggregationMethod)
} else {
// Instant query: use single value
score = extractScore(r.Value)
@@ -146,9 +157,34 @@ func extractScore(value []interface{}) int64 {
return int64(score)
}
-// sumRangeValues sums all values from a query_range result to produce a total usage score.
-func sumRangeValues(values [][]interface{}) int64 {
+// aggregateRangeValues aggregates all values from a query_range result using the specified method.
+// When method is nil, the last data-point value is used directly (no aggregation).
+func aggregateRangeValues(values [][]interface{}, method *dropv1alpha1.AggregationMethod) int64 {
+ // nil = no aggregation, use last data-point value directly
+ if method == nil {
+ if len(values) == 0 {
+ return 0
+ }
+ lastPair := values[len(values)-1]
+ if len(lastPair) < 2 {
+ return 0
+ }
+ strVal, ok := lastPair[1].(string)
+ if !ok {
+ return 0
+ }
+ var v float64
+ if _, err := fmt.Sscanf(strVal, "%f", &v); err != nil {
+ return 0
+ }
+ return int64(v)
+ }
+
var total float64
+ var max float64
+ var count int64
+ maxSet := false
+
for _, pair := range values {
if len(pair) < 2 {
continue
@@ -158,9 +194,28 @@ func sumRangeValues(values [][]interface{}) int64 {
continue
}
var v float64
- if _, err := fmt.Sscanf(strVal, "%f", &v); err == nil {
- total += v
+ if _, err := fmt.Sscanf(strVal, "%f", &v); err != nil {
+ continue
+ }
+ total += v
+ count++
+ if !maxSet || v > max {
+ max = v
+ maxSet = true
+ }
+ }
+
+ switch *method {
+ case dropv1alpha1.AggregationCount:
+ return count
+ case dropv1alpha1.AggregationAvg:
+ if count == 0 {
+ return 0
}
+ return int64(total / float64(count))
+ case dropv1alpha1.AggregationMax:
+ return int64(max)
+ default: // AggregationSum
+ return int64(total)
}
- return int64(total)
}
diff --git a/internal/discovery/prometheus_test.go b/internal/discovery/prometheus_test.go
index 2110a02..e5157c5 100644
--- a/internal/discovery/prometheus_test.go
+++ b/internal/discovery/prometheus_test.go
@@ -6,9 +6,12 @@ import (
"net/http"
"net/http/httptest"
"testing"
+ "time"
+
+ dropv1alpha1 "github.com/corewire/drop/api/v1alpha1"
)
-func TestPrometheusSource_Fetch(t *testing.T) {
+func TestPrometheusSource_Fetch_Instant(t *testing.T) {
tests := []struct {
name string
response interface{}
@@ -20,7 +23,7 @@ func TestPrometheusSource_Fetch(t *testing.T) {
{
name: "valid response with image labels",
response: prometheusResponse{
- Status: "success",
+ Status: prometheusStatusSuccess,
Data: struct {
ResultType string `json:"resultType"`
Result []prometheusResult `json:"result"`
@@ -45,7 +48,7 @@ func TestPrometheusSource_Fetch(t *testing.T) {
{
name: "skips results without image label",
response: prometheusResponse{
- Status: "success",
+ Status: prometheusStatusSuccess,
Data: struct {
ResultType string `json:"resultType"`
Result []prometheusResult `json:"result"`
@@ -76,7 +79,7 @@ func TestPrometheusSource_Fetch(t *testing.T) {
{
name: "empty results",
response: prometheusResponse{
- Status: "success",
+ Status: prometheusStatusSuccess,
Data: struct {
ResultType string `json:"resultType"`
Result []prometheusResult `json:"result"`
@@ -94,7 +97,7 @@ func TestPrometheusSource_Fetch(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/api/v1/query" {
- t.Errorf("unexpected path: %s", r.URL.Path)
+ t.Errorf("unexpected path: %s, want /api/v1/query", r.URL.Path)
}
w.WriteHeader(tt.statusCode)
if err := json.NewEncoder(w).Encode(tt.response); err != nil {
@@ -103,7 +106,7 @@ func TestPrometheusSource_Fetch(t *testing.T) {
}))
defer server.Close()
- source := NewPrometheusSource(server.URL, "test_query", 0, "", server.Client())
+ source := NewPrometheusSource(server.URL, "test_query", dropv1alpha1.QueryTypeInstant, 0, nil, 0, server.Client())
results, err := source.Fetch(context.Background())
if tt.wantErr {
@@ -129,3 +132,210 @@ func TestPrometheusSource_Fetch(t *testing.T) {
})
}
}
+
+func TestPrometheusSource_Fetch_Range(t *testing.T) {
+ tests := []struct {
+ name string
+ aggregationMethod *dropv1alpha1.AggregationMethod
+ response prometheusResponse
+ wantCount int
+ wantFirst string
+ wantScore int64
+ }{
+ {
+ name: "nil aggregation (last value)",
+ aggregationMethod: nil,
+ response: prometheusResponse{
+ Status: prometheusStatusSuccess,
+ Data: struct {
+ ResultType string `json:"resultType"`
+ Result []prometheusResult `json:"result"`
+ }{
+ ResultType: "matrix",
+ Result: []prometheusResult{
+ {
+ Metric: map[string]string{"image": "nginx:1.25"},
+ Values: [][]interface{}{
+ {1234567890.0, "10"},
+ {1234567950.0, "20"},
+ {1234568010.0, "30"},
+ },
+ },
+ },
+ },
+ },
+ wantCount: 1,
+ wantFirst: "nginx:1.25",
+ wantScore: 30, // last data-point value
+ },
+ {
+ name: "sum aggregation",
+ aggregationMethod: aggregationMethodPtr(dropv1alpha1.AggregationSum),
+ response: prometheusResponse{
+ Status: prometheusStatusSuccess,
+ Data: struct {
+ ResultType string `json:"resultType"`
+ Result []prometheusResult `json:"result"`
+ }{
+ ResultType: "matrix",
+ Result: []prometheusResult{
+ {
+ Metric: map[string]string{"image": "nginx:1.25"},
+ Values: [][]interface{}{
+ {1234567890.0, "10"},
+ {1234567950.0, "20"},
+ {1234568010.0, "30"},
+ },
+ },
+ },
+ },
+ },
+ wantCount: 1,
+ wantFirst: "nginx:1.25",
+ wantScore: 60, // 10+20+30
+ },
+ {
+ name: "count aggregation",
+ aggregationMethod: aggregationMethodPtr(dropv1alpha1.AggregationCount),
+ response: prometheusResponse{
+ Status: prometheusStatusSuccess,
+ Data: struct {
+ ResultType string `json:"resultType"`
+ Result []prometheusResult `json:"result"`
+ }{
+ ResultType: "matrix",
+ Result: []prometheusResult{
+ {
+ Metric: map[string]string{"image": "nginx:1.25"},
+ Values: [][]interface{}{
+ {1234567890.0, "10"},
+ {1234567950.0, "20"},
+ {1234568010.0, "30"},
+ },
+ },
+ },
+ },
+ },
+ wantCount: 1,
+ wantFirst: "nginx:1.25",
+ wantScore: 3,
+ },
+ {
+ name: "avg aggregation",
+ aggregationMethod: aggregationMethodPtr(dropv1alpha1.AggregationAvg),
+ response: prometheusResponse{
+ Status: prometheusStatusSuccess,
+ Data: struct {
+ ResultType string `json:"resultType"`
+ Result []prometheusResult `json:"result"`
+ }{
+ ResultType: "matrix",
+ Result: []prometheusResult{
+ {
+ Metric: map[string]string{"image": "nginx:1.25"},
+ Values: [][]interface{}{
+ {1234567890.0, "10"},
+ {1234567950.0, "20"},
+ {1234568010.0, "30"},
+ },
+ },
+ },
+ },
+ },
+ wantCount: 1,
+ wantFirst: "nginx:1.25",
+ wantScore: 20, // (10+20+30)/3
+ },
+ {
+ name: "max aggregation",
+ aggregationMethod: aggregationMethodPtr(dropv1alpha1.AggregationMax),
+ response: prometheusResponse{
+ Status: prometheusStatusSuccess,
+ Data: struct {
+ ResultType string `json:"resultType"`
+ Result []prometheusResult `json:"result"`
+ }{
+ ResultType: "matrix",
+ Result: []prometheusResult{
+ {
+ Metric: map[string]string{"image": "nginx:1.25"},
+ Values: [][]interface{}{
+ {1234567890.0, "10"},
+ {1234567950.0, "20"},
+ {1234568010.0, "30"},
+ },
+ },
+ },
+ },
+ },
+ wantCount: 1,
+ wantFirst: "nginx:1.25",
+ wantScore: 30,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/api/v1/query_range" {
+ t.Errorf("unexpected path: %s, want /api/v1/query_range", r.URL.Path)
+ }
+ w.WriteHeader(http.StatusOK)
+ if err := json.NewEncoder(w).Encode(tt.response); err != nil {
+ t.Fatal(err)
+ }
+ }))
+ defer server.Close()
+
+ source := NewPrometheusSource(server.URL, "test_query", dropv1alpha1.QueryTypeRange, time.Hour, tt.aggregationMethod, 5*time.Minute, server.Client())
+ results, err := source.Fetch(context.Background())
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if len(results) != tt.wantCount {
+ t.Errorf("got %d results, want %d", len(results), tt.wantCount)
+ }
+
+ if tt.wantFirst != "" && len(results) > 0 {
+ if results[0].Image != tt.wantFirst {
+ t.Errorf("first image = %q, want %q", results[0].Image, tt.wantFirst)
+ }
+ if results[0].Score != tt.wantScore {
+ t.Errorf("score = %d, want %d", results[0].Score, tt.wantScore)
+ }
+ }
+ })
+ }
+}
+
+func TestPrometheusSource_DefaultQueryType(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/api/v1/query_range" {
+ t.Errorf("default queryType should use query_range, got path: %s", r.URL.Path)
+ }
+ resp := prometheusResponse{Status: prometheusStatusSuccess}
+ w.WriteHeader(http.StatusOK)
+ if err := json.NewEncoder(w).Encode(resp); err != nil {
+ t.Fatal(err)
+ }
+ }))
+ defer server.Close()
+
+ // Empty queryType should default to range
+ source := NewPrometheusSource(server.URL, "test_query", "", time.Hour, nil, 0, server.Client())
+ if source.QueryType != dropv1alpha1.QueryTypeRange {
+ t.Errorf("default QueryType = %q, want %q", source.QueryType, dropv1alpha1.QueryTypeRange)
+ }
+ if source.AggregationMethod != nil {
+ t.Errorf("default AggregationMethod = %v, want nil", source.AggregationMethod)
+ }
+ _, err := source.Fetch(context.Background())
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+}
+
+func aggregationMethodPtr(m dropv1alpha1.AggregationMethod) *dropv1alpha1.AggregationMethod {
+ return &m
+}
diff --git a/knowledge.yaml b/knowledge.yaml
index 0eaa619..a088e30 100644
--- a/knowledge.yaml
+++ b/knowledge.yaml
@@ -445,17 +445,27 @@ helperTypes:
type: string
required: true
doc: 'Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image)'
+ - name: QueryType
+ json: queryType
+ type: QueryType
+ required: false
+ default: range
+ doc: 'QueryType controls how the Prometheus query is executed. "range" uses /api/v1/query_range with a time window defined by lookback. "instant" uses /api/v1/query for a single point-in-time result. Default: "range".'
- name: Lookback
json: lookback
type: '*metav1.Duration'
required: false
- doc: 'Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h"'
+ doc: 'Lookback is the time window for range queries. When queryType is "range", the operator queries (start=now-lookback, end=now) and aggregates all returned values per image. The aggregation function is controlled by the aggregationMethod field. Required when queryType is "range". Ignored when queryType is "instant". Example: "168h" (7 days), "24h", "72h"'
+ - name: AggregationMethod
+ json: aggregationMethod
+ type: '*AggregationMethod'
+ required: false
+ doc: 'AggregationMethod controls how data points from a range query are combined into a single score. Only used when queryType is "range". Ignored for instant queries. When not set (nil), Drop uses the last data-point value directly — use this when your PromQL already contains aggregation functions (e.g., count_over_time, topk). Options: "sum", "count", "avg", "max"'
- name: Step
json: step
- type: string
+ type: '*metav1.Duration'
required: false
- default: 5m
- doc: 'Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m"'
+ doc: 'Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: 5m. Example: "1m", "15m"'
- name: RegistrySource
doc: RegistrySource defines OCI registry tag listing configuration for image discovery.
fields:
@@ -510,6 +520,8 @@ packages:
- internal/podbuilder
- path: internal/discovery
role: Package discovery implements image discovery from registries and Prometheus metrics.
+ imports:
+ - api/v1alpha1
- path: internal/metrics
role: Package metrics registers Prometheus metrics for the drop operator.
- path: internal/pacing
@@ -537,6 +549,15 @@ conventions:
- rule: Don't manually edit generated files — run make docs-gen
scope:
- code
+ - rule: Documentation must never contain unverified information — verify all examples against a real cluster before merging
+ scope:
+ - code
+ - rule: Always document which resources you looked at in which order (short summary + time spent + tokens consumed + context consumed)
+ scope:
+ - code
+ - rule: Always lint and fix linter issues locally before pushing code
+ scope:
+ - code
errors:
- reason: Cached
controller: CachedImage
@@ -762,8 +783,10 @@ samples: |
prometheus:
endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090"
query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)'
+ queryType: range
lookback: 24h
step: 5m
+ aggregationMethod: sum
syncInterval: 30s
maxImages: 10
---
diff --git a/llms-full.txt b/llms-full.txt
index 50e6d8c..b0ca6cc 100644
--- a/llms-full.txt
+++ b/llms-full.txt
@@ -181,8 +181,10 @@ PrometheusSource defines Prometheus query configuration for image discovery.
|-------|------|------|----------|---------|-------------|
| Endpoint | `endpoint` | `string` | ✓ | | Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). Example: "http://prometheus.monitoring.svc:9090", "https://mimir.example.com" |
| Query | `query` | `string` | ✓ | | Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) |
-| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" |
-| Step | `step` | `string` | — | `5m` | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m" |
+| QueryType | `queryType` | `QueryType` | — | `range` | QueryType controls how the Prometheus query is executed. "range" uses /api/v1/query_range with a time window defined by lookback. "instant" uses /api/v1/query for a single point-in-time result. Default: "range". |
+| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for range queries. When queryType is "range", the operator queries (start=now-lookback, end=now) and aggregates all returned values per image. The aggregation function is controlled by the aggregationMethod field. Required when queryType is "range". Ignored when queryType is "instant". Example: "168h" (7 days), "24h", "72h" |
+| AggregationMethod | `aggregationMethod` | `*AggregationMethod` | — | | AggregationMethod controls how data points from a range query are combined into a single score. Only used when queryType is "range". Ignored for instant queries. When not set (nil), Drop uses the last data-point value directly — use this when your PromQL already contains aggregation functions (e.g., count_over_time, topk). Options: "sum", "count", "avg", "max" |
+| Step | `step` | `*metav1.Duration` | — | | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: 5m. Example: "1m", "15m" |
### RegistrySource
@@ -330,8 +332,10 @@ spec:
prometheus:
endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090"
query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)'
+ queryType: range
lookback: 24h
step: 5m
+ aggregationMethod: sum
syncInterval: 30s
maxImages: 10
---
diff --git a/test/e2e/discovery-aggregation/01-discoverypolicies.yaml b/test/e2e/discovery-aggregation/01-discoverypolicies.yaml
new file mode 100644
index 0000000..52f9cf7
--- /dev/null
+++ b/test/e2e/discovery-aggregation/01-discoverypolicies.yaml
@@ -0,0 +1,108 @@
+# Four DiscoveryPolicies using queryType: range with different aggregationMethods,
+# plus one using queryType: instant.
+# All query the same seed metrics (container_cpu_usage_seconds_total in namespace aggregation-test).
+# Seed data: alpine has 3 pods (values 100, 200, 300), busybox has 1 pod (value 500).
+---
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-count
+spec:
+ sources:
+ - type: prometheus
+ prometheus:
+ endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090"
+ query: 'count(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)'
+ queryType: range
+ lookback: 1h
+ step: 5m
+ aggregationMethod: count
+ syncInterval: 30s
+ maxImages: 10
+---
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-avg
+spec:
+ sources:
+ - type: prometheus
+ prometheus:
+ endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090"
+ query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)'
+ queryType: range
+ lookback: 1h
+ step: 5m
+ aggregationMethod: avg
+ syncInterval: 30s
+ maxImages: 10
+---
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-max
+spec:
+ sources:
+ - type: prometheus
+ prometheus:
+ endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090"
+ query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)'
+ queryType: range
+ lookback: 1h
+ step: 5m
+ aggregationMethod: max
+ syncInterval: 30s
+ maxImages: 10
+---
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-sum
+spec:
+ sources:
+ - type: prometheus
+ prometheus:
+ endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090"
+ query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)'
+ queryType: range
+ lookback: 1h
+ step: 5m
+ aggregationMethod: sum
+ syncInterval: 30s
+ maxImages: 10
+---
+# queryType: range without aggregationMethod — field is nullable, omitting it means
+# Drop uses the last data-point value directly without aggregation.
+# Ideal for self-contained PromQL queries that already aggregate internally.
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-none
+spec:
+ sources:
+ - type: prometheus
+ prometheus:
+ endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090"
+ query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)'
+ queryType: range
+ lookback: 1h
+ step: 5m
+ # aggregationMethod intentionally omitted (nil) — uses last value directly
+ syncInterval: 30s
+ maxImages: 10
+---
+# queryType: instant — uses /api/v1/query for a single point-in-time result.
+# The returned value is used directly as the score without aggregation.
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-instant
+spec:
+ sources:
+ - type: prometheus
+ prometheus:
+ endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090"
+ query: 'count(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)'
+ queryType: instant
+ syncInterval: 30s
+ maxImages: 10
diff --git a/test/e2e/discovery-aggregation/02-assert-count.yaml b/test/e2e/discovery-aggregation/02-assert-count.yaml
new file mode 100644
index 0000000..ee5e76b
--- /dev/null
+++ b/test/e2e/discovery-aggregation/02-assert-count.yaml
@@ -0,0 +1,12 @@
+# Assert count aggregation: policy is Ready, both images discovered.
+# count() by (image) returns alpine=3, busybox=1 at each step.
+# aggregationMethod=count counts the number of data points (steps) per image.
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-count
+status:
+ (conditions[?type == 'Ready']):
+ - status: "True"
+ reason: Synced
+ imageCount: 2
diff --git a/test/e2e/discovery-aggregation/03-assert-avg.yaml b/test/e2e/discovery-aggregation/03-assert-avg.yaml
new file mode 100644
index 0000000..ae09c4b
--- /dev/null
+++ b/test/e2e/discovery-aggregation/03-assert-avg.yaml
@@ -0,0 +1,12 @@
+# Assert avg aggregation: policy is Ready, both images discovered.
+# sum() by (image) returns alpine=600, busybox=500 at each step.
+# aggregationMethod=avg averages the data-point values over the lookback window.
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-avg
+status:
+ (conditions[?type == 'Ready']):
+ - status: "True"
+ reason: Synced
+ imageCount: 2
diff --git a/test/e2e/discovery-aggregation/04-assert-max.yaml b/test/e2e/discovery-aggregation/04-assert-max.yaml
new file mode 100644
index 0000000..2d240ef
--- /dev/null
+++ b/test/e2e/discovery-aggregation/04-assert-max.yaml
@@ -0,0 +1,12 @@
+# Assert max aggregation: policy is Ready, both images discovered.
+# sum() by (image) returns alpine=600, busybox=500 at each step.
+# aggregationMethod=max takes the highest single data-point value.
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-max
+status:
+ (conditions[?type == 'Ready']):
+ - status: "True"
+ reason: Synced
+ imageCount: 2
diff --git a/test/e2e/discovery-aggregation/05-assert-sum.yaml b/test/e2e/discovery-aggregation/05-assert-sum.yaml
new file mode 100644
index 0000000..af43f08
--- /dev/null
+++ b/test/e2e/discovery-aggregation/05-assert-sum.yaml
@@ -0,0 +1,12 @@
+# Assert sum (default) aggregation: policy is Ready, both images discovered.
+# sum() by (image) returns alpine=600, busybox=500 at each step.
+# aggregationMethod=sum adds all data-point values over the lookback window.
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-sum
+status:
+ (conditions[?type == 'Ready']):
+ - status: "True"
+ reason: Synced
+ imageCount: 2
diff --git a/test/e2e/discovery-aggregation/06-assert-instant.yaml b/test/e2e/discovery-aggregation/06-assert-instant.yaml
new file mode 100644
index 0000000..2d42fc5
--- /dev/null
+++ b/test/e2e/discovery-aggregation/06-assert-instant.yaml
@@ -0,0 +1,11 @@
+# Assert instant query: policy is Ready, both images discovered.
+# queryType=instant uses /api/v1/query — the returned value is used directly as the score.
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-instant
+status:
+ (conditions[?type == 'Ready']):
+ - status: "True"
+ reason: Synced
+ imageCount: 2
diff --git a/test/e2e/discovery-aggregation/07-assert-none.yaml b/test/e2e/discovery-aggregation/07-assert-none.yaml
new file mode 100644
index 0000000..94e6b0a
--- /dev/null
+++ b/test/e2e/discovery-aggregation/07-assert-none.yaml
@@ -0,0 +1,11 @@
+# Assert none aggregation: policy is Ready, both images discovered.
+# aggregationMethod=none uses the last data-point value from the range query directly.
+apiVersion: drop.corewire.io/v1alpha1
+kind: DiscoveryPolicy
+metadata:
+ name: e2e-agg-none
+status:
+ (conditions[?type == 'Ready']):
+ - status: "True"
+ reason: Synced
+ imageCount: 2
diff --git a/test/e2e/discovery-aggregation/chainsaw-test.yaml b/test/e2e/discovery-aggregation/chainsaw-test.yaml
new file mode 100644
index 0000000..16a95b2
--- /dev/null
+++ b/test/e2e/discovery-aggregation/chainsaw-test.yaml
@@ -0,0 +1,108 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+ name: discovery-aggregation-methods
+spec:
+ description: |
+ Verify that DiscoveryPolicy aggregationMethod and queryType fields work correctly
+ against a real Prometheus endpoint. Seeds use container_cpu_usage_seconds_total with
+ two images (alpine: 3 pods with values 100/200/300, busybox: 1 pod with value 500).
+
+ Expected rankings per method (queryType: range):
+ count → alpine first (3 > 1)
+ avg → busybox first (500 > 200)
+ max → busybox first (500 > 300)
+ sum → alpine first (600 > 500)
+ none → uses last data-point value directly
+
+ queryType: instant uses /api/v1/query directly — no aggregation.
+ steps:
+ - name: Create DiscoveryPolicies with different aggregation methods and query types
+ try:
+ - apply:
+ file: 01-discoverypolicies.yaml
+ - name: Assert count aggregation discovers images (alpine ranked first)
+ try:
+ - assert:
+ timeout: 90s
+ file: 02-assert-count.yaml
+ - name: Assert avg aggregation discovers images (busybox ranked first)
+ try:
+ - assert:
+ timeout: 90s
+ file: 03-assert-avg.yaml
+ - name: Assert max aggregation discovers images (busybox ranked first)
+ try:
+ - assert:
+ timeout: 90s
+ file: 04-assert-max.yaml
+ - name: Assert sum aggregation discovers images (alpine ranked first, default)
+ try:
+ - assert:
+ timeout: 90s
+ file: 05-assert-sum.yaml
+ - name: Assert instant query discovers images
+ try:
+ - assert:
+ timeout: 90s
+ file: 06-assert-instant.yaml
+ - name: Assert none aggregation discovers images (last value used directly)
+ try:
+ - assert:
+ timeout: 90s
+ file: 07-assert-none.yaml
+ - name: Verify aggregation scores are populated
+ try:
+ - script:
+ timeout: 30s
+ content: |
+ # Verify aggregation outputs are populated.
+ # Score relationships can vary with the number of data points and values
+ # returned by Prometheus in the lookback window.
+ SUM_SCORE=$(kubectl get discoverypolicy e2e-agg-sum -o jsonpath='{.status.discoveredImages[0].score}')
+ AVG_SCORE=$(kubectl get discoverypolicy e2e-agg-avg -o jsonpath='{.status.discoveredImages[0].score}')
+ COUNT_SCORE=$(kubectl get discoverypolicy e2e-agg-count -o jsonpath='{.status.discoveredImages[0].score}')
+ MAX_SCORE=$(kubectl get discoverypolicy e2e-agg-max -o jsonpath='{.status.discoveredImages[0].score}')
+ INSTANT_SCORE=$(kubectl get discoverypolicy e2e-agg-instant -o jsonpath='{.status.discoveredImages[0].score}')
+ NONE_SCORE=$(kubectl get discoverypolicy e2e-agg-none -o jsonpath='{.status.discoveredImages[0].score}')
+
+ echo "Scores — sum:$SUM_SCORE avg:$AVG_SCORE count:$COUNT_SCORE max:$MAX_SCORE instant:$INSTANT_SCORE none:$NONE_SCORE"
+
+ if [ -z "$SUM_SCORE" ] || [ -z "$AVG_SCORE" ] || [ -z "$COUNT_SCORE" ] || [ -z "$MAX_SCORE" ] || [ -z "$INSTANT_SCORE" ] || [ -z "$NONE_SCORE" ]; then
+ echo "FAIL: expected non-empty scores for all methods"
+ exit 1
+ fi
+ echo "OK: all query types and aggregation methods produced non-empty scores"
+ - name: Cleanup
+ try:
+ - delete:
+ ref:
+ apiVersion: drop.corewire.io/v1alpha1
+ kind: DiscoveryPolicy
+ name: e2e-agg-count
+ - delete:
+ ref:
+ apiVersion: drop.corewire.io/v1alpha1
+ kind: DiscoveryPolicy
+ name: e2e-agg-avg
+ - delete:
+ ref:
+ apiVersion: drop.corewire.io/v1alpha1
+ kind: DiscoveryPolicy
+ name: e2e-agg-max
+ - delete:
+ ref:
+ apiVersion: drop.corewire.io/v1alpha1
+ kind: DiscoveryPolicy
+ name: e2e-agg-sum
+ - delete:
+ ref:
+ apiVersion: drop.corewire.io/v1alpha1
+ kind: DiscoveryPolicy
+ name: e2e-agg-instant
+ - delete:
+ ref:
+ apiVersion: drop.corewire.io/v1alpha1
+ kind: DiscoveryPolicy
+ name: e2e-agg-none