diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 2caad38..600fc22 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -34,6 +34,9 @@ make docs-gen # regenerate AI docs from source - Pod builder is a pure function in internal/podbuilder/ (no k8s client) - Pacing logic lives exclusively in internal/pacing/ - Don't manually edit generated files — run make docs-gen +- Documentation must never contain unverified information — verify all examples against a real cluster before merging +- Always document which resources you looked at in which order (short summary + time spent + tokens consumed + context consumed) +- Always lint and fix linter issues locally before pushing code ## Testing Patterns @@ -58,6 +61,7 @@ api/v1alpha1 — Package v1alpha1 contains API Schema definitions for the drop v internal/controller — Package controller implements Kubernetes reconcilers for the drop CRDs (one per Kind). imports: api/v1alpha1, internal/discovery, internal/metrics, internal/pacing, internal/podbuilder internal/discovery — Package discovery implements image discovery from registries and Prometheus metrics. + imports: api/v1alpha1 internal/metrics — Package metrics registers Prometheus metrics for the drop operator. internal/pacing — Package pacing implements the shared rate-limiting engine for image pull scheduling. imports: api/v1alpha1, internal/podbuilder diff --git a/api/v1alpha1/discoverypolicy_types.go b/api/v1alpha1/discoverypolicy_types.go index 9cdf22e..14b87fd 100644 --- a/api/v1alpha1/discoverypolicy_types.go +++ b/api/v1alpha1/discoverypolicy_types.go @@ -53,6 +53,40 @@ type DiscoverySource struct { SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"` } +// AggregationMethod defines how range query values are aggregated into a score. +// +kubebuilder:validation:Enum=sum;count;avg;max +type AggregationMethod string + +const ( + // AggregationSum adds all data-point values over the lookback window. + // Use when the query returns a gauge/counter and the total magnitude matters + // (e.g., total memory usage across the window). + AggregationSum AggregationMethod = "sum" + // AggregationCount counts the number of non-zero data points over the lookback window. + // Use when you want to rank by how frequently an image appears + // (e.g., number of sample intervals where the image was running). + AggregationCount AggregationMethod = "count" + // AggregationAvg computes the arithmetic mean of all data-point values. + // Use when you want the average magnitude regardless of how many samples exist. + AggregationAvg AggregationMethod = "avg" + // AggregationMax takes the highest single data-point value. + // Use when peak usage is more relevant than cumulative usage. + AggregationMax AggregationMethod = "max" +) + +// QueryType defines how the Prometheus query is executed. +// +kubebuilder:validation:Enum=range;instant +type QueryType string + +const ( + // QueryTypeRange uses /api/v1/query_range with a time window defined by lookback. + // Returns multiple data points which are aggregated using the aggregationMethod. + QueryTypeRange QueryType = "range" + // QueryTypeInstant uses /api/v1/query for a single point-in-time result. + // The returned value is used directly as the score. + QueryTypeInstant QueryType = "instant" +) + // PrometheusSource defines Prometheus query configuration for image discovery. type PrometheusSource struct { // Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). @@ -65,18 +99,32 @@ type PrometheusSource struct { // Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) // +kubebuilder:validation:MinLength=1 Query string `json:"query"` - // Lookback is the time window for aggregation. When set, the operator uses query_range - // (start=now-lookback, end=now) and sums all returned values per image to produce a score. - // When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. + // QueryType controls how the Prometheus query is executed. + // "range" uses /api/v1/query_range with a time window defined by lookback. + // "instant" uses /api/v1/query for a single point-in-time result. + // Default: "range". + // +kubebuilder:default="range" + // +optional + QueryType QueryType `json:"queryType,omitempty"` + // Lookback is the time window for range queries. When queryType is "range", + // the operator queries (start=now-lookback, end=now) and aggregates all returned values per image. + // The aggregation function is controlled by the aggregationMethod field. + // Required when queryType is "range". Ignored when queryType is "instant". // Example: "168h" (7 days), "24h", "72h" // +optional Lookback *metav1.Duration `json:"lookback,omitempty"` + // AggregationMethod controls how data points from a range query are combined into a single score. + // Only used when queryType is "range". Ignored for instant queries. + // When not set (nil), Drop uses the last data-point value directly — use this when your PromQL + // already contains aggregation functions (e.g., count_over_time, topk). + // Options: "sum", "count", "avg", "max" + // +optional + AggregationMethod *AggregationMethod `json:"aggregationMethod,omitempty"` // Step is the resolution step for range queries (only used when lookback is set). - // Smaller steps = more data points = more accurate sums but higher Prometheus load. - // Default: "5m". Example: "1m", "15m" - // +kubebuilder:default="5m" + // Smaller steps = more data points = more accurate aggregation but higher Prometheus load. + // Default: 5m. Example: "1m", "15m" // +optional - Step string `json:"step,omitempty"` + Step *metav1.Duration `json:"step,omitempty"` } // RegistrySource defines OCI registry tag listing configuration for image discovery. diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index e329027..eafb2e1 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -512,6 +512,16 @@ func (in *PrometheusSource) DeepCopyInto(out *PrometheusSource) { *out = new(metav1.Duration) **out = **in } + if in.AggregationMethod != nil { + in, out := &in.AggregationMethod, &out.AggregationMethod + *out = new(AggregationMethod) + **out = **in + } + if in.Step != nil { + in, out := &in.Step, &out.Step + *out = new(metav1.Duration) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PrometheusSource. diff --git a/config/crd/bases/drop.corewire.io_discoverypolicies.yaml b/config/crd/bases/drop.corewire.io_discoverypolicies.yaml index d85dab4..a1183f2 100644 --- a/config/crd/bases/drop.corewire.io_discoverypolicies.yaml +++ b/config/crd/bases/drop.corewire.io_discoverypolicies.yaml @@ -86,6 +86,19 @@ spec: prometheus: description: Prometheus contains the configuration when type=prometheus. properties: + aggregationMethod: + description: |- + AggregationMethod controls how data points from a range query are combined into a single score. + Only used when queryType is "range". Ignored for instant queries. + When not set (nil), Drop uses the last data-point value directly — use this when your PromQL + already contains aggregation functions (e.g., count_over_time, topk). + Options: "sum", "count", "avg", "max" + enum: + - sum + - count + - avg + - max + type: string endpoint: description: |- Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). @@ -94,9 +107,10 @@ spec: type: string lookback: description: |- - Lookback is the time window for aggregation. When set, the operator uses query_range - (start=now-lookback, end=now) and sums all returned values per image to produce a score. - When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. + Lookback is the time window for range queries. When queryType is "range", + the operator queries (start=now-lookback, end=now) and aggregates all returned values per image. + The aggregation function is controlled by the aggregationMethod field. + Required when queryType is "range". Ignored when queryType is "instant". Example: "168h" (7 days), "24h", "72h" type: string query: @@ -107,12 +121,22 @@ spec: Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) minLength: 1 type: string + queryType: + default: range + description: |- + QueryType controls how the Prometheus query is executed. + "range" uses /api/v1/query_range with a time window defined by lookback. + "instant" uses /api/v1/query for a single point-in-time result. + Default: "range". + enum: + - range + - instant + type: string step: - default: 5m description: |- Step is the resolution step for range queries (only used when lookback is set). - Smaller steps = more data points = more accurate sums but higher Prometheus load. - Default: "5m". Example: "1m", "15m" + Smaller steps = more data points = more accurate aggregation but higher Prometheus load. + Default: 5m. Example: "1m", "15m" type: string required: - endpoint diff --git a/docs/content/docs/discovery.md b/docs/content/docs/discovery.md index 28e3aa7..8ee8440 100644 --- a/docs/content/docs/discovery.md +++ b/docs/content/docs/discovery.md @@ -66,7 +66,27 @@ count(container_memory_working_set_bytes{ Hand-maintained image lists do not keep up in environments where automation (for example Renovate) ships new image versions every day. A practical pattern is to rank images by observed CI usage over a rolling window. -The `lookback` field tells Drop to use Prometheus `query_range` API over that time window and sum all returned values per image to produce a total usage score: +The `queryType` field controls whether Drop sends an instant or range query (default: `range`). When set to `range`, the `lookback` field defines the time window and `aggregationMethod` controls how the returned data points are combined into a single score per image. + +#### Query Types + +{{< figure src="/drop/images/query-type-range.svg" alt="Range query: multiple data points over a lookback window" >}} + +{{< figure src="/drop/images/query-type-instant.svg" alt="Instant query: single point-in-time value used as score" >}} + +#### Aggregation Methods + +When using `queryType: range`, the `aggregationMethod` field determines how the returned data points are reduced into a single score: + +{{< figure src="/drop/images/aggregation-methods.svg" alt="Aggregation methods: nil (last value), sum, count, avg, max" >}} + +| Method | Behavior | Use when | +|--------|----------|----------| +| *(not set)* | Uses the last data-point value directly | Your PromQL already aggregates (e.g. `count_over_time`, `topk`) | +| `sum` | Adds all data-point values over the window | Total cumulative usage matters (e.g. total memory consumed) | +| `count` | Counts the number of data points returned | You want to rank by how frequently an image appears | +| `avg` | Arithmetic mean of all data-point values | Average magnitude matters regardless of sample count | +| `max` | Highest single data-point value | Peak usage is more relevant than cumulative | ```yaml apiVersion: drop.corewire.io/v1alpha1 @@ -80,8 +100,10 @@ spec: - type: prometheus prometheus: endpoint: https://mimir.example.com + queryType: range # default — use query_range API lookback: 168h # 7 days step: 5m + aggregationMethod: sum # rank by total usage over 7 days (omit to use last value directly) query: | count( container_memory_working_set_bytes{ @@ -95,7 +117,9 @@ Use this when you want DiscoveryPolicy to continuously follow what your GitLab r #### Field-by-field explanation -- `lookback: 168h` — Drop uses `query_range` with start=now-7d, end=now, and sums all returned values per image to rank by total usage over the window. +- `queryType: range` — tells Drop to use the Prometheus `query_range` API. This is the default. Set to `instant` for a single point-in-time query. +- `lookback: 168h` — defines the time window for range queries (start=now-7d, end=now). Required when `queryType` is `range`. +- `aggregationMethod: sum` — sums all data-point values to rank by total usage. When omitted (nil), the last value is used directly — ideal for self-contained PromQL queries. Other options: `count` to rank by number of appearances, `avg` for average magnitude, or `max` for peak value. - `step: 5m` — resolution step for the range query (controls how many data points Prometheus returns). - `count(...) by (image)` — counts the number of running containers per image to rank by popularity. - `container_memory_working_set_bytes{...}` — source metric used to observe running containers. @@ -108,9 +132,15 @@ Use this when you want DiscoveryPolicy to continuously follow what your GitLab r For each unique `image` label, Drop uses the Prometheus query result value as the score. -When `lookback` is not set (the default), Drop sends an instant query (`/api/v1/query`) and uses the returned value directly. When `lookback` is set (e.g. `lookback: 168h`), Drop uses a range query (`/api/v1/query_range`) over that window and **sums all returned values** to produce the score. This means images that appear more frequently over the window get a higher score. +When `queryType` is `range` (the default), Drop uses a range query (`/api/v1/query_range`) over the `lookback` window and aggregates data points using the `aggregationMethod`. When `queryType` is `instant`, Drop sends an instant query (`/api/v1/query`) and uses the returned value directly: + +- *(not set)*: uses the last data-point value — ideal when your PromQL already contains aggregation functions like `count_over_time` or `topk` +- `sum`: adds all data-point values — images with higher cumulative usage score higher +- `count`: counts the number of data points — images that appear more frequently score higher +- `avg`: averages data-point values — images with higher average value score higher +- `max`: takes the peak value — images with the highest single observation score higher -The example above uses `lookback: 168h` so Drop handles the 7-day windowing via the API — no need to embed `[7d]` in PromQL. +The example above uses `queryType: range` with `lookback: 168h` so Drop handles the 7-day windowing via the API — no need to embed `[7d]` in PromQL. If Prometheus returns: @@ -156,6 +186,7 @@ spec: - type: prometheus prometheus: endpoint: https://mimir.example.com + queryType: instant query: | count(container_memory_working_set_bytes{ container!="", container!="POD", diff --git a/docs/content/docs/reference/_generated_architecture.md b/docs/content/docs/reference/_generated_architecture.md index 0fe9d6b..1abb6ac 100644 --- a/docs/content/docs/reference/_generated_architecture.md +++ b/docs/content/docs/reference/_generated_architecture.md @@ -30,6 +30,7 @@ graph LR internal/controller --> internal/metrics internal/controller --> internal/pacing internal/controller --> internal/podbuilder + internal/discovery --> api/v1alpha1 internal/pacing --> api/v1alpha1 internal/pacing --> internal/podbuilder internal/podbuilder --> api/v1alpha1 diff --git a/docs/content/docs/reference/_generated_crds.md b/docs/content/docs/reference/_generated_crds.md index 3779600..1d72338 100644 --- a/docs/content/docs/reference/_generated_crds.md +++ b/docs/content/docs/reference/_generated_crds.md @@ -207,8 +207,10 @@ PrometheusSource defines Prometheus query configuration for image discovery. |-------|------|----------|---------|-------------| | `endpoint` | `string` | Yes | — | Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). Example: "http://prometheus.monitoring.svc:9090", "https://mimir.example.com" | | `query` | `string` | Yes | — | Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) | -| `lookback` | `*metav1.Duration` | No | — | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" | -| `step` | `string` | No | 5m | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m" | +| `queryType` | `QueryType` | No | range | QueryType controls how the Prometheus query is executed. "range" uses /api/v1/query_range with a time window defined by lookback. "instant" uses /api/v1/query for a single point-in-time result. Default: "range". | +| `lookback` | `*metav1.Duration` | No | — | Lookback is the time window for range queries. When queryType is "range", the operator queries (start=now-lookback, end=now) and aggregates all returned values per image. The aggregation function is controlled by the aggregationMethod field. Required when queryType is "range". Ignored when queryType is "instant". Example: "168h" (7 days), "24h", "72h" | +| `aggregationMethod` | `*AggregationMethod` | No | — | AggregationMethod controls how data points from a range query are combined into a single score. Only used when queryType is "range". Ignored for instant queries. When not set (nil), Drop uses the last data-point value directly — use this when your PromQL already contains aggregation functions (e.g., count_over_time, topk). Options: "sum", "count", "avg", "max" | +| `step` | `*metav1.Duration` | No | — | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: 5m. Example: "1m", "15m" | ### RegistrySource diff --git a/docs/go.mod b/docs/go.mod index cc0eced..a8b9b26 100644 --- a/docs/go.mod +++ b/docs/go.mod @@ -1,5 +1,3 @@ module github.com/corewire/drop/docs go 1.26.0 - -require github.com/imfing/hextra v0.12.3 // indirect diff --git a/docs/go.sum b/docs/go.sum index afa8680..e69de29 100644 --- a/docs/go.sum +++ b/docs/go.sum @@ -1,2 +0,0 @@ -github.com/imfing/hextra v0.12.3 h1:DZHY2rUWYteyzjlHi9r4n7Bb5e2Q+6LXe4C1Dqn0ZjM= -github.com/imfing/hextra v0.12.3/go.mod h1:vi+yhpq8YPp/aghvJlNKVnJKcPJ/VyAEcfC1BSV9ARo= diff --git a/docs/static/images/aggregation-methods.svg b/docs/static/images/aggregation-methods.svg new file mode 100644 index 0000000..edba23b --- /dev/null +++ b/docs/static/images/aggregation-methods.svg @@ -0,0 +1,328 @@ + + + + + + Aggregation Methods + How range query data points are combined into a single score (example values: 2, 5, 3, 7, 4, 6) + + + + not set (nil) — last value used directly + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2 + 5 + 3 + 7 + 4 + + + + + + + + + + + + 6 + + + + score = 6 + + + + + sum — total of all values + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2 + 5 + 3 + 7 + 4 + 6 + + + + + score = 27 (2+5+3+7+4+6) + + + + + count — number of data points + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + score = 6 (6 data points) + + + + + avg — arithmetic mean + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2 + 5 + 3 + 7 + 4 + 6 + + + + + 4.5 + + + + score = 4.5 (27 ÷ 6) + + + + + max — highest single value + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2 + 5 + 3 + 4 + 6 + + + + + + 7 + + + + + + + score = 7 (peak value) + + + + + + Summary — same data, different scores + Input: 6 data points with values [2, 5, 3, 7, 4, 6] from a range query over 30m (step=5m) + + + + not set (nil): score = 6 — uses last value directly (PromQL handles aggregation) + + + + sum: score = 27 — total accumulated value (2+5+3+7+4+6) + + + + count: score = 6 — number of data points returned by Prometheus + + + + avg: score = 4.5 — arithmetic mean (27 ÷ 6) + + + + max: score = 7 — highest single observation in the window + + + diff --git a/docs/static/images/query-type-instant.svg b/docs/static/images/query-type-instant.svg new file mode 100644 index 0000000..99d00c2 --- /dev/null +++ b/docs/static/images/query-type-instant.svg @@ -0,0 +1,95 @@ + + + + + + queryType: instant + Uses /api/v1/query — returns a single point-in-time value + + + + + + + metric value + + + 0 + 50 + 100 + 150 + 200 + + + + + + + + + + + + ignored by instant query + + + + + + + + + + + + + + + + + + 47 + 70 + 105 + 88 + 140 + 128 + 158 + 117 + 146 + + + + + now + + + + + + + + + + + + + 134 + + + + score = 134 + (single instant value) + + + + + + + time + query time + + + Example: count(container_memory_working_set_bytes{namespace="build"}) by (image) → 134 running containers + diff --git a/docs/static/images/query-type-range.svg b/docs/static/images/query-type-range.svg new file mode 100644 index 0000000..033fef1 --- /dev/null +++ b/docs/static/images/query-type-range.svg @@ -0,0 +1,122 @@ + + + + + + queryType: range + Uses /api/v1/query_range — returns multiple data points over a time window + + + + + + + metric value + + + 0 + 50 + 100 + 150 + 200 + + + + + + + + + + + + + + + + + + + + + + + + + + t₁ + t₂ + t₃ + t₄ + t₅ + t₆ + t₇ + t₈ + t₉ + t₁₀ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 47 + 70 + 105 + 88 + 140 + 128 + 158 + 117 + 146 + 134 + + + + + + + ← lookback window (e.g. 168h) → + each vertical line = one step interval (e.g. 5m) + + + + all 10 values → score + + + + Example: sum → 47+70+105+88+140+128+158+117+146+134 = 1133 + diff --git a/docs/static/llms-full.txt b/docs/static/llms-full.txt index 50e6d8c..b0ca6cc 100644 --- a/docs/static/llms-full.txt +++ b/docs/static/llms-full.txt @@ -181,8 +181,10 @@ PrometheusSource defines Prometheus query configuration for image discovery. |-------|------|------|----------|---------|-------------| | Endpoint | `endpoint` | `string` | ✓ | | Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). Example: "http://prometheus.monitoring.svc:9090", "https://mimir.example.com" | | Query | `query` | `string` | ✓ | | Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) | -| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" | -| Step | `step` | `string` | — | `5m` | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m" | +| QueryType | `queryType` | `QueryType` | — | `range` | QueryType controls how the Prometheus query is executed. "range" uses /api/v1/query_range with a time window defined by lookback. "instant" uses /api/v1/query for a single point-in-time result. Default: "range". | +| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for range queries. When queryType is "range", the operator queries (start=now-lookback, end=now) and aggregates all returned values per image. The aggregation function is controlled by the aggregationMethod field. Required when queryType is "range". Ignored when queryType is "instant". Example: "168h" (7 days), "24h", "72h" | +| AggregationMethod | `aggregationMethod` | `*AggregationMethod` | — | | AggregationMethod controls how data points from a range query are combined into a single score. Only used when queryType is "range". Ignored for instant queries. When not set (nil), Drop uses the last data-point value directly — use this when your PromQL already contains aggregation functions (e.g., count_over_time, topk). Options: "sum", "count", "avg", "max" | +| Step | `step` | `*metav1.Duration` | — | | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: 5m. Example: "1m", "15m" | ### RegistrySource @@ -330,8 +332,10 @@ spec: prometheus: endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)' + queryType: range lookback: 24h step: 5m + aggregationMethod: sum syncInterval: 30s maxImages: 10 --- diff --git a/go.sum b/go.sum index 06ca73e..760283c 100644 --- a/go.sum +++ b/go.sum @@ -66,8 +66,6 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 h1:EwtI+Al+DeppwYX2oXJCETMO23COyaKGP6fHVpkpWpg= github.com/google/pprof v0.0.0-20260402051712-545e8a4df936/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -107,14 +105,8 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.27.4 h1:fcEcQW/A++6aZAZQNUmNjvA9PSOzefMJBerHJ4t8v8Y= -github.com/onsi/ginkgo/v2 v2.27.4/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= github.com/onsi/ginkgo/v2 v2.29.0 h1:rfh+ZFjgJhYWRoIqVf3Uwx/W20yLrcrE2h2GmYVRaag= github.com/onsi/ginkgo/v2 v2.29.0/go.mod h1:+aXOY+vzZ5mu2iI2HpTZUPmM//oQfsNFX6gU9kNcA44= -github.com/onsi/gomega v1.39.0 h1:y2ROC3hKFmQZJNFeGAMeHZKkjBL65mIZcvrLQBF9k6Q= -github.com/onsi/gomega v1.39.0/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4= -github.com/onsi/gomega v1.40.0 h1:Vtol0e1MghCD2ZVIilPDIg44XSL9l2QAn8ZNaljWcJc= -github.com/onsi/gomega v1.40.0/go.mod h1:M/Uqpu/8qTjtzCLUA2zJHX9Iilrau25x1PdoSRbWh5A= github.com/onsi/gomega v1.41.0 h1:OwKp4pXNgVxf6sCplzYo794OFNuoL2q2SBMU5NSWOjA= github.com/onsi/gomega v1.41.0/go.mod h1:M/Uqpu/8qTjtzCLUA2zJHX9Iilrau25x1PdoSRbWh5A= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -192,36 +184,22 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0= golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU= -golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= -golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= -golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= -golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= -golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= -golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= -golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= -golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= -golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= -golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= -golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= -golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= -golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= -golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= diff --git a/hack/dev-samples.yaml b/hack/dev-samples.yaml index 61b9d21..767b904 100644 --- a/hack/dev-samples.yaml +++ b/hack/dev-samples.yaml @@ -81,8 +81,10 @@ spec: prometheus: endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)' + queryType: range lookback: 24h step: 5m + aggregationMethod: sum syncInterval: 30s maxImages: 10 --- diff --git a/hack/e2e-infra/prometheus-config.yaml b/hack/e2e-infra/prometheus-config.yaml index f731502..e483195 100644 --- a/hack/e2e-infra/prometheus-config.yaml +++ b/hack/e2e-infra/prometheus-config.yaml @@ -64,3 +64,43 @@ data: namespace: "production" pod: "myapp-xyz" expr: "209715200" + # Metrics for aggregation method e2e tests. + # Two images with multiple pods each so count() and sum() produce + # different rankings: + # alpine → 3 pods, values 100+200+300 → sum=600, count=3, avg=200, max=300 + # busybox → 1 pod, value 500 → sum=500, count=1, avg=500, max=500 + # With count(), alpine ranks higher (3 > 1). + # With sum(), alpine still ranks higher (600 > 500). + # With avg(), busybox ranks higher (500 > 200). + # With max(), busybox ranks higher (500 > 300). + - name: seed_aggregation_metrics + interval: 10s + rules: + - record: container_cpu_usage_seconds_total + labels: + image: "docker.io/library/alpine:3.19" + container: "worker" + namespace: "aggregation-test" + pod: "worker-aaa" + expr: "100" + - record: container_cpu_usage_seconds_total + labels: + image: "docker.io/library/alpine:3.19" + container: "worker" + namespace: "aggregation-test" + pod: "worker-bbb" + expr: "200" + - record: container_cpu_usage_seconds_total + labels: + image: "docker.io/library/alpine:3.19" + container: "worker" + namespace: "aggregation-test" + pod: "worker-ccc" + expr: "300" + - record: container_cpu_usage_seconds_total + labels: + image: "docker.io/library/busybox:1.36" + container: "init" + namespace: "aggregation-test" + pod: "init-ddd" + expr: "500" diff --git a/hack/gen-ai-docs/config.go b/hack/gen-ai-docs/config.go index 325bb60..979b8ba 100644 --- a/hack/gen-ai-docs/config.go +++ b/hack/gen-ai-docs/config.go @@ -33,6 +33,9 @@ func conventions() []Convention { {Rule: "Pod builder is a pure function in internal/podbuilder/ (no k8s client)", Scope: []string{"code"}}, {Rule: "Pacing logic lives exclusively in internal/pacing/", Scope: []string{"code"}}, {Rule: "Don't manually edit generated files — run make docs-gen", Scope: []string{"code"}}, + {Rule: "Documentation must never contain unverified information — verify all examples against a real cluster before merging", Scope: []string{"code"}}, + {Rule: "Always document which resources you looked at in which order (short summary + time spent + tokens consumed + context consumed)", Scope: []string{"code"}}, + {Rule: "Always lint and fix linter issues locally before pushing code", Scope: []string{"code"}}, } } diff --git a/internal/controller/discoverypolicy_controller.go b/internal/controller/discoverypolicy_controller.go index 016b34b..c801165 100644 --- a/internal/controller/discoverypolicy_controller.go +++ b/internal/controller/discoverypolicy_controller.go @@ -246,7 +246,11 @@ func (r *DiscoveryPolicyReconciler) buildSource(ctx context.Context, src dropv1a if src.Prometheus.Lookback != nil { lookback = src.Prometheus.Lookback.Duration } - return discovery.NewPrometheusSource(src.Prometheus.Endpoint, src.Prometheus.Query, lookback, src.Prometheus.Step, httpClient), nil + var step time.Duration + if src.Prometheus.Step != nil { + step = src.Prometheus.Step.Duration + } + return discovery.NewPrometheusSource(src.Prometheus.Endpoint, src.Prometheus.Query, src.Prometheus.QueryType, lookback, src.Prometheus.AggregationMethod, step, httpClient), nil case "registry": if src.Registry == nil { return nil, fmt.Errorf("registry config is required when type=registry") diff --git a/internal/discovery/prometheus.go b/internal/discovery/prometheus.go index c3b4a31..94423f8 100644 --- a/internal/discovery/prometheus.go +++ b/internal/discovery/prometheus.go @@ -9,31 +9,42 @@ import ( "net/url" "sort" "time" + + dropv1alpha1 "github.com/corewire/drop/api/v1alpha1" ) +const prometheusStatusSuccess = "success" + // PrometheusSource queries Prometheus for image references. type PrometheusSource struct { - Endpoint string - Query string - Lookback time.Duration // 0 = instant query; >0 = query_range - Step string // resolution step for range queries (default "5m") - HTTPClient *http.Client + Endpoint string + Query string + QueryType dropv1alpha1.QueryType // range or instant + Lookback time.Duration // time window for range queries + AggregationMethod *dropv1alpha1.AggregationMethod // nil = use last value; sum, count, avg, max + Step time.Duration // resolution step for range queries (default 5m) + HTTPClient *http.Client } // NewPrometheusSource creates a new Prometheus discovery source. -func NewPrometheusSource(endpoint, query string, lookback time.Duration, step string, httpClient *http.Client) *PrometheusSource { +func NewPrometheusSource(endpoint, query string, queryType dropv1alpha1.QueryType, lookback time.Duration, aggregationMethod *dropv1alpha1.AggregationMethod, step time.Duration, httpClient *http.Client) *PrometheusSource { if httpClient == nil { httpClient = &http.Client{Timeout: 30 * time.Second} } - if step == "" { - step = "5m" + if step == 0 { + step = 5 * time.Minute + } + if queryType == "" { + queryType = dropv1alpha1.QueryTypeRange } return &PrometheusSource{ - Endpoint: endpoint, - Query: query, - Lookback: lookback, - Step: step, - HTTPClient: httpClient, + Endpoint: endpoint, + Query: query, + QueryType: queryType, + Lookback: lookback, + AggregationMethod: aggregationMethod, + Step: step, + HTTPClient: httpClient, } } @@ -62,13 +73,13 @@ func (p *PrometheusSource) Fetch(ctx context.Context) ([]ImageResult, error) { q := u.Query() q.Set("query", p.Query) - if p.Lookback > 0 { + if p.QueryType == dropv1alpha1.QueryTypeRange { // Range query: aggregate over time window u.Path = "/api/v1/query_range" now := time.Now().UTC() q.Set("start", now.Add(-p.Lookback).Format(time.RFC3339)) q.Set("end", now.Format(time.RFC3339)) - q.Set("step", p.Step) + q.Set("step", fmt.Sprintf("%ds", int(p.Step.Seconds()))) } else { // Instant query: single point in time u.Path = "/api/v1/query" @@ -96,7 +107,7 @@ func (p *PrometheusSource) Fetch(ctx context.Context) ([]ImageResult, error) { return nil, fmt.Errorf("decoding response: %w", err) } - if promResp.Status != "success" { + if promResp.Status != prometheusStatusSuccess { return nil, fmt.Errorf("prometheus query failed with status: %s", promResp.Status) } @@ -108,9 +119,9 @@ func (p *PrometheusSource) Fetch(ctx context.Context) ([]ImageResult, error) { } var score int64 - if p.Lookback > 0 { - // Range query: sum all values to get total usage score - score = sumRangeValues(r.Values) + if p.QueryType == dropv1alpha1.QueryTypeRange { + // Range query: aggregate values according to configured method (nil = last value) + score = aggregateRangeValues(r.Values, p.AggregationMethod) } else { // Instant query: use single value score = extractScore(r.Value) @@ -146,9 +157,34 @@ func extractScore(value []interface{}) int64 { return int64(score) } -// sumRangeValues sums all values from a query_range result to produce a total usage score. -func sumRangeValues(values [][]interface{}) int64 { +// aggregateRangeValues aggregates all values from a query_range result using the specified method. +// When method is nil, the last data-point value is used directly (no aggregation). +func aggregateRangeValues(values [][]interface{}, method *dropv1alpha1.AggregationMethod) int64 { + // nil = no aggregation, use last data-point value directly + if method == nil { + if len(values) == 0 { + return 0 + } + lastPair := values[len(values)-1] + if len(lastPair) < 2 { + return 0 + } + strVal, ok := lastPair[1].(string) + if !ok { + return 0 + } + var v float64 + if _, err := fmt.Sscanf(strVal, "%f", &v); err != nil { + return 0 + } + return int64(v) + } + var total float64 + var max float64 + var count int64 + maxSet := false + for _, pair := range values { if len(pair) < 2 { continue @@ -158,9 +194,28 @@ func sumRangeValues(values [][]interface{}) int64 { continue } var v float64 - if _, err := fmt.Sscanf(strVal, "%f", &v); err == nil { - total += v + if _, err := fmt.Sscanf(strVal, "%f", &v); err != nil { + continue + } + total += v + count++ + if !maxSet || v > max { + max = v + maxSet = true + } + } + + switch *method { + case dropv1alpha1.AggregationCount: + return count + case dropv1alpha1.AggregationAvg: + if count == 0 { + return 0 } + return int64(total / float64(count)) + case dropv1alpha1.AggregationMax: + return int64(max) + default: // AggregationSum + return int64(total) } - return int64(total) } diff --git a/internal/discovery/prometheus_test.go b/internal/discovery/prometheus_test.go index 2110a02..e5157c5 100644 --- a/internal/discovery/prometheus_test.go +++ b/internal/discovery/prometheus_test.go @@ -6,9 +6,12 @@ import ( "net/http" "net/http/httptest" "testing" + "time" + + dropv1alpha1 "github.com/corewire/drop/api/v1alpha1" ) -func TestPrometheusSource_Fetch(t *testing.T) { +func TestPrometheusSource_Fetch_Instant(t *testing.T) { tests := []struct { name string response interface{} @@ -20,7 +23,7 @@ func TestPrometheusSource_Fetch(t *testing.T) { { name: "valid response with image labels", response: prometheusResponse{ - Status: "success", + Status: prometheusStatusSuccess, Data: struct { ResultType string `json:"resultType"` Result []prometheusResult `json:"result"` @@ -45,7 +48,7 @@ func TestPrometheusSource_Fetch(t *testing.T) { { name: "skips results without image label", response: prometheusResponse{ - Status: "success", + Status: prometheusStatusSuccess, Data: struct { ResultType string `json:"resultType"` Result []prometheusResult `json:"result"` @@ -76,7 +79,7 @@ func TestPrometheusSource_Fetch(t *testing.T) { { name: "empty results", response: prometheusResponse{ - Status: "success", + Status: prometheusStatusSuccess, Data: struct { ResultType string `json:"resultType"` Result []prometheusResult `json:"result"` @@ -94,7 +97,7 @@ func TestPrometheusSource_Fetch(t *testing.T) { t.Run(tt.name, func(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { if r.URL.Path != "/api/v1/query" { - t.Errorf("unexpected path: %s", r.URL.Path) + t.Errorf("unexpected path: %s, want /api/v1/query", r.URL.Path) } w.WriteHeader(tt.statusCode) if err := json.NewEncoder(w).Encode(tt.response); err != nil { @@ -103,7 +106,7 @@ func TestPrometheusSource_Fetch(t *testing.T) { })) defer server.Close() - source := NewPrometheusSource(server.URL, "test_query", 0, "", server.Client()) + source := NewPrometheusSource(server.URL, "test_query", dropv1alpha1.QueryTypeInstant, 0, nil, 0, server.Client()) results, err := source.Fetch(context.Background()) if tt.wantErr { @@ -129,3 +132,210 @@ func TestPrometheusSource_Fetch(t *testing.T) { }) } } + +func TestPrometheusSource_Fetch_Range(t *testing.T) { + tests := []struct { + name string + aggregationMethod *dropv1alpha1.AggregationMethod + response prometheusResponse + wantCount int + wantFirst string + wantScore int64 + }{ + { + name: "nil aggregation (last value)", + aggregationMethod: nil, + response: prometheusResponse{ + Status: prometheusStatusSuccess, + Data: struct { + ResultType string `json:"resultType"` + Result []prometheusResult `json:"result"` + }{ + ResultType: "matrix", + Result: []prometheusResult{ + { + Metric: map[string]string{"image": "nginx:1.25"}, + Values: [][]interface{}{ + {1234567890.0, "10"}, + {1234567950.0, "20"}, + {1234568010.0, "30"}, + }, + }, + }, + }, + }, + wantCount: 1, + wantFirst: "nginx:1.25", + wantScore: 30, // last data-point value + }, + { + name: "sum aggregation", + aggregationMethod: aggregationMethodPtr(dropv1alpha1.AggregationSum), + response: prometheusResponse{ + Status: prometheusStatusSuccess, + Data: struct { + ResultType string `json:"resultType"` + Result []prometheusResult `json:"result"` + }{ + ResultType: "matrix", + Result: []prometheusResult{ + { + Metric: map[string]string{"image": "nginx:1.25"}, + Values: [][]interface{}{ + {1234567890.0, "10"}, + {1234567950.0, "20"}, + {1234568010.0, "30"}, + }, + }, + }, + }, + }, + wantCount: 1, + wantFirst: "nginx:1.25", + wantScore: 60, // 10+20+30 + }, + { + name: "count aggregation", + aggregationMethod: aggregationMethodPtr(dropv1alpha1.AggregationCount), + response: prometheusResponse{ + Status: prometheusStatusSuccess, + Data: struct { + ResultType string `json:"resultType"` + Result []prometheusResult `json:"result"` + }{ + ResultType: "matrix", + Result: []prometheusResult{ + { + Metric: map[string]string{"image": "nginx:1.25"}, + Values: [][]interface{}{ + {1234567890.0, "10"}, + {1234567950.0, "20"}, + {1234568010.0, "30"}, + }, + }, + }, + }, + }, + wantCount: 1, + wantFirst: "nginx:1.25", + wantScore: 3, + }, + { + name: "avg aggregation", + aggregationMethod: aggregationMethodPtr(dropv1alpha1.AggregationAvg), + response: prometheusResponse{ + Status: prometheusStatusSuccess, + Data: struct { + ResultType string `json:"resultType"` + Result []prometheusResult `json:"result"` + }{ + ResultType: "matrix", + Result: []prometheusResult{ + { + Metric: map[string]string{"image": "nginx:1.25"}, + Values: [][]interface{}{ + {1234567890.0, "10"}, + {1234567950.0, "20"}, + {1234568010.0, "30"}, + }, + }, + }, + }, + }, + wantCount: 1, + wantFirst: "nginx:1.25", + wantScore: 20, // (10+20+30)/3 + }, + { + name: "max aggregation", + aggregationMethod: aggregationMethodPtr(dropv1alpha1.AggregationMax), + response: prometheusResponse{ + Status: prometheusStatusSuccess, + Data: struct { + ResultType string `json:"resultType"` + Result []prometheusResult `json:"result"` + }{ + ResultType: "matrix", + Result: []prometheusResult{ + { + Metric: map[string]string{"image": "nginx:1.25"}, + Values: [][]interface{}{ + {1234567890.0, "10"}, + {1234567950.0, "20"}, + {1234568010.0, "30"}, + }, + }, + }, + }, + }, + wantCount: 1, + wantFirst: "nginx:1.25", + wantScore: 30, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/query_range" { + t.Errorf("unexpected path: %s, want /api/v1/query_range", r.URL.Path) + } + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(tt.response); err != nil { + t.Fatal(err) + } + })) + defer server.Close() + + source := NewPrometheusSource(server.URL, "test_query", dropv1alpha1.QueryTypeRange, time.Hour, tt.aggregationMethod, 5*time.Minute, server.Client()) + results, err := source.Fetch(context.Background()) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(results) != tt.wantCount { + t.Errorf("got %d results, want %d", len(results), tt.wantCount) + } + + if tt.wantFirst != "" && len(results) > 0 { + if results[0].Image != tt.wantFirst { + t.Errorf("first image = %q, want %q", results[0].Image, tt.wantFirst) + } + if results[0].Score != tt.wantScore { + t.Errorf("score = %d, want %d", results[0].Score, tt.wantScore) + } + } + }) + } +} + +func TestPrometheusSource_DefaultQueryType(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/query_range" { + t.Errorf("default queryType should use query_range, got path: %s", r.URL.Path) + } + resp := prometheusResponse{Status: prometheusStatusSuccess} + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Fatal(err) + } + })) + defer server.Close() + + // Empty queryType should default to range + source := NewPrometheusSource(server.URL, "test_query", "", time.Hour, nil, 0, server.Client()) + if source.QueryType != dropv1alpha1.QueryTypeRange { + t.Errorf("default QueryType = %q, want %q", source.QueryType, dropv1alpha1.QueryTypeRange) + } + if source.AggregationMethod != nil { + t.Errorf("default AggregationMethod = %v, want nil", source.AggregationMethod) + } + _, err := source.Fetch(context.Background()) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + +func aggregationMethodPtr(m dropv1alpha1.AggregationMethod) *dropv1alpha1.AggregationMethod { + return &m +} diff --git a/knowledge.yaml b/knowledge.yaml index 0eaa619..a088e30 100644 --- a/knowledge.yaml +++ b/knowledge.yaml @@ -445,17 +445,27 @@ helperTypes: type: string required: true doc: 'Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image)' + - name: QueryType + json: queryType + type: QueryType + required: false + default: range + doc: 'QueryType controls how the Prometheus query is executed. "range" uses /api/v1/query_range with a time window defined by lookback. "instant" uses /api/v1/query for a single point-in-time result. Default: "range".' - name: Lookback json: lookback type: '*metav1.Duration' required: false - doc: 'Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h"' + doc: 'Lookback is the time window for range queries. When queryType is "range", the operator queries (start=now-lookback, end=now) and aggregates all returned values per image. The aggregation function is controlled by the aggregationMethod field. Required when queryType is "range". Ignored when queryType is "instant". Example: "168h" (7 days), "24h", "72h"' + - name: AggregationMethod + json: aggregationMethod + type: '*AggregationMethod' + required: false + doc: 'AggregationMethod controls how data points from a range query are combined into a single score. Only used when queryType is "range". Ignored for instant queries. When not set (nil), Drop uses the last data-point value directly — use this when your PromQL already contains aggregation functions (e.g., count_over_time, topk). Options: "sum", "count", "avg", "max"' - name: Step json: step - type: string + type: '*metav1.Duration' required: false - default: 5m - doc: 'Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m"' + doc: 'Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: 5m. Example: "1m", "15m"' - name: RegistrySource doc: RegistrySource defines OCI registry tag listing configuration for image discovery. fields: @@ -510,6 +520,8 @@ packages: - internal/podbuilder - path: internal/discovery role: Package discovery implements image discovery from registries and Prometheus metrics. + imports: + - api/v1alpha1 - path: internal/metrics role: Package metrics registers Prometheus metrics for the drop operator. - path: internal/pacing @@ -537,6 +549,15 @@ conventions: - rule: Don't manually edit generated files — run make docs-gen scope: - code + - rule: Documentation must never contain unverified information — verify all examples against a real cluster before merging + scope: + - code + - rule: Always document which resources you looked at in which order (short summary + time spent + tokens consumed + context consumed) + scope: + - code + - rule: Always lint and fix linter issues locally before pushing code + scope: + - code errors: - reason: Cached controller: CachedImage @@ -762,8 +783,10 @@ samples: | prometheus: endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)' + queryType: range lookback: 24h step: 5m + aggregationMethod: sum syncInterval: 30s maxImages: 10 --- diff --git a/llms-full.txt b/llms-full.txt index 50e6d8c..b0ca6cc 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -181,8 +181,10 @@ PrometheusSource defines Prometheus query configuration for image discovery. |-------|------|------|----------|---------|-------------| | Endpoint | `endpoint` | `string` | ✓ | | Endpoint is the Prometheus-compatible API URL (Prometheus, Thanos, Mimir, VictoriaMetrics). Example: "http://prometheus.monitoring.svc:9090", "https://mimir.example.com" | | Query | `query` | `string` | ✓ | | Query is the PromQL expression. It MUST return results with an "image" label — that label value is used as the discovered image reference. The query result value is used as the ranking score (higher = more relevant). Example: count(container_memory_working_set_bytes{container!="",container!="POD",namespace="gitlab-runner"}) by (image) | -| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for aggregation. When set, the operator uses query_range (start=now-lookback, end=now) and sums all returned values per image to produce a score. When unset, uses an instant query (/api/v1/query) and the point-in-time value is the score. Example: "168h" (7 days), "24h", "72h" | -| Step | `step` | `string` | — | `5m` | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate sums but higher Prometheus load. Default: "5m". Example: "1m", "15m" | +| QueryType | `queryType` | `QueryType` | — | `range` | QueryType controls how the Prometheus query is executed. "range" uses /api/v1/query_range with a time window defined by lookback. "instant" uses /api/v1/query for a single point-in-time result. Default: "range". | +| Lookback | `lookback` | `*metav1.Duration` | — | | Lookback is the time window for range queries. When queryType is "range", the operator queries (start=now-lookback, end=now) and aggregates all returned values per image. The aggregation function is controlled by the aggregationMethod field. Required when queryType is "range". Ignored when queryType is "instant". Example: "168h" (7 days), "24h", "72h" | +| AggregationMethod | `aggregationMethod` | `*AggregationMethod` | — | | AggregationMethod controls how data points from a range query are combined into a single score. Only used when queryType is "range". Ignored for instant queries. When not set (nil), Drop uses the last data-point value directly — use this when your PromQL already contains aggregation functions (e.g., count_over_time, topk). Options: "sum", "count", "avg", "max" | +| Step | `step` | `*metav1.Duration` | — | | Step is the resolution step for range queries (only used when lookback is set). Smaller steps = more data points = more accurate aggregation but higher Prometheus load. Default: 5m. Example: "1m", "15m" | ### RegistrySource @@ -330,8 +332,10 @@ spec: prometheus: endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" query: 'count(container_memory_working_set_bytes{container!="", container!="POD", namespace="build-stuff", pod=~"runner-.*"}) by (image)' + queryType: range lookback: 24h step: 5m + aggregationMethod: sum syncInterval: 30s maxImages: 10 --- diff --git a/test/e2e/discovery-aggregation/01-discoverypolicies.yaml b/test/e2e/discovery-aggregation/01-discoverypolicies.yaml new file mode 100644 index 0000000..52f9cf7 --- /dev/null +++ b/test/e2e/discovery-aggregation/01-discoverypolicies.yaml @@ -0,0 +1,108 @@ +# Four DiscoveryPolicies using queryType: range with different aggregationMethods, +# plus one using queryType: instant. +# All query the same seed metrics (container_cpu_usage_seconds_total in namespace aggregation-test). +# Seed data: alpine has 3 pods (values 100, 200, 300), busybox has 1 pod (value 500). +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-count +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'count(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)' + queryType: range + lookback: 1h + step: 5m + aggregationMethod: count + syncInterval: 30s + maxImages: 10 +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-avg +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)' + queryType: range + lookback: 1h + step: 5m + aggregationMethod: avg + syncInterval: 30s + maxImages: 10 +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-max +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)' + queryType: range + lookback: 1h + step: 5m + aggregationMethod: max + syncInterval: 30s + maxImages: 10 +--- +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-sum +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)' + queryType: range + lookback: 1h + step: 5m + aggregationMethod: sum + syncInterval: 30s + maxImages: 10 +--- +# queryType: range without aggregationMethod — field is nullable, omitting it means +# Drop uses the last data-point value directly without aggregation. +# Ideal for self-contained PromQL queries that already aggregate internally. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-none +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'sum(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)' + queryType: range + lookback: 1h + step: 5m + # aggregationMethod intentionally omitted (nil) — uses last value directly + syncInterval: 30s + maxImages: 10 +--- +# queryType: instant — uses /api/v1/query for a single point-in-time result. +# The returned value is used directly as the score without aggregation. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-instant +spec: + sources: + - type: prometheus + prometheus: + endpoint: "http://prometheus.e2e-infra.svc.cluster.local:9090" + query: 'count(container_cpu_usage_seconds_total{namespace="aggregation-test"}) by (image)' + queryType: instant + syncInterval: 30s + maxImages: 10 diff --git a/test/e2e/discovery-aggregation/02-assert-count.yaml b/test/e2e/discovery-aggregation/02-assert-count.yaml new file mode 100644 index 0000000..ee5e76b --- /dev/null +++ b/test/e2e/discovery-aggregation/02-assert-count.yaml @@ -0,0 +1,12 @@ +# Assert count aggregation: policy is Ready, both images discovered. +# count() by (image) returns alpine=3, busybox=1 at each step. +# aggregationMethod=count counts the number of data points (steps) per image. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-count +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery-aggregation/03-assert-avg.yaml b/test/e2e/discovery-aggregation/03-assert-avg.yaml new file mode 100644 index 0000000..ae09c4b --- /dev/null +++ b/test/e2e/discovery-aggregation/03-assert-avg.yaml @@ -0,0 +1,12 @@ +# Assert avg aggregation: policy is Ready, both images discovered. +# sum() by (image) returns alpine=600, busybox=500 at each step. +# aggregationMethod=avg averages the data-point values over the lookback window. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-avg +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery-aggregation/04-assert-max.yaml b/test/e2e/discovery-aggregation/04-assert-max.yaml new file mode 100644 index 0000000..2d240ef --- /dev/null +++ b/test/e2e/discovery-aggregation/04-assert-max.yaml @@ -0,0 +1,12 @@ +# Assert max aggregation: policy is Ready, both images discovered. +# sum() by (image) returns alpine=600, busybox=500 at each step. +# aggregationMethod=max takes the highest single data-point value. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-max +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery-aggregation/05-assert-sum.yaml b/test/e2e/discovery-aggregation/05-assert-sum.yaml new file mode 100644 index 0000000..af43f08 --- /dev/null +++ b/test/e2e/discovery-aggregation/05-assert-sum.yaml @@ -0,0 +1,12 @@ +# Assert sum (default) aggregation: policy is Ready, both images discovered. +# sum() by (image) returns alpine=600, busybox=500 at each step. +# aggregationMethod=sum adds all data-point values over the lookback window. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-sum +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery-aggregation/06-assert-instant.yaml b/test/e2e/discovery-aggregation/06-assert-instant.yaml new file mode 100644 index 0000000..2d42fc5 --- /dev/null +++ b/test/e2e/discovery-aggregation/06-assert-instant.yaml @@ -0,0 +1,11 @@ +# Assert instant query: policy is Ready, both images discovered. +# queryType=instant uses /api/v1/query — the returned value is used directly as the score. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-instant +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery-aggregation/07-assert-none.yaml b/test/e2e/discovery-aggregation/07-assert-none.yaml new file mode 100644 index 0000000..94e6b0a --- /dev/null +++ b/test/e2e/discovery-aggregation/07-assert-none.yaml @@ -0,0 +1,11 @@ +# Assert none aggregation: policy is Ready, both images discovered. +# aggregationMethod=none uses the last data-point value from the range query directly. +apiVersion: drop.corewire.io/v1alpha1 +kind: DiscoveryPolicy +metadata: + name: e2e-agg-none +status: + (conditions[?type == 'Ready']): + - status: "True" + reason: Synced + imageCount: 2 diff --git a/test/e2e/discovery-aggregation/chainsaw-test.yaml b/test/e2e/discovery-aggregation/chainsaw-test.yaml new file mode 100644 index 0000000..16a95b2 --- /dev/null +++ b/test/e2e/discovery-aggregation/chainsaw-test.yaml @@ -0,0 +1,108 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: discovery-aggregation-methods +spec: + description: | + Verify that DiscoveryPolicy aggregationMethod and queryType fields work correctly + against a real Prometheus endpoint. Seeds use container_cpu_usage_seconds_total with + two images (alpine: 3 pods with values 100/200/300, busybox: 1 pod with value 500). + + Expected rankings per method (queryType: range): + count → alpine first (3 > 1) + avg → busybox first (500 > 200) + max → busybox first (500 > 300) + sum → alpine first (600 > 500) + none → uses last data-point value directly + + queryType: instant uses /api/v1/query directly — no aggregation. + steps: + - name: Create DiscoveryPolicies with different aggregation methods and query types + try: + - apply: + file: 01-discoverypolicies.yaml + - name: Assert count aggregation discovers images (alpine ranked first) + try: + - assert: + timeout: 90s + file: 02-assert-count.yaml + - name: Assert avg aggregation discovers images (busybox ranked first) + try: + - assert: + timeout: 90s + file: 03-assert-avg.yaml + - name: Assert max aggregation discovers images (busybox ranked first) + try: + - assert: + timeout: 90s + file: 04-assert-max.yaml + - name: Assert sum aggregation discovers images (alpine ranked first, default) + try: + - assert: + timeout: 90s + file: 05-assert-sum.yaml + - name: Assert instant query discovers images + try: + - assert: + timeout: 90s + file: 06-assert-instant.yaml + - name: Assert none aggregation discovers images (last value used directly) + try: + - assert: + timeout: 90s + file: 07-assert-none.yaml + - name: Verify aggregation scores are populated + try: + - script: + timeout: 30s + content: | + # Verify aggregation outputs are populated. + # Score relationships can vary with the number of data points and values + # returned by Prometheus in the lookback window. + SUM_SCORE=$(kubectl get discoverypolicy e2e-agg-sum -o jsonpath='{.status.discoveredImages[0].score}') + AVG_SCORE=$(kubectl get discoverypolicy e2e-agg-avg -o jsonpath='{.status.discoveredImages[0].score}') + COUNT_SCORE=$(kubectl get discoverypolicy e2e-agg-count -o jsonpath='{.status.discoveredImages[0].score}') + MAX_SCORE=$(kubectl get discoverypolicy e2e-agg-max -o jsonpath='{.status.discoveredImages[0].score}') + INSTANT_SCORE=$(kubectl get discoverypolicy e2e-agg-instant -o jsonpath='{.status.discoveredImages[0].score}') + NONE_SCORE=$(kubectl get discoverypolicy e2e-agg-none -o jsonpath='{.status.discoveredImages[0].score}') + + echo "Scores — sum:$SUM_SCORE avg:$AVG_SCORE count:$COUNT_SCORE max:$MAX_SCORE instant:$INSTANT_SCORE none:$NONE_SCORE" + + if [ -z "$SUM_SCORE" ] || [ -z "$AVG_SCORE" ] || [ -z "$COUNT_SCORE" ] || [ -z "$MAX_SCORE" ] || [ -z "$INSTANT_SCORE" ] || [ -z "$NONE_SCORE" ]; then + echo "FAIL: expected non-empty scores for all methods" + exit 1 + fi + echo "OK: all query types and aggregation methods produced non-empty scores" + - name: Cleanup + try: + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-agg-count + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-agg-avg + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-agg-max + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-agg-sum + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-agg-instant + - delete: + ref: + apiVersion: drop.corewire.io/v1alpha1 + kind: DiscoveryPolicy + name: e2e-agg-none