Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions diffly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from ._compat import typer
from ._utils import ABS_TOL_DEFAULT, ABS_TOL_TEMPORAL_DEFAULT, REL_TOL_DEFAULT
from .metrics import DEFAULT_METRICS

app = typer.Typer()

Expand Down Expand Up @@ -129,8 +130,24 @@ def main(
)
),
] = [],
metric: Annotated[
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
metric: Annotated[
metrics: Annotated[

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is intended. It is a repeatable argument that is used as follows:

image

list[str],
typer.Option(
help=(
"Metric presets to display per numerical column. Repeatable. "
f"Available: {', '.join(DEFAULT_METRICS)}."
)
),
] = [],
) -> None:
"""Compare two `parquet` files and print the comparison result."""
for name in metric:
if name not in DEFAULT_METRICS:
raise typer.BadParameter(
f"Unknown metric: {name!r}. Available: {', '.join(DEFAULT_METRICS)}."
)
metrics = {name: DEFAULT_METRICS[name] for name in metric}

comparison = compare_frames(
pl.scan_parquet(left),
pl.scan_parquet(right),
Expand All @@ -148,6 +165,7 @@ def main(
right_name=right_name,
slim=slim,
hidden_columns=hidden_columns,
metrics=metrics,
)
if output_json:
typer.echo(summary.to_json())
Expand Down
13 changes: 13 additions & 0 deletions diffly/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
lazy_len,
make_and_validate_mapping,
)
from .metrics import Metric

if TYPE_CHECKING: # pragma: no cover
# NOTE: We cannot import at runtime as we're otherwise running into circular
Expand Down Expand Up @@ -919,6 +920,7 @@ def summary(
right_name: str = Side.RIGHT,
slim: bool = False,
hidden_columns: list[str] | None = None,
metrics: Mapping[str, Metric] | None = None,
) -> Summary:
"""Generate a summary of all aspects of the comparison.

Expand Down Expand Up @@ -948,6 +950,16 @@ def summary(
advanced users who are familiar with the summary format.
hidden_columns: Columns for which no values are printed, e.g. because they
contain sensitive information.
metrics: Optional mapping from display label to a metric callable
Comment thread
borchero marked this conversation as resolved.
``(left_expr, right_expr) -> pl.Expr``. Each callable receives two
:class:`polars.Expr` referring to the left and right values of a single
numerical column across all joined rows, and must return a scalar
aggregation expression. See :mod:`diffly.metrics` for presets
(``mean``, ``median``, ``mean_absolute_deviation`` etc.). When ``None``
(default), no metrics are computed; presets are not applied
automatically. Metrics are only computed for numerical columns. Prefer
short labels — the summary has a fixed width and many or long labels
degrade rendering.

Returns:
A summary which can be printed or written to a file.
Expand All @@ -973,6 +985,7 @@ def summary(
right_name=right_name,
slim=slim,
hidden_columns=hidden_columns,
metrics=metrics,
)

# ----------------------------------- UTILITIES ----------------------------------- #
Expand Down
93 changes: 93 additions & 0 deletions diffly/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Copyright (c) QuantCo 2025-2026
# SPDX-License-Identifier: BSD-3-Clause

from __future__ import annotations

from collections.abc import Callable
from dataclasses import dataclass

import polars as pl
import polars.selectors as cs


@dataclass(frozen=True)
class _Metric:
    """Immutable bundle of a metric callable and its applicability selector.

    This type is private to the package and not part of the public API.
    """

    # Callable that builds the metric expression from the left and right values.
    fn: Metric
    # Selector describing the columns to which ``fn`` is applicable.
    selector: cs.Selector


# Public alias for the call signature shared by the metric functions defined in
# this module. It is also the annotated type of ``_Metric.fn`` and of the values
# stored in ``DEFAULT_METRICS``.
Metric = Callable[[pl.Expr, pl.Expr], pl.Expr]
"""A metric is a callable mapping ``(left_expr, right_expr)`` to a scalar aggregation
expression.

The expressions refer to the left-side and right-side values of a single column across
all joined rows.
"""


def _make_numeric_metric(metric: Metric) -> _Metric:
    """Wrap ``metric`` in a :class:`_Metric` restricted to numeric columns.

    Args:
        metric: The metric callable to wrap.

    Returns:
        A :class:`_Metric` whose ``fn`` is ``metric`` and whose ``selector`` is
        ``cs.numeric()``.
    """
    numeric_columns = cs.numeric()
    return _Metric(fn=metric, selector=numeric_columns)


def mean(left: pl.Expr, right: pl.Expr) -> pl.Expr:
    """Average signed difference between the right and the left side.

    Args:
        left: Left-side values of a column.
        right: Right-side values of the same column.

    Returns:
        The mean of ``right - left``.
    """
    delta = right - left
    return delta.mean()


def median(left: pl.Expr, right: pl.Expr) -> pl.Expr:
    """Median signed difference between the right and the left side.

    Args:
        left: Left-side values of a column.
        right: Right-side values of the same column.

    Returns:
        The median of ``right - left``.
    """
    delta = right - left
    return delta.median()


def min(left: pl.Expr, right: pl.Expr) -> pl.Expr:
    """Smallest signed difference between the right and the left side.

    Note that this function shares its name with the builtin ``min`` and
    therefore shadows it inside this module.

    Args:
        left: Left-side values of a column.
        right: Right-side values of the same column.

    Returns:
        The minimum of ``right - left``.
    """
    delta = right - left
    return delta.min()


def max(left: pl.Expr, right: pl.Expr) -> pl.Expr:
    """Largest signed difference between the right and the left side.

    Note that this function shares its name with the builtin ``max`` and
    therefore shadows it inside this module.

    Args:
        left: Left-side values of a column.
        right: Right-side values of the same column.

    Returns:
        The maximum of ``right - left``.
    """
    delta = right - left
    return delta.max()


def std(left: pl.Expr, right: pl.Expr) -> pl.Expr:
    """Spread of the signed difference between the right and the left side.

    Args:
        left: Left-side values of a column.
        right: Right-side values of the same column.

    Returns:
        The standard deviation of ``right - left``, computed with the default
        arguments of the expression's ``std`` method.
    """
    delta = right - left
    return delta.std()


def mean_absolute_deviation(left: pl.Expr, right: pl.Expr) -> pl.Expr:
    """Average magnitude of the difference between the two sides.

    Args:
        left: Left-side values of a column.
        right: Right-side values of the same column.

    Returns:
        The mean of ``|right - left|``.
    """
    delta = right - left
    magnitude = delta.abs()
    return magnitude.mean()


def mean_relative_deviation(left: pl.Expr, right: pl.Expr) -> pl.Expr:
    """Average magnitude of the difference relative to the left side.

    There is no guard against division by zero: where ``left`` is zero the
    result is ``inf`` or ``null``.

    Args:
        left: Left-side values of a column; used as the denominator.
        right: Right-side values of the same column.

    Returns:
        The mean of ``|(right - left) / left|``.
    """
    relative_change = (right - left) / left
    return relative_change.abs().mean()


def quantile(q: float) -> Metric:
    """Build a metric computing the ``q``-quantile of ``right - left``.

    Args:
        q: The quantile to compute; must lie in the closed interval ``[0, 1]``.

    Returns:
        A metric callable taking ``(left, right)`` and returning the
        ``q``-quantile of ``right - left``.

    Raises:
        ValueError: If ``q`` is not within ``[0, 1]``.
    """
    # Keep the chained comparison (rather than ``q < 0 or q > 1``) so that NaN,
    # which compares false against everything, is rejected as well.
    within_unit_interval = 0 <= q <= 1
    if not within_unit_interval:
        raise ValueError(f"q must be in [0, 1], got {q}")

    def _quantile(left: pl.Expr, right: pl.Expr) -> pl.Expr:
        delta = right - left
        return delta.quantile(q)

    return _quantile
Comment thread
EgeKaraismailogluQC marked this conversation as resolved.


# Built-in metric presets, keyed by the label used to refer to them.
# Every value follows the ``Metric`` signature ``(left, right) -> pl.Expr``.
# Note that ``quantile`` is not listed; it is a factory taking ``q`` rather
# than a metric itself.
DEFAULT_METRICS: dict[str, Metric] = {
"Mean": mean,
"Median": median,
"Min": min,
"Max": max,
"Std": std,
"Mean absolute deviation": mean_absolute_deviation,
"Mean relative deviation": mean_relative_deviation,
}
Loading
Loading