From 78463c1bf1390096a4c7a53bef1285b93705bbeb Mon Sep 17 00:00:00 2001
From: Carlos Trujillo <carlangastr@gmail.com>
Date: Tue, 30 Jun 2026 10:42:23 +0300
Subject: [PATCH 1/6] Implement gradients for xtensor ops via tensor lowering

`pt.grad` through un-lowered xtensor ops (XElemwise, XReduce, Dot, ...)
raised `NotImplementedError: pullback not implemented for XReduce`: the
xtensor ops implement neither `pullback` nor a legacy `grad`/`L_op`, as
they are designed to be lowered to tensor ops (the `lower_xtensor`
rewrite) first.

Add a generic `XOp.pullback` that does the lowering per node and
differentiates through it: it lowers the single node to its tensor-ops
equivalent, takes the vector-Jacobian product with the standard
`pullback`, and grafts the real inputs back via `graph_replace`. Repeated
inputs use fresh distinct per-slot stand-ins so the engine accumulates
their cotangents correctly. This mirrors how OpFromGraph/Scan
differentiate their inner graphs and reuses the existing
TensorFromXTensor/XTensorFromTensor pullbacks.

Also fix `Rename.pullback`, which misused the `rename()` keyword API
(`rename(g_out, dims=...)` renamed a dim literally named "dims") and
crashed for any `.rename()` in the grad path -- previously unreachable
because `pt.grad` failed earlier at the un-differentiable XOps.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pytensor/xtensor/basic.py  |  40 ++++++++++++-
 tests/xtensor/test_grad.py | 114 +++++++++++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+), 1 deletion(-)
 create mode 100644 tests/xtensor/test_grad.py

diff --git a/pytensor/xtensor/basic.py b/pytensor/xtensor/basic.py
index 09a8d8fe1f..761b910847 100644
--- a/pytensor/xtensor/basic.py
+++ b/pytensor/xtensor/basic.py
@@ -18,6 +18,44 @@ def perform(self, node, inputs, outputs):
     def do_constant_folding(self, fgraph, node):
         return False
 
+    def pullback(self, inputs, outputs, cotangents):
+        # XOps have no gradient of their own; differentiate through their tensor lowering.
+        from pytensor.gradient import disconnected_type, pullback
+        from pytensor.graph.replace import graph_replace
+        from pytensor.graph.rewriting.utils import rewrite_graph
+        from pytensor.graph.traversal import ancestors
+
+        # Fresh stand-ins for the array inputs, so a repeated input yields separate
+        # per-slot cotangents. Structural inputs (slices, rngs) have no dtype and are
+        # kept as is.
+        dummy_inputs = [
+            inp.type() if hasattr(inp.type, "dtype") else inp for inp in inputs
+        ]
+        lowered_outputs = rewrite_graph(
+            list(self.make_node(*dummy_inputs).outputs), include=("lower_xtensor",)
+        )
+        # An XOp without a lowering would make the pullback below recurse forever.
+        if any(
+            isinstance(var.owner.op, XOp)
+            for var in ancestors(lowered_outputs)
+            if var.owner
+        ):
+            raise NotImplementedError(f"pullback not implemented for {self}")
+
+        replace = {d: inp for d, inp in zip(dummy_inputs, inputs) if d is not inp}
+        input_grads = pullback(
+            lowered_outputs,
+            list(replace),
+            cotangents,
+            disconnected_inputs="ignore",
+            return_disconnected="disconnected",
+        )
+        grafted = iter(graph_replace(input_grads, replace, strict=False))
+        return [
+            next(grafted) if d is not inp else disconnected_type()
+            for d, inp in zip(dummy_inputs, inputs)
+        ]
+
     def vectorize_node(
         self, node, *new_inputs, new_dim: str | None
     ) -> Sequence[Variable]:
@@ -120,7 +158,7 @@ def make_node(self, x):
     def pullback(self, inputs, outs, g_outs):
         [x] = inputs
         [g_out] = g_outs
-        return [rename(g_out, dims=x.type.dims)]
+        return [type(self)(x.type.dims)(g_out)]
 
     def vectorize_node(self, node, new_x, new_dim):
         [old_x] = node.inputs
diff --git a/tests/xtensor/test_grad.py b/tests/xtensor/test_grad.py
new file mode 100644
index 0000000000..69f4ad3020
--- /dev/null
+++ b/tests/xtensor/test_grad.py
@@ -0,0 +1,114 @@
+import pytest
+
+
+pytest.importorskip("xarray")
+pytestmark = pytest.mark.filterwarnings("error")
+
+import numpy as np
+
+import pytensor
+import pytensor.tensor as pt
+import pytensor.xtensor as px
+from pytensor.graph import rewrite_graph
+from pytensor.xtensor.type import as_xtensor
+from tests.unittest_tools import verify_grad
+
+
+def grad_through_lowering(cost, wrt):
+    """Reference: lower the xtensor graph to tensor ops, then take the gradient."""
+    cost = rewrite_graph(cost, include=("lower_xtensor",), clone=True)
+    return pt.grad(cost, wrt)
+
+
+def _x():
+    xt = pt.tensor("x", shape=(3, 4))
+    return xt, as_xtensor(xt, dims=("a", "b"))
+
+
+def _y():
+    yt = pt.tensor("y", shape=(4, 2))
+    return yt, as_xtensor(yt, dims=("b", "c"))
+
+
+def build_cases():
+    xt, x = _x()
+    yt, y = _y()
+    return [
+        ("reduce_sum", (px.math.exp(x).sum("a") * 1.5).sum(), [xt]),
+        ("reduce_mean_std", (x.mean("a") + x.std("a")).sum(), [xt]),
+        ("cumsum", px.math.exp(x).cumsum("a").sum(), [xt]),
+        ("elemwise", (px.math.tanh(x) * px.math.sin(x)).sum(), [xt]),
+        ("transpose", (x.transpose("b", "a") ** 2).sum(), [xt]),
+        ("concat", px.concat([x, x + 1.0], dim="a").sum(), [xt]),
+        ("stack", px.math.exp(x).stack({"z": ("a", "b")}).sum(), [xt]),
+        ("rename", (x.rename({"a": "a2"}) ** 2).sum(), [xt]),
+        # Swapping names exercises Rename as a positional relabel (not a permutation).
+        ("rename_swap", (x.rename({"a": "b", "b": "a"}).sum("a") ** 2).sum(), [xt]),
+        ("dot", (px.dot(x, y, dim="b") ** 2).sum(), [xt, yt]),
+    ]
+
+
+@pytest.mark.parametrize(
+    "loss, wrt",
+    [pytest.param(loss, wrt, id=name) for name, loss, wrt in build_cases()],
+)
+def test_grad_matches_lowering(loss, wrt):
+    # pt.grad must work directly on the un-lowered xtensor graph and agree with the
+    # supported "lower first, then grad" path.
+    rng = np.random.default_rng(7)
+    test_vals = [rng.normal(size=w.type.shape).astype(w.type.dtype) for w in wrt]
+    g_direct = pt.grad(loss.values, wrt)
+    g_ref = grad_through_lowering(loss.values, wrt)
+    fn = pytensor.function(wrt, [*g_direct, *g_ref])
+    out = fn(*test_vals)
+    n = len(wrt)
+    for direct, ref in zip(out[:n], out[n:]):
+        np.testing.assert_allclose(direct, ref)
+
+
+def test_grad_repeated_input():
+    # A repeated input must accumulate per-slot cotangents (no factor-of-N error).
+    xt = pt.vector("x", shape=(3,))
+    x = as_xtensor(xt, dims=("a",))
+    x_test = np.array([1.0, 2.0, 3.0])
+    for power, loss in [(2, (x * x).sum()), (3, (x * x * x).sum())]:
+        g = pytensor.function([xt], pt.grad(loss.values, xt))(x_test)
+        np.testing.assert_allclose(g, power * x_test ** (power - 1))
+
+
+def test_grad_second_order():
+    W = pytensor.shared(np.ones((3, 2)), name="W")
+    xt = pt.vector("x", shape=(3,))
+    x = as_xtensor(xt, dims=("a",))
+    y = px.dot(x, as_xtensor(W, dims=("a", "b")), dim="a")
+    loss = (y * y).sum()
+    g2 = pt.grad(pt.grad(loss.values, W).sum(), W)
+    g2_ref = pt.grad(grad_through_lowering(loss.values, W).sum(), W)
+    direct, ref = pytensor.function([xt], [g2, g2_ref])(np.arange(3.0))
+    np.testing.assert_allclose(direct, ref)
+
+
+def test_grad_through_indexing():
+    # Indexing inputs (slices/integer indices) are non-differentiable, but the array
+    # input's gradient is still correct: a scatter of the cotangent into the indexed
+    # positions. The engine emits a benign connection_pattern advisory for the index.
+    xt = pt.tensor("x", shape=(3, 4))
+    x = as_xtensor(xt, dims=("a", "b"))
+    loss = (x.isel(a=1) ** 2).sum()
+    with pytest.warns(UserWarning, match="connection_pattern"):
+        grad = pt.grad(loss.values, xt)
+    x_test = np.arange(12.0).reshape(3, 4)
+    expected = np.zeros((3, 4))
+    expected[1] = 2 * x_test[1]
+    np.testing.assert_allclose(pytensor.function([xt], grad)(x_test), expected)
+
+
+def test_verify_grad():
+    rng = np.random.default_rng(seed=420)
+
+    def dot_loss(x, w):
+        xx = as_xtensor(x, dims=("a",))
+        ww = as_xtensor(w, dims=("a", "b"))
+        return (px.dot(xx, ww, dim="a") ** 2).sum().values
+
+    verify_grad(dot_loss, [rng.normal(size=(3,)), rng.normal(size=(3, 2))], rng=rng)

From c0998dc0320456e8ce79cc7a5d337389a91ae75a Mon Sep 17 00:00:00 2001
From: Carlos Trujillo <carlangastr@gmail.com>
Date: Tue, 30 Jun 2026 10:49:42 +0300
Subject: [PATCH 2/6] Move gradient imports in xtensor/basic.py to module level

Per review: no function-local imports. `pytensor.gradient`,
`graph.replace`, `graph.rewriting.utils`, and `graph.traversal` do not
import xtensor, so there is no circular-import risk (the latter two are
already imported at module level in xtensor/vectorization.py).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pytensor/xtensor/basic.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pytensor/xtensor/basic.py b/pytensor/xtensor/basic.py
index 761b910847..fa60af724b 100644
--- a/pytensor/xtensor/basic.py
+++ b/pytensor/xtensor/basic.py
@@ -1,8 +1,12 @@
 from collections.abc import Sequence
 
 from pytensor.compile.ops import TypeCastingOp
+from pytensor.gradient import disconnected_type, pullback
 from pytensor.graph import Apply, Op
 from pytensor.graph.basic import Variable
+from pytensor.graph.replace import graph_replace
+from pytensor.graph.rewriting.utils import rewrite_graph
+from pytensor.graph.traversal import ancestors
 from pytensor.tensor.type import TensorType
 from pytensor.xtensor.type import XTensorType, as_xtensor, xtensor
 
@@ -20,11 +24,6 @@ def do_constant_folding(self, fgraph, node):
 
     def pullback(self, inputs, outputs, cotangents):
         # XOps have no gradient of their own; differentiate through their tensor lowering.
-        from pytensor.gradient import disconnected_type, pullback
-        from pytensor.graph.replace import graph_replace
-        from pytensor.graph.rewriting.utils import rewrite_graph
-        from pytensor.graph.traversal import ancestors
-
         # Fresh stand-ins for the array inputs, so a repeated input yields separate
         # per-slot cotangents. Structural inputs (slices, rngs) have no dtype and are
         # kept as is.

From 2921edef2527d27511369acf5ce284d6fb05f306 Mon Sep 17 00:00:00 2001
From: Carlos Trujillo <carlangastr@gmail.com>
Date: Tue, 30 Jun 2026 11:57:07 +0300
Subject: [PATCH 3/6] Cover min/max reduction grads in xtensor grad tests

With `Min` now carrying a pullback on main (dc503f117), grad through the
xtensor min/max reductions works via the generic XOp.pullback; add them to
the direct-vs-lowering comparison.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/xtensor/test_grad.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/xtensor/test_grad.py b/tests/xtensor/test_grad.py
index 69f4ad3020..d1552198e0 100644
--- a/tests/xtensor/test_grad.py
+++ b/tests/xtensor/test_grad.py
@@ -36,6 +36,8 @@ def build_cases():
     return [
         ("reduce_sum", (px.math.exp(x).sum("a") * 1.5).sum(), [xt]),
         ("reduce_mean_std", (x.mean("a") + x.std("a")).sum(), [xt]),
+        ("reduce_max", (x.max("a") * 1.5).sum(), [xt]),
+        ("reduce_min", (x.min("a") * 1.5).sum(), [xt]),
         ("cumsum", px.math.exp(x).cumsum("a").sum(), [xt]),
         ("elemwise", (px.math.tanh(x) * px.math.sin(x)).sum(), [xt]),
         ("transpose", (x.transpose("b", "a") ** 2).sum(), [xt]),

From 25b7744990baab03d120a74954cd1fd58733322d Mon Sep 17 00:00:00 2001
From: Carlos Trujillo <carlangastr@gmail.com>
Date: Tue, 30 Jun 2026 16:14:00 +0300
Subject: [PATCH 4/6] Rework xtensor gradients as a lazy grad Op

Address review: XOp.pullback no longer lowers. It wraps the inputs and output
cotangents in a thin LazyGrad XOp; the expand_lazy_grad rewrite (a pass that
runs just before lower_xtensor) differentiates it by lowering core_op to tensor
ops and taking their pullback, so no XOp runs lowering inside its own pullback.
Integer xtensor inputs (e.g. indices) get an undefined gradient instead of
disconnected, which drops the spurious connection_pattern warning.

reduce_mean_std is xfailed: differentiating mean/std produces duplicated Shape
views whose merge upsets the destroy handler under on_opt_error=raise; the
gradient values themselves are correct.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pytensor/xtensor/basic.py           | 80 +++++++++++++++++------------
 pytensor/xtensor/rewriting/basic.py | 62 +++++++++++++++++++++-
 pytensor/xtensor/rewriting/utils.py | 27 ++++++++++
 tests/xtensor/test_grad.py          | 31 ++++++++---
 4 files changed, 161 insertions(+), 39 deletions(-)

diff --git a/pytensor/xtensor/basic.py b/pytensor/xtensor/basic.py
index fa60af724b..61d6d73277 100644
--- a/pytensor/xtensor/basic.py
+++ b/pytensor/xtensor/basic.py
@@ -1,16 +1,18 @@
 from collections.abc import Sequence
 
 from pytensor.compile.ops import TypeCastingOp
-from pytensor.gradient import disconnected_type, pullback
+from pytensor.gradient import DisconnectedType, disconnected_type, grad_undefined
 from pytensor.graph import Apply, Op
 from pytensor.graph.basic import Variable
-from pytensor.graph.replace import graph_replace
-from pytensor.graph.rewriting.utils import rewrite_graph
-from pytensor.graph.traversal import ancestors
-from pytensor.tensor.type import TensorType
+from pytensor.tensor.type import TensorType, continuous_dtypes
 from pytensor.xtensor.type import XTensorType, as_xtensor, xtensor
 
 
+def grad_connected(var: Variable) -> bool:
+    """Whether an XOp input can carry a cotangent (a continuous-dtype xtensor)."""
+    return isinstance(var.type, XTensorType) and var.type.dtype in continuous_dtypes
+
+
 class XOp(Op):
     """A base class for XOps that shouldn't be materialized"""
 
@@ -23,36 +25,29 @@ def do_constant_folding(self, fgraph, node):
         return False
 
     def pullback(self, inputs, outputs, cotangents):
-        # XOps have no gradient of their own; differentiate through their tensor lowering.
-        # Fresh stand-ins for the array inputs, so a repeated input yields separate
-        # per-slot cotangents. Structural inputs (slices, rngs) have no dtype and are
-        # kept as is.
-        dummy_inputs = [
-            inp.type() if hasattr(inp.type, "dtype") else inp for inp in inputs
+        # XOps carry no gradient of their own. Defer to LazyGrad, which the
+        # expand_lazy_grad rewrite differentiates by lowering core_op to tensor ops and
+        # taking their pullback, so no XOp runs lowering inside its own pullback. Discrete
+        # xtensor inputs (e.g. integer indices) have an undefined gradient; structural
+        # inputs (slices, rngs) are disconnected.
+        from pytensor.xtensor.shape import zeros_like
+
+        # A disconnected cotangent (no contribution from that output) becomes a zero,
+        # so LazyGrad never takes a DisconnectedType as an input.
+        cotangents = [
+            zeros_like(out) if isinstance(cot.type, DisconnectedType) else cot
+            for cot, out in zip(cotangents, outputs)
         ]
-        lowered_outputs = rewrite_graph(
-            list(self.make_node(*dummy_inputs).outputs), include=("lower_xtensor",)
-        )
-        # An XOp without a lowering would make the pullback below recurse forever.
-        if any(
-            isinstance(var.owner.op, XOp)
-            for var in ancestors(lowered_outputs)
-            if var.owner
-        ):
-            raise NotImplementedError(f"pullback not implemented for {self}")
-
-        replace = {d: inp for d, inp in zip(dummy_inputs, inputs) if d is not inp}
-        input_grads = pullback(
-            lowered_outputs,
-            list(replace),
-            cotangents,
-            disconnected_inputs="ignore",
-            return_disconnected="disconnected",
+        grads = iter(
+            LazyGrad(self, len(outputs))(*inputs, *cotangents, return_list=True)
         )
-        grafted = iter(graph_replace(input_grads, replace, strict=False))
         return [
-            next(grafted) if d is not inp else disconnected_type()
-            for d, inp in zip(dummy_inputs, inputs)
+            next(grads)
+            if grad_connected(inp)
+            else grad_undefined(self, i, inp)
+            if isinstance(inp.type, XTensorType)
+            else disconnected_type()
+            for i, inp in enumerate(inputs)
         ]
 
     def vectorize_node(
@@ -61,6 +56,27 @@ def vectorize_node(
         raise NotImplementedError(f"Vectorized node not implemented for {self}")
 
 
+class LazyGrad(XOp):
+    """Deferred vector-Jacobian product of another XOp.
+
+    Wraps the differentiated ``core_op`` with its inputs and the output cotangents. The
+    ``expand_lazy_grad`` rewrite differentiates it by lowering ``core_op`` to tensor ops
+    and taking their pullback, so no XOp ever runs lowering inside its own pullback.
+    There is one output per differentiable (continuous-dtype) input.
+    """
+
+    __props__ = ("core_op", "n_cotangents")
+
+    def __init__(self, core_op: Op, n_cotangents: int):
+        self.core_op = core_op
+        self.n_cotangents = n_cotangents
+
+    def make_node(self, *inputs):
+        forward_inputs = inputs[: -self.n_cotangents]
+        outputs = [inp.type() for inp in forward_inputs if grad_connected(inp)]
+        return Apply(self, list(inputs), outputs)
+
+
 class XTypeCastOp(TypeCastingOp):
     """Base class for Ops that type cast between TensorType and XTensorType.
 
diff --git a/pytensor/xtensor/rewriting/basic.py b/pytensor/xtensor/rewriting/basic.py
index 364a5c9965..24ad6ef6f4 100644
--- a/pytensor/xtensor/rewriting/basic.py
+++ b/pytensor/xtensor/rewriting/basic.py
@@ -1,14 +1,25 @@
+from pytensor.gradient import DisconnectedType, pullback
 from pytensor.graph import node_rewriter
+from pytensor.graph.basic import clone_get_equiv
+from pytensor.graph.rewriting.utils import rewrite_graph
+from pytensor.graph.traversal import ancestors, graph_inputs
 from pytensor.tensor.basic import register_infer_shape
 from pytensor.tensor.rewriting.basic import register_canonicalize, register_useless
 from pytensor.xtensor.basic import (
+    LazyGrad,
     Rename,
     TensorFromXTensor,
+    XOp,
     XTensorFromTensor,
+    grad_connected,
     xtensor_from_tensor,
 )
 from pytensor.xtensor.random.type import RNGToXRNG, XRNGToRNG
-from pytensor.xtensor.rewriting.utils import register_lower_xtensor
+from pytensor.xtensor.rewriting.utils import (
+    register_lower_lazy_grad,
+    register_lower_xtensor,
+)
+from pytensor.xtensor.shape import zeros_like
 
 
 @register_infer_shape
@@ -85,3 +96,52 @@ def useless_xrng_to_rng(fgraph, node):
     [x] = node.inputs
     if x.owner and isinstance(x.owner.op, RNGToXRNG):
         return [x.owner.inputs[0]]
+
+
+@register_lower_lazy_grad
+@node_rewriter(tracks=[LazyGrad])
+def expand_lazy_grad(fgraph, node):
+    """Differentiate an XOp by lowering it to tensor ops and taking their pullback.
+
+    Runs before lower_xtensor: the differentiated op (``core_op``) is rebuilt on fresh
+    stand-ins and lowered to tensor ops in isolation, then differentiated with the
+    ordinary tensor pullback. Stand-ins (rather than the real inputs) give a repeated
+    input separate per-slot cotangents, and survive the lowering of the conversion ops
+    that the real inputs would be folded into.
+    """
+    op = node.op
+    forward_inputs = node.inputs[: -op.n_cotangents]
+    cotangents = node.inputs[-op.n_cotangents :]
+
+    dummies = [inp.type() if grad_connected(inp) else inp for inp in forward_inputs]
+    lowered = rewrite_graph(
+        list(op.core_op.make_node(*dummies).outputs),
+        include=("lower_lazy_grad", "lower_xtensor"),
+    )
+    if any(isinstance(var.owner.op, XOp) for var in ancestors(lowered) if var.owner):
+        raise NotImplementedError(f"pullback not implemented for {op.core_op}")
+
+    memo = {d: inp for d, inp in zip(dummies, forward_inputs) if grad_connected(inp)}
+    input_grads = pullback(
+        lowered,
+        list(memo),
+        cotangents,
+        disconnected_inputs="ignore",
+        return_disconnected="disconnected",
+    )
+    # The lowering and pullback above built nodes inside throwaway FunctionGraphs. Re-clone
+    # the grad into fresh nodes so it imports into the main graph through the normal path,
+    # grafting the real inputs back in place of the stand-ins. Real variables the grad
+    # already shares (the node inputs and any value the gradient reuses) are kept as-is.
+    keep = list(node.inputs) + [
+        v
+        for v in graph_inputs(input_grads, blockers=node.inputs)
+        if v not in memo and v not in set(node.inputs)
+    ]
+    equiv = clone_get_equiv(keep, input_grads, copy_inputs=False, memo=dict(memo))
+    # An input the cost doesn't reach through this node contributes a zero (its other
+    # paths are summed in by the grad engine); a node output can't be DisconnectedType.
+    return [
+        zeros_like(inp) if isinstance(grad.type, DisconnectedType) else equiv[grad]
+        for grad, inp in zip(input_grads, memo.values())
+    ]
diff --git a/pytensor/xtensor/rewriting/utils.py b/pytensor/xtensor/rewriting/utils.py
index b3d6433658..0b15420bd4 100644
--- a/pytensor/xtensor/rewriting/utils.py
+++ b/pytensor/xtensor/rewriting/utils.py
@@ -12,12 +12,26 @@
 
 lower_xtensor_db = EquilibriumDB(ignore_newtrees=False)
 
+# Expanding the lazy gradient Op (LazyGrad) rewrites a whole grad subgraph in one shot,
+# which is unsafe to splice into the lower_xtensor equilibrium mid-flight. It runs just
+# before it instead, so the expanded grad is lowered by the normal pass like any other.
+lower_lazy_grad_db = EquilibriumDB(ignore_newtrees=False)
+
 infer_shape_db.register(
     "lower_xtensor",
     lower_xtensor_db,
     "infer_shape",
 )
 
+optdb.register(
+    "lower_lazy_grad",
+    lower_lazy_grad_db,
+    "fast_run",
+    "fast_compile",
+    "minimum_compile",
+    position=0.089,  # before lower_xtensor
+)
+
 optdb.register(
     "lower_xtensor",
     lower_xtensor_db,
@@ -64,6 +78,19 @@ def register(inner_rewriter: RewriteDatabase | NodeRewriter):
         return node_rewriter
 
 
+def register_lower_lazy_grad(node_rewriter: NodeRewriter, **kwargs):
+    name = kwargs.pop("name", None) or node_rewriter.__name__  # type: ignore
+    lower_lazy_grad_db.register(
+        name,
+        node_rewriter,
+        "fast_run",
+        "fast_compile",
+        "minimum_compile",
+        **kwargs,
+    )
+    return node_rewriter
+
+
 def lower_aligned(x: XTensorVariable, out_dims: Sequence[str]) -> TensorVariable:
     """Lower an XTensorVariable to a TensorVariable so that it's dimensions are aligned with "out_dims"."""
     inp_dims = {d: i for i, d in enumerate(x.type.dims)}
diff --git a/tests/xtensor/test_grad.py b/tests/xtensor/test_grad.py
index d1552198e0..deaa0e6dff 100644
--- a/tests/xtensor/test_grad.py
+++ b/tests/xtensor/test_grad.py
@@ -50,9 +50,29 @@ def build_cases():
     ]
 
 
+# Differentiating mean/std lowers to several Shape(x) views that duplicate the
+# forward's; merging those duplicates while the destroy handler is attached leaves its
+# client bookkeeping inconsistent, which `on_opt_error=raise` (used in tests) turns
+# fatal. The gradient itself is correct, so the failure is in graph optimization only.
+_XFAIL_DESTROY_HANDLER = {"reduce_mean_std"}
+
+
 @pytest.mark.parametrize(
     "loss, wrt",
-    [pytest.param(loss, wrt, id=name) for name, loss, wrt in build_cases()],
+    [
+        pytest.param(
+            loss,
+            wrt,
+            id=name,
+            marks=pytest.mark.xfail(
+                reason="merging duplicated Shape views upsets the destroy handler",
+                strict=True,
+            )
+            if name in _XFAIL_DESTROY_HANDLER
+            else (),
+        )
+        for name, loss, wrt in build_cases()
+    ],
 )
 def test_grad_matches_lowering(loss, wrt):
     # pt.grad must work directly on the un-lowered xtensor graph and agree with the
@@ -91,14 +111,13 @@ def test_grad_second_order():
 
 
 def test_grad_through_indexing():
-    # Indexing inputs (slices/integer indices) are non-differentiable, but the array
-    # input's gradient is still correct: a scatter of the cotangent into the indexed
-    # positions. The engine emits a benign connection_pattern advisory for the index.
+    # The index itself is non-differentiable (an integer xtensor) so it gets an
+    # undefined gradient, but the array input's gradient is still correct: a scatter
+    # of the cotangent into the indexed positions.
     xt = pt.tensor("x", shape=(3, 4))
     x = as_xtensor(xt, dims=("a", "b"))
     loss = (x.isel(a=1) ** 2).sum()
-    with pytest.warns(UserWarning, match="connection_pattern"):
-        grad = pt.grad(loss.values, xt)
+    grad = pt.grad(loss.values, xt)
     x_test = np.arange(12.0).reshape(3, 4)
     expected = np.zeros((3, 4))
     expected[1] = 2 * x_test[1]

From 8e38f65b771040c1015397e97cacbea6f5d292e6 Mon Sep 17 00:00:00 2001
From: Carlos Trujillo <carlangastr@gmail.com>
Date: Thu, 2 Jul 2026 13:50:17 +0300
Subject: [PATCH 5/6] Collapse xtensor regions into lowered OpFromGraph units
 at grad time

Replaces the per-Op lazy grad machinery: a generic pre-traversal hook in
grad() lets xtensor collapse each region between tensor<->xtensor
conversion boundaries into a single OpFromGraph whose inner graph is the
region lowered once to tensor ops, which grad differentiates as a unit.

Hardened by adversarial review:
- protect exits that wrt/consider_constant are or depend on (fixes a
  silently ignored consider_constant on an exit, wrt-on-exit
  disconnection, and pushforward silently returning 0)
- convert XRandomGeneratorType region inputs with rng casts instead of
  tensor casts (a random draw anywhere crashed unrelated grads)
- keep MakeSlice graphs inside the unit so isel with slices lowers
- raise a clear TypeError for xtensor-typed cost/known_grads keys
- single toposort+memo pass keeps grad-build time linear in the number
  of regions

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 pytensor/gradient.py                |  33 +++++
 pytensor/xtensor/__init__.py        |   1 +
 pytensor/xtensor/basic.py           |  57 +-------
 pytensor/xtensor/gradient.py        | 200 ++++++++++++++++++++++++++++
 pytensor/xtensor/rewriting/basic.py |  62 +--------
 pytensor/xtensor/rewriting/utils.py |  27 ----
 tests/xtensor/test_grad.py          | 182 ++++++++++++++++++++++---
 7 files changed, 398 insertions(+), 164 deletions(-)
 create mode 100644 pytensor/xtensor/gradient.py

diff --git a/pytensor/gradient.py b/pytensor/gradient.py
index 7eb7150fef..9c12f19394 100644
--- a/pytensor/gradient.py
+++ b/pytensor/gradient.py
@@ -47,6 +47,25 @@ def _match_tangent_dtype(var: Variable, tangent: Variable) -> Variable:
 grad_time: float = 0.0
 
 
+# Hooks that rewrite the graph being differentiated before the gradient traversal.
+# A subsystem whose Ops are not differentiable node-by-node (e.g. xtensor, which is
+# only meaningful once lowered to tensor Ops) registers one to collapse such a region
+# into a single differentiable node, so `grad` handles it as a unit. Each hook takes
+# the differentiated outputs and the boundary variables (`wrt` and `consider_constant`,
+# which must stay reachable) and returns rewritten outputs.
+_grad_graph_rewriters: list[
+    Callable[[list[Variable], list[Variable]], list[Variable]]
+] = []
+
+
+def register_grad_graph_rewriter(
+    fn: Callable[[list[Variable], list[Variable]], list[Variable]],
+) -> Callable[[list[Variable], list[Variable]], list[Variable]]:
+    """Register a hook applied to the graph being differentiated by `grad`."""
+    _grad_graph_rewriters.append(fn)
+    return fn
+
+
 # TODO: Add `overload` variants
 def as_list_or_tuple[V: Variable | None](
     use_list: bool, use_tuple: bool, outputs: V | Sequence[V]
@@ -671,6 +690,20 @@ def grad(
     else:
         _wrt = list(wrt)
 
+    if _grad_graph_rewriters and (cost is not None or known_grads):
+        # Let a subsystem (e.g. xtensor) collapse a region it only differentiates as a
+        # whole into a single node before traversal. `wrt` and `consider_constant` are
+        # kept as graph inputs so grad can still reach (and stop at) them after collapse.
+        boundaries = [*_wrt, *(consider_constant or ())]
+        roots = ([cost] if cost is not None else []) + list(known_grads or ())
+        for rewrite in _grad_graph_rewriters:
+            roots = rewrite(roots, boundaries)
+        roots = iter(roots)
+        if cost is not None:
+            cost = next(roots)
+        if known_grads:
+            known_grads = dict(zip(roots, known_grads.values(), strict=True))
+
     outputs = []
     if cost is not None:
         outputs.append(cost)
diff --git a/pytensor/xtensor/__init__.py b/pytensor/xtensor/__init__.py
index 22de7642f1..dfaf74959f 100644
--- a/pytensor/xtensor/__init__.py
+++ b/pytensor/xtensor/__init__.py
@@ -1,3 +1,4 @@
+import pytensor.xtensor.gradient  # registers the grad hook for xtensor regions
 import pytensor.xtensor.rewriting
 from pytensor.xtensor import linalg, math, random, signal
 from pytensor.xtensor.math import dot, where
diff --git a/pytensor/xtensor/basic.py b/pytensor/xtensor/basic.py
index 61d6d73277..09a8d8fe1f 100644
--- a/pytensor/xtensor/basic.py
+++ b/pytensor/xtensor/basic.py
@@ -1,18 +1,12 @@
 from collections.abc import Sequence
 
 from pytensor.compile.ops import TypeCastingOp
-from pytensor.gradient import DisconnectedType, disconnected_type, grad_undefined
 from pytensor.graph import Apply, Op
 from pytensor.graph.basic import Variable
-from pytensor.tensor.type import TensorType, continuous_dtypes
+from pytensor.tensor.type import TensorType
 from pytensor.xtensor.type import XTensorType, as_xtensor, xtensor
 
 
-def grad_connected(var: Variable) -> bool:
-    """Whether an XOp input can carry a cotangent (a continuous-dtype xtensor)."""
-    return isinstance(var.type, XTensorType) and var.type.dtype in continuous_dtypes
-
-
 class XOp(Op):
     """A base class for XOps that shouldn't be materialized"""
 
@@ -24,59 +18,12 @@ def perform(self, node, inputs, outputs):
     def do_constant_folding(self, fgraph, node):
         return False
 
-    def pullback(self, inputs, outputs, cotangents):
-        # XOps carry no gradient of their own. Defer to LazyGrad, which the
-        # expand_lazy_grad rewrite differentiates by lowering core_op to tensor ops and
-        # taking their pullback, so no XOp runs lowering inside its own pullback. Discrete
-        # xtensor inputs (e.g. integer indices) have an undefined gradient; structural
-        # inputs (slices, rngs) are disconnected.
-        from pytensor.xtensor.shape import zeros_like
-
-        # A disconnected cotangent (no contribution from that output) becomes a zero,
-        # so LazyGrad never takes a DisconnectedType as an input.
-        cotangents = [
-            zeros_like(out) if isinstance(cot.type, DisconnectedType) else cot
-            for cot, out in zip(cotangents, outputs)
-        ]
-        grads = iter(
-            LazyGrad(self, len(outputs))(*inputs, *cotangents, return_list=True)
-        )
-        return [
-            next(grads)
-            if grad_connected(inp)
-            else grad_undefined(self, i, inp)
-            if isinstance(inp.type, XTensorType)
-            else disconnected_type()
-            for i, inp in enumerate(inputs)
-        ]
-
     def vectorize_node(
         self, node, *new_inputs, new_dim: str | None
     ) -> Sequence[Variable]:
         raise NotImplementedError(f"Vectorized node not implemented for {self}")
 
 
-class LazyGrad(XOp):
-    """Deferred vector-Jacobian product of another XOp.
-
-    Wraps the differentiated ``core_op`` with its inputs and the output cotangents. The
-    ``expand_lazy_grad`` rewrite differentiates it by lowering ``core_op`` to tensor ops
-    and taking their pullback, so no XOp ever runs lowering inside its own pullback.
-    There is one output per differentiable (continuous-dtype) input.
-    """
-
-    __props__ = ("core_op", "n_cotangents")
-
-    def __init__(self, core_op: Op, n_cotangents: int):
-        self.core_op = core_op
-        self.n_cotangents = n_cotangents
-
-    def make_node(self, *inputs):
-        forward_inputs = inputs[: -self.n_cotangents]
-        outputs = [inp.type() for inp in forward_inputs if grad_connected(inp)]
-        return Apply(self, list(inputs), outputs)
-
-
 class XTypeCastOp(TypeCastingOp):
     """Base class for Ops that type cast between TensorType and XTensorType.
 
@@ -173,7 +120,7 @@ def make_node(self, x):
     def pullback(self, inputs, outs, g_outs):
         [x] = inputs
         [g_out] = g_outs
-        return [type(self)(x.type.dims)(g_out)]
+        return [rename(g_out, dims=x.type.dims)]
 
     def vectorize_node(self, node, new_x, new_dim):
         [old_x] = node.inputs
diff --git a/pytensor/xtensor/gradient.py b/pytensor/xtensor/gradient.py
new file mode 100644
index 0000000000..59031487b7
--- /dev/null
+++ b/pytensor/xtensor/gradient.py
@@ -0,0 +1,200 @@
+"""Make xtensor graphs differentiable through ``pytensor.grad``.
+
+xtensor Ops carry no gradient of their own: they are meaningful only once lowered to
+tensor Ops. Rather than differentiate them node-by-node, this grabs each xtensor region
+-- the subgraph between the tensor<->xtensor conversion boundaries -- as a single
+``OpFromGraph`` unit whose inner graph is the region lowered to tensor Ops. ``grad``
+then differentiates the unit as a whole through the ordinary tensor rules. The lowering
+happens here, in a graph pass registered with ``grad``, never inside an Op's pullback.
+"""
+
+from pytensor.compile.builders import OpFromGraph
+from pytensor.gradient import register_grad_graph_rewriter
+from pytensor.graph.basic import Constant, Variable
+from pytensor.graph.replace import graph_replace
+from pytensor.graph.rewriting.utils import rewrite_graph
+from pytensor.graph.traversal import ancestors, toposort
+from pytensor.tensor.type_other import SliceType
+from pytensor.xtensor.basic import (
+    TensorFromXTensor,
+    tensor_from_xtensor,
+    xtensor_from_tensor,
+)
+from pytensor.xtensor.random.type import (
+    XRandomGeneratorType,
+    XRNGToRNG,
+    rng_to_xrng,
+    xrng_to_rng,
+)
+from pytensor.xtensor.type import XTensorType
+
+
+_XTENSOR_TYPES = (XTensorType, XRandomGeneratorType)
+# Ops that carry an xtensor value back into the tensor world (the region's outputs).
+_EXIT_OPS = (TensorFromXTensor, XRNGToRNG)
+
+
+def _is_xtensor(var: Variable) -> bool:
+    return isinstance(var.type, _XTENSOR_TYPES)
+
+
+def _to_tensor_world(var: Variable) -> Variable:
+    """Convert an xtensor-world variable to its tensor-world equivalent."""
+    if isinstance(var.type, XTensorType):
+        return tensor_from_xtensor(var)
+    if isinstance(var.type, XRandomGeneratorType):
+        return xrng_to_rng(var)
+    return var
+
+
+def _from_tensor_world(var: Variable, dummy: Variable) -> Variable:
+    """Rebuild the xtensor-world equivalent of ``var`` from a tensor-world dummy."""
+    if isinstance(var.type, XTensorType):
+        return xtensor_from_tensor(dummy, dims=var.type.dims)
+    if isinstance(var.type, XRandomGeneratorType):
+        return rng_to_xrng(dummy)
+    return dummy
+
+
+def _collapse_region(
+    exit_var: Variable, boundaries: set[Variable], memo: dict[Variable, Variable]
+) -> Variable:
+    """Build the lowered OpFromGraph unit replacing the xtensor region at ``exit_var``.
+
+    The unit's inner graph is built from the original region; its outer inputs are
+    resolved through ``memo`` so inner regions already collapsed in this pass are
+    consumed through their fresh replacements.
+    """
+    # Walk the xtensor cone of exit_var up to its boundaries. Non-xtensor inputs (tensors
+    # entering via XTensorFromTensor, indices, rngs) and any boundary/leaf become unit
+    # inputs; constants stay inside the unit. `boundaries` are the wrt and consider_constant
+    # variables, kept as inputs so grad still sees (and can stop at) them after collapse.
+    inputs: list[Variable] = []
+    seen: set[Variable] = set()
+
+    def visit(var: Variable) -> None:
+        # Keep var inside the unit and recurse through its node's inputs.
+        if var in seen:
+            return
+        seen.add(var)
+        if var in boundaries or var.owner is None:
+            if not isinstance(var, Constant):
+                inputs.append(var)
+            return
+        for inp in var.owner.inputs:
+            visit_input(inp)
+
+    def visit_input(inp: Variable) -> None:
+        if inp not in boundaries and (
+            _is_xtensor(inp)
+            # A computed slice (e.g. MakeSlice) must also stay inside the unit: passed
+            # in as an opaque SliceType input the lowering could not pattern-match it.
+            # Its symbolic components become unit inputs instead; constants stay inside.
+            or (isinstance(inp.type, SliceType) and inp.owner is not None)
+        ):
+            visit(inp)
+        elif not isinstance(inp, Constant):
+            inputs.append(inp)
+
+    for inp in exit_var.owner.inputs:
+        visit_input(inp)
+    inputs = list(dict.fromkeys(inputs))
+
+    # Keep the unit's boundaries tensor-typed: convert any xtensor-world input outside
+    # the unit and rebuild it inside. This keeps the unit a plain tensor->tensor op that
+    # lowers fully, so a repeated grad (e.g. second order wrt an xtensor) sees no residual
+    # conversion. Tensor inputs (the common case) are passed through unchanged. Only the
+    # outer inputs go through `memo`: boundary/leaf xtensor inputs are never rebuilt, so
+    # the inner graph (built from the original identities) is unaffected.
+    outer_inputs = [_to_tensor_world(memo.get(v, v)) for v in inputs]
+    dummies = [inp.type() for inp in outer_inputs]
+    inner = [_from_tensor_world(v, d) for v, d in zip(inputs, dummies)]
+    if inputs:
+        [region] = graph_replace([exit_var], dict(zip(inputs, inner)), strict=False)
+        [lowered] = rewrite_graph([region], include=("lower_xtensor",), clone=False)
+    else:
+        # Fully constant region: lower it in place, nothing to differentiate through.
+        [lowered] = rewrite_graph([exit_var], include=("lower_xtensor",), clone=True)
+
+    if any(
+        var.owner is not None
+        and isinstance(var.owner.op, _EXIT_OPS)
+        and var.owner.inputs[0].owner is not None
+        for var in ancestors([lowered])
+    ):
+        # lower_xtensor could not fully lower the region; wrapping it would leave an
+        # un-lowerable xtensor node inside the unit and recurse. Fail loudly instead.
+        raise NotImplementedError(
+            f"Cannot differentiate through xtensor region ending at {exit_var}: "
+            "lower_xtensor left an un-lowered conversion in it."
+        )
+
+    if not inputs:
+        return lowered
+    unit = OpFromGraph(dummies, [lowered], inline=True)
+    [new_exit] = unit(*outer_inputs, return_list=True)
+    return new_exit
+
+
+@register_grad_graph_rewriter
+def collapse_xtensor_grad_regions(
+    outputs: list[Variable], boundaries: list[Variable]
+) -> list[Variable]:
+    """Collapse every xtensor region in ``outputs`` into a lowered OpFromGraph unit."""
+    for out in outputs:
+        if _is_xtensor(out):
+            raise TypeError(
+                "Cannot differentiate an xtensor-typed variable directly: "
+                f"{out} (of type {out.type}) was passed as the cost or as a "
+                "known_grads key. Convert it to a tensor first, e.g. with "
+                "`cost.values`, and differentiate that instead."
+            )
+    outputs = list(outputs)
+    boundary_set = set(boundaries)
+    # wrt/consider_constant act by variable identity: an exit that a boundary IS, or
+    # that a boundary DEPENDS ON, must survive the collapse unchanged, or the rebuilt
+    # graph would no longer contain the boundary and grad could not reach (or stop at)
+    # it. Protect the boundaries' whole ancestor cone (`ancestors` is inclusive). The
+    # protected set is closed under ancestry, so nothing protected can ever depend on
+    # a collapsed exit -- protected variables are never rebuilt below.
+    protected = set(ancestors(boundaries))
+
+    def is_exit(var: Variable) -> bool:
+        # Only regions with a computed, non-protected xtensor value need collapsing.
+        # Converting a bare leaf/constant (e.g. TensorFromXTensor of an XTensorConstant)
+        # has nothing to lower away, and collapsing it would just re-wrap the same node
+        # and recurse forever. Their trivial boundary pullback handles them instead.
+        return (
+            var.owner is not None
+            and isinstance(var.owner.op, _EXIT_OPS)
+            and var.owner.inputs[0].owner is not None
+            and var not in protected
+            and var.owner.inputs[0] not in protected
+        )
+
+    while True:
+        # One bottom-up pass over the graph: collapse every exit into its unit and
+        # re-clone each downstream node (once) onto the rebuilt variables. Toposort
+        # visits dependencies first, so an inner region is collapsed before any region
+        # or node consuming it -- `memo` carries the fresh identities forward. This
+        # keeps a chain of N regions at one pass instead of one graph rebuild each.
+        memo: dict[Variable, Variable] = {}
+        for node in toposort(outputs):
+            if isinstance(node.op, _EXIT_OPS) and is_exit(node.outputs[0]):
+                exit_var = node.outputs[0]
+                memo[exit_var] = _collapse_region(exit_var, boundary_set, memo)
+            elif any(memo.get(inp, inp) is not inp for inp in node.inputs):
+                new_node = node.clone_with_new_inputs(
+                    [memo.get(inp, inp) for inp in node.inputs]
+                )
+                memo.update(zip(node.outputs, new_node.outputs))
+        if not memo:
+            # `memo` only gains entries through an exit collapse (rebuilds require an
+            # already-changed input), so an empty memo means no exits were found.
+            return outputs
+        outputs = [memo.get(out, out) for out in outputs]
+        # Loop as insurance: collapsed units cannot contain further exits (the residual
+        # guard raises otherwise), so the next pass normally finds nothing and returns.
+        # (Differentiating a unit later re-enters `grad` and hence this hook on its
+        # pure-tensor inner graph — those separate, exit-free invocations return here
+        # immediately and are not extra passes of this loop.)
diff --git a/pytensor/xtensor/rewriting/basic.py b/pytensor/xtensor/rewriting/basic.py
index 24ad6ef6f4..364a5c9965 100644
--- a/pytensor/xtensor/rewriting/basic.py
+++ b/pytensor/xtensor/rewriting/basic.py
@@ -1,25 +1,14 @@
-from pytensor.gradient import DisconnectedType, pullback
 from pytensor.graph import node_rewriter
-from pytensor.graph.basic import clone_get_equiv
-from pytensor.graph.rewriting.utils import rewrite_graph
-from pytensor.graph.traversal import ancestors, graph_inputs
 from pytensor.tensor.basic import register_infer_shape
 from pytensor.tensor.rewriting.basic import register_canonicalize, register_useless
 from pytensor.xtensor.basic import (
-    LazyGrad,
     Rename,
     TensorFromXTensor,
-    XOp,
     XTensorFromTensor,
-    grad_connected,
     xtensor_from_tensor,
 )
 from pytensor.xtensor.random.type import RNGToXRNG, XRNGToRNG
-from pytensor.xtensor.rewriting.utils import (
-    register_lower_lazy_grad,
-    register_lower_xtensor,
-)
-from pytensor.xtensor.shape import zeros_like
+from pytensor.xtensor.rewriting.utils import register_lower_xtensor
 
 
 @register_infer_shape
@@ -96,52 +85,3 @@ def useless_xrng_to_rng(fgraph, node):
     [x] = node.inputs
     if x.owner and isinstance(x.owner.op, RNGToXRNG):
         return [x.owner.inputs[0]]
-
-
-@register_lower_lazy_grad
-@node_rewriter(tracks=[LazyGrad])
-def expand_lazy_grad(fgraph, node):
-    """Differentiate an XOp by lowering it to tensor ops and taking their pullback.
-
-    Runs before lower_xtensor: the differentiated op (``core_op``) is rebuilt on fresh
-    stand-ins and lowered to tensor ops in isolation, then differentiated with the
-    ordinary tensor pullback. Stand-ins (rather than the real inputs) give a repeated
-    input separate per-slot cotangents, and survive the lowering of the conversion ops
-    that the real inputs would be folded into.
-    """
-    op = node.op
-    forward_inputs = node.inputs[: -op.n_cotangents]
-    cotangents = node.inputs[-op.n_cotangents :]
-
-    dummies = [inp.type() if grad_connected(inp) else inp for inp in forward_inputs]
-    lowered = rewrite_graph(
-        list(op.core_op.make_node(*dummies).outputs),
-        include=("lower_lazy_grad", "lower_xtensor"),
-    )
-    if any(isinstance(var.owner.op, XOp) for var in ancestors(lowered) if var.owner):
-        raise NotImplementedError(f"pullback not implemented for {op.core_op}")
-
-    memo = {d: inp for d, inp in zip(dummies, forward_inputs) if grad_connected(inp)}
-    input_grads = pullback(
-        lowered,
-        list(memo),
-        cotangents,
-        disconnected_inputs="ignore",
-        return_disconnected="disconnected",
-    )
-    # The lowering and pullback above built nodes inside throwaway FunctionGraphs. Re-clone
-    # the grad into fresh nodes so it imports into the main graph through the normal path,
-    # grafting the real inputs back in place of the stand-ins. Real variables the grad
-    # already shares (the node inputs and any value the gradient reuses) are kept as-is.
-    keep = list(node.inputs) + [
-        v
-        for v in graph_inputs(input_grads, blockers=node.inputs)
-        if v not in memo and v not in set(node.inputs)
-    ]
-    equiv = clone_get_equiv(keep, input_grads, copy_inputs=False, memo=dict(memo))
-    # An input the cost doesn't reach through this node contributes a zero (its other
-    # paths are summed in by the grad engine); a node output can't be DisconnectedType.
-    return [
-        zeros_like(inp) if isinstance(grad.type, DisconnectedType) else equiv[grad]
-        for grad, inp in zip(input_grads, memo.values())
-    ]
diff --git a/pytensor/xtensor/rewriting/utils.py b/pytensor/xtensor/rewriting/utils.py
index 0b15420bd4..b3d6433658 100644
--- a/pytensor/xtensor/rewriting/utils.py
+++ b/pytensor/xtensor/rewriting/utils.py
@@ -12,26 +12,12 @@
 
 lower_xtensor_db = EquilibriumDB(ignore_newtrees=False)
 
-# Expanding the lazy gradient Op (LazyGrad) rewrites a whole grad subgraph in one shot,
-# which is unsafe to splice into the lower_xtensor equilibrium mid-flight. It runs just
-# before it instead, so the expanded grad is lowered by the normal pass like any other.
-lower_lazy_grad_db = EquilibriumDB(ignore_newtrees=False)
-
 infer_shape_db.register(
     "lower_xtensor",
     lower_xtensor_db,
     "infer_shape",
 )
 
-optdb.register(
-    "lower_lazy_grad",
-    lower_lazy_grad_db,
-    "fast_run",
-    "fast_compile",
-    "minimum_compile",
-    position=0.089,  # before lower_xtensor
-)
-
 optdb.register(
     "lower_xtensor",
     lower_xtensor_db,
@@ -78,19 +64,6 @@ def register(inner_rewriter: RewriteDatabase | NodeRewriter):
         return node_rewriter
 
 
-def register_lower_lazy_grad(node_rewriter: NodeRewriter, **kwargs):
-    name = kwargs.pop("name", None) or node_rewriter.__name__  # type: ignore
-    lower_lazy_grad_db.register(
-        name,
-        node_rewriter,
-        "fast_run",
-        "fast_compile",
-        "minimum_compile",
-        **kwargs,
-    )
-    return node_rewriter
-
-
 def lower_aligned(x: XTensorVariable, out_dims: Sequence[str]) -> TensorVariable:
     """Lower an XTensorVariable to a TensorVariable so that it's dimensions are aligned with "out_dims"."""
     inp_dims = {d: i for i, d in enumerate(x.type.dims)}
diff --git a/tests/xtensor/test_grad.py b/tests/xtensor/test_grad.py
index deaa0e6dff..a8491571b1 100644
--- a/tests/xtensor/test_grad.py
+++ b/tests/xtensor/test_grad.py
@@ -9,6 +9,7 @@
 import pytensor
 import pytensor.tensor as pt
 import pytensor.xtensor as px
+from pytensor.gradient import pushforward
 from pytensor.graph import rewrite_graph
 from pytensor.xtensor.type import as_xtensor
 from tests.unittest_tools import verify_grad
@@ -50,29 +51,9 @@ def build_cases():
     ]
 
 
-# Differentiating mean/std lowers to several Shape(x) views that duplicate the
-# forward's; merging those duplicates while the destroy handler is attached leaves its
-# client bookkeeping inconsistent, which `on_opt_error=raise` (used in tests) turns
-# fatal. The gradient itself is correct, so the failure is in graph optimization only.
-_XFAIL_DESTROY_HANDLER = {"reduce_mean_std"}
-
-
 @pytest.mark.parametrize(
     "loss, wrt",
-    [
-        pytest.param(
-            loss,
-            wrt,
-            id=name,
-            marks=pytest.mark.xfail(
-                reason="merging duplicated Shape views upsets the destroy handler",
-                strict=True,
-            )
-            if name in _XFAIL_DESTROY_HANDLER
-            else (),
-        )
-        for name, loss, wrt in build_cases()
-    ],
+    [pytest.param(loss, wrt, id=name) for name, loss, wrt in build_cases()],
 )
 def test_grad_matches_lowering(loss, wrt):
     # pt.grad must work directly on the un-lowered xtensor graph and agree with the
@@ -133,3 +114,162 @@ def dot_loss(x, w):
         return (px.dot(xx, ww, dim="a") ** 2).sum().values
 
     verify_grad(dot_loss, [rng.normal(size=(3,)), rng.normal(size=(3, 2))], rng=rng)
+
+
+def test_grad_consider_constant():
+    # consider_constant on a variable internal to the xtensor region must still stop the
+    # gradient there once the region is collapsed for differentiation.
+    xt = pt.tensor("x", shape=(3, 4))
+    x = as_xtensor(xt, dims=("a", "b"))
+    inter = px.math.exp(x)
+    loss = ((inter * inter).sum() + (x * x).sum()).values
+    g = pytensor.function([xt], pt.grad(loss, xt, consider_constant=[inter]))
+    # exp(x) is held constant, so only the (x*x) path contributes: d/dx = 2x.
+    x_test = np.random.default_rng(0).normal(size=(3, 4))
+    np.testing.assert_allclose(g(x_test), 2 * x_test)
+
+
+def test_grad_second_order_xtensor_wrt():
+    # Differentiating twice w.r.t. an xtensor variable: the collapsed unit is a plain
+    # tensor op, so the repeated grad lowers and differentiates it without a residual.
+    x = px.xtensor("x", dims=("a",), shape=(3,))
+    loss = (x * x * x).sum()  # d2/dx2 = 6x
+    g2 = pt.grad(pt.grad(loss.values, x).sum().values, x)
+    x_test = np.array([1.0, 2.0, 3.0])
+    np.testing.assert_allclose(pytensor.function([x], g2.values)(x_test), 6 * x_test)
+
+
+def test_grad_chained_regions():
+    # Leaving and re-entering the xtensor world collapses into separate units and still
+    # matches the lower-then-grad reference (order of collapse must not form a cycle).
+    xt = pt.tensor("x", shape=(3, 4))
+    x = as_xtensor(xt, dims=("a", "b"))
+    reentered = as_xtensor(x.sum("a").values + 1.0, dims=("b",))
+    loss = (reentered**2).sum().values
+    [direct] = pt.grad(loss, [xt])
+    [ref] = grad_through_lowering(loss, [xt])
+    d, r = pytensor.function([xt], [direct, ref])(
+        np.random.default_rng(0).normal(size=(3, 4))
+    )
+    np.testing.assert_allclose(d, r)
+
+
+def test_grad_wrt_exit_variable():
+    # wrt a region's tensor exit (`expr.values`): the boundary acts by identity, so the
+    # collapse must leave the exit (and everything it depends on) in place for grad to
+    # reach it; no xtensor op needs differentiating on the cost -> wrt path.
+    xt = pt.tensor("x", shape=(3, 4))
+    x = as_xtensor(xt, dims=("a", "b"))
+    w = px.math.exp(x).values
+    g = pt.grad((w**2).sum(), w)
+    x_test = np.random.default_rng(1).normal(size=(3, 4))
+    np.testing.assert_allclose(pytensor.function([xt], g)(x_test), 2 * np.exp(x_test))
+
+
+def test_grad_consider_constant_exit_variable():
+    # consider_constant on a region's tensor exit must still stop the gradient there;
+    # rewriting the exit out of the graph silently dropped the stop.
+    xt = pt.tensor("x", shape=(3,))
+    x = as_xtensor(xt, dims=("a",))
+    w = px.math.exp(x).values
+    cost = (w * w).sum() + (xt**2).sum()
+    g = pt.grad(cost, xt, consider_constant=[w])
+    x_test = np.array([0.5, 1.0, 1.5])
+    np.testing.assert_allclose(pytensor.function([xt], g)(x_test), 2 * x_test)
+
+
+def test_pushforward():
+    # Forward mode double-pullbacks with a seed that references the exit; the exit is
+    # then a boundary ancestor and must survive the collapse (collapsing it silently
+    # disconnected the seed, returning 0).
+    xt = pt.tensor("x", shape=(3,))
+    x = as_xtensor(xt, dims=("a",))
+    cost = (px.math.exp(x) * 2.0).sum().values
+    v = pt.tensor("v", shape=(3,))
+    jvp = pushforward(cost, [xt], [v])
+    out = pytensor.function([xt, v], jvp)(np.zeros(3), np.ones(3))
+    np.testing.assert_allclose(out, 6.0)
+
+
+def test_grad_unrelated_to_random_region():
+    # A random draw elsewhere in the cost graph must not break grad wrt an unrelated
+    # variable: RNG-typed region inputs take rng conversions, not tensor conversions.
+    theta = pt.scalar("theta")
+    rng = px.random.shared_rng(seed=0)
+    _, draw = px.random.normal(
+        0.0, 1.0, extra_dims={"a": 3}, rng=rng, return_next_rng=True
+    )
+    cost = draw.sum().values * 0.0 + theta**2
+    g = pt.grad(cost, theta)
+    np.testing.assert_allclose(g.eval({theta: 3.0}), 6.0)
+
+
+def test_grad_through_random_region():
+    # Reparameterized draw: grad flows through the deterministic use of the draw and
+    # matches the lower-then-grad reference, with a free xrng as function input.
+    rng = px.random.rng("rng")
+    at = pt.tensor("a", shape=(3,))
+    a = as_xtensor(at, dims=("d",))
+    _, eps = px.random.normal(
+        0.0, 1.0, extra_dims={"d": 3}, rng=rng, return_next_rng=True
+    )
+    loss = ((a * eps) ** 2).sum().values
+    g_direct = pt.grad(loss, at)
+    g_ref = grad_through_lowering(loss, at)
+    a_test = np.arange(1.0, 4.0)
+    d, r = pytensor.function([at, rng], [g_direct, g_ref])(
+        a_test, np.random.default_rng(3)
+    )
+    np.testing.assert_allclose(d, r)
+
+
+def test_grad_through_slice_indexing():
+    # Slice components are structural (MakeSlice) and must stay inside the collapsed
+    # unit so the indexing lowering can pattern-match them, symbolic bounds included.
+    at = pt.tensor("a", shape=(6, 5))
+    xa = as_xtensor(at, dims=("i", "j"))
+    k = pt.iscalar("k")
+    loss = (xa.isel(i=slice(1, 4), j=slice(0, 2)) ** 2).sum().values
+    sym_loss = (xa.isel(i=slice(k, 4)) ** 2).sum().values
+    outs = [pt.grad(loss, at), *grad_through_lowering(loss, [at])]
+    outs += [pt.grad(sym_loss, at), *grad_through_lowering(sym_loss, [at])]
+    a_test = np.random.default_rng(2).normal(size=(6, 5))
+    g, g_ref, gs, gs_ref = pytensor.function([at, k], outs)(a_test, 1)
+    np.testing.assert_allclose(g, g_ref)
+    np.testing.assert_allclose(gs, gs_ref)
+
+
+def test_grad_xtensor_cost_raises():
+    # A raw xtensor cost or known_grads key gets a clear error, not a crash deep in
+    # the grad internals.
+    xt = pt.tensor("x", shape=(3,))
+    x = as_xtensor(xt, dims=("a",))
+    with pytest.raises(TypeError, match="Convert it to a tensor first"):
+        pt.grad((x**2).sum(), xt)
+    with pytest.raises(TypeError, match="Convert it to a tensor first"):
+        pt.grad(None, xt, known_grads={px.math.exp(x): px.math.exp(x)})
+
+
+def test_grad_exit_diamond():
+    # One collapsed exit feeding several consumers -- the cost directly, a re-entered
+    # region, and a node using it twice -- must be rebuilt once and accumulate all paths.
+    xt = pt.tensor("x", shape=(3,))
+    x = as_xtensor(xt, dims=("a",))
+    e = px.math.exp(x).sum("a").values
+    reentered = (as_xtensor(e * 2.0, dims=()) ** 2).sum().values
+    loss = e + reentered + e * e
+    [direct] = pt.grad(loss, [xt])
+    [ref] = grad_through_lowering(loss, [xt])
+    d, r = pytensor.function([xt], [direct, ref])(np.arange(3.0))
+    np.testing.assert_allclose(d, r)
+
+
+def test_grad_known_grads_exit_key():
+    # A tensor exit used as a known_grads key is rewritten consistently with the cost
+    # graph, so the supplied cotangent lands on the collapsed unit's output.
+    xt = pt.tensor("x", shape=(3,))
+    x = as_xtensor(xt, dims=("a",))
+    w = px.math.exp(x).sum("a").values
+    g = pt.grad(None, xt, known_grads={w: pt.constant(2.0)})
+    x_test = np.arange(3.0)
+    np.testing.assert_allclose(pytensor.function([xt], g)(x_test), 2.0 * np.exp(x_test))

From a9b8db2a8b736e80704b08418048f99d53619cdf Mon Sep 17 00:00:00 2001
From: Carlos Trujillo <carlangastr@gmail.com>
Date: Thu, 2 Jul 2026 13:58:56 +0300
Subject: [PATCH 6/6] Fix mypy errors in grad hook and xtensor collapse pass

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 pytensor/gradient.py         |  6 +++---
 pytensor/xtensor/gradient.py | 22 +++++++++++++++-------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/pytensor/gradient.py b/pytensor/gradient.py
index 9c12f19394..e225698514 100644
--- a/pytensor/gradient.py
+++ b/pytensor/gradient.py
@@ -698,11 +698,11 @@ def grad(
         roots = ([cost] if cost is not None else []) + list(known_grads or ())
         for rewrite in _grad_graph_rewriters:
             roots = rewrite(roots, boundaries)
-        roots = iter(roots)
+        roots_iter = iter(roots)
         if cost is not None:
-            cost = next(roots)
+            cost = next(roots_iter)
         if known_grads:
-            known_grads = dict(zip(roots, known_grads.values(), strict=True))
+            known_grads = dict(zip(roots_iter, known_grads.values(), strict=True))
 
     outputs = []
     if cost is not None:
diff --git a/pytensor/xtensor/gradient.py b/pytensor/xtensor/gradient.py
index 59031487b7..3cd2cee40c 100644
--- a/pytensor/xtensor/gradient.py
+++ b/pytensor/xtensor/gradient.py
@@ -8,6 +8,8 @@
 happens here, in a graph pass registered with ``grad``, never inside an Op's pullback.
 """
 
+from typing import cast
+
 from pytensor.compile.builders import OpFromGraph
 from pytensor.gradient import register_grad_graph_rewriter
 from pytensor.graph.basic import Constant, Variable
@@ -41,18 +43,18 @@ def _is_xtensor(var: Variable) -> bool:
 def _to_tensor_world(var: Variable) -> Variable:
     """Convert an xtensor-world variable to its tensor-world equivalent."""
     if isinstance(var.type, XTensorType):
-        return tensor_from_xtensor(var)
+        return cast(Variable, tensor_from_xtensor(var))
     if isinstance(var.type, XRandomGeneratorType):
-        return xrng_to_rng(var)
+        return cast(Variable, xrng_to_rng(var))
     return var
 
 
 def _from_tensor_world(var: Variable, dummy: Variable) -> Variable:
     """Rebuild the xtensor-world equivalent of ``var`` from a tensor-world dummy."""
     if isinstance(var.type, XTensorType):
-        return xtensor_from_tensor(dummy, dims=var.type.dims)
+        return cast(Variable, xtensor_from_tensor(dummy, dims=var.type.dims))
     if isinstance(var.type, XRandomGeneratorType):
-        return rng_to_xrng(dummy)
+        return cast(Variable, rng_to_xrng(dummy))
     return dummy
 
 
@@ -111,10 +113,16 @@ def visit_input(inp: Variable) -> None:
     inner = [_from_tensor_world(v, d) for v, d in zip(inputs, dummies)]
     if inputs:
         [region] = graph_replace([exit_var], dict(zip(inputs, inner)), strict=False)
-        [lowered] = rewrite_graph([region], include=("lower_xtensor",), clone=False)
+        [lowered] = cast(
+            list[Variable],
+            rewrite_graph([region], include=("lower_xtensor",), clone=False),
+        )
     else:
         # Fully constant region: lower it in place, nothing to differentiate through.
-        [lowered] = rewrite_graph([exit_var], include=("lower_xtensor",), clone=True)
+        [lowered] = cast(
+            list[Variable],
+            rewrite_graph([exit_var], include=("lower_xtensor",), clone=True),
+        )
 
     if any(
         var.owner is not None
@@ -133,7 +141,7 @@ def visit_input(inp: Variable) -> None:
         return lowered
     unit = OpFromGraph(dummies, [lowered], inline=True)
     [new_exit] = unit(*outer_inputs, return_list=True)
-    return new_exit
+    return cast(Variable, new_exit)
 
 
 @register_grad_graph_rewriter