From 2863c81f9f9f9c45f9773f1a33ce75dd7a2a508c Mon Sep 17 00:00:00 2001
From: wlnc <wladyslaw.niec@unit8.co>
Date: Wed, 20 May 2026 08:53:51 +0200
Subject: [PATCH 1/8] fix ci and smoke tests

---
 .github/workflows/ci.yml        | 4 ++++
 agents/exponential_das/agent.py | 2 +-
 agents/rl_das/agent.py          | 2 +-
 das/training/rldas.py           | 2 +-
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3d78100..7063a72 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -31,6 +31,9 @@ jobs:
       - name: Install dependencies
         run: uv sync --group dev
 
+      - name: Ruff lint
+        run: uv run ruff check .
+
       - name: Run pytest
         run: uv run pytest tests/ -v --tb=short
 
@@ -58,6 +61,7 @@ jobs:
 
       - name: Run smoke tests
         run: |
+          source .venv/bin/activate
           AGENTS="${{ inputs.smoke_agents }}"
           if [ -n "$AGENTS" ]; then
             bash smoke_test.sh $AGENTS
diff --git a/agents/exponential_das/agent.py b/agents/exponential_das/agent.py
index b5ca3b6..cc85b3d 100644
--- a/agents/exponential_das/agent.py
+++ b/agents/exponential_das/agent.py
@@ -268,7 +268,7 @@ def save(self, path: str) -> None:
     @classmethod
     def load(cls, path: str, obs_dim: int, n_actions: int, **kwargs) -> "ExpDASAgent":
         agent = cls(obs_dim, n_actions, **kwargs)
-        ckpt = torch.load(path, map_location=agent.device)
+        ckpt = torch.load(path, map_location=agent.device, weights_only=False)
         agent.actor.load_state_dict(ckpt["actor"])
         agent.critic.load_state_dict(ckpt["critic"])
         if "actor_opt" in ckpt:
diff --git a/agents/rl_das/agent.py b/agents/rl_das/agent.py
index 4c40cbc..8def163 100644
--- a/agents/rl_das/agent.py
+++ b/agents/rl_das/agent.py
@@ -243,7 +243,7 @@ def save(self, path: str) -> None:
     @classmethod
     def load(cls, path: str, dim: int, n_opt: int, **kwargs) -> "PPOAgent":
         agent = cls(dim, n_opt, **kwargs)
-        ckpt = torch.load(path, map_location=agent.device)
+        ckpt = torch.load(path, map_location=agent.device, weights_only=False)
         agent.actor.load_state_dict(ckpt["actor"])
         agent.critic.load_state_dict(ckpt["critic"])
         if "optimizer" in ckpt:
diff --git a/das/training/rldas.py b/das/training/rldas.py
index 2c041cc..ddac460 100644
--- a/das/training/rldas.py
+++ b/das/training/rldas.py
@@ -63,7 +63,7 @@ def run_rl_das(args) -> None:
 
     if args.eval:
         print("\nRunning final evaluation on test set …")
-        n_problems = len(test_env.problem_ids)
+        n_problems = len(test_env._problem_ids)
         test_results = evaluate(test_env, agent, n_episodes=n_problems)
         mean_best_y = float(np.mean([r["best_y"] for r in test_results]))
         print(f"Test mean best_y = {mean_best_y:.6e}")

From 50bf2724ab77ca252eae5636d75af39e56da640f Mon Sep 17 00:00:00 2001
From: wlnc <wladyslaw.niec@unit8.co>
Date: Wed, 20 May 2026 08:56:43 +0200
Subject: [PATCH 2/8] fix ci and smoke tests 2

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7063a72..8dd2903 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -32,7 +32,7 @@ jobs:
         run: uv sync --group dev
 
       - name: Ruff lint
-        run: uv run ruff check .
+        run: uv run ruff format .
 
       - name: Run pytest
         run: uv run pytest tests/ -v --tb=short

From 7444500f32b12f0a0bd7556d772a8795c360b881 Mon Sep 17 00:00:00 2001
From: wlnc <wladyslaw.niec@unit8.co>
Date: Wed, 20 May 2026 12:20:42 +0200
Subject: [PATCH 3/8] set RL-DAS logic to follow strictly the one of tis
 original version

---
 agents/rl_das/env.py        | 424 +++++++++++------------
 agents/rl_das/network.py    |   8 +-
 agents/rl_das/optimizers.py | 669 ++++++++++++++++++++++++++++++++++++
 agents/rl_das/population.py | 137 ++++++++
 cv.py                       |   4 +
 das/training/rldas.py       |  11 +-
 run_local.sh                |   8 +-
 train.py                    |   6 +-
 8 files changed, 1036 insertions(+), 231 deletions(-)
 create mode 100644 agents/rl_das/optimizers.py
 create mode 100644 agents/rl_das/population.py

diff --git a/agents/rl_das/env.py b/agents/rl_das/env.py
index ccfbd9b..38653ae 100644
--- a/agents/rl_das/env.py
+++ b/agents/rl_das/env.py
@@ -1,18 +1,22 @@
-"""RL-DAS gymnasium environment.
+"""RL-DAS gymnasium environment (Population-based, 9-dim features).
 
-Observation space: flat Box of shape (6 + 2 * n_opt * dim,)
-  - [0:6]           six population-state features
-  - [6:]            movement history, 2*n_opt blocks of ``dim`` scalars each,
-                    interleaved as [best_0, worst_0, best_1, worst_1, ...]
+Strictly follows the original RL-DAS design (Guo et al., 2024):
 
-Action space: Discrete(n_opt)
+- A single mutable ``Population`` object is shared between DE sub-optimizers
+  as warm-started state (no get_data/set_data contract).
+- Portfolio is restricted to the three DE algorithms: NL_SHADE_RSP, JDE21, MadDE.
+- Observation: 9-dim population-state features (computed with local sampling)
+  concatenated with per-optimizer movement history embeddings of shape (dim,).
+
+Observation layout: flat Box of shape (9 + 2 * n_opt * dim,)
+  [0:9]    nine population-state features (see _pop_features)
+  [9:]     movement history, 2*n_opt blocks of dim scalars each,
+           interleaved as [best_0, worst_0, best_1, worst_1, ...]
 
-Reward: max(0, (prev_best - new_best) / cost_scale)  — non-negative improvement
-        normalised by the initial global-best cost so rewards are comparable
-        across problems with different fitness scales.
+Action space: Discrete(n_opt)
 
-The environment operates on BBOB problems accessed through a cocoex Suite.
-Only problems whose dimension matches ``dim`` are used; others are skipped.
+Reward: max(0, (prev_best - new_best) / cost_scale)  non-negative improvement
+        normalised by the initial global-best cost.
 """
 
 from __future__ import annotations
@@ -23,68 +27,136 @@
 import gymnasium as gym
 from gymnasium import spaces
 
+from agents.rl_das.population import Population
 from das.optimizers.base import get_checkpoints
 
 
 # ---------------------------------------------------------------------------
-# Population-state feature extraction (6-D, dimension-independent)
+# Feature helper functions (adapted from original RL-DAS utils.py)
 # ---------------------------------------------------------------------------
 
 
+def _cal_fdc(group_norm: np.ndarray, costs: np.ndarray) -> float:
+    """Fitness-distance correlation. group_norm must be in [0, 1]^dim."""
+    opt_x = group_norm[np.argmin(costs)]
+    ds = np.sum((group_norm - opt_x) ** 2, axis=1)
+    fs = 1.0 / (costs + 1e-8)
+    C_fd = ((fs - fs.mean()) * (ds - ds.mean())).mean()
+    delta_f = ((fs - fs.mean()) ** 2).mean()
+    delta_d = ((ds - ds.mean()) ** 2).mean()
+    return float(C_fd / (delta_d * delta_f + 1e-8))
+
+
+def _dispersion(group_norm: np.ndarray, costs: np.ndarray) -> tuple[float, float]:
+    """Dispersion and dispersion-ratio metrics. group_norm in [0, 1]^dim."""
+    gs, dim = group_norm.shape
+    group_sorted = group_norm[np.argsort(costs)]
+    diam = float(np.sqrt(dim))
+    disp = 0.0
+    max_dis = 0.0
+    for i in range(1, gs):
+        shift = np.concatenate((group_sorted[i:], group_sorted[:i]), 0)
+        distances = np.sqrt(np.sum((group_sorted - shift) ** 2, -1))
+        disp += float(np.sum(distances))
+        cur_max = float(np.max(distances))
+        if cur_max > max_dis:
+            max_dis = cur_max
+    disp /= gs**2
+    gs10 = max(gs * 10 // 100, 1)
+    top10 = group_sorted[:gs10]
+    disp10 = 0.0
+    for i in range(1, gs10):
+        shift = np.concatenate((top10[i:], top10[:i]), 0)
+        disp10 += float(np.sum(np.sqrt(np.sum((top10 - shift) ** 2, -1))))
+    if gs10 > 1:
+        disp10 /= gs10**2
+    return float(disp10 - disp), float(max_dis / (diam + 1e-10))
+
+
+def _negative_slope_coefficient(
+    group_cost: np.ndarray, sample_cost: np.ndarray
+) -> float:
+    """Negative slope coefficient (NSC)."""
+    gs = sample_cost.shape[0]
+    m = 10
+    gs -= gs % m
+    if gs < m:
+        return 0.0
+    pairs = sorted(zip(group_cost[:gs].tolist(), sample_cost[:gs].tolist()))
+    arr = np.array(pairs)
+    sorted_group = arr[:, 0].reshape(m, -1)
+    sorted_sample = arr[:, 1].reshape(m, -1)
+    Ms = np.mean(sorted_group, -1)
+    Ns = np.mean(sorted_sample, -1)
+    nsc = np.minimum((Ns[1:] - Ns[:-1]) / (Ms[1:] - Ms[:-1] + 1e-8), 0)
+    return float(np.sum(nsc))
+
+
+def _average_neutral_ratio(
+    group_cost: np.ndarray, sample_costs: np.ndarray, eps: float = 1.0
+) -> float:
+    """Average neutral ratio (ANR)."""
+    gs = sample_costs.shape[1]
+    dcost = np.fabs(sample_costs - group_cost[:gs])
+    return float(np.mean(np.sum(dcost < eps, axis=0) / sample_costs.shape[0]))
+
+
+def _non_improvable_worsenable(
+    group_cost: np.ndarray, sample_costs: np.ndarray
+) -> tuple[float, float]:
+    """Non-improvable (NI) and non-worsenable (NW) ratios."""
+    gs = sample_costs.shape[1]
+    NI = (
+        1.0
+        - np.count_nonzero(np.sum(group_cost[:gs] > sample_costs, axis=-1))
+        / sample_costs.shape[0]
+    )
+    NW = (
+        1.0
+        - np.count_nonzero(np.sum(group_cost[:gs] < sample_costs, axis=-1))
+        / sample_costs.shape[0]
+    )
+    return float(NI), float(NW)
+
+
 def _pop_features(
-    x: np.ndarray,
-    y: np.ndarray,
-    gbest_y: float,
+    population: Population,
+    sample_costs: np.ndarray,
     cost_scale: float,
-    n_fes: int,
-    max_fes: int,
+    n_fe: int,
+    max_fe: int,
 ) -> np.ndarray:
-    """Return a 6-D population-state feature vector.
-
-    Features
-    --------
-    gbc       : normalised global-best cost
-    fdc       : fitness–distance correlation
-    disp      : mean distance to centroid (dispersion)
-    disp_ratio: dispersion / search-space range
-    nsc       : negative slope coefficient (distance vs fitness)
-    progress  : n_fes / max_fes
+    """Return 9-dim feature vector matching original RL-DAS Population.get_feature.
+
+    Features (in order):
+      gbc        normalised global-best cost
+      fdc        fitness-distance correlation
+      disp       dispersion (disp10 - disp_all)
+      disp_ratio max_pairwise_dist / sqrt(dim)
+      nsc        negative slope coefficient
+      anr        average neutral ratio
+      ni         non-improvable ratio
+      nw         non-worsenable ratio
+      progress   n_fe / max_fe
     """
-    NP = len(y)
-
-    # 1. Normalised global best
-    gbc = float(gbest_y / cost_scale) if cost_scale > 1e-10 else 0.0
-
-    # 2. Fitness–distance correlation
-    gbest_x = x[int(np.argmin(y))]
-    dists = np.linalg.norm(x - gbest_x, axis=1)
-    if dists.std() > 1e-10 and y.std() > 1e-10 and NP > 2:
-        fdc = float(np.corrcoef(dists, y)[0, 1])
-    else:
-        fdc = 0.0
-
-    # 3. Dispersion metrics
-    centroid = x.mean(axis=0)
-    disp_dists = np.linalg.norm(x - centroid, axis=1)
-    disp = float(disp_dists.mean())
-    spread = float(np.ptp(x))
-    disp_ratio = disp / (spread + 1e-10)
-
-    # 4. Negative slope coefficient
-    if dists.std() > 1e-10 and NP > 2:
-        try:
-            slope = float(np.polyfit(dists, y, 1)[0])
-            nsc = -slope if np.isfinite(slope) else 0.0
-        except (np.linalg.LinAlgError, ValueError):
-            nsc = 0.0
-    else:
-        nsc = 0.0
-
-    # 5. Progress
-    progress = float(n_fes / max(max_fes, 1))
-
-    out = np.array([gbc, fdc, disp, disp_ratio, nsc, progress], dtype=np.float32)
-    # Guard against any NaN/inf that could corrupt the network weights
+    group = population.group
+    cost = population.cost
+    lb = population.Xmin
+    ub = population.Xmax
+
+    group_norm = (group - lb) / (ub - lb + 1e-10)
+
+    gbc = float(population.gbest / (cost_scale + 1e-10))
+    fdc = _cal_fdc(group_norm, cost / (cost_scale + 1e-10))
+    disp, disp_ratio = _dispersion(group_norm, cost)
+    nsc = _negative_slope_coefficient(cost, sample_costs[0])
+    anr = _average_neutral_ratio(cost, sample_costs)
+    ni, nw = _non_improvable_worsenable(cost, sample_costs)
+    progress = float(n_fe / max(max_fe, 1))
+
+    out = np.array(
+        [gbc, fdc, disp, disp_ratio, nsc, anr, ni, nw, progress], dtype=np.float32
+    )
     return np.nan_to_num(out, nan=0.0, posinf=1.0, neginf=-1.0)
 
 
@@ -96,33 +168,34 @@ def _pop_features(
 class RLDASEnv(gym.Env):
     """RL-DAS environment wrapping BBOB problems via a cocoex Suite.
 
+    Uses a Population object as shared warm-started state across all DE
+    sub-optimizers (matching the original RL-DAS design).
+
     Parameters
     ----------
     problem_ids:
-        BBOB problem IDs to cycle through (one per episode).  Only IDs whose
-        dimension matches ``dim`` are used.
+        BBOB problem IDs to cycle through (one per episode).
     suite:
         cocoex Suite object.
     optimizers:
-        Ordered list of sub-optimizer classes (same pypop7-compatible classes
-        as used in DASEnv — defines the action space).
+        List of instantiated DE optimizer objects (NL_SHADE_RSP, JDE21, MadDE).
     dim:
-        Problem dimension.  The movement embeddings have shape (dim,), so the
-        agent is dimension-specific.
+        Problem dimension.
     fe_multiplier:
         Budget = fe_multiplier * dim.
     n_checkpoints:
         Number of optimizer-selection steps per episode.
     checkpoint_division_base:
-        cdb=1.0 → uniform checkpoints; cdb>1.0 → exponentially growing.
+        1.0 → uniform checkpoints; >1.0 → exponentially growing.
     n_individuals:
-        Population size.
+        Initial population size (Nmax).
     seed:
-        Seed for deterministic optimizer RNGs (passed to pypop7 as seed_rng).
+        Unused (kept for API compatibility with other DAS envs).
     """
 
     metadata = {"render_modes": []}
-    N_FEATURES = 6
+    N_FEATURES = 9
+    SAMPLE_TIMES = 2
 
     def __init__(
         self,
@@ -138,19 +211,17 @@ def __init__(
     ):
         super().__init__()
 
-        # Filter to matching dimension (BBOB IDs end with _d{dim:02d})
         self._problem_ids = [pid for pid in problem_ids if pid.endswith(f"_d{dim:02d}")]
         if not self._problem_ids:
             raise ValueError(f"No problem_ids match dimension {dim}.")
 
         self.suite = suite
-        self.optimizers = optimizers
+        self._optimizers = optimizers
         self.dim = dim
         self.fe_multiplier = fe_multiplier
         self.n_checkpoints = n_checkpoints
         self.cdb = checkpoint_division_base
         self.n_individuals = n_individuals
-        self._seed = seed
 
         self.n_opt = len(optimizers)
         obs_dim = self.N_FEATURES + 2 * self.n_opt * dim
@@ -160,28 +231,14 @@ def __init__(
         )
         self.action_space = spaces.Discrete(self.n_opt)
 
-        # Episode state
         self._problem = None
         self._problem_idx = 0
         self._max_fe = 0
         self._n_fe = 0
         self._checkpoints: np.ndarray | None = None
         self._checkpoint_idx = 0
-
-        # Current population arrays (shared warm-start state)
-        self._pop_x: np.ndarray | None = None
-        self._pop_y: np.ndarray | None = None
-        self._optimizer_state: dict = {}
-
-        # Global best tracking
-        self._gbest_y: float = np.inf
-        self._gbest_x: np.ndarray | None = None
+        self._population: Population | None = None
         self._cost_scale: float = 1.0
-        self._worst_y: float = -np.inf
-        self._initial_range: tuple[float, float] = (np.inf, -np.inf)
-
-        # Movement history: list of accumulated movement vectors per optimizer
-        # shape: (n_opt,) lists of (dim,) arrays; averaged in observation
         self._best_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)]
         self._worst_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)]
 
@@ -194,24 +251,20 @@ def reset(self, seed=None, options=None):
 
         problem_id = self._problem_ids[self._problem_idx % len(self._problem_ids)]
         self._problem_idx += 1
-
         self._problem = self.suite.get_problem(problem_id)
+
+        lb = float(self._problem.lower_bounds[0])
+        ub = float(self._problem.upper_bounds[0])
         self._max_fe = self.fe_multiplier * self.dim
         self._checkpoints = get_checkpoints(
             self.n_checkpoints, self._max_fe, self.n_individuals, self.cdb
         )
-
-        self._n_fe = 0
         self._checkpoint_idx = 0
-        self._optimizer_state = {}
-        self._pop_x = None
-        self._pop_y = None
 
-        self._gbest_y = np.inf
-        self._gbest_x = None
-        self._cost_scale = 1.0
-        self._worst_y = -np.inf
-        self._initial_range = (np.inf, -np.inf)
+        self._population = Population(self.dim, lb, ub, Nmax=self.n_individuals)
+        self._population.initialize_costs(self._problem)
+        self._cost_scale = max(float(self._population.gbest), 1e-10)
+        self._n_fe = self._population.NP
 
         self._best_history = [[] for _ in range(self.n_opt)]
         self._worst_history = [[] for _ in range(self.n_opt)]
@@ -223,143 +276,84 @@ def step(self, action: int):
         assert self._problem is not None, "Call reset() before step()"
 
         target_fe = int(self._checkpoints[self._checkpoint_idx])
-        prev_gbest = self._gbest_y
+        prev_gbest = self._population.gbest
 
-        # Record pre-step best/worst positions for movement computation
-        prev_best_x, prev_worst_x = self._get_best_worst_positions()
+        prev_best_x = self._population.gbest_solution.copy()
+        worst_idx = int(np.argmax(self._population.cost))
+        prev_worst_x = self._population.group[worst_idx].copy()
 
-        result = self._run_optimizer(action, target_fe)
-        self._update_state(result)
+        self._population, _, end_fes = self._optimizers[action].step(
+            self._population, self._problem, self._n_fe, target_fe, self._max_fe
+        )
+        self._n_fe = end_fes
 
-        # Compute movement for the chosen optimizer
-        new_best_x, new_worst_x = self._get_best_worst_positions()
-        best_move = (new_best_x - prev_best_x) / (self.dim**0.5 + 1e-10)
-        worst_move = (new_worst_x - prev_worst_x) / (self.dim**0.5 + 1e-10)
-        self._best_history[action].append(best_move)
-        self._worst_history[action].append(worst_move)
+        new_best_x = self._population.gbest_solution.copy()
+        new_worst_x = self._population.group[
+            int(np.argmax(self._population.cost))
+        ].copy()
+        scale = float(self.dim**0.5) + 1e-10
+        self._best_history[action].append((new_best_x - prev_best_x) / scale)
+        self._worst_history[action].append((new_worst_x - prev_worst_x) / scale)
 
         self._checkpoint_idx += 1
         terminated = (
             self._checkpoint_idx >= self.n_checkpoints or self._n_fe >= self._max_fe
         )
 
-        improvement = (
-            max(0.0, prev_gbest - self._gbest_y)
-            if np.isfinite(prev_gbest) and np.isfinite(self._gbest_y)
-            else 0.0
+        reward = float(
+            max(0.0, (prev_gbest - self._population.gbest) / (self._cost_scale + 1e-10))
         )
-        reward = float(improvement / (self._cost_scale + 1e-10))
-        if not np.isfinite(reward):
-            reward = 0.0
 
         obs = self._build_observation()
-        info = {"best_y": self._gbest_y, "n_fe": self._n_fe}
-        return obs, reward, terminated, False, info
+        return (
+            obs,
+            reward,
+            terminated,
+            False,
+            {"best_y": self._population.gbest, "n_fe": self._n_fe},
+        )
 
     # ------------------------------------------------------------------
-    # Optimizer execution (reuses pypop7 warm-start machinery)
+    # Local sampling (counts FEs toward budget)
     # ------------------------------------------------------------------
 
-    def _run_optimizer(self, action: int, target_fe: int) -> dict:
-        optimizer_class = self.optimizers[action]
-        problem_config = {
-            "fitness_function": self._problem,
-            "ndim_problem": self.dim,
-            "lower_boundary": self._problem.lower_bounds,
-            "upper_boundary": self._problem.upper_bounds,
-        }
-        options = {
-            "max_function_evaluations": self._max_fe,
-            "target_fe": target_fe,
-            "n_individuals": self.n_individuals,
-            "best_so_far_y": self._gbest_y,
-            "verbose": False,
-        }
-        if self._seed is not None:
-            options["seed_rng"] = (
-                self._seed * 1_000_000
-                + self._problem_idx * 1_000
-                + self._checkpoint_idx
-            ) % (2**31)
-
-        optimizer = optimizer_class(problem_config, options)
-        optimizer.n_function_evaluations = self._n_fe
-        optimizer.set_data(
-            best_x=self._gbest_x,
-            best_y=self._gbest_y if self._gbest_y < np.inf else None,
-            **self._optimizer_state,
-        )
-        result = optimizer.optimize()
-        if isinstance(result, tuple):
-            result = result[0]
-
-        new_state = optimizer.get_data()
-        if new_state:
-            self._optimizer_state = new_state
-        elif len(optimizer.x_history) > 0:
-            self._optimizer_state = {
-                "x": np.array(optimizer.x_history[-self.n_individuals :]),
-                "y": np.array(optimizer.y_history[-self.n_individuals :]),
-            }
-
-        # Update cached population arrays
-        if "x" in self._optimizer_state:
-            self._pop_x = self._optimizer_state["x"]
-            self._pop_y = self._optimizer_state["y"]
-
-        return result
-
-    def _update_state(self, result: dict) -> None:
-        new_best_y: float = result.get("best_so_far_y", np.inf)
-        new_best_x: np.ndarray | None = result.get("best_so_far_x")
-        worst_y: float = result.get("worst_so_far_y", -np.inf)
-
-        if new_best_y < self._gbest_y:
-            self._gbest_y = new_best_y
-            self._gbest_x = new_best_x
-
-        if worst_y > self._worst_y:
-            self._worst_y = worst_y
-
-        if self._initial_range[0] == np.inf:
-            self._initial_range = (new_best_y, max(worst_y, new_best_y + 1e-5))
-            self._cost_scale = max(abs(new_best_y), 1e-10)
-
-        y_hist = result.get("y_history")
-        n_fe_step = len(y_hist) if y_hist is not None else 0
-        self._n_fe = result.get("n_function_evaluations", self._n_fe + n_fe_step)
+    def _local_sample(self) -> np.ndarray:
+        """Run SAMPLE_TIMES independent trials on a deepcopy of the population.
+
+        Returns
+        -------
+        sample_costs : ndarray of shape (SAMPLE_TIMES, min_NP)
+        """
+        sample_size = self._population.NP
+        costs = []
+        min_len = sample_size
+        for _ in range(self.SAMPLE_TIMES):
+            pop_copy = copy.deepcopy(self._population)
+            opt = self._optimizers[np.random.randint(self.n_opt)]
+            popped, _, _ = opt.step(
+                pop_copy,
+                self._problem,
+                self._n_fe,
+                self._n_fe + sample_size,
+                self._max_fe,
+            )
+            costs.append(popped.cost.copy())
+            if popped.cost.shape[0] < min_len:
+                min_len = popped.cost.shape[0]
+        self._n_fe = min(self._n_fe + sample_size * self.SAMPLE_TIMES, self._max_fe)
+        return np.array([c[:min_len] for c in costs])
 
     # ------------------------------------------------------------------
     # Observation construction
     # ------------------------------------------------------------------
 
-    def _get_best_worst_positions(self) -> tuple[np.ndarray, np.ndarray]:
-        """Return current best and worst particle positions."""
-        if self._pop_x is None or len(self._pop_x) == 0:
-            zeros = np.zeros(self.dim, dtype=np.float32)
-            return zeros, zeros
-        best_idx = int(np.argmin(self._pop_y))
-        worst_idx = int(np.argmax(self._pop_y))
-        return self._pop_x[best_idx].astype(np.float32), self._pop_x[worst_idx].astype(
-            np.float32
-        )
-
     def _build_observation(self) -> np.ndarray:
-        # Population-state features
-        if self._pop_x is not None and len(self._pop_x) > 0:
-            features = _pop_features(
-                self._pop_x,
-                self._pop_y,
-                self._gbest_y,
-                self._cost_scale,
-                self._n_fe,
-                self._max_fe,
-            )
-        else:
-            features = np.zeros(self.N_FEATURES, dtype=np.float32)
+        sample_costs = self._local_sample()
+        features = _pop_features(
+            self._population, sample_costs, self._cost_scale, self._n_fe, self._max_fe
+        )
 
-        # Per-optimizer movement history (averaged, or zero if unused)
-        movements = []
+        movements: list[np.ndarray] = []
         for k in range(self.n_opt):
             if self._best_history[k]:
                 best_emb = np.mean(self._best_history[k], axis=0).astype(np.float32)
diff --git a/agents/rl_das/network.py b/agents/rl_das/network.py
index efc0fd7..16e08ab 100644
--- a/agents/rl_das/network.py
+++ b/agents/rl_das/network.py
@@ -6,13 +6,13 @@
 
 Architecture (following Guo et al. 2024):
 
-  Input: flat obs = [features_6d | best_move_0_dim | worst_move_0_dim | ...]
+  Input: flat obs = [features_9d | best_move_0_dim | worst_move_0_dim | ...]
 
   For each of the 2*n_opt movement blocks:
       embedder_k : Linear(dim, 64) -> ReLU -> Linear(64, 1) -> ReLU
 
-  backbone_input = cat(features_6d, *[emb_k(move_k) for k])   shape: (6+2*n_opt,)
-  backbone       : Linear(6+2*n_opt, 64) -> Tanh -> Linear(64, 16) -> Tanh
+  backbone_input = cat(features_9d, *[emb_k(move_k) for k])   shape: (9+2*n_opt,)
+  backbone       : Linear(9+2*n_opt, 64) -> Tanh -> Linear(64, 16) -> Tanh
 
   Actor head  : Linear(16, n_opt) -> Softmax
   Critic head : Linear(16, 1)
@@ -42,7 +42,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 class _RLDASBackbone(nn.Module):
     """Shared feature extractor used by both Actor and Critic."""
 
-    N_FEATURES = 6  # must match env.RLDASEnv.N_FEATURES
+    N_FEATURES = 9  # must match env.RLDASEnv.N_FEATURES
 
     def __init__(self, dim: int, n_opt: int) -> None:
         super().__init__()
diff --git a/agents/rl_das/optimizers.py b/agents/rl_das/optimizers.py
new file mode 100644
index 0000000..40dae89
--- /dev/null
+++ b/agents/rl_das/optimizers.py
@@ -0,0 +1,669 @@
+"""DE optimizers for RL-DAS: NL_SHADE_RSP, JDE21, MadDE.
+
+Ported from the original RL-DAS (Guo et al., 2024) with the following
+adaptations for DAS2 / BBOB:
+
+- Batch evaluation: ``np.array([float(problem(xs[i])) for i in range(len(xs))])``
+  instead of ``problem.func(xs) - problem.optimum``.
+- Bounds from ``population.Xmin`` / ``population.Xmax`` (not hardcoded ±100).
+- No early-termination at ``cost < 1e-8`` (FE budget controls episode end).
+- ``__init__`` takes no ``dim`` argument (dimension inferred from population).
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import scipy.stats as stats
+
+
+def _eval(problem, xs: np.ndarray) -> np.ndarray:
+    return np.array([float(problem(xs[i])) for i in range(len(xs))])
+
+
+class NL_SHADE_RSP:
+    def __init__(self) -> None:
+        self.pb = 0.4
+        self.pa = 0.5
+
+    def _binomial(self, x: np.ndarray, v: np.ndarray, cr: np.ndarray) -> np.ndarray:
+        NP, dim = x.shape
+        jrand = np.random.randint(dim, size=NP)
+        u = np.where(np.random.rand(NP, dim) < cr.repeat(dim).reshape(NP, dim), v, x)
+        u[np.arange(NP), jrand] = v[np.arange(NP), jrand]
+        return u
+
+    def _exponential(self, x: np.ndarray, v: np.ndarray, cr: np.ndarray) -> np.ndarray:
+        NP, dim = x.shape
+        u = x.copy()
+        L = np.random.randint(dim, size=NP).repeat(dim).reshape(NP, dim)
+        L = L <= np.arange(dim)
+        rvs = np.random.rand(NP, dim)
+        L = np.where(rvs > cr.repeat(dim).reshape(NP, dim), L, 0)
+        return u * (1 - L) + v * L
+
+    def _update_pa(self, fa: float, fp: float, na: int, NP: int) -> None:
+        if na == 0 or fa == 0:
+            self.pa = 0.5
+            return
+        self.pa = (fa / (na + 1e-15)) / ((fa / (na + 1e-15)) + (fp / (NP - na + 1e-15)))
+        self.pa = float(np.clip(self.pa, 0.1, 0.9))
+
+    def step(
+        self,
+        population,
+        problem,
+        FEs: int,
+        FEs_end: int,
+        MaxFEs: int,
+        record_period: int = -1,
+    ):
+        if record_period <= 0:
+            record_period = FEs_end - FEs
+        NP, dim = population.NP, population.dim
+        NA = int(NP * 2.1)
+        if NA < population.archive.shape[0]:
+            population.archive = population.archive[:NA]
+        self.pa = 0.5
+        k = 1
+        Fevs = []
+        while FEs >= k * record_period:
+            k += 1
+        population.sort(population.NP)
+
+        while FEs < FEs_end and FEs < MaxFEs:
+            Cr, F = population.choose_F_Cr()
+            Cr = np.sort(Cr)
+            pr = np.exp(-(np.arange(NP) + 1) / NP)
+            pr /= np.sum(pr)
+            cross_exponential = np.random.random() < 0.5
+            pb_upper = int(max(2, NP * self.pb))
+            pbs = np.random.randint(pb_upper, size=NP)
+            count = 0
+            duplicate = np.where(pbs == np.arange(NP))[0]
+            while duplicate.shape[0] > 0 and count < 1:
+                pbs[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+                duplicate = np.where(pbs == np.arange(NP))[0]
+                count += 1
+            xpb = population.group[pbs]
+            r1 = np.random.randint(NP, size=NP)
+            count = 0
+            duplicate = np.where((r1 == np.arange(NP)) + (r1 == pbs))[0]
+            while duplicate.shape[0] > 0 and count < 25:
+                r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+                duplicate = np.where((r1 == np.arange(NP)) + (r1 == pbs))[0]
+                count += 1
+            x1 = population.group[r1]
+            rvs = np.random.rand(NP)
+            r2_pop = np.where(rvs >= self.pa)[0]
+            r2_arc = np.where(rvs < self.pa)[0]
+            use_arc = np.zeros(NP, dtype=bool)
+            use_arc[r2_arc] = True
+            if population.archive.shape[0] < 25:
+                r2_pop = np.arange(NP)
+                r2_arc = np.array([], dtype=np.int32)
+            r2 = np.random.choice(np.arange(NP), size=r2_pop.shape[0], p=pr)
+            count = 0
+            duplicate = np.where(
+                (r2 == r2_pop) + (r2 == pbs[r2_pop]) + (r2 == r1[r2_pop])
+            )[0]
+            while duplicate.shape[0] > 0 and count < 25:
+                r2[duplicate] = np.random.choice(
+                    np.arange(NP), size=duplicate.shape[0], p=pr
+                )
+                duplicate = np.where(
+                    (r2 == r2_pop) + (r2 == pbs[r2_pop]) + (r2 == r1[r2_pop])
+                )[0]
+                count += 1
+            x2 = np.zeros((NP, dim))
+            if r2_pop.shape[0] > 0:
+                x2[r2_pop] = population.group[r2]
+            if r2_arc.shape[0] > 0:
+                x2[r2_arc] = population.archive[
+                    np.random.randint(
+                        min(population.archive.shape[0], NA),
+                        size=r2_arc.shape[0],
+                    )
+                ]
+            Fs = F.repeat(dim).reshape(NP, dim)
+            vs = population.group + Fs * (xpb - population.group) + Fs * (x1 - x2)
+            Crb = np.zeros(NP)
+            tmp_id = np.where(np.arange(NP) + FEs < 0.5 * MaxFEs)[0]
+            Crb[tmp_id] = 2 * ((FEs + tmp_id) / MaxFEs - 0.5)
+            if cross_exponential:
+                us = self._binomial(population.group, vs, Crb)
+            else:
+                us = self._exponential(population.group, vs, Cr)
+
+            # Reinitialise out-of-bounds elements with uniform random in [lb, ub]
+            oob = (us < population.Xmin) | (us > population.Xmax)
+            if np.any(oob):
+                rand_pts = (
+                    np.random.rand(NP, dim) * (population.Xmax - population.Xmin)
+                    + population.Xmin
+                )
+                us = np.where(oob, rand_pts, us)
+
+            cost = _eval(problem, us)
+            optim = np.where(cost < population.cost)[0]
+            for i in optim:
+                population.update_archive(i)
+            population.F[optim] = F[optim]
+            population.Cr[optim] = Cr[optim]
+            SF = F[optim]
+            SCr = Cr[optim]
+            df = (population.cost[optim] - cost[optim]) / (
+                population.cost[optim] + 1e-9
+            )
+            arc_usage = use_arc[optim]
+            fp = float(np.sum(df[arc_usage]))
+            fa = float(np.sum(df[~arc_usage]))
+            na = int(np.sum(arc_usage))
+            population.group[optim] = us[optim]
+            population.cost[optim] = cost[optim]
+            if float(np.min(cost)) < population.gbest:
+                population.gbest = float(np.min(cost))
+                population.gbest_solution = us[np.argmin(cost)].copy()
+            FEs += NP
+            self.pb = 0.4 - 0.2 * (FEs / MaxFEs)
+            population.NLPSR(FEs, MaxFEs)
+            population.update_M_F_Cr(SF, SCr, df)
+            self._update_pa(fa, fp, na, NP)
+            NP = population.NP
+            NA = population.NA
+            if FEs >= k * record_period:
+                Fevs.append(population.gbest)
+                k += 1
+
+        return population, Fevs, min(FEs, MaxFEs)
+
+
+class JDE21:
+    def __init__(self) -> None:
+        self.sNP = 10
+        self.tao1 = 0.1
+        self.tao2 = 0.1
+        self.Finit = 0.5
+        self.CRinit = 0.9
+        self.Fl_b = 0.1
+        self.Fl_s = 0.17
+        self.Fu = 1.1
+        self.CRl_b = 0.0
+        self.CRl_s = 0.1
+        self.CRu_b = 1.1
+        self.CRu_s = 0.8
+        self.eps = 1e-12
+        self.MyEps = 0.25
+        self.nReset = 0
+        self.sReset = 0
+        self.cCopy = 0
+
+    def _prevec_enakih(self, cost: np.ndarray, best: float) -> bool:
+        eqs = len(cost[np.fabs(cost - best) < self.eps])
+        return eqs > 2 and eqs > len(cost) * self.MyEps
+
+    def _crowding(self, group: np.ndarray, vs: np.ndarray) -> np.ndarray:
+        NP, dim = vs.shape
+        dist = np.sum(
+            ((group * np.ones((NP, NP, dim))).transpose(1, 0, 2) - vs) ** 2, -1
+        ).transpose()
+        return np.argmin(dist, -1)
+
+    def step(
+        self,
+        population,
+        problem,
+        FEs: int,
+        FEs_end: int,
+        MaxFEs: int,
+        record_period: int = -1,
+    ):
+        if record_period <= 0:
+            record_period = FEs_end - FEs
+        NP = population.NP
+        dim = population.dim
+        sNP = self.sNP
+        bNP = max(NP - sNP, 2)
+        age = 0
+        k = 1
+        Fevs = []
+
+        def mutate_cross_select(r1, r2, r3, SF, SCr, df, age, big):
+            nonlocal bNP
+            if big:
+                xNP = bNP
+                randF = np.random.rand(xNP) * self.Fu + self.Fl_b
+                randCr = np.random.rand(xNP) * self.CRu_b + self.CRl_b
+                pF = population.F[:xNP]
+                pCr = population.Cr[:xNP]
+            else:
+                # Use actual slice size (may be < sNP when population shrank)
+                xNP = population.F[bNP:].shape[0]
+                if xNP == 0:
+                    return SF, SCr, df, age
+                randF = np.random.rand(xNP) * self.Fu + self.Fl_s
+                randCr = np.random.rand(xNP) * self.CRu_b + self.CRl_s
+                pF = population.F[bNP:]
+                pCr = population.Cr[bNP:]
+
+            rvs = np.random.rand(xNP)
+            F = np.where(rvs < self.tao1, randF, pF)
+            rvs = np.random.rand(xNP)
+            Cr = np.where(rvs < self.tao2, randCr, pCr)
+            Fs = F.repeat(dim).reshape(xNP, dim)
+            Crs = Cr.repeat(dim).reshape(xNP, dim)
+            v = population.group[r1] + Fs * (
+                population.group[r2] - population.group[r3]
+            )
+            v = np.clip(v, population.Xmin, population.Xmax)
+            jrand = np.random.randint(dim, size=xNP)
+            u = np.where(
+                np.random.rand(xNP, dim) < Crs,
+                v,
+                (population.group[:bNP] if big else population.group[bNP:]),
+            )
+            u[np.arange(xNP), jrand] = v[np.arange(xNP), jrand]
+            cost = _eval(problem, u)
+            crowding_ids = (
+                self._crowding(population.group[:xNP], u)
+                if big
+                else np.arange(xNP) + bNP
+            )
+            age += xNP
+            for i in range(xNP):
+                idx = crowding_ids[i]
+                if cost[i] < population.cost[idx]:
+                    population.update_archive(idx)
+                    population.group[idx] = u[i]
+                    population.cost[idx] = cost[i]
+                    population.F[idx] = F[i]
+                    population.Cr[idx] = Cr[i]
+                    SF = np.append(SF, F[i])
+                    SCr = np.append(SCr, Cr[i])
+                    d = (population.cost[i] - cost[i]) / (population.cost[i] + 1e-9)
+                    df = np.append(df, d)
+                    if cost[i] < population.cbest:
+                        age = 0
+                        population.cbest_id = idx
+                        population.cbest = cost[i]
+                        if cost[i] < population.gbest:
+                            population.gbest = cost[i]
+                            population.gbest_solution = u[i].copy()
+
+            return SF, SCr, df, age
+
+        population.sort(NP, True)
+        while FEs >= k * record_period:
+            k += 1
+        while FEs < FEs_end:
+            df = np.array([])
+            SF = np.array([])
+            SCr = np.array([])
+            if (
+                self._prevec_enakih(population.cost[:bNP], population.gbest)
+                or age > MaxFEs / 10
+            ):
+                self.nReset += 1
+                population.group[:bNP] = population.initialize_group(bNP)
+                population.F[:bNP] = self.Finit
+                population.Cr[:bNP] = self.CRinit
+                population.cost[:bNP] = 1e15
+                age = 0
+                population.cbest = float(np.min(population.cost))
+                population.cbest_id = int(np.argmin(population.cost))
+
+            if FEs < MaxFEs / 3:
+                mig = 1
+            elif FEs < 2 * MaxFEs / 3:
+                mig = 2
+            else:
+                mig = 3
+
+            r1 = np.random.randint(bNP, size=bNP)
+            count = 0
+            duplicate = np.where((r1 == np.arange(bNP)) * (r1 == population.cbest_id))[
+                0
+            ]
+            while duplicate.shape[0] > 0 and count < 25:
+                r1[duplicate] = np.random.randint(bNP, size=duplicate.shape[0])
+                duplicate = np.where(
+                    (r1 == np.arange(bNP)) * (r1 == population.cbest_id)
+                )[0]
+                count += 1
+            r2 = np.random.randint(bNP + mig, size=bNP)
+            count = 0
+            duplicate = np.where((r2 == np.arange(bNP)) + (r2 == r1))[0]
+            while duplicate.shape[0] > 0 and count < 25:
+                r2[duplicate] = np.random.randint(bNP + mig, size=duplicate.shape[0])
+                duplicate = np.where((r2 == np.arange(bNP)) + (r2 == r1))[0]
+                count += 1
+            r3 = np.random.randint(bNP + mig, size=bNP)
+            count = 0
+            duplicate = np.where((r3 == np.arange(bNP)) + (r3 == r1) + (r3 == r2))[0]
+            while duplicate.shape[0] > 0 and count < 25:
+                r3[duplicate] = np.random.randint(bNP + mig, size=duplicate.shape[0])
+                duplicate = np.where((r3 == np.arange(bNP)) + (r3 == r1) + (r3 == r2))[
+                    0
+                ]
+                count += 1
+            SF, SCr, df, age = mutate_cross_select(
+                r1, r2, r3, SF, SCr, df, age, big=True
+            )
+            FEs += bNP
+            if FEs >= k * record_period:
+                Fevs.append(population.gbest)
+                k += 1
+            if FEs >= FEs_end or FEs >= MaxFEs:
+                break
+
+            curr_sNP = NP - bNP
+            if (
+                curr_sNP > 0
+                and population.cbest_id >= bNP
+                and self._prevec_enakih(population.cost[bNP:], population.cbest)
+            ):
+                self.sReset += 1
+                cbest = population.cbest
+                cbest_id = population.cbest_id
+                tmp = population.group[cbest_id].copy()
+                population.group[bNP:] = population.initialize_group(curr_sNP)
+                population.F[bNP:] = self.Finit
+                population.Cr[bNP:] = self.CRinit
+                population.cost[bNP:] = 1e15
+                population.cbest = cbest
+                population.cbest_id = cbest_id
+                population.group[cbest_id] = tmp
+                population.cost[cbest_id] = cbest
+
+            if population.cbest_id < bNP:
+                self.cCopy += 1
+                population.cost[bNP] = population.cbest
+                population.group[bNP] = population.group[population.cbest_id].copy()
+                population.cbest_id = bNP
+
+            curr_sNP = NP - bNP  # actual small-pop size (= sNP unless shrunken)
+            for _ in range(bNP // curr_sNP if curr_sNP > 0 else 0):
+                r1 = np.random.randint(curr_sNP, size=curr_sNP) + bNP
+                count = 0
+                duplicate = np.where(r1 == (np.arange(curr_sNP) + bNP))[0]
+                while duplicate.shape[0] > 0 and count < 25:
+                    r1[duplicate] = (
+                        np.random.randint(curr_sNP, size=duplicate.shape[0]) + bNP
+                    )
+                    duplicate = np.where(r1 == (np.arange(curr_sNP) + bNP))[0]
+                    count += 1
+                r2 = np.random.randint(curr_sNP, size=curr_sNP) + bNP
+                count = 0
+                duplicate = np.where((r2 == (np.arange(curr_sNP) + bNP)) + (r2 == r1))[
+                    0
+                ]
+                while duplicate.shape[0] > 0 and count < 25:
+                    r2[duplicate] = (
+                        np.random.randint(curr_sNP, size=duplicate.shape[0]) + bNP
+                    )
+                    duplicate = np.where(
+                        (r2 == (np.arange(curr_sNP) + bNP)) + (r2 == r1)
+                    )[0]
+                    count += 1
+                r3 = np.random.randint(curr_sNP, size=curr_sNP) + bNP
+                count = 0
+                duplicate = np.where(
+                    (r3 == (np.arange(curr_sNP) + bNP)) + (r3 == r1) + (r3 == r2)
+                )[0]
+                while duplicate.shape[0] > 0 and count < 25:
+                    r3[duplicate] = (
+                        np.random.randint(curr_sNP, size=duplicate.shape[0]) + bNP
+                    )
+                    duplicate = np.where(
+                        (r3 == (np.arange(curr_sNP) + bNP)) + (r3 == r1) + (r3 == r2)
+                    )[0]
+                    count += 1
+                SF, SCr, df, age = mutate_cross_select(
+                    r1, r2, r3, SF, SCr, df, age, big=False
+                )
+                FEs += curr_sNP
+                if FEs >= k * record_period:
+                    Fevs.append(population.gbest)
+                    k += 1
+                if FEs >= FEs_end or FEs >= MaxFEs:
+                    break
+
+            population.update_M_F_Cr(SF, SCr, df)
+            NP = min(
+                int(population.cal_NP_next_gen(FEs, MaxFEs)), len(population.group)
+            )
+            population.NP = NP
+            population.group = population.group[-NP:]
+            population.cost = population.cost[-NP:]
+            population.F = population.F[-NP:]
+            population.Cr = population.Cr[-NP:]
+            population.cbest_id = int(np.argmin(population.cost))
+            population.cbest = float(np.min(population.cost))
+            bNP = max(NP - sNP, 2)
+
+        return population, Fevs, min(FEs, MaxFEs)
+
+
+class MadDE:
+    def __init__(self) -> None:
+        self.p = 0.18
+        self.PqBX = 0.01
+        self.pm = np.ones(3) / 3
+
+    def _ctb_w_arc(
+        self,
+        group: np.ndarray,
+        best: np.ndarray,
+        archive: np.ndarray,
+        Fs: np.ndarray,
+    ) -> np.ndarray:
+        NP, dim = group.shape
+        NB = best.shape[0]
+        NA = archive.shape[0]
+        count = 0
+        rb = np.random.randint(NB, size=NP)
+        duplicate = np.where(rb == np.arange(NP))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            rb[duplicate] = np.random.randint(NB, size=duplicate.shape[0])
+            duplicate = np.where(rb == np.arange(NP))[0]
+            count += 1
+        count = 0
+        r1 = np.random.randint(NP, size=NP)
+        duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+            duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
+            count += 1
+        count = 0
+        r2 = np.random.randint(NP + NA, size=NP)
+        duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r2[duplicate] = np.random.randint(NP + NA, size=duplicate.shape[0])
+            duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
+            count += 1
+        xb = best[rb]
+        x1 = group[r1]
+        x2 = np.concatenate((group, archive), 0)[r2] if NA > 0 else group[r2]
+        return group + Fs * (xb - group) + Fs * (x1 - x2)
+
+    def _ctr_w_arc(
+        self,
+        group: np.ndarray,
+        archive: np.ndarray,
+        Fs: np.ndarray,
+    ) -> np.ndarray:
+        NP, dim = group.shape
+        NA = archive.shape[0]
+        count = 0
+        r1 = np.random.randint(NP, size=NP)
+        duplicate = np.where(r1 == np.arange(NP))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+            duplicate = np.where(r1 == np.arange(NP))[0]
+            count += 1
+        count = 0
+        r2 = np.random.randint(NP + NA, size=NP)
+        duplicate = np.where((r2 == np.arange(NP)) + (r2 == r1))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r2[duplicate] = np.random.randint(NP + NA, size=duplicate.shape[0])
+            duplicate = np.where((r2 == np.arange(NP)) + (r2 == r1))[0]
+            count += 1
+        x1 = group[r1]
+        x2 = np.concatenate((group, archive), 0)[r2] if NA > 0 else group[r2]
+        return group + Fs * (x1 - x2)
+
+    def _weighted_rtb(
+        self,
+        group: np.ndarray,
+        best: np.ndarray,
+        Fs: np.ndarray,
+        Fas: float,
+    ) -> np.ndarray:
+        NP, dim = group.shape
+        NB = best.shape[0]
+        count = 0
+        rb = np.random.randint(NB, size=NP)
+        duplicate = np.where(rb == np.arange(NP))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            rb[duplicate] = np.random.randint(NB, size=duplicate.shape[0])
+            duplicate = np.where(rb == np.arange(NP))[0]
+            count += 1
+        count = 0
+        r1 = np.random.randint(NP, size=NP)
+        duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+            duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
+            count += 1
+        count = 0
+        r2 = np.random.randint(NP, size=NP)
+        duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r2[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+            duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
+            count += 1
+        return Fs * group[r1] + Fs * Fas * (best[rb] - group[r2])
+
+    @staticmethod
+    def _binomial(x: np.ndarray, v: np.ndarray, Crs: np.ndarray) -> np.ndarray:
+        NP, dim = x.shape
+        jrand = np.random.randint(dim, size=NP)
+        u = np.where(np.random.rand(NP, dim) < Crs, v, x)
+        u[np.arange(NP), jrand] = v[np.arange(NP), jrand]
+        return u
+
+    def step(
+        self,
+        population,
+        problem,
+        FEs: int,
+        FEs_end: int,
+        MaxFEs: int,
+        record_period: int = -1,
+    ):
+        if record_period <= 0:
+            record_period = FEs_end - FEs
+        Fevs = []
+        k = 1
+        while FEs >= k * record_period:
+            k += 1
+        population.sort(population.NP)
+        while FEs < FEs_end and FEs < MaxFEs:
+            NP, dim = population.NP, population.dim
+            q = 2 * self.p - self.p * FEs / MaxFEs
+            Fa = 0.5 + 0.5 * FEs / MaxFEs
+            Cr, F = population.choose_F_Cr()
+            mu = np.random.choice(3, size=NP, p=self.pm)
+            p1 = population.group[mu == 0]
+            p2 = population.group[mu == 1]
+            p3 = population.group[mu == 2]
+            pbest = population.group[: max(int(self.p * NP), 2)]
+            qbest = population.group[: max(int(q * NP), 2)]
+            Fs = F.repeat(dim).reshape(NP, dim)
+            v1 = self._ctb_w_arc(p1, pbest, population.archive, Fs[mu == 0])
+            v2 = self._ctr_w_arc(p2, population.archive, Fs[mu == 1])
+            v3 = self._weighted_rtb(p3, qbest, Fs[mu == 2], Fa)
+            v = np.zeros((NP, dim))
+            v[mu == 0] = v1
+            v[mu == 1] = v2
+            v[mu == 2] = v3
+
+            # Bounds handling: midpoint reflection with lb/ub
+            Xmin = population.Xmin
+            Xmax = population.Xmax
+            v = np.where(v < Xmin, (population.group + Xmin) / 2, v)
+            v = np.where(v > Xmax, (population.group + Xmax) / 2, v)
+
+            rvs = np.random.rand(NP)
+            Crs = Cr.repeat(dim).reshape(NP, dim)
+            u = np.zeros((NP, dim))
+            if np.sum(rvs <= self.PqBX) > 0:
+                qu = v[rvs <= self.PqBX]
+                if population.archive.shape[0] > 0:
+                    qbest = np.concatenate((population.group, population.archive), 0)[
+                        : max(int(q * (NP + population.archive.shape[0])), 2)
+                    ]
+                cross_qbest = qbest[np.random.randint(qbest.shape[0], size=qu.shape[0])]
+                qu = self._binomial(cross_qbest, qu, Crs[rvs <= self.PqBX])
+                u[rvs <= self.PqBX] = qu
+            bu = v[rvs > self.PqBX]
+            bu = self._binomial(
+                population.group[rvs > self.PqBX], bu, Crs[rvs > self.PqBX]
+            )
+            u[rvs > self.PqBX] = bu
+
+            ncost = _eval(problem, u)
+            FEs += NP
+            optim = np.where(ncost < population.cost)[0]
+            for i in optim:
+                population.update_archive(i)
+            SF = F[optim]
+            SCr = Cr[optim]
+            df = np.maximum(0, population.cost - ncost)
+            population.update_M_F_Cr(SF, SCr, df[optim])
+            count_S = np.zeros(3)
+            for i in range(3):
+                count_S[i] = np.mean(df[mu == i] / (population.cost[mu == i] + 1e-9))
+            if np.sum(count_S) > 0:
+                self.pm = np.clip(count_S / np.sum(count_S), 0.1, 0.9)
+                self.pm /= np.sum(self.pm)
+            else:
+                self.pm = np.ones(3) / 3
+            population.group[optim] = u[optim]
+            population.cost = np.minimum(population.cost, ncost)
+            population.NLPSR(FEs, MaxFEs)
+            if float(np.min(population.cost)) < population.gbest:
+                population.gbest = float(np.min(population.cost))
+                population.gbest_solution = population.group[
+                    np.argmin(population.cost)
+                ].copy()
+            if FEs >= k * record_period:
+                Fevs.append(population.gbest)
+                k += 1
+
+        return population, Fevs, min(FEs, MaxFEs)
+
+
+_PORTFOLIO: dict[str, type] = {
+    "NL_SHADE_RSP": NL_SHADE_RSP,
+    "MADDE": MadDE,
+    "JDE21": JDE21,
+}
+
+
+def get_rldas_portfolio(names: list[str] | None = None) -> list:
+    """Return instantiated RL-DAS optimizer objects.
+
+    Parameters
+    ----------
+    names:
+        Optimizer names. Defaults to ``["NL_SHADE_RSP", "MADDE", "JDE21"]``.
+        Valid names: ``"NL_SHADE_RSP"``, ``"MADDE"``, ``"JDE21"``.
+    """
+    if names is None:
+        names = ["NL_SHADE_RSP", "MADDE", "JDE21"]
+    unknown = [n for n in names if n not in _PORTFOLIO]
+    if unknown:
+        raise ValueError(
+            f"Unknown RL-DAS optimizer(s): {unknown}. Valid choices: {list(_PORTFOLIO)}"
+        )
+    return [_PORTFOLIO[n]() for n in names]
diff --git a/agents/rl_das/population.py b/agents/rl_das/population.py
new file mode 100644
index 0000000..a65ffe2
--- /dev/null
+++ b/agents/rl_das/population.py
@@ -0,0 +1,137 @@
+"""Population class for RL-DAS DE optimizers.
+
+Direct port of the original RL-DAS Population with configurable bounds
+so it works with BBOB problems ([-5, 5]) instead of CEC ([-100, 100]).
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import scipy.stats as stats
+
+
+class Population:
+    def __init__(
+        self,
+        dim: int,
+        lb: np.ndarray | float,
+        ub: np.ndarray | float,
+        Nmax: int = 170,
+        Nmin: int = 30,
+    ):
+        self.dim = dim
+        self.Nmax = Nmax
+        self.Nmin = Nmin
+        self.NP = Nmax
+        self.NA = int(self.NP * 2.1)
+        self.cost = np.zeros(self.NP)
+        self.cbest: float = 1e15
+        self.cbest_id: int = -1
+        self.gbest: float = 1e15
+        self.gbest_solution = np.zeros(dim)
+        self.Xmin = (
+            np.full(dim, lb, dtype=float)
+            if np.isscalar(lb)
+            else np.asarray(lb, dtype=float)
+        )
+        self.Xmax = (
+            np.full(dim, ub, dtype=float)
+            if np.isscalar(ub)
+            else np.asarray(ub, dtype=float)
+        )
+        self.group = self._init_group()
+        self.archive: np.ndarray = np.array([])
+        self.MF = np.ones(dim * 20) * 0.2
+        self.MCr = np.ones(dim * 20) * 0.2
+        self.k: int = 0
+        self.F = np.ones(self.NP) * 0.5
+        self.Cr = np.ones(self.NP) * 0.9
+
+    def _init_group(self, size: int = -1) -> np.ndarray:
+        if size < 0:
+            size = self.NP
+        return np.random.rand(size, self.dim) * (self.Xmax - self.Xmin) + self.Xmin
+
+    # Alias used by JDE21 reset logic
+    def initialize_group(self, size: int = -1) -> np.ndarray:
+        return self._init_group(size)
+
+    def initialize_costs(self, problem) -> None:
+        self.cost = np.array([float(problem(self.group[i])) for i in range(self.NP)])
+        self.gbest = self.cbest = float(np.min(self.cost))
+        self.cbest_id = int(np.argmin(self.cost))
+        self.gbest_solution = self.group[self.cbest_id].copy()
+
+    def sort(self, size: int, reverse: bool = False) -> None:
+        r = -1 if reverse else 1
+        ind = np.concatenate(
+            (np.argsort(r * self.cost[:size]), np.arange(self.NP)[size:])
+        )
+        self.cost = self.cost[ind]
+        self.cbest = float(np.min(self.cost))
+        self.cbest_id = int(np.argmin(self.cost))
+        self.group = self.group[ind]
+        self.F = self.F[ind]
+        self.Cr = self.Cr[ind]
+
+    def cal_NP_next_gen(self, FEs: int, MaxFEs: int) -> float:
+        return np.round(
+            self.Nmax
+            + (self.Nmin - self.Nmax) * np.power(FEs / MaxFEs, 1 - FEs / MaxFEs)
+        )
+
+    def slice(self, size: int) -> None:
+        self.NP = size
+        self.group = self.group[:size]
+        self.cost = self.cost[:size]
+        self.F = self.F[:size]
+        self.Cr = self.Cr[:size]
+        if self.cbest_id >= size:
+            self.cbest_id = int(np.argmin(self.cost))
+            self.cbest = float(np.min(self.cost))
+
+    def _mean_wL(self, df: np.ndarray, s: np.ndarray) -> float:
+        w = df / np.sum(df)
+        if np.sum(w * s) > 1e-6:
+            return float(np.sum(w * (s**2)) / np.sum(w * s))
+        return 0.5
+
+    def choose_F_Cr(self) -> tuple[np.ndarray, np.ndarray]:
+        gs = self.NP
+        ind_r = np.random.randint(0, self.MF.shape[0], size=gs)
+        C_r = np.minimum(
+            1, np.maximum(0, np.random.normal(loc=self.MCr[ind_r], scale=0.1, size=gs))
+        )
+        cauchy_locs = self.MF[ind_r]
+        F = stats.cauchy.rvs(loc=cauchy_locs, scale=0.1, size=gs)
+        err = np.where(F < 0)[0]
+        F[err] = 2 * cauchy_locs[err] - F[err]
+        return C_r, np.minimum(1, F)
+
+    def update_M_F_Cr(self, SF: np.ndarray, SCr: np.ndarray, df: np.ndarray) -> None:
+        if SF.shape[0] > 0:
+            self.MF[self.k] = self._mean_wL(df, SF)
+            self.MCr[self.k] = self._mean_wL(df, SCr)
+            self.k = (self.k + 1) % self.MF.shape[0]
+        else:
+            self.MF[self.k] = 0.5
+            self.MCr[self.k] = 0.5
+
+    def NLPSR(self, FEs: int, MaxFEs: int) -> None:
+        self.sort(self.NP)
+        N = self.cal_NP_next_gen(FEs, MaxFEs)
+        A = int(max(N * 2.1, self.Nmin))
+        N = int(N)
+        if N < self.NP:
+            self.slice(N)
+        if A < self.archive.shape[0]:
+            self.NA = A
+            self.archive = self.archive[:A]
+
+    def update_archive(self, old_id: int) -> None:
+        if self.archive.shape[0] < self.NA:
+            self.archive = np.append(self.archive, self.group[old_id]).reshape(
+                -1, self.dim
+            )
+        else:
+            self.archive[np.random.randint(self.archive.shape[0])] = self.group[old_id]
diff --git a/cv.py b/cv.py
index 557ec2c..28ae3d1 100644
--- a/cv.py
+++ b/cv.py
@@ -140,6 +140,10 @@ def _parse_args() -> argparse.Namespace:
         "--save-interval", type=int, default=50, help="Checkpoint every N epochs"
     )
     rl.add_argument("--device", default="cpu", help="PyTorch device")
+    rl.set_defaults(
+        portfolio=["NL_SHADE_RSP", "MADDE", "JDE21"],
+        n_individuals=170,
+    )
 
     # ---- Exp-DAS ----------------------------------------------------
     exp = sub.add_parser(
diff --git a/das/training/rldas.py b/das/training/rldas.py
index ddac460..00e859f 100644
--- a/das/training/rldas.py
+++ b/das/training/rldas.py
@@ -6,7 +6,6 @@
 import numpy as np
 
 from das.env.bbob_splits import get_cv_folds, get_train_test_split
-from das.optimizers.portfolio import get_portfolio
 from das.training.common import write_jsonl
 
 
@@ -14,10 +13,9 @@ def run_rl_das(args) -> None:
     import cocoex as cx
     from agents.rl_das import RLDASEnv, PPOAgent
     from agents.rl_das import train, evaluate
+    from agents.rl_das.optimizers import get_rldas_portfolio
 
-    optimizers = get_portfolio(args.portfolio)
-    if not optimizers:
-        raise ValueError(f"Unknown optimizers: {args.portfolio}")
+    optimizers = get_rldas_portfolio(args.portfolio)
 
     train_ids, test_ids = get_train_test_split(args.mode, [args.dim])
     print(f"Train: {len(train_ids)} problems  |  Test: {len(test_ids)} problems")
@@ -76,10 +74,9 @@ def run_cv_rl_das(args) -> None:
     import cocoex as cx
     from agents.rl_das import RLDASEnv, PPOAgent
     from agents.rl_das import train, evaluate
+    from agents.rl_das.optimizers import get_rldas_portfolio
 
-    optimizers = get_portfolio(args.portfolio)
-    if not optimizers:
-        raise ValueError(f"Unknown optimizers: {args.portfolio}")
+    optimizers = get_rldas_portfolio(args.portfolio)
 
     suite = cx.Suite("bbob", "", "")
 
diff --git a/run_local.sh b/run_local.sh
index 2415bd9..f1363fb 100755
--- a/run_local.sh
+++ b/run_local.sh
@@ -29,12 +29,12 @@ case "$AGENT" in
             -p "${PORTFOLIO[@]}" -d 2 --cv-mode LOIO --n-epochs 1 --seed $SEED  --fe-multiplier 10  --n-checkpoints 3
         ;;
     rl-das)
-        python train.py rl-das ${PORTFOLIO_STR}_RLDAS_LOCAL_SEED${SEED} \
-            -p "${PORTFOLIO[@]}" --dim 2 --n-epochs 1 --seed $SEED --fe-multiplier 10  --n-checkpoints 3
+        python train.py rl-das NL_SHADE_RSP_MADDE_JDE21_RLDAS_LOCAL_SEED${SEED} \
+            --dim 2 --n-epochs 1 --seed $SEED --fe-multiplier 10 --n-checkpoints 3
         ;;
     rl-das-cv)
-        python cv.py rl-das ${PORTFOLIO_STR}_RLDAS_CV_LOCAL_SEED${SEED} \
-            -p "${PORTFOLIO[@]}" --dim 2 --cv-mode LOIO --n-epochs 1 --seed $SEED  --fe-multiplier 10  --n-checkpoints 3
+        python cv.py rl-das NL_SHADE_RSP_MADDE_JDE21_RLDAS_CV_LOCAL_SEED${SEED} \
+            --dim 2 --cv-mode LOIO --n-epochs 1 --seed $SEED --fe-multiplier 10 --n-checkpoints 3
         ;;
     exp-das)
         python train.py exp-das ${PORTFOLIO_STR}_EXPDAS_LOCAL_SEED${SEED} \
diff --git a/train.py b/train.py
index ebd98b5..5024ec5 100644
--- a/train.py
+++ b/train.py
@@ -153,7 +153,11 @@ def _parse_args() -> argparse.Namespace:
     rl.add_argument(
         "--no-eval", dest="eval", action="store_false", help="Skip final evaluation"
     )
-    rl.set_defaults(eval=True)
+    rl.set_defaults(
+        eval=True,
+        portfolio=["NL_SHADE_RSP", "MADDE", "JDE21"],
+        n_individuals=170,
+    )
 
     # ---- Exp-DAS ----------------------------------------------------
     exp = sub.add_parser(

From b6c88cb56e6b759652eee3d925bc85082f0ad54c Mon Sep 17 00:00:00 2001
From: wlnc <wladyslaw.niec@unit8.co>
Date: Wed, 20 May 2026 12:36:42 +0200
Subject: [PATCH 4/8] add README

---
 README.md | 362 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 362 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4e5cac7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,362 @@
+# DynamicAlgorithmSelection2
+
+RL-based Dynamic Algorithm Selection (DAS) on the [BBOB benchmark](https://numbbo.github.io/coco/). A controller learns to switch between a portfolio of black-box optimizers at runtime — allocating function-evaluation budget to whichever optimizer is most promising at each checkpoint.
+
+---
+
+## Agents
+
+Three agent families share the same BBOB problem set and evaluation protocol:
+
+| Agent | Description | Key reference |
+|---|---|---|
+| **PPO** | Stable Baselines 3 PPO with VecNormalize; multi-dimensional; ELA-based observations | — |
+| **RL-DAS** | Custom single-dimension PyTorch PPO; DE-only portfolio; population-state features with local sampling | Guo et al., 2024 |
+| **Exp-DAS** | Custom PyTorch PPO with exponential checkpoint spacing; flexible portfolio | — |
+
+### PPO
+Uses `DASEnv` — a Gymnasium environment that wraps a warm-started portfolio of arbitrary optimizers. Observations are 22-dimensional ELA landscape features plus per-optimizer movement history. Trains across multiple dimensions simultaneously.
+
+### RL-DAS
+Faithful port of [Guo et al. 2024](https://doi.org/10.1145/3638529.3654223) with BBOB adaptations:
+- Fixed DE portfolio: **NL_SHADE_RSP**, **MadDE**, **JDE21** (all share a single `Population` object as mutable warm-started state).
+- 9-dimensional population-state features computed via local sampling (2 independent forward passes on a population deepcopy).
+- Movement embedder networks compress D-dim displacement vectors to scalars; the backbone is dimension-specific (one model per `--dim`).
+- Hand-rolled PPO training loop (no SB3 dependency for this agent).
+
+### Exp-DAS
+Evolution of the original DAS `policy-gradient` agent. Uses `DASEnv` (same as PPO) but replaces uniform checkpoint spacing with an **exponential schedule** controlled by the Checkpoint Division Base (`--cdb`):
+
+- **`cdb = 1.0` (uniform):** every checkpoint covers the same number of function evaluations — consistent monitoring throughout the run.
+- **`cdb > 1.0` (exponential):** early checkpoints are short (frequent switching during initial exploration) and later checkpoints are long (uninterrupted convergence during exploitation).
+
+The agent uses separate actor and critic learning rates and a configurable number of PPO gradient epochs per update. Like PPO, it supports multiple dimensions simultaneously and an arbitrary optimizer portfolio.
+
+---
+
+## Installation
+
+Requires Python 3.11. Dependency management via [uv](https://docs.astral.sh/uv/).
+
+```bash
+uv sync
+```
+
+---
+
+## Quick start
+
+`run_local.sh` runs a single agent with tiny settings (fast smoke test):
+
+```bash
+bash run_local.sh [seed] [agent] [portfolio...]
+
+# Examples
+bash run_local.sh 42 ppo          CPSO NM TDE
+bash run_local.sh 42 ppo-cv       CPSO NM TDE
+bash run_local.sh 42 rl-das               # DE portfolio fixed; no -p needed
+bash run_local.sh 42 rl-das-cv
+bash run_local.sh 42 exp-das      CPSO NM TDE
+bash run_local.sh 42 exp-das-cv   CPSO NM TDE
+bash run_local.sh 42 baselines    CPSO NM TDE
+```
+
+Run the full smoke-test suite (all agent types):
+
+```bash
+bash smoke_test.sh
+# or selectively
+bash smoke_test.sh rl-das rl-das-cv
+```
+
+---
+
+## Training
+
+```bash
+python train.py {ppo,rl-das,exp-das} <name> [options]
+```
+
+### PPO
+
+```bash
+python train.py ppo MY_PPO \
+    -p CPSO NM TDE \
+    -d 2 5 10 \
+    -E 20 \
+    --fe-multiplier 10000 \
+    --n-checkpoints 10 \
+    --seed 42
+```
+
+Key options:
+
+| Flag | Default | Description |
+|---|---|---|
+| `-p / --portfolio` | `SPSO IPSO SPSOL` | Sub-optimizer names |
+| `-d / --dims` | all | Problem dimensions |
+| `-E / --n-epochs` | 20 | Passes over the training set |
+| `--fe-multiplier` | 10 000 | Budget = multiplier × dimension |
+| `--n-checkpoints` | 10 | Optimizer-selection steps per episode |
+| `-x / --cdb` | 1.0 | Checkpoint division base (1 = uniform) |
+| `-O / --reward-option` | 1 | Reward shaping (1–4) |
+| `--wandb` | off | Log to Weights & Biases |
+
+Outputs: `models/<name>.zip`, `models/<name>_vecnorm.pkl`
+
+### RL-DAS
+
+```bash
+python train.py rl-das MY_RLDAS \
+    --dim 10 \
+    --n-epochs 20 \
+    --fe-multiplier 10000 \
+    --seed 42
+```
+
+The portfolio is fixed to `NL_SHADE_RSP MADDE JDE21` and `--n-individuals` defaults to 170 (matching the original paper). Use `--portfolio` to override.
+
+Key options:
+
+| Flag | Default | Description |
+|---|---|---|
+| `--dim` | 10 | Problem dimension (one model per dim) |
+| `--n-epochs` | 20 | Training epochs |
+| `--lr` | 1e-5 | Learning rate |
+| `--k-epoch` | `0.3 × n_checkpoints` | PPO gradient steps per episode |
+| `--device` | cpu | PyTorch device |
+
+Outputs: `models/<name>_final.pt`, `models/<name>_epoch<N>.pt`, `models/<name>_train_log.jsonl`
+
+### Exp-DAS
+
+```bash
+python train.py exp-das MY_EXPDAS \
+    -p CPSO NM TDE \
+    --dims 2 5 10 \
+    -E 3 \
+    --cdb 2.0 \
+    --reward-option 1 \
+    --seed 42
+```
+
+Key options:
+
+| Flag | Default | Description |
+|---|---|---|
+| `--dims` | `2 5 10` | Problem dimensions |
+| `--cdb` | 2.0 | Checkpoint Division Base (see below) |
+| `-E / --n-epochs` | 3 | Passes over the training set |
+| `--actor-lr` | 3e-5 | Actor learning rate |
+| `--critic-lr` | 1e-5 | Critic learning rate |
+| `--ppo-epochs` | 6 | PPO gradient epochs per update |
+| `--buffer-capacity` | `16 × n_checkpoints` | PPO rollout buffer size in steps |
+| `-O / --reward-option` | 1 | Reward shaping strategy (1–4, see below) |
+| `--save-interval` | 500 | Save a checkpoint every N episodes |
+| `--device` | cpu | PyTorch device |
+
+Outputs: `models/<name>_best.pt`, `models/<name>_final.pt`, `models/<name>_ep<N>.pt`, `models/<name>_train_log.jsonl`
+
+---
+
+## Checkpoint Division Base (CDB)
+
+The `--cdb` argument controls how the total FE budget is distributed across the `n_checkpoints` decision points in each episode.
+
+With `cdb = 1.0` every checkpoint covers the same number of FEs (uniform). With `cdb > 1.0` checkpoint durations grow exponentially: the first checkpoints are short (fast switching during early exploration) and the last are long (uninterrupted convergence during exploitation).
+
+```
+cdb = 1.0  →  [───][───][───][───][───]   uniform
+cdb = 2.0  →  [─][──][────][────────]    exponential
+```
+
+**When to use each value:**
+
+| Value | Effect | Use case |
+|---|---|---|
+| `1.0` | Equal-length checkpoints | Consistent monitoring; PPO default |
+| `2.0` | Moderate exponential growth | Exp-DAS default; balances exploration and exploitation |
+| `> 2.0` | Aggressive early switching | Portfolios where early optimizer choice is decisive |
+
+The `--cdb` flag is available for all three agents (`ppo`, `rl-das` ignores it, `exp-das`).
+
+---
+
+## Reward options
+
+The `-O / --reward-option` flag selects the reward signal used at each checkpoint. All options measure improvement in the best objective value found so far and scale it by the initial value range.
+
+| Option | Name | Description |
+|---|---|---|
+| `1` | Log-scaled improvement | `improvement` between consecutive checkpoints, clipped to `[0, 1]`, then `log(r + 1e-5)`. Smooths large variance. **Default.** |
+| `2` | Linear clipped improvement | Same as option 1 but without the log transform: `clip(improvement, 0, 1)`. |
+| `3` | Sparse total improvement | Returns `0` at every intermediate checkpoint; at the final checkpoint returns the log-scaled total improvement from episode start. Focuses the agent on end-of-run quality. |
+| `4` | Binary threshold | Returns `1` if scaled improvement ≥ `1e-3`, else `0`. Simple binary feedback. |
+
+---
+
+## Cross-validation
+
+```bash
+python cv.py {ppo,rl-das,exp-das} <name> [options]
+```
+
+Two CV modes:
+
+- **LOIO** (Leave-One-Instance-Out): hold out a subset of BBOB instances per fold.
+- **LOPO** (Leave-One-Problem-Out): hold out a subset of BBOB functions per fold.
+
+```bash
+# PPO – 3-fold LOIO
+python cv.py ppo MY_PPO_CV \
+    -p CPSO NM TDE -d 5 10 \
+    --cv-mode LOIO --n-folds 3 --n-epochs 10 --seed 42
+
+# RL-DAS – 3-fold LOPO, dim 10 only
+python cv.py rl-das MY_RLDAS_CV \
+    --dim 10 --cv-mode LOPO --n-folds 3 --n-epochs 20 --seed 42
+
+# Run only folds 0 and 2
+python cv.py exp-das MY_EXPDAS_CV \
+    -p CPSO NM TDE --dims 5 10 \
+    --cv-mode LOIO --folds 0 2 --n-epochs 3 --seed 42
+```
+
+Outputs per fold: `results/<name>_cv_<fold_tag>.jsonl`
+Aggregated: `results/<name>_cv_summary.jsonl`
+
+---
+
+## Baselines
+
+```bash
+python baselines.py <name> --agent <agent_type> [options]
+```
+
+Agent types:
+
+| Type | Description |
+|---|---|
+| `random` | Uniform random selection at each checkpoint |
+| `fixed:<name>` | Always pick one optimizer, e.g. `fixed:CPSO` |
+| `single:<name>` | One optimizer runs the full budget (no checkpointing) |
+| `all` | All of the above; derives oracle-best / oracle-worst |
+
+```bash
+python baselines.py MY_BASELINES --agent all \
+    -p CPSO NM TDE -d 2 5 10 --seed 42
+```
+
+---
+
+## Evaluation
+
+Load a trained PPO model and evaluate it on the BBOB test set:
+
+```bash
+python evaluate.py MY_PPO \
+    -p CPSO NM TDE -d 5 10 --seed 42
+```
+
+Add `--coco-observer` to write COCO-compatible data for `cocopp` post-processing.
+
+---
+
+## Problem set
+
+The BBOB benchmark provides 24 functions × 15 instances × 6 dimensions = 2 160 problems per dimension.
+
+**Dimensions:** `2, 3, 5, 10, 20, 40`
+
+**Default train/test split** (`--mode easy`): trains on 14 structurally simpler functions and tests on the remaining 10 harder functions.
+
+| Mode | Train | Test |
+|---|---|---|
+| `easy` | functions {4,6–14,18–20,22–24} | remaining 10 functions |
+| `hard` | inverse of easy | — |
+| `random` | 2/3 of all problems | 1/3 |
+
+---
+
+## Optimizer portfolio
+
+Available sub-optimizers (pass names via `-p / --portfolio`):
+
+| Family | Names |
+|---|---|
+| PSO | `SPSO`, `IPSO`, `SPSOL`, `CPSO` |
+| DE | `NL_SHADE_RSP`, `MADDE`, `JDE21`, `TDE` |
+| ES | `NM` (Nelder-Mead) |
+| BO | `BO` |
+| DS | `DS` (Direct Search) |
+
+RL-DAS always uses the DE trio `NL_SHADE_RSP / MADDE / JDE21` — overridable with `--portfolio`.
+
+---
+
+## HPC / SLURM
+
+Submit all agents for a given seed and portfolio:
+
+```bash
+bash runner.sh
+```
+
+Individual SLURM scripts:
+
+| Script | Agent |
+|---|---|
+| `ppo_study.slurm` | PPO |
+| `rl_das_study.slurm` | RL-DAS |
+| `exp_das_study.slurm` | Exp-DAS |
+| `baselines.slurm` | Baselines |
+
+---
+
+## Project structure
+
+```
+DynamicAlgorithmSelection2/
+├── train.py              # Unified training entry point
+├── cv.py                 # Cross-validation entry point
+├── baselines.py          # Baseline agents
+├── evaluate.py           # Model evaluation
+├── run_local.sh          # Local smoke-test runner
+├── smoke_test.sh         # Full smoke-test suite
+├── runner.sh             # SLURM batch submission
+│
+├── agents/
+│   ├── rl_das/           # RL-DAS (Guo et al. 2024 port)
+│   │   ├── env.py        # RLDASEnv: Population-based Gymnasium env
+│   │   ├── optimizers.py # NL_SHADE_RSP, JDE21, MadDE (BBOB-adapted)
+│   │   ├── population.py # Shared mutable Population state (NLPSR)
+│   │   ├── agent.py      # PPOAgent (actor-critic)
+│   │   ├── network.py    # Movement embedder + backbone
+│   │   └── trainer.py    # train() / evaluate() loops
+│   └── exponential_das/  # Exp-DAS agent
+│
+├── das/
+│   ├── env/
+│   │   ├── das_env.py    # DASEnv: Gymnasium env for PPO / Exp-DAS
+│   │   ├── bbob_splits.py# BBOB problem IDs, train/test/CV splits
+│   │   ├── observation.py# ELA feature extraction (22-dim)
+│   │   └── reward.py     # Reward shaping options
+│   ├── optimizers/
+│   │   ├── portfolio.py  # get_portfolio() factory
+│   │   └── {PSO,DE,ES,BO,DS}/  # Sub-optimizer implementations
+│   └── training/
+│       ├── ppo.py        # run_ppo() / run_cv_ppo()
+│       ├── rldas.py      # run_rl_das() / run_cv_rl_das()
+│       ├── expdas.py     # run_exp_das() / run_cv_exp_das()
+│       └── common.py     # Shared utilities (JSONL writer, etc.)
+│
+├── tests/                # pytest test suite
+└── pyproject.toml
+```
+
+---
+
+## References
+
+- Guo, Y. et al. (2024). *Deep Reinforcement Learning for Dynamic Algorithm Selection: A Proof-of-Principle Study on Differential Evolution*. GECCO 2024. https://doi.org/10.1145/3638529.3654223
+- Hansen, N. et al. (2021). *COCO: A Platform for Comparing Continuous Optimizers in a Black-Box Setting*. Optimization Methods and Software.
\ No newline at end of file

From 20d931ec1cd75e5418c18738721be0166b29d181 Mon Sep 17 00:00:00 2001
From: wlnc <wladyslaw.niec@unit8.co>
Date: Fri, 22 May 2026 22:22:46 +0200
Subject: [PATCH 5/8] update runners

---
 cv.py              | 21 ++++++++++++---------
 rl_das_study.slurm | 20 ++++++--------------
 runner.sh          |  9 +++++----
 train.py           | 21 ++++++++++++---------
 4 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/cv.py b/cv.py
index 28ae3d1..8b2a6d1 100644
--- a/cv.py
+++ b/cv.py
@@ -28,15 +28,18 @@
 # ------------------------------------------------------------------ #
 
 
-def _add_shared_args(p: argparse.ArgumentParser) -> None:
+def _add_shared_args(
+    p: argparse.ArgumentParser, *, include_portfolio: bool = True
+) -> None:
     p.add_argument("name", help="Experiment name (used for output file names)")
-    p.add_argument(
-        "-p",
-        "--portfolio",
-        nargs="+",
-        default=["SPSO", "IPSO", "SPSOL"],
-        help="Sub-optimizer names from the portfolio",
-    )
+    if include_portfolio:
+        p.add_argument(
+            "-p",
+            "--portfolio",
+            nargs="+",
+            default=["SPSO", "IPSO", "SPSOL"],
+            help="Sub-optimizer names from the portfolio",
+        )
     p.add_argument(
         "--fe-multiplier",
         type=int,
@@ -121,7 +124,7 @@ def _parse_args() -> argparse.Namespace:
         help="Custom RL-DAS: single-dimension, pure-PyTorch PPO",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
-    _add_shared_args(rl)
+    _add_shared_args(rl, include_portfolio=False)
     rl.add_argument(
         "--dim", type=int, default=10, help="Problem dimension (agent is dim-specific)"
     )
diff --git a/rl_das_study.slurm b/rl_das_study.slurm
index 504cf97..60bd113 100644
--- a/rl_das_study.slurm
+++ b/rl_das_study.slurm
@@ -10,36 +10,28 @@
 #SBATCH -A plgrldas2026-gpu-a100
 #SBATCH --array=0-7
 
-# Args: SEED [PORTFOLIO...]
+# Args: SEED
 SEED=${1:-42}
 
-if [ "$#" -lt 2 ]; then
-    PORTFOLIO=('SPSO' 'IPSO' 'SPSOL')
-else
-    PORTFOLIO=("${@:2}")
-fi
-
-PORTFOLIO_STR=$(IFS="_"; echo "${PORTFOLIO[*]}")
-
 ENV_PATH="$SCRATCH/DynamicAlgorithmSelection2/.venv/bin/activate"
 source "$ENV_PATH"
 mkdir -p logs
 
 DIMS=(2 3 5 10)
 
-echo "Array job $SLURM_ARRAY_TASK_ID | SEED=$SEED | PORTFOLIO=${PORTFOLIO[*]}"
+echo "Array job $SLURM_ARRAY_TASK_ID | SEED=$SEED"
 
 # 0-3: CV-LOIO per dimension
 if [[ $SLURM_ARRAY_TASK_ID -ge 0 && $SLURM_ARRAY_TASK_ID -le 3 ]]; then
     DIM=${DIMS[$SLURM_ARRAY_TASK_ID]}
     echo "RL-DAS | CV-LOIO | dim=$DIM"
-    python cv.py rl-das ${PORTFOLIO_STR}_RLDAS_LOIO_DIM${DIM}_SEED${SEED} \
-        -p "${PORTFOLIO[@]}" --dim $DIM --cv-mode LOIO --n-epochs 500 --seed $SEED
+    python cv.py rl-das NL_SHADE_RSP_MADDE_JDE21_RLDAS_LOIO_DIM${DIM}_SEED${SEED} \
+        --dim $DIM --cv-mode LOIO --n-epochs 500 --seed $SEED
 
 # 4-7: CV-LOPO per dimension
 elif [[ $SLURM_ARRAY_TASK_ID -ge 4 && $SLURM_ARRAY_TASK_ID -le 7 ]]; then
     DIM=${DIMS[$((SLURM_ARRAY_TASK_ID - 4))]}
     echo "RL-DAS | CV-LOPO | dim=$DIM"
-    python cv.py rl-das ${PORTFOLIO_STR}_RLDAS_LOPO_DIM${DIM}_SEED${SEED} \
-        -p "${PORTFOLIO[@]}" --dim $DIM --cv-mode LOPO --n-epochs 500 --seed $SEED
+    python cv.py rl-das NL_SHADE_RSP_MADDE_JDE21_RLDAS_LOPO_DIM${DIM}_SEED${SEED} \
+        --dim $DIM --cv-mode LOPO --n-epochs 500 --seed $SEED
 fi
\ No newline at end of file
diff --git a/runner.sh b/runner.sh
index 49b27c4..0d05cf1 100755
--- a/runner.sh
+++ b/runner.sh
@@ -9,16 +9,17 @@ PORTFOLIOS=(
 echo "Starting job submissions..."
 
 for SEED in "${SEEDS[@]}"; do
+
+    echo "Submitting RL-DAS study | SEED=$SEED"
+    sbatch rl_das_study.slurm $SEED
+    sleep 1
+
     for PORTFOLIO in "${PORTFOLIOS[@]}"; do
 
         echo "Submitting PPO study | SEED=$SEED | PORTFOLIO=$PORTFOLIO"
         sbatch ppo_study.slurm $SEED $PORTFOLIO
         sleep 1
 
-        echo "Submitting RL-DAS study | SEED=$SEED | PORTFOLIO=$PORTFOLIO"
-        sbatch rl_das_study.slurm $SEED $PORTFOLIO
-        sleep 1
-
         echo "Submitting Exp-DAS study | SEED=$SEED | PORTFOLIO=$PORTFOLIO"
         sbatch exp_das_study.slurm $SEED $PORTFOLIO
         sleep 1
diff --git a/train.py b/train.py
index 5024ec5..73a1530 100644
--- a/train.py
+++ b/train.py
@@ -41,15 +41,18 @@
 # ------------------------------------------------------------------ #
 
 
-def _add_shared_args(p: argparse.ArgumentParser) -> None:
+def _add_shared_args(
+    p: argparse.ArgumentParser, *, include_portfolio: bool = True
+) -> None:
     p.add_argument("name", help="Experiment name (used for output file names)")
-    p.add_argument(
-        "-p",
-        "--portfolio",
-        nargs="+",
-        default=["SPSO", "IPSO", "SPSOL"],
-        help="Sub-optimizer names from the portfolio",
-    )
+    if include_portfolio:
+        p.add_argument(
+            "-p",
+            "--portfolio",
+            nargs="+",
+            default=["SPSO", "IPSO", "SPSOL"],
+            help="Sub-optimizer names from the portfolio",
+        )
     p.add_argument(
         "--mode",
         choices=["easy", "hard", "random"],
@@ -131,7 +134,7 @@ def _parse_args() -> argparse.Namespace:
         help="Custom RL-DAS: single-dimension, pure-PyTorch PPO",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
-    _add_shared_args(rl)
+    _add_shared_args(rl, include_portfolio=False)
     rl.add_argument(
         "--dim", type=int, default=10, help="Problem dimension (agent is dim-specific)"
     )

From 4d916310978deec6a4dd4a5fd51ba96489355cff Mon Sep 17 00:00:00 2001
From: wlnc <wladyslaw.niec@unit8.co>
Date: Sun, 24 May 2026 11:43:26 +0200
Subject: [PATCH 6/8] update results logging

---
 .github/workflows/ci.yml                 |   3 +
 agents/exponential_das/trainer.py        |  25 +-
 agents/rl_das/optimizers.py              | 669 -----------------------
 agents/rl_das/optimizers/__init__.py     |  40 ++
 agents/rl_das/optimizers/jde21.py        | 275 ++++++++++
 agents/rl_das/optimizers/madde.py        | 209 +++++++
 agents/rl_das/optimizers/nl_shade_rsp.py | 166 ++++++
 agents/rl_das/trainer.py                 |   2 +-
 baselines.py                             | 141 ++---
 cv.py                                    |   7 +
 das/env/__init__.py                      |   2 +
 das/env/das_env.py                       |   1 +
 das/optimizers/BO/__init__.py            |   2 +-
 das/optimizers/DS/nm.py                  |   6 +-
 das/optimizers/__init__.py               |   5 +-
 das/training/callbacks.py                |   5 +-
 das/training/common.py                   |  57 ++
 das/training/expdas.py                   | 225 +++++---
 evaluate.py                              |  62 +--
 tests/test_baselines.py                  | 514 +++++++++++++----
 20 files changed, 1428 insertions(+), 988 deletions(-)
 delete mode 100644 agents/rl_das/optimizers.py
 create mode 100644 agents/rl_das/optimizers/__init__.py
 create mode 100644 agents/rl_das/optimizers/jde21.py
 create mode 100644 agents/rl_das/optimizers/madde.py
 create mode 100644 agents/rl_das/optimizers/nl_shade_rsp.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8dd2903..2884b5b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -34,6 +34,9 @@ jobs:
       - name: Ruff lint
         run: uv run ruff format .
 
+      - name: Ruff check
+        run: uv run ruff check
+
       - name: Run pytest
         run: uv run pytest tests/ -v --tb=short
 
diff --git a/agents/exponential_das/trainer.py b/agents/exponential_das/trainer.py
index 432791e..bcba617 100644
--- a/agents/exponential_das/trainer.py
+++ b/agents/exponential_das/trainer.py
@@ -20,6 +20,7 @@
 
 from agents.exponential_das.agent import ExpDASAgent
 from das.env.das_env import DASEnv
+from das.training.common import compute_run_stats
 
 
 def train(
@@ -111,7 +112,9 @@ def train(
 
         if ep % eval_interval == 0:
             test_results = evaluate(test_env, agent, n_episodes=20)
-            mean_test_r = float(np.mean([r["reward"] for r in test_results]))
+            mean_test_r = float(
+                np.mean([next(iter(r.values()))["reward"] for r in test_results])
+            )
             entry["mean_test_reward"] = mean_test_r
             mean_train_r = float(np.mean(episode_rewards[-eval_interval:]))
             print(
@@ -145,24 +148,36 @@ def evaluate(
     env: DASEnv,
     agent: ExpDASAgent,
     n_episodes: int = 20,
+    global_optima: dict[str, float] | None = None,
 ) -> list[dict]:
     """Run the agent deterministically and return per-episode results."""
+    if global_optima is None:
+        global_optima = {}
     results = []
     for _ in range(n_episodes):
         obs, info = env.reset()
+        problem_id = info.get("problem_id", "")
         done = False
         total_reward = 0.0
+        fitness_history: list[tuple[int, float]] = []
+        step_info: dict = {}
+
         while not done:
             action = agent.predict(obs)
             obs, reward, terminated, truncated, step_info = env.step(action)
             done = terminated or truncated
             total_reward += reward
+            fitness_history.extend(step_info.get("fitness_history_step", []))
+
+        max_fe = step_info.get("n_fe", 0)
+        global_minimum = global_optima.get(problem_id, 0.0)
+        stats = compute_run_stats(fitness_history, max_fe, global_minimum)
         results.append(
             {
-                "problem_id": info.get("problem_id", ""),
-                "reward": total_reward,
-                "best_y": step_info.get("best_y", float("inf")),
-                "n_fe": step_info.get("n_fe", 0),
+                problem_id: {
+                    **stats,
+                    "reward": total_reward,
+                }
             }
         )
     return results
diff --git a/agents/rl_das/optimizers.py b/agents/rl_das/optimizers.py
deleted file mode 100644
index 40dae89..0000000
--- a/agents/rl_das/optimizers.py
+++ /dev/null
@@ -1,669 +0,0 @@
-"""DE optimizers for RL-DAS: NL_SHADE_RSP, JDE21, MadDE.
-
-Ported from the original RL-DAS (Guo et al., 2024) with the following
-adaptations for DAS2 / BBOB:
-
-- Batch evaluation: ``np.array([float(problem(xs[i])) for i in range(len(xs))])``
-  instead of ``problem.func(xs) - problem.optimum``.
-- Bounds from ``population.Xmin`` / ``population.Xmax`` (not hardcoded ±100).
-- No early-termination at ``cost < 1e-8`` (FE budget controls episode end).
-- ``__init__`` takes no ``dim`` argument (dimension inferred from population).
-"""
-
-from __future__ import annotations
-
-import numpy as np
-import scipy.stats as stats
-
-
-def _eval(problem, xs: np.ndarray) -> np.ndarray:
-    return np.array([float(problem(xs[i])) for i in range(len(xs))])
-
-
-class NL_SHADE_RSP:
-    def __init__(self) -> None:
-        self.pb = 0.4
-        self.pa = 0.5
-
-    def _binomial(self, x: np.ndarray, v: np.ndarray, cr: np.ndarray) -> np.ndarray:
-        NP, dim = x.shape
-        jrand = np.random.randint(dim, size=NP)
-        u = np.where(np.random.rand(NP, dim) < cr.repeat(dim).reshape(NP, dim), v, x)
-        u[np.arange(NP), jrand] = v[np.arange(NP), jrand]
-        return u
-
-    def _exponential(self, x: np.ndarray, v: np.ndarray, cr: np.ndarray) -> np.ndarray:
-        NP, dim = x.shape
-        u = x.copy()
-        L = np.random.randint(dim, size=NP).repeat(dim).reshape(NP, dim)
-        L = L <= np.arange(dim)
-        rvs = np.random.rand(NP, dim)
-        L = np.where(rvs > cr.repeat(dim).reshape(NP, dim), L, 0)
-        return u * (1 - L) + v * L
-
-    def _update_pa(self, fa: float, fp: float, na: int, NP: int) -> None:
-        if na == 0 or fa == 0:
-            self.pa = 0.5
-            return
-        self.pa = (fa / (na + 1e-15)) / ((fa / (na + 1e-15)) + (fp / (NP - na + 1e-15)))
-        self.pa = float(np.clip(self.pa, 0.1, 0.9))
-
-    def step(
-        self,
-        population,
-        problem,
-        FEs: int,
-        FEs_end: int,
-        MaxFEs: int,
-        record_period: int = -1,
-    ):
-        if record_period <= 0:
-            record_period = FEs_end - FEs
-        NP, dim = population.NP, population.dim
-        NA = int(NP * 2.1)
-        if NA < population.archive.shape[0]:
-            population.archive = population.archive[:NA]
-        self.pa = 0.5
-        k = 1
-        Fevs = []
-        while FEs >= k * record_period:
-            k += 1
-        population.sort(population.NP)
-
-        while FEs < FEs_end and FEs < MaxFEs:
-            Cr, F = population.choose_F_Cr()
-            Cr = np.sort(Cr)
-            pr = np.exp(-(np.arange(NP) + 1) / NP)
-            pr /= np.sum(pr)
-            cross_exponential = np.random.random() < 0.5
-            pb_upper = int(max(2, NP * self.pb))
-            pbs = np.random.randint(pb_upper, size=NP)
-            count = 0
-            duplicate = np.where(pbs == np.arange(NP))[0]
-            while duplicate.shape[0] > 0 and count < 1:
-                pbs[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
-                duplicate = np.where(pbs == np.arange(NP))[0]
-                count += 1
-            xpb = population.group[pbs]
-            r1 = np.random.randint(NP, size=NP)
-            count = 0
-            duplicate = np.where((r1 == np.arange(NP)) + (r1 == pbs))[0]
-            while duplicate.shape[0] > 0 and count < 25:
-                r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
-                duplicate = np.where((r1 == np.arange(NP)) + (r1 == pbs))[0]
-                count += 1
-            x1 = population.group[r1]
-            rvs = np.random.rand(NP)
-            r2_pop = np.where(rvs >= self.pa)[0]
-            r2_arc = np.where(rvs < self.pa)[0]
-            use_arc = np.zeros(NP, dtype=bool)
-            use_arc[r2_arc] = True
-            if population.archive.shape[0] < 25:
-                r2_pop = np.arange(NP)
-                r2_arc = np.array([], dtype=np.int32)
-            r2 = np.random.choice(np.arange(NP), size=r2_pop.shape[0], p=pr)
-            count = 0
-            duplicate = np.where(
-                (r2 == r2_pop) + (r2 == pbs[r2_pop]) + (r2 == r1[r2_pop])
-            )[0]
-            while duplicate.shape[0] > 0 and count < 25:
-                r2[duplicate] = np.random.choice(
-                    np.arange(NP), size=duplicate.shape[0], p=pr
-                )
-                duplicate = np.where(
-                    (r2 == r2_pop) + (r2 == pbs[r2_pop]) + (r2 == r1[r2_pop])
-                )[0]
-                count += 1
-            x2 = np.zeros((NP, dim))
-            if r2_pop.shape[0] > 0:
-                x2[r2_pop] = population.group[r2]
-            if r2_arc.shape[0] > 0:
-                x2[r2_arc] = population.archive[
-                    np.random.randint(
-                        min(population.archive.shape[0], NA),
-                        size=r2_arc.shape[0],
-                    )
-                ]
-            Fs = F.repeat(dim).reshape(NP, dim)
-            vs = population.group + Fs * (xpb - population.group) + Fs * (x1 - x2)
-            Crb = np.zeros(NP)
-            tmp_id = np.where(np.arange(NP) + FEs < 0.5 * MaxFEs)[0]
-            Crb[tmp_id] = 2 * ((FEs + tmp_id) / MaxFEs - 0.5)
-            if cross_exponential:
-                us = self._binomial(population.group, vs, Crb)
-            else:
-                us = self._exponential(population.group, vs, Cr)
-
-            # Reinitialise out-of-bounds elements with uniform random in [lb, ub]
-            oob = (us < population.Xmin) | (us > population.Xmax)
-            if np.any(oob):
-                rand_pts = (
-                    np.random.rand(NP, dim) * (population.Xmax - population.Xmin)
-                    + population.Xmin
-                )
-                us = np.where(oob, rand_pts, us)
-
-            cost = _eval(problem, us)
-            optim = np.where(cost < population.cost)[0]
-            for i in optim:
-                population.update_archive(i)
-            population.F[optim] = F[optim]
-            population.Cr[optim] = Cr[optim]
-            SF = F[optim]
-            SCr = Cr[optim]
-            df = (population.cost[optim] - cost[optim]) / (
-                population.cost[optim] + 1e-9
-            )
-            arc_usage = use_arc[optim]
-            fp = float(np.sum(df[arc_usage]))
-            fa = float(np.sum(df[~arc_usage]))
-            na = int(np.sum(arc_usage))
-            population.group[optim] = us[optim]
-            population.cost[optim] = cost[optim]
-            if float(np.min(cost)) < population.gbest:
-                population.gbest = float(np.min(cost))
-                population.gbest_solution = us[np.argmin(cost)].copy()
-            FEs += NP
-            self.pb = 0.4 - 0.2 * (FEs / MaxFEs)
-            population.NLPSR(FEs, MaxFEs)
-            population.update_M_F_Cr(SF, SCr, df)
-            self._update_pa(fa, fp, na, NP)
-            NP = population.NP
-            NA = population.NA
-            if FEs >= k * record_period:
-                Fevs.append(population.gbest)
-                k += 1
-
-        return population, Fevs, min(FEs, MaxFEs)
-
-
-class JDE21:
-    def __init__(self) -> None:
-        self.sNP = 10
-        self.tao1 = 0.1
-        self.tao2 = 0.1
-        self.Finit = 0.5
-        self.CRinit = 0.9
-        self.Fl_b = 0.1
-        self.Fl_s = 0.17
-        self.Fu = 1.1
-        self.CRl_b = 0.0
-        self.CRl_s = 0.1
-        self.CRu_b = 1.1
-        self.CRu_s = 0.8
-        self.eps = 1e-12
-        self.MyEps = 0.25
-        self.nReset = 0
-        self.sReset = 0
-        self.cCopy = 0
-
-    def _prevec_enakih(self, cost: np.ndarray, best: float) -> bool:
-        eqs = len(cost[np.fabs(cost - best) < self.eps])
-        return eqs > 2 and eqs > len(cost) * self.MyEps
-
-    def _crowding(self, group: np.ndarray, vs: np.ndarray) -> np.ndarray:
-        NP, dim = vs.shape
-        dist = np.sum(
-            ((group * np.ones((NP, NP, dim))).transpose(1, 0, 2) - vs) ** 2, -1
-        ).transpose()
-        return np.argmin(dist, -1)
-
-    def step(
-        self,
-        population,
-        problem,
-        FEs: int,
-        FEs_end: int,
-        MaxFEs: int,
-        record_period: int = -1,
-    ):
-        if record_period <= 0:
-            record_period = FEs_end - FEs
-        NP = population.NP
-        dim = population.dim
-        sNP = self.sNP
-        bNP = max(NP - sNP, 2)
-        age = 0
-        k = 1
-        Fevs = []
-
-        def mutate_cross_select(r1, r2, r3, SF, SCr, df, age, big):
-            nonlocal bNP
-            if big:
-                xNP = bNP
-                randF = np.random.rand(xNP) * self.Fu + self.Fl_b
-                randCr = np.random.rand(xNP) * self.CRu_b + self.CRl_b
-                pF = population.F[:xNP]
-                pCr = population.Cr[:xNP]
-            else:
-                # Use actual slice size (may be < sNP when population shrank)
-                xNP = population.F[bNP:].shape[0]
-                if xNP == 0:
-                    return SF, SCr, df, age
-                randF = np.random.rand(xNP) * self.Fu + self.Fl_s
-                randCr = np.random.rand(xNP) * self.CRu_b + self.CRl_s
-                pF = population.F[bNP:]
-                pCr = population.Cr[bNP:]
-
-            rvs = np.random.rand(xNP)
-            F = np.where(rvs < self.tao1, randF, pF)
-            rvs = np.random.rand(xNP)
-            Cr = np.where(rvs < self.tao2, randCr, pCr)
-            Fs = F.repeat(dim).reshape(xNP, dim)
-            Crs = Cr.repeat(dim).reshape(xNP, dim)
-            v = population.group[r1] + Fs * (
-                population.group[r2] - population.group[r3]
-            )
-            v = np.clip(v, population.Xmin, population.Xmax)
-            jrand = np.random.randint(dim, size=xNP)
-            u = np.where(
-                np.random.rand(xNP, dim) < Crs,
-                v,
-                (population.group[:bNP] if big else population.group[bNP:]),
-            )
-            u[np.arange(xNP), jrand] = v[np.arange(xNP), jrand]
-            cost = _eval(problem, u)
-            crowding_ids = (
-                self._crowding(population.group[:xNP], u)
-                if big
-                else np.arange(xNP) + bNP
-            )
-            age += xNP
-            for i in range(xNP):
-                idx = crowding_ids[i]
-                if cost[i] < population.cost[idx]:
-                    population.update_archive(idx)
-                    population.group[idx] = u[i]
-                    population.cost[idx] = cost[i]
-                    population.F[idx] = F[i]
-                    population.Cr[idx] = Cr[i]
-                    SF = np.append(SF, F[i])
-                    SCr = np.append(SCr, Cr[i])
-                    d = (population.cost[i] - cost[i]) / (population.cost[i] + 1e-9)
-                    df = np.append(df, d)
-                    if cost[i] < population.cbest:
-                        age = 0
-                        population.cbest_id = idx
-                        population.cbest = cost[i]
-                        if cost[i] < population.gbest:
-                            population.gbest = cost[i]
-                            population.gbest_solution = u[i].copy()
-
-            return SF, SCr, df, age
-
-        population.sort(NP, True)
-        while FEs >= k * record_period:
-            k += 1
-        while FEs < FEs_end:
-            df = np.array([])
-            SF = np.array([])
-            SCr = np.array([])
-            if (
-                self._prevec_enakih(population.cost[:bNP], population.gbest)
-                or age > MaxFEs / 10
-            ):
-                self.nReset += 1
-                population.group[:bNP] = population.initialize_group(bNP)
-                population.F[:bNP] = self.Finit
-                population.Cr[:bNP] = self.CRinit
-                population.cost[:bNP] = 1e15
-                age = 0
-                population.cbest = float(np.min(population.cost))
-                population.cbest_id = int(np.argmin(population.cost))
-
-            if FEs < MaxFEs / 3:
-                mig = 1
-            elif FEs < 2 * MaxFEs / 3:
-                mig = 2
-            else:
-                mig = 3
-
-            r1 = np.random.randint(bNP, size=bNP)
-            count = 0
-            duplicate = np.where((r1 == np.arange(bNP)) * (r1 == population.cbest_id))[
-                0
-            ]
-            while duplicate.shape[0] > 0 and count < 25:
-                r1[duplicate] = np.random.randint(bNP, size=duplicate.shape[0])
-                duplicate = np.where(
-                    (r1 == np.arange(bNP)) * (r1 == population.cbest_id)
-                )[0]
-                count += 1
-            r2 = np.random.randint(bNP + mig, size=bNP)
-            count = 0
-            duplicate = np.where((r2 == np.arange(bNP)) + (r2 == r1))[0]
-            while duplicate.shape[0] > 0 and count < 25:
-                r2[duplicate] = np.random.randint(bNP + mig, size=duplicate.shape[0])
-                duplicate = np.where((r2 == np.arange(bNP)) + (r2 == r1))[0]
-                count += 1
-            r3 = np.random.randint(bNP + mig, size=bNP)
-            count = 0
-            duplicate = np.where((r3 == np.arange(bNP)) + (r3 == r1) + (r3 == r2))[0]
-            while duplicate.shape[0] > 0 and count < 25:
-                r3[duplicate] = np.random.randint(bNP + mig, size=duplicate.shape[0])
-                duplicate = np.where((r3 == np.arange(bNP)) + (r3 == r1) + (r3 == r2))[
-                    0
-                ]
-                count += 1
-            SF, SCr, df, age = mutate_cross_select(
-                r1, r2, r3, SF, SCr, df, age, big=True
-            )
-            FEs += bNP
-            if FEs >= k * record_period:
-                Fevs.append(population.gbest)
-                k += 1
-            if FEs >= FEs_end or FEs >= MaxFEs:
-                break
-
-            curr_sNP = NP - bNP
-            if (
-                curr_sNP > 0
-                and population.cbest_id >= bNP
-                and self._prevec_enakih(population.cost[bNP:], population.cbest)
-            ):
-                self.sReset += 1
-                cbest = population.cbest
-                cbest_id = population.cbest_id
-                tmp = population.group[cbest_id].copy()
-                population.group[bNP:] = population.initialize_group(curr_sNP)
-                population.F[bNP:] = self.Finit
-                population.Cr[bNP:] = self.CRinit
-                population.cost[bNP:] = 1e15
-                population.cbest = cbest
-                population.cbest_id = cbest_id
-                population.group[cbest_id] = tmp
-                population.cost[cbest_id] = cbest
-
-            if population.cbest_id < bNP:
-                self.cCopy += 1
-                population.cost[bNP] = population.cbest
-                population.group[bNP] = population.group[population.cbest_id].copy()
-                population.cbest_id = bNP
-
-            curr_sNP = NP - bNP  # actual small-pop size (= sNP unless shrunken)
-            for _ in range(bNP // curr_sNP if curr_sNP > 0 else 0):
-                r1 = np.random.randint(curr_sNP, size=curr_sNP) + bNP
-                count = 0
-                duplicate = np.where(r1 == (np.arange(curr_sNP) + bNP))[0]
-                while duplicate.shape[0] > 0 and count < 25:
-                    r1[duplicate] = (
-                        np.random.randint(curr_sNP, size=duplicate.shape[0]) + bNP
-                    )
-                    duplicate = np.where(r1 == (np.arange(curr_sNP) + bNP))[0]
-                    count += 1
-                r2 = np.random.randint(curr_sNP, size=curr_sNP) + bNP
-                count = 0
-                duplicate = np.where((r2 == (np.arange(curr_sNP) + bNP)) + (r2 == r1))[
-                    0
-                ]
-                while duplicate.shape[0] > 0 and count < 25:
-                    r2[duplicate] = (
-                        np.random.randint(curr_sNP, size=duplicate.shape[0]) + bNP
-                    )
-                    duplicate = np.where(
-                        (r2 == (np.arange(curr_sNP) + bNP)) + (r2 == r1)
-                    )[0]
-                    count += 1
-                r3 = np.random.randint(curr_sNP, size=curr_sNP) + bNP
-                count = 0
-                duplicate = np.where(
-                    (r3 == (np.arange(curr_sNP) + bNP)) + (r3 == r1) + (r3 == r2)
-                )[0]
-                while duplicate.shape[0] > 0 and count < 25:
-                    r3[duplicate] = (
-                        np.random.randint(curr_sNP, size=duplicate.shape[0]) + bNP
-                    )
-                    duplicate = np.where(
-                        (r3 == (np.arange(curr_sNP) + bNP)) + (r3 == r1) + (r3 == r2)
-                    )[0]
-                    count += 1
-                SF, SCr, df, age = mutate_cross_select(
-                    r1, r2, r3, SF, SCr, df, age, big=False
-                )
-                FEs += curr_sNP
-                if FEs >= k * record_period:
-                    Fevs.append(population.gbest)
-                    k += 1
-                if FEs >= FEs_end or FEs >= MaxFEs:
-                    break
-
-            population.update_M_F_Cr(SF, SCr, df)
-            NP = min(
-                int(population.cal_NP_next_gen(FEs, MaxFEs)), len(population.group)
-            )
-            population.NP = NP
-            population.group = population.group[-NP:]
-            population.cost = population.cost[-NP:]
-            population.F = population.F[-NP:]
-            population.Cr = population.Cr[-NP:]
-            population.cbest_id = int(np.argmin(population.cost))
-            population.cbest = float(np.min(population.cost))
-            bNP = max(NP - sNP, 2)
-
-        return population, Fevs, min(FEs, MaxFEs)
-
-
-class MadDE:
-    def __init__(self) -> None:
-        self.p = 0.18
-        self.PqBX = 0.01
-        self.pm = np.ones(3) / 3
-
-    def _ctb_w_arc(
-        self,
-        group: np.ndarray,
-        best: np.ndarray,
-        archive: np.ndarray,
-        Fs: np.ndarray,
-    ) -> np.ndarray:
-        NP, dim = group.shape
-        NB = best.shape[0]
-        NA = archive.shape[0]
-        count = 0
-        rb = np.random.randint(NB, size=NP)
-        duplicate = np.where(rb == np.arange(NP))[0]
-        while duplicate.shape[0] > 0 and count < 25:
-            rb[duplicate] = np.random.randint(NB, size=duplicate.shape[0])
-            duplicate = np.where(rb == np.arange(NP))[0]
-            count += 1
-        count = 0
-        r1 = np.random.randint(NP, size=NP)
-        duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
-        while duplicate.shape[0] > 0 and count < 25:
-            r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
-            duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
-            count += 1
-        count = 0
-        r2 = np.random.randint(NP + NA, size=NP)
-        duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
-        while duplicate.shape[0] > 0 and count < 25:
-            r2[duplicate] = np.random.randint(NP + NA, size=duplicate.shape[0])
-            duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
-            count += 1
-        xb = best[rb]
-        x1 = group[r1]
-        x2 = np.concatenate((group, archive), 0)[r2] if NA > 0 else group[r2]
-        return group + Fs * (xb - group) + Fs * (x1 - x2)
-
-    def _ctr_w_arc(
-        self,
-        group: np.ndarray,
-        archive: np.ndarray,
-        Fs: np.ndarray,
-    ) -> np.ndarray:
-        NP, dim = group.shape
-        NA = archive.shape[0]
-        count = 0
-        r1 = np.random.randint(NP, size=NP)
-        duplicate = np.where(r1 == np.arange(NP))[0]
-        while duplicate.shape[0] > 0 and count < 25:
-            r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
-            duplicate = np.where(r1 == np.arange(NP))[0]
-            count += 1
-        count = 0
-        r2 = np.random.randint(NP + NA, size=NP)
-        duplicate = np.where((r2 == np.arange(NP)) + (r2 == r1))[0]
-        while duplicate.shape[0] > 0 and count < 25:
-            r2[duplicate] = np.random.randint(NP + NA, size=duplicate.shape[0])
-            duplicate = np.where((r2 == np.arange(NP)) + (r2 == r1))[0]
-            count += 1
-        x1 = group[r1]
-        x2 = np.concatenate((group, archive), 0)[r2] if NA > 0 else group[r2]
-        return group + Fs * (x1 - x2)
-
-    def _weighted_rtb(
-        self,
-        group: np.ndarray,
-        best: np.ndarray,
-        Fs: np.ndarray,
-        Fas: float,
-    ) -> np.ndarray:
-        NP, dim = group.shape
-        NB = best.shape[0]
-        count = 0
-        rb = np.random.randint(NB, size=NP)
-        duplicate = np.where(rb == np.arange(NP))[0]
-        while duplicate.shape[0] > 0 and count < 25:
-            rb[duplicate] = np.random.randint(NB, size=duplicate.shape[0])
-            duplicate = np.where(rb == np.arange(NP))[0]
-            count += 1
-        count = 0
-        r1 = np.random.randint(NP, size=NP)
-        duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
-        while duplicate.shape[0] > 0 and count < 25:
-            r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
-            duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
-            count += 1
-        count = 0
-        r2 = np.random.randint(NP, size=NP)
-        duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
-        while duplicate.shape[0] > 0 and count < 25:
-            r2[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
-            duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
-            count += 1
-        return Fs * group[r1] + Fs * Fas * (best[rb] - group[r2])
-
-    @staticmethod
-    def _binomial(x: np.ndarray, v: np.ndarray, Crs: np.ndarray) -> np.ndarray:
-        NP, dim = x.shape
-        jrand = np.random.randint(dim, size=NP)
-        u = np.where(np.random.rand(NP, dim) < Crs, v, x)
-        u[np.arange(NP), jrand] = v[np.arange(NP), jrand]
-        return u
-
-    def step(
-        self,
-        population,
-        problem,
-        FEs: int,
-        FEs_end: int,
-        MaxFEs: int,
-        record_period: int = -1,
-    ):
-        if record_period <= 0:
-            record_period = FEs_end - FEs
-        Fevs = []
-        k = 1
-        while FEs >= k * record_period:
-            k += 1
-        population.sort(population.NP)
-        while FEs < FEs_end and FEs < MaxFEs:
-            NP, dim = population.NP, population.dim
-            q = 2 * self.p - self.p * FEs / MaxFEs
-            Fa = 0.5 + 0.5 * FEs / MaxFEs
-            Cr, F = population.choose_F_Cr()
-            mu = np.random.choice(3, size=NP, p=self.pm)
-            p1 = population.group[mu == 0]
-            p2 = population.group[mu == 1]
-            p3 = population.group[mu == 2]
-            pbest = population.group[: max(int(self.p * NP), 2)]
-            qbest = population.group[: max(int(q * NP), 2)]
-            Fs = F.repeat(dim).reshape(NP, dim)
-            v1 = self._ctb_w_arc(p1, pbest, population.archive, Fs[mu == 0])
-            v2 = self._ctr_w_arc(p2, population.archive, Fs[mu == 1])
-            v3 = self._weighted_rtb(p3, qbest, Fs[mu == 2], Fa)
-            v = np.zeros((NP, dim))
-            v[mu == 0] = v1
-            v[mu == 1] = v2
-            v[mu == 2] = v3
-
-            # Bounds handling: midpoint reflection with lb/ub
-            Xmin = population.Xmin
-            Xmax = population.Xmax
-            v = np.where(v < Xmin, (population.group + Xmin) / 2, v)
-            v = np.where(v > Xmax, (population.group + Xmax) / 2, v)
-
-            rvs = np.random.rand(NP)
-            Crs = Cr.repeat(dim).reshape(NP, dim)
-            u = np.zeros((NP, dim))
-            if np.sum(rvs <= self.PqBX) > 0:
-                qu = v[rvs <= self.PqBX]
-                if population.archive.shape[0] > 0:
-                    qbest = np.concatenate((population.group, population.archive), 0)[
-                        : max(int(q * (NP + population.archive.shape[0])), 2)
-                    ]
-                cross_qbest = qbest[np.random.randint(qbest.shape[0], size=qu.shape[0])]
-                qu = self._binomial(cross_qbest, qu, Crs[rvs <= self.PqBX])
-                u[rvs <= self.PqBX] = qu
-            bu = v[rvs > self.PqBX]
-            bu = self._binomial(
-                population.group[rvs > self.PqBX], bu, Crs[rvs > self.PqBX]
-            )
-            u[rvs > self.PqBX] = bu
-
-            ncost = _eval(problem, u)
-            FEs += NP
-            optim = np.where(ncost < population.cost)[0]
-            for i in optim:
-                population.update_archive(i)
-            SF = F[optim]
-            SCr = Cr[optim]
-            df = np.maximum(0, population.cost - ncost)
-            population.update_M_F_Cr(SF, SCr, df[optim])
-            count_S = np.zeros(3)
-            for i in range(3):
-                count_S[i] = np.mean(df[mu == i] / (population.cost[mu == i] + 1e-9))
-            if np.sum(count_S) > 0:
-                self.pm = np.clip(count_S / np.sum(count_S), 0.1, 0.9)
-                self.pm /= np.sum(self.pm)
-            else:
-                self.pm = np.ones(3) / 3
-            population.group[optim] = u[optim]
-            population.cost = np.minimum(population.cost, ncost)
-            population.NLPSR(FEs, MaxFEs)
-            if float(np.min(population.cost)) < population.gbest:
-                population.gbest = float(np.min(population.cost))
-                population.gbest_solution = population.group[
-                    np.argmin(population.cost)
-                ].copy()
-            if FEs >= k * record_period:
-                Fevs.append(population.gbest)
-                k += 1
-
-        return population, Fevs, min(FEs, MaxFEs)
-
-
-_PORTFOLIO: dict[str, type] = {
-    "NL_SHADE_RSP": NL_SHADE_RSP,
-    "MADDE": MadDE,
-    "JDE21": JDE21,
-}
-
-
-def get_rldas_portfolio(names: list[str] | None = None) -> list:
-    """Return instantiated RL-DAS optimizer objects.
-
-    Parameters
-    ----------
-    names:
-        Optimizer names. Defaults to ``["NL_SHADE_RSP", "MADDE", "JDE21"]``.
-        Valid names: ``"NL_SHADE_RSP"``, ``"MADDE"``, ``"JDE21"``.
-    """
-    if names is None:
-        names = ["NL_SHADE_RSP", "MADDE", "JDE21"]
-    unknown = [n for n in names if n not in _PORTFOLIO]
-    if unknown:
-        raise ValueError(
-            f"Unknown RL-DAS optimizer(s): {unknown}. Valid choices: {list(_PORTFOLIO)}"
-        )
-    return [_PORTFOLIO[n]() for n in names]
diff --git a/agents/rl_das/optimizers/__init__.py b/agents/rl_das/optimizers/__init__.py
new file mode 100644
index 0000000..61c8883
--- /dev/null
+++ b/agents/rl_das/optimizers/__init__.py
@@ -0,0 +1,40 @@
+"""DE optimizer registry for RL-DAS.
+
+Optimizer implementations:
+  - nl_shade_rsp.py  → NL_SHADE_RSP
+  - jde21.py         → JDE21
+  - madde.py         → MadDE
+"""
+
+from __future__ import annotations
+
+from .jde21 import JDE21
+from .madde import MadDE
+from .nl_shade_rsp import NL_SHADE_RSP
+
+__all__ = ["NL_SHADE_RSP", "JDE21", "MadDE", "get_rldas_portfolio", "_PORTFOLIO"]
+
+_PORTFOLIO: dict[str, type] = {
+    "NL_SHADE_RSP": NL_SHADE_RSP,
+    "MADDE": MadDE,
+    "JDE21": JDE21,
+}
+
+
+def get_rldas_portfolio(names: list[str] | None = None) -> list:
+    """Return instantiated RL-DAS optimizer objects.
+
+    Parameters
+    ----------
+    names:
+        Optimizer names. Defaults to ``["NL_SHADE_RSP", "MADDE", "JDE21"]``.
+        Valid names: ``"NL_SHADE_RSP"``, ``"MADDE"``, ``"JDE21"``.
+    """
+    if names is None:
+        names = ["NL_SHADE_RSP", "MADDE", "JDE21"]
+    unknown = [n for n in names if n not in _PORTFOLIO]
+    if unknown:
+        raise ValueError(
+            f"Unknown RL-DAS optimizer(s): {unknown}. Valid choices: {list(_PORTFOLIO)}"
+        )
+    return [_PORTFOLIO[n]() for n in names]
diff --git a/agents/rl_das/optimizers/jde21.py b/agents/rl_das/optimizers/jde21.py
new file mode 100644
index 0000000..06a1b8b
--- /dev/null
+++ b/agents/rl_das/optimizers/jde21.py
@@ -0,0 +1,275 @@
+"""JDE21 differential-evolution optimizer for RL-DAS."""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+def _eval(problem, xs: np.ndarray) -> np.ndarray:
+    return np.array([float(problem(xs[i])) for i in range(len(xs))])
+
+
+class JDE21:
+    def __init__(self) -> None:
+        self.sNP = 10
+        self.tao1 = 0.1
+        self.tao2 = 0.1
+        self.Finit = 0.5
+        self.CRinit = 0.9
+        self.Fl_b = 0.1
+        self.Fl_s = 0.17
+        self.Fu = 1.1
+        self.CRl_b = 0.0
+        self.CRl_s = 0.1
+        self.CRu_b = 1.1
+        self.CRu_s = 0.8
+        self.eps = 1e-12
+        self.MyEps = 0.25
+        self.nReset = 0
+        self.sReset = 0
+        self.cCopy = 0
+
+    def _prevec_enakih(self, cost: np.ndarray, best: float) -> bool:
+        eqs = len(cost[np.fabs(cost - best) < self.eps])
+        return eqs > 2 and eqs > len(cost) * self.MyEps
+
+    def _crowding(self, group: np.ndarray, vs: np.ndarray) -> np.ndarray:
+        NP, dim = vs.shape
+        dist = np.sum(
+            ((group * np.ones((NP, NP, dim))).transpose(1, 0, 2) - vs) ** 2, -1
+        ).transpose()
+        return np.argmin(dist, -1)
+
+    def step(
+        self,
+        population,
+        problem,
+        FEs: int,
+        FEs_end: int,
+        MaxFEs: int,
+        record_period: int = -1,
+    ):
+        if record_period <= 0:
+            record_period = FEs_end - FEs
+        NP = population.NP
+        dim = population.dim
+        sNP = self.sNP
+        bNP = max(NP - sNP, 2)
+        age = 0
+        k = 1
+        Fevs = []
+
+        def mutate_cross_select(r1, r2, r3, SF, SCr, df, age, big):
+            nonlocal bNP
+            if big:
+                xNP = bNP
+                randF = np.random.rand(xNP) * self.Fu + self.Fl_b
+                randCr = np.random.rand(xNP) * self.CRu_b + self.CRl_b
+                pF = population.F[:xNP]
+                pCr = population.Cr[:xNP]
+            else:
+                # Use actual slice size (may be < sNP when population shrank)
+                xNP = population.F[bNP:].shape[0]
+                if xNP == 0:
+                    return SF, SCr, df, age
+                randF = np.random.rand(xNP) * self.Fu + self.Fl_s
+                randCr = np.random.rand(xNP) * self.CRu_b + self.CRl_s
+                pF = population.F[bNP:]
+                pCr = population.Cr[bNP:]
+
+            rvs = np.random.rand(xNP)
+            F = np.where(rvs < self.tao1, randF, pF)
+            rvs = np.random.rand(xNP)
+            Cr = np.where(rvs < self.tao2, randCr, pCr)
+            Fs = F.repeat(dim).reshape(xNP, dim)
+            Crs = Cr.repeat(dim).reshape(xNP, dim)
+            v = population.group[r1] + Fs * (
+                population.group[r2] - population.group[r3]
+            )
+            v = np.clip(v, population.Xmin, population.Xmax)
+            jrand = np.random.randint(dim, size=xNP)
+            u = np.where(
+                np.random.rand(xNP, dim) < Crs,
+                v,
+                (population.group[:bNP] if big else population.group[bNP:]),
+            )
+            u[np.arange(xNP), jrand] = v[np.arange(xNP), jrand]
+            cost = _eval(problem, u)
+            crowding_ids = (
+                self._crowding(population.group[:xNP], u)
+                if big
+                else np.arange(xNP) + bNP
+            )
+            age += xNP
+            for i in range(xNP):
+                idx = crowding_ids[i]
+                if cost[i] < population.cost[idx]:
+                    population.update_archive(idx)
+                    population.group[idx] = u[i]
+                    population.cost[idx] = cost[i]
+                    population.F[idx] = F[i]
+                    population.Cr[idx] = Cr[i]
+                    SF = np.append(SF, F[i])
+                    SCr = np.append(SCr, Cr[i])
+                    d = (population.cost[i] - cost[i]) / (population.cost[i] + 1e-9)
+                    df = np.append(df, d)
+                    if cost[i] < population.cbest:
+                        age = 0
+                        population.cbest_id = idx
+                        population.cbest = cost[i]
+                        if cost[i] < population.gbest:
+                            population.gbest = cost[i]
+                            population.gbest_solution = u[i].copy()
+
+            return SF, SCr, df, age
+
+        population.sort(NP, True)
+        while FEs >= k * record_period:
+            k += 1
+        while FEs < FEs_end:
+            df = np.array([])
+            SF = np.array([])
+            SCr = np.array([])
+            if (
+                self._prevec_enakih(population.cost[:bNP], population.gbest)
+                or age > MaxFEs / 10
+            ):
+                self.nReset += 1
+                population.group[:bNP] = population.initialize_group(bNP)
+                population.F[:bNP] = self.Finit
+                population.Cr[:bNP] = self.CRinit
+                population.cost[:bNP] = 1e15
+                age = 0
+                population.cbest = float(np.min(population.cost))
+                population.cbest_id = int(np.argmin(population.cost))
+
+            if FEs < MaxFEs / 3:
+                mig = 1
+            elif FEs < 2 * MaxFEs / 3:
+                mig = 2
+            else:
+                mig = 3
+
+            r1 = np.random.randint(bNP, size=bNP)
+            count = 0
+            duplicate = np.where((r1 == np.arange(bNP)) * (r1 == population.cbest_id))[
+                0
+            ]
+            while duplicate.shape[0] > 0 and count < 25:
+                r1[duplicate] = np.random.randint(bNP, size=duplicate.shape[0])
+                duplicate = np.where(
+                    (r1 == np.arange(bNP)) * (r1 == population.cbest_id)
+                )[0]
+                count += 1
+            r2 = np.random.randint(bNP + mig, size=bNP)
+            count = 0
+            duplicate = np.where((r2 == np.arange(bNP)) + (r2 == r1))[0]
+            while duplicate.shape[0] > 0 and count < 25:
+                r2[duplicate] = np.random.randint(bNP + mig, size=duplicate.shape[0])
+                duplicate = np.where((r2 == np.arange(bNP)) + (r2 == r1))[0]
+                count += 1
+            r3 = np.random.randint(bNP + mig, size=bNP)
+            count = 0
+            duplicate = np.where((r3 == np.arange(bNP)) + (r3 == r1) + (r3 == r2))[0]
+            while duplicate.shape[0] > 0 and count < 25:
+                r3[duplicate] = np.random.randint(bNP + mig, size=duplicate.shape[0])
+                duplicate = np.where((r3 == np.arange(bNP)) + (r3 == r1) + (r3 == r2))[
+                    0
+                ]
+                count += 1
+            SF, SCr, df, age = mutate_cross_select(
+                r1, r2, r3, SF, SCr, df, age, big=True
+            )
+            FEs += bNP
+            if FEs >= k * record_period:
+                Fevs.append(population.gbest)
+                k += 1
+            if FEs >= FEs_end or FEs >= MaxFEs:
+                break
+
+            curr_sNP = NP - bNP
+            if (
+                curr_sNP > 0
+                and population.cbest_id >= bNP
+                and self._prevec_enakih(population.cost[bNP:], population.cbest)
+            ):
+                self.sReset += 1
+                cbest = population.cbest
+                cbest_id = population.cbest_id
+                tmp = population.group[cbest_id].copy()
+                population.group[bNP:] = population.initialize_group(curr_sNP)
+                population.F[bNP:] = self.Finit
+                population.Cr[bNP:] = self.CRinit
+                population.cost[bNP:] = 1e15
+                population.cbest = cbest
+                population.cbest_id = cbest_id
+                population.group[cbest_id] = tmp
+                population.cost[cbest_id] = cbest
+
+            if population.cbest_id < bNP:
+                self.cCopy += 1
+                population.cost[bNP] = population.cbest
+                population.group[bNP] = population.group[population.cbest_id].copy()
+                population.cbest_id = bNP
+
+            curr_sNP = NP - bNP  # actual small-pop size (= sNP unless shrunken)
+            for _ in range(bNP // curr_sNP if curr_sNP > 0 else 0):
+                r1 = np.random.randint(curr_sNP, size=curr_sNP) + bNP
+                count = 0
+                duplicate = np.where(r1 == (np.arange(curr_sNP) + bNP))[0]
+                while duplicate.shape[0] > 0 and count < 25:
+                    r1[duplicate] = (
+                        np.random.randint(curr_sNP, size=duplicate.shape[0]) + bNP
+                    )
+                    duplicate = np.where(r1 == (np.arange(curr_sNP) + bNP))[0]
+                    count += 1
+                r2 = np.random.randint(curr_sNP, size=curr_sNP) + bNP
+                count = 0
+                duplicate = np.where((r2 == (np.arange(curr_sNP) + bNP)) + (r2 == r1))[
+                    0
+                ]
+                while duplicate.shape[0] > 0 and count < 25:
+                    r2[duplicate] = (
+                        np.random.randint(curr_sNP, size=duplicate.shape[0]) + bNP
+                    )
+                    duplicate = np.where(
+                        (r2 == (np.arange(curr_sNP) + bNP)) + (r2 == r1)
+                    )[0]
+                    count += 1
+                r3 = np.random.randint(curr_sNP, size=curr_sNP) + bNP
+                count = 0
+                duplicate = np.where(
+                    (r3 == (np.arange(curr_sNP) + bNP)) + (r3 == r1) + (r3 == r2)
+                )[0]
+                while duplicate.shape[0] > 0 and count < 25:
+                    r3[duplicate] = (
+                        np.random.randint(curr_sNP, size=duplicate.shape[0]) + bNP
+                    )
+                    duplicate = np.where(
+                        (r3 == (np.arange(curr_sNP) + bNP)) + (r3 == r1) + (r3 == r2)
+                    )[0]
+                    count += 1
+                SF, SCr, df, age = mutate_cross_select(
+                    r1, r2, r3, SF, SCr, df, age, big=False
+                )
+                FEs += curr_sNP
+                if FEs >= k * record_period:
+                    Fevs.append(population.gbest)
+                    k += 1
+                if FEs >= FEs_end or FEs >= MaxFEs:
+                    break
+
+            population.update_M_F_Cr(SF, SCr, df)
+            NP = min(
+                int(population.cal_NP_next_gen(FEs, MaxFEs)), len(population.group)
+            )
+            population.NP = NP
+            population.group = population.group[-NP:]
+            population.cost = population.cost[-NP:]
+            population.F = population.F[-NP:]
+            population.Cr = population.Cr[-NP:]
+            population.cbest_id = int(np.argmin(population.cost))
+            population.cbest = float(np.min(population.cost))
+            bNP = max(NP - sNP, 2)
+
+        return population, Fevs, min(FEs, MaxFEs)
diff --git a/agents/rl_das/optimizers/madde.py b/agents/rl_das/optimizers/madde.py
new file mode 100644
index 0000000..7bd5911
--- /dev/null
+++ b/agents/rl_das/optimizers/madde.py
@@ -0,0 +1,209 @@
+"""MadDE differential-evolution optimizer for RL-DAS."""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+def _eval(problem, xs: np.ndarray) -> np.ndarray:
+    return np.array([float(problem(xs[i])) for i in range(len(xs))])
+
+
+class MadDE:
+    def __init__(self) -> None:
+        self.p = 0.18
+        self.PqBX = 0.01
+        self.pm = np.ones(3) / 3
+
+    def _ctb_w_arc(
+        self,
+        group: np.ndarray,
+        best: np.ndarray,
+        archive: np.ndarray,
+        Fs: np.ndarray,
+    ) -> np.ndarray:
+        NP, dim = group.shape
+        NB = best.shape[0]
+        NA = archive.shape[0]
+        count = 0
+        rb = np.random.randint(NB, size=NP)
+        duplicate = np.where(rb == np.arange(NP))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            rb[duplicate] = np.random.randint(NB, size=duplicate.shape[0])
+            duplicate = np.where(rb == np.arange(NP))[0]
+            count += 1
+        count = 0
+        r1 = np.random.randint(NP, size=NP)
+        duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+            duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
+            count += 1
+        count = 0
+        r2 = np.random.randint(NP + NA, size=NP)
+        duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r2[duplicate] = np.random.randint(NP + NA, size=duplicate.shape[0])
+            duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
+            count += 1
+        xb = best[rb]
+        x1 = group[r1]
+        x2 = np.concatenate((group, archive), 0)[r2] if NA > 0 else group[r2]
+        return group + Fs * (xb - group) + Fs * (x1 - x2)
+
+    def _ctr_w_arc(
+        self,
+        group: np.ndarray,
+        archive: np.ndarray,
+        Fs: np.ndarray,
+    ) -> np.ndarray:
+        NP, dim = group.shape
+        NA = archive.shape[0]
+        count = 0
+        r1 = np.random.randint(NP, size=NP)
+        duplicate = np.where(r1 == np.arange(NP))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+            duplicate = np.where(r1 == np.arange(NP))[0]
+            count += 1
+        count = 0
+        r2 = np.random.randint(NP + NA, size=NP)
+        duplicate = np.where((r2 == np.arange(NP)) + (r2 == r1))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r2[duplicate] = np.random.randint(NP + NA, size=duplicate.shape[0])
+            duplicate = np.where((r2 == np.arange(NP)) + (r2 == r1))[0]
+            count += 1
+        x1 = group[r1]
+        x2 = np.concatenate((group, archive), 0)[r2] if NA > 0 else group[r2]
+        return group + Fs * (x1 - x2)
+
+    def _weighted_rtb(
+        self,
+        group: np.ndarray,
+        best: np.ndarray,
+        Fs: np.ndarray,
+        Fas: float,
+    ) -> np.ndarray:
+        NP, dim = group.shape
+        NB = best.shape[0]
+        count = 0
+        rb = np.random.randint(NB, size=NP)
+        duplicate = np.where(rb == np.arange(NP))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            rb[duplicate] = np.random.randint(NB, size=duplicate.shape[0])
+            duplicate = np.where(rb == np.arange(NP))[0]
+            count += 1
+        count = 0
+        r1 = np.random.randint(NP, size=NP)
+        duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+            duplicate = np.where((r1 == rb) + (r1 == np.arange(NP)))[0]
+            count += 1
+        count = 0
+        r2 = np.random.randint(NP, size=NP)
+        duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
+        while duplicate.shape[0] > 0 and count < 25:
+            r2[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+            duplicate = np.where((r2 == rb) + (r2 == np.arange(NP)) + (r2 == r1))[0]
+            count += 1
+        return Fs * group[r1] + Fs * Fas * (best[rb] - group[r2])
+
+    @staticmethod
+    def _binomial(x: np.ndarray, v: np.ndarray, Crs: np.ndarray) -> np.ndarray:
+        NP, dim = x.shape
+        jrand = np.random.randint(dim, size=NP)
+        u = np.where(np.random.rand(NP, dim) < Crs, v, x)
+        u[np.arange(NP), jrand] = v[np.arange(NP), jrand]
+        return u
+
+    def step(
+        self,
+        population,
+        problem,
+        FEs: int,
+        FEs_end: int,
+        MaxFEs: int,
+        record_period: int = -1,
+    ):
+        if record_period <= 0:
+            record_period = FEs_end - FEs
+        Fevs = []
+        k = 1
+        while FEs >= k * record_period:
+            k += 1
+        population.sort(population.NP)
+        while FEs < FEs_end and FEs < MaxFEs:
+            NP, dim = population.NP, population.dim
+            q = 2 * self.p - self.p * FEs / MaxFEs
+            Fa = 0.5 + 0.5 * FEs / MaxFEs
+            Cr, F = population.choose_F_Cr()
+            mu = np.random.choice(3, size=NP, p=self.pm)
+            p1 = population.group[mu == 0]
+            p2 = population.group[mu == 1]
+            p3 = population.group[mu == 2]
+            pbest = population.group[: max(int(self.p * NP), 2)]
+            qbest = population.group[: max(int(q * NP), 2)]
+            Fs = F.repeat(dim).reshape(NP, dim)
+            v1 = self._ctb_w_arc(p1, pbest, population.archive, Fs[mu == 0])
+            v2 = self._ctr_w_arc(p2, population.archive, Fs[mu == 1])
+            v3 = self._weighted_rtb(p3, qbest, Fs[mu == 2], Fa)
+            v = np.zeros((NP, dim))
+            v[mu == 0] = v1
+            v[mu == 1] = v2
+            v[mu == 2] = v3
+
+            # Bounds handling: midpoint reflection with lb/ub
+            Xmin = population.Xmin
+            Xmax = population.Xmax
+            v = np.where(v < Xmin, (population.group + Xmin) / 2, v)
+            v = np.where(v > Xmax, (population.group + Xmax) / 2, v)
+
+            rvs = np.random.rand(NP)
+            Crs = Cr.repeat(dim).reshape(NP, dim)
+            u = np.zeros((NP, dim))
+            if np.sum(rvs <= self.PqBX) > 0:
+                qu = v[rvs <= self.PqBX]
+                if population.archive.shape[0] > 0:
+                    qbest = np.concatenate((population.group, population.archive), 0)[
+                        : max(int(q * (NP + population.archive.shape[0])), 2)
+                    ]
+                cross_qbest = qbest[np.random.randint(qbest.shape[0], size=qu.shape[0])]
+                qu = self._binomial(cross_qbest, qu, Crs[rvs <= self.PqBX])
+                u[rvs <= self.PqBX] = qu
+            bu = v[rvs > self.PqBX]
+            bu = self._binomial(
+                population.group[rvs > self.PqBX], bu, Crs[rvs > self.PqBX]
+            )
+            u[rvs > self.PqBX] = bu
+
+            ncost = _eval(problem, u)
+            FEs += NP
+            optim = np.where(ncost < population.cost)[0]
+            for i in optim:
+                population.update_archive(i)
+            SF = F[optim]
+            SCr = Cr[optim]
+            df = np.maximum(0, population.cost - ncost)
+            population.update_M_F_Cr(SF, SCr, df[optim])
+            count_S = np.zeros(3)
+            for i in range(3):
+                count_S[i] = np.mean(df[mu == i] / (population.cost[mu == i] + 1e-9))
+            if np.sum(count_S) > 0:
+                self.pm = np.clip(count_S / np.sum(count_S), 0.1, 0.9)
+                self.pm /= np.sum(self.pm)
+            else:
+                self.pm = np.ones(3) / 3
+            population.group[optim] = u[optim]
+            population.cost = np.minimum(population.cost, ncost)
+            population.NLPSR(FEs, MaxFEs)
+            if float(np.min(population.cost)) < population.gbest:
+                population.gbest = float(np.min(population.cost))
+                population.gbest_solution = population.group[
+                    np.argmin(population.cost)
+                ].copy()
+            if FEs >= k * record_period:
+                Fevs.append(population.gbest)
+                k += 1
+
+        return population, Fevs, min(FEs, MaxFEs)
diff --git a/agents/rl_das/optimizers/nl_shade_rsp.py b/agents/rl_das/optimizers/nl_shade_rsp.py
new file mode 100644
index 0000000..8a180a7
--- /dev/null
+++ b/agents/rl_das/optimizers/nl_shade_rsp.py
@@ -0,0 +1,166 @@
+"""NL_SHADE_RSP differential-evolution optimizer for RL-DAS."""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+def _eval(problem, xs: np.ndarray) -> np.ndarray:
+    return np.array([float(problem(xs[i])) for i in range(len(xs))])
+
+
+class NL_SHADE_RSP:
+    def __init__(self) -> None:
+        self.pb = 0.4
+        self.pa = 0.5
+
+    def _binomial(self, x: np.ndarray, v: np.ndarray, cr: np.ndarray) -> np.ndarray:
+        NP, dim = x.shape
+        jrand = np.random.randint(dim, size=NP)
+        u = np.where(np.random.rand(NP, dim) < cr.repeat(dim).reshape(NP, dim), v, x)
+        u[np.arange(NP), jrand] = v[np.arange(NP), jrand]
+        return u
+
+    def _exponential(self, x: np.ndarray, v: np.ndarray, cr: np.ndarray) -> np.ndarray:
+        NP, dim = x.shape
+        u = x.copy()
+        L = np.random.randint(dim, size=NP).repeat(dim).reshape(NP, dim)
+        L = L <= np.arange(dim)
+        rvs = np.random.rand(NP, dim)
+        L = np.where(rvs > cr.repeat(dim).reshape(NP, dim), L, 0)
+        return u * (1 - L) + v * L
+
+    def _update_pa(self, fa: float, fp: float, na: int, NP: int) -> None:
+        if na == 0 or fa == 0:
+            self.pa = 0.5
+            return
+        self.pa = (fa / (na + 1e-15)) / ((fa / (na + 1e-15)) + (fp / (NP - na + 1e-15)))
+        self.pa = float(np.clip(self.pa, 0.1, 0.9))
+
+    def step(
+        self,
+        population,
+        problem,
+        FEs: int,
+        FEs_end: int,
+        MaxFEs: int,
+        record_period: int = -1,
+    ):
+        if record_period <= 0:
+            record_period = FEs_end - FEs
+        NP, dim = population.NP, population.dim
+        NA = int(NP * 2.1)
+        if NA < population.archive.shape[0]:
+            population.archive = population.archive[:NA]
+        self.pa = 0.5
+        k = 1
+        Fevs = []
+        while FEs >= k * record_period:
+            k += 1
+        population.sort(population.NP)
+
+        while FEs < FEs_end and FEs < MaxFEs:
+            Cr, F = population.choose_F_Cr()
+            Cr = np.sort(Cr)
+            pr = np.exp(-(np.arange(NP) + 1) / NP)
+            pr /= np.sum(pr)
+            cross_exponential = np.random.random() < 0.5
+            pb_upper = int(max(2, NP * self.pb))
+            pbs = np.random.randint(pb_upper, size=NP)
+            count = 0
+            duplicate = np.where(pbs == np.arange(NP))[0]
+            while duplicate.shape[0] > 0 and count < 1:
+                pbs[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+                duplicate = np.where(pbs == np.arange(NP))[0]
+                count += 1
+            xpb = population.group[pbs]
+            r1 = np.random.randint(NP, size=NP)
+            count = 0
+            duplicate = np.where((r1 == np.arange(NP)) + (r1 == pbs))[0]
+            while duplicate.shape[0] > 0 and count < 25:
+                r1[duplicate] = np.random.randint(NP, size=duplicate.shape[0])
+                duplicate = np.where((r1 == np.arange(NP)) + (r1 == pbs))[0]
+                count += 1
+            x1 = population.group[r1]
+            rvs = np.random.rand(NP)
+            r2_pop = np.where(rvs >= self.pa)[0]
+            r2_arc = np.where(rvs < self.pa)[0]
+            use_arc = np.zeros(NP, dtype=bool)
+            use_arc[r2_arc] = True
+            if population.archive.shape[0] < 25:
+                r2_pop = np.arange(NP)
+                r2_arc = np.array([], dtype=np.int32)
+            r2 = np.random.choice(np.arange(NP), size=r2_pop.shape[0], p=pr)
+            count = 0
+            duplicate = np.where(
+                (r2 == r2_pop) + (r2 == pbs[r2_pop]) + (r2 == r1[r2_pop])
+            )[0]
+            while duplicate.shape[0] > 0 and count < 25:
+                r2[duplicate] = np.random.choice(
+                    np.arange(NP), size=duplicate.shape[0], p=pr
+                )
+                duplicate = np.where(
+                    (r2 == r2_pop) + (r2 == pbs[r2_pop]) + (r2 == r1[r2_pop])
+                )[0]
+                count += 1
+            x2 = np.zeros((NP, dim))
+            if r2_pop.shape[0] > 0:
+                x2[r2_pop] = population.group[r2]
+            if r2_arc.shape[0] > 0:
+                x2[r2_arc] = population.archive[
+                    np.random.randint(
+                        min(population.archive.shape[0], NA),
+                        size=r2_arc.shape[0],
+                    )
+                ]
+            Fs = F.repeat(dim).reshape(NP, dim)
+            vs = population.group + Fs * (xpb - population.group) + Fs * (x1 - x2)
+            Crb = np.zeros(NP)
+            tmp_id = np.where(np.arange(NP) + FEs < 0.5 * MaxFEs)[0]
+            Crb[tmp_id] = 2 * ((FEs + tmp_id) / MaxFEs - 0.5)
+            if cross_exponential:
+                us = self._binomial(population.group, vs, Crb)
+            else:
+                us = self._exponential(population.group, vs, Cr)
+
+            # Reinitialise out-of-bounds elements with uniform random in [lb, ub]
+            oob = (us < population.Xmin) | (us > population.Xmax)
+            if np.any(oob):
+                rand_pts = (
+                    np.random.rand(NP, dim) * (population.Xmax - population.Xmin)
+                    + population.Xmin
+                )
+                us = np.where(oob, rand_pts, us)
+
+            cost = _eval(problem, us)
+            optim = np.where(cost < population.cost)[0]
+            for i in optim:
+                population.update_archive(i)
+            population.F[optim] = F[optim]
+            population.Cr[optim] = Cr[optim]
+            SF = F[optim]
+            SCr = Cr[optim]
+            df = (population.cost[optim] - cost[optim]) / (
+                population.cost[optim] + 1e-9
+            )
+            arc_usage = use_arc[optim]
+            fp = float(np.sum(df[arc_usage]))
+            fa = float(np.sum(df[~arc_usage]))
+            na = int(np.sum(arc_usage))
+            population.group[optim] = us[optim]
+            population.cost[optim] = cost[optim]
+            if float(np.min(cost)) < population.gbest:
+                population.gbest = float(np.min(cost))
+                population.gbest_solution = us[np.argmin(cost)].copy()
+            FEs += NP
+            self.pb = 0.4 - 0.2 * (FEs / MaxFEs)
+            population.NLPSR(FEs, MaxFEs)
+            population.update_M_F_Cr(SF, SCr, df)
+            self._update_pa(fa, fp, na, NP)
+            NP = population.NP
+            NA = population.NA
+            if FEs >= k * record_period:
+                Fevs.append(population.gbest)
+                k += 1
+
+        return population, Fevs, min(FEs, MaxFEs)
diff --git a/agents/rl_das/trainer.py b/agents/rl_das/trainer.py
index 2f86581..ee9b343 100644
--- a/agents/rl_das/trainer.py
+++ b/agents/rl_das/trainer.py
@@ -112,7 +112,7 @@ def train(
             ep = _run_episode(train_env, agent, deterministic=False)
             epoch_rewards.append(ep["total_reward"])
 
-            diagnostics = agent.learn(k_epoch)
+            agent.learn(k_epoch)
             agent.rollout.clear()
 
         mean_train_reward = float(np.mean(epoch_rewards))
diff --git a/baselines.py b/baselines.py
index 9fbd9c6..909e983 100644
--- a/baselines.py
+++ b/baselines.py
@@ -27,7 +27,7 @@
 
 Outputs
 -------
-    results/<name>_<agent_tag>.jsonl     per-problem {problem_id, agent, best_y, gap}
+    results/<name>_<agent_tag>.jsonl     per-problem {problem_id: {area_under_optimization_curve, aocc, final_fitness, agent}}
     results/<name>_baselines_summary.jsonl   aggregate comparison (when --agent all)
 """
 
@@ -35,7 +35,6 @@
 import json
 import os
 import warnings
-from itertools import product
 
 import cocoex
 import numpy as np
@@ -45,7 +44,7 @@
 from das.optimizers.portfolio import get_portfolio
 from das.utils import set_seed
 from das.env.bbob_splits import ALL_DIMS, get_train_test_split
-from das.training.common import load_global_optima
+from das.training.common import compute_run_stats, load_global_optima
 
 warnings.filterwarnings("ignore")
 
@@ -71,15 +70,22 @@ def _policy(obs: np.ndarray, n_actions: int) -> int:
 # ------------------------------------------------------------------ #
 
 
-def run_episode(env: DASEnv, policy_fn) -> dict:
-    """Run one complete episode; return the final info dict."""
-    obs, info = env.reset()
+def run_episode(env: DASEnv, policy_fn) -> tuple[dict, list[tuple[int, float]]]:
+    """Run one complete episode.
+
+    Returns the final step-info dict and the full best-so-far improvement
+    history accumulated across all checkpoints (exact AOCC input).
+    """
+    obs, _ = env.reset()
     done = False
+    step_info: dict = {}
+    fitness_history: list[tuple[int, float]] = []
     while not done:
         action = policy_fn(obs, env.action_space.n)
-        obs, _, terminated, truncated, info = env.step(action)
+        obs, _, terminated, truncated, step_info = env.step(action)
         done = terminated or truncated
-    return info
+        fitness_history.extend(step_info.get("fitness_history_step", []))
+    return step_info, fitness_history
 
 
 def collect_env_results(
@@ -104,17 +110,11 @@ def collect_env_results(
     )
     results = []
     for problem_id in tqdm(test_ids, desc=f"  {agent_tag}", smoothing=0.0):
-        info = run_episode(env, policy_fn)
-        best_y = info.get("best_y", float("inf"))
-        optimum = global_optima.get(problem_id, 0.0)
-        results.append(
-            {
-                "problem_id": problem_id,
-                "agent": agent_tag,
-                "best_y": best_y,
-                "gap": best_y - optimum,
-            }
-        )
+        step_info, fitness_history = run_episode(env, policy_fn)
+        max_fe = step_info.get("n_fe", 0)
+        global_minimum = global_optima.get(problem_id, 0.0)
+        stats = compute_run_stats(fitness_history, max_fe, global_minimum)
+        results.append({problem_id: {**stats, "agent": agent_tag}})
     env.close()
     return results
 
@@ -125,9 +125,17 @@ def collect_env_results(
 
 
 def run_single_algorithm(
-    optimizer_class, problem, fe_multiplier: int, n_individuals: int
-) -> float:
-    """Run one optimizer for the full budget in one uninterrupted call."""
+    optimizer_class,
+    problem,
+    fe_multiplier: int,
+    n_individuals: int,
+    global_minimum: float = 0.0,
+) -> dict[str, float]:
+    """Run one optimizer for the full budget in one uninterrupted call.
+
+    Returns a stats dict with area_under_optimization_curve, aocc, and final_fitness.
+    Uses the optimizer's exact fitness_history (improvement points) for AOCC.
+    """
     max_fe = fe_multiplier * problem.dimension
     problem_config = {
         "fitness_function": problem,
@@ -145,7 +153,8 @@ def run_single_algorithm(
     result = optimizer.optimize()
     if isinstance(result, tuple):
         result = result[0]
-    return float(result.get("best_so_far_y", float("inf")))
+    fitness_history = result.get("fitness_history", [])
+    return compute_run_stats(fitness_history, max_fe, global_minimum)
 
 
 def collect_single_results(
@@ -161,18 +170,11 @@ def collect_single_results(
     results = []
     for problem_id in tqdm(test_ids, desc=f"  {agent_tag}", smoothing=0.0):
         problem = suite.get_problem(problem_id)
-        best_y = run_single_algorithm(
-            optimizer_class, problem, fe_multiplier, n_individuals
-        )
-        optimum = global_optima.get(problem_id, 0.0)
-        results.append(
-            {
-                "problem_id": problem_id,
-                "agent": agent_tag,
-                "best_y": best_y,
-                "gap": best_y - optimum,
-            }
+        global_minimum = global_optima.get(problem_id, 0.0)
+        stats = run_single_algorithm(
+            optimizer_class, problem, fe_multiplier, n_individuals, global_minimum
         )
+        results.append({problem_id: {**stats, "agent": agent_tag}})
     return results
 
 
@@ -192,38 +194,43 @@ def compute_oracle(all_results: dict[str, list[dict]]) -> tuple[list[dict], list
     -------
     oracle_best, oracle_worst: per-problem dicts annotated with winning agent
     """
-    # Index by problem_id
-    by_problem: dict[str, list[dict]] = {}
+    by_problem: dict[str, list[tuple[str, dict]]] = {}
     for tag, records in all_results.items():
         for r in records:
-            pid = r["problem_id"]
-            by_problem.setdefault(pid, []).append(r)
+            pid, metrics = next(iter(r.items()))
+            by_problem.setdefault(pid, []).append((pid, metrics))
 
     oracle_best, oracle_worst = [], []
-    for pid, records in by_problem.items():
-        best = min(records, key=lambda r: r["gap"])
-        worst = max(records, key=lambda r: r["gap"])
+    for pid, entries in by_problem.items():
+        best_pid, best_m = min(entries, key=lambda e: e[1]["final_fitness"])
+        worst_pid, worst_m = max(entries, key=lambda e: e[1]["final_fitness"])
         oracle_best.append(
             {
-                "problem_id": pid,
-                "agent": "oracle-best",
-                "best_agent": best["agent"],
-                "best_y": best["best_y"],
-                "gap": best["gap"],
+                pid: {
+                    "area_under_optimization_curve": best_m[
+                        "area_under_optimization_curve"
+                    ],
+                    "final_fitness": best_m["final_fitness"],
+                    "agent": "oracle-best",
+                    "best_agent": best_m["agent"],
+                }
             }
         )
         oracle_worst.append(
             {
-                "problem_id": pid,
-                "agent": "oracle-worst",
-                "worst_agent": worst["agent"],
-                "best_y": worst["best_y"],
-                "gap": worst["gap"],
+                pid: {
+                    "area_under_optimization_curve": worst_m[
+                        "area_under_optimization_curve"
+                    ],
+                    "final_fitness": worst_m["final_fitness"],
+                    "agent": "oracle-worst",
+                    "worst_agent": worst_m["agent"],
+                }
             }
         )
 
-    oracle_best.sort(key=lambda r: r["problem_id"])
-    oracle_worst.sort(key=lambda r: r["problem_id"])
+    oracle_best.sort(key=lambda r: next(iter(r)))
+    oracle_worst.sort(key=lambda r: next(iter(r)))
     return oracle_best, oracle_worst
 
 
@@ -233,14 +240,20 @@ def compute_oracle(all_results: dict[str, list[dict]]) -> tuple[list[dict], list
 
 
 def summarise(tag: str, records: list[dict]) -> dict:
-    gaps = [r["gap"] for r in records]
+    fitnesses = [next(iter(r.values()))["final_fitness"] for r in records]
+    aocc_vals = [next(iter(r.values()))["aocc"] for r in records]
+    auoc_vals = [
+        next(iter(r.values()))["area_under_optimization_curve"] for r in records
+    ]
     return {
         "agent": tag,
-        "n_problems": len(gaps),
-        "mean_gap": float(np.mean(gaps)),
-        "median_gap": float(np.median(gaps)),
-        "best_gap": float(np.min(gaps)),
-        "worst_gap": float(np.max(gaps)),
+        "n_problems": len(fitnesses),
+        "mean_final_fitness": float(np.mean(fitnesses)),
+        "median_final_fitness": float(np.median(fitnesses)),
+        "best_final_fitness": float(np.min(fitnesses)),
+        "worst_final_fitness": float(np.max(fitnesses)),
+        "mean_aocc": float(np.mean(aocc_vals)),
+        "mean_auoc": float(np.mean(auoc_vals)),
     }
 
 
@@ -252,17 +265,15 @@ def save_results(records: list[dict], path: str) -> None:
 
 def print_summary(summaries: list[dict]) -> None:
     width = max(len(s["agent"]) for s in summaries) + 2
-    header = (
-        f"  {'Agent':<{width}}  {'Mean gap':>12}  {'Median gap':>12}  {'Best gap':>12}"
-    )
+    header = f"  {'Agent':<{width}}  {'Mean fitness':>14}  {'Median fitness':>14}  {'Mean AUOC':>14}"
     print(header)
     print("  " + "-" * (len(header) - 2))
     for s in summaries:
         print(
             f"  {s['agent']:<{width}}  "
-            f"{s['mean_gap']:>12.4e}  "
-            f"{s['median_gap']:>12.4e}  "
-            f"{s['best_gap']:>12.4e}"
+            f"{s['mean_final_fitness']:>14.4e}  "
+            f"{s['median_final_fitness']:>14.4e}  "
+            f"{s['mean_auoc']:>14.4e}"
         )
 
 
diff --git a/cv.py b/cv.py
index 8b2a6d1..323d7f3 100644
--- a/cv.py
+++ b/cv.py
@@ -195,6 +195,13 @@ def _parse_args() -> argparse.Namespace:
         "--ppo-epochs", type=int, default=6, help="PPO gradient epochs per update"
     )
     exp.add_argument("--device", default="cpu", help="PyTorch device")
+    exp.add_argument(
+        "-j",
+        "--n-jobs",
+        type=int,
+        default=1,
+        help="Number of folds to run in parallel (default: 1 = sequential)",
+    )
 
     return root.parse_args()
 
diff --git a/das/env/__init__.py b/das/env/__init__.py
index 111b9ff..0ba3a18 100644
--- a/das/env/__init__.py
+++ b/das/env/__init__.py
@@ -1 +1,3 @@
 from das.env.das_env import DASEnv
+
+__all__ = ["DASEnv"]
diff --git a/das/env/das_env.py b/das/env/das_env.py
index edd4c90..22a51d0 100644
--- a/das/env/das_env.py
+++ b/das/env/das_env.py
@@ -158,6 +158,7 @@ def step(self, action: int):
             "best_y": self._best_y,
             "n_fe": self._n_fe,
             "checkpoint": self._checkpoint_idx,
+            "fitness_history_step": result.get("fitness_history", []),
         }
         return obs, reward, terminated, False, info
 
diff --git a/das/optimizers/BO/__init__.py b/das/optimizers/BO/__init__.py
index 46a4ce2..1e8dd03 100644
--- a/das/optimizers/BO/__init__.py
+++ b/das/optimizers/BO/__init__.py
@@ -1 +1 @@
-from das.optimizers.BO.gpbo import GPBO_EI, GPBO_UCB
+from das.optimizers.BO.gpbo import GPBO_EI as GPBO_EI, GPBO_UCB as GPBO_UCB
diff --git a/das/optimizers/DS/nm.py b/das/optimizers/DS/nm.py
index 18d48cb..9882906 100644
--- a/das/optimizers/DS/nm.py
+++ b/das/optimizers/DS/nm.py
@@ -72,7 +72,7 @@ def initialize(self, x=None, y=None):
 
     def iterate(self, x, y):
         order = np.argsort(y)
-        l, h = order[0], order[-1]
+        lo, h = order[0], order[-1]
         p_mean = np.mean(x[order[:-1]], axis=0)
 
         # Reflection
@@ -85,7 +85,7 @@ def iterate(self, x, y):
         if self._check_terminations():
             return x, y
 
-        if y_r < y[l]:
+        if y_r < y[lo]:
             # Expansion
             p_e = np.clip(
                 self.gamma * p_r + (1 - self.gamma) * p_mean,
@@ -113,7 +113,7 @@ def iterate(self, x, y):
                     # Shrinkage
                     for i in range(1, self.n_individuals):
                         x[order[i]] = np.clip(
-                            x[l] + self.shrinkage * (x[order[i]] - x[l]),
+                            x[lo] + self.shrinkage * (x[order[i]] - x[lo]),
                             self.lower_boundary,
                             self.upper_boundary,
                         )
diff --git a/das/optimizers/__init__.py b/das/optimizers/__init__.py
index e516e9e..db98390 100644
--- a/das/optimizers/__init__.py
+++ b/das/optimizers/__init__.py
@@ -1 +1,4 @@
-from das.optimizers.portfolio import PORTFOLIO, get_portfolio
+from das.optimizers.portfolio import (
+    PORTFOLIO as PORTFOLIO,
+    get_portfolio as get_portfolio,
+)
diff --git a/das/training/callbacks.py b/das/training/callbacks.py
index 5f24a14..83a3c68 100644
--- a/das/training/callbacks.py
+++ b/das/training/callbacks.py
@@ -9,11 +9,10 @@
 
 import json
 import os
-from itertools import product
 from typing import Any
 
 import numpy as np
-from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
+from stable_baselines3.common.callbacks import BaseCallback
 from stable_baselines3.common.vec_env import VecEnv
 
 try:
@@ -124,7 +123,7 @@ def _on_step(self) -> bool:
         return True
 
     def _evaluate(self):
-        aoccs, best_ys = [], []
+        best_ys = []
         obs = self.eval_env.reset()
 
         for _ in range(self.n_eval_episodes):
diff --git a/das/training/common.py b/das/training/common.py
index 65bc07a..63e6a73 100644
--- a/das/training/common.py
+++ b/das/training/common.py
@@ -3,6 +3,8 @@
 import json
 import os
 
+import numpy as np
+
 from das.env.das_env import DASEnv
 
 
@@ -13,6 +15,61 @@ def load_global_optima(path: str = "bbob_optima.jsonl") -> dict[str, float]:
         return {k: v for line in f for k, v in json.loads(line).items()}
 
 
+def compute_run_stats(
+    fitness_history: list[tuple[int, float]],
+    max_fe: int,
+    global_minimum: float,
+) -> dict[str, float]:
+    """Compute AUOC and AOCC from a best-so-far improvement history.
+
+    Parameters
+    ----------
+    fitness_history:
+        List of (n_fe, best_y) pairs recorded at every improvement point.
+        FE counts must be absolute (1-indexed, relative to episode start).
+    max_fe:
+        Total function-evaluation budget for the episode.
+    global_minimum:
+        Known optimum value; used to shift fitness so the excess is >= 0.
+        Pass 0.0 when the optimum is unknown.
+    """
+    if not fitness_history or max_fe <= 0:
+        return {"area_under_optimization_curve": 0.0, "aocc": 0.0, "final_fitness": 0.0}
+
+    lb, ub = 1e-8, 1e8
+    log_lb, log_ub = -8.0, 8.0
+
+    auoc_area = 0.0
+    aocc_area = 0.0
+    prev_fe = 0
+
+    for fe, best_y in fitness_history:
+        shifted = best_y - global_minimum
+        width = fe - prev_fe
+
+        auoc_area += shifted * width
+
+        clipped = np.clip(shifted, lb, ub)
+        normalized = (np.log10(clipped) - log_lb) / (log_ub - log_lb)
+        aocc_area += (1.0 - normalized) * width
+
+        prev_fe = fe
+
+    # Final plateau from last improvement to end of budget
+    final_shifted = fitness_history[-1][1] - global_minimum
+    final_width = max_fe - fitness_history[-1][0]
+    auoc_area += final_shifted * final_width
+    final_clipped = np.clip(final_shifted, lb, ub)
+    final_normalized = (np.log10(final_clipped) - log_lb) / (log_ub - log_lb)
+    aocc_area += (1.0 - final_normalized) * final_width
+
+    return {
+        "area_under_optimization_curve": auoc_area / max_fe,
+        "aocc": aocc_area / max_fe,
+        "final_fitness": final_shifted,
+    }
+
+
 def make_das_env(problem_ids: list[str], optimizers: list, cfg: dict):
     """Return a zero-argument factory for use with SB3's make_vec_env."""
 
diff --git a/das/training/expdas.py b/das/training/expdas.py
index 05dc281..62eb0db 100644
--- a/das/training/expdas.py
+++ b/das/training/expdas.py
@@ -2,6 +2,7 @@
 
 import json
 import os
+from concurrent.futures import ProcessPoolExecutor, as_completed
 
 import cocoex as cx
 import numpy as np
@@ -10,7 +11,7 @@
 from das.env.das_env import DASEnv
 from das.env.observation import observation_dim
 from das.optimizers.portfolio import get_portfolio
-from das.training.common import write_jsonl
+from das.training.common import load_global_optima, write_jsonl
 
 
 def run_exp_das(args) -> None:
@@ -76,34 +77,51 @@ def run_exp_das(args) -> None:
 
     if args.eval:
         print("\nFinal evaluation on test set …")
-        test_results = evaluate(test_env, agent, n_episodes=min(len(test_ids), 50))
-        mean_best_y = float(np.mean([r["best_y"] for r in test_results]))
-        print(f"Test mean best_y = {mean_best_y:.6e}")
+        global_optima = load_global_optima()
+        test_results = evaluate(
+            test_env,
+            agent,
+            n_episodes=min(len(test_ids), 50),
+            global_optima=global_optima,
+        )
+        mean_final_fitness = float(
+            np.mean([next(iter(r.values()))["final_fitness"] for r in test_results])
+        )
+        print(f"Test mean final_fitness = {mean_final_fitness:.6e}")
         eval_path = os.path.join("results", f"{args.name}_eval.jsonl")
         write_jsonl(eval_path, test_results)
         print(f"Results saved to {eval_path}")
 
 
-def run_cv_exp_das(args) -> None:
-    from agents.exponential_das import ExpDASAgent
-    from agents.exponential_das import train, evaluate
+def _run_single_fold(
+    fold_idx: int, fold_info: tuple, args
+) -> tuple[int, str, list[dict]]:
+    """Train and evaluate one CV fold.  Safe to call in a subprocess.
 
-    optimizers = get_portfolio(args.portfolio)
-    n_opt = len(optimizers)
-    obs_dim = observation_dim(n_opt)
+    Each call creates its own cocoex Suite so the (non-picklable) Suite object
+    never crosses process boundaries.
 
-    suite = cx.Suite("bbob", "", "")
+    Returns (fold_idx, fold_tag, fold_results).
+    """
+    # Lazy imports so the function can be pickled by ProcessPoolExecutor.
+    import cocoex as _cx
+    from agents.exponential_das import ExpDASAgent
+    from agents.exponential_das import train as _train, evaluate as _evaluate
 
-    all_folds = get_cv_folds(
-        args.cv_mode, args.dims, seed=args.seed, n_folds=args.n_folds
-    )
-    fold_indices = list(range(len(all_folds))) if args.folds is None else args.folds
+    train_ids, test_ids, fold_tag = fold_info
+    fold_name = f"{args.name}_cv_{fold_tag}"
+    model_path = os.path.join("models", f"{fold_name}_final.pt")
+    result_path = os.path.join("results", f"{fold_name}.jsonl")
 
-    print(f"CV mode    : {args.cv_mode}  ({len(fold_indices)}/{len(all_folds)} folds)")
-    print(f"n_epochs/fold: {args.n_epochs}  |  dims={args.dims}")
+    prefix = f"[{fold_tag}]"
+    print(f"{prefix} starting  ({len(train_ids)} train / {len(test_ids)} test)")
 
+    optimizers = get_portfolio(args.portfolio)
+    n_opt = len(optimizers)
+    obs_dim = observation_dim(n_opt)
     buffer_capacity = args.buffer_capacity or (16 * args.n_checkpoints)
 
+    suite = _cx.Suite("bbob", "", "")
     env_cfg = dict(
         suite=suite,
         optimizers=optimizers,
@@ -115,80 +133,113 @@ def run_cv_exp_das(args) -> None:
         seed=args.seed,
     )
 
-    fold_summaries = []
+    total_episodes = args.n_epochs * len(train_ids)
+
+    if os.path.exists(model_path):
+        print(f"{prefix} [skip training] model already exists")
+        agent = ExpDASAgent.load(model_path, obs_dim=obs_dim, n_actions=n_opt)
+    else:
+        agent = ExpDASAgent(
+            obs_dim=obs_dim,
+            n_actions=n_opt,
+            buffer_capacity=buffer_capacity,
+            actor_lr=args.actor_lr,
+            critic_lr=args.critic_lr,
+            ppo_epochs=args.ppo_epochs,
+            n_checkpoints=args.n_checkpoints,
+            device=args.device,
+        )
+        train_env = DASEnv(problem_ids=train_ids, **env_cfg)
+        _train(
+            train_env=train_env,
+            test_env=None,
+            agent=agent,
+            total_episodes=total_episodes,
+            eval_interval=total_episodes + 1,
+            save_interval=args.save_interval,
+            save_dir="models",
+            name=fold_name,
+        )
+        train_env.close()
+
+    if os.path.exists(result_path):
+        print(f"{prefix} [skip evaluation] results already exist")
+        with open(result_path) as fh:
+            fold_results = [json.loads(line) for line in fh]
+    else:
+        global_optima = load_global_optima()
+        eval_env = DASEnv(problem_ids=test_ids, **env_cfg)
+        raw = _evaluate(
+            eval_env, agent, n_episodes=len(test_ids), global_optima=global_optima
+        )
+        fold_results = [
+            {pid: {**m, "fold": fold_tag}} for r in raw for pid, m in r.items()
+        ]
+        eval_env.close()
+        write_jsonl(result_path, fold_results)
+        print(f"{prefix} saved results → {result_path}")
+
+    mean_ff = float(
+        np.mean([next(iter(r.values()))["final_fitness"] for r in fold_results])
+    )
+    print(f"{prefix} mean final_fitness={mean_ff:.4e}")
+    return fold_idx, fold_tag, fold_results
 
-    for run_idx, fold_idx in enumerate(fold_indices):
-        train_ids, test_ids, fold_tag = all_folds[fold_idx]
-        fold_name = f"{args.name}_cv_{fold_tag}"
-        model_path = os.path.join("models", f"{fold_name}_final.pt")
-        result_path = os.path.join("results", f"{fold_name}.jsonl")
 
-        print(f"\n{'=' * 60}")
-        print(
-            f"Fold {run_idx + 1}/{len(fold_indices)}: {fold_tag}"
-            f"  ({len(train_ids)} train / {len(test_ids)} test)"
+def run_cv_exp_das(args) -> None:
+    all_folds = get_cv_folds(
+        args.cv_mode, args.dims, seed=args.seed, n_folds=args.n_folds
+    )
+    fold_indices = list(range(len(all_folds))) if args.folds is None else args.folds
+    n_jobs = getattr(args, "n_jobs", 1)
+
+    print(f"CV mode    : {args.cv_mode}  ({len(fold_indices)}/{len(all_folds)} folds)")
+    print(f"n_epochs/fold: {args.n_epochs}  |  dims={args.dims}  |  n_jobs={n_jobs}")
+
+    # ------------------------------------------------------------------ #
+    # Run folds — in parallel when n_jobs > 1, sequentially otherwise    #
+    # ------------------------------------------------------------------ #
+
+    results_by_fold: dict[int, tuple[str, list[dict]]] = {}
+
+    if n_jobs > 1:
+        with ProcessPoolExecutor(max_workers=n_jobs) as pool:
+            futures = {
+                pool.submit(_run_single_fold, fi, all_folds[fi], args): fi
+                for fi in fold_indices
+            }
+            for future in as_completed(futures):
+                fi, fold_tag, fold_results = future.result()
+                results_by_fold[fi] = (fold_tag, fold_results)
+    else:
+        for fi in fold_indices:
+            print(f"\n{'=' * 60}")
+            fi_out, fold_tag, fold_results = _run_single_fold(fi, all_folds[fi], args)
+            results_by_fold[fi_out] = (fold_tag, fold_results)
+
+    # ------------------------------------------------------------------ #
+    # Aggregate summary (same order as fold_indices)                      #
+    # ------------------------------------------------------------------ #
+
+    fold_summaries = []
+    all_final_fitnesses: list[float] = []
+
+    for fi in fold_indices:
+        fold_tag, fold_results = results_by_fold[fi]
+        mean_ff = float(
+            np.mean([next(iter(r.values()))["final_fitness"] for r in fold_results])
         )
-        print(f"{'=' * 60}")
-
-        total_episodes = args.n_epochs * len(train_ids)
-
-        if os.path.exists(model_path):
-            print(f"  [skip training] {model_path} already exists")
-            agent = ExpDASAgent.load(model_path, obs_dim=obs_dim, n_actions=n_opt)
-        else:
-            agent = ExpDASAgent(
-                obs_dim=obs_dim,
-                n_actions=n_opt,
-                buffer_capacity=buffer_capacity,
-                actor_lr=args.actor_lr,
-                critic_lr=args.critic_lr,
-                ppo_epochs=args.ppo_epochs,
-                n_checkpoints=args.n_checkpoints,
-                device=args.device,
-            )
-            train_env = DASEnv(problem_ids=train_ids, **env_cfg)
-            train(
-                train_env=train_env,
-                test_env=None,
-                agent=agent,
-                total_episodes=total_episodes,
-                eval_interval=total_episodes + 1,
-                save_interval=args.save_interval,
-                save_dir="models",
-                name=fold_name,
-            )
-            train_env.close()
-
-        if os.path.exists(result_path):
-            print(f"  [skip evaluation] {result_path} already exists")
-            with open(result_path) as fh:
-                fold_results = [json.loads(line) for line in fh]
-        else:
-            eval_env = DASEnv(problem_ids=test_ids, **env_cfg)
-            raw = evaluate(eval_env, agent, n_episodes=len(test_ids))
-            fold_results = [{**r, "fold": fold_tag} for r in raw]
-            eval_env.close()
-            write_jsonl(result_path, fold_results)
-            print(f"  Saved results → {result_path}")
-
-        mean_best_y = float(np.mean([r["best_y"] for r in fold_results]))
         fold_summaries.append(
             {
                 "fold": fold_tag,
-                "fold_idx": fold_idx,
+                "fold_idx": fi,
                 "n_test": len(fold_results),
-                "mean_best_y": mean_best_y,
+                "mean_final_fitness": mean_ff,
             }
         )
-        print(f"  mean best_y={mean_best_y:.4e}")
-
-    all_best_y = []
-    for fold_idx in fold_indices:
-        _, _, fold_tag = all_folds[fold_idx]
-        rpath = os.path.join("results", f"{args.name}_cv_{fold_tag}.jsonl")
-        if os.path.exists(rpath):
-            with open(rpath) as fh:
-                all_best_y.extend(json.loads(line)["best_y"] for line in fh)
+        all_final_fitnesses.extend(
+            next(iter(r.values()))["final_fitness"] for r in fold_results
+        )
 
     overall = {
         "cv_mode": args.cv_mode,
@@ -197,7 +248,9 @@ def run_cv_exp_das(args) -> None:
         "dims": args.dims,
         "n_epochs_per_fold": args.n_epochs,
         "n_folds_run": len(fold_summaries),
-        "overall_mean_best_y": float(np.mean(all_best_y)) if all_best_y else None,
+        "overall_mean_final_fitness": float(np.mean(all_final_fitnesses))
+        if all_final_fitnesses
+        else None,
         "folds": fold_summaries,
     }
     summary_path = os.path.join("results", f"{args.name}_cv_summary.jsonl")
@@ -206,6 +259,8 @@ def run_cv_exp_das(args) -> None:
     print(f"\n{'=' * 60}")
     print(f"Cross-validation complete  ({args.cv_mode}  dims={args.dims})")
     print(f"  Folds run          : {len(fold_summaries)}")
-    if all_best_y:
-        print(f"  Overall mean best_y: {overall['overall_mean_best_y']:.4e}")
+    if all_final_fitnesses:
+        print(
+            f"  Overall mean final_fitness: {overall['overall_mean_final_fitness']:.4e}"
+        )
     print(f"  Summary            : {summary_path}")
diff --git a/evaluate.py b/evaluate.py
index d1140ff..da8fd06 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -12,9 +12,7 @@
 import json
 import os
 import warnings
-from itertools import product
 
-import cocoex
 import numpy as np
 from stable_baselines3 import PPO
 from stable_baselines3.common.vec_env import VecNormalize
@@ -22,36 +20,13 @@
 from tqdm import tqdm
 
 from das.env.bbob_splits import ALL_DIMS, get_train_test_split
-from das.env.das_env import DASEnv
 from das.optimizers.portfolio import get_portfolio
 from das.utils import set_seed
-from das.training.common import load_global_optima, make_das_env
+from das.training.common import compute_run_stats, load_global_optima, make_das_env
 
 warnings.filterwarnings("ignore")
 
 
-# ------------------------------------------------------------------ #
-# AOCC metric                                                          #
-# ------------------------------------------------------------------ #
-
-
-def aocc(
-    fitness_history: list[tuple[int, float]], max_fe: int, optimum: float
-) -> float:
-    lb, ub = -8.0, 8.0
-    area, prev_fe = 0.0, 0
-    for fe, f in fitness_history:
-        v = np.clip(f - optimum, 1e-8, 1e8)
-        area += (1.0 - (np.log10(v) - lb) / (ub - lb)) * (fe - prev_fe)
-        prev_fe = fe
-    if fitness_history:
-        last_v = np.clip(fitness_history[-1][1] - optimum, 1e-8, 1e8)
-        area += (1.0 - (np.log10(last_v) - lb) / (ub - lb)) * (
-            max_fe - fitness_history[-1][0]
-        )
-    return area / max_fe
-
-
 # ------------------------------------------------------------------ #
 # CLI                                                                  #
 # ------------------------------------------------------------------ #
@@ -110,42 +85,37 @@ def main():
         eval_env.norm_reward = False
 
     global_optima = load_global_optima()
-    observer = (
-        cocoex.Observer("bbob", f"result_folder: {args.name}") if args.coco else None
-    )
 
     out_path = os.path.join("results", f"{args.name}_eval.jsonl")
     results_all = []
 
     for problem_id in tqdm(test_ids, smoothing=0.0):
-        # Run one episode
         obs = eval_env.reset()
         done = [False]
-        info = {}
+        fitness_history: list[tuple[int, float]] = []
+        step_info: dict = {}
         while not done[0]:
             action, _ = model.predict(obs, deterministic=True)
             obs, _, done, infos = eval_env.step(action)
-            if done[0]:
-                info = infos[0]
-
-        best_y = info.get("best_y", float("inf"))
-        optimum = global_optima.get(problem_id, 0.0)
-        record = {
-            "problem_id": problem_id,
-            "best_y": best_y,
-            "gap": best_y - optimum,
-        }
-        results_all.append(record)
+            step_info = infos[0]
+            fitness_history.extend(step_info.get("fitness_history_step", []))
+
+        max_fe = step_info.get("n_fe", 0)
+        global_minimum = global_optima.get(problem_id, 0.0)
+        stats = compute_run_stats(fitness_history, max_fe, global_minimum)
+        results_all.append({problem_id: stats})
 
     with open(out_path, "w") as f:
         for r in results_all:
             f.write(json.dumps(r) + "\n")
 
-    gaps = [r["gap"] for r in results_all]
+    metrics = [next(iter(r.values())) for r in results_all]
     print(f"\nEvaluation complete  ({len(results_all)} problems)")
-    print(f"  Mean gap  : {np.mean(gaps):.4e}")
-    print(f"  Median gap: {np.median(gaps):.4e}")
-    print(f"  Results   : {out_path}")
+    print(
+        f"  Mean final_fitness  : {np.mean([m['final_fitness'] for m in metrics]):.4e}"
+    )
+    print(f"  Mean AOCC           : {np.mean([m['aocc'] for m in metrics]):.4f}")
+    print(f"  Results             : {out_path}")
 
 
 if __name__ == "__main__":
diff --git a/tests/test_baselines.py b/tests/test_baselines.py
index 718d8c5..a17cf49 100644
--- a/tests/test_baselines.py
+++ b/tests/test_baselines.py
@@ -4,15 +4,12 @@
 real BBOB/cocoex is needed and episodes complete in milliseconds.
 """
 
-import json
-import os
-import tempfile
-
 import numpy as np
 import pytest
 
 from das.env.das_env import DASEnv
 from das.optimizers.portfolio import get_portfolio
+from das.training.common import compute_run_stats
 from das.utils import set_seed
 from baselines import (
     collect_env_results,
@@ -62,6 +59,8 @@ def get_problem(self, problem_id: str) -> MockProblem:
 N_INDIVIDUALS = 10
 PORTFOLIO = ["SPSO", "IPSO"]
 
+METRICS_KEYS = {"area_under_optimization_curve", "aocc", "final_fitness"}
+
 
 def make_env(problem_ids=PROBLEM_IDS, suite=None):
     return DASEnv(
@@ -86,6 +85,96 @@ def make_cfg():
     )
 
 
+def _metrics(record: dict) -> dict:
+    """Extract the metrics dict from a {problem_id: metrics} record."""
+    return next(iter(record.values()))
+
+
+def _pid(record: dict) -> str:
+    """Extract the problem_id key from a {problem_id: metrics} record."""
+    return next(iter(record.keys()))
+
+
+# ------------------------------------------------------------------ #
+# 0. compute_run_stats (unit tests for the core metric function)      #
+# ------------------------------------------------------------------ #
+
+
+class TestComputeRunStats:
+    def test_aocc_in_unit_interval(self):
+        history = [(100, 5.0), (300, 2.0), (700, 0.5)]
+        stats = compute_run_stats(history, 1000, global_minimum=0.0)
+        assert 0.0 <= stats["aocc"] <= 1.0
+
+    def test_auoc_known_value(self):
+        # AUOC = integral of shifted fitness / max_fe (piecewise constant)
+        # history = [(100, 5), (300, 2), (700, 0.5)], max_fe=1000, g=0
+        # area = 5*100 + 2*200 + 0.5*400 + 0.5*300  (final plateau 700→1000)
+        #      = 500 + 400 + 200 + 150 = 1250
+        # AUOC = 1250 / 1000 = 1.25
+        history = [(100, 5.0), (300, 2.0), (700, 0.5)]
+        stats = compute_run_stats(history, 1000, global_minimum=0.0)
+        assert stats["area_under_optimization_curve"] == pytest.approx(1.25)
+
+    def test_final_fitness_is_shifted(self):
+        history = [(500, 3.0)]
+        stats = compute_run_stats(history, 1000, global_minimum=1.0)
+        assert stats["final_fitness"] == pytest.approx(2.0)  # 3.0 - 1.0
+
+    def test_final_fitness_zero_global_minimum(self):
+        history = [(500, -4.5)]
+        stats = compute_run_stats(history, 1000, global_minimum=0.0)
+        assert stats["final_fitness"] == pytest.approx(-4.5)
+
+    def test_empty_history_returns_zeros(self):
+        stats = compute_run_stats([], 1000, global_minimum=0.0)
+        assert stats["aocc"] == 0.0
+        assert stats["area_under_optimization_curve"] == 0.0
+        assert stats["final_fitness"] == 0.0
+
+    def test_single_improvement_at_start(self):
+        # Improvement at first FE, constant for the rest → AUOC = shifted_y
+        history = [(1, 2.0)]
+        stats = compute_run_stats(history, 1000, global_minimum=0.0)
+        # area = 2.0*1 (loop) + 2.0*999 (plateau) = 2000 → / 1000 = 2.0
+        assert stats["area_under_optimization_curve"] == pytest.approx(2.0)
+
+    def test_monotone_improvement_raises_aocc(self):
+        # More improvement points should yield higher AOCC
+        fast = [(10, 1.0), (20, 0.1), (30, 0.01)]
+        slow = [(10, 1.0), (500, 0.1), (900, 0.01)]
+        stats_fast = compute_run_stats(fast, 1000, global_minimum=0.0)
+        stats_slow = compute_run_stats(slow, 1000, global_minimum=0.0)
+        assert stats_fast["aocc"] > stats_slow["aocc"]
+
+    def test_all_keys_present(self):
+        history = [(100, 1.0)]
+        stats = compute_run_stats(history, 1000, global_minimum=0.0)
+        assert METRICS_KEYS == set(stats.keys())
+
+    def test_global_minimum_shifts_auoc(self):
+        history = [(500, 5.0)]
+        stats_g0 = compute_run_stats(history, 1000, global_minimum=0.0)
+        stats_g2 = compute_run_stats(history, 1000, global_minimum=2.0)
+        # With g=2, shifted=3 < shifted=5 → lower AUOC
+        assert (
+            stats_g2["area_under_optimization_curve"]
+            < stats_g0["area_under_optimization_curve"]
+        )
+
+    def test_aocc_perfect_convergence(self):
+        # If the optimizer reaches very close to the optimum immediately, AOCC ≈ 1
+        history = [(1, 1e-7)]  # shifted ≈ 1e-7 ≈ lb → normalized ≈ 0 → (1-0)*width
+        stats = compute_run_stats(history, 1000, global_minimum=0.0)
+        assert stats["aocc"] > 0.9
+
+    def test_aocc_no_convergence(self):
+        # If the optimizer stays near ub (1e8), AOCC ≈ 0
+        history = [(1, 1e8)]
+        stats = compute_run_stats(history, 1000, global_minimum=0.0)
+        assert stats["aocc"] < 0.1
+
+
 # ------------------------------------------------------------------ #
 # 1. Policy functions                                                  #
 # ------------------------------------------------------------------ #
@@ -124,21 +213,55 @@ def test_fixed_policy_ignores_obs(self):
 
 
 class TestRunEpisode:
-    def test_returns_info_dict_with_best_y(self):
+    def test_returns_two_element_tuple(self):
         env = make_env()
-        info = run_episode(env, random_policy)
-        assert "best_y" in info
-        assert isinstance(info["best_y"], float)
+        result = run_episode(env, random_policy)
+        assert isinstance(result, tuple) and len(result) == 2
 
-    def test_fixed_policy_runs_full_episode(self):
+    def test_step_info_has_best_y(self):
         env = make_env()
-        info = run_episode(env, fixed_policy(0))
-        assert "best_y" in info
+        step_info, _ = run_episode(env, random_policy)
+        assert "best_y" in step_info
+        assert isinstance(step_info["best_y"], float)
 
     def test_best_y_is_finite(self):
         env = make_env()
-        info = run_episode(env, random_policy)
-        assert np.isfinite(info["best_y"])
+        step_info, _ = run_episode(env, random_policy)
+        assert np.isfinite(step_info["best_y"])
+
+    def test_fitness_history_is_list_of_tuples(self):
+        env = make_env()
+        _, fitness_history = run_episode(env, random_policy)
+        assert isinstance(fitness_history, list)
+        for fe, y in fitness_history:
+            assert isinstance(fe, int)
+            assert isinstance(y, float)
+
+    def test_fitness_history_fe_is_increasing(self):
+        env = make_env()
+        _, fitness_history = run_episode(env, random_policy)
+        fes = [fe for fe, _ in fitness_history]
+        assert fes == sorted(fes)
+
+    def test_fitness_history_y_is_nonincreasing(self):
+        """Improvement history must be strictly improving (best-so-far never worsens)."""
+        env = make_env()
+        _, fitness_history = run_episode(env, random_policy)
+        ys = [y for _, y in fitness_history]
+        for i in range(1, len(ys)):
+            assert ys[i] < ys[i - 1]
+
+    def test_fitness_history_nonempty_after_episode(self):
+        """At least one improvement must occur (first evaluation beats inf)."""
+        env = make_env()
+        _, fitness_history = run_episode(env, random_policy)
+        assert len(fitness_history) >= 1
+
+    def test_fixed_policy_runs_full_episode(self):
+        env = make_env()
+        step_info, fitness_history = run_episode(env, fixed_policy(0))
+        assert np.isfinite(step_info["best_y"])
+        assert len(fitness_history) >= 1
 
     def test_episode_advances_problem_idx(self):
         env = make_env()
@@ -173,7 +296,7 @@ def test_returns_one_record_per_problem(self):
         )
         assert len(records) == 2
 
-    def test_record_structure(self):
+    def test_record_is_nested_dict(self):
         suite = MockSuite()
         records = collect_env_results(
             "random",
@@ -185,35 +308,74 @@ def test_record_structure(self):
             {},
         )
         r = records[0]
-        assert set(r.keys()) >= {"problem_id", "agent", "best_y", "gap"}
+        assert len(r) == 1
+        pid, metrics = next(iter(r.items()))
+        assert pid == PROBLEM_IDS[0]
+        assert METRICS_KEYS <= set(metrics.keys())
 
-    def test_agent_tag_in_records(self):
+    def test_all_metric_keys_present(self):
         suite = MockSuite()
         records = collect_env_results(
-            "fixed:SPSO",
-            fixed_policy(0),
+            "random",
+            random_policy,
             PROBLEM_IDS[:1],
             suite,
             get_portfolio(PORTFOLIO),
             make_cfg(),
             {},
         )
-        assert records[0]["agent"] == "fixed:SPSO"
+        metrics = _metrics(records[0])
+        assert METRICS_KEYS <= set(metrics.keys())
 
-    def test_gap_equals_best_y_minus_optimum(self):
+    def test_aocc_in_unit_interval(self):
         suite = MockSuite()
-        optima = {PROBLEM_IDS[0]: 1.0}
         records = collect_env_results(
             "random",
             random_policy,
+            PROBLEM_IDS[:2],
+            suite,
+            get_portfolio(PORTFOLIO),
+            make_cfg(),
+            {},
+        )
+        for r in records:
+            assert 0.0 <= _metrics(r)["aocc"] <= 1.0
+
+    def test_final_fitness_is_finite(self):
+        suite = MockSuite()
+        records = collect_env_results(
+            "random",
+            random_policy,
+            PROBLEM_IDS[:2],
+            suite,
+            get_portfolio(PORTFOLIO),
+            make_cfg(),
+            {},
+        )
+        for r in records:
+            assert np.isfinite(_metrics(r)["final_fitness"])
+
+    def test_final_fitness_shifted_by_global_minimum(self):
+        # Use compute_run_stats directly with a fixed history to verify shift logic.
+        history = [(100, 5.0)]
+        stats_g0 = compute_run_stats(history, 1000, global_minimum=0.0)
+        stats_g1 = compute_run_stats(history, 1000, global_minimum=1.0)
+        assert stats_g0["final_fitness"] - stats_g1["final_fitness"] == pytest.approx(
+            1.0
+        )
+
+    def test_agent_tag_in_metrics(self):
+        suite = MockSuite()
+        records = collect_env_results(
+            "fixed:SPSO",
+            fixed_policy(0),
             PROBLEM_IDS[:1],
             suite,
             get_portfolio(PORTFOLIO),
             make_cfg(),
-            optima,
+            {},
         )
-        r = records[0]
-        assert abs(r["gap"] - (r["best_y"] - 1.0)) < 1e-9
+        assert _metrics(records[0])["agent"] == "fixed:SPSO"
 
     def test_problem_ids_match_order(self):
         suite = MockSuite()
@@ -227,14 +389,10 @@ def test_problem_ids_match_order(self):
             make_cfg(),
             {},
         )
-        assert [r["problem_id"] for r in records] == ids
+        assert [_pid(r) for r in records] == ids
 
     def test_fixed_policy_only_uses_one_action(self):
-        """A fixed-action policy must always select the same optimizer.
-
-        We verify this by checking that env._choices_history contains only
-        the chosen action index after one complete episode.
-        """
+        """A fixed-action policy must always select the same optimizer."""
         suite = MockSuite()
         for action_idx in range(len(PORTFOLIO)):
             env = make_env(suite=suite)
@@ -250,21 +408,33 @@ def test_fixed_policy_only_uses_one_action(self):
 
 
 class TestSingleAlgorithm:
-    def test_run_single_returns_float(self):
+    def test_run_single_returns_metrics_dict(self):
+        opt_class = get_portfolio(["SPSO"])[0]
+        problem = MockProblem("test_p", dim=2)
+        stats = run_single_algorithm(opt_class, problem, FE_MULTIPLIER, N_INDIVIDUALS)
+        assert isinstance(stats, dict)
+        assert METRICS_KEYS <= set(stats.keys())
+
+    def test_run_single_all_metrics_finite(self):
+        opt_class = get_portfolio(["SPSO"])[0]
+        problem = MockProblem("test_p", dim=2)
+        stats = run_single_algorithm(opt_class, problem, FE_MULTIPLIER, N_INDIVIDUALS)
+        assert np.isfinite(stats["final_fitness"])
+        assert np.isfinite(stats["area_under_optimization_curve"])
+        assert np.isfinite(stats["aocc"])
+
+    def test_run_single_aocc_in_unit_interval(self):
         opt_class = get_portfolio(["SPSO"])[0]
         problem = MockProblem("test_p", dim=2)
-        result = run_single_algorithm(opt_class, problem, FE_MULTIPLIER, N_INDIVIDUALS)
-        assert isinstance(result, float)
-        assert np.isfinite(result)
+        stats = run_single_algorithm(opt_class, problem, FE_MULTIPLIER, N_INDIVIDUALS)
+        assert 0.0 <= stats["aocc"] <= 1.0
 
-    def test_run_single_uses_full_budget(self):
-        """The optimizer must receive the full budget (fe_multiplier × dim)."""
+    def test_run_single_larger_budget_not_worse(self):
         opt_class = get_portfolio(["SPSO"])[0]
         problem = MockProblem("test_p", dim=2)
-        result_small = run_single_algorithm(opt_class, problem, 2, N_INDIVIDUALS)
-        result_large = run_single_algorithm(opt_class, problem, 50, N_INDIVIDUALS)
-        # Larger budget should not produce a worse result
-        assert result_large <= result_small + 1e-6
+        small = run_single_algorithm(opt_class, problem, 2, N_INDIVIDUALS)
+        large = run_single_algorithm(opt_class, problem, 50, N_INDIVIDUALS)
+        assert large["final_fitness"] <= small["final_fitness"] + 1e-6
 
     def test_collect_single_returns_one_record_per_problem(self):
         suite = MockSuite()
@@ -293,11 +463,26 @@ def test_collect_single_record_structure(self):
             {},
         )
         r = records[0]
-        assert set(r.keys()) >= {"problem_id", "agent", "best_y", "gap"}
-        assert r["agent"] == "single:SPSO"
+        metrics = _metrics(r)
+        assert METRICS_KEYS <= set(metrics.keys())
+        assert metrics["agent"] == "single:SPSO"
+
+    def test_collect_single_aocc_in_unit_interval(self):
+        suite = MockSuite()
+        opt_class = get_portfolio(["SPSO"])[0]
+        records = collect_single_results(
+            "single:SPSO",
+            opt_class,
+            PROBLEM_IDS[:2],
+            suite,
+            FE_MULTIPLIER,
+            N_INDIVIDUALS,
+            {},
+        )
+        for r in records:
+            assert 0.0 <= _metrics(r)["aocc"] <= 1.0
 
-    def test_single_vs_fixed_both_finite(self):
-        """Both single and fixed-policy agents should produce finite gaps."""
+    def test_single_vs_fixed_both_have_finite_final_fitness(self):
         suite = MockSuite()
         opt_class = get_portfolio(["SPSO"])[0]
 
@@ -320,7 +505,7 @@ def test_single_vs_fixed_both_finite(self):
             {},
         )
         for r in single_records + fixed_records:
-            assert np.isfinite(r["gap"])
+            assert np.isfinite(_metrics(r)["final_fitness"])
 
 
 # ------------------------------------------------------------------ #
@@ -330,54 +515,80 @@ def test_single_vs_fixed_both_finite(self):
 
 class TestComputeOracle:
     def _make_results(self):
-        """Two agents with known per-problem gaps for testing."""
+        """Two agents with known per-problem metrics for testing."""
         return {
             "agent_A": [
-                {"problem_id": "p1", "agent": "agent_A", "best_y": 5.0, "gap": 5.0},
-                {"problem_id": "p2", "agent": "agent_A", "best_y": 3.0, "gap": 3.0},
+                {
+                    "p1": {
+                        "final_fitness": 5.0,
+                        "aocc": 0.3,
+                        "area_under_optimization_curve": 5.0,
+                        "agent": "agent_A",
+                    }
+                },
+                {
+                    "p2": {
+                        "final_fitness": 3.0,
+                        "aocc": 0.5,
+                        "area_under_optimization_curve": 3.0,
+                        "agent": "agent_A",
+                    }
+                },
             ],
             "agent_B": [
-                {"problem_id": "p1", "agent": "agent_B", "best_y": 2.0, "gap": 2.0},
-                {"problem_id": "p2", "agent": "agent_B", "best_y": 8.0, "gap": 8.0},
+                {
+                    "p1": {
+                        "final_fitness": 2.0,
+                        "aocc": 0.7,
+                        "area_under_optimization_curve": 2.0,
+                        "agent": "agent_B",
+                    }
+                },
+                {
+                    "p2": {
+                        "final_fitness": 8.0,
+                        "aocc": 0.1,
+                        "area_under_optimization_curve": 8.0,
+                        "agent": "agent_B",
+                    }
+                },
             ],
         }
 
-    def test_oracle_best_picks_minimum_gap(self):
+    def test_oracle_best_picks_minimum_final_fitness(self):
         best, _ = compute_oracle(self._make_results())
-        by_pid = {r["problem_id"]: r for r in best}
-        assert by_pid["p1"]["gap"] == 2.0  # agent_B wins
-        assert by_pid["p2"]["gap"] == 3.0  # agent_A wins
+        by_pid = {_pid(r): _metrics(r) for r in best}
+        assert by_pid["p1"]["final_fitness"] == 2.0  # agent_B wins
+        assert by_pid["p2"]["final_fitness"] == 3.0  # agent_A wins
 
-    def test_oracle_worst_picks_maximum_gap(self):
+    def test_oracle_worst_picks_maximum_final_fitness(self):
         _, worst = compute_oracle(self._make_results())
-        by_pid = {r["problem_id"]: r for r in worst}
-        assert by_pid["p1"]["gap"] == 5.0  # agent_A is worst
-        assert by_pid["p2"]["gap"] == 8.0  # agent_B is worst
+        by_pid = {_pid(r): _metrics(r) for r in worst}
+        assert by_pid["p1"]["final_fitness"] == 5.0  # agent_A is worst
+        assert by_pid["p2"]["final_fitness"] == 8.0  # agent_B is worst
 
     def test_oracle_best_records_winning_agent(self):
         best, _ = compute_oracle(self._make_results())
-        by_pid = {r["problem_id"]: r for r in best}
+        by_pid = {_pid(r): _metrics(r) for r in best}
         assert by_pid["p1"]["best_agent"] == "agent_B"
         assert by_pid["p2"]["best_agent"] == "agent_A"
 
     def test_oracle_worst_records_worst_agent(self):
         _, worst = compute_oracle(self._make_results())
-        by_pid = {r["problem_id"]: r for r in worst}
+        by_pid = {_pid(r): _metrics(r) for r in worst}
         assert by_pid["p1"]["worst_agent"] == "agent_A"
         assert by_pid["p2"]["worst_agent"] == "agent_B"
 
     def test_oracle_covers_all_problems(self):
         best, worst = compute_oracle(self._make_results())
-        pids_best = {r["problem_id"] for r in best}
-        pids_worst = {r["problem_id"] for r in worst}
-        assert pids_best == {"p1", "p2"}
-        assert pids_worst == {"p1", "p2"}
+        assert {_pid(r) for r in best} == {"p1", "p2"}
+        assert {_pid(r) for r in worst} == {"p1", "p2"}
 
     def test_oracle_best_le_oracle_worst(self):
-        """Per problem, oracle-best gap ≤ oracle-worst gap."""
+        """Per problem, oracle-best final_fitness ≤ oracle-worst final_fitness."""
         best, worst = compute_oracle(self._make_results())
-        best_by_pid = {r["problem_id"]: r["gap"] for r in best}
-        worst_by_pid = {r["problem_id"]: r["gap"] for r in worst}
+        best_by_pid = {_pid(r): _metrics(r)["final_fitness"] for r in best}
+        worst_by_pid = {_pid(r): _metrics(r)["final_fitness"] for r in worst}
         for pid in best_by_pid:
             assert best_by_pid[pid] <= worst_by_pid[pid]
 
@@ -387,34 +598,76 @@ def test_oracle_with_real_runs(self):
         ids = PROBLEM_IDS[:2]
         cfg = make_cfg()
         optimizers = get_portfolio(PORTFOLIO)
-        global_optima = {}
 
         records_spso = collect_env_results(
-            "fixed:SPSO", fixed_policy(0), ids, suite, optimizers, cfg, global_optima
+            "fixed:SPSO", fixed_policy(0), ids, suite, optimizers, cfg, {}
         )
         records_ipso = collect_env_results(
-            "fixed:IPSO", fixed_policy(1), ids, suite, optimizers, cfg, global_optima
+            "fixed:IPSO", fixed_policy(1), ids, suite, optimizers, cfg, {}
         )
 
         best, worst = compute_oracle(
-            {
-                "fixed:SPSO": records_spso,
-                "fixed:IPSO": records_ipso,
-            }
+            {"fixed:SPSO": records_spso, "fixed:IPSO": records_ipso}
         )
         assert len(best) == len(ids)
         assert len(worst) == len(ids)
         for b, w in zip(
-            sorted(best, key=lambda r: r["problem_id"]),
-            sorted(worst, key=lambda r: r["problem_id"]),
+            sorted(best, key=_pid),
+            sorted(worst, key=_pid),
         ):
-            assert b["gap"] <= w["gap"] + 1e-9
+            assert _metrics(b)["final_fitness"] <= _metrics(w)["final_fitness"] + 1e-9
 
 
 # ------------------------------------------------------------------ #
 # 6. summarise                                                         #
 # ------------------------------------------------------------------ #
 
+
+class TestSummarise:
+    def _records(self, final_fitnesses, aocc_values=None):
+        if aocc_values is None:
+            aocc_values = [0.5] * len(final_fitnesses)
+        return [
+            {
+                f"p{i}": {
+                    "final_fitness": ff,
+                    "aocc": a,
+                    "area_under_optimization_curve": ff,
+                    "agent": "x",
+                }
+            }
+            for i, (ff, a) in enumerate(zip(final_fitnesses, aocc_values))
+        ]
+
+    def test_mean_final_fitness(self):
+        s = summarise("x", self._records([1.0, 3.0]))
+        assert s["mean_final_fitness"] == pytest.approx(2.0)
+
+    def test_median_final_fitness(self):
+        s = summarise("x", self._records([1.0, 2.0, 9.0]))
+        assert s["median_final_fitness"] == pytest.approx(2.0)
+
+    def test_best_final_fitness(self):
+        s = summarise("x", self._records([5.0, 1.0, 3.0]))
+        assert s["best_final_fitness"] == pytest.approx(1.0)
+
+    def test_worst_final_fitness(self):
+        s = summarise("x", self._records([5.0, 1.0, 3.0]))
+        assert s["worst_final_fitness"] == pytest.approx(5.0)
+
+    def test_mean_aocc(self):
+        s = summarise("x", self._records([1.0, 2.0], aocc_values=[0.4, 0.6]))
+        assert s["mean_aocc"] == pytest.approx(0.5)
+
+    def test_n_problems(self):
+        s = summarise("x", self._records([1.0, 2.0, 3.0]))
+        assert s["n_problems"] == 3
+
+    def test_agent_field(self):
+        s = summarise("my_agent", self._records([1.0]))
+        assert s["agent"] == "my_agent"
+
+
 # ------------------------------------------------------------------ #
 # 7. Reproducibility via seed                                          #
 # ------------------------------------------------------------------ #
@@ -439,36 +692,46 @@ def test_same_seed_same_best_y(self):
         for seed in [0, 42, 999]:
             set_seed(seed)
             env1 = self._seeded_env(seed)
-            info1 = run_episode(env1, fixed_policy(0))
+            step_info1, _ = run_episode(env1, fixed_policy(0))
 
             set_seed(seed)
             env2 = self._seeded_env(seed)
-            info2 = run_episode(env2, fixed_policy(0))
+            step_info2, _ = run_episode(env2, fixed_policy(0))
 
-            assert info1["best_y"] == pytest.approx(info2["best_y"]), (
-                f"seed={seed}: {info1['best_y']} != {info2['best_y']}"
+            assert step_info1["best_y"] == pytest.approx(step_info2["best_y"]), (
+                f"seed={seed}: {step_info1['best_y']} != {step_info2['best_y']}"
             )
 
+    def test_same_seed_same_aocc(self):
+        """Two envs with the same seed produce identical AOCC via identical fitness history."""
+        for seed in [0, 42]:
+            set_seed(seed)
+            env1 = self._seeded_env(seed)
+            _, hist1 = run_episode(env1, fixed_policy(0))
+
+            set_seed(seed)
+            env2 = self._seeded_env(seed)
+            _, hist2 = run_episode(env2, fixed_policy(0))
+
+            assert hist1 == hist2
+
     def test_different_seeds_different_best_y(self):
         """Different seeds produce different optimizer trajectories (with high probability)."""
         results = set()
         for seed in [0, 1, 2, 3, 4]:
             set_seed(seed)
             env = self._seeded_env(seed)
-            info = run_episode(env, fixed_policy(0))
-            results.add(round(info["best_y"], 8))
+            step_info, _ = run_episode(env, fixed_policy(0))
+            results.add(round(step_info["best_y"], 8))
         # With 5 different seeds, at least 2 distinct outcomes are expected
         assert len(results) > 1
 
     def test_seed_rng_set_in_optimizer_options(self):
         """DASEnv must pass seed_rng to pypop7 when env._seed is set."""
-        from unittest.mock import patch, MagicMock
-
         env = self._seeded_env(seed=42)
         env.reset()
 
         captured_options = {}
-
         original_class = get_portfolio(PORTFOLIO)[0]
 
         class CapturingOptimizer(original_class):
@@ -486,8 +749,6 @@ def __init__(self, problem, options):
 
     def test_no_seed_does_not_set_seed_rng(self):
         """Without a seed, seed_rng must NOT be injected (pypop7 uses random state)."""
-        from unittest.mock import patch
-
         env = DASEnv(
             problem_ids=PROBLEM_IDS[:1],
             suite=MockSuite(),
@@ -538,29 +799,64 @@ def __init__(self, problem, options):
         )
 
 
-class TestSummarise:
-    def _records(self, gaps):
-        return [
-            {"problem_id": f"p{i}", "agent": "x", "best_y": g, "gap": g}
-            for i, g in enumerate(gaps)
-        ]
+# ------------------------------------------------------------------ #
+# 8. DASEnv step info — fitness_history_step                          #
+# ------------------------------------------------------------------ #
 
-    def test_mean_gap(self):
-        s = summarise("x", self._records([1.0, 3.0]))
-        assert s["mean_gap"] == pytest.approx(2.0)
 
-    def test_median_gap(self):
-        s = summarise("x", self._records([1.0, 2.0, 9.0]))
-        assert s["median_gap"] == pytest.approx(2.0)
+class TestFitnessHistoryStep:
+    def test_step_info_contains_fitness_history_step(self):
+        env = make_env()
+        env.reset()
+        _, _, _, _, info = env.step(0)
+        assert "fitness_history_step" in info
 
-    def test_best_gap(self):
-        s = summarise("x", self._records([5.0, 1.0, 3.0]))
-        assert s["best_gap"] == pytest.approx(1.0)
+    def test_fitness_history_step_is_list(self):
+        env = make_env()
+        env.reset()
+        _, _, _, _, info = env.step(0)
+        assert isinstance(info["fitness_history_step"], list)
 
-    def test_worst_gap(self):
-        s = summarise("x", self._records([5.0, 1.0, 3.0]))
-        assert s["worst_gap"] == pytest.approx(5.0)
+    def test_fitness_history_step_entries_are_int_float_tuples(self):
+        env = make_env()
+        env.reset()
+        _, _, _, _, info = env.step(0)
+        for fe, y in info["fitness_history_step"]:
+            assert isinstance(fe, int)
+            assert isinstance(y, float)
 
-    def test_n_problems(self):
-        s = summarise("x", self._records([1.0, 2.0, 3.0]))
-        assert s["n_problems"] == 3
+    def test_fitness_history_step_fe_within_budget(self):
+        env = make_env()
+        env.reset()
+        _, _, _, _, info = env.step(0)
+        max_fe = FE_MULTIPLIER * 2  # dim=2
+        for fe, _ in info["fitness_history_step"]:
+            assert 1 <= fe <= max_fe
+
+    def test_fitness_history_step_accumulated_across_checkpoints(self):
+        """Full episode fitness history must contain at least as many points as one step."""
+        env = make_env()
+        env.reset()
+        all_history = []
+        done = False
+        while not done:
+            _, _, terminated, truncated, info = env.step(0)
+            done = terminated or truncated
+            all_history.extend(info["fitness_history_step"])
+
+        # At minimum one improvement in the first checkpoint (from inf)
+        assert len(all_history) >= 1
+
+    def test_fitness_history_step_fe_monotone_across_episode(self):
+        """FE values accumulated across all checkpoints must be strictly increasing."""
+        env = make_env()
+        env.reset()
+        all_history = []
+        done = False
+        while not done:
+            _, _, terminated, truncated, info = env.step(0)
+            done = terminated or truncated
+            all_history.extend(info["fitness_history_step"])
+
+        fes = [fe for fe, _ in all_history]
+        assert fes == sorted(set(fes)), "Duplicate or out-of-order FE values in history"

From f13fdd81de67b1ed60f6a7b97b8d1b15b0630b1c Mon Sep 17 00:00:00 2001
From: wlnc <wladyslaw.niec@unit8.co>
Date: Sun, 24 May 2026 12:50:51 +0200
Subject: [PATCH 7/8] add parallelism check to smoke tests

---
 run_local.sh  | 4 ++++
 smoke_test.sh | 5 +++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/run_local.sh b/run_local.sh
index f1363fb..211aec8 100755
--- a/run_local.sh
+++ b/run_local.sh
@@ -44,6 +44,10 @@ case "$AGENT" in
         python cv.py exp-das ${PORTFOLIO_STR}_EXPDAS_CV_LOCAL_SEED${SEED} \
             -p "${PORTFOLIO[@]}" --dims 2 --cv-mode LOIO --n-epochs 1 --seed $SEED --fe-multiplier 10  --n-checkpoints 3
         ;;
+    exp-das-cv-par)
+        python cv.py exp-das ${PORTFOLIO_STR}_EXPDAS_CV_PAR_LOCAL_SEED${SEED} \
+            -p "${PORTFOLIO[@]}" --dims 2 --cv-mode LOIO --n-epochs 1 --seed $SEED --fe-multiplier 10 --n-checkpoints 3 --n-jobs 2
+        ;;
     baselines)
         python baselines.py ${PORTFOLIO_STR}_BASELINES_LOCAL_SEED${SEED} \
             -p "${PORTFOLIO[@]}" --agent all -d 2 --seed $SEED  --fe-multiplier 10 --n-checkpoints 3
diff --git a/smoke_test.sh b/smoke_test.sh
index 83d606f..4dd856f 100755
--- a/smoke_test.sh
+++ b/smoke_test.sh
@@ -32,7 +32,7 @@ run_smoke() {
 if [ "$#" -gt 0 ]; then
     AGENTS=("$@")
 else
-    AGENTS=(ppo ppo-cv rl-das rl-das-cv exp-das exp-das-cv baselines)
+    AGENTS=(ppo ppo-cv rl-das rl-das-cv exp-das exp-das-cv exp-das-cv-par baselines)
 fi
 
 for agent in "${AGENTS[@]}"; do
@@ -42,7 +42,8 @@ for agent in "${AGENTS[@]}"; do
         rl-das)     run_smoke "rl-das train"  rl-das    ;;
         rl-das-cv)  run_smoke "rl-das cv"     rl-das-cv ;;
         exp-das)    run_smoke "exp-das train" exp-das   ;;
-        exp-das-cv) run_smoke "exp-das cv"    exp-das-cv;;
+        exp-das-cv)     run_smoke "exp-das cv"         exp-das-cv    ;;
+        exp-das-cv-par) run_smoke "exp-das cv parallel" exp-das-cv-par;;
         baselines)  run_smoke "baselines"     baselines ;;
         *)          echo "Unknown agent: $agent"; exit 1 ;;
     esac

From b0acdb281a1cc61d6add49672d3bb52a80a6ca8d Mon Sep 17 00:00:00 2001
From: wlnc <wladyslaw.niec@unit8.co>
Date: Sun, 24 May 2026 13:13:43 +0200
Subject: [PATCH 8/8] aocc logging fix

---
 baselines.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/baselines.py b/baselines.py
index 909e983..0cb7dc9 100644
--- a/baselines.py
+++ b/baselines.py
@@ -210,6 +210,7 @@ def compute_oracle(all_results: dict[str, list[dict]]) -> tuple[list[dict], list
                     "area_under_optimization_curve": best_m[
                         "area_under_optimization_curve"
                     ],
+                    "aocc": best_m["aocc"],
                     "final_fitness": best_m["final_fitness"],
                     "agent": "oracle-best",
                     "best_agent": best_m["agent"],
@@ -222,6 +223,7 @@ def compute_oracle(all_results: dict[str, list[dict]]) -> tuple[list[dict], list
                     "area_under_optimization_curve": worst_m[
                         "area_under_optimization_curve"
                     ],
+                    "aocc": worst_m["aocc"],
                     "final_fitness": worst_m["final_fitness"],
                     "agent": "oracle-worst",
                     "worst_agent": worst_m["agent"],