From d1028c3c5619a8ad1f131e8336467a5033967102 Mon Sep 17 00:00:00 2001
From: Grzmro <106807538+Grzmro@users.noreply.github.com>
Date: Tue, 19 May 2026 10:53:35 +0200
Subject: [PATCH 1/5] fix: reward scaling, PPO clipping, ELA memory cap, and
 eval metrics

---
 agents/rl_das/agent.py   | 21 +++++++-------
 agents/rl_das/env.py     |  7 +++++
 agents/rl_das/network.py |  7 ++++-
 agents/rl_das/trainer.py | 27 ++++++++++++++----
 das/env/bbob_splits.py   |  6 ++--
 das/env/das_env.py       | 60 +++++++++++++++++++++++++++++++++-------
 das/env/observation.py   | 37 ++++++++++++++++---------
 das/env/reward.py        | 10 ++++---
 das/training/rldas.py    | 22 ++++++++++++---
 9 files changed, 146 insertions(+), 51 deletions(-)

diff --git a/agents/rl_das/agent.py b/agents/rl_das/agent.py
index 8def163..29a8ea1 100644
--- a/agents/rl_das/agent.py
+++ b/agents/rl_das/agent.py
@@ -194,17 +194,16 @@ def learn(self, k_epoch: int, bootstrap_value: float = 0.0) -> dict[str, float]:
             )
             actor_loss = -torch.min(surr1, surr2).mean()
 
-            # Value clipping (like PPO v2) from the 2nd epoch onward
-            if epoch_idx > 0:
-                values_clipped = old_values_t + torch.clamp(
-                    values - old_values_t, -self.eps_clip, self.eps_clip
-                )
-                critic_loss = torch.max(
-                    (values - returns_t.detach()) ** 2,
-                    (values_clipped - returns_t.detach()) ** 2,
-                ).mean()
-            else:
-                critic_loss = (values - returns_t.detach()).pow(2).mean()
+            # Value clipping applied from the first inner epoch.  Skipping it
+            # on epoch 0 allowed an unconstrained large update on the first step,
+            # breaking the PPO v2 guarantee that value changes stay within eps_clip.
+            values_clipped = old_values_t + torch.clamp(
+                values - old_values_t, -self.eps_clip, self.eps_clip
+            )
+            critic_loss = torch.max(
+                (values - returns_t.detach()) ** 2,
+                (values_clipped - returns_t.detach()) ** 2,
+            ).mean()
 
             loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy
 
diff --git a/agents/rl_das/env.py b/agents/rl_das/env.py
index d438147..f3de680 100644
--- a/agents/rl_das/env.py
+++ b/agents/rl_das/env.py
@@ -242,6 +242,13 @@ def __init__(
         self._best_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)]
         self._worst_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)]
 
+    @property
+    def problem_ids(self) -> list[str]:
+        # Public accessor — callers should not reach into _problem_ids directly
+        # because it is filtered (dimension-matched) and may differ from the
+        # original list passed to the constructor.
+        return self._problem_ids
+
     # ------------------------------------------------------------------
     # Gymnasium interface
     # ------------------------------------------------------------------
diff --git a/agents/rl_das/network.py b/agents/rl_das/network.py
index 16e08ab..fe4e48d 100644
--- a/agents/rl_das/network.py
+++ b/agents/rl_das/network.py
@@ -32,7 +32,9 @@ def __init__(self, dim: int) -> None:
             nn.Linear(dim, 64),
             nn.ReLU(),
             nn.Linear(64, 1),
-            nn.ReLU(),
+            # No second ReLU: movement vectors are signed displacements.
+            # Clamping to >= 0 discards direction — the network cannot tell
+            # whether the optimizer stepped left or right in search space.
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -94,4 +96,7 @@ def __init__(self, dim: int, n_opt: int) -> None:
         self.head = nn.Linear(16, 1)
 
     def forward(self, obs: torch.Tensor) -> torch.Tensor:
+        # Mirror Actor's NaN guard: a NaN value estimate flows into advantages
+        # and turns every gradient into NaN during backward(), corrupting the update.
+        obs = torch.nan_to_num(obs, nan=0.0, posinf=1.0, neginf=-1.0)
         return self.head(self.backbone(obs)).squeeze(-1)  # (batch,)
diff --git a/agents/rl_das/trainer.py b/agents/rl_das/trainer.py
index ee9b343..0d6d261 100644
--- a/agents/rl_das/trainer.py
+++ b/agents/rl_das/trainer.py
@@ -102,18 +102,24 @@ def train(
     """
     Path(save_dir).mkdir(parents=True, exist_ok=True)
     log: list[dict] = []
-    n_train = len(train_env._problem_ids)
+    n_train = len(train_env.problem_ids)
 
     for epoch in range(1, n_epochs + 1):
-        epoch_rewards = []
+        epoch_rewards: list[float] = []
+        epoch_diagnostics: list[dict] = []
         epoch_start = time.time()
 
         for _ in range(n_train):
             ep = _run_episode(train_env, agent, deterministic=False)
             epoch_rewards.append(ep["total_reward"])
 
-            agent.learn(k_epoch)
+            # bootstrap_value=0.0 is correct: this env only terminates naturally
+            # (terminated=True, truncated always False), so the last done=True flag
+            # already zeroes future returns — no critic bootstrap is needed.
+            diag = agent.learn(k_epoch, bootstrap_value=0.0)
             agent.rollout.clear()
+            if diag:
+                epoch_diagnostics.append(diag)
 
         mean_train_reward = float(np.mean(epoch_rewards))
         entry: dict = {
@@ -122,9 +128,16 @@ def train(
             "elapsed_s": round(time.time() - epoch_start, 2),
         }
 
+        # Log per-epoch PPO diagnostics so training instability is visible
+        # (e.g. actor_loss explosion, entropy collapse) without manual debugging.
+        if epoch_diagnostics:
+            entry["actor_loss"] = float(np.mean([d["actor_loss"] for d in epoch_diagnostics]))
+            entry["critic_loss"] = float(np.mean([d["critic_loss"] for d in epoch_diagnostics]))
+            entry["entropy"] = float(np.mean([d["entropy"] for d in epoch_diagnostics]))
+
         if epoch % eval_interval == 0:
             test_results = evaluate(
-                test_env, agent, n_episodes=len(test_env._problem_ids)
+                test_env, agent, n_episodes=len(test_env.problem_ids)
             )
             entry["mean_test_reward"] = float(
                 np.mean([r["total_reward"] for r in test_results])
@@ -137,12 +150,16 @@ def train(
                 f"  train_r={mean_train_reward:.4f}"
                 f"  test_r={entry['mean_test_reward']:.4f}"
                 f"  test_best_y={entry['mean_test_best_y']:.4e}"
+                f"  actor_loss={entry.get('actor_loss', float('nan')):.4f}"
+                f"  entropy={entry.get('entropy', float('nan')):.4f}"
                 f"  ({entry['elapsed_s']:.1f}s)"
             )
         else:
             print(
                 f"Epoch {epoch:4d}/{n_epochs}"
                 f"  train_r={mean_train_reward:.4f}"
+                f"  actor_loss={entry.get('actor_loss', float('nan')):.4f}"
+                f"  entropy={entry.get('entropy', float('nan')):.4f}"
                 f"  ({entry['elapsed_s']:.1f}s)"
             )
 
@@ -186,7 +203,7 @@ def evaluate(
     List of dicts with keys: problem_id, total_reward, best_y, n_fe.
     """
     if n_episodes is None:
-        n_episodes = len(env._problem_ids)
+        n_episodes = len(env.problem_ids)
 
     results = []
     for _ in range(n_episodes):
diff --git a/das/env/bbob_splits.py b/das/env/bbob_splits.py
index bc9a8da..8794163 100644
--- a/das/env/bbob_splits.py
+++ b/das/env/bbob_splits.py
@@ -7,7 +7,7 @@
 ALL_DIMS = [2, 3, 5, 10, 20, 40]
 ALL_FUNCTIONS = set(range(1, 25))
 INSTANCE_IDS = [1, 2, 3, 4, 5, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80]
-EASY_TRAIN_FUNCTIONS = {4, *range(6, 15), 18, 19, 20, 22, 23, 24}
+EASY_TRAIN_FUNCTIONS = {1, 2, 3, 4, *range(6, 15), 18, 19, 20, 22, 23, 24} # Czy tutaj nie powinny być też funkcje 1,2,3?
 
 
 def build_problem_ids(
@@ -22,7 +22,7 @@ def build_problem_ids(
     ]
 
 
-def get_train_test_split(mode: str, dims: list[int]) -> tuple[list[str], list[str]]:
+def get_train_test_split(mode: str, dims: list[int], seed: int = 0) -> tuple[list[str], list[str]]:
     """Return (train_ids, test_ids) for the given split mode and dimensions.
 
     Modes:
@@ -42,7 +42,7 @@ def get_train_test_split(mode: str, dims: list[int]) -> tuple[list[str], list[st
         )
     # random 2/3 – 1/3 split
     all_ids = build_problem_ids(ALL_FUNCTIONS, dims)
-    rng = np.random.default_rng()
+    rng = np.random.default_rng(seed)
     rng.shuffle(all_ids)
     split = 2 * len(all_ids) // 3
     return all_ids[:split], all_ids[split:]
diff --git a/das/env/das_env.py b/das/env/das_env.py
index d508b5c..e45d4f3 100644
--- a/das/env/das_env.py
+++ b/das/env/das_env.py
@@ -16,10 +16,21 @@
 import gymnasium as gym
 from gymnasium import spaces
 
-from das.env.observation import compute_observation, observation_dim
+from das.env.observation import (
+    compute_observation,
+    observation_dim,
+    compute_ela_features,
+    MAX_HISTORY_SAMPLE,
+    ELA_DIM,
+)
 from das.env.reward import compute_reward
 from das.optimizers.base import get_checkpoints
 
+# Recompute ELA every ~500 new population samples.  pflacco runs regression,
+# nearest-neighbour search, and IC calculations on every call — running it
+# every step would dominate wall-clock time for long training runs.
+_ELA_RECOMPUTE_THRESHOLD = MAX_HISTORY_SAMPLE // 5
+
 
 class DASEnv(gym.Env):
     """DAS environment.
@@ -110,6 +121,11 @@ def __init__(
         self._stagnation_count = 0
         self._choices_history: list[int] = []
 
+        # ELA features are expensive; cache the last computed vector and refresh
+        # lazily once _ELA_RECOMPUTE_THRESHOLD new samples have arrived.
+        self._ela_cache: np.ndarray = np.zeros(ELA_DIM, dtype=np.float32)
+        self._ela_cache_len: int = 0
+
     # ------------------------------------------------------------------ #
     # Gymnasium interface                                                  #
     # ------------------------------------------------------------------ #
@@ -140,6 +156,8 @@ def reset(self, seed=None, options=None):
         self._initial_range = (float("inf"), -np.inf)
         self._stagnation_count = 0
         self._choices_history = []
+        self._ela_cache = np.zeros(ELA_DIM, dtype=np.float32)
+        self._ela_cache_len = 0
 
         obs = self._build_observation()
         info = {"problem_id": problem_id, "dimension": dim}
@@ -251,14 +269,25 @@ def _update_episode_state(self, result: dict, prev_best_y: float):
         if worst_y > self._worst_y:
             self._worst_y = worst_y
 
-        # Set initial range on first step
+        # Set initial range on first step.
+        # When worst_so_far_y is absent the default is -inf, which collapses
+        # scale to 1e-5 and inflates every subsequent reward by 1e5.  Instead,
+        # derive scale from the magnitude of the initial best fitness.
         if self._initial_range[0] == float("inf"):
-            self._initial_range = (new_best_y, max(worst_y, new_best_y + 1e-5))
+            safe_worst = (
+                worst_y if np.isfinite(worst_y) else new_best_y + max(abs(new_best_y), 1.0)
+            )
+            self._initial_range = (new_best_y, max(safe_worst, new_best_y + 1e-5))
 
-        # Stagnation counter
+        # Stagnation counter — prefer the FE delta from the result dict so that
+        # stagnation accumulates correctly even when y_history is not returned.
         x_hist: np.ndarray | None = result.get("x_history")
         y_hist: np.ndarray | None = result.get("y_history")
-        n_fe_step = len(y_hist) if y_hist is not None else 0
+        n_fe_reported = result.get("n_function_evaluations")
+        if n_fe_reported is not None:
+            n_fe_step = max(0, n_fe_reported - self._n_fe)
+        else:
+            n_fe_step = len(y_hist) if y_hist is not None else 0
 
         if new_best_y >= prev_best_y:
             self._stagnation_count += n_fe_step
@@ -267,20 +296,30 @@ def _update_episode_state(self, result: dict, prev_best_y: float):
 
         self._n_fe = result.get("n_function_evaluations", self._n_fe + n_fe_step)
 
-        # Accumulate population history for ELA
+        # Accumulate population history for ELA, capped at MAX_HISTORY_SAMPLE rows.
+        # Without the cap, large budgets (e.g. 40-dim × 10 000 FE) accumulate
+        # hundreds of thousands of rows — GBs of RAM for a single episode.
         if x_hist is not None and len(x_hist) > 0:
             self._x_history = (
-                x_hist
+                x_hist[-MAX_HISTORY_SAMPLE:]
                 if self._x_history is None
-                else np.concatenate([self._x_history, x_hist])
+                else np.concatenate([self._x_history, x_hist])[-MAX_HISTORY_SAMPLE:]
             )
             self._y_history = (
-                y_hist
+                y_hist[-MAX_HISTORY_SAMPLE:]
                 if self._y_history is None
-                else np.concatenate([self._y_history, y_hist])
+                else np.concatenate([self._y_history, y_hist])[-MAX_HISTORY_SAMPLE:]
             )
 
     def _build_observation(self) -> np.ndarray:
+        # Recompute ELA only when enough new samples have arrived.
+        # _ela_cache starts as zeros (correct before 50 samples) and is reset
+        # each episode, so stale features from a previous episode never leak in.
+        current_len = len(self._x_history) if self._x_history is not None else 0
+        if current_len >= 50 and current_len - self._ela_cache_len >= _ELA_RECOMPUTE_THRESHOLD:
+            self._ela_cache = compute_ela_features(self._x_history, self._y_history)
+            self._ela_cache_len = current_len
+
         return compute_observation(
             x_history=self._x_history,
             y_history=self._y_history,
@@ -291,4 +330,5 @@ def _build_observation(self) -> np.ndarray:
             max_fe=max(self._max_fe, 1),
             stagnation_count=self._stagnation_count,
             ndim_problem=self._problem.dimension if self._problem is not None else 1,
+            ela=self._ela_cache,
         )
diff --git a/das/env/observation.py b/das/env/observation.py
index 262d2ba..95b38fe 100644
--- a/das/env/observation.py
+++ b/das/env/observation.py
@@ -58,10 +58,12 @@ def compute_ela_features(x: np.ndarray, y: np.ndarray) -> np.ndarray:
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
 
-        _, unique_idx = np.unique(x, axis=0, return_index=True)
-        unique_idx = np.sort(unique_idx)
-        x = x[unique_idx][-MAX_HISTORY_SAMPLE:]
-        y = y[unique_idx][-MAX_HISTORY_SAMPLE:]
+        # Slice to the most-recent samples first; deduplication is done below
+        # in normalised space where it is actually meaningful — raw-space
+        # np.unique missed points that become identical after normalisation and
+        # was therefore doing redundant work without full correctness guarantees.
+        x = x[-MAX_HISTORY_SAMPLE:]
+        y = y[-MAX_HISTORY_SAMPLE:]
 
         x_norm_arr = (x - x.mean()) / (x.std() + 1e-8)
         y_norm_arr = (y - y.mean()) / (y.std() + 1e-8)
@@ -96,8 +98,14 @@ def compute_ela_features(x: np.ndarray, y: np.ndarray) -> np.ndarray:
                 )
             }
 
-        all_feats = {**meta, **nbc, **disp, **ic, **ela_distr}
-        return np.array([all_feats[k] for k in ELA_FEATURE_KEYS], dtype=np.float32)
+        # pflacco may return an incomplete dict for degenerate or edge-case
+        # inputs that slipped past the variance guard above.  Fall back to
+        # zeros rather than crashing training with a KeyError mid-run.
+        try:
+            all_feats = {**meta, **nbc, **disp, **ic, **ela_distr}
+            return np.array([all_feats[k] for k in ELA_FEATURE_KEYS], dtype=np.float32)
+        except (KeyError, ValueError):
+            return np.zeros(ELA_DIM, dtype=np.float32)
 
 
 def compute_action_history_features(
@@ -124,9 +132,8 @@ def compute_action_history_features(
         last_idx = choices_history[-1]
         last_action[last_idx] = 1.0
 
-        counts = np.array(
-            [choices_history.count(j) for j in range(n_actions)], dtype=np.float32
-        )
+        # O(n) instead of O(n_actions * n_steps) from calling list.count in a loop.
+        counts = np.bincount(choices_history, minlength=n_actions).astype(np.float32)
         frequencies = counts / len(choices_history)
 
         run = 0
@@ -165,12 +172,16 @@ def compute_observation(
     max_fe: int,
     stagnation_count: int,
     ndim_problem: int,
+    ela: np.ndarray | None = None,
 ) -> np.ndarray:
     """Assemble the full observation vector from its components."""
-    if x_history is not None and y_history is not None and len(x_history) >= 50:
-        ela = compute_ela_features(x_history, y_history)
-    else:
-        ela = np.zeros(ELA_DIM, dtype=np.float32)
+    # Accept a pre-computed ELA vector so the caller can cache it across steps
+    # and avoid running pflacco on every observation build (pflacco is expensive).
+    if ela is None:
+        if x_history is not None and y_history is not None and len(x_history) >= 50:
+            ela = compute_ela_features(x_history, y_history)
+        else:
+            ela = np.zeros(ELA_DIM, dtype=np.float32)
 
     action_hist = compute_action_history_features(
         choices_history, n_actions, n_checkpoints, ndim_problem
diff --git a/das/env/reward.py b/das/env/reward.py
index 3098e38..eb31fb0 100644
--- a/das/env/reward.py
+++ b/das/env/reward.py
@@ -18,7 +18,7 @@ def _improvement_ratio(
 def reward_log_scaled(new_best_y, old_best_y, initial_range, is_final=False):
     """Log-scaled incremental improvement (original r1)."""
     if old_best_y == float("inf"):
-        return float(np.log(initial_range[1] - initial_range[0] + 1e-10))
+        return 0.0
     ratio = _improvement_ratio(new_best_y, old_best_y, initial_range)
     return float(np.log(np.clip(ratio, 0.0, 1.0) + 1e-5))
 
@@ -26,7 +26,9 @@ def reward_log_scaled(new_best_y, old_best_y, initial_range, is_final=False):
 def reward_linear(new_best_y, old_best_y, initial_range, is_final=False):
     """Linear improvement clipped to [0, 1] (original r2)."""
     if old_best_y == float("inf"):
-        return float(np.log(initial_range[1] - initial_range[0] + 1e-10))
+        # No prior best on the first step — returning log(scale) here would
+        # produce a value outside [0, 1] and break the linear contract.
+        return 0.0
     return float(
         np.clip(_improvement_ratio(new_best_y, old_best_y, initial_range), 0.0, 1.0)
     )
@@ -35,7 +37,7 @@ def reward_linear(new_best_y, old_best_y, initial_range, is_final=False):
 def reward_sparse(new_best_y, old_best_y, initial_range, is_final=False):
     """Sparse: only reward at the final checkpoint (original r3)."""
     if old_best_y == float("inf") or not is_final:
-        return float(np.log(initial_range[1] - initial_range[0] + 1e-10))
+        return 0.0
     total_improvement = initial_range[0] - new_best_y
     scale = initial_range[1] - initial_range[0]
     return float(np.log(total_improvement / (scale + 1e-10) + 1e-5))
@@ -44,7 +46,7 @@ def reward_sparse(new_best_y, old_best_y, initial_range, is_final=False):
 def reward_binary(new_best_y, old_best_y, initial_range, is_final=False):
     """Binary: 1 if improvement >= 0.1%, else 0 (original r4)."""
     if old_best_y == float("inf"):
-        return float(np.log(initial_range[1] - initial_range[0] + 1e-10))
+        return 0.0
     ratio = _improvement_ratio(new_best_y, old_best_y, initial_range)
     return 1.0 if ratio >= 1e-3 else 0.0
 
diff --git a/das/training/rldas.py b/das/training/rldas.py
index 07f32c6..588e2b7 100644
--- a/das/training/rldas.py
+++ b/das/training/rldas.py
@@ -22,8 +22,11 @@ def run_rl_das(args) -> None:
 
     suite = IOHSuite()
 
-    if args.k_epoch is None:
-        args.k_epoch = max(1, int(0.3 * args.n_checkpoints))
+    # Local variable — avoid mutating args so the caller's namespace stays predictable.
+    k_epoch = (
+        args.k_epoch if args.k_epoch is not None
+        else max(1, int(0.3 * args.n_checkpoints))
+    )
 
     env_kwargs = dict(
         suite=suite,
@@ -45,7 +48,7 @@ def run_rl_das(args) -> None:
     print(
         f"RL-DAS  |  dim={args.dim}  |  portfolio={args.portfolio}"
         f"  |  obs_dim={train_env.observation_space.shape[0]}"
-        f"  |  k_epoch={args.k_epoch}"
+        f"  |  k_epoch={k_epoch}"
     )
 
     train(
@@ -53,7 +56,7 @@ def run_rl_das(args) -> None:
         test_env=test_env,
         agent=agent,
         n_epochs=args.n_epochs,
-        k_epoch=args.k_epoch,
+        k_epoch=k_epoch,
         eval_interval=args.eval_interval,
         save_interval=args.save_interval,
         save_dir="models",
@@ -62,6 +65,15 @@ def run_rl_das(args) -> None:
 
     if args.eval:
         print("\nRunning final evaluation on test set …")
-        n_problems = len(test_env._problem_ids)
-        test_results = evaluate(test_env, agent, n_episodes=n_problems)
+
+        # Fresh env so _problem_idx starts at 0.  test_env accumulated increments
+        # from periodic evaluations inside train() and would start from a rotated
+        # offset rather than problem 0, making results hard to reproduce.
+        eval_env = RLDASEnv(problem_ids=test_ids, **env_kwargs)
+        n_problems = len(test_ids)
+        test_results = evaluate(eval_env, agent, n_episodes=n_problems)
+
+        # Create the output directory before writing — write_jsonl does not
+        # create parent directories and would raise FileNotFoundError otherwise.
+        os.makedirs("results", exist_ok=True)
         mean_best_y = float(np.mean([r["best_y"] for r in test_results]))

From b44e47bf33269ba64754714f0df53d33c5c12a03 Mon Sep 17 00:00:00 2001
From: Grzmro <106807538+Grzmro@users.noreply.github.com>
Date: Tue, 19 May 2026 12:55:22 +0200
Subject: [PATCH 2/5] feat: add ela_recompute_every parameter for ELA feature
 recomputation control

---
 das/env/das_env.py     | 11 ++++++-----
 das/training/common.py |  1 +
 das/training/ppo.py    |  1 +
 train.py               | 10 ++++++++++
 4 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/das/env/das_env.py b/das/env/das_env.py
index e45d4f3..aef3ff1 100644
--- a/das/env/das_env.py
+++ b/das/env/das_env.py
@@ -26,10 +26,6 @@
 from das.env.reward import compute_reward
 from das.optimizers.base import get_checkpoints
 
-# Recompute ELA every ~500 new population samples.  pflacco runs regression,
-# nearest-neighbour search, and IC calculations on every call — running it
-# every step would dominate wall-clock time for long training runs.
-_ELA_RECOMPUTE_THRESHOLD = MAX_HISTORY_SAMPLE // 5
 
 
 class DASEnv(gym.Env):
@@ -71,6 +67,7 @@ def __init__(
         reward_option: int = 1,
         n_individuals: int | list[int | None] | None = None,
         seed: int | None = None,
+        ela_recompute_every: int = MAX_HISTORY_SAMPLE // 5 # ~500,
     ):
         super().__init__()
         self.problem_ids = problem_ids
@@ -93,6 +90,7 @@ def __init__(
                 )
             self.n_individuals = pop
         self._seed = seed
+        self._ela_recompute_every = max(1, ela_recompute_every)
 
         n_actions = len(optimizers)
         obs_dim = observation_dim(n_actions)
@@ -316,7 +314,10 @@ def _build_observation(self) -> np.ndarray:
         # _ela_cache starts as zeros (correct before 50 samples) and is reset
         # each episode, so stale features from a previous episode never leak in.
         current_len = len(self._x_history) if self._x_history is not None else 0
-        if current_len >= 50 and current_len - self._ela_cache_len >= _ELA_RECOMPUTE_THRESHOLD:
+        if current_len >= 50 and (
+            self._ela_cache_len == 0
+            or current_len - self._ela_cache_len >= self._ela_recompute_every
+        ):
             self._ela_cache = compute_ela_features(self._x_history, self._y_history)
             self._ela_cache_len = current_len
 
diff --git a/das/training/common.py b/das/training/common.py
index b8ac184..f2e2244 100644
--- a/das/training/common.py
+++ b/das/training/common.py
@@ -130,6 +130,7 @@ def _init():
             reward_option=cfg["reward_option"],
             n_individuals=cfg["n_individuals"],
             seed=cfg.get("seed"),
+            ela_recompute_every=cfg.get("ela_recompute_every", 500),
         )
 
     return _init
diff --git a/das/training/ppo.py b/das/training/ppo.py
index 79fdf2d..1cc4020 100644
--- a/das/training/ppo.py
+++ b/das/training/ppo.py
@@ -138,6 +138,7 @@ def run_ppo(args) -> None:
         "reward_option": args.reward_option,
         "n_individuals": args.n_individuals,
         "seed": args.seed,
+        "ela_recompute_every": args.ela_recompute_every,
     }
 
     print(f"Portfolio : {args.portfolio}")
diff --git a/train.py b/train.py
index 7c64d2d..6b058a4 100644
--- a/train.py
+++ b/train.py
@@ -78,6 +78,16 @@ def _add_shared_args(
         help="Population size override (default: each algorithm uses its own built-in default)",
     )
     p.add_argument("--seed", type=int, default=42)
+    p.add_argument(
+        "--ela-recompute-every",
+        type=int,
+        default=500,
+        help=(
+            "Recompute ELA features every N new population samples. "
+            "Set to 1 to recompute on every step (slow but maximally fresh). "
+            "Default: 500."
+        ),
+    )
 
 
 def _parse_args() -> argparse.Namespace:

From e840dba8f6be19dacb17e4c914e10bc21b624917 Mon Sep 17 00:00:00 2001
From: Grzmro <106807538+Grzmro@users.noreply.github.com>
Date: Mon, 25 May 2026 13:28:01 +0200
Subject: [PATCH 3/5] Revert "feat: add ela_recompute_every parameter for ELA
 feature recomputation control"

This reverts commit 2a1c641e42e31957de83af7013c0cf9ce126e12a.
---
 das/env/das_env.py     | 11 +++++------
 das/training/common.py |  1 -
 das/training/ppo.py    |  1 -
 train.py               | 10 ----------
 4 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/das/env/das_env.py b/das/env/das_env.py
index aef3ff1..e45d4f3 100644
--- a/das/env/das_env.py
+++ b/das/env/das_env.py
@@ -26,6 +26,10 @@
 from das.env.reward import compute_reward
 from das.optimizers.base import get_checkpoints
 
+# Recompute ELA every ~500 new population samples.  pflacco runs regression,
+# nearest-neighbour search, and IC calculations on every call — running it
+# every step would dominate wall-clock time for long training runs.
+_ELA_RECOMPUTE_THRESHOLD = MAX_HISTORY_SAMPLE // 5
 
 
 class DASEnv(gym.Env):
@@ -67,7 +71,6 @@ def __init__(
         reward_option: int = 1,
         n_individuals: int | list[int | None] | None = None,
         seed: int | None = None,
-        ela_recompute_every: int = MAX_HISTORY_SAMPLE // 5 # ~500,
     ):
         super().__init__()
         self.problem_ids = problem_ids
@@ -90,7 +93,6 @@ def __init__(
                 )
             self.n_individuals = pop
         self._seed = seed
-        self._ela_recompute_every = max(1, ela_recompute_every)
 
         n_actions = len(optimizers)
         obs_dim = observation_dim(n_actions)
@@ -314,10 +316,7 @@ def _build_observation(self) -> np.ndarray:
         # _ela_cache starts as zeros (correct before 50 samples) and is reset
         # each episode, so stale features from a previous episode never leak in.
         current_len = len(self._x_history) if self._x_history is not None else 0
-        if current_len >= 50 and (
-            self._ela_cache_len == 0
-            or current_len - self._ela_cache_len >= self._ela_recompute_every
-        ):
+        if current_len >= 50 and current_len - self._ela_cache_len >= _ELA_RECOMPUTE_THRESHOLD:
             self._ela_cache = compute_ela_features(self._x_history, self._y_history)
             self._ela_cache_len = current_len
 
diff --git a/das/training/common.py b/das/training/common.py
index f2e2244..b8ac184 100644
--- a/das/training/common.py
+++ b/das/training/common.py
@@ -130,7 +130,6 @@ def _init():
             reward_option=cfg["reward_option"],
             n_individuals=cfg["n_individuals"],
             seed=cfg.get("seed"),
-            ela_recompute_every=cfg.get("ela_recompute_every", 500),
         )
 
     return _init
diff --git a/das/training/ppo.py b/das/training/ppo.py
index 1cc4020..79fdf2d 100644
--- a/das/training/ppo.py
+++ b/das/training/ppo.py
@@ -138,7 +138,6 @@ def run_ppo(args) -> None:
         "reward_option": args.reward_option,
         "n_individuals": args.n_individuals,
         "seed": args.seed,
-        "ela_recompute_every": args.ela_recompute_every,
     }
 
     print(f"Portfolio : {args.portfolio}")
diff --git a/train.py b/train.py
index 6b058a4..7c64d2d 100644
--- a/train.py
+++ b/train.py
@@ -78,16 +78,6 @@ def _add_shared_args(
         help="Population size override (default: each algorithm uses its own built-in default)",
     )
     p.add_argument("--seed", type=int, default=42)
-    p.add_argument(
-        "--ela-recompute-every",
-        type=int,
-        default=500,
-        help=(
-            "Recompute ELA features every N new population samples. "
-            "Set to 1 to recompute on every step (slow but maximally fresh). "
-            "Default: 500."
-        ),
-    )
 
 
 def _parse_args() -> argparse.Namespace:

From 592a7c7259e525bad00fc68ba1f647fca9aff14b Mon Sep 17 00:00:00 2001
From: Grzmro <106807538+Grzmro@users.noreply.github.com>
Date: Mon, 25 May 2026 13:57:50 +0200
Subject: [PATCH 4/5] refactor: drop ELA cache, split seed, and f1-f3 from das/env

---
 das/env/bbob_splits.py |  6 +++---
 das/env/das_env.py     | 28 +---------------------------
 das/env/observation.py | 32 ++++++++++----------------------
 das/env/reward.py      |  8 +++-----
 4 files changed, 17 insertions(+), 57 deletions(-)

diff --git a/das/env/bbob_splits.py b/das/env/bbob_splits.py
index 8794163..bc9a8da 100644
--- a/das/env/bbob_splits.py
+++ b/das/env/bbob_splits.py
@@ -7,7 +7,7 @@
 ALL_DIMS = [2, 3, 5, 10, 20, 40]
 ALL_FUNCTIONS = set(range(1, 25))
 INSTANCE_IDS = [1, 2, 3, 4, 5, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80]
-EASY_TRAIN_FUNCTIONS = {1, 2, 3, 4, *range(6, 15), 18, 19, 20, 22, 23, 24} # Czy tutaj nie powinny być też funkcje 1,2,3?
+EASY_TRAIN_FUNCTIONS = {4, *range(6, 15), 18, 19, 20, 22, 23, 24}
 
 
 def build_problem_ids(
@@ -22,7 +22,7 @@ def build_problem_ids(
     ]
 
 
-def get_train_test_split(mode: str, dims: list[int], seed: int = 0) -> tuple[list[str], list[str]]:
+def get_train_test_split(mode: str, dims: list[int]) -> tuple[list[str], list[str]]:
     """Return (train_ids, test_ids) for the given split mode and dimensions.
 
     Modes:
@@ -42,7 +42,7 @@ def get_train_test_split(mode: str, dims: list[int], seed: int = 0) -> tuple[lis
         )
     # random 2/3 – 1/3 split
     all_ids = build_problem_ids(ALL_FUNCTIONS, dims)
-    rng = np.random.default_rng(seed)
+    rng = np.random.default_rng()
     rng.shuffle(all_ids)
     split = 2 * len(all_ids) // 3
     return all_ids[:split], all_ids[split:]
diff --git a/das/env/das_env.py b/das/env/das_env.py
index e45d4f3..1fc06d3 100644
--- a/das/env/das_env.py
+++ b/das/env/das_env.py
@@ -16,21 +16,10 @@
 import gymnasium as gym
 from gymnasium import spaces
 
-from das.env.observation import (
-    compute_observation,
-    observation_dim,
-    compute_ela_features,
-    MAX_HISTORY_SAMPLE,
-    ELA_DIM,
-)
+from das.env.observation import (compute_observation, observation_dim, MAX_HISTORY_SAMPLE)
 from das.env.reward import compute_reward
 from das.optimizers.base import get_checkpoints
 
-# Recompute ELA every ~500 new population samples.  pflacco runs regression,
-# nearest-neighbour search, and IC calculations on every call — running it
-# every step would dominate wall-clock time for long training runs.
-_ELA_RECOMPUTE_THRESHOLD = MAX_HISTORY_SAMPLE // 5
-
 
 class DASEnv(gym.Env):
     """DAS environment.
@@ -121,11 +110,6 @@ def __init__(
         self._stagnation_count = 0
         self._choices_history: list[int] = []
 
-        # ELA features are expensive; cache the last computed vector and refresh
-        # lazily once _ELA_RECOMPUTE_THRESHOLD new samples have arrived.
-        self._ela_cache: np.ndarray = np.zeros(ELA_DIM, dtype=np.float32)
-        self._ela_cache_len: int = 0
-
     # ------------------------------------------------------------------ #
     # Gymnasium interface                                                  #
     # ------------------------------------------------------------------ #
@@ -156,8 +140,6 @@ def reset(self, seed=None, options=None):
         self._initial_range = (float("inf"), -np.inf)
         self._stagnation_count = 0
         self._choices_history = []
-        self._ela_cache = np.zeros(ELA_DIM, dtype=np.float32)
-        self._ela_cache_len = 0
 
         obs = self._build_observation()
         info = {"problem_id": problem_id, "dimension": dim}
@@ -312,13 +294,6 @@ def _update_episode_state(self, result: dict, prev_best_y: float):
             )
 
     def _build_observation(self) -> np.ndarray:
-        # Recompute ELA only when enough new samples have arrived.
-        # _ela_cache starts as zeros (correct before 50 samples) and is reset
-        # each episode, so stale features from a previous episode never leak in.
-        current_len = len(self._x_history) if self._x_history is not None else 0
-        if current_len >= 50 and current_len - self._ela_cache_len >= _ELA_RECOMPUTE_THRESHOLD:
-            self._ela_cache = compute_ela_features(self._x_history, self._y_history)
-            self._ela_cache_len = current_len
 
         return compute_observation(
             x_history=self._x_history,
@@ -330,5 +305,4 @@ def _build_observation(self) -> np.ndarray:
             max_fe=max(self._max_fe, 1),
             stagnation_count=self._stagnation_count,
             ndim_problem=self._problem.dimension if self._problem is not None else 1,
-            ela=self._ela_cache,
         )
diff --git a/das/env/observation.py b/das/env/observation.py
index 95b38fe..6f6d354 100644
--- a/das/env/observation.py
+++ b/das/env/observation.py
@@ -58,12 +58,10 @@ def compute_ela_features(x: np.ndarray, y: np.ndarray) -> np.ndarray:
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
 
-        # Slice to the most-recent samples first; deduplication is done below
-        # in normalised space where it is actually meaningful — raw-space
-        # np.unique missed points that become identical after normalisation and
-        # was therefore doing redundant work without full correctness guarantees.
-        x = x[-MAX_HISTORY_SAMPLE:]
-        y = y[-MAX_HISTORY_SAMPLE:]
+        _, unique_idx = np.unique(x, axis=0, return_index=True)
+        unique_idx = np.sort(unique_idx)
+        x = x[unique_idx][-MAX_HISTORY_SAMPLE:]
+        y = y[unique_idx][-MAX_HISTORY_SAMPLE:]
 
         x_norm_arr = (x - x.mean()) / (x.std() + 1e-8)
         y_norm_arr = (y - y.mean()) / (y.std() + 1e-8)
@@ -98,14 +96,8 @@ def compute_ela_features(x: np.ndarray, y: np.ndarray) -> np.ndarray:
                 )
             }
 
-        # pflacco may return an incomplete dict for degenerate or edge-case
-        # inputs that slipped past the variance guard above.  Fall back to
-        # zeros rather than crashing training with a KeyError mid-run.
-        try:
-            all_feats = {**meta, **nbc, **disp, **ic, **ela_distr}
-            return np.array([all_feats[k] for k in ELA_FEATURE_KEYS], dtype=np.float32)
-        except (KeyError, ValueError):
-            return np.zeros(ELA_DIM, dtype=np.float32)
+        all_feats = {**meta, **nbc, **disp, **ic, **ela_distr}
+        return np.array([all_feats[k] for k in ELA_FEATURE_KEYS], dtype=np.float32)
 
 
 def compute_action_history_features(
@@ -172,16 +164,12 @@ def compute_observation(
     max_fe: int,
     stagnation_count: int,
     ndim_problem: int,
-    ela: np.ndarray | None = None,
 ) -> np.ndarray:
     """Assemble the full observation vector from its components."""
-    # Accept a pre-computed ELA vector so the caller can cache it across steps
-    # and avoid running pflacco on every observation build (pflacco is expensive).
-    if ela is None:
-        if x_history is not None and y_history is not None and len(x_history) >= 50:
-            ela = compute_ela_features(x_history, y_history)
-        else:
-            ela = np.zeros(ELA_DIM, dtype=np.float32)
+    if x_history is not None and y_history is not None and len(x_history) >= 50:
+        ela = compute_ela_features(x_history, y_history)
+    else:
+        ela = np.zeros(ELA_DIM, dtype=np.float32)
 
     action_hist = compute_action_history_features(
         choices_history, n_actions, n_checkpoints, ndim_problem
diff --git a/das/env/reward.py b/das/env/reward.py
index eb31fb0..933b262 100644
--- a/das/env/reward.py
+++ b/das/env/reward.py
@@ -18,7 +18,7 @@ def _improvement_ratio(
 def reward_log_scaled(new_best_y, old_best_y, initial_range, is_final=False):
     """Log-scaled incremental improvement (original r1)."""
     if old_best_y == float("inf"):
-        return 0.0
+        return float(np.log(initial_range[1] - initial_range[0] + 1e-10))
     ratio = _improvement_ratio(new_best_y, old_best_y, initial_range)
     return float(np.log(np.clip(ratio, 0.0, 1.0) + 1e-5))
 
@@ -26,9 +26,7 @@ def reward_log_scaled(new_best_y, old_best_y, initial_range, is_final=False):
 def reward_linear(new_best_y, old_best_y, initial_range, is_final=False):
     """Linear improvement clipped to [0, 1] (original r2)."""
     if old_best_y == float("inf"):
-        # No prior best on the first step — returning log(scale) here would
-        # produce a value outside [0, 1] and break the linear contract.
-        return 0.0
+        return float(np.log(initial_range[1] - initial_range[0] + 1e-10))
     return float(
         np.clip(_improvement_ratio(new_best_y, old_best_y, initial_range), 0.0, 1.0)
     )
@@ -37,7 +35,7 @@ def reward_linear(new_best_y, old_best_y, initial_range, is_final=False):
 def reward_sparse(new_best_y, old_best_y, initial_range, is_final=False):
     """Sparse: only reward at the final checkpoint (original r3)."""
     if old_best_y == float("inf") or not is_final:
-        return 0.0
+        return float(np.log(initial_range[1] - initial_range[0] + 1e-10))
     total_improvement = initial_range[0] - new_best_y
     scale = initial_range[1] - initial_range[0]
     return float(np.log(total_improvement / (scale + 1e-10) + 1e-5))

From c93efb888ff88259c072e10921c736cdae294192 Mon Sep 17 00:00:00 2001
From: wniec <niecwladek@gmail.com>
Date: Fri, 29 May 2026 20:12:31 +0200
Subject: [PATCH 5/5] style: apply ruff formatting (no functional changes)

---
 agents/rl_das/trainer.py | 8 ++++++--
 das/env/das_env.py       | 6 ++++--
 das/training/rldas.py    | 3 ++-
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/agents/rl_das/trainer.py b/agents/rl_das/trainer.py
index 0d6d261..4d2b0a6 100644
--- a/agents/rl_das/trainer.py
+++ b/agents/rl_das/trainer.py
@@ -131,8 +131,12 @@ def train(
         # Log per-epoch PPO diagnostics so training instability is visible
         # (e.g. actor_loss explosion, entropy collapse) without manual debugging.
         if epoch_diagnostics:
-            entry["actor_loss"] = float(np.mean([d["actor_loss"] for d in epoch_diagnostics]))
-            entry["critic_loss"] = float(np.mean([d["critic_loss"] for d in epoch_diagnostics]))
+            entry["actor_loss"] = float(
+                np.mean([d["actor_loss"] for d in epoch_diagnostics])
+            )
+            entry["critic_loss"] = float(
+                np.mean([d["critic_loss"] for d in epoch_diagnostics])
+            )
             entry["entropy"] = float(np.mean([d["entropy"] for d in epoch_diagnostics]))
 
         if epoch % eval_interval == 0:
diff --git a/das/env/das_env.py b/das/env/das_env.py
index 1fc06d3..57fbe84 100644
--- a/das/env/das_env.py
+++ b/das/env/das_env.py
@@ -16,7 +16,7 @@
 import gymnasium as gym
 from gymnasium import spaces
 
-from das.env.observation import (compute_observation, observation_dim, MAX_HISTORY_SAMPLE)
+from das.env.observation import compute_observation, observation_dim, MAX_HISTORY_SAMPLE
 from das.env.reward import compute_reward
 from das.optimizers.base import get_checkpoints
 
@@ -257,7 +257,9 @@ def _update_episode_state(self, result: dict, prev_best_y: float):
         # derive scale from the magnitude of the initial best fitness.
         if self._initial_range[0] == float("inf"):
             safe_worst = (
-                worst_y if np.isfinite(worst_y) else new_best_y + max(abs(new_best_y), 1.0)
+                worst_y
+                if np.isfinite(worst_y)
+                else new_best_y + max(abs(new_best_y), 1.0)
             )
             self._initial_range = (new_best_y, max(safe_worst, new_best_y + 1e-5))
 
diff --git a/das/training/rldas.py b/das/training/rldas.py
index 588e2b7..0bfcb5a 100644
--- a/das/training/rldas.py
+++ b/das/training/rldas.py
@@ -24,7 +24,8 @@ def run_rl_das(args) -> None:
 
     # Local variable — avoid mutating args so the caller's namespace stays predictable.
     k_epoch = (
-        args.k_epoch if args.k_epoch is not None
+        args.k_epoch
+        if args.k_epoch is not None
         else max(1, int(0.3 * args.n_checkpoints))
     )