diff --git a/agents/rl_das/agent.py b/agents/rl_das/agent.py
index 8def163..29a8ea1 100644
--- a/agents/rl_das/agent.py
+++ b/agents/rl_das/agent.py
@@ -194,17 +194,16 @@ def learn(self, k_epoch: int, bootstrap_value: float = 0.0) -> dict[str, float]:
             )
             actor_loss = -torch.min(surr1, surr2).mean()
 
-            # Value clipping (like PPO v2) from the 2nd epoch onward
-            if epoch_idx > 0:
-                values_clipped = old_values_t + torch.clamp(
-                    values - old_values_t, -self.eps_clip, self.eps_clip
-                )
-                critic_loss = torch.max(
-                    (values - returns_t.detach()) ** 2,
-                    (values_clipped - returns_t.detach()) ** 2,
-                ).mean()
-            else:
-                critic_loss = (values - returns_t.detach()).pow(2).mean()
+            # Value clipping applied from the first inner epoch.  Skipping it
+            # on epoch 0 allowed an unconstrained large update on the first step,
+            # breaking the PPO v2 guarantee that value changes stay within eps_clip.
+            values_clipped = old_values_t + torch.clamp(
+                values - old_values_t, -self.eps_clip, self.eps_clip
+            )
+            critic_loss = torch.max(
+                (values - returns_t.detach()) ** 2,
+                (values_clipped - returns_t.detach()) ** 2,
+            ).mean()
 
             loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy
 
diff --git a/agents/rl_das/env.py b/agents/rl_das/env.py
index d438147..f3de680 100644
--- a/agents/rl_das/env.py
+++ b/agents/rl_das/env.py
@@ -242,6 +242,13 @@ def __init__(
         self._best_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)]
         self._worst_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)]
 
+    @property
+    def problem_ids(self) -> list[str]:
+        # Public accessor — callers should not reach into _problem_ids directly
+        # because it is filtered (dimension-matched) and may differ from the
+        # original list passed to the constructor.
+        return self._problem_ids
+
     # ------------------------------------------------------------------
     # Gymnasium interface
     # ------------------------------------------------------------------
diff --git a/agents/rl_das/network.py b/agents/rl_das/network.py
index 16e08ab..fe4e48d 100644
--- a/agents/rl_das/network.py
+++ b/agents/rl_das/network.py
@@ -32,7 +32,9 @@ def __init__(self, dim: int) -> None:
             nn.Linear(dim, 64),
             nn.ReLU(),
             nn.Linear(64, 1),
-            nn.ReLU(),
+            # No second ReLU: movement vectors are signed displacements.
+            # Clamping to >= 0 discards direction — the network cannot tell
+            # whether the optimizer stepped left or right in search space.
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -94,4 +96,7 @@ def __init__(self, dim: int, n_opt: int) -> None:
         self.head = nn.Linear(16, 1)
 
     def forward(self, obs: torch.Tensor) -> torch.Tensor:
+        # Mirror Actor's NaN guard: a NaN value estimate flows into advantages
+        # and silently zeroes all gradients via backward(), corrupting the update.
+        obs = torch.nan_to_num(obs, nan=0.0, posinf=1.0, neginf=-1.0)
         return self.head(self.backbone(obs)).squeeze(-1)  # (batch,)
diff --git a/agents/rl_das/trainer.py b/agents/rl_das/trainer.py
index ee9b343..4d2b0a6 100644
--- a/agents/rl_das/trainer.py
+++ b/agents/rl_das/trainer.py
@@ -102,18 +102,24 @@ def train(
     """
     Path(save_dir).mkdir(parents=True, exist_ok=True)
     log: list[dict] = []
-    n_train = len(train_env._problem_ids)
+    n_train = len(train_env.problem_ids)
 
     for epoch in range(1, n_epochs + 1):
-        epoch_rewards = []
+        epoch_rewards: list[float] = []
+        epoch_diagnostics: list[dict] = []
         epoch_start = time.time()
 
         for _ in range(n_train):
             ep = _run_episode(train_env, agent, deterministic=False)
             epoch_rewards.append(ep["total_reward"])
 
-            agent.learn(k_epoch)
+            # bootstrap_value=0.0 is correct: this env only terminates naturally
+            # (terminated=True, truncated always False), so the last done=True flag
+            # already zeroes future returns — no critic bootstrap is needed.
+            diag = agent.learn(k_epoch, bootstrap_value=0.0)
             agent.rollout.clear()
+            if diag:
+                epoch_diagnostics.append(diag)
 
         mean_train_reward = float(np.mean(epoch_rewards))
         entry: dict = {
@@ -122,9 +128,20 @@ def train(
             "elapsed_s": round(time.time() - epoch_start, 2),
         }
 
+        # Log per-epoch PPO diagnostics so training instability is visible
+        # (e.g. actor_loss explosion, entropy collapse) without manual debugging.
+        if epoch_diagnostics:
+            entry["actor_loss"] = float(
+                np.mean([d["actor_loss"] for d in epoch_diagnostics])
+            )
+            entry["critic_loss"] = float(
+                np.mean([d["critic_loss"] for d in epoch_diagnostics])
+            )
+            entry["entropy"] = float(np.mean([d["entropy"] for d in epoch_diagnostics]))
+
         if epoch % eval_interval == 0:
             test_results = evaluate(
-                test_env, agent, n_episodes=len(test_env._problem_ids)
+                test_env, agent, n_episodes=len(test_env.problem_ids)
             )
             entry["mean_test_reward"] = float(
                 np.mean([r["total_reward"] for r in test_results])
@@ -137,12 +154,16 @@ def train(
                 f"  train_r={mean_train_reward:.4f}"
                 f"  test_r={entry['mean_test_reward']:.4f}"
                 f"  test_best_y={entry['mean_test_best_y']:.4e}"
+                f"  actor_loss={entry.get('actor_loss', float('nan')):.4f}"
+                f"  entropy={entry.get('entropy', float('nan')):.4f}"
                 f"  ({entry['elapsed_s']:.1f}s)"
             )
         else:
             print(
                 f"Epoch {epoch:4d}/{n_epochs}"
                 f"  train_r={mean_train_reward:.4f}"
+                f"  actor_loss={entry.get('actor_loss', float('nan')):.4f}"
+                f"  entropy={entry.get('entropy', float('nan')):.4f}"
                 f"  ({entry['elapsed_s']:.1f}s)"
             )
 
@@ -186,7 +207,7 @@ def evaluate(
     List of dicts with keys: problem_id, total_reward, best_y, n_fe.
     """
     if n_episodes is None:
-        n_episodes = len(env._problem_ids)
+        n_episodes = len(env.problem_ids)
 
     results = []
     for _ in range(n_episodes):
diff --git a/das/env/das_env.py b/das/env/das_env.py
index d508b5c..57fbe84 100644
--- a/das/env/das_env.py
+++ b/das/env/das_env.py
@@ -16,7 +16,7 @@
 import gymnasium as gym
 from gymnasium import spaces
 
-from das.env.observation import compute_observation, observation_dim
+from das.env.observation import compute_observation, observation_dim, MAX_HISTORY_SAMPLE
 from das.env.reward import compute_reward
 from das.optimizers.base import get_checkpoints
 
@@ -251,14 +251,27 @@ def _update_episode_state(self, result: dict, prev_best_y: float):
         if worst_y > self._worst_y:
             self._worst_y = worst_y
 
-        # Set initial range on first step
+        # Set initial range on first step.
+        # When worst_so_far_y is absent the default is -inf, which collapses
+        # scale to 1e-5 and inflates every subsequent reward by 1e5.  Instead,
+        # derive scale from the magnitude of the initial best fitness.
         if self._initial_range[0] == float("inf"):
-            self._initial_range = (new_best_y, max(worst_y, new_best_y + 1e-5))
+            safe_worst = (
+                worst_y
+                if np.isfinite(worst_y)
+                else new_best_y + max(abs(new_best_y), 1.0)
+            )
+            self._initial_range = (new_best_y, max(safe_worst, new_best_y + 1e-5))
 
-        # Stagnation counter
+        # Stagnation counter — prefer the FE delta from the result dict so that
+        # stagnation accumulates correctly even when y_history is not returned.
         x_hist: np.ndarray | None = result.get("x_history")
         y_hist: np.ndarray | None = result.get("y_history")
-        n_fe_step = len(y_hist) if y_hist is not None else 0
+        n_fe_reported = result.get("n_function_evaluations")
+        if n_fe_reported is not None:
+            n_fe_step = max(0, n_fe_reported - self._n_fe)
+        else:
+            n_fe_step = len(y_hist) if y_hist is not None else 0
 
         if new_best_y >= prev_best_y:
             self._stagnation_count += n_fe_step
@@ -267,20 +280,23 @@ def _update_episode_state(self, result: dict, prev_best_y: float):
 
         self._n_fe = result.get("n_function_evaluations", self._n_fe + n_fe_step)
 
-        # Accumulate population history for ELA
+        # Accumulate population history for ELA, capped at MAX_HISTORY_SAMPLE rows.
+        # Without the cap, large budgets (e.g. 40-dim × 10 000 FE) accumulate
+        # hundreds of thousands of rows — GBs of RAM for a single episode.
         if x_hist is not None and len(x_hist) > 0:
             self._x_history = (
-                x_hist
+                x_hist[-MAX_HISTORY_SAMPLE:]
                 if self._x_history is None
-                else np.concatenate([self._x_history, x_hist])
+                else np.concatenate([self._x_history, x_hist])[-MAX_HISTORY_SAMPLE:]
             )
             self._y_history = (
-                y_hist
+                y_hist[-MAX_HISTORY_SAMPLE:]
                 if self._y_history is None
-                else np.concatenate([self._y_history, y_hist])
+                else np.concatenate([self._y_history, y_hist])[-MAX_HISTORY_SAMPLE:]
             )
 
     def _build_observation(self) -> np.ndarray:
+
         return compute_observation(
             x_history=self._x_history,
             y_history=self._y_history,
diff --git a/das/env/observation.py b/das/env/observation.py
index 262d2ba..6f6d354 100644
--- a/das/env/observation.py
+++ b/das/env/observation.py
@@ -124,9 +124,8 @@ def compute_action_history_features(
         last_idx = choices_history[-1]
         last_action[last_idx] = 1.0
 
-        counts = np.array(
-            [choices_history.count(j) for j in range(n_actions)], dtype=np.float32
-        )
+        # O(n) instead of O(n_actions * n_steps) from calling list.count in a loop.
+        counts = np.bincount(choices_history, minlength=n_actions).astype(np.float32)
         frequencies = counts / len(choices_history)
 
         run = 0
diff --git a/das/env/reward.py b/das/env/reward.py
index 3098e38..933b262 100644
--- a/das/env/reward.py
+++ b/das/env/reward.py
@@ -44,7 +44,7 @@ def reward_sparse(new_best_y, old_best_y, initial_range, is_final=False):
 def reward_binary(new_best_y, old_best_y, initial_range, is_final=False):
     """Binary: 1 if improvement >= 0.1%, else 0 (original r4)."""
     if old_best_y == float("inf"):
-        return float(np.log(initial_range[1] - initial_range[0] + 1e-10))
+        return 0.0
     ratio = _improvement_ratio(new_best_y, old_best_y, initial_range)
     return 1.0 if ratio >= 1e-3 else 0.0
 
diff --git a/das/training/rldas.py b/das/training/rldas.py
index 07f32c6..0bfcb5a 100644
--- a/das/training/rldas.py
+++ b/das/training/rldas.py
@@ -22,8 +22,12 @@ def run_rl_das(args) -> None:
 
     suite = IOHSuite()
 
-    if args.k_epoch is None:
-        args.k_epoch = max(1, int(0.3 * args.n_checkpoints))
+    # Local variable — avoid mutating args so the caller's namespace stays predictable.
+    k_epoch = (
+        args.k_epoch
+        if args.k_epoch is not None
+        else max(1, int(0.3 * args.n_checkpoints))
+    )
 
     env_kwargs = dict(
         suite=suite,
@@ -45,7 +49,7 @@ def run_rl_das(args) -> None:
     print(
         f"RL-DAS  |  dim={args.dim}  |  portfolio={args.portfolio}"
         f"  |  obs_dim={train_env.observation_space.shape[0]}"
-        f"  |  k_epoch={args.k_epoch}"
+        f"  |  k_epoch={k_epoch}"
     )
 
     train(
@@ -53,7 +57,7 @@ def run_rl_das(args) -> None:
         test_env=test_env,
         agent=agent,
         n_epochs=args.n_epochs,
-        k_epoch=args.k_epoch,
+        k_epoch=k_epoch,
         eval_interval=args.eval_interval,
         save_interval=args.save_interval,
         save_dir="models",
@@ -62,6 +66,17 @@ def run_rl_das(args) -> None:
 
     if args.eval:
         print("\nRunning final evaluation on test set …")
+
+        # Fresh env so _problem_idx starts at 0.  test_env accumulated increments
+        # from periodic evaluations inside train() and would start from a rotated
+        # offset rather than problem 0, making results hard to reproduce.
+        eval_env = RLDASEnv(problem_ids=test_ids, **env_kwargs)
+        n_problems = len(test_ids)
+        test_results = evaluate(eval_env, agent, n_episodes=n_problems)
+
+        # Create the output directory before writing — write_jsonl does not
+        # create parent directories and would raise FileNotFoundError otherwise.
+        os.makedirs("results", exist_ok=True)
         n_problems = len(test_env._problem_ids)
         test_results = evaluate(test_env, agent, n_episodes=n_problems)
         mean_best_y = float(np.mean([r["best_y"] for r in test_results]))