diff --git a/agents/rl_das/agent.py b/agents/rl_das/agent.py index 8def163..29a8ea1 100644 --- a/agents/rl_das/agent.py +++ b/agents/rl_das/agent.py @@ -194,17 +194,16 @@ def learn(self, k_epoch: int, bootstrap_value: float = 0.0) -> dict[str, float]: ) actor_loss = -torch.min(surr1, surr2).mean() - # Value clipping (like PPO v2) from the 2nd epoch onward - if epoch_idx > 0: - values_clipped = old_values_t + torch.clamp( - values - old_values_t, -self.eps_clip, self.eps_clip - ) - critic_loss = torch.max( - (values - returns_t.detach()) ** 2, - (values_clipped - returns_t.detach()) ** 2, - ).mean() - else: - critic_loss = (values - returns_t.detach()).pow(2).mean() + # Value clipping applied from the first inner epoch. Skipping it + # on epoch 0 allowed an unconstrained large update on the first step, + # breaking the PPO v2 guarantee that value changes stay within eps_clip. + values_clipped = old_values_t + torch.clamp( + values - old_values_t, -self.eps_clip, self.eps_clip + ) + critic_loss = torch.max( + (values - returns_t.detach()) ** 2, + (values_clipped - returns_t.detach()) ** 2, + ).mean() loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy diff --git a/agents/rl_das/env.py b/agents/rl_das/env.py index d438147..f3de680 100644 --- a/agents/rl_das/env.py +++ b/agents/rl_das/env.py @@ -242,6 +242,13 @@ def __init__( self._best_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)] self._worst_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)] + @property + def problem_ids(self) -> list[str]: + # Public accessor — callers should not reach into _problem_ids directly + # because it is filtered (dimension-matched) and may differ from the + # original list passed to the constructor. 
+ return self._problem_ids + # ------------------------------------------------------------------ # Gymnasium interface # ------------------------------------------------------------------ diff --git a/agents/rl_das/network.py b/agents/rl_das/network.py index 16e08ab..fe4e48d 100644 --- a/agents/rl_das/network.py +++ b/agents/rl_das/network.py @@ -32,7 +32,9 @@ def __init__(self, dim: int) -> None: nn.Linear(dim, 64), nn.ReLU(), nn.Linear(64, 1), - nn.ReLU(), + # No second ReLU: movement vectors are signed displacements. + # Clamping to >= 0 discards direction — the network cannot tell + # whether the optimizer stepped left or right in search space. ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -94,4 +96,7 @@ def __init__(self, dim: int, n_opt: int) -> None: self.head = nn.Linear(16, 1) def forward(self, obs: torch.Tensor) -> torch.Tensor: + # Mirror Actor's NaN guard: a NaN in obs propagates to the value estimate and + # from there into the loss, so backward() would poison every gradient with NaN. + obs = torch.nan_to_num(obs, nan=0.0, posinf=1.0, neginf=-1.0) return self.head(self.backbone(obs)).squeeze(-1) # (batch,) diff --git a/agents/rl_das/trainer.py b/agents/rl_das/trainer.py index ee9b343..4d2b0a6 100644 --- a/agents/rl_das/trainer.py +++ b/agents/rl_das/trainer.py @@ -102,18 +102,24 @@ def train( """ Path(save_dir).mkdir(parents=True, exist_ok=True) log: list[dict] = [] - n_train = len(train_env._problem_ids) + n_train = len(train_env.problem_ids) for epoch in range(1, n_epochs + 1): - epoch_rewards = [] + epoch_rewards: list[float] = [] + epoch_diagnostics: list[dict] = [] epoch_start = time.time() for _ in range(n_train): ep = _run_episode(train_env, agent, deterministic=False) epoch_rewards.append(ep["total_reward"]) - agent.learn(k_epoch) + # bootstrap_value=0.0 is correct: this env only terminates naturally + # (terminated=True, truncated always False), so the last done=True flag + # already zeroes future returns — no critic bootstrap is needed. 
+ diag = agent.learn(k_epoch, bootstrap_value=0.0) agent.rollout.clear() + if diag: + epoch_diagnostics.append(diag) mean_train_reward = float(np.mean(epoch_rewards)) entry: dict = { @@ -122,9 +128,20 @@ def train( "elapsed_s": round(time.time() - epoch_start, 2), } + # Log per-epoch PPO diagnostics so training instability is visible + # (e.g. actor_loss explosion, entropy collapse) without manual debugging. + if epoch_diagnostics: + entry["actor_loss"] = float( + np.mean([d["actor_loss"] for d in epoch_diagnostics]) + ) + entry["critic_loss"] = float( + np.mean([d["critic_loss"] for d in epoch_diagnostics]) + ) + entry["entropy"] = float(np.mean([d["entropy"] for d in epoch_diagnostics])) + if epoch % eval_interval == 0: test_results = evaluate( - test_env, agent, n_episodes=len(test_env._problem_ids) + test_env, agent, n_episodes=len(test_env.problem_ids) ) entry["mean_test_reward"] = float( np.mean([r["total_reward"] for r in test_results]) @@ -137,12 +154,16 @@ def train( f" train_r={mean_train_reward:.4f}" f" test_r={entry['mean_test_reward']:.4f}" f" test_best_y={entry['mean_test_best_y']:.4e}" + f" actor_loss={entry.get('actor_loss', float('nan')):.4f}" + f" entropy={entry.get('entropy', float('nan')):.4f}" f" ({entry['elapsed_s']:.1f}s)" ) else: print( f"Epoch {epoch:4d}/{n_epochs}" f" train_r={mean_train_reward:.4f}" + f" actor_loss={entry.get('actor_loss', float('nan')):.4f}" + f" entropy={entry.get('entropy', float('nan')):.4f}" f" ({entry['elapsed_s']:.1f}s)" ) @@ -186,7 +207,7 @@ def evaluate( List of dicts with keys: problem_id, total_reward, best_y, n_fe. 
""" if n_episodes is None: - n_episodes = len(env._problem_ids) + n_episodes = len(env.problem_ids) results = [] for _ in range(n_episodes): diff --git a/das/env/das_env.py b/das/env/das_env.py index d508b5c..57fbe84 100644 --- a/das/env/das_env.py +++ b/das/env/das_env.py @@ -16,7 +16,7 @@ import gymnasium as gym from gymnasium import spaces -from das.env.observation import compute_observation, observation_dim +from das.env.observation import compute_observation, observation_dim, MAX_HISTORY_SAMPLE from das.env.reward import compute_reward from das.optimizers.base import get_checkpoints @@ -251,14 +251,27 @@ def _update_episode_state(self, result: dict, prev_best_y: float): if worst_y > self._worst_y: self._worst_y = worst_y - # Set initial range on first step + # Set initial range on first step. + # When worst_so_far_y is absent the default is -inf, which collapses + # scale to 1e-5 and inflates every subsequent reward by 1e5. Instead, + # derive scale from the magnitude of the initial best fitness. if self._initial_range[0] == float("inf"): - self._initial_range = (new_best_y, max(worst_y, new_best_y + 1e-5)) + safe_worst = ( + worst_y + if np.isfinite(worst_y) + else new_best_y + max(abs(new_best_y), 1.0) + ) + self._initial_range = (new_best_y, max(safe_worst, new_best_y + 1e-5)) - # Stagnation counter + # Stagnation counter — prefer the FE delta from the result dict so that + # stagnation accumulates correctly even when y_history is not returned. 
x_hist: np.ndarray | None = result.get("x_history") y_hist: np.ndarray | None = result.get("y_history") - n_fe_step = len(y_hist) if y_hist is not None else 0 + n_fe_reported = result.get("n_function_evaluations") + if n_fe_reported is not None: + n_fe_step = max(0, n_fe_reported - self._n_fe) + else: + n_fe_step = len(y_hist) if y_hist is not None else 0 if new_best_y >= prev_best_y: self._stagnation_count += n_fe_step @@ -267,20 +280,23 @@ def _update_episode_state(self, result: dict, prev_best_y: float): self._n_fe = result.get("n_function_evaluations", self._n_fe + n_fe_step) - # Accumulate population history for ELA + # Accumulate population history for ELA, capped at MAX_HISTORY_SAMPLE rows. + # Without the cap, large budgets (e.g. 40-dim × 10 000 FE) accumulate + # hundreds of thousands of rows — GBs of RAM for a single episode. if x_hist is not None and len(x_hist) > 0: self._x_history = ( - x_hist + x_hist[-MAX_HISTORY_SAMPLE:] if self._x_history is None - else np.concatenate([self._x_history, x_hist]) + else np.concatenate([self._x_history, x_hist])[-MAX_HISTORY_SAMPLE:] ) self._y_history = ( - y_hist + y_hist[-MAX_HISTORY_SAMPLE:] if self._y_history is None - else np.concatenate([self._y_history, y_hist]) + else np.concatenate([self._y_history, y_hist])[-MAX_HISTORY_SAMPLE:] ) def _build_observation(self) -> np.ndarray: + return compute_observation( x_history=self._x_history, y_history=self._y_history, diff --git a/das/env/observation.py b/das/env/observation.py index 262d2ba..6f6d354 100644 --- a/das/env/observation.py +++ b/das/env/observation.py @@ -124,9 +124,8 @@ def compute_action_history_features( last_idx = choices_history[-1] last_action[last_idx] = 1.0 - counts = np.array( - [choices_history.count(j) for j in range(n_actions)], dtype=np.float32 - ) + # O(n) instead of O(n_actions * n_steps) from calling list.count in a loop. 
+ counts = np.bincount(choices_history, minlength=n_actions).astype(np.float32) frequencies = counts / len(choices_history) run = 0 diff --git a/das/env/reward.py b/das/env/reward.py index 3098e38..933b262 100644 --- a/das/env/reward.py +++ b/das/env/reward.py @@ -44,7 +44,7 @@ def reward_sparse(new_best_y, old_best_y, initial_range, is_final=False): def reward_binary(new_best_y, old_best_y, initial_range, is_final=False): """Binary: 1 if improvement >= 0.1%, else 0 (original r4).""" if old_best_y == float("inf"): - return float(np.log(initial_range[1] - initial_range[0] + 1e-10)) + return 0.0 ratio = _improvement_ratio(new_best_y, old_best_y, initial_range) return 1.0 if ratio >= 1e-3 else 0.0 diff --git a/das/training/rldas.py b/das/training/rldas.py index 07f32c6..0bfcb5a 100644 --- a/das/training/rldas.py +++ b/das/training/rldas.py @@ -22,8 +22,12 @@ def run_rl_das(args) -> None: suite = IOHSuite() - if args.k_epoch is None: - args.k_epoch = max(1, int(0.3 * args.n_checkpoints)) + # Local variable — avoid mutating args so the caller's namespace stays predictable. + k_epoch = ( + args.k_epoch + if args.k_epoch is not None + else max(1, int(0.3 * args.n_checkpoints)) + ) env_kwargs = dict( suite=suite, @@ -45,7 +49,7 @@ def run_rl_das(args) -> None: print( f"RL-DAS | dim={args.dim} | portfolio={args.portfolio}" f" | obs_dim={train_env.observation_space.shape[0]}" - f" | k_epoch={args.k_epoch}" + f" | k_epoch={k_epoch}" ) train( @@ -53,7 +57,7 @@ def run_rl_das(args) -> None: test_env=test_env, agent=agent, n_epochs=args.n_epochs, - k_epoch=args.k_epoch, + k_epoch=k_epoch, eval_interval=args.eval_interval, save_interval=args.save_interval, save_dir="models", @@ -62,6 +66,15 @@ def run_rl_das(args) -> None: if args.eval: print("\nRunning final evaluation on test set …") + + # Fresh env so _problem_idx starts at 0. 
test_env accumulated increments + # from periodic evaluations inside train() and would start from a rotated + # offset rather than problem 0, making results hard to reproduce. + eval_env = RLDASEnv(problem_ids=test_ids, **env_kwargs) + n_problems = len(test_ids) + test_results = evaluate(eval_env, agent, n_episodes=n_problems) + + # Create the output directory before writing — write_jsonl does not + # create parent directories and would raise FileNotFoundError otherwise. + os.makedirs("results", exist_ok=True) - n_problems = len(test_env._problem_ids) - test_results = evaluate(test_env, agent, n_episodes=n_problems) mean_best_y = float(np.mean([r["best_y"] for r in test_results]))