From d1028c3c5619a8ad1f131e8336467a5033967102 Mon Sep 17 00:00:00 2001 From: Grzmro <106807538+Grzmro@users.noreply.github.com> Date: Tue, 19 May 2026 10:53:35 +0200 Subject: [PATCH 1/5] fix: reward scaling, PPO clipping, ELA memory cap, and eval metrics --- agents/rl_das/agent.py | 21 +++++++------- agents/rl_das/env.py | 7 +++++ agents/rl_das/network.py | 7 ++++- agents/rl_das/trainer.py | 27 ++++++++++++++---- das/env/bbob_splits.py | 6 ++-- das/env/das_env.py | 60 +++++++++++++++++++++++++++++++++------- das/env/observation.py | 37 ++++++++++++++++--------- das/env/reward.py | 10 ++++--- das/training/rldas.py | 22 ++++++++++++--- 9 files changed, 146 insertions(+), 51 deletions(-) diff --git a/agents/rl_das/agent.py b/agents/rl_das/agent.py index 8def163..29a8ea1 100644 --- a/agents/rl_das/agent.py +++ b/agents/rl_das/agent.py @@ -194,17 +194,16 @@ def learn(self, k_epoch: int, bootstrap_value: float = 0.0) -> dict[str, float]: ) actor_loss = -torch.min(surr1, surr2).mean() - # Value clipping (like PPO v2) from the 2nd epoch onward - if epoch_idx > 0: - values_clipped = old_values_t + torch.clamp( - values - old_values_t, -self.eps_clip, self.eps_clip - ) - critic_loss = torch.max( - (values - returns_t.detach()) ** 2, - (values_clipped - returns_t.detach()) ** 2, - ).mean() - else: - critic_loss = (values - returns_t.detach()).pow(2).mean() + # Value clipping applied from the first inner epoch. Skipping it + # on epoch 0 allowed an unconstrained large update on the first step, + # breaking the PPO v2 guarantee that value changes stay within eps_clip. + values_clipped = old_values_t + torch.clamp( + values - old_values_t, -self.eps_clip, self.eps_clip + ) + critic_loss = torch.max( + (values - returns_t.detach()) ** 2, + (values_clipped - returns_t.detach()) ** 2, + ).mean() loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy diff --git a/agents/rl_das/env.py b/agents/rl_das/env.py index d438147..f3de680 100644 --- a/agents/rl_das/env.py +++ b/agents/rl_das/env.py @@ -242,6 +242,13 @@ def __init__( self._best_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)] self._worst_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)] + @property + def problem_ids(self) -> list[str]: + # Public accessor — callers should not reach into _problem_ids directly + # because it is filtered (dimension-matched) and may differ from the + # original list passed to the constructor. + return self._problem_ids + # ------------------------------------------------------------------ # Gymnasium interface # ------------------------------------------------------------------ diff --git a/agents/rl_das/network.py b/agents/rl_das/network.py index 16e08ab..fe4e48d 100644 --- a/agents/rl_das/network.py +++ b/agents/rl_das/network.py @@ -32,7 +32,9 @@ def __init__(self, dim: int) -> None: nn.Linear(dim, 64), nn.ReLU(), nn.Linear(64, 1), - nn.ReLU(), + # No second ReLU: movement vectors are signed displacements. + # Clamping to >= 0 discards direction — the network cannot tell + # whether the optimizer stepped left or right in search space. ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -94,4 +96,7 @@ def __init__(self, dim: int, n_opt: int) -> None: self.head = nn.Linear(16, 1) def forward(self, obs: torch.Tensor) -> torch.Tensor: + # Mirror Actor's NaN guard: a NaN value estimate flows into advantages + # and silently zeroes all gradients via backward(), corrupting the update. + obs = torch.nan_to_num(obs, nan=0.0, posinf=1.0, neginf=-1.0) return self.head(self.backbone(obs)).squeeze(-1) # (batch,) diff --git a/agents/rl_das/trainer.py b/agents/rl_das/trainer.py index ee9b343..0d6d261 100644 --- a/agents/rl_das/trainer.py +++ b/agents/rl_das/trainer.py @@ -102,18 +102,24 @@ def train( """ Path(save_dir).mkdir(parents=True, exist_ok=True) log: list[dict] = [] - n_train = len(train_env._problem_ids) + n_train = len(train_env.problem_ids) for epoch in range(1, n_epochs + 1): - epoch_rewards = [] + epoch_rewards: list[float] = [] + epoch_diagnostics: list[dict] = [] epoch_start = time.time() for _ in range(n_train): ep = _run_episode(train_env, agent, deterministic=False) epoch_rewards.append(ep["total_reward"]) - agent.learn(k_epoch) + # bootstrap_value=0.0 is correct: this env only terminates naturally + # (terminated=True, truncated always False), so the last done=True flag + # already zeroes future returns — no critic bootstrap is needed. + diag = agent.learn(k_epoch, bootstrap_value=0.0) agent.rollout.clear() + if diag: + epoch_diagnostics.append(diag) mean_train_reward = float(np.mean(epoch_rewards)) entry: dict = { @@ -122,9 +128,16 @@ def train( "elapsed_s": round(time.time() - epoch_start, 2), } + # Log per-epoch PPO diagnostics so training instability is visible + # (e.g. actor_loss explosion, entropy collapse) without manual debugging. + if epoch_diagnostics: + entry["actor_loss"] = float(np.mean([d["actor_loss"] for d in epoch_diagnostics])) + entry["critic_loss"] = float(np.mean([d["critic_loss"] for d in epoch_diagnostics])) + entry["entropy"] = float(np.mean([d["entropy"] for d in epoch_diagnostics])) + if epoch % eval_interval == 0: test_results = evaluate( - test_env, agent, n_episodes=len(test_env._problem_ids) + test_env, agent, n_episodes=len(test_env.problem_ids) ) entry["mean_test_reward"] = float( np.mean([r["total_reward"] for r in test_results]) @@ -137,12 +150,16 @@ def train( f" train_r={mean_train_reward:.4f}" f" test_r={entry['mean_test_reward']:.4f}" f" test_best_y={entry['mean_test_best_y']:.4e}" + f" actor_loss={entry.get('actor_loss', float('nan')):.4f}" + f" entropy={entry.get('entropy', float('nan')):.4f}" f" ({entry['elapsed_s']:.1f}s)" ) else: print( f"Epoch {epoch:4d}/{n_epochs}" f" train_r={mean_train_reward:.4f}" + f" actor_loss={entry.get('actor_loss', float('nan')):.4f}" + f" entropy={entry.get('entropy', float('nan')):.4f}" f" ({entry['elapsed_s']:.1f}s)" ) @@ -186,7 +203,7 @@ def evaluate( List of dicts with keys: problem_id, total_reward, best_y, n_fe. """ if n_episodes is None: - n_episodes = len(env._problem_ids) + n_episodes = len(env.problem_ids) results = [] for _ in range(n_episodes): diff --git a/das/env/bbob_splits.py b/das/env/bbob_splits.py index bc9a8da..8794163 100644 --- a/das/env/bbob_splits.py +++ b/das/env/bbob_splits.py @@ -7,7 +7,7 @@ ALL_DIMS = [2, 3, 5, 10, 20, 40] ALL_FUNCTIONS = set(range(1, 25)) INSTANCE_IDS = [1, 2, 3, 4, 5, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80] -EASY_TRAIN_FUNCTIONS = {4, *range(6, 15), 18, 19, 20, 22, 23, 24} +EASY_TRAIN_FUNCTIONS = {1, 2, 3, 4, *range(6, 15), 18, 19, 20, 22, 23, 24} # Czy tutaj nie powinny być też funkcje 1,2,3? def build_problem_ids( @@ -22,7 +22,7 @@ def build_problem_ids( ] -def get_train_test_split(mode: str, dims: list[int]) -> tuple[list[str], list[str]]: +def get_train_test_split(mode: str, dims: list[int], seed: int = 0) -> tuple[list[str], list[str]]: """Return (train_ids, test_ids) for the given split mode and dimensions. Modes: @@ -42,7 +42,7 @@ def get_train_test_split(mode: str, dims: list[int]) -> tuple[list[str], list[st ) # random 2/3 – 1/3 split all_ids = build_problem_ids(ALL_FUNCTIONS, dims) - rng = np.random.default_rng() + rng = np.random.default_rng(seed) rng.shuffle(all_ids) split = 2 * len(all_ids) // 3 return all_ids[:split], all_ids[split:] diff --git a/das/env/das_env.py b/das/env/das_env.py index d508b5c..e45d4f3 100644 --- a/das/env/das_env.py +++ b/das/env/das_env.py @@ -16,10 +16,21 @@ import gymnasium as gym from gymnasium import spaces -from das.env.observation import compute_observation, observation_dim +from das.env.observation import ( + compute_observation, + observation_dim, + compute_ela_features, + MAX_HISTORY_SAMPLE, + ELA_DIM, +) from das.env.reward import compute_reward from das.optimizers.base import get_checkpoints +# Recompute ELA every ~500 new population samples. pflacco runs regression, +# nearest-neighbour search, and IC calculations on every call — running it +# every step would dominate wall-clock time for long training runs. +_ELA_RECOMPUTE_THRESHOLD = MAX_HISTORY_SAMPLE // 5 + class DASEnv(gym.Env): """DAS environment. @@ -110,6 +121,11 @@ def __init__( self._stagnation_count = 0 self._choices_history: list[int] = [] + # ELA features are expensive; cache the last computed vector and refresh + # lazily once _ELA_RECOMPUTE_THRESHOLD new samples have arrived. + self._ela_cache: np.ndarray = np.zeros(ELA_DIM, dtype=np.float32) + self._ela_cache_len: int = 0 + # ------------------------------------------------------------------ # # Gymnasium interface # # ------------------------------------------------------------------ # @@ -140,6 +156,8 @@ def reset(self, seed=None, options=None): self._initial_range = (float("inf"), -np.inf) self._stagnation_count = 0 self._choices_history = [] + self._ela_cache = np.zeros(ELA_DIM, dtype=np.float32) + self._ela_cache_len = 0 obs = self._build_observation() info = {"problem_id": problem_id, "dimension": dim} @@ -251,14 +269,25 @@ def _update_episode_state(self, result: dict, prev_best_y: float): if worst_y > self._worst_y: self._worst_y = worst_y - # Set initial range on first step + # Set initial range on first step. + # When worst_so_far_y is absent the default is -inf, which collapses + # scale to 1e-5 and inflates every subsequent reward by 1e5. Instead, + # derive scale from the magnitude of the initial best fitness. if self._initial_range[0] == float("inf"): - self._initial_range = (new_best_y, max(worst_y, new_best_y + 1e-5)) + safe_worst = ( + worst_y if np.isfinite(worst_y) else new_best_y + max(abs(new_best_y), 1.0) + ) + self._initial_range = (new_best_y, max(safe_worst, new_best_y + 1e-5)) - # Stagnation counter + # Stagnation counter — prefer the FE delta from the result dict so that + # stagnation accumulates correctly even when y_history is not returned. x_hist: np.ndarray | None = result.get("x_history") y_hist: np.ndarray | None = result.get("y_history") - n_fe_step = len(y_hist) if y_hist is not None else 0 + n_fe_reported = result.get("n_function_evaluations") + if n_fe_reported is not None: + n_fe_step = max(0, n_fe_reported - self._n_fe) + else: + n_fe_step = len(y_hist) if y_hist is not None else 0 if new_best_y >= prev_best_y: self._stagnation_count += n_fe_step @@ -267,20 +296,30 @@ def _update_episode_state(self, result: dict, prev_best_y: float): self._n_fe = result.get("n_function_evaluations", self._n_fe + n_fe_step) - # Accumulate population history for ELA + # Accumulate population history for ELA, capped at MAX_HISTORY_SAMPLE rows. + # Without the cap, large budgets (e.g. 40-dim × 10 000 FE) accumulate + # hundreds of thousands of rows — GBs of RAM for a single episode. if x_hist is not None and len(x_hist) > 0: self._x_history = ( - x_hist + x_hist[-MAX_HISTORY_SAMPLE:] if self._x_history is None - else np.concatenate([self._x_history, x_hist]) + else np.concatenate([self._x_history, x_hist])[-MAX_HISTORY_SAMPLE:] ) self._y_history = ( - y_hist + y_hist[-MAX_HISTORY_SAMPLE:] if self._y_history is None - else np.concatenate([self._y_history, y_hist]) + else np.concatenate([self._y_history, y_hist])[-MAX_HISTORY_SAMPLE:] ) def _build_observation(self) -> np.ndarray: + # Recompute ELA only when enough new samples have arrived. + # _ela_cache starts as zeros (correct before 50 samples) and is reset + # each episode, so stale features from a previous episode never leak in. + current_len = len(self._x_history) if self._x_history is not None else 0 + if current_len >= 50 and current_len - self._ela_cache_len >= _ELA_RECOMPUTE_THRESHOLD: + self._ela_cache = compute_ela_features(self._x_history, self._y_history) + self._ela_cache_len = current_len + return compute_observation( x_history=self._x_history, y_history=self._y_history, @@ -291,4 +330,5 @@ def _build_observation(self) -> np.ndarray: max_fe=max(self._max_fe, 1), stagnation_count=self._stagnation_count, ndim_problem=self._problem.dimension if self._problem is not None else 1, + ela=self._ela_cache, ) diff --git a/das/env/observation.py b/das/env/observation.py index 262d2ba..95b38fe 100644 --- a/das/env/observation.py +++ b/das/env/observation.py @@ -58,10 +58,12 @@ def compute_ela_features(x: np.ndarray, y: np.ndarray) -> np.ndarray: with warnings.catch_warnings(): warnings.simplefilter("ignore") - _, unique_idx = np.unique(x, axis=0, return_index=True) - unique_idx = np.sort(unique_idx) - x = x[unique_idx][-MAX_HISTORY_SAMPLE:] - y = y[unique_idx][-MAX_HISTORY_SAMPLE:] + # Slice to the most-recent samples first; deduplication is done below + # in normalised space where it is actually meaningful — raw-space + # np.unique missed points that become identical after normalisation and + # was therefore doing redundant work without full correctness guarantees. + x = x[-MAX_HISTORY_SAMPLE:] + y = y[-MAX_HISTORY_SAMPLE:] x_norm_arr = (x - x.mean()) / (x.std() + 1e-8) y_norm_arr = (y - y.mean()) / (y.std() + 1e-8) @@ -96,8 +98,14 @@ def compute_ela_features(x: np.ndarray, y: np.ndarray) -> np.ndarray: ) } - all_feats = {**meta, **nbc, **disp, **ic, **ela_distr} - return np.array([all_feats[k] for k in ELA_FEATURE_KEYS], dtype=np.float32) + # pflacco may return an incomplete dict for degenerate or edge-case + # inputs that slipped past the variance guard above. Fall back to + # zeros rather than crashing training with a KeyError mid-run. + try: + all_feats = {**meta, **nbc, **disp, **ic, **ela_distr} + return np.array([all_feats[k] for k in ELA_FEATURE_KEYS], dtype=np.float32) + except (KeyError, ValueError): + return np.zeros(ELA_DIM, dtype=np.float32) def compute_action_history_features( @@ -124,9 +132,8 @@ def compute_action_history_features( last_idx = choices_history[-1] last_action[last_idx] = 1.0 - counts = np.array( - [choices_history.count(j) for j in range(n_actions)], dtype=np.float32 - ) + # O(n) instead of O(n_actions * n_steps) from calling list.count in a loop. + counts = np.bincount(choices_history, minlength=n_actions).astype(np.float32) frequencies = counts / len(choices_history) run = 0 @@ -165,12 +172,16 @@ def compute_observation( max_fe: int, stagnation_count: int, ndim_problem: int, + ela: np.ndarray | None = None, ) -> np.ndarray: """Assemble the full observation vector from its components.""" - if x_history is not None and y_history is not None and len(x_history) >= 50: - ela = compute_ela_features(x_history, y_history) - else: - ela = np.zeros(ELA_DIM, dtype=np.float32) + # Accept a pre-computed ELA vector so the caller can cache it across steps + # and avoid running pflacco on every observation build (pflacco is expensive). + if ela is None: + if x_history is not None and y_history is not None and len(x_history) >= 50: + ela = compute_ela_features(x_history, y_history) + else: + ela = np.zeros(ELA_DIM, dtype=np.float32) action_hist = compute_action_history_features( choices_history, n_actions, n_checkpoints, ndim_problem diff --git a/das/env/reward.py b/das/env/reward.py index 3098e38..eb31fb0 100644 --- a/das/env/reward.py +++ b/das/env/reward.py @@ -18,7 +18,7 @@ def _improvement_ratio( def reward_log_scaled(new_best_y, old_best_y, initial_range, is_final=False): """Log-scaled incremental improvement (original r1).""" if old_best_y == float("inf"): - return float(np.log(initial_range[1] - initial_range[0] + 1e-10)) + return 0.0 ratio = _improvement_ratio(new_best_y, old_best_y, initial_range) return float(np.log(np.clip(ratio, 0.0, 1.0) + 1e-5)) @@ -26,7 +26,9 @@ def reward_log_scaled(new_best_y, old_best_y, initial_range, is_final=False): def reward_linear(new_best_y, old_best_y, initial_range, is_final=False): """Linear improvement clipped to [0, 1] (original r2).""" if old_best_y == float("inf"): - return float(np.log(initial_range[1] - initial_range[0] + 1e-10)) + # No prior best on the first step — returning log(scale) here would + # produce a value outside [0, 1] and break the linear contract. + return 0.0 return float( np.clip(_improvement_ratio(new_best_y, old_best_y, initial_range), 0.0, 1.0) ) @@ -35,7 +37,7 @@ def reward_linear(new_best_y, old_best_y, initial_range, is_final=False): def reward_sparse(new_best_y, old_best_y, initial_range, is_final=False): """Sparse: only reward at the final checkpoint (original r3).""" if old_best_y == float("inf") or not is_final: - return float(np.log(initial_range[1] - initial_range[0] + 1e-10)) + return 0.0 total_improvement = initial_range[0] - new_best_y scale = initial_range[1] - initial_range[0] return float(np.log(total_improvement / (scale + 1e-10) + 1e-5)) @@ -44,7 +46,7 @@ def reward_sparse(new_best_y, old_best_y, initial_range, is_final=False): def reward_binary(new_best_y, old_best_y, initial_range, is_final=False): """Binary: 1 if improvement >= 0.1%, else 0 (original r4).""" if old_best_y == float("inf"): - return float(np.log(initial_range[1] - initial_range[0] + 1e-10)) + return 0.0 ratio = _improvement_ratio(new_best_y, old_best_y, initial_range) return 1.0 if ratio >= 1e-3 else 0.0 diff --git a/das/training/rldas.py b/das/training/rldas.py index 07f32c6..588e2b7 100644 --- a/das/training/rldas.py +++ b/das/training/rldas.py @@ -22,8 +22,11 @@ def run_rl_das(args) -> None: suite = IOHSuite() - if args.k_epoch is None: - args.k_epoch = max(1, int(0.3 * args.n_checkpoints)) + # Local variable — avoid mutating args so the caller's namespace stays predictable. + k_epoch = ( + args.k_epoch if args.k_epoch is not None + else max(1, int(0.3 * args.n_checkpoints)) + ) env_kwargs = dict( suite=suite, @@ -45,7 +48,7 @@ def run_rl_das(args) -> None: print( f"RL-DAS | dim={args.dim} | portfolio={args.portfolio}" f" | obs_dim={train_env.observation_space.shape[0]}" - f" | k_epoch={args.k_epoch}" + f" | k_epoch={k_epoch}" ) train( @@ -53,7 +56,7 @@ def run_rl_das(args) -> None: test_env=test_env, agent=agent, n_epochs=args.n_epochs, - k_epoch=args.k_epoch, + k_epoch=k_epoch, eval_interval=args.eval_interval, save_interval=args.save_interval, save_dir="models", @@ -62,6 +65,17 @@ def run_rl_das(args) -> None: if args.eval: print("\nRunning final evaluation on test set …") + + # Fresh env so _problem_idx starts at 0. test_env accumulated increments + # from periodic evaluations inside train() and would start from a rotated + # offset rather than problem 0, making results hard to reproduce. + eval_env = RLDASEnv(problem_ids=test_ids, **env_kwargs) + n_problems = len(test_ids) + test_results = evaluate(eval_env, agent, n_episodes=n_problems) + + # Create the output directory before writing — write_jsonl does not + # create parent directories and would raise FileNotFoundError otherwise. + os.makedirs("results", exist_ok=True) n_problems = len(test_env._problem_ids) test_results = evaluate(test_env, agent, n_episodes=n_problems) mean_best_y = float(np.mean([r["best_y"] for r in test_results])) From b44e47bf33269ba64754714f0df53d33c5c12a03 Mon Sep 17 00:00:00 2001 From: Grzmro <106807538+Grzmro@users.noreply.github.com> Date: Tue, 19 May 2026 12:55:22 +0200 Subject: [PATCH 2/5] feat: add ela_recompute_every parameter for ELA feature recomputation control --- das/env/das_env.py | 11 ++++++----- das/training/common.py | 1 + das/training/ppo.py | 1 + train.py | 10 ++++++++++ 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/das/env/das_env.py b/das/env/das_env.py index e45d4f3..aef3ff1 100644 --- a/das/env/das_env.py +++ b/das/env/das_env.py @@ -26,10 +26,6 @@ from das.env.reward import compute_reward from das.optimizers.base import get_checkpoints -# Recompute ELA every ~500 new population samples. pflacco runs regression, -# nearest-neighbour search, and IC calculations on every call — running it -# every step would dominate wall-clock time for long training runs. -_ELA_RECOMPUTE_THRESHOLD = MAX_HISTORY_SAMPLE // 5 class DASEnv(gym.Env): @@ -71,6 +67,7 @@ def __init__( reward_option: int = 1, n_individuals: int | list[int | None] | None = None, seed: int | None = None, + ela_recompute_every: int = MAX_HISTORY_SAMPLE // 5 # ~500, ): super().__init__() self.problem_ids = problem_ids @@ -93,6 +90,7 @@ def __init__( ) self.n_individuals = pop self._seed = seed + self._ela_recompute_every = max(1, ela_recompute_every) n_actions = len(optimizers) obs_dim = observation_dim(n_actions) @@ -316,7 +314,10 @@ def _build_observation(self) -> np.ndarray: # _ela_cache starts as zeros (correct before 50 samples) and is reset # each episode, so stale features from a previous episode never leak in. current_len = len(self._x_history) if self._x_history is not None else 0 - if current_len >= 50 and current_len - self._ela_cache_len >= _ELA_RECOMPUTE_THRESHOLD: + if current_len >= 50 and ( + self._ela_cache_len == 0 + or current_len - self._ela_cache_len >= self._ela_recompute_every + ): self._ela_cache = compute_ela_features(self._x_history, self._y_history) self._ela_cache_len = current_len diff --git a/das/training/common.py b/das/training/common.py index b8ac184..f2e2244 100644 --- a/das/training/common.py +++ b/das/training/common.py @@ -130,6 +130,7 @@ def _init(): reward_option=cfg["reward_option"], n_individuals=cfg["n_individuals"], seed=cfg.get("seed"), + ela_recompute_every=cfg.get("ela_recompute_every", 500), ) return _init diff --git a/das/training/ppo.py b/das/training/ppo.py index 79fdf2d..1cc4020 100644 --- a/das/training/ppo.py +++ b/das/training/ppo.py @@ -138,6 +138,7 @@ def run_ppo(args) -> None: "reward_option": args.reward_option, "n_individuals": args.n_individuals, "seed": args.seed, + "ela_recompute_every": args.ela_recompute_every, } print(f"Portfolio : {args.portfolio}") diff --git a/train.py b/train.py index 7c64d2d..6b058a4 100644 --- a/train.py +++ b/train.py @@ -78,6 +78,16 @@ def _add_shared_args( help="Population size override (default: each algorithm uses its own built-in default)", ) p.add_argument("--seed", type=int, default=42) + p.add_argument( + "--ela-recompute-every", + type=int, + default=500, + help=( + "Recompute ELA features every N new population samples. " + "Set to 1 to recompute on every step (slow but maximally fresh). " + "Default: 500." + ), + ) def _parse_args() -> argparse.Namespace: From e840dba8f6be19dacb17e4c914e10bc21b624917 Mon Sep 17 00:00:00 2001 From: Grzmro <106807538+Grzmro@users.noreply.github.com> Date: Mon, 25 May 2026 13:28:01 +0200 Subject: [PATCH 3/5] Revert "feat: add ela_recompute_every parameter for ELA feature recomputation control" This reverts commit 2a1c641e42e31957de83af7013c0cf9ce126e12a. --- das/env/das_env.py | 11 +++++------ das/training/common.py | 1 - das/training/ppo.py | 1 - train.py | 10 ---------- 4 files changed, 5 insertions(+), 18 deletions(-) diff --git a/das/env/das_env.py b/das/env/das_env.py index aef3ff1..e45d4f3 100644 --- a/das/env/das_env.py +++ b/das/env/das_env.py @@ -26,6 +26,10 @@ from das.env.reward import compute_reward from das.optimizers.base import get_checkpoints +# Recompute ELA every ~500 new population samples. pflacco runs regression, +# nearest-neighbour search, and IC calculations on every call — running it +# every step would dominate wall-clock time for long training runs. +_ELA_RECOMPUTE_THRESHOLD = MAX_HISTORY_SAMPLE // 5 class DASEnv(gym.Env): @@ -67,7 +71,6 @@ def __init__( reward_option: int = 1, n_individuals: int | list[int | None] | None = None, seed: int | None = None, - ela_recompute_every: int = MAX_HISTORY_SAMPLE // 5 # ~500, ): super().__init__() self.problem_ids = problem_ids @@ -90,7 +93,6 @@ def __init__( ) self.n_individuals = pop self._seed = seed - self._ela_recompute_every = max(1, ela_recompute_every) n_actions = len(optimizers) obs_dim = observation_dim(n_actions) @@ -314,10 +316,7 @@ def _build_observation(self) -> np.ndarray: # _ela_cache starts as zeros (correct before 50 samples) and is reset # each episode, so stale features from a previous episode never leak in. current_len = len(self._x_history) if self._x_history is not None else 0 - if current_len >= 50 and ( - self._ela_cache_len == 0 - or current_len - self._ela_cache_len >= self._ela_recompute_every - ): + if current_len >= 50 and current_len - self._ela_cache_len >= _ELA_RECOMPUTE_THRESHOLD: self._ela_cache = compute_ela_features(self._x_history, self._y_history) self._ela_cache_len = current_len diff --git a/das/training/common.py b/das/training/common.py index f2e2244..b8ac184 100644 --- a/das/training/common.py +++ b/das/training/common.py @@ -130,7 +130,6 @@ def _init(): reward_option=cfg["reward_option"], n_individuals=cfg["n_individuals"], seed=cfg.get("seed"), - ela_recompute_every=cfg.get("ela_recompute_every", 500), ) return _init diff --git a/das/training/ppo.py b/das/training/ppo.py index 1cc4020..79fdf2d 100644 --- a/das/training/ppo.py +++ b/das/training/ppo.py @@ -138,7 +138,6 @@ def run_ppo(args) -> None: "reward_option": args.reward_option, "n_individuals": args.n_individuals, "seed": args.seed, - "ela_recompute_every": args.ela_recompute_every, } print(f"Portfolio : {args.portfolio}") diff --git a/train.py b/train.py index 6b058a4..7c64d2d 100644 --- a/train.py +++ b/train.py @@ -78,16 +78,6 @@ def _add_shared_args( help="Population size override (default: each algorithm uses its own built-in default)", ) p.add_argument("--seed", type=int, default=42) - p.add_argument( - "--ela-recompute-every", - type=int, - default=500, - help=( - "Recompute ELA features every N new population samples. " - "Set to 1 to recompute on every step (slow but maximally fresh). " - "Default: 500." - ), - ) def _parse_args() -> argparse.Namespace: From 592a7c7259e525bad00fc68ba1f647fca9aff14b Mon Sep 17 00:00:00 2001 From: Grzmro <106807538+Grzmro@users.noreply.github.com> Date: Mon, 25 May 2026 13:57:50 +0200 Subject: [PATCH 4/5] refactor --- das/env/bbob_splits.py | 6 +++--- das/env/das_env.py | 28 +--------------------------- das/env/observation.py | 32 ++++++++++---------------------- das/env/reward.py | 8 +++----- 4 files changed, 17 insertions(+), 57 deletions(-) diff --git a/das/env/bbob_splits.py b/das/env/bbob_splits.py index 8794163..bc9a8da 100644 --- a/das/env/bbob_splits.py +++ b/das/env/bbob_splits.py @@ -7,7 +7,7 @@ ALL_DIMS = [2, 3, 5, 10, 20, 40] ALL_FUNCTIONS = set(range(1, 25)) INSTANCE_IDS = [1, 2, 3, 4, 5, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80] -EASY_TRAIN_FUNCTIONS = {1, 2, 3, 4, *range(6, 15), 18, 19, 20, 22, 23, 24} # Czy tutaj nie powinny być też funkcje 1,2,3? +EASY_TRAIN_FUNCTIONS = {4, *range(6, 15), 18, 19, 20, 22, 23, 24} def build_problem_ids( @@ -22,7 +22,7 @@ def build_problem_ids( ] -def get_train_test_split(mode: str, dims: list[int], seed: int = 0) -> tuple[list[str], list[str]]: +def get_train_test_split(mode: str, dims: list[int]) -> tuple[list[str], list[str]]: """Return (train_ids, test_ids) for the given split mode and dimensions. Modes: @@ -42,7 +42,7 @@ def get_train_test_split(mode: str, dims: list[int], seed: int = 0) -> tuple[lis ) # random 2/3 – 1/3 split all_ids = build_problem_ids(ALL_FUNCTIONS, dims) - rng = np.random.default_rng(seed) + rng = np.random.default_rng() rng.shuffle(all_ids) split = 2 * len(all_ids) // 3 return all_ids[:split], all_ids[split:] diff --git a/das/env/das_env.py b/das/env/das_env.py index e45d4f3..1fc06d3 100644 --- a/das/env/das_env.py +++ b/das/env/das_env.py @@ -16,21 +16,10 @@ import gymnasium as gym from gymnasium import spaces -from das.env.observation import ( - compute_observation, - observation_dim, - compute_ela_features, - MAX_HISTORY_SAMPLE, - ELA_DIM, -) +from das.env.observation import (compute_observation, observation_dim, MAX_HISTORY_SAMPLE) from das.env.reward import compute_reward from das.optimizers.base import get_checkpoints -# Recompute ELA every ~500 new population samples. pflacco runs regression, -# nearest-neighbour search, and IC calculations on every call — running it -# every step would dominate wall-clock time for long training runs. -_ELA_RECOMPUTE_THRESHOLD = MAX_HISTORY_SAMPLE // 5 - class DASEnv(gym.Env): """DAS environment. @@ -121,11 +110,6 @@ def __init__( self._stagnation_count = 0 self._choices_history: list[int] = [] - # ELA features are expensive; cache the last computed vector and refresh - # lazily once _ELA_RECOMPUTE_THRESHOLD new samples have arrived. - self._ela_cache: np.ndarray = np.zeros(ELA_DIM, dtype=np.float32) - self._ela_cache_len: int = 0 - # ------------------------------------------------------------------ # # Gymnasium interface # # ------------------------------------------------------------------ # @@ -156,8 +140,6 @@ def reset(self, seed=None, options=None): self._initial_range = (float("inf"), -np.inf) self._stagnation_count = 0 self._choices_history = [] - self._ela_cache = np.zeros(ELA_DIM, dtype=np.float32) - self._ela_cache_len = 0 obs = self._build_observation() info = {"problem_id": problem_id, "dimension": dim} @@ -312,13 +294,6 @@ def _update_episode_state(self, result: dict, prev_best_y: float): ) def _build_observation(self) -> np.ndarray: - # Recompute ELA only when enough new samples have arrived. - # _ela_cache starts as zeros (correct before 50 samples) and is reset - # each episode, so stale features from a previous episode never leak in. - current_len = len(self._x_history) if self._x_history is not None else 0 - if current_len >= 50 and current_len - self._ela_cache_len >= _ELA_RECOMPUTE_THRESHOLD: - self._ela_cache = compute_ela_features(self._x_history, self._y_history) - self._ela_cache_len = current_len return compute_observation( x_history=self._x_history, @@ -330,5 +305,4 @@ def _build_observation(self) -> np.ndarray: max_fe=max(self._max_fe, 1), stagnation_count=self._stagnation_count, ndim_problem=self._problem.dimension if self._problem is not None else 1, - ela=self._ela_cache, ) diff --git a/das/env/observation.py b/das/env/observation.py index 95b38fe..6f6d354 100644 --- a/das/env/observation.py +++ b/das/env/observation.py @@ -58,12 +58,10 @@ def compute_ela_features(x: np.ndarray, y: np.ndarray) -> np.ndarray: with warnings.catch_warnings(): warnings.simplefilter("ignore") - # Slice to the most-recent samples first; deduplication is done below - # in normalised space where it is actually meaningful — raw-space - # np.unique missed points that become identical after normalisation and - # was therefore doing redundant work without full correctness guarantees. - x = x[-MAX_HISTORY_SAMPLE:] - y = y[-MAX_HISTORY_SAMPLE:] + _, unique_idx = np.unique(x, axis=0, return_index=True) + unique_idx = np.sort(unique_idx) + x = x[unique_idx][-MAX_HISTORY_SAMPLE:] + y = y[unique_idx][-MAX_HISTORY_SAMPLE:] x_norm_arr = (x - x.mean()) / (x.std() + 1e-8) y_norm_arr = (y - y.mean()) / (y.std() + 1e-8) @@ -98,14 +96,8 @@ def compute_ela_features(x: np.ndarray, y: np.ndarray) -> np.ndarray: ) } - # pflacco may return an incomplete dict for degenerate or edge-case - # inputs that slipped past the variance guard above. Fall back to - # zeros rather than crashing training with a KeyError mid-run. - try: - all_feats = {**meta, **nbc, **disp, **ic, **ela_distr} - return np.array([all_feats[k] for k in ELA_FEATURE_KEYS], dtype=np.float32) - except (KeyError, ValueError): - return np.zeros(ELA_DIM, dtype=np.float32) + all_feats = {**meta, **nbc, **disp, **ic, **ela_distr} + return np.array([all_feats[k] for k in ELA_FEATURE_KEYS], dtype=np.float32) def compute_action_history_features( @@ -172,16 +164,12 @@ def compute_observation( max_fe: int, stagnation_count: int, ndim_problem: int, - ela: np.ndarray | None = None, ) -> np.ndarray: """Assemble the full observation vector from its components.""" - # Accept a pre-computed ELA vector so the caller can cache it across steps - # and avoid running pflacco on every observation build (pflacco is expensive). - if ela is None: - if x_history is not None and y_history is not None and len(x_history) >= 50: - ela = compute_ela_features(x_history, y_history) - else: - ela = np.zeros(ELA_DIM, dtype=np.float32) + if x_history is not None and y_history is not None and len(x_history) >= 50: + ela = compute_ela_features(x_history, y_history) + else: + ela = np.zeros(ELA_DIM, dtype=np.float32) action_hist = compute_action_history_features( choices_history, n_actions, n_checkpoints, ndim_problem diff --git a/das/env/reward.py b/das/env/reward.py index eb31fb0..933b262 100644 --- a/das/env/reward.py +++ b/das/env/reward.py @@ -18,7 +18,7 @@ def _improvement_ratio( def reward_log_scaled(new_best_y, old_best_y, initial_range, is_final=False): """Log-scaled incremental improvement (original r1).""" if old_best_y == float("inf"): - return 0.0 + return float(np.log(initial_range[1] - initial_range[0] + 1e-10)) ratio = _improvement_ratio(new_best_y, old_best_y, initial_range) return float(np.log(np.clip(ratio, 0.0, 1.0) + 1e-5)) @@ -26,9 +26,7 @@ def reward_log_scaled(new_best_y, old_best_y, initial_range, is_final=False): def reward_linear(new_best_y, old_best_y, initial_range, is_final=False): """Linear improvement clipped to [0, 1] (original r2).""" if old_best_y == float("inf"): - # No prior best on the first step — returning log(scale) here would - # produce a value outside [0, 1] and break the linear contract. - return 0.0 + return float(np.log(initial_range[1] - initial_range[0] + 1e-10)) return float( np.clip(_improvement_ratio(new_best_y, old_best_y, initial_range), 0.0, 1.0) ) @@ -37,7 +35,7 @@ def reward_linear(new_best_y, old_best_y, initial_range, is_final=False): def reward_sparse(new_best_y, old_best_y, initial_range, is_final=False): """Sparse: only reward at the final checkpoint (original r3).""" if old_best_y == float("inf") or not is_final: - return 0.0 + return float(np.log(initial_range[1] - initial_range[0] + 1e-10)) total_improvement = initial_range[0] - new_best_y scale = initial_range[1] - initial_range[0] return float(np.log(total_improvement / (scale + 1e-10) + 1e-5)) From c93efb888ff88259c072e10921c736cdae294192 Mon Sep 17 00:00:00 2001 From: wniec Date: Fri, 29 May 2026 20:12:31 +0200 Subject: [PATCH 5/5] ruff fix --- agents/rl_das/trainer.py | 8 ++++++-- das/env/das_env.py | 6 ++++-- das/training/rldas.py | 3 ++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/agents/rl_das/trainer.py b/agents/rl_das/trainer.py index 0d6d261..4d2b0a6 100644 --- a/agents/rl_das/trainer.py +++ b/agents/rl_das/trainer.py @@ -131,8 +131,12 @@ def train( # Log per-epoch PPO diagnostics so training instability is visible # (e.g. actor_loss explosion, entropy collapse) without manual debugging. if epoch_diagnostics: - entry["actor_loss"] = float(np.mean([d["actor_loss"] for d in epoch_diagnostics])) - entry["critic_loss"] = float(np.mean([d["critic_loss"] for d in epoch_diagnostics])) + entry["actor_loss"] = float( + np.mean([d["actor_loss"] for d in epoch_diagnostics]) + ) + entry["critic_loss"] = float( + np.mean([d["critic_loss"] for d in epoch_diagnostics]) + ) entry["entropy"] = float(np.mean([d["entropy"] for d in epoch_diagnostics])) if epoch % eval_interval == 0: diff --git a/das/env/das_env.py b/das/env/das_env.py index 1fc06d3..57fbe84 100644 --- a/das/env/das_env.py +++ b/das/env/das_env.py @@ -16,7 +16,7 @@ import gymnasium as gym from gymnasium import spaces -from das.env.observation import (compute_observation, observation_dim, MAX_HISTORY_SAMPLE) +from das.env.observation import compute_observation, observation_dim, MAX_HISTORY_SAMPLE from das.env.reward import compute_reward from das.optimizers.base import get_checkpoints @@ -257,7 +257,9 @@ def _update_episode_state(self, result: dict, prev_best_y: float): # derive scale from the magnitude of the initial best fitness. if self._initial_range[0] == float("inf"): safe_worst = ( - worst_y if np.isfinite(worst_y) else new_best_y + max(abs(new_best_y), 1.0) + worst_y + if np.isfinite(worst_y) + else new_best_y + max(abs(new_best_y), 1.0) ) self._initial_range = (new_best_y, max(safe_worst, new_best_y + 1e-5)) diff --git a/das/training/rldas.py b/das/training/rldas.py index 588e2b7..0bfcb5a 100644 --- a/das/training/rldas.py +++ b/das/training/rldas.py @@ -24,7 +24,8 @@ def run_rl_das(args) -> None: # Local variable — avoid mutating args so the caller's namespace stays predictable. k_epoch = ( - args.k_epoch if args.k_epoch is not None + args.k_epoch + if args.k_epoch is not None else max(1, int(0.3 * args.n_checkpoints)) )