Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 10 additions & 11 deletions agents/rl_das/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,17 +194,16 @@ def learn(self, k_epoch: int, bootstrap_value: float = 0.0) -> dict[str, float]:
)
actor_loss = -torch.min(surr1, surr2).mean()

# Value clipping (like PPO v2) from the 2nd epoch onward
if epoch_idx > 0:
values_clipped = old_values_t + torch.clamp(
values - old_values_t, -self.eps_clip, self.eps_clip
)
critic_loss = torch.max(
(values - returns_t.detach()) ** 2,
(values_clipped - returns_t.detach()) ** 2,
).mean()
else:
critic_loss = (values - returns_t.detach()).pow(2).mean()
# Value clipping applied from the first inner epoch. Skipping it
# on epoch 0 allowed an unconstrained large update on the first step,
# breaking the PPO v2 guarantee that value changes stay within eps_clip.
values_clipped = old_values_t + torch.clamp(
values - old_values_t, -self.eps_clip, self.eps_clip
)
critic_loss = torch.max(
(values - returns_t.detach()) ** 2,
(values_clipped - returns_t.detach()) ** 2,
).mean()

loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy

Expand Down
7 changes: 7 additions & 0 deletions agents/rl_das/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,13 @@ def __init__(
self._best_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)]
self._worst_history: list[list[np.ndarray]] = [[] for _ in range(self.n_opt)]

@property
def problem_ids(self) -> list[str]:
# Public accessor — callers should not reach into _problem_ids directly
# because it is filtered (dimension-matched) and may differ from the
# original list passed to the constructor.
return self._problem_ids

# ------------------------------------------------------------------
# Gymnasium interface
# ------------------------------------------------------------------
Expand Down
7 changes: 6 additions & 1 deletion agents/rl_das/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def __init__(self, dim: int) -> None:
nn.Linear(dim, 64),
nn.ReLU(),
nn.Linear(64, 1),
nn.ReLU(),
# No second ReLU: movement vectors are signed displacements.
# Clamping to >= 0 discards direction — the network cannot tell
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense

# whether the optimizer stepped left or right in search space.
)

def forward(self, x: torch.Tensor) -> torch.Tensor:
Expand Down Expand Up @@ -94,4 +96,7 @@ def __init__(self, dim: int, n_opt: int) -> None:
self.head = nn.Linear(16, 1)

def forward(self, obs: torch.Tensor) -> torch.Tensor:
# Mirror Actor's NaN guard: a NaN value estimate flows into advantages
# and silently zeroes all gradients via backward(), corrupting the update.
obs = torch.nan_to_num(obs, nan=0.0, posinf=1.0, neginf=-1.0)
return self.head(self.backbone(obs)).squeeze(-1) # (batch,)
31 changes: 26 additions & 5 deletions agents/rl_das/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,18 +102,24 @@ def train(
"""
Path(save_dir).mkdir(parents=True, exist_ok=True)
log: list[dict] = []
n_train = len(train_env._problem_ids)
n_train = len(train_env.problem_ids)

for epoch in range(1, n_epochs + 1):
epoch_rewards = []
epoch_rewards: list[float] = []
epoch_diagnostics: list[dict] = []
epoch_start = time.time()

for _ in range(n_train):
ep = _run_episode(train_env, agent, deterministic=False)
epoch_rewards.append(ep["total_reward"])

agent.learn(k_epoch)
# bootstrap_value=0.0 is correct: this env only terminates naturally
# (terminated=True, truncated always False), so the last done=True flag
# already zeroes future returns — no critic bootstrap is needed.
diag = agent.learn(k_epoch, bootstrap_value=0.0)
agent.rollout.clear()
if diag:
epoch_diagnostics.append(diag)

mean_train_reward = float(np.mean(epoch_rewards))
entry: dict = {
Expand All @@ -122,9 +128,20 @@ def train(
"elapsed_s": round(time.time() - epoch_start, 2),
}

# Log per-epoch PPO diagnostics so training instability is visible
# (e.g. actor_loss explosion, entropy collapse) without manual debugging.
if epoch_diagnostics:
entry["actor_loss"] = float(
np.mean([d["actor_loss"] for d in epoch_diagnostics])
)
entry["critic_loss"] = float(
np.mean([d["critic_loss"] for d in epoch_diagnostics])
)
entry["entropy"] = float(np.mean([d["entropy"] for d in epoch_diagnostics]))

if epoch % eval_interval == 0:
test_results = evaluate(
test_env, agent, n_episodes=len(test_env._problem_ids)
test_env, agent, n_episodes=len(test_env.problem_ids)
)
entry["mean_test_reward"] = float(
np.mean([r["total_reward"] for r in test_results])
Expand All @@ -137,12 +154,16 @@ def train(
f" train_r={mean_train_reward:.4f}"
f" test_r={entry['mean_test_reward']:.4f}"
f" test_best_y={entry['mean_test_best_y']:.4e}"
f" actor_loss={entry.get('actor_loss', float('nan')):.4f}"
f" entropy={entry.get('entropy', float('nan')):.4f}"
f" ({entry['elapsed_s']:.1f}s)"
)
else:
print(
f"Epoch {epoch:4d}/{n_epochs}"
f" train_r={mean_train_reward:.4f}"
f" actor_loss={entry.get('actor_loss', float('nan')):.4f}"
f" entropy={entry.get('entropy', float('nan')):.4f}"
f" ({entry['elapsed_s']:.1f}s)"
)

Expand Down Expand Up @@ -186,7 +207,7 @@ def evaluate(
List of dicts with keys: problem_id, total_reward, best_y, n_fe.
"""
if n_episodes is None:
n_episodes = len(env._problem_ids)
n_episodes = len(env.problem_ids)

results = []
for _ in range(n_episodes):
Expand Down
36 changes: 26 additions & 10 deletions das/env/das_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import gymnasium as gym
from gymnasium import spaces

from das.env.observation import compute_observation, observation_dim
from das.env.observation import compute_observation, observation_dim, MAX_HISTORY_SAMPLE
from das.env.reward import compute_reward
from das.optimizers.base import get_checkpoints

Expand Down Expand Up @@ -251,14 +251,27 @@ def _update_episode_state(self, result: dict, prev_best_y: float):
if worst_y > self._worst_y:
self._worst_y = worst_y

# Set initial range on first step
# Set initial range on first step.
# When worst_so_far_y is absent the default is -inf, which collapses
# scale to 1e-5 and inflates every subsequent reward by 1e5. Instead,
# derive scale from the magnitude of the initial best fitness.
if self._initial_range[0] == float("inf"):
self._initial_range = (new_best_y, max(worst_y, new_best_y + 1e-5))
safe_worst = (
worst_y
if np.isfinite(worst_y)
else new_best_y + max(abs(new_best_y), 1.0)
)
self._initial_range = (new_best_y, max(safe_worst, new_best_y + 1e-5))

# Stagnation counter
# Stagnation counter — prefer the FE delta from the result dict so that
# stagnation accumulates correctly even when y_history is not returned.
x_hist: np.ndarray | None = result.get("x_history")
y_hist: np.ndarray | None = result.get("y_history")
n_fe_step = len(y_hist) if y_hist is not None else 0
n_fe_reported = result.get("n_function_evaluations")
if n_fe_reported is not None:
n_fe_step = max(0, n_fe_reported - self._n_fe)
else:
n_fe_step = len(y_hist) if y_hist is not None else 0

if new_best_y >= prev_best_y:
self._stagnation_count += n_fe_step
Expand All @@ -267,20 +280,23 @@ def _update_episode_state(self, result: dict, prev_best_y: float):

self._n_fe = result.get("n_function_evaluations", self._n_fe + n_fe_step)

# Accumulate population history for ELA
# Accumulate population history for ELA, capped at MAX_HISTORY_SAMPLE rows.
# Without the cap, large budgets (e.g. 40-dim × 10 000 FE) accumulate
# hundreds of thousands of rows — GBs of RAM for a single episode.
if x_hist is not None and len(x_hist) > 0:
self._x_history = (
x_hist
x_hist[-MAX_HISTORY_SAMPLE:]
if self._x_history is None
else np.concatenate([self._x_history, x_hist])
else np.concatenate([self._x_history, x_hist])[-MAX_HISTORY_SAMPLE:]
)
self._y_history = (
y_hist
y_hist[-MAX_HISTORY_SAMPLE:]
if self._y_history is None
else np.concatenate([self._y_history, y_hist])
else np.concatenate([self._y_history, y_hist])[-MAX_HISTORY_SAMPLE:]
)

def _build_observation(self) -> np.ndarray:

return compute_observation(
x_history=self._x_history,
y_history=self._y_history,
Expand Down
5 changes: 2 additions & 3 deletions das/env/observation.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,8 @@ def compute_action_history_features(
last_idx = choices_history[-1]
last_action[last_idx] = 1.0

counts = np.array(
[choices_history.count(j) for j in range(n_actions)], dtype=np.float32
)
# O(n) instead of O(n_actions * n_steps) from calling list.count in a loop.
counts = np.bincount(choices_history, minlength=n_actions).astype(np.float32)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice

frequencies = counts / len(choices_history)

run = 0
Expand Down
2 changes: 1 addition & 1 deletion das/env/reward.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def reward_sparse(new_best_y, old_best_y, initial_range, is_final=False):
def reward_binary(new_best_y, old_best_y, initial_range, is_final=False):
"""Binary: 1 if improvement >= 0.1%, else 0 (original r4)."""
if old_best_y == float("inf"):
return float(np.log(initial_range[1] - initial_range[0] + 1e-10))
return 0.0
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is important change of behaviour. I think further portfolio study will be needed. for now I'll leave the reward as it was before

ratio = _improvement_ratio(new_best_y, old_best_y, initial_range)
return 1.0 if ratio >= 1e-3 else 0.0

Expand Down
23 changes: 19 additions & 4 deletions das/training/rldas.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,12 @@ def run_rl_das(args) -> None:

suite = IOHSuite()

if args.k_epoch is None:
args.k_epoch = max(1, int(0.3 * args.n_checkpoints))
# Local variable — avoid mutating args so the caller's namespace stays predictable.
k_epoch = (
args.k_epoch
if args.k_epoch is not None
else max(1, int(0.3 * args.n_checkpoints))
)

env_kwargs = dict(
suite=suite,
Expand All @@ -45,15 +49,15 @@ def run_rl_das(args) -> None:
print(
f"RL-DAS | dim={args.dim} | portfolio={args.portfolio}"
f" | obs_dim={train_env.observation_space.shape[0]}"
f" | k_epoch={args.k_epoch}"
f" | k_epoch={k_epoch}"
)

train(
train_env=train_env,
test_env=test_env,
agent=agent,
n_epochs=args.n_epochs,
k_epoch=args.k_epoch,
k_epoch=k_epoch,
eval_interval=args.eval_interval,
save_interval=args.save_interval,
save_dir="models",
Expand All @@ -62,6 +66,17 @@ def run_rl_das(args) -> None:

if args.eval:
print("\nRunning final evaluation on test set …")

# Fresh env so _problem_idx starts at 0. test_env accumulated increments
# from periodic evaluations inside train() and would start from a rotated
# offset rather than problem 0, making results hard to reproduce.
eval_env = RLDASEnv(problem_ids=test_ids, **env_kwargs)
n_problems = len(test_ids)
test_results = evaluate(eval_env, agent, n_episodes=n_problems)

# Create the output directory before writing — write_jsonl does not
# create parent directories and would raise FileNotFoundError otherwise.
os.makedirs("results", exist_ok=True)
n_problems = len(test_env._problem_ids)
test_results = evaluate(test_env, agent, n_episodes=n_problems)
mean_best_y = float(np.mean([r["best_y"] for r in test_results]))
Expand Down
Loading