diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..326d9e4 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,33 @@ +name: Pre-commit checks + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main, develop ] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.13' + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + version: "latest" + + - name: Install dependencies + run: | + uv sync --group dev + + - name: Run pre-commit + run: | + uv run pre-commit run --all-files diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..e047e25 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,33 @@ +name: Tests + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main, develop ] + +jobs: + tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.13' + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + version: "latest" + + - name: Install dependencies + run: | + uv sync --group dev + + - name: Run tests + run: | + uv run pytest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2ca480b..c48ea5d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,7 +28,7 @@ repos: # Type checking - id: ty name: ty - entry: uvx ty check + entry: uv run ty check language: system types: [python] # args: [] # Add any args if needed diff --git a/README.md b/README.md index f744d9e..bb62b60 100644 --- a/README.md +++ b/README.md @@ -24,11 +24,14 @@ A lightweight job queue for shared GPU servers without SLURM. 1. 
**Start the Scheduler**: ```bash # Run in foreground (for testing/debugging) - gpu-queue serve --min-free 2 + gpu-queue serve --min-free 2 --max-use 6 # OR Start background daemon - gpu-queue start --min-free 2 + gpu-queue start --min-free 2 --max-use 6 ``` + `--min-free` preserves that many physically idle GPUs. GPUs occupied by + other users do not count toward this reserve. `--max-use` caps how many + GPUs gpu-queue jobs may occupy at once. 2. **Submit Jobs**: ```bash @@ -42,10 +45,12 @@ A lightweight job queue for shared GPU servers without SLURM. gpu-queue watch ``` **Keybindings**: - - `d`: **Duplicate** selected job into **Staging** (enters Edit Mode) + - `v`: Enter/leave select mode for bulk actions + - `d`: **Duplicate** selected job into **Staging** - `n`: Create a **new staged** job - `e`: **Edit** selected staged job / **Save** staged changes - `s` or `Enter` (in Staging): Send staged job to **Pending** (with confirmation) + - `b` (in Pending): Move selected pending job back to the top of **Staging** - `c`: **Discard** staged job, or cancel pending/running job - `J` / `K` (in Pending): Move selected job down/up in queue order - `Space`: View logs (internal viewer) @@ -54,8 +59,10 @@ A lightweight job queue for shared GPU servers without SLURM. - `r`: Retry completed job into **Staging** - `x`: Remove completed job + In select mode, `j`/`k` extend the selected rows as you move. `Esc` clears the selection. Batch-safe commands apply to all selected rows in the active panel: `b`, `c`, `s`, `d`, `p`, `r`, `x`, and pending `J`/`K` reorder. Edit and logs remain cursor-only. + **Interactive Editing**: - - **Enter Edit Mode**: Press `e` on a staged job, or create one via `n` / `d` / `r`. + - **Enter Edit Mode**: Press `e` on a staged job, or create one via `n`. - **Navigation**: Use `h`/`l` to switch between GPUs and Command fields. - **Modify Values**: Use `j`/`k` to decrease/increase GPU count. 
def cmd_watch(args: argparse.Namespace) -> None:
    """Launch the curses dashboard and block until the user leaves it.

    ``args.interval`` is passed to ``GPUQueueTUI`` unchanged. Ctrl-C is
    treated as an ordinary way to quit, so no traceback is shown.
    """
    # Imported at call time rather than at module import time.
    import curses

    dashboard = GPUQueueTUI(args.interval)
    try:
        curses.wrapper(dashboard.main)
    except KeyboardInterrupt:
        # Interrupting the TUI is a normal exit path.
        pass
    print("Exited TUI.")


def _add_capacity_options(sub: argparse.ArgumentParser) -> None:
    """Attach the ``--min-free`` / ``--max-use`` pair shared by start and serve."""
    sub.add_argument(
        "--min-free",
        type=int,
        default=2,
        help="Number of physically idle GPUs to always keep free",
    )
    sub.add_argument(
        "--max-use",
        type=int,
        default=None,
        help="Maximum GPUs this queue may occupy at once",
    )


def build_parser() -> argparse.ArgumentParser:
    """Assemble the ``gpu-queue`` argument parser.

    Each subcommand records its handler under ``func`` via ``set_defaults``;
    ``main`` dispatches on that attribute.
    """
    root = argparse.ArgumentParser(description="GPU Job Queue Scheduler")
    commands = root.add_subparsers(dest="command", required=True)

    # --- add ---------------------------------------------------------------
    # NOTE: the positional below uses the same dest ("command") as the
    # subparser selector; ``cmd_add`` reads ``args.command`` as the job command.
    add = commands.add_parser("add", help="Add a job to the queue")
    add.add_argument("command", help="Command to run")
    add.add_argument(
        "--gpus", "-g", type=int, default=2, help="Number of GPUs required"
    )
    add.add_argument(
        "--priority",
        "-p",
        choices=["low", "medium", "high"],
        default="medium",
        help="Job priority",
    )
    add.add_argument(
        "--front",
        "-f",
        action="store_true",
        help="Add to front of queue (Urgent priority)",
    )
    add.set_defaults(func=cmd_add)

    # --- start / stop / serve ----------------------------------------------
    start = commands.add_parser(
        "start", help="Start the queue scheduler (background)"
    )
    _add_capacity_options(start)
    start.set_defaults(func=cmd_start)

    stop = commands.add_parser("stop", help="Stop the background scheduler")
    stop.set_defaults(func=cmd_stop)

    serve = commands.add_parser(
        "serve", help="Run the queue scheduler (foreground)"
    )
    _add_capacity_options(serve)
    serve.add_argument(
        "--exclude-gpus",
        type=str,
        default="",
        help="Comma-separated list of GPU indices to ignore (e.g. '0,1')",
    )
    serve.set_defaults(func=cmd_serve)

    # --- per-job commands --------------------------------------------------
    cancel = commands.add_parser("cancel", help="Cancel a pending job")
    cancel.add_argument("job_id", help="Job ID to cancel")
    cancel.set_defaults(func=cmd_cancel)

    logs = commands.add_parser("logs", help="Show job logs")
    logs.add_argument("job_id", help="Job ID")
    logs.add_argument("--lines", "-n", type=int, default=50, help="Number of lines")
    logs.set_defaults(func=cmd_logs)

    clear = commands.add_parser("clear", help="Clear completed jobs")
    clear.set_defaults(func=cmd_clear)

    retry = commands.add_parser("retry", help="Re-queue a completed job")
    retry.add_argument("job_id", help="Job ID to retry")
    retry.add_argument(
        "--front", "-f", action="store_true", help="Add to front of queue"
    )
    retry.set_defaults(func=cmd_retry)

    pause = commands.add_parser(
        "pause", help="Pause a running job (re-queue at front)"
    )
    pause.add_argument("job_id", help="Job ID to pause")
    pause.set_defaults(func=cmd_pause)

    # --- watch -------------------------------------------------------------
    watch = commands.add_parser("watch", help="Watch queue status continuously")
    watch.add_argument(
        "--interval", "-n", type=float, default=2.0, help="Update interval in seconds"
    )
    watch.set_defaults(func=cmd_watch)

    return root


def main() -> None:
    """CLI entry point: parse ``sys.argv`` and run the selected subcommand."""
    namespace = build_parser().parse_args()
    namespace.func(namespace)
def cmd_serve(args) -> None:
    """Run the scheduler loop in the foreground.

    Reads ``args.min_free`` and, when present, ``args.max_use`` and
    ``args.exclude_gpus`` (comma-separated GPU indices). The exclusion list is
    validated *before* anything is created or announced, so a malformed value
    exits with status 1 without first printing a "started" banner.
    """
    # Validate user input first; empty tokens ("0,,1", trailing commas) are ignored.
    spec = getattr(args, "exclude_gpus", None) or ""
    try:
        excluded = {int(tok) for tok in (p.strip() for p in spec.split(",")) if tok}
    except ValueError:
        print("Error: Invalid format for --exclude-gpus.")
        sys.exit(1)

    ensure_dirs()
    max_use = getattr(args, "max_use", None)
    print(f"✓ Scheduler started (keeping {args.min_free} GPUs physically idle)")
    if max_use is not None:
        print(f" Max gpu-queue use: {max_use} GPUs")
    print(f" Polling every {paths.POLL_INTERVAL}s")

    daemon_loop(args.min_free, excluded, max_use=max_use)


def cmd_add(args) -> None:
    """Append a new job to the pending list (or insert it first with ``--front``).

    The job record holds a fresh id, the command, the GPU count, the time it
    was added and a numeric priority: low=0, medium=1, high=2, and 3 when
    ``--front`` is given.
    """
    priorities = {"low": 0, "medium": 1, "high": 2}
    prio = 3 if args.front else priorities.get(args.priority, 1)

    job = {
        "id": generate_job_id(),
        "cmd": args.command,
        "gpus": args.gpus,
        "added": datetime.now().isoformat(),
        "priority": prio,
    }

    with locked_queue() as queue:
        if args.front:
            queue["pending"].insert(0, job)
        else:
            queue["pending"].append(job)

    print(f"✓ Added job {job['id']} (requires {args.gpus} GPUs)")
    print(f" Command: {args.command}")


def cmd_start(args) -> None:
    """Fork a background scheduler daemon.

    The parent prints the child's PID and returns. The child detaches into its
    own session, records its PID and settings under the queue directory,
    redirects its standard streams to the daemon log and runs ``daemon_loop``
    until it is signalled or the loop ends.
    """
    ensure_dirs()
    max_use = getattr(args, "max_use", None)

    if is_daemon_running():
        print("Daemon is already running!")
        return

    # Flush before forking so buffered output is not written by both processes.
    sys.stdout.flush()
    sys.stderr.flush()

    pid = os.fork()
    if pid > 0:
        print(f"✓ Daemon started (PID: {pid})")
        print(f" Polling every {paths.POLL_INTERVAL}s for free GPUs")
        print(f" Log: {paths.DAEMON_LOG}")
        return

    # --- child: become the daemon -------------------------------------------
    os.setsid()
    os.chdir("/")

    paths.PID_FILE.write_text(str(os.getpid()))
    (paths.QUEUE_DIR / "config.json").write_text(
        json.dumps({"min_free_gpus": args.min_free, "max_use_gpus": max_use})
    )

    # Line-buffered so log lines reach disk promptly even if the daemon is killed.
    log_stream = open(paths.DAEMON_LOG, "a", buffering=1)
    # Re-point the OS-level descriptors as well as the Python objects; otherwise
    # the daemon (and anything inheriting fds 0-2) stays tied to the terminal
    # that launched it.
    with open(os.devnull) as devnull:
        os.dup2(devnull.fileno(), 0)
    os.dup2(log_stream.fileno(), 1)
    os.dup2(log_stream.fileno(), 2)
    sys.stdout = log_stream
    sys.stderr = log_stream

    def handle_signal(signum, frame):
        log_msg("Daemon stopped")
        paths.PID_FILE.unlink(missing_ok=True)
        sys.exit(0)

    signal.signal(signal.SIGTERM, handle_signal)
    signal.signal(signal.SIGINT, handle_signal)

    try:
        daemon_loop(args.min_free, max_use=max_use)
    finally:
        # Also covers the loop returning or raising, so no stale PID file remains.
        paths.PID_FILE.unlink(missing_ok=True)
def cmd_stop(args) -> None:
    """Send SIGTERM to the background daemon named in the PID file.

    A missing or unreadable PID file, or a PID that no longer exists, is
    reported instead of raising.
    """
    if not is_daemon_running():
        print("Daemon is not running.")
        return

    try:
        pid = int(paths.PID_FILE.read_text().strip())
    except (OSError, ValueError):
        print("Daemon PID file is missing or unreadable.")
        return

    try:
        os.kill(pid, signal.SIGTERM)
    except ProcessLookupError:
        # The recorded process is gone: drop the stale PID file.
        paths.PID_FILE.unlink(missing_ok=True)
        print(f"Daemon (PID: {pid}) was not running; removed stale PID file.")
        return
    print(f"✓ Stopped daemon (PID: {pid})")


def cmd_cancel(args) -> None:
    """Cancel a pending or running job by id.

    A pending job is simply removed. A running job has its process group
    killed with SIGKILL and is moved to ``completed`` with status
    ``"cancelled"``. If the kill fails for a reason other than the process
    already being gone, the queue is left unchanged and the error is printed.
    """
    with locked_queue() as queue:
        for i, job in enumerate(queue["pending"]):
            if job["id"] == args.job_id:
                queue["pending"].pop(i)
                print(f"✓ Cancelled pending job {args.job_id}")
                return

        for i, job in enumerate(queue["running"]):
            if job["id"] != args.job_id:
                continue

            pid = job.get("pid")
            if pid:
                # NOTE(review): uses the stored pid as a process-group id;
                # presumably jobs are launched in their own session - confirm
                # against the scheduler's launcher.
                try:
                    os.killpg(pid, signal.SIGKILL)
                except ProcessLookupError:
                    pass  # already exited; still record the cancellation
                except OSError as exc:
                    print(f"Could not kill job {args.job_id} (PID {pid}): {exc}")
                    return

            queue["running"].pop(i)
            job["status"] = "cancelled"
            job["ended"] = datetime.now().isoformat()
            queue["completed"].append(job)
            print(f"✓ Cancelled running job {args.job_id}")
            return

    print(f"Job {args.job_id} not found")


def cmd_logs(args) -> None:
    """Print the last ``args.lines`` lines of a job's log file.

    A non-positive ``args.lines`` prints nothing. Bytes that cannot be decoded
    are replaced rather than aborting the command.
    """
    from collections import deque

    log_file = paths.LOG_DIR / f"{args.job_id}.log"
    if not log_file.exists():
        print(f"No log file for job {args.job_id}")
        return

    if args.lines <= 0:
        # ``content[-0:]`` would have selected the whole file.
        return

    # A bounded deque keeps only the tail while streaming the file once.
    with open(log_file, errors="replace") as f:
        tail = deque(f, maxlen=args.lines)
    for line in tail:
        print(line, end="")


def cmd_clear(args) -> None:
    """Remove every entry from the completed list and report how many."""
    with locked_queue() as queue:
        count = len(queue["completed"])
        queue["completed"] = []
    print(f"✓ Cleared {count} completed jobs")
def cmd_delete(args) -> None:
    """Remove a single job, selected by id, from the completed list."""
    with locked_queue() as queue:
        for i, job in enumerate(queue["completed"]):
            if job["id"] == args.job_id:
                queue["completed"].pop(i)
                print(f"✓ Deleted job {args.job_id}")
                return
    print(f"Job {args.job_id} not found in completed jobs")


def cmd_retry(args) -> None:
    """Move a completed job back to pending, keeping its id.

    The re-queued record carries only id, command, GPU count, timestamps and
    priority 1; other fields of the completed record are dropped. ``--front``
    inserts at the head of the pending list instead of the tail.
    """
    with locked_queue() as queue:
        for i, job in enumerate(queue["completed"]):
            if job["id"] != args.job_id:
                continue

            queue["completed"].pop(i)

            # One timestamp for both fields: they describe the same event.
            now = datetime.now().isoformat()
            new_job = {
                "id": job["id"],
                "cmd": job["cmd"],
                "gpus": job.get("gpus", 1),
                "added": now,
                "retried_at": now,
                "priority": 1,
            }

            if args.front:
                queue["pending"].insert(0, new_job)
                print(f"✓ Re-queued job {job['id']} (front)")
            else:
                queue["pending"].append(new_job)
                print(f"✓ Re-queued job {job['id']} (back)")
            return

    print(f"Job {args.job_id} not found in completed jobs")


def cmd_pause(args) -> None:
    """Pause a running job: kill its process group and re-queue it first.

    The re-queued copy gets a new id, priority 3 and a ``paused_from`` field
    holding the original id. If the kill fails for a reason other than the
    process already being gone, nothing is re-queued, so the same command
    cannot end up running twice.
    """
    with locked_queue() as queue:
        for i, job in enumerate(queue["running"]):
            if job["id"] != args.job_id:
                continue

            pid = job.get("pid")
            killed = False
            if pid:
                try:
                    os.killpg(pid, signal.SIGKILL)
                    killed = True
                except ProcessLookupError:
                    pass  # already exited; re-queueing is still what was asked
                except OSError as exc:
                    print(f"Could not kill job {job['id']} (PID {pid}): {exc}")
                    return

            queue["running"].pop(i)

            new_job = {
                "id": generate_job_id(),
                "cmd": job["cmd"],
                "gpus": job.get("gpus", 1),
                "added": datetime.now().isoformat(),
                "priority": 3,
                "paused_from": job["id"],
            }

            queue["pending"].insert(0, new_job)
            if killed:
                print(f"✓ Paused job {job['id']} (Killed process group {pid})")
            else:
                # Do not claim a kill that did not happen.
                print(f"✓ Paused job {job['id']} (process was not running)")
            print(f"✓ Re-queued as {new_job['id']} at front")
            return

    print(f"Job {args.job_id} not found in running jobs")
def _smi_int(text: str, default: int = 0) -> int:
    """Parse a numeric nvidia-smi field, using *default* for values like "[N/A]"."""
    text = text.strip()
    return int(text) if text.isdigit() else default


def _nvidia_smi_rows(*args: str) -> list[list[str]]:
    """Run ``nvidia-smi`` with *args* and return its CSV output as stripped fields."""
    result = subprocess.run(
        ["nvidia-smi", *args],
        capture_output=True,
        text=True,
        check=True,
    )
    return [
        [field.strip() for field in line.split(",")]
        for line in result.stdout.splitlines()
        if line.strip()
    ]


def _pid_is_alive(pid_str: str) -> bool:
    """Return True when *pid_str* is an integer PID with a ``/proc`` entry."""
    try:
        pid = int(pid_str)
    except ValueError:
        return False
    return Path(f"/proc/{pid}").exists()


def get_free_gpus() -> list[dict[str, Any]]:
    """Return one status dict per GPU reported by ``nvidia-smi``.

    Despite the name, every GPU is returned; each dict has ``index``,
    ``used_mb``, ``total_mb``, ``util``, ``free`` and ``processes``. A GPU is
    ``free`` unless at least one non-zombie compute process is attached to it.
    A process counts as a zombie when nvidia-smi names it ``[Not Found]`` or
    its PID has no ``/proc`` entry.

    Returns an empty list when ``nvidia-smi`` cannot be run or exits non-zero.
    Non-numeric memory or utilisation fields are read as 0 instead of raising.
    """
    try:
        # Index, UUID and memory/utilisation come from a single query.
        gpu_rows = _nvidia_smi_rows(
            "--query-gpu=index,uuid,memory.used,memory.total,utilization.gpu",
            "--format=csv,noheader,nounits",
        )
        app_rows = _nvidia_smi_rows(
            "--query-compute-apps=gpu_uuid,pid,process_name,used_memory",
            "--format=csv,noheader,nounits",
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing binary as well as e.g. permission problems.
        return []

    gpus: dict[int, dict[str, Any]] = {}
    uuid_to_idx: dict[str, int] = {}
    for row in gpu_rows:
        if len(row) < 4 or not row[0].isdigit():
            continue
        idx = int(row[0])
        uuid_to_idx[row[1]] = idx
        gpus[idx] = {
            "index": idx,
            "used_mb": _smi_int(row[2]),
            "total_mb": _smi_int(row[3]),
            "util": _smi_int(row[4]) if len(row) > 4 else 0,
            "free": True,  # flipped below if a live process is found
            "processes": [],
        }

    for row in app_rows:
        if len(row) < 4:
            continue
        gpu_uuid, pid_str, proc_name, mem_str = row[:4]
        idx = uuid_to_idx.get(gpu_uuid)
        if idx is None:
            continue

        is_zombie = proc_name == "[Not Found]" or not _pid_is_alive(pid_str)

        # Heuristic: take the owner from a /home/<user>/ path in the process name.
        if "/home/" in proc_name:
            user = proc_name.split("/home/")[1].split("/")[0]
        elif is_zombie:
            user = "zombie"
        else:
            user = "unknown"

        gpus[idx]["processes"].append(
            {
                "pid": pid_str,
                "user": user,
                "name": proc_name.split("/")[-1],
                "mem_mb": _smi_int(mem_str),
                "zombie": is_zombie,
            }
        )

        # Zombie entries do not make a GPU busy.
        if not is_zombie:
            gpus[idx]["free"] = False

    return list(gpus.values())


def get_available_gpu_indices() -> list[int]:
    """Return indices of GPUs that are free and not assigned to a running job.

    GPUs listed in any running job's ``assigned_gpus`` are excluded even when
    nvidia-smi shows no process on them yet.
    """
    free_indices = [g["index"] for g in get_free_gpus() if g["free"]]
    reserved = {
        gpu_idx
        for job in load_queue().get("running", [])
        for gpu_idx in job.get("assigned_gpus", [])
    }
    return [idx for idx in free_indices if idx not in reserved]


def generate_job_id() -> str:
    """Generate a short random job ID (8 lowercase hex characters).

    Drawn from the OS random source rather than hashing the current
    timestamp, so IDs created in the same clock tick do not collide and no
    MD5 call is needed.
    """
    import secrets

    return secrets.token_hex(4)


def log_msg(msg: str, verbose: bool = False) -> None:
    """Append a timestamped line to the daemon log.

    Messages flagged ``verbose`` are currently dropped.
    """
    if verbose:
        return
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(paths.DAEMON_LOG, "a") as f:
        f.write(f"[{ts}] {msg}\n")
4f1cab3..2e402d9 100644 --- a/src/gpu_queue/main.py +++ b/src/gpu_queue/main.py @@ -14,2713 +14,79 @@ gpu-queue logs """ -import argparse -import copy -import curses -import fcntl -import json -import os -import shutil -import signal -import subprocess -import sys -import tempfile -import threading -import time -from contextlib import contextmanager -from datetime import datetime -from pathlib import Path -from typing import Any, Dict, Optional, cast - -from gpu_queue.queue_state import ( - cancel_staged_job, - empty_queue, - insert_staged_job, - load_queue_file, - make_staged_job, - move_pending_job, - save_queue_file, - send_staged_job_to_pending, - stage_completed_retry, +from gpu_queue.cli import build_parser, cmd_watch, main +from gpu_queue.commands import ( + cmd_add, + cmd_cancel, + cmd_clear, + cmd_logs, + cmd_pause, + cmd_retry, + cmd_serve, + cmd_start, + cmd_stop, ) - -# Try importing requests. Env should have it. Fallback if needed. -try: - import requests # type: ignore -except ImportError: - requests = None - -# Configuration -QUEUE_DIR = Path.home() / ".gpu_queue" -QUEUE_FILE = QUEUE_DIR / "jobs.json" -PID_FILE = QUEUE_DIR / "daemon.pid" -DAEMON_LOG = QUEUE_DIR / "daemon.log" -LOG_DIR = QUEUE_DIR / "logs" - -POLL_INTERVAL = 2 # seconds between GPU checks -MIN_FREE_GPUS = 2 # Number of GPUs to always keep free for other users -SERVER_PORT = 12345 - -CMD_FIELD_GHOST = "" - -# Sparkline for GPU util (U+2581..U+2588). Buffer ≥ max spark columns so full width scrolls. 
-_BLOCK_SPARK_CHARS = "▁▂▃▄▅▆▇█" -GPU_UTIL_HISTORY_MAX_SAMPLES = 512 -# GPU STATUS table: fixed widths so header and rows align (content inside borders) -GPU_COL_IDX_W = 4 -GPU_COL_UTIL_W = 4 # "100%" -GPU_COL_MEM_W = 14 -GPU_MIN_PROC_W = 8 - - -def get_server_url(): - return f"http://localhost:{SERVER_PORT}" - - -def ensure_dirs(): - """Create queue directories if they don't exist.""" - QUEUE_DIR.mkdir(exist_ok=True) - LOG_DIR.mkdir(exist_ok=True) - - -LOCK_FILE = QUEUE_DIR / "queue.lock" - - -@contextmanager -def locked_queue(): - """Context manager for thread-safe and process-safe queue access.""" - ensure_dirs() - with open(LOCK_FILE, "w") as f: - try: - fcntl.flock(f, fcntl.LOCK_EX) - queue = load_queue_raw() - yield queue - save_queue_raw(queue) - finally: - fcntl.flock(f, fcntl.LOCK_UN) - - -def load_queue_raw() -> dict[str, list]: - """Load the job queue from disk without locking.""" - try: - return load_queue_file(QUEUE_FILE) - except (json.JSONDecodeError, ValueError) as e: - log_msg(f"Error loading queue JSON: {e}") - return empty_queue() - - -def save_queue_raw(queue: dict[str, list]): - """Save the job queue to disk without locking (atomic replace).""" - save_queue_file(QUEUE_FILE, queue) - - -def load_queue() -> dict[str, list]: - """Load the job queue (backward compatibility, no lock).""" - return load_queue_raw() - - -def save_queue(queue: dict[str, list]): - """Save the job queue (backward compatibility, no lock).""" - save_queue_raw(queue) - - -def generate_job_id() -> str: - """Generate a short unique job ID.""" - import hashlib - - ts = datetime.now().isoformat() - return hashlib.md5(ts.encode()).hexdigest()[:8] - - -def get_free_gpus() -> list[dict[str, Any]]: - """Get list of GPUs with their status (free = no processes running).""" - try: - # First get basic GPU info - result = subprocess.run( - [ - "nvidia-smi", - "--query-gpu=index,memory.used,memory.total,utilization.gpu", - "--format=csv,noheader,nounits", - ], - capture_output=True, - 
text=True, - check=True, - ) - gpus = {} - for line in result.stdout.strip().split("\n"): - parts = [p.strip() for p in line.split(",")] - if len(parts) >= 3: - idx = int(parts[0]) - used = int(parts[1]) - total = int(parts[2]) - util = 0 - if len(parts) >= 4 and parts[3].strip().isdigit(): - util = int(parts[3].strip()) - - gpus[idx] = { - "index": idx, - "used_mb": used, - "total_mb": total, - "util": util, - "free": True, # Assume free, will mark busy if processes found - "processes": [], # List of process info dicts - } - - # Get GPU index to UUID mapping - uuid_result = subprocess.run( - [ - "nvidia-smi", - "--query-gpu=index,uuid", - "--format=csv,noheader", - ], - capture_output=True, - text=True, - check=True, - ) - uuid_to_idx = {} - for line in uuid_result.stdout.strip().split("\n"): - if "," in line: - parts = [p.strip() for p in line.split(",")] - if len(parts) >= 2: - uuid_to_idx[parts[1]] = int(parts[0]) - - # Get process details - proc_result = subprocess.run( - [ - "nvidia-smi", - "--query-compute-apps=gpu_uuid,pid,process_name,used_memory", - "--format=csv,noheader,nounits", - ], - capture_output=True, - text=True, - check=True, - ) - - # Mark GPUs with processes as busy and collect process info - for line in proc_result.stdout.strip().split("\n"): - if "," in line: - parts = [p.strip() for p in line.split(",")] - if len(parts) >= 4: - gpu_uuid, pid_str, proc_name, mem_str = ( - parts[0], - parts[1], - parts[2], - parts[3], - ) - if gpu_uuid in uuid_to_idx: - idx = uuid_to_idx[gpu_uuid] - if idx in gpus: - # Check if it's a zombie (process doesn't exist) - is_zombie = proc_name == "[Not Found]" - if not is_zombie: - # Verify the PID actually exists using /proc - try: - pid = int(pid_str) - if not Path(f"/proc/{pid}").exists(): - is_zombie = True - except ValueError: - is_zombie = True - - # Extract user from process path - user = "unknown" - if "/home/" in proc_name: - user = proc_name.split("/home/")[1].split("/")[0] - elif is_zombie: - user = 
"zombie" - - gpus[idx]["processes"].append( - { - "pid": pid_str, - "user": user, - "name": proc_name.split("/")[-1] - if "/" in proc_name - else proc_name, - "mem_mb": int(mem_str) if mem_str.isdigit() else 0, - "zombie": is_zombie, - } - ) - - # Only mark as busy if NOT a zombie - if not is_zombie: - gpus[idx]["free"] = False - - return list(gpus.values()) - except (subprocess.CalledProcessError, FileNotFoundError): - return [] - - -def get_available_gpu_indices() -> list[int]: - """Get indices of free GPUs (excluding running jobs).""" - gpus = get_free_gpus() - free_indices = [g["index"] for g in gpus if g["free"]] - - # Also exclude GPUs already assigned to running jobs (race condition protection) - queue = load_queue() - reserved_gpus = set() - for job in queue.get("running", []): - for gpu_idx in job.get("assigned_gpus", []): - reserved_gpus.add(gpu_idx) - - return [idx for idx in free_indices if idx not in reserved_gpus] - - -def is_daemon_running() -> bool: - """Check if scheduler is running (placeholder).""" - # For now, just return True to avoid warnings, or implement a real check. - return True - - -def cleanup_dead_jobs(): - """Check running jobs and move dead ones to completed with status classification.""" - with locked_queue() as queue: - still_running = [] - changed = False - - for job in queue["running"]: - pid = job.get("pid") - if pid: - # Check if process is still running - proc_path = Path(f"/proc/{pid}") - if proc_path.exists(): - still_running.append(job) - continue - - # Process is dead. Wait for exit file. 
- job["ended"] = datetime.now().isoformat() - exit_file = QUEUE_DIR / f"{job['id']}.exit" - - # Wait up to 1s for exit file to appear (shell might be finishing up) - status = "unknown" - for _ in range(10): - if exit_file.exists(): - try: - code = int(exit_file.read_text().strip()) - status = "success" if code == 0 else "failed" - break - except Exception: - pass - time.sleep(0.1) - - if status == "unknown": - status = "killed" - - job["status"] = status - queue["completed"].append(job) - if exit_file.exists(): - exit_file.unlink(missing_ok=True) - changed = True - else: - still_running.append(job) - - if changed: - queue["running"] = still_running - else: - # No changes needed, queue remains unchanged. - pass - - -def run_job(job: dict, gpu_indices: list[int]) -> int: - """Run a job with the specified GPUs. Returns the PID.""" - log_file = LOG_DIR / f"{job['id']}.log" - exit_file = QUEUE_DIR / f"{job['id']}.exit" - gpu_str = ",".join(map(str, gpu_indices)) - - # Normalize command: collapse any newlines/whitespace to single line - cmd = " ".join(job["cmd"].split()) - - env = os.environ.copy() - env["CUDA_VISIBLE_DEVICES"] = gpu_str - # Ensure ~/.local/bin is in PATH for uv - local_bin = str(Path.home() / ".local" / "bin") - if local_bin not in env.get("PATH", ""): - env["PATH"] = f"{local_bin}:{env.get('PATH', '')}" - - with open(log_file, "w") as f: - f.write(f"=== Job {job['id']} ===\n") - f.write(f"Command: {cmd}\n") - f.write(f"GPUs: {gpu_str}\n") - f.write(f"Started: {datetime.now().isoformat()}\n") - f.write("=" * 40 + "\n\n") - - # Wrap in shell to capture exit code. Quote paths to be safe. - q_log = f"'{log_file}'" - q_exit = f"'{exit_file}'" - wrapped_cmd = f"({cmd}) >> {q_log} 2>&1; echo $? 
> {q_exit}" - - proc = subprocess.Popen( - wrapped_cmd, - shell=True, - env=env, - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - cwd=Path.home() / "pepo", - start_new_session=True, - ) - - return proc.pid - - -def daemon_loop(min_free, excluded_gpus=None): - """Main scheduler loop.""" - if excluded_gpus is None: - excluded_gpus = set() - - while True: - try: - cleanup_dead_jobs() - - with locked_queue() as queue: - if queue["pending"]: - # --- Quota and Availability Logic --- - all_gpus = get_free_gpus() - - # Filter out excluded GPUs - gpus = [g for g in all_gpus if g["index"] not in excluded_gpus] - - total_gpus = len(gpus) - - # Excluded GPUs count towards the reserved quota - effective_min_free = max(0, min_free - len(excluded_gpus)) - - quota = total_gpus - effective_min_free - - # Count GPUs currently assigned to our running jobs - our_usage = sum(job.get("gpus", 1) for job in queue["running"]) - quota_remaining = quota - our_usage - - # List of GPUs that are ACTUALLY free - our_assigned = set() - for j in queue["running"]: - for idx in j.get("assigned_gpus", []): - our_assigned.add(idx) - - free_indices = [ - g["index"] - for g in gpus - if g["free"] and g["index"] not in our_assigned - ] - - if quota_remaining > 0: - # --- Backfilling Scheduler --- - jobs_started = False - remaining_pending = [] - - for job in queue["pending"]: - req = job.get("gpus", 1) - - if req <= quota_remaining and req <= len(free_indices): - assigned = free_indices[:req] - log_msg(f"Starting job {job['id']} on GPUs {assigned}") - - pid = run_job(job, assigned) - - job["pid"] = pid - job["assigned_gpus"] = assigned - job["status"] = "running" - job["started"] = datetime.now().isoformat() - queue["running"].append(job) - - quota_remaining -= req - free_indices = free_indices[req:] - jobs_started = True - else: - remaining_pending.append(job) - - if jobs_started: - queue["pending"] = remaining_pending - - # Save GPU status for TUI - try: - 
status_data = { - "ts": datetime.now().isoformat(), - "gpus": all_gpus, - "min_free": min_free, - "excluded": list(excluded_gpus), - } - (QUEUE_DIR / "status.json").write_text(json.dumps(status_data)) - except Exception: - pass - else: - try: - all_gpus = get_free_gpus() - status_data = { - "ts": datetime.now().isoformat(), - "gpus": all_gpus, - "min_free": min_free, - "excluded": list(excluded_gpus), - } - (QUEUE_DIR / "status.json").write_text(json.dumps(status_data)) - except Exception: - pass - - time.sleep(POLL_INTERVAL) - - except KeyboardInterrupt: - break - except Exception as e: - log_msg(f"Error in daemon loop: {e}") - time.sleep(POLL_INTERVAL) - - -def cmd_serve(args): - """Run the scheduler loop in the foreground.""" - ensure_dirs() - print(f"✓ Scheduler started (keeping {args.min_free} GPUs reserved)") - print(f" Polling every {POLL_INTERVAL}s") - - # Parse excluded GPUs - excluded = set() - if getattr(args, "exclude_gpus", None): - try: - for p in args.exclude_gpus.split(","): - if p.strip(): - excluded.add(int(p.strip())) - except ValueError: - print("Error: Invalid format for --exclude-gpus.") - sys.exit(1) - - daemon_loop(args.min_free, excluded) - - -def log_msg(msg: str, verbose: bool = False): - """Log a message to the daemon log.""" - if verbose: - return # Skip verbose messages for now - ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - line = f"[{ts}] {msg}\n" - with open(DAEMON_LOG, "a") as f: - f.write(line) - - -# === CLI Commands === - - -def cmd_add(args): - """Add a job to the queue.""" - priorities = {"low": 0, "medium": 1, "high": 2} - prio = priorities.get(args.priority, 1) - if args.front: - prio = 3 - - job = { - "id": generate_job_id(), - "cmd": args.command, - "gpus": args.gpus, - "added": datetime.now().isoformat(), - "priority": prio, - } - - with locked_queue() as queue: - if args.front: - queue["pending"].insert(0, job) - else: - queue["pending"].append(job) - - print(f"✓ Added job {job['id']} (requires {args.gpus} GPUs)") 
- print(f" Command: {args.command}") - - -def cmd_start(args): - """Start the daemon.""" - ensure_dirs() - - if is_daemon_running(): - print("Daemon is already running!") - return - - # Fork to background - pid = os.fork() - if pid > 0: - # Parent process - print(f"✓ Daemon started (PID: {pid})") - print(f" Polling every {POLL_INTERVAL}s for free GPUs") - print(f" Log: {DAEMON_LOG}") - return - - # Child process - become daemon - os.setsid() - os.chdir("/") - - # Write PID file with configuration - PID_FILE.write_text(str(os.getpid())) - - # Store config in separate file or just run with args - (QUEUE_DIR / "config.json").write_text(json.dumps({"min_free_gpus": args.min_free})) - - # Redirect stdout/stderr - sys.stdout = open(DAEMON_LOG, "a") - sys.stderr = sys.stdout - - # Handle termination - def handle_signal(signum, frame): - log_msg("Daemon stopped") - PID_FILE.unlink(missing_ok=True) - sys.exit(0) - - signal.signal(signal.SIGTERM, handle_signal) - signal.signal(signal.SIGINT, handle_signal) - - daemon_loop(args.min_free) - - -def cmd_stop(args): - """Stop the daemon.""" - if not is_daemon_running(): - print("Daemon is not running.") - return - - pid = int(PID_FILE.read_text().strip()) - os.kill(pid, signal.SIGTERM) - print(f"✓ Stopped daemon (PID: {pid})") - - -def get_terminal_width() -> int: - """Get the current terminal width.""" - return shutil.get_terminal_size((80, 20)).columns - - -def sparkline_trailing(util_series: list[int], width: int) -> str: - """Latest `width` samples as block chars, right-aligned (scrolls as time passes).""" - if width <= 0: - return "" - if not util_series: - return " " * width - tail = util_series[-width:] - parts: list[str] = [] - for u in tail: - u = max(0, min(100, int(u))) - bi = min(7, u * 7 // 100) - parts.append(_BLOCK_SPARK_CHARS[bi]) - s = "".join(parts) - return (" " * (width - len(s))) + s - - -def _gpu_status_column_widths(inner: int) -> tuple[int, int, int]: - """Return (prefix_w, hist_w, proc_w). 
Row is prefix + ' ' + [hist + ' '] + proc (full inner).""" - prefix_w = GPU_COL_IDX_W + 1 + GPU_COL_UTIL_W + 1 + GPU_COL_MEM_W - slack = inner - prefix_w - if slack <= 0: - return prefix_w, 0, 0 - if slack == 1: - return prefix_w, 0, 1 - pair = slack - 2 - if pair < GPU_MIN_PROC_W: - return prefix_w, 0, slack - 1 - proc_w = max(GPU_MIN_PROC_W, pair // 3) - hist_w = pair - proc_w - return prefix_w, hist_w, proc_w - - -def _fit_text_field(s: str, max_w: int) -> str: - if max_w <= 0: - return "" - if len(s) <= max_w: - return s.ljust(max_w) - if max_w <= 3: - return s[:max_w] - return s[: max_w - 3] + "..." - - -def _format_gpu_history_span_seconds(total_sec: int) -> str: - total_sec = max(0, int(total_sec)) - if total_sec < 60: - return f"{total_sec}s" - if total_sec < 3600: - m, s = divmod(total_sec, 60) - return f"{m}m" if s == 0 else f"{m}m{s}s" - h, rem = divmod(total_sec, 3600) - m, s = divmod(rem, 60) - if m == 0 and s == 0: - return f"{h}h" - if s == 0: - return f"{h}h{m}m" - return f"{h}h{m}m{s}s" - - -def _gpu_history_header_label(hist_w: int, sample_interval_sec: float) -> str: - """Visible window = one sample per column at daemon poll cadence.""" - if hist_w <= 0: - return "" - span_sec = int(round(hist_w * sample_interval_sec)) - return f"HISTORY {_format_gpu_history_span_seconds(span_sec)}" - - -def shorten_command(cmd: str, max_len: int) -> str: - """Shorten a command string to max_len by removing the middle part.""" - if len(cmd) <= max_len: - return cmd - - # Calculate lengths to keep - head_len = (max_len - 3) // 2 - tail_len = max_len - 3 - head_len - - return f"{cmd[:head_len]}...{cmd[-tail_len:]}" - - -# cmd_status removed - functionality merged into watch TUI - - -def cmd_cancel(args): - """Cancel a pending job.""" - with locked_queue() as queue: - for i, job in enumerate(queue["pending"]): - if job["id"] == args.job_id: - queue["pending"].pop(i) - print(f"✓ Cancelled pending job {args.job_id}") - return - - # Also check running - for i, job in 
enumerate(queue["running"]): - if job["id"] == args.job_id: - pid = job.get("pid") - if pid: - try: - os.killpg(pid, signal.SIGKILL) # Aggressive kill - except Exception: - pass - queue["running"].pop(i) - job["status"] = "cancelled" - job["ended"] = datetime.now().isoformat() - queue["completed"].append(job) - print(f"✓ Cancelled running job {args.job_id}") - return - - print(f"Job {args.job_id} not found") - - -def cmd_logs(args): - """Show logs for a job.""" - log_file = LOG_DIR / f"{args.job_id}.log" - if not log_file.exists(): - print(f"No log file for job {args.job_id}") - return - - # Tail the log - lines = args.lines - with open(log_file) as f: - content = f.readlines() - for line in content[-lines:]: - print(line, end="") - - -def cmd_clear(args): - """Clear completed jobs from the queue.""" - with locked_queue() as queue: - count = len(queue["completed"]) - queue["completed"] = [] - print(f"✓ Cleared {count} completed jobs") - - -def cmd_delete(args): - """Delete a job from the completed list.""" - with locked_queue() as queue: - for i, job in enumerate(queue["completed"]): - if job["id"] == args.job_id: - queue["completed"].pop(i) - print(f"✓ Deleted job {args.job_id}") - return - print(f"Job {args.job_id} not found in completed jobs") - - -def cmd_retry(args): - """Re-queue a completed job.""" - with locked_queue() as queue: - # Find job in completed - for i, job in enumerate(queue["completed"]): - if job["id"] == args.job_id: - # Remove from completed - queue["completed"].pop(i) - - # Reset job metadata but KEEP THE ID if requested or default behavior? - # User asked to not duplicate. Reusing ID is best. - # But we should update 'added' time? Or keep original? - # Let's update added time to reflect it's back in queue. 
- - new_job = { - "id": job["id"], # Reuse ID - "cmd": job["cmd"], - "gpus": job.get("gpus", 1), - "added": datetime.now().isoformat(), - "retried_at": datetime.now().isoformat(), - "priority": 1, - } - - # Add to front or back of pending - if args.front: - queue["pending"].insert(0, new_job) - print(f"✓ Re-queued job {job['id']} (front)") - else: - queue["pending"].append(new_job) - print(f"✓ Re-queued job {job['id']} (back)") - return - - print(f"Job {args.job_id} not found in completed jobs") - - -def cmd_pause(args): - """Pause a running job (kill and re-queue at front).""" - with locked_queue() as queue: - # Find job in running - for i, job in enumerate(queue["running"]): - if job["id"] == args.job_id: - pid = job.get("pid") - if pid: - try: - # Aggressive kill to free GPU instantly - os.killpg(pid, signal.SIGKILL) - except Exception: - pass - - # Remove from running - queue["running"].pop(i) - - # Reset metadata for re-queue - new_job = { - "id": generate_job_id(), - "cmd": job["cmd"], - "gpus": job.get("gpus", 1), - "added": datetime.now().isoformat(), - "priority": 3, # Urgent - "paused_from": job["id"], - } - - # Add to front of pending queue - queue["pending"].insert(0, new_job) - print(f"✓ Paused job {job['id']} (Killed process group {pid})") - print(f"✓ Re-queued as {new_job['id']} at front") - return - - print(f"Job {args.job_id} not found in running jobs") - - -def get_status_data(): - """Gather all status data for the queue and GPUs.""" - cleanup_dead_jobs() - queue = load_queue() - gpus = get_free_gpus() - - # Get config (min_free and excluded) - min_free = 2 - excluded = set() - config_file = QUEUE_DIR / "config.json" - if config_file.exists(): - try: - cfg = json.loads(config_file.read_text()) - min_free = cfg.get("min_free_gpus", 2) - excluded = set(cfg.get("excluded_gpus", [])) - except Exception: - pass - - # Filter GPUs (hiding excluded ones entirely from the monitor view) - gpus = [g for g in gpus if g["index"] not in excluded] - - return { - 
"queue": queue, - "gpus": gpus, - "min_free": min_free, - "excluded": list(excluded), - "term_width": get_terminal_width(), - } - - -class Window: - def __init__(self, title, key, height_pct=0.3): - self.title = title - self.key = key - self.items = [] - self.selected_idx = 0 - self.scroll_offset = 0 - self.height_pct = height_pct # Target height percentage - self.collapsed = False - - def update_items(self, items): - self.items = items - # Clamp selection - if self.selected_idx >= len(self.items): - self.selected_idx = max(0, len(self.items) - 1) - self.clamp_scroll() - - def visible_item_count(self, h=None): - """Return how many list rows are visible at this window height.""" - effective_h = h if h is not None else getattr(self, "height", None) - if effective_h is None: - effective_h = 10 - - visible_h = max(1, effective_h - 2) - if self.key in ["running", "staging", "pending", "completed"]: - visible_h = max(1, visible_h - 1) - return visible_h - - def clamp_scroll(self, h=None): - """Keep the scroll offset valid for the current item count and height.""" - if not self.items: - self.selected_idx = 0 - self.scroll_offset = 0 - return - - self.selected_idx = max(0, min(len(self.items) - 1, self.selected_idx)) - visible_h = self.visible_item_count(h) - max_offset = max(0, len(self.items) - visible_h) - self.scroll_offset = max(0, min(max_offset, self.scroll_offset)) - - def ensure_selected_visible(self, h=None): - """Adjust scroll offset so the selected row remains on screen.""" - self.clamp_scroll(h) - if not self.items: - return - - visible_h = self.visible_item_count(h) - if self.selected_idx < self.scroll_offset: - self.scroll_offset = self.selected_idx - elif self.selected_idx >= self.scroll_offset + visible_h: - self.scroll_offset = self.selected_idx - visible_h + 1 - self.clamp_scroll(h) - - def scroll(self, delta, h=None): - """Scroll selection by delta.""" - if not self.items: - return - - new_idx = self.selected_idx + delta - self.selected_idx = max(0, 
min(len(self.items) - 1, new_idx)) - self.ensure_selected_visible(h) - - def get_selected(self): - if 0 <= self.selected_idx < len(self.items): - return self.items[self.selected_idx] - return None - - -class GPUQueueTUI: - def __init__(self, interval=2.0): - self.interval = interval - self.stdscr = None - self.running = False - self.lock = threading.Lock() - - # State - self.data = {"staging": [], "running": [], "pending": [], "completed": []} - self.data = {"staging": [], "running": [], "pending": [], "completed": []} - self.gpu_status = [] - self.min_free = 2 - self.excluded = [] - self.server_status = "UNKNOWN" - self.last_updated = 0 - self.action_msg = "" - self.msg_clear_time = 0 - self.action_msg = "" - self.msg_clear_time = 0 - self.modal: Optional[Dict[str, Any]] = None # { type, title, text, val... } - self._gpu_util_history: dict[int, list[int]] = {} - self._last_util_status_ts: Optional[str] = None - - # Windows - self.windows = [ - Window("RUNNING", "running", 0.2), - Window("PENDING", "pending", 0.2), - Window("STAGING", "staging", 0.2), - Window("COMPLETED", "completed", 0.2), - Window("GPU STATUS", "gpu_status", 0.2), - Window("SELECTED JOB", "job_details", 0.2), - ] - self.active_win_idx = 0 # Index in self.windows - self.mode = "NAV" # "NAV" (select window) or "ACTION" (interact with window) - self.has_selected_job_context = False - - # Log view state - self.viewing_logs = False - self.log_job_id = None - self.log_content = [] - self.log_scroll = 0 - - # Edit Mode State - self.edit_mode_active = False - self.edit_job = None - self.edit_field_idx = 0 # 0: GPUs, 1: Command - self.edit_is_new = False - - # Dynamic column widths for tables - self.col_widths = { - "running": {"id": 8, "pid": 6, "gpus": 4, "elapsed": 7}, - "staging": {"id": 8, "gpus": 4, "waiting": 7}, - "pending": {"id": 8, "gpus": 4, "waiting": 7}, - "completed": {"id": 8, "runtime": 7, "ago": 7}, - } - - def _calc_col_widths(self): - """Calculate column widths based on current data 
for all tables.""" - try: - def calc_queue_widths(items: list[dict[str, Any]]) -> dict[str, int]: - widths = {"id": 3, "gpus": 4, "waiting": 7} - for job in items: - jid = job.get("id", "")[:8] - widths["id"] = max(widths["id"], len(jid)) - widths["gpus"] = max(widths["gpus"], len(str(job.get("gpus", 1)))) - add_dt = self._parse_iso(job.get("added")) - if add_dt: - waiting = self._fmt_delta(datetime.now() - add_dt) - widths["waiting"] = max(widths["waiting"], len(waiting)) - for key in widths: - widths[key] += 1 - return widths - - # Running table - running_w = {"id": 3, "pid": 3, "gpus": 4, "elapsed": 7} - for job in self.data.get("running", []): - jid = job.get("id", "")[:8] - running_w["id"] = max(running_w["id"], len(jid)) - running_w["pid"] = max(running_w["pid"], len(str(job.get("pid", "")))) - gpus = ",".join(map(str, job.get("assigned_gpus", []))) - running_w["gpus"] = max(running_w["gpus"], len(gpus) if gpus else 1) - start_dt = self._parse_iso(job.get("started")) - if start_dt: - elapsed = self._fmt_delta(datetime.now() - start_dt) - running_w["elapsed"] = max(running_w["elapsed"], len(elapsed)) - - # Staging + Pending tables - staging_w = calc_queue_widths(self.data.get("staging", [])) - pending_w = calc_queue_widths(self.data.get("pending", [])) - - # Completed table - completed_w = {"id": 3, "runtime": 7, "ago": 7} - for job in self.data.get("completed", []): - jid = job.get("id", "")[:8] - completed_w["id"] = max(completed_w["id"], len(jid)) - start_dt = self._parse_iso(job.get("started")) - end_dt = self._parse_iso(job.get("ended")) - if start_dt and end_dt: - run_s = self._fmt_delta(end_dt - start_dt) - completed_w["runtime"] = max(completed_w["runtime"], len(run_s)) - if end_dt: - ago_s = self._fmt_delta(datetime.now() - end_dt) - completed_w["ago"] = max(completed_w["ago"], len(ago_s)) - - # Add padding - for key in running_w: - running_w[key] += 1 - for key in completed_w: - completed_w[key] += 1 - - self.col_widths = { - "running": running_w, - 
"staging": staging_w, - "pending": pending_w, - "completed": completed_w, - } - except Exception: - pass # Keep existing widths on error - - def _set_data_snapshot(self, queue: dict[str, list]): - """Replace local queue data from a freshly mutated queue snapshot.""" - self.data = copy.deepcopy(queue) - self.data["completed"].sort(key=lambda x: x.get("ended", ""), reverse=True) - self.last_updated = time.time() - self._sync_windows_from_data() - self._calc_col_widths() - - def _sync_windows_from_data(self): - for w in self.windows: - items = self.data.get(w.key, []) - type_label = w.key - for item in items: - item["_type"] = type_label - w.update_items(items) - - if self.edit_mode_active and self.edit_job is not None: - edit_id = self.edit_job.get("id") - for w in self.windows: - if w.key == "staging": - for idx, item in enumerate(w.items): - if item.get("id") == edit_id: - w.selected_idx = idx - break - break - - def _selected_job(self): - win = self.windows[self.active_win_idx] - if win.key in ["running", "pending", "staging", "completed"]: - return win.get_selected() - return getattr(self, "last_selected_job", None) - - def _enter_action_window(self, win): - self.mode = "ACTION" - win.collapsed = False - if win.key in ["running", "pending", "staging", "completed"]: - self.has_selected_job_context = True - - def start(self): - self.running = True - # Fast loop for queue updates - t1 = threading.Thread(target=self._poll_queue_loop, daemon=True) - t1.start() - # Slow loop for GPU polling - t2 = threading.Thread(target=self._poll_gpu_loop, daemon=True) - t2.start() - - def stop(self): - self.running = False - - def _poll_queue_loop(self): - """Poll queue frequently for snappy UI.""" - while self.running: - try: - with locked_queue() as q: - with self.lock: - self._set_data_snapshot(q) - - time.sleep(0.5) # Fast update - except Exception: - # with self.lock: self.action_msg = f"Q Poll Error: {str(e)}" - time.sleep(1) - - def _append_gpu_util_sample(self, status: 
dict[str, Any]) -> None: - ts = status.get("ts") - if not ts or ts == self._last_util_status_ts: - return - self._last_util_status_ts = ts - for g in status.get("gpus", []): - idx = g.get("index") - if idx is None: - continue - util = int(g.get("util", 0)) - buf = self._gpu_util_history.setdefault(int(idx), []) - buf.append(max(0, min(100, util))) - while len(buf) > GPU_UTIL_HISTORY_MAX_SAMPLES: - buf.pop(0) - - def _poll_gpu_loop(self): - """Poll GPU status at slower interval.""" - while self.running: - try: - status_file = QUEUE_DIR / "status.json" - if status_file.exists(): - try: - # Use file lock or retry read for atomic? - # Using simple read should be fine mostly - txt = status_file.read_text() - if txt.strip(): - s = json.loads(txt) - with self.lock: - self.gpu_status = s.get("gpus", []) - self.min_free = s.get("min_free", 2) - self.excluded = s.get("excluded", []) - self.server_status = "DAEMON: ON" - self._append_gpu_util_sample(s) - else: - with self.lock: - self.server_status = "DAEMON: S?" - except json.JSONDecodeError: - pass # Partial write? - else: - with self.lock: - self.server_status = "DAEMON: OFF" - except Exception: - with self.lock: - self.server_status = "ERR" - - time.sleep(self.interval) - - def draw_box(self, y, x, h, w, title, active=False, focused=False): - if self.stdscr is None: - return - """Draw a bordered box.""" - try: - color = curses.color_pair(4) # Cyan default - if focused: - color = curses.color_pair(2) # Green for focused interaction - # self.stdscr.attron(curses.A_BOLD) - elif active: - color = curses.color_pair(3) # Yellow for selected window - - # Draw border - self.stdscr.attron(color) - self.stdscr.box() - # rectangle doesn't use relative coords well with subwin? 
- - # Manual box drawing using line characters if needed, or just addstr - # Top - self.stdscr.hline(y, x, curses.ACS_HLINE, w) - self.stdscr.vline(y, x, curses.ACS_VLINE, h) - self.stdscr.hline(y + h - 1, x, curses.ACS_HLINE, w) - self.stdscr.vline(y, x + w - 1, curses.ACS_VLINE, h) - - # Corners - self.stdscr.addch(y, x, curses.ACS_ULCORNER) - self.stdscr.addch(y, x + w - 1, curses.ACS_URCORNER) - self.stdscr.addch(y + h - 1, x, curses.ACS_LLCORNER) - self.stdscr.addch(y + h - 1, x + w - 1, curses.ACS_LRCORNER) - - # Title - title_str = f" {title} " - if focused: - title_str = f" [ {title} ] " - elif active: - title_str = f" {title} " - - self.stdscr.addstr( - y, x + 2, title_str, color | (curses.A_BOLD if active else 0) - ) - self.stdscr.attroff(color) - except Exception: - pass - - def _parse_iso(self, s): - if not s: - return None - try: - return datetime.fromisoformat(s) - except Exception: - return None - - def _fmt_delta(self, delta): - if not delta: - return "-" - s = int(delta.total_seconds()) - if s < 60: - return f"{s}s" - m = s // 60 - if m < 60: - return f"{m}m" - h = m // 60 - m = m % 60 - return f"{h}h{m}m" - - def format_job_line(self, job, w): - """Format a job dictionary into a single line string.""" - jid = job["id"] - - # Edit Mode Highlighting - edit_idx = job.get("_edit_field_idx", -1) - is_editing = edit_idx >= 0 - - # Determine Color based on state/status - color = curses.color_pair(0) - - prefix = "" - - if job["_type"] == "running": - # Formt: ID(8) | PID(10) | GPUS(8) | ELAPSED(9) | CMD - color = curses.color_pair(5) # Blue - - gpus = ",".join(map(str, job.get("assigned_gpus", []))) - if not gpus: - gpus = "?" 
- - pid = str(job.get("pid", "?")) - - elapsed = "-" - start_dt = self._parse_iso(job.get("started")) - if start_dt: - elapsed = self._fmt_delta(datetime.now() - start_dt) - - # Use dynamic column widths - cw = self.col_widths["running"] - prefix = ( - f" {jid:<{cw['id']}} {pid:<{cw['pid']}} " - f"{gpus:<{cw['gpus']}} {elapsed:<{cw['elapsed']}} " - ) - - elif job["_type"] in ["staging", "pending"]: - gpus = str(job.get("gpus", 1)) - waiting = "-" - add_dt = self._parse_iso(job.get("added")) - if add_dt: - waiting = self._fmt_delta(datetime.now() - add_dt) - - # Use dynamic column widths - cw = self.col_widths["staging" if job["_type"] == "staging" else "pending"] - prefix = f" {jid:<{cw['id']}} {gpus:<{cw['gpus']}} {waiting:<{cw['waiting']}} " - - else: - # Completed/Finished - # Format: ID(8) | RUNTIME(9) | AGO(9) | CMD - - # Color by status - s_res = job.get("status", "?") - if s_res == "success": - color = curses.color_pair(2) # Green - elif s_res == "failed": - color = curses.color_pair(1) # Red - elif s_res == "cancelled": - color = curses.color_pair(3) # Orange - - start_dt = self._parse_iso(job.get("started")) - end_dt = self._parse_iso(job.get("ended")) - - run_s = "-" - ago_s = "-" - - if start_dt and end_dt: - run_s = self._fmt_delta(end_dt - start_dt) - - if end_dt: - ago_s = self._fmt_delta(datetime.now() - end_dt) - - # Use dynamic column widths - cw = self.col_widths["completed"] - prefix = ( - f" {jid:<{cw['id']}} {run_s:<{cw['runtime']}} {ago_s:<{cw['ago']}} " - ) - - cmd = job.get("cmd", "") or "" - avail_cmd = w - len(prefix) - if len(cmd) > avail_cmd: - cmd = cmd[: (avail_cmd - 1)] + "…" - - full_line = prefix + cmd - - if is_editing and job["_type"] == "staging": - cw = self.col_widths["staging"] - - # Field 0: GPUS - if edit_idx == 0: - s_val = f"[{gpus}]" - s_gpus = f"{s_val:<{cw['gpus'] + 1}}" - else: - s_gpus = f"{gpus:<{cw['gpus']}} " - - head = f" {jid:<{cw['id']}} " + s_gpus + f"{waiting:<{cw['waiting']}} " - cmd_avail = max(1, w - 
len(head)) - - cmd_stripped = (job.get("cmd", "") or "").strip() - if edit_idx == 1: - disp = cmd_stripped if cmd_stripped else CMD_FIELD_GHOST - if len(disp) > cmd_avail: - disp = disp[: max(0, cmd_avail - 1)] + "…" - disp = disp.ljust(cmd_avail)[:cmd_avail] - cmd_attr = curses.A_REVERSE - else: - disp = cmd_stripped - if len(disp) > cmd_avail: - disp = disp[: max(0, cmd_avail - 1)] + "…" - disp = disp.ljust(cmd_avail)[:cmd_avail] - cmd_attr = curses.A_NORMAL - - return { - "type": "rich", - "segments": [ - (f" {jid:<{cw['id']}} ", curses.A_NORMAL), - (s_gpus, curses.A_REVERSE if edit_idx == 0 else curses.A_NORMAL), - (f"{waiting:<{cw['waiting']}} ", curses.A_NORMAL), - (disp, cmd_attr), - ], - "base_color": color, - }, color - - return full_line, color - - def draw(self): - if self.stdscr is None: - return - self.stdscr.erase() - h, w = self.stdscr.getmaxyx() - - if h < 20 or w < 60: - self.stdscr.addstr(0, 0, "Terminal too small!") - return - - with self.lock: - # 0. Header (1 row) - # 0. 
Header (1 row) - # Full width white bar - self.stdscr.hline(0, 0, " ", w, curses.color_pair(6) | curses.A_REVERSE) - - status_col = curses.color_pair(6) | curses.A_REVERSE # White BG - - # Title - self.stdscr.addstr(0, 1, " GPU QUEUE WATCH ", status_col | curses.A_BOLD) - - # Daemon Status - server_col = ( - curses.color_pair(2 if "ON" in self.server_status else 1) - | curses.A_REVERSE - ) - self.stdscr.addstr( - 0, 20, f" [{self.server_status}] ", server_col | curses.A_BOLD - ) - - # Daemon Info (Reserved/Excluded) - info_str = f"Res: {self.min_free}" - if self.excluded: - ex_list = ",".join(map(str, sorted(self.excluded))) - info_str += f" | Excl: [{ex_list}]" - - # Right aligned info - info_x = w - len(info_str) - 2 - if info_x > 40: # Prevent overlap - self.stdscr.addstr(0, info_x, info_str, status_col) - - # Action message overlay - if self.action_msg: - if time.time() > self.msg_clear_time: - self.action_msg = "" - else: - msg_x = w // 2 - len(self.action_msg) // 2 - self.stdscr.addstr( - 0, - msg_x, - f" {self.action_msg} ", - curses.color_pair(3) | curses.A_REVERSE, - ) - - # 2. 
Main Windows - # Calculate heights - avail_h = h - 2 # -1 for header, -1 for footer - - win_by_key = {win.key: win for win in self.windows} - - # --- Dynamic Sizing Logic --- - gpu_h = 1 - if not win_by_key["gpu_status"].collapsed: - gpu_content_len = len(self.gpu_status) if self.gpu_status else 1 - gpu_h = min(gpu_content_len + 3, max(3, avail_h // 3)) - gpu_h = max(3, gpu_h) - - running_h = 1 - if not win_by_key["running"].collapsed: - running_items = len(self.data.get("running", [])) - running_content_len = running_items + 1 # +1 for header - running_h = min(running_content_len + 2, max(3, avail_h // 3)) - running_h = max(3, running_h) - - job_h = 1 if win_by_key["job_details"].collapsed else 8 - - queue_keys = ["staging", "pending", "completed"] - queue_min_h = len(queue_keys) - nonqueue_min_heights = { - "running": 1 if win_by_key["running"].collapsed else 3, - "gpu_status": 1 if win_by_key["gpu_status"].collapsed else 3, - "job_details": 1 if win_by_key["job_details"].collapsed else 3, - } - nonqueue_heights = { - "running": running_h, - "gpu_status": gpu_h, - "job_details": job_h, - } - while sum(nonqueue_heights.values()) + queue_min_h > avail_h: - shrinkable = [ - key - for key, height in nonqueue_heights.items() - if height > nonqueue_min_heights[key] - ] - if not shrinkable: - break - key = max(shrinkable, key=lambda k: nonqueue_heights[k]) - nonqueue_heights[key] -= 1 - - running_h = nonqueue_heights["running"] - gpu_h = nonqueue_heights["gpu_status"] - job_h = nonqueue_heights["job_details"] - remaining_h = max(0, avail_h - sum(nonqueue_heights.values())) - visible_queue_keys = [ - k for k in queue_keys if not win_by_key[k].collapsed - ] - heights_by_key = { - "running": running_h, - "gpu_status": gpu_h, - "job_details": job_h, - } - if visible_queue_keys: - collapsed_queue_h = sum( - 1 for k in queue_keys if win_by_key[k].collapsed - ) - visible_h = max(0, remaining_h - collapsed_queue_h) - base_h = max(1, visible_h // len(visible_queue_keys)) - extra 
= max(0, visible_h - (base_h * len(visible_queue_keys))) - for k in queue_keys: - if win_by_key[k].collapsed: - heights_by_key[k] = 1 - else: - add = 1 if extra > 0 else 0 - heights_by_key[k] = base_h + add - if extra > 0: - extra -= 1 - else: - for k in queue_keys: - heights_by_key[k] = 1 - - heights = [heights_by_key[win.key] for win in self.windows] - - current_y = 1 - for i, win in enumerate(self.windows): - wh = heights[i] - win.height = wh # Store actual height for scrolling - if wh <= 0: - continue # Skip hidden windows - if not win.collapsed: - win.ensure_selected_visible(wh) - - active = i == self.active_win_idx - focused = active and self.mode == "ACTION" - - self.draw_window(win, current_y, 0, wh, w, active, focused) - current_y += wh - - # 3. Footer - self.draw_footer(h - 1, w) - - # 4. Log Overlay? - if self.viewing_logs: - self.draw_log_overlay(h, w) - - # 5. Modal - if self.modal: - self.draw_modal(h, w) - - self.stdscr.refresh() - - def draw_modal(self, h, w): - if self.stdscr is None: - return - if self.modal is None: - return - """Draw a modal overlay.""" - m_h, m_w = 16, 80 - y = (h - m_h) // 2 - x = (w - m_w) // 2 - - # Draw box - try: - # Clear area - for i in range(m_h): - self.stdscr.addstr(y + i, x, " " * m_w) - - # Border - self.stdscr.attron(curses.color_pair(3)) - - # Manual draw for modal box - h_box = m_h - w_box = m_w - self.stdscr.hline(y, x, curses.ACS_HLINE, w_box) - self.stdscr.hline(y + h_box - 1, x, curses.ACS_HLINE, w_box) - self.stdscr.vline(y, x, curses.ACS_VLINE, h_box) - self.stdscr.vline(y, x + w_box - 1, curses.ACS_VLINE, h_box) - self.stdscr.addch(y, x, curses.ACS_ULCORNER) - self.stdscr.addch(y, x + w_box - 1, curses.ACS_URCORNER) - self.stdscr.addch(y + h_box - 1, x, curses.ACS_LLCORNER) - self.stdscr.addch(y + h_box - 1, x + w_box - 1, curses.ACS_LRCORNER) - - self.stdscr.attroff(curses.color_pair(3)) - - # Title - self.stdscr.addstr( - y, - x + 2, - f" {self.modal['title']} ", - curses.color_pair(3) | curses.A_BOLD, - 
) - - # Content - text = self.modal.get("text", "") - if text: - self.stdscr.addstr(y + 2, x + 2, text[: m_w - 4]) - - # Input field - if self.modal["type"] == "INPUT": - val = self.modal.get("value", "") - cursor_pos = self.modal.get("cursor_pos", len(val)) - - field_w = m_w - 6 - field_h = m_h - 6 # Leave space for buttons/title - # Import textwrap or use simple slicing - # Simple character wrapping - lines = [] - for i in range(0, len(val), field_w): - lines.append(val[i : i + field_w]) - if not lines: - lines = [""] - - # If cursor is at exact end, handle it? - # Logic puts it at end of last line. - - # Ensure we have enough lines to cover cursor - # Cursor (row, col) - c_row = cursor_pos // field_w - c_col = cursor_pos % field_w - - # Draw lines - # We might need scrolling if text exceeds box height? - # For now assuming it fits or we enforce limit. - # Let's implement basic vertical scrolling if needed - - scroll_row = self.modal.get("scroll_row", 0) - if c_row < scroll_row: - scroll_row = c_row - elif c_row >= scroll_row + field_h: - scroll_row = c_row - field_h + 1 - self.modal["scroll_row"] = scroll_row - - for i in range(field_h): - line_idx = scroll_row + i - draw_y = y + 4 + i - - line_content = "" - if line_idx * field_w < len(val): - # Construct line from val directly to rely on consistent math - start = line_idx * field_w - end = start + field_w - line_content = val[start:end] - elif line_idx == 0 and not val: - line_content = "" - - # Only draw if valid line or active cursor line - # Use White (pair 6) for input text - self.stdscr.addstr( - draw_y, x + 3, line_content, curses.color_pair(6) - ) - - # Cursor - if line_idx == c_row: - # Ensure c_col is within bounds of visual line - # If cursor is at end of line (col=0 of next), handle it? 
- # No, math handles it: c_col is 0..width-1 - # Cursor pos logic handles new line wrapping - - char_at = " " - if c_col < len(line_content): - char_at = line_content[c_col] - - self.stdscr.addstr( - draw_y, - x + 3 + c_col, - char_at, - curses.A_REVERSE | curses.color_pair(6), - ) - - # Buttons - btn_y = y + m_h - 2 - if self.modal["type"] == "CONFIRM": - btns = "[y] Yes [n] No" - self.stdscr.addstr(btn_y, x + (m_w - len(btns)) // 2, btns) - elif self.modal["type"] == "INPUT": - btns = "[Enter] Confirm [Esc] Cancel" - self.stdscr.addstr(btn_y, x + (m_w - len(btns)) // 2, btns) - - except Exception: - pass - - def draw_compact_gpu_info(self, y, x, h, w): - if self.stdscr is None: - return - """Draw compact nvidia-smi style info.""" - try: - left = x + 2 - inner = max(0, w - 4) - prefix_w, hist_w, proc_w = _gpu_status_column_widths(inner) - hdr_attr = curses.A_NORMAL | curses.A_UNDERLINE - row_attr = curses.A_NORMAL - - if not self.gpu_status: - self.stdscr.addstr(y + 1, left, "No GPU info available", curses.A_NORMAL) - return - - hdr_idx = "IDX".ljust(GPU_COL_IDX_W)[:GPU_COL_IDX_W] - hdr_util = "UTIL".ljust(GPU_COL_UTIL_W)[:GPU_COL_UTIL_W] - hdr_mem = "MEM".ljust(GPU_COL_MEM_W)[:GPU_COL_MEM_W] - prefix_hdr = f"{hdr_idx} {hdr_util} {hdr_mem}" - if len(prefix_hdr) > inner: - self.stdscr.addstr(y, left, prefix_hdr[:inner], hdr_attr) - else: - self.stdscr.addstr(y, left, prefix_hdr, hdr_attr) - col = left + prefix_w + 1 - if hist_w > 0: - h_hist = _fit_text_field( - _gpu_history_header_label(hist_w, POLL_INTERVAL), - hist_w, - ).ljust(hist_w)[:hist_w] - self.stdscr.addstr(y, col, h_hist, hdr_attr) - col += hist_w + 1 - h_proc = _fit_text_field("PROCESSES (USER:PID)", proc_w).ljust(proc_w)[ - :proc_w - ] - if proc_w > 0: - self.stdscr.addstr(y, col, h_proc, hdr_attr) - - for i, g in enumerate(self.gpu_status[: h - 1]): - idx = g.get("index", "?") - used_mb = g.get("used_mb", 0) - total_mb = g.get("total_mb", 0) - util = g.get("util", 0) - - used_gb = used_mb / 1024.0 - 
total_gb = total_mb / 1024.0 - mem_s = f"{used_gb:.1f}/{total_gb:.0f}G" - - line_y = y + 1 + i - idx_s = ( - str(int(idx))[:GPU_COL_IDX_W] - if isinstance(idx, int) - else str(idx)[:GPU_COL_IDX_W] - ).ljust(GPU_COL_IDX_W)[:GPU_COL_IDX_W] - u = max(0, min(100, int(util))) - util_s = f"{u:>3}%".ljust(GPU_COL_UTIL_W)[:GPU_COL_UTIL_W] - mem_col = mem_s[:GPU_COL_MEM_W].ljust(GPU_COL_MEM_W)[:GPU_COL_MEM_W] - prefix_row = f"{idx_s} {util_s} {mem_col}" - if len(prefix_row) > inner: - self.stdscr.addstr(line_y, left, prefix_row[:inner], row_attr) - continue - - self.stdscr.addstr(line_y, left, prefix_row, row_attr) - col = left + prefix_w + 1 - if hist_w > 0: - hist = ( - self._gpu_util_history.get(int(idx), []) - if idx != "?" - else [] - ) - spark = sparkline_trailing(hist, hist_w) - self.stdscr.addstr(line_y, col, spark, row_attr) - col += hist_w + 1 - proc_strs = [ - f"{p.get('user', '?')}:{p.get('pid', '?')}" - for p in g.get("processes", []) - if not p.get("zombie") - ] - proc_line = _fit_text_field(", ".join(proc_strs), proc_w).ljust(proc_w)[ - :proc_w - ] - if proc_w > 0: - self.stdscr.addstr(line_y, col, proc_line, curses.A_NORMAL) - except Exception: - pass - - def draw_job_details(self, y, x, h, w): - if self.stdscr is None: - return - """Draw detailed job information for the selected job across all windows.""" - try: - if not self.has_selected_job_context: - self.stdscr.addstr(y + 1, x + 2, "No job selected.", curses.A_NORMAL) - return - - # Find which job is "selected" across the 3 main windows - # Or just use the one from the active window if it's a queue - job = self._selected_job() - - if not job: - self.stdscr.addstr(y + 1, x + 2, "No job selected.", curses.A_NORMAL) - return - - self.last_selected_job = job # Keep it - - jid = job["id"] - queue_s = str(job.get("_type", "unk")).upper() - st = str(job.get("status", "-")).upper() - gpu_s = str(job.get("gpus", "-")) - meta_str = f"ID: {jid} | Queue: {queue_s} | Status: {st} | GPUs: {gpu_s}" - inner_w = max(1, 
w - 4) - self.stdscr.addstr(y, x + 2, _fit_text_field(meta_str, inner_w), curses.A_BOLD) - - cmd = job.get("cmd", "") - # Normalize command (remove newlines) - cmd = cmd.replace("\n", " ").replace("\r", " ") - - prefix = "Cmd: " - # Width calculation checks - # w - 5 - len(prefix) ? - # w is Full Width. - # Draws at x+2. - # Max index w-2 (border at w-1). - # So length available = w-4. - # Subtract prefix. - # Subtract 2 more for safety. - safe_width = max(1, inner_w - len(prefix)) - - import textwrap - - lines = textwrap.wrap(cmd, width=safe_width) or ["-"] - - for i, line in enumerate(lines[: h - 1]): - self.stdscr.addstr( - y + 1 + i, - x + 2, - prefix if i == 0 else " " * len(prefix), - curses.A_NORMAL, - ) - self.stdscr.addstr( - y + 1 + i, - x + 2 + len(prefix), - _fit_text_field(line, safe_width), - ) - except Exception: - pass - - def draw_window(self, win, y, x, h, w, active, focused): - if self.stdscr is None: - return - # Draw Box - try: - # Border Color logic: Blue when focused, white when selected. - # If unselected, don't draw borders (they'll blend with background). - if focused: - border_color = curses.color_pair(5) # Blue - title_color = curses.color_pair(5) | curses.A_BOLD # Blue bold - draw_border = True - elif active: - border_color = curses.color_pair(6) # White - title_color = curses.color_pair(6) | curses.A_BOLD # White bold - draw_border = True - else: - # Unselected: don't draw border; title should use normal text. 
- border_color = None - title_color = curses.A_NORMAL - draw_border = False - - if draw_border: - self.stdscr.attron(border_color) - self.stdscr.hline(y, x, curses.ACS_HLINE, w) - if not win.collapsed: - self.stdscr.hline(y + h - 1, x, curses.ACS_HLINE, w) - self.stdscr.vline(y, x, curses.ACS_VLINE, h) - self.stdscr.vline(y, x + w - 1, curses.ACS_VLINE, h) - self.stdscr.addch(y, x, curses.ACS_ULCORNER) - self.stdscr.addch(y, x + w - 1, curses.ACS_URCORNER) - self.stdscr.addch(y + h - 1, x, curses.ACS_LLCORNER) - self.stdscr.addch(y + h - 1, x + w - 1, curses.ACS_LRCORNER) - self.stdscr.attroff(border_color) - - # Title - count_str = "" - if win.key in ["running", "staging", "pending", "completed"]: - count_str = f"[{len(win.items)}]" - - title_s = f" {win.title} {count_str} " - if win.collapsed: - title_s = f" [+] {win.title} {count_str} " - elif focused: - title_s = f" [ {win.title} {count_str} ] " - - # Title uses matching color scheme - self.stdscr.addstr(y, 2, title_s, title_color) - - if win.collapsed: - return - - # Dispatch specialized drawing - if win.key == "gpu_status": - self.draw_compact_gpu_info(y + 1, x, h - 2, w) - return - if win.key == "job_details": - self.draw_job_details(y + 1, x, h - 2, w) - return - - # Header? 
- header_offset = 1 - hdr = "" - if win.key == "running": - # Dynamic header based on column widths - cw = self.col_widths["running"] - hdr = ( - f" {'ID':<{cw['id']}} {'PID':<{cw['pid']}} " - f"{'GPUS':<{cw['gpus']}} {'ELAPSED':<{cw['elapsed']}} CMD" - ) - elif win.key == "staging": - cw = self.col_widths["staging"] - hdr = ( - f" {'ID':<{cw['id']}} {'GPUS':<{cw['gpus']}} " - f"{'WAITING':<{cw['waiting']}} CMD" - ) - elif win.key == "pending": - # Dynamic header based on column widths - cw = self.col_widths["pending"] - hdr = ( - f" {'ID':<{cw['id']}} {'GPUS':<{cw['gpus']}} " - f"{'WAITING':<{cw['waiting']}} CMD" - ) - elif win.key == "completed": - # Dynamic header based on column widths - cw = self.col_widths["completed"] - hdr = ( - f" {'ID':<{cw['id']}} {'RUNTIME':<{cw['runtime']}} " - f"{'AGO':<{cw['ago']}} CMD" - ) - - if hdr: - self.stdscr.addstr(y + 1, 1, hdr, curses.A_NORMAL | curses.A_UNDERLINE) - - # List items - display_items = list(win.items) - - # Edit-mode row swap in staging only - if ( - active - and self.edit_mode_active - and self.edit_job - and win.key == "staging" - ): - edit_copy = copy.deepcopy(self.edit_job) - edit_copy["_edit_field_idx"] = self.edit_field_idx - for idx, it in enumerate(display_items): - if it["id"] == edit_copy["id"]: - display_items[idx] = edit_copy - break - - # Recalculate list height or just use what we have - list_h = h - 2 - header_offset - if list_h < 1: - return - - start_y = y + 1 + header_offset - - # We need to handle scroll offset carefully if we injected an item - # If we injected, the list is 1 longer. 
- visible_items = display_items[ - win.scroll_offset : win.scroll_offset + list_h - ] - - for i, item in enumerate(visible_items): - abs_idx = win.scroll_offset + i - - is_sel = False - if active: - if abs_idx == win.selected_idx: - is_sel = True - - line_res, line_col = self.format_job_line(item, w - 2) - - draw_style = curses.A_NORMAL - if is_sel: - if focused: - draw_style = curses.A_REVERSE - else: - # In NAV mode, pass or different style - pass - - # Handle Rich Text (Dictionary) - if isinstance(line_res, dict) and line_res.get("type") == "rich": - current_x = 1 - segments = line_res["segments"] - base_attr = line_res.get("base_color", curses.A_NORMAL) - - # Clear line with base attr first? - # self.stdscr.addstr(start_y + i, 1, " " * (w-2), base_attr) - - for text, attr in segments: - try: - self.stdscr.addstr( - start_y + i, current_x, text, attr | base_attr - ) - current_x += len(text) - except Exception: - pass - else: - self.stdscr.addstr(start_y + i, 1, line_res, line_col | draw_style) - - # Scroll bar indicator? 
- if len(display_items) > list_h: - sb_h = max(1, int(list_h * (list_h / len(display_items)))) - sb_pos = int((win.scroll_offset / len(display_items)) * list_h) - for k in range(list_h): - char = "│" - if k >= sb_pos and k < sb_pos + sb_h: - char = "█" - try: - self.stdscr.addstr(start_y + k, w - 1, char, curses.A_NORMAL) - except Exception: - pass - except Exception: - # self.stdscr.addstr(y+1, 1, str(e)) - pass - - def draw_footer(self, y, w): - if self.stdscr is None: - return - try: - if self.modal: - return # Don't draw footer over modal or distract - - help_str = " Q:Quit " - if self.edit_mode_active: - help_str += ( - "e:Save Staging Esc:Cancel h/l:Field j/k:GPUs Enter:command editor" - ) - elif self.mode == "NAV": - help_str += "j/k:Select l:Focus n:New Job Tab:Collapse" - else: - # Context-aware help based on active window - win = self.windows[self.active_win_idx] - if win.key == "staging": - help_str += "h:Back c:Discard e:Edit s:Send d:Dup n:New" - elif win.key == "pending": - help_str += "h:Back c:Cancel J/K:Reorder Space:Log" - elif win.key == "running": - help_str += "h:Back Space:Log c:Cancel p:Pause d:Dup" - elif win.key == "completed": - help_str += "h:Back Space:Log r:Stage Retry x:Delete d:Dup" - else: - help_str += "h:Back Space:Log d:Dup n:New" - - # Mode display on the right - mode_label = self.mode - if self.edit_mode_active: - mode_label = "EDIT" - - mode_s = f" MODE: {mode_label} " - padding = " " * (w - len(help_str) - len(mode_s)) - - full_str = help_str + padding + mode_s - # Blue background (Pair 5) - self.stdscr.addstr(y, 0, full_str, curses.color_pair(5) | curses.A_REVERSE) - except Exception: - pass - - def draw_log_overlay(self, h, w): - if self.stdscr is None: - return - # Draw a floating window for logs - margin_x = 4 - margin_y = 2 - win_h = h - 2 * margin_y - win_w = w - 2 * margin_x - - # Draw shadow or clear - for i in range(win_h): - self.stdscr.addstr(margin_y + i, margin_x, " " * win_w, curses.A_NORMAL) - - # Box Border - 
try: - self.stdscr.attron(curses.color_pair(3)) - self.stdscr.hline(margin_y, margin_x, curses.ACS_HLINE, win_w) - self.stdscr.hline(margin_y + win_h - 1, margin_x, curses.ACS_HLINE, win_w) - self.stdscr.vline(margin_y, margin_x, curses.ACS_VLINE, win_h) - self.stdscr.vline(margin_y, margin_x + win_w - 1, curses.ACS_VLINE, win_h) - self.stdscr.addch(margin_y, margin_x, curses.ACS_ULCORNER) - self.stdscr.addch(margin_y, margin_x + win_w - 1, curses.ACS_URCORNER) - self.stdscr.addch(margin_y + win_h - 1, margin_x, curses.ACS_LLCORNER) - self.stdscr.addch( - margin_y + win_h - 1, margin_x + win_w - 1, curses.ACS_LRCORNER - ) - self.stdscr.attroff(curses.color_pair(3)) - # Title - title = f" LOGS: {self.log_job_id} " - self.stdscr.addstr( - margin_y, margin_x + 2, title, curses.A_BOLD | curses.A_REVERSE - ) - - # Content - content_h = win_h - 2 - visible_lines = self.log_content[ - self.log_scroll : self.log_scroll + content_h - ] - - for i, line in enumerate(visible_lines): - if len(line) > win_w - 2: - line = line[: win_w - 5] + "..." 
- self.stdscr.addstr( - margin_y + 1 + i, margin_x + 1, line, curses.A_NORMAL - ) - - # Footer - footer_str = " h/Esc:Close j/k:Scroll PGUP/DN:Jump L:Full(less) " - self.stdscr.addstr( - margin_y + win_h - 1, margin_x + 2, footer_str, curses.A_REVERSE - ) - - except Exception: - pass - - def action_view_logs(self): - """View logs for selected job using external tool.""" - win = self.windows[self.active_win_idx] - job = win.get_selected() - if not job: - return - - self.action_open_external_logs() - - def action_open_external_logs(self): - """Open logs in an external tool (less +F).""" - if self.mode != "ACTION": - return - win = self.windows[self.active_win_idx] - job = win.get_selected() - if not job: - return - - log_path = Path.home() / f".gpu_queue/logs/{job['id']}.log" - if not log_path.exists(): - self.action_msg = "Log file not found" - self.msg_clear_time = time.time() + 2.0 - return - - # We need to temporarily exit curses - if self.stdscr: - curses.def_shell_mode() - self.stdscr.clear() - self.stdscr.refresh() - curses.endwin() - - try: - # Use +F for following if it's currently running, otherwise just open it - cmd = ["less", "+G", str(log_path)] - if job["_type"] == "running": - cmd = ["less", "+F", str(log_path)] - - # Ignore SIGINT in parent (Python) so Ctrl+C only kills 'less' - old_handler = signal.signal(signal.SIGINT, signal.SIG_IGN) - try: - subprocess.run(cmd) - finally: - # Restore SIGINT handler - signal.signal(signal.SIGINT, old_handler) - finally: - # Re-enter curses - if self.stdscr: - self.stdscr.refresh() - curses.doupdate() - # Some implementations need reset_shell_mode - curses.reset_shell_mode() - self.stdscr.keypad(True) - self.stdscr.nodelay(True) - - def main(self, stdscr): - import curses - - self.stdscr = stdscr - - curses.start_color() - curses.use_default_colors() - - # curr_y, curr_x = 0, 0 - - # Define colors (1: Red, 2: Green, 3: Yellow, 4: Cyan/Blue) - curses.init_pair(1, curses.COLOR_RED, -1) - curses.init_pair(2, 
curses.COLOR_GREEN, -1) - curses.init_pair(3, curses.COLOR_YELLOW, -1) - curses.init_pair(4, curses.COLOR_CYAN, -1) - curses.init_pair(5, curses.COLOR_BLUE, -1) - curses.init_pair(6, curses.COLOR_WHITE, -1) - curses.curs_set(0) - self.stdscr.nodelay(True) - self.stdscr.keypad(True) - - self.start() - - try: - while True: - self.draw() - - ch = self.stdscr.getch() - if ch == -1: - time.sleep(0.05) - continue - - if self.modal: - m = self.modal - if m["type"] == "CONFIRM": - if ch == ord("y") or ch == 10: # Yes - if m["on_confirm"]: - m["on_confirm"]() - self.modal = None - elif ch == ord("n") or ch == 27: # No / Esc - if m["on_cancel"]: - m["on_cancel"]() - self.modal = None - - elif m["type"] == "INPUT": - cpos = m.get("cursor_pos", len(m["value"])) - val = m["value"] - - if ch == 27: # Esc - if m["on_cancel"]: - m["on_cancel"]() - self.modal = None - elif ch == 10: # Enter - if m["on_confirm"]: - m["on_confirm"](val) - self.modal = None - elif ch == 127 or ch == curses.KEY_BACKSPACE: # Backspace - if cpos > 0: - m["value"] = val[: cpos - 1] + val[cpos:] - m["cursor_pos"] = cpos - 1 - elif ch == curses.KEY_DC: # Delete - if cpos < len(val): - m["value"] = val[:cpos] + val[cpos + 1 :] - elif ch == curses.KEY_LEFT: - m["cursor_pos"] = max(0, cpos - 1) - elif ch == curses.KEY_RIGHT: - m["cursor_pos"] = min(len(val), cpos + 1) - elif ch == curses.KEY_UP: - # Move up one line (width 74 = 80 - 6) - width = 80 - 6 - cpos_int = cast(int, cpos) - m["cursor_pos"] = max(0, cpos_int - width) - elif ch == curses.KEY_DOWN: - width = 80 - 6 - cpos_int = cast(int, cpos) - m["cursor_pos"] = min(len(str(val)), cpos_int + width) - elif ch == curses.KEY_HOME: - m["cursor_pos"] = 0 - elif ch == curses.KEY_END: - m["cursor_pos"] = len(val) - elif ch >= 32 and ch <= 126: # Printable - cpos_int = cast(int, cpos) - m["value"] = ( - str(val)[:cpos_int] + chr(ch) + str(val)[cpos_int:] - ) - m["cursor_pos"] = cpos_int + 1 - continue - - if self.edit_mode_active: - # Force redraw of 
footer/status - # self.stdscr.touchwin() - - if ch == 27: # Esc -> Cancel - self.edit_mode_active = False - self.edit_job = None - elif ch == ord("e"): # Confirm - if self.edit_job is None: - continue - self.execute_action( - "update_staging", - self.edit_job["id"], - cmd=self.edit_job["cmd"], - gpus=self.edit_job["gpus"], - ) - self.edit_mode_active = False - self.edit_job = None - - elif ch == ord("h"): # Cycle Left - self.edit_field_idx = max(0, self.edit_field_idx - 1) - elif ch == ord("l"): # Cycle Right - self.edit_field_idx = min(1, self.edit_field_idx + 1) - - elif ch == ord("j"): # Decrease Value - if self.edit_job is None: - continue - if self.edit_field_idx == 0: # GPUS - self.edit_job["gpus"] = max( - 1, int(self.edit_job["gpus"]) - 1 - ) - - elif ch == ord("k"): # Increase Value - if self.edit_job is None: - continue - if self.edit_field_idx == 0: # GPUS - self.edit_job["gpus"] = int(self.edit_job["gpus"]) + 1 - - elif ch == 10 or ch == ord("i"): # Enter/Edit Text - if self.edit_field_idx == 1: # Command - self.prompt_edit_command() - - continue - - # Log Viewing Mode - if self.viewing_logs: - if ch == ord("h") or ch == ord("q") or ch == 27: # h or q or Esc - self.viewing_logs = False - elif ch == ord("k"): - self.log_scroll = max(0, self.log_scroll - 1) - elif ch == ord("j"): - max_scroll = max( - 0, len(self.log_content) - (self.stdscr.getmaxyx()[0] - 6) - ) - self.log_scroll = min(max_scroll, self.log_scroll + 1) - continue - - # Normal Mode - if ch == ord("q"): - break - - if self.mode == "NAV": - if ch == ord("k"): - self.active_win_idx = max(0, self.active_win_idx - 1) - elif ch == ord("j"): - self.active_win_idx = min( - len(self.windows) - 1, self.active_win_idx + 1 - ) - elif ch == ord("l"): # l - # Disable l for non-interactive windows - curr_win = self.windows[self.active_win_idx] - if curr_win.key not in ["gpu_status", "job_details"]: - self._enter_action_window(curr_win) - elif ch == 10: # Enter - curr_win = 
self.windows[self.active_win_idx] - if curr_win.key not in ["gpu_status", "job_details"]: - self._enter_action_window(curr_win) - elif ch == 9: # Tab - self.windows[self.active_win_idx].collapsed = not self.windows[ - self.active_win_idx - ].collapsed - elif ch == ord("n"): # New Job (Global context) - self.prompt_new_job() - - elif self.mode == "ACTION": - win = self.windows[self.active_win_idx] - h = (self.stdscr.getmaxyx()[0] - 5) // 3 # approx height per window - - if ch == ord("h") or ch == 27: # h or Esc - self.mode = "NAV" - # Reset scroll to top - win.scroll_offset = 0 - win.selected_idx = 0 - elif ch == ord("k"): - win.scroll(-1, h) - elif ch == ord("j"): - win.scroll(1, h) - elif ch == ord(" "): - self.action_view_logs() - elif ch == ord("L"): - self.action_open_external_logs() - - # Actions (context-aware per window type) - elif ch == ord("c"): - if win.key in ["staging", "pending", "running"]: - self.do_action("cancel") - elif ch == ord("x"): # Remove/Delete (completed only) - if win.key == "completed": - self.do_action("remove") - elif ch == ord("d"): # Dup (all windows) - self.do_action("dup") - elif ch == ord("n"): # New - self.prompt_new_job() - elif ch == ord("p"): # Pause (running only) - if win.key == "running": - self.do_action("pause") - elif ch == ord("e"): # Edit (staging only) - if win.key == "staging": - self.do_action("edit") - elif ch == ord("r"): # Retry (completed only) - if win.key == "completed": - self.do_action("retry") - elif ch == ord("s"): # Send staged job to pending - if win.key == "staging": - self.do_action("send_to_pending") - elif ch == ord("J"): # Move pending job down - if win.key == "pending": - self.do_action("move_pending_down") - elif ch == ord("K"): # Move pending job up - if win.key == "pending": - self.do_action("move_pending_up") - elif ch == 10: # Enter sends staged job to pending (with confirm) - if win.key == "staging": - self.do_action("send_to_pending") - - finally: - self.stop() - - def 
add_job_internal(self, cmd, gpus=2, priority=1): - job = { - "id": generate_job_id(), - "cmd": cmd, - "gpus": gpus, - "added": datetime.now().isoformat(), - "priority": priority, - "cwd": os.getcwd(), - } - with locked_queue() as queue: - queue["pending"].append(job) - self.action_msg = f"Added {job['id']}" - self.msg_clear_time = time.time() + 2.0 - - def prompt_edit_command(self): - if self.edit_job is None: - return - - # Use external editor - # 1. Write current cmd to temp file - with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".sh") as tf: - tf.write(str(self.edit_job["cmd"])) - tf_path = tf.name - - if self.stdscr: - curses.def_shell_mode() - self.stdscr.clear() - self.stdscr.refresh() - curses.endwin() - - try: - editor = os.environ.get("EDITOR", "nano") - subprocess.run([editor, tf_path]) - - # 3. Read back (collapse newlines/whitespace to single line) - with open(tf_path, "r") as f: - new_cmd = " ".join(f.read().split()) - self._update_edit_cmd(new_cmd) - - os.unlink(tf_path) - except Exception: - pass - - finally: - if self.stdscr: - self.stdscr.refresh() - curses.doupdate() - curses.reset_shell_mode() - self.stdscr.keypad(True) - self.stdscr.nodelay(True) - - def _update_edit_cmd(self, val): - if self.edit_job is None: - return - if self.edit_job: - self.edit_job["cmd"] = val - - def prompt_new_job(self): - job = make_staged_job(generate_job_id()) - with locked_queue() as q: - insert_staged_job(q, job) - - self.edit_job = copy.deepcopy(job) - self.edit_job["_type"] = "staging" - self.edit_is_new = True - self.edit_mode_active = True - self.edit_field_idx = 0 - - for i, w in enumerate(self.windows): - if w.key == "staging": - self.active_win_idx = i - w.selected_idx = 0 - w.scroll_offset = 0 - self.has_selected_job_context = True - break - - def prompt_change_gpus(self): - win = self.windows[self.active_win_idx] - job = win.get_selected() - if not job: - return - self.modal = { - "type": "INPUT", - "title": "Change GPU Requirement", - 
"text": f"New GPU count for {job['id']}:", - "value": str(job.get("gpus", 1)), - "cursor_pos": len(str(job.get("gpus", 1))), - "on_confirm": lambda val: self.execute_action( - "change_gpus", job["id"], new_val=val - ), - "on_cancel": None, - } - - def do_action(self, action): - """Perform action on selected job.""" - win = self.windows[self.active_win_idx] - job = win.get_selected() - if not job: - return - - jid = job["id"] if job else "" - - # Actions that require modals - if action == "cancel": - if win.key not in ["staging", "pending", "running"]: - return - if win.key == "staging": - self.modal = { - "type": "CONFIRM", - "title": "Cancel Staged Job", - "text": f"Cancel staged job {jid}?", - "on_confirm": lambda: self.execute_action("cancel", jid), - "on_cancel": None, - } - return - - self.modal = { - "type": "CONFIRM", - "title": "Cancel Job", - "text": f"Cancel job {jid}?", - "on_confirm": lambda: self.execute_action("cancel", jid), - "on_cancel": None, - } - return - - elif action == "remove": - # Remove only works for completed (already checked in keybinding) - self.modal = { - "type": "CONFIRM", - "title": "Delete Job", - "text": f"Permanently delete {jid}?", - "on_confirm": lambda: self.execute_action("delete", jid), - "on_cancel": None, - } - return - - elif action == "edit": - # Edit only works for staging (already checked in keybinding) - self.edit_job = copy.deepcopy(job) - self.edit_job["_type"] = "staging" - self.edit_is_new = False - self.edit_mode_active = True - self.edit_field_idx = 0 - return - - elif action == "dup": - dup_job = make_staged_job( - generate_job_id(), job.get("cmd", ""), job.get("gpus", 1) - ) - with locked_queue() as q: - insert_staged_job(q, dup_job) - self.edit_job = copy.deepcopy(dup_job) - self.edit_job["_type"] = "staging" - self.edit_is_new = True - self.edit_mode_active = True - self.edit_field_idx = 0 - - # Switch to STAGING window - for i, w in enumerate(self.windows): - if w.key == "staging": - self.active_win_idx = 
i - w.selected_idx = 0 - w.scroll_offset = 0 - self.has_selected_job_context = True - break - - return - elif action == "send_to_pending": - if win.key != "staging": - return - self.modal = { - "type": "CONFIRM", - "title": "Send To Pending", - "text": f"Send job {jid} to pending queue?", - "on_confirm": lambda: self.execute_action("send_to_pending", jid), - "on_cancel": None, - } - return - - # Immediate actions - self.execute_action(action, jid) - - def execute_action(self, action, jid, **kwargs): - # Re-use existing cmd functions if possible, or call logic directly - msg = "" - try: - if action == "cancel": - # If it's a pending job, move to completed with status 'cancelled' - # If it's running, call the external tool - is_pending = False - is_staging = False - with locked_queue() as q: - for j in q["staging"]: - if j["id"] == jid: - is_staging = True - break - for j in q["pending"]: - if j["id"] == jid: - is_pending = True - break - - if is_staging: - with locked_queue() as q: - if cancel_staged_job(q, jid): - msg = f"Cancelled staged {jid}" - elif is_pending: - with locked_queue() as q: - for i, j in enumerate(q["pending"]): - if j["id"] == jid: - job = q["pending"].pop(i) - job["status"] = "cancelled" - job["ended"] = datetime.now().isoformat() - q["completed"].insert(0, job) - msg = f"Cancelled {jid}" - break - else: - # Running job - subprocess.Popen( - ["gpu-queue", "cancel", jid], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - msg = f"Cancelling {jid}..." - - elif action == "delete": - with locked_queue() as q: - for i, j in enumerate(q["completed"]): - if j["id"] == jid: - q["completed"].pop(i) - msg = f"Deleted {jid}" - break - - elif action == "pause": - subprocess.Popen( - ["gpu-queue", "pause", jid], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - msg = f"Pausing {jid}..." 
- - elif action == "retry": - with locked_queue() as q: - for job in q["completed"]: - if job["id"] == jid: - new_job = make_staged_job( - generate_job_id(), job["cmd"], job.get("gpus", 1) - ) - if stage_completed_retry(q, jid, new_job): - msg = f"Staged retry for {jid}" - break - - elif action == "update_staging": - cmd = kwargs.get("cmd") - gpus = kwargs.get("gpus") - - with locked_queue() as q: - for job in q["staging"]: - if job["id"] == jid: - if cmd is not None: - job["cmd"] = cmd - if gpus is not None: - job["gpus"] = gpus - msg = f"Updated staged job {jid}" - break - - elif action == "send_to_pending": - with locked_queue() as q: - if send_staged_job_to_pending(q, jid): - msg = f"Sent {jid} to pending" - - elif action == "discard_staging": - with locked_queue() as q: - if cancel_staged_job(q, jid): - msg = f"Discarded staged job {jid}" - if self.edit_mode_active and self.edit_job is not None: - if self.edit_job.get("id") == jid: - self.edit_mode_active = False - self.edit_job = None - - elif action in ["move_pending_up", "move_pending_down"]: - offset = -1 if action == "move_pending_up" else 1 - new_idx = None - queue_snapshot = None - with locked_queue() as q: - if move_pending_job(q, jid, offset): - msg = f"Moved {jid} {'up' if offset < 0 else 'down'}" - for idx, job in enumerate(q["pending"]): - if job.get("id") == jid: - new_idx = idx - break - queue_snapshot = copy.deepcopy(q) - else: - msg = "Cannot move further" - - if queue_snapshot is not None: - with self.lock: - self._set_data_snapshot(queue_snapshot) - for w in self.windows: - if w.key == "pending": - if new_idx is not None: - w.selected_idx = new_idx - w.ensure_selected_visible() - break - - except Exception as e: - msg = f"Err: {str(e)}" - - self.action_msg = msg - self.msg_clear_time = time.time() + 2.0 - - -def cmd_watch(args): - """Interactive TUI for queue monitoring.""" - import curses - - tui = GPUQueueTUI(args.interval) - try: - curses.wrapper(tui.main) - except KeyboardInterrupt: - 
pass - print("Exited TUI.") - - -def main(): - parser = argparse.ArgumentParser(description="GPU Job Queue Scheduler") - subparsers = parser.add_subparsers(dest="command", required=True) - - # add - add_parser = subparsers.add_parser("add", help="Add a job to the queue") - add_parser.add_argument("command", help="Command to run") - add_parser.add_argument( - "--gpus", "-g", type=int, default=2, help="Number of GPUs required" - ) - add_parser.add_argument( - "--priority", - "-p", - choices=["low", "medium", "high"], - default="medium", - help="Job priority", - ) - add_parser.add_argument( - "--front", - "-f", - action="store_true", - help="Add to front of queue (Urgent priority)", - ) - add_parser.set_defaults(func=cmd_add) - - # start - start_parser = subparsers.add_parser( - "start", help="Start the queue scheduler (background)" - ) - start_parser.add_argument( - "--min-free", type=int, default=2, help="Number of GPUs to always keep free" - ) - start_parser.set_defaults(func=cmd_start) - - # stop - stop_parser = subparsers.add_parser("stop", help="Stop the background scheduler") - stop_parser.set_defaults(func=cmd_stop) - - # serve - serve_parser = subparsers.add_parser( - "serve", help="Run the queue scheduler (foreground)" - ) - serve_parser.add_argument( - "--min-free", type=int, default=2, help="Number of GPUs to always keep free" - ) - serve_parser.add_argument( - "--exclude-gpus", - type=str, - default="", - help="Comma-separated list of GPU indices to ignore (e.g. 
'0,1')", - ) - serve_parser.set_defaults(func=cmd_serve) - - # status removed - - # cancel - cancel_parser = subparsers.add_parser("cancel", help="Cancel a pending job") - cancel_parser.add_argument("job_id", help="Job ID to cancel") - cancel_parser.set_defaults(func=cmd_cancel) - - # logs - logs_parser = subparsers.add_parser("logs", help="Show job logs") - logs_parser.add_argument("job_id", help="Job ID") - logs_parser.add_argument( - "--lines", "-n", type=int, default=50, help="Number of lines" - ) - logs_parser.set_defaults(func=cmd_logs) - - # clear - clear_parser = subparsers.add_parser("clear", help="Clear completed jobs") - clear_parser.set_defaults(func=cmd_clear) - - # retry - retry_parser = subparsers.add_parser("retry", help="Re-queue a completed job") - retry_parser.add_argument("job_id", help="Job ID to retry") - retry_parser.add_argument( - "--front", "-f", action="store_true", help="Add to front of queue" - ) - retry_parser.set_defaults(func=cmd_retry) - - # pause - pause_parser = subparsers.add_parser( - "pause", help="Pause a running job (re-queue at front)" - ) - pause_parser.add_argument("job_id", help="Job ID to pause") - pause_parser.set_defaults(func=cmd_pause) - - # watch - watch_parser = subparsers.add_parser( - "watch", help="Watch queue status continuously" - ) - watch_parser.add_argument( - "--interval", "-n", type=float, default=2.0, help="Update interval in seconds" - ) - watch_parser.set_defaults(func=cmd_watch) - - args = parser.parse_args() - args.func(args) +from gpu_queue.gpu import get_available_gpu_indices, get_free_gpus +from gpu_queue.ids import generate_job_id +from gpu_queue.paths import ( + DAEMON_LOG, + LOCK_FILE, + LOG_DIR, + MIN_FREE_GPUS, + PID_FILE, + POLL_INTERVAL, + QUEUE_DIR, + QUEUE_FILE, + SERVER_PORT, + get_server_url, +) +from gpu_queue.scheduler import ( + cleanup_dead_jobs, + daemon_loop, + is_daemon_running, + run_job, +) +from gpu_queue.storage import load_queue_raw, save_queue, save_queue_raw +from 
gpu_queue.tui.app import GPUQueueTUI, Window, get_status_data, get_terminal_width + +__all__ = [ + "DAEMON_LOG", + "GPUQueueTUI", + "LOCK_FILE", + "LOG_DIR", + "MIN_FREE_GPUS", + "PID_FILE", + "POLL_INTERVAL", + "QUEUE_DIR", + "QUEUE_FILE", + "SERVER_PORT", + "Window", + "build_parser", + "cleanup_dead_jobs", + "cmd_add", + "cmd_cancel", + "cmd_clear", + "cmd_logs", + "cmd_pause", + "cmd_retry", + "cmd_serve", + "cmd_start", + "cmd_stop", + "cmd_watch", + "daemon_loop", + "generate_job_id", + "get_available_gpu_indices", + "get_free_gpus", + "get_server_url", + "get_status_data", + "get_terminal_width", + "is_daemon_running", + "load_queue_raw", + "main", + "run_job", + "save_queue", + "save_queue_raw", +] if __name__ == "__main__": diff --git a/src/gpu_queue/paths.py b/src/gpu_queue/paths.py new file mode 100644 index 0000000..c693e2d --- /dev/null +++ b/src/gpu_queue/paths.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +from pathlib import Path + +QUEUE_DIR = Path.home() / ".gpu_queue" +QUEUE_FILE = QUEUE_DIR / "jobs.json" +PID_FILE = QUEUE_DIR / "daemon.pid" +DAEMON_LOG = QUEUE_DIR / "daemon.log" +LOG_DIR = QUEUE_DIR / "logs" +LOCK_FILE = QUEUE_DIR / "queue.lock" + +POLL_INTERVAL = 2 +MIN_FREE_GPUS = 2 +SERVER_PORT = 12345 + + +def get_server_url() -> str: + return f"http://localhost:{SERVER_PORT}" diff --git a/src/gpu_queue/queue_state.py b/src/gpu_queue/queue_state.py index c1e778c..44e869a 100644 --- a/src/gpu_queue/queue_state.py +++ b/src/gpu_queue/queue_state.py @@ -64,6 +64,18 @@ def send_staged_job_to_pending(queue: dict[str, list], job_id: str) -> bool: return False +def move_pending_job_to_staging(queue: dict[str, list], job_id: str) -> bool: + for i, job in enumerate(queue["pending"]): + if job["id"] == job_id: + moved = queue["pending"].pop(i) + now = datetime.now().isoformat() + moved["added"] = now + moved["staged_at"] = now + queue["staging"].insert(0, moved) + return True + return False + + def cancel_staged_job(queue: dict[str, list], 
job_id: str) -> bool: for i, job in enumerate(queue["staging"]): if job["id"] == job_id: @@ -75,7 +87,9 @@ def cancel_staged_job(queue: dict[str, list], job_id: str) -> bool: return False -def stage_completed_retry(queue: dict[str, list], job_id: str, new_job: dict[str, Any]) -> bool: +def stage_completed_retry( + queue: dict[str, list], job_id: str, new_job: dict[str, Any] +) -> bool: for i, job in enumerate(queue["completed"]): if job["id"] == job_id: queue["completed"].pop(i) @@ -98,3 +112,35 @@ def move_pending_job(queue: dict[str, list], job_id: str, offset: int) -> bool: return False pending[idx], pending[new_idx] = pending[new_idx], pending[idx] return True + + +def move_pending_jobs(queue: dict[str, list], job_ids: list[str], offset: int) -> bool: + """Move multiple pending jobs together by one row, preserving relative order.""" + if offset not in (-1, 1): + return False + pending = queue["pending"] + if not pending or not job_ids: + return False + + wanted = {str(job_id) for job_id in job_ids} + indexed = [ + i + for i, job in enumerate(pending) + if job.get("id") is not None and str(job.get("id")) in wanted + ] + if not indexed: + return False + + if offset < 0 and indexed[0] == 0: + return False + if offset > 0 and indexed[-1] == len(pending) - 1: + return False + + if offset < 0: + for idx in indexed: + pending[idx - 1], pending[idx] = pending[idx], pending[idx - 1] + return True + + for idx in reversed(indexed): + pending[idx + 1], pending[idx] = pending[idx], pending[idx + 1] + return True diff --git a/src/gpu_queue/scheduler.py b/src/gpu_queue/scheduler.py new file mode 100644 index 0000000..d5c094e --- /dev/null +++ b/src/gpu_queue/scheduler.py @@ -0,0 +1,206 @@ +from __future__ import annotations + +import json +import os +import subprocess +import time +from datetime import datetime +from pathlib import Path +from typing import Optional + +from gpu_queue import paths +from gpu_queue.gpu import get_free_gpus +from gpu_queue.logs import log_msg 
def run_job(job: dict, gpu_indices: list[int]) -> int:
    """Start *job* in a detached shell limited to ``gpu_indices``.

    A header is written to ``<LOG_DIR>/<id>.log``, then the command is run
    through the shell with stdout/stderr appended to that log. When the
    command finishes, its exit status is written to ``<QUEUE_DIR>/<id>.exit``.

    Args:
        job: Job record; ``job["id"]`` and ``job["cmd"]`` are read.
        gpu_indices: GPU indices exported as ``CUDA_VISIBLE_DEVICES``.

    Returns:
        PID of the spawned shell (started in its own session).
    """
    # Function-scope import: the module's import block does not provide shlex.
    import shlex

    log_file = paths.LOG_DIR / f"{job['id']}.log"
    exit_file = paths.QUEUE_DIR / f"{job['id']}.exit"
    gpu_str = ",".join(map(str, gpu_indices))

    # Collapse all whitespace runs (including newlines) into single spaces so
    # the command becomes one shell line.
    cmd = " ".join(job["cmd"].split())

    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = gpu_str
    local_bin = str(Path.home() / ".local" / "bin")
    current_path = env.get("PATH", "")
    # Compare whole PATH entries; a substring test would treat
    # "~/.local/bin-old" as if "~/.local/bin" were already present.
    if local_bin not in current_path.split(os.pathsep):
        env["PATH"] = (
            f"{local_bin}{os.pathsep}{current_path}" if current_path else local_bin
        )

    with open(log_file, "w", encoding="utf-8") as f:
        f.write(f"=== Job {job['id']} ===\n")
        f.write(f"Command: {cmd}\n")
        f.write(f"GPUs: {gpu_str}\n")
        f.write(f"Started: {datetime.now().isoformat()}\n")
        f.write("=" * 40 + "\n\n")

    # shlex.quote keeps the redirections intact even when a path contains
    # spaces or quote characters.
    q_log = shlex.quote(str(log_file))
    q_exit = shlex.quote(str(exit_file))
    wrapped_cmd = f"({cmd}) >> {q_log} 2>&1; echo $? > {q_exit}"

    # NOTE(review): the working directory is hard-coded to ~/pepo; Popen raises
    # if it does not exist. Confirm whether this should come from the job or
    # from configuration.
    proc = subprocess.Popen(
        wrapped_cmd,
        shell=True,
        env=env,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        cwd=Path.home() / "pepo",
        start_new_session=True,
    )

    return proc.pid
def _write_status(
    all_gpus: list[dict], min_free: int, max_use: Optional[int], excluded_gpus: set[int]
) -> None:
    """Publish the scheduler's current view to ``<QUEUE_DIR>/status.json``.

    The JSON object has the keys ``ts`` (ISO timestamp), ``gpus``,
    ``min_free``, ``max_use`` and ``excluded``. The data is written to a
    sibling temporary file and then moved over the target with
    ``os.replace``, so a concurrent reader sees either the previous file or
    the complete new one, never a partially written document.

    This function is strictly best-effort and never raises: the scheduler
    loop calls it while the queue lock is held, and an exception there would
    skip saving the queue.
    """
    try:
        target = paths.QUEUE_DIR / "status.json"
        tmp = target.with_name(f"{target.name}.tmp")
        status_data = {
            "ts": datetime.now().isoformat(),
            "gpus": all_gpus,
            "min_free": min_free,
            "max_use": max_use,
            "excluded": list(excluded_gpus),
        }
        tmp.write_text(json.dumps(status_data))
        os.replace(tmp, target)
    except Exception:
        # Deliberately swallowed; see the docstring.
        pass
dict[str, list]) -> None: + """Save the job queue to disk without locking (atomic replace).""" + save_queue_file(paths.QUEUE_FILE, queue) + + +def load_queue() -> dict[str, list]: + """Load the job queue (backward compatibility, no lock).""" + return load_queue_raw() + + +def save_queue(queue: dict[str, list]) -> None: + """Save the job queue (backward compatibility, no lock).""" + save_queue_raw(queue) diff --git a/src/gpu_queue/tui/__init__.py b/src/gpu_queue/tui/__init__.py new file mode 100644 index 0000000..64d929f --- /dev/null +++ b/src/gpu_queue/tui/__init__.py @@ -0,0 +1,3 @@ +from gpu_queue.tui.app import GPUQueueTUI, Window + +__all__ = ["GPUQueueTUI", "Window"] diff --git a/src/gpu_queue/tui/app.py b/src/gpu_queue/tui/app.py new file mode 100644 index 0000000..f724549 --- /dev/null +++ b/src/gpu_queue/tui/app.py @@ -0,0 +1,2298 @@ +from __future__ import annotations + +import copy +import curses +import json +import os +import shutil +import signal +import subprocess +import tempfile +import threading +import time +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, Optional, Sequence, cast + +from gpu_queue import paths +from gpu_queue.gpu import get_free_gpus +from gpu_queue.ids import generate_job_id +from gpu_queue.queue_state import ( + cancel_staged_job, + insert_staged_job, + make_staged_job, + move_pending_job, + move_pending_job_to_staging, + move_pending_jobs, + send_staged_job_to_pending, + stage_completed_retry, +) +from gpu_queue.scheduler import cleanup_dead_jobs +from gpu_queue.storage import load_queue, locked_queue + +CMD_FIELD_GHOST = "" + +# Sparkline for GPU util (U+2581..U+2588). +# Buffer >= max spark columns so full width scrolls. 
+_BLOCK_SPARK_CHARS = "▁▂▃▄▅▆▇█" +GPU_UTIL_HISTORY_MAX_SAMPLES = 512 +GPU_COL_IDX_W = 4 +GPU_COL_UTIL_W = 4 +GPU_COL_MEM_W = 14 +GPU_MIN_PROC_W = 8 + + +def get_terminal_width() -> int: + """Get the current terminal width.""" + return shutil.get_terminal_size((80, 20)).columns + + +def sparkline_trailing(util_series: list[int], width: int) -> str: + """Latest `width` samples as block chars, right-aligned (scrolls as time passes).""" + if width <= 0: + return "" + if not util_series: + return " " * width + tail = util_series[-width:] + parts: list[str] = [] + for u in tail: + u = max(0, min(100, int(u))) + bi = min(7, u * 7 // 100) + parts.append(_BLOCK_SPARK_CHARS[bi]) + s = "".join(parts) + return (" " * (width - len(s))) + s + + +def _gpu_status_column_widths(inner: int) -> tuple[int, int, int]: + """Return (prefix_w, hist_w, proc_w) for one full-width status row.""" + prefix_w = GPU_COL_IDX_W + 1 + GPU_COL_UTIL_W + 1 + GPU_COL_MEM_W + slack = inner - prefix_w + if slack <= 0: + return prefix_w, 0, 0 + if slack == 1: + return prefix_w, 0, 1 + pair = slack - 2 + if pair < GPU_MIN_PROC_W: + return prefix_w, 0, slack - 1 + proc_w = max(GPU_MIN_PROC_W, pair // 3) + hist_w = pair - proc_w + return prefix_w, hist_w, proc_w + + +def _fit_text_field(s: str, max_w: int) -> str: + if max_w <= 0: + return "" + if len(s) <= max_w: + return s.ljust(max_w) + if max_w <= 3: + return s[:max_w] + return s[: max_w - 3] + "..." 
def shorten_command(cmd: str, max_len: int) -> str:
    """Shorten ``cmd`` to at most ``max_len`` characters.

    Commands that already fit are returned unchanged. Longer ones keep
    their start and end with ``...`` in place of the removed middle. When
    ``max_len`` is 3 or less there is no room for text around an ellipsis,
    so the command is simply cut to its first ``max_len`` characters (an
    empty string for zero or negative limits).
    """
    if len(cmd) <= max_len:
        return cmd
    if max_len <= 3:
        return cmd[: max(0, max_len)]

    # Split the space left after the ellipsis; the tail gets the odd character.
    head_len = (max_len - 3) // 2
    tail_len = max_len - 3 - head_len

    # tail_len >= 1 here, so the negative slice cannot degrade to cmd[-0:].
    return f"{cmd[:head_len]}...{cmd[-tail_len:]}"
self.title = title + self.key = key + self.items = [] + self.selected_idx = 0 + self.scroll_offset = 0 + self.height_pct = height_pct # Target height percentage + self.height: Optional[int] = None + self.collapsed = False + + def update_items(self, items): + self.items = items + # Clamp selection + if self.selected_idx >= len(self.items): + self.selected_idx = max(0, len(self.items) - 1) + self.clamp_scroll() + + def visible_item_count(self, h=None): + """Return how many list rows are visible at this window height.""" + effective_h = h if h is not None else getattr(self, "height", None) + if effective_h is None: + effective_h = 10 + + visible_h = max(1, effective_h - 2) + if self.key in ["running", "staging", "pending", "completed"]: + visible_h = max(1, visible_h - 1) + return visible_h + + def clamp_scroll(self, h=None): + """Keep the scroll offset valid for the current item count and height.""" + if not self.items: + self.selected_idx = 0 + self.scroll_offset = 0 + return + + self.selected_idx = max(0, min(len(self.items) - 1, self.selected_idx)) + visible_h = self.visible_item_count(h) + max_offset = max(0, len(self.items) - visible_h) + self.scroll_offset = max(0, min(max_offset, self.scroll_offset)) + + def ensure_selected_visible(self, h=None): + """Adjust scroll offset so the selected row remains on screen.""" + self.clamp_scroll(h) + if not self.items: + return + + visible_h = self.visible_item_count(h) + if self.selected_idx < self.scroll_offset: + self.scroll_offset = self.selected_idx + elif self.selected_idx >= self.scroll_offset + visible_h: + self.scroll_offset = self.selected_idx - visible_h + 1 + self.clamp_scroll(h) + + def scroll(self, delta, h=None): + """Scroll selection by delta.""" + if not self.items: + return + + new_idx = self.selected_idx + delta + self.selected_idx = max(0, min(len(self.items) - 1, new_idx)) + self.ensure_selected_visible(h) + + def get_selected(self): + if 0 <= self.selected_idx < len(self.items): + return 
def __init__(self, interval=2.0):
    """Set up the initial TUI state; no curses or I/O work happens here.

    Args:
        interval: Seconds between reads of the daemon status file in the
            GPU polling loop.
    """
    self.interval = interval
    self.stdscr = None
    self.running = False
    self.lock = threading.Lock()

    # State
    self.data = {"staging": [], "running": [], "pending": [], "completed": []}
    self.gpu_status = []
    self.min_free = 2
    self.max_use: Optional[int] = None
    self.excluded = []
    self.server_status = "UNKNOWN"
    self.last_updated = 0
    self.action_msg = ""
    self.msg_clear_time = 0
    self.modal: Optional[Dict[str, Any]] = None  # { type, title, text, val... }
    self._gpu_util_history: dict[int, list[int]] = {}
    self._last_util_status_ts: Optional[str] = None

    # Windows
    self.windows = [
        Window("RUNNING", "running", 0.2),
        Window("PENDING", "pending", 0.2),
        Window("STAGING", "staging", 0.2),
        Window("COMPLETED", "completed", 0.2),
        Window("GPU STATUS", "gpu_status", 0.2),
        Window("SELECTED JOB", "job_details", 0.2),
    ]
    self.active_win_idx = 0  # Index in self.windows
    self.mode = "NAV"  # "NAV" (select window) or "ACTION" (interact with window)
    self.has_selected_job_context = False
    # Read with getattr() and reset elsewhere in the class; initialised here
    # so the attribute always exists.
    self.last_selected_job = None
    self.selected_job_ids: dict[str, set[str]] = {
        "running": set(),
        "pending": set(),
        "staging": set(),
        "completed": set(),
    }
    self.select_mode_active = False
    self.select_mode_window_key: Optional[str] = None
    self.select_mode_anchor_idx: Optional[int] = None

    # Log view state
    self.viewing_logs = False
    self.log_job_id = None
    self.log_content = []
    self.log_scroll = 0

    # Edit Mode State
    self.edit_mode_active = False
    self.edit_job = None
    self.edit_field_idx = 0  # 0: GPUs, 1: Command
    self.edit_is_new = False

    # Dynamic column widths for tables
    self.col_widths = {
        "running": {"id": 8, "pid": 6, "gpus": 4, "elapsed": 7},
        "staging": {"id": 8, "gpus": 4, "waiting": 7},
        "pending": {"id": 8, "gpus": 4, "waiting": 7},
        "completed": {"id": 8, "runtime": 7, "ago": 7},
    }
Add padding + for key in running_w: + running_w[key] += 1 + for key in completed_w: + completed_w[key] += 1 + + self.col_widths = { + "running": running_w, + "staging": staging_w, + "pending": pending_w, + "completed": completed_w, + } + except Exception: + pass # Keep existing widths on error + + def _set_data_snapshot(self, queue: dict[str, list]): + """Replace local queue data from a freshly mutated queue snapshot.""" + self.data = copy.deepcopy(queue) + self.data["completed"].sort(key=lambda x: x.get("ended", ""), reverse=True) + self.last_updated = time.time() + self._sync_windows_from_data() + self._calc_col_widths() + + def _sync_windows_from_data(self): + for w in self.windows: + items = self.data.get(w.key, []) + type_label = w.key + for item in items: + item["_type"] = type_label + w.update_items(items) + if w.key in self.selected_job_ids: + live_ids = {str(item.get("id")) for item in items} + self.selected_job_ids[w.key].intersection_update(live_ids) + + if self.edit_mode_active and self.edit_job is not None: + edit_id = self.edit_job.get("id") + for w in self.windows: + if w.key == "staging": + for idx, item in enumerate(w.items): + if item.get("id") == edit_id: + w.selected_idx = idx + break + break + + def _selected_job(self): + win = self.windows[self.active_win_idx] + if win.key in ["running", "pending", "staging", "completed"]: + return win.get_selected() + return getattr(self, "last_selected_job", None) + + def _selected_ids_for_window(self, win) -> list[str]: + """Return checked row IDs in display order for a queue window.""" + selected = self.selected_job_ids.get(win.key, set()) + if not selected: + return [] + return [ + str(item["id"]) + for item in win.items + if item.get("id") is not None and str(item["id"]) in selected + ] + + def _action_ids_for_window(self, win) -> list[str]: + checked = self._selected_ids_for_window(win) + if checked: + return checked + job = win.get_selected() + if not job: + return [] + return [str(job["id"])] + + def 
_select_current_row(self, win) -> None: + if win.key not in self.selected_job_ids: + return + job = win.get_selected() + if not job: + return + self.selected_job_ids[win.key].add(str(job["id"])) + + def _select_range_to_cursor(self, win) -> None: + if win.key not in self.selected_job_ids or not win.items: + return + if self.select_mode_anchor_idx is None: + self.select_mode_anchor_idx = win.selected_idx + anchor = max(0, min(len(win.items) - 1, self.select_mode_anchor_idx)) + cursor = max(0, min(len(win.items) - 1, win.selected_idx)) + lo = min(anchor, cursor) + hi = max(anchor, cursor) + self.selected_job_ids[win.key] = { + str(win.items[idx]["id"]) for idx in range(lo, hi + 1) + } + + def _enter_select_mode(self, win) -> None: + if win.key not in self.selected_job_ids or not win.get_selected(): + return + self.selected_job_ids[win.key] = set() + self.select_mode_active = True + self.select_mode_window_key = win.key + self.select_mode_anchor_idx = win.selected_idx + self._select_range_to_cursor(win) + self.action_msg = "Select mode" + self.msg_clear_time = time.time() + 2.0 + + def _exit_select_mode(self) -> None: + self.select_mode_active = False + self.select_mode_window_key = None + self.select_mode_anchor_idx = None + + def _has_selected_ids(self) -> bool: + return any(self.selected_job_ids.values()) + + def _clear_all_selected_ids(self) -> None: + for key in self.selected_job_ids: + self.selected_job_ids[key].clear() + + def _cancel_select_mode(self) -> None: + self._clear_all_selected_ids() + self._exit_select_mode() + + def _select_mode_move(self, win, delta: int, h: Optional[int] = None) -> None: + if not self.select_mode_active or self.select_mode_window_key != win.key: + return + previous_idx = win.selected_idx + win.scroll(delta, h) + if win.selected_idx == previous_idx: + return + self._select_range_to_cursor(win) + selected_count = len(self._selected_ids_for_window(win)) + self.action_msg = f"Selected {selected_count} jobs" + self.msg_clear_time = 
def _append_gpu_util_sample(self, status: dict[str, Any]) -> None:
    """Record one utilisation sample per GPU from a status snapshot.

    A snapshot is consumed at most once: it is ignored when it has no
    ``ts`` or when its ``ts`` equals the one recorded last. Each GPU entry
    with a usable ``index`` gets its ``util`` (clamped to 0..100) appended
    to that GPU's history, which is capped at
    ``GPU_UTIL_HISTORY_MAX_SAMPLES`` entries.

    Malformed entries do not abort the snapshot: an entry whose index is
    missing or not an integer is skipped, and a missing or non-numeric
    ``util`` is recorded as 0.
    """
    ts = status.get("ts")
    if not ts or ts == self._last_util_status_ts:
        return
    self._last_util_status_ts = ts
    # "gpus" may be present but null; treat that like an empty list.
    for g in status.get("gpus") or []:
        raw_idx = g.get("index")
        if raw_idx is None:
            continue
        try:
            idx = int(raw_idx)
        except (TypeError, ValueError):
            continue
        try:
            util = int(g.get("util", 0))
        except (TypeError, ValueError):
            util = 0
        buf = self._gpu_util_history.setdefault(idx, [])
        buf.append(max(0, min(100, util)))
        # Drop the oldest samples in one slice delete.
        overflow = len(buf) - GPU_UTIL_HISTORY_MAX_SAMPLES
        if overflow > 0:
            del buf[:overflow]
retry read for atomic? + # Using simple read should be fine mostly + txt = status_file.read_text() + if txt.strip(): + s = json.loads(txt) + with self.lock: + self.gpu_status = s.get("gpus", []) + self.min_free = s.get("min_free", 2) + self.max_use = s.get("max_use") + self.excluded = s.get("excluded", []) + self.server_status = "DAEMON: ON" + self._append_gpu_util_sample(s) + else: + with self.lock: + self.server_status = "DAEMON: S?" + except json.JSONDecodeError: + pass # Partial write? + else: + with self.lock: + self.server_status = "DAEMON: OFF" + except Exception: + with self.lock: + self.server_status = "ERR" + + time.sleep(self.interval) + + def draw_box(self, y, x, h, w, title, active=False, focused=False): + if self.stdscr is None: + return + """Draw a bordered box.""" + try: + color = curses.color_pair(4) # Cyan default + if focused: + color = curses.color_pair(2) # Green for focused interaction + # self.stdscr.attron(curses.A_BOLD) + elif active: + color = curses.color_pair(3) # Yellow for selected window + + # Draw border + self.stdscr.attron(color) + self.stdscr.box() + # rectangle doesn't use relative coords well with subwin? 
+ + # Manual box drawing using line characters if needed, or just addstr + # Top + self.stdscr.hline(y, x, curses.ACS_HLINE, w) + self.stdscr.vline(y, x, curses.ACS_VLINE, h) + self.stdscr.hline(y + h - 1, x, curses.ACS_HLINE, w) + self.stdscr.vline(y, x + w - 1, curses.ACS_VLINE, h) + + # Corners + self.stdscr.addch(y, x, curses.ACS_ULCORNER) + self.stdscr.addch(y, x + w - 1, curses.ACS_URCORNER) + self.stdscr.addch(y + h - 1, x, curses.ACS_LLCORNER) + self.stdscr.addch(y + h - 1, x + w - 1, curses.ACS_LRCORNER) + + # Title + title_str = f" {title} " + if focused: + title_str = f" [ {title} ] " + elif active: + title_str = f" {title} " + + self.stdscr.addstr( + y, x + 2, title_str, color | (curses.A_BOLD if active else 0) + ) + self.stdscr.attroff(color) + except Exception: + pass + + def _parse_iso(self, s): + if not s: + return None + try: + return datetime.fromisoformat(s) + except Exception: + return None + + def _fmt_delta(self, delta): + if not delta: + return "-" + s = int(delta.total_seconds()) + if s < 60: + return f"{s}s" + m = s // 60 + if m < 60: + return f"{m}m" + h = m // 60 + m = m % 60 + return f"{h}h{m}m" + + def format_job_line(self, job, w): + """Format a job dictionary into a single line string.""" + jid = job["id"] + + # Edit Mode Highlighting + edit_idx = job.get("_edit_field_idx", -1) + is_editing = edit_idx >= 0 + + # Determine Color based on state/status + color = curses.color_pair(0) + + prefix = "" + + if job["_type"] == "running": + # Formt: ID(8) | PID(10) | GPUS(8) | ELAPSED(9) | CMD + color = curses.color_pair(5) # Blue + + gpus = ",".join(map(str, job.get("assigned_gpus", []))) + if not gpus: + gpus = "?" 
+ + pid = str(job.get("pid", "?")) + + elapsed = "-" + start_dt = self._parse_iso(job.get("started")) + if start_dt: + elapsed = self._fmt_delta(datetime.now() - start_dt) + + # Use dynamic column widths + cw = self.col_widths["running"] + prefix = ( + f" {jid:<{cw['id']}} {pid:<{cw['pid']}} " + f"{gpus:<{cw['gpus']}} {elapsed:<{cw['elapsed']}} " + ) + + elif job["_type"] in ["staging", "pending"]: + gpus = str(job.get("gpus", 1)) + waiting = "-" + add_dt = self._parse_iso(job.get("added")) + if add_dt: + waiting = self._fmt_delta(datetime.now() - add_dt) + + # Use dynamic column widths + cw = self.col_widths["staging" if job["_type"] == "staging" else "pending"] + prefix = ( + f" {jid:<{cw['id']}} {gpus:<{cw['gpus']}} {waiting:<{cw['waiting']}} " + ) + + else: + # Completed/Finished + # Format: ID(8) | RUNTIME(9) | AGO(9) | CMD + + # Color by status + s_res = job.get("status", "?") + if s_res == "success": + color = curses.color_pair(2) # Green + elif s_res == "failed": + color = curses.color_pair(1) # Red + elif s_res == "cancelled": + color = curses.color_pair(3) # Orange + + start_dt = self._parse_iso(job.get("started")) + end_dt = self._parse_iso(job.get("ended")) + + run_s = "-" + ago_s = "-" + + if start_dt and end_dt: + run_s = self._fmt_delta(end_dt - start_dt) + + if end_dt: + ago_s = self._fmt_delta(datetime.now() - end_dt) + + # Use dynamic column widths + cw = self.col_widths["completed"] + prefix = ( + f" {jid:<{cw['id']}} {run_s:<{cw['runtime']}} {ago_s:<{cw['ago']}} " + ) + + cmd = job.get("cmd", "") or "" + avail_cmd = w - len(prefix) + if len(cmd) > avail_cmd: + cmd = cmd[: (avail_cmd - 1)] + "…" + + full_line = prefix + cmd + + if is_editing and job["_type"] == "staging": + cw = self.col_widths["staging"] + + # Field 0: GPUS + if edit_idx == 0: + s_val = f"[{gpus}]" + s_gpus = f"{s_val:<{cw['gpus'] + 1}}" + else: + s_gpus = f"{gpus:<{cw['gpus']}} " + + head = f" {jid:<{cw['id']}} " + s_gpus + f"{waiting:<{cw['waiting']}} " + cmd_avail = max(1, w 
- len(head)) + + cmd_stripped = (job.get("cmd", "") or "").strip() + if edit_idx == 1: + disp = cmd_stripped if cmd_stripped else CMD_FIELD_GHOST + if len(disp) > cmd_avail: + disp = disp[: max(0, cmd_avail - 1)] + "…" + disp = disp.ljust(cmd_avail)[:cmd_avail] + cmd_attr = curses.A_REVERSE + else: + disp = cmd_stripped + if len(disp) > cmd_avail: + disp = disp[: max(0, cmd_avail - 1)] + "…" + disp = disp.ljust(cmd_avail)[:cmd_avail] + cmd_attr = curses.A_NORMAL + + return { + "type": "rich", + "segments": [ + (f" {jid:<{cw['id']}} ", curses.A_NORMAL), + (s_gpus, curses.A_REVERSE if edit_idx == 0 else curses.A_NORMAL), + (f"{waiting:<{cw['waiting']}} ", curses.A_NORMAL), + (disp, cmd_attr), + ], + "base_color": color, + }, color + + return full_line, color + + def draw(self): + if self.stdscr is None: + return + self.stdscr.erase() + h, w = self.stdscr.getmaxyx() + + if h < 20 or w < 60: + self.stdscr.addstr(0, 0, "Terminal too small!") + return + + with self.lock: + # 0. Header (1 row) + # 0. 
Header (1 row) + # Full width white bar + self.stdscr.hline(0, 0, " ", w, curses.color_pair(6) | curses.A_REVERSE) + + status_col = curses.color_pair(6) | curses.A_REVERSE # White BG + + # Title + self.stdscr.addstr(0, 1, " GPU QUEUE WATCH ", status_col | curses.A_BOLD) + + # Daemon Status + server_col = ( + curses.color_pair(2 if "ON" in self.server_status else 1) + | curses.A_REVERSE + ) + self.stdscr.addstr( + 0, 20, f" [{self.server_status}] ", server_col | curses.A_BOLD + ) + + # Daemon Info (Reserved/Excluded) + info_str = f"Res: {self.min_free}" + if self.max_use is not None: + info_str += f" | Max: {self.max_use}" + if self.excluded: + ex_list = ",".join(map(str, sorted(self.excluded))) + info_str += f" | Excl: [{ex_list}]" + + # Right aligned info + info_x = w - len(info_str) - 2 + if info_x > 40: # Prevent overlap + self.stdscr.addstr(0, info_x, info_str, status_col) + + # Action message overlay + if self.action_msg: + if time.time() > self.msg_clear_time: + self.action_msg = "" + else: + msg_x = w // 2 - len(self.action_msg) // 2 + self.stdscr.addstr( + 0, + msg_x, + f" {self.action_msg} ", + curses.color_pair(3) | curses.A_REVERSE, + ) + + # 2. 
Main Windows + # Calculate heights + avail_h = h - 2 # -1 for header, -1 for footer + + win_by_key = {win.key: win for win in self.windows} + + # --- Dynamic Sizing Logic --- + gpu_h = 1 + if not win_by_key["gpu_status"].collapsed: + gpu_content_len = len(self.gpu_status) if self.gpu_status else 1 + gpu_h = min(gpu_content_len + 3, max(3, avail_h // 3)) + gpu_h = max(3, gpu_h) + + running_h = 1 + if not win_by_key["running"].collapsed: + running_items = len(self.data.get("running", [])) + running_content_len = running_items + 1 # +1 for header + running_h = min(running_content_len + 2, max(3, avail_h // 3)) + running_h = max(3, running_h) + + job_h = 1 if win_by_key["job_details"].collapsed else 8 + + queue_keys = ["staging", "pending", "completed"] + queue_min_h = len(queue_keys) + nonqueue_min_heights = { + "running": 1 if win_by_key["running"].collapsed else 3, + "gpu_status": 1 if win_by_key["gpu_status"].collapsed else 3, + "job_details": 1 if win_by_key["job_details"].collapsed else 3, + } + nonqueue_heights = { + "running": running_h, + "gpu_status": gpu_h, + "job_details": job_h, + } + while sum(nonqueue_heights.values()) + queue_min_h > avail_h: + shrinkable = [ + key + for key, height in nonqueue_heights.items() + if height > nonqueue_min_heights[key] + ] + if not shrinkable: + break + key = max(shrinkable, key=lambda k: nonqueue_heights[k]) + nonqueue_heights[key] -= 1 + + running_h = nonqueue_heights["running"] + gpu_h = nonqueue_heights["gpu_status"] + job_h = nonqueue_heights["job_details"] + remaining_h = max(0, avail_h - sum(nonqueue_heights.values())) + visible_queue_keys = [k for k in queue_keys if not win_by_key[k].collapsed] + heights_by_key = { + "running": running_h, + "gpu_status": gpu_h, + "job_details": job_h, + } + if visible_queue_keys: + collapsed_queue_h = sum( + 1 for k in queue_keys if win_by_key[k].collapsed + ) + visible_h = max(0, remaining_h - collapsed_queue_h) + base_h = max(1, visible_h // len(visible_queue_keys)) + extra = 
def draw_modal(self, h, w):
    """Draw the active modal dialog centred on an ``h`` x ``w`` screen.

    The dialog prefers 16 rows x 80 columns and shrinks to fit smaller
    terminals; previously a terminal smaller than that produced negative
    coordinates, the first curses call failed and nothing was drawn.

    Reads ``self.modal`` (keys ``title``, ``type`` and optionally ``text``,
    ``value``, ``cursor_pos``, ``scroll_row``) and writes back
    ``self.modal["scroll_row"]`` for ``INPUT`` modals so the cursor row
    stays visible.

    Args:
        h: Screen height in rows.
        w: Screen width in columns.
    """
    if self.stdscr is None or self.modal is None:
        return

    scr = self.stdscr
    modal = self.modal

    m_h = min(16, h)
    m_w = min(80, w)
    if m_h == h and m_w == w:
        # Writing the bottom-right cell of the screen makes curses raise,
        # so never let the box cover the very last row as well as the
        # full width.
        m_h -= 1
    if m_h < 8 or m_w < 20:
        return  # not enough room for border, text, input rows and buttons
    y = (h - m_h) // 2
    x = (w - m_w) // 2

    try:
        # Clear the area under the dialog.
        blank = " " * m_w
        for i in range(m_h):
            scr.addstr(y + i, x, blank)

        # Border, drawn by hand around the cleared rectangle.
        frame_attr = curses.color_pair(3)
        scr.attron(frame_attr)
        try:
            scr.hline(y, x, curses.ACS_HLINE, m_w)
            scr.hline(y + m_h - 1, x, curses.ACS_HLINE, m_w)
            scr.vline(y, x, curses.ACS_VLINE, m_h)
            scr.vline(y, x + m_w - 1, curses.ACS_VLINE, m_h)
            scr.addch(y, x, curses.ACS_ULCORNER)
            scr.addch(y, x + m_w - 1, curses.ACS_URCORNER)
            scr.addch(y + m_h - 1, x, curses.ACS_LLCORNER)
            scr.addch(y + m_h - 1, x + m_w - 1, curses.ACS_LRCORNER)
        finally:
            # Never leave the border colour switched on for later drawing.
            scr.attroff(frame_attr)

        # Title, embedded in the top border.
        title = f" {modal['title']} "[: m_w - 4]
        scr.addstr(y, x + 2, title, curses.color_pair(3) | curses.A_BOLD)

        # Content
        text = modal.get("text", "")
        if text:
            scr.addstr(y + 2, x + 2, text[: m_w - 4])

        # Input field: the value is hard-wrapped every ``field_w`` characters
        # and scrolled vertically so the cursor row is always visible.
        if modal["type"] == "INPUT":
            val = modal.get("value", "")
            cursor_pos = modal.get("cursor_pos", len(val))

            field_w = m_w - 6
            field_h = m_h - 6  # leave space for title, text and buttons
            c_row, c_col = divmod(cursor_pos, field_w)

            scroll_row = modal.get("scroll_row", 0)
            if c_row < scroll_row:
                scroll_row = c_row
            elif c_row >= scroll_row + field_h:
                scroll_row = c_row - field_h + 1
            modal["scroll_row"] = scroll_row

            text_attr = curses.color_pair(6)
            for i in range(field_h):
                line_idx = scroll_row + i
                draw_y = y + 4 + i
                start = line_idx * field_w
                # Slicing past the end yields "", i.e. an empty row.
                line_content = val[start : start + field_w]
                scr.addstr(draw_y, x + 3, line_content, text_attr)

                if line_idx == c_row:
                    # Cursor: reverse-video cell, a blank when it sits
                    # just past the last character of the row.
                    char_at = (
                        line_content[c_col] if c_col < len(line_content) else " "
                    )
                    scr.addstr(
                        draw_y,
                        x + 3 + c_col,
                        char_at,
                        curses.A_REVERSE | text_attr,
                    )

        # Buttons, centred on the row above the bottom border.
        btn_y = y + m_h - 2
        btns = ""
        if modal["type"] == "CONFIRM":
            btns = "[y] Yes [n] No"
        elif modal["type"] == "INPUT":
            btns = "[Enter] Confirm [Esc] Cancel"
        if btns:
            btns = btns[: m_w - 4]
            scr.addstr(btn_y, x + max(2, (m_w - len(btns)) // 2), btns)

    except curses.error:
        # Drawing is best-effort: a failed write (e.g. the terminal was
        # resized mid-frame) must not take down the render loop.
        pass
def draw_job_details(self, y, x, h, w):
    """Render the details pane for the currently selected job.

    Shows a bold one-line summary (id, queue, status, GPU count) at row
    ``y`` and the job command, word-wrapped behind a ``Cmd: `` label, on the
    rows below it. When there is no selection a placeholder message is
    written at row ``y + 1`` instead. The job that was drawn is remembered
    in ``self.last_selected_job``. Any error raised while drawing is
    swallowed so the render loop keeps going.

    Args:
        y: Top row of the pane's inner area.
        x: Left column of the pane.
        h: Number of rows available.
        w: Full pane width in columns.
    """
    scr = self.stdscr
    if scr is None:
        return
    try:
        import textwrap

        # Only ask for the job when a selection context exists.
        job = self._selected_job() if self.has_selected_job_context else None
        if not job:
            scr.addstr(y + 1, x + 2, "No job selected.", curses.A_NORMAL)
            return

        self.last_selected_job = job  # Keep it

        summary = " | ".join(
            (
                f"ID: {job['id']}",
                f"Queue: {str(job.get('_type', 'unk')).upper()}",
                f"Status: {str(job.get('status', '-')).upper()}",
                f"GPUs: {str(job.get('gpus', '-'))}",
            )
        )
        # Text starts at x + 2 and must stop before the right border.
        inner_w = max(1, w - 4)
        scr.addstr(y, x + 2, _fit_text_field(summary, inner_w), curses.A_BOLD)

        # Flatten the command to a single logical line before wrapping.
        flat_cmd = job.get("cmd", "").replace("\n", " ").replace("\r", " ")

        prefix = "Cmd: "
        label_w = len(prefix)
        safe_width = max(1, inner_w - label_w)
        wrapped = textwrap.wrap(flat_cmd, width=safe_width) or ["-"]

        first_row = y + 1
        for row, chunk in enumerate(wrapped[: h - 1], start=first_row):
            # Only the first row carries the label; the rest are indented.
            lead = prefix if row == first_row else " " * label_w
            scr.addstr(row, x + 2, lead, curses.A_NORMAL)
            scr.addstr(row, x + 2 + label_w, _fit_text_field(chunk, safe_width))
    except Exception:
        pass
+ border_color = None + title_color = curses.A_NORMAL + draw_border = False + + if draw_border: + self.stdscr.attron(border_color) + self.stdscr.hline(y, x, curses.ACS_HLINE, w) + if not win.collapsed: + self.stdscr.hline(y + h - 1, x, curses.ACS_HLINE, w) + self.stdscr.vline(y, x, curses.ACS_VLINE, h) + self.stdscr.vline(y, x + w - 1, curses.ACS_VLINE, h) + self.stdscr.addch(y, x, curses.ACS_ULCORNER) + self.stdscr.addch(y, x + w - 1, curses.ACS_URCORNER) + self.stdscr.addch(y + h - 1, x, curses.ACS_LLCORNER) + self.stdscr.addch(y + h - 1, x + w - 1, curses.ACS_LRCORNER) + self.stdscr.attroff(border_color) + + # Title + count_str = "" + if win.key in ["running", "staging", "pending", "completed"]: + selected_count = len(self._selected_ids_for_window(win)) + count_str = f"[{len(win.items)}" + if selected_count: + count_str += f"/{selected_count}" + count_str += "]" + + title_s = f" {win.title} {count_str} " + if win.collapsed: + title_s = f" [+] {win.title} {count_str} " + elif focused: + title_s = f" [ {win.title} {count_str} ] " + + # Title uses matching color scheme + self.stdscr.addstr(y, 2, title_s, title_color) + + if win.collapsed: + return + + # Dispatch specialized drawing + if win.key == "gpu_status": + self.draw_compact_gpu_info(y + 1, x, h - 2, w) + return + if win.key == "job_details": + self.draw_job_details(y + 1, x, h - 2, w) + return + + # Header? 
+ header_offset = 1 + hdr = "" + if win.key == "running": + # Dynamic header based on column widths + cw = self.col_widths["running"] + hdr = ( + f" {'ID':<{cw['id']}} {'PID':<{cw['pid']}} " + f"{'GPUS':<{cw['gpus']}} {'ELAPSED':<{cw['elapsed']}} CMD" + ) + elif win.key == "staging": + cw = self.col_widths["staging"] + hdr = ( + f" {'ID':<{cw['id']}} {'GPUS':<{cw['gpus']}} " + f"{'WAITING':<{cw['waiting']}} CMD" + ) + elif win.key == "pending": + # Dynamic header based on column widths + cw = self.col_widths["pending"] + hdr = ( + f" {'ID':<{cw['id']}} {'GPUS':<{cw['gpus']}} " + f"{'WAITING':<{cw['waiting']}} CMD" + ) + elif win.key == "completed": + # Dynamic header based on column widths + cw = self.col_widths["completed"] + hdr = ( + f" {'ID':<{cw['id']}} {'RUNTIME':<{cw['runtime']}} " + f"{'AGO':<{cw['ago']}} CMD" + ) + + # List items + display_items = list(win.items) + + # Edit-mode row swap in staging only + if ( + active + and self.edit_mode_active + and self.edit_job + and win.key == "staging" + ): + edit_copy = copy.deepcopy(self.edit_job) + edit_copy["_edit_field_idx"] = self.edit_field_idx + for idx, it in enumerate(display_items): + if it["id"] == edit_copy["id"]: + display_items[idx] = edit_copy + break + + # Recalculate list height or just use what we have + list_h = h - 2 - header_offset + if list_h < 1: + return + + start_y = y + 1 + header_offset + has_scrollbar = len(display_items) > list_h + row_text_w = max(0, w - (4 if has_scrollbar else 3)) + scrollbar_x = max(x + 1, x + w - 2) + + if hdr: + self.stdscr.addstr( + y + 1, + x + 1, + _fit_text_field(hdr, row_text_w), + curses.A_BOLD, + ) + + # We need to handle scroll offset carefully if we injected an item + # If we injected, the list is 1 longer. 
+ visible_items = display_items[ + win.scroll_offset : win.scroll_offset + list_h + ] + + for i, item in enumerate(visible_items): + abs_idx = win.scroll_offset + i + + is_sel = False + if active: + if abs_idx == win.selected_idx: + is_sel = True + + line_res, line_col = self.format_job_line(item, row_text_w) + + draw_style = curses.A_NORMAL + is_bulk_selected = ( + win.key in self.selected_job_ids + and str(item.get("id")) in self.selected_job_ids[win.key] + ) + if is_bulk_selected: + draw_style |= curses.A_REVERSE | curses.A_BOLD + if is_sel: + if focused: + draw_style |= curses.A_REVERSE + else: + # In NAV mode, pass or different style + pass + + try: + self.stdscr.addstr( + start_y + i, x + 1, " " * row_text_w, curses.A_NORMAL + ) + except Exception: + pass + + # Handle Rich Text (Dictionary) + if isinstance(line_res, dict) and line_res.get("type") == "rich": + current_x = 1 + segments = line_res["segments"] + base_attr = line_res.get("base_color", curses.A_NORMAL) + if is_bulk_selected: + base_attr |= curses.A_REVERSE | curses.A_BOLD + + # Clear line with base attr first? + # self.stdscr.addstr(start_y + i, 1, " " * (w-2), base_attr) + + for text, attr in segments: + try: + self.stdscr.addstr( + start_y + i, x + current_x, text, attr | base_attr + ) + current_x += len(text) + except Exception: + pass + else: + self.stdscr.addstr( + start_y + i, x + 1, line_res, line_col | draw_style + ) + + # Scroll bar indicator? 
def draw_footer(self, y, w):
    """Draw the one-line key-hint footer on row ``y``.

    The left side lists the keys that apply to the current state (edit
    mode, NAV mode, select mode, or the focused window); the right side
    shows `` MODE: <label> ``. Nothing is drawn while a modal is open.
    Any error raised while drawing is swallowed.

    Args:
        y: Row to draw on.
        w: Screen width; the line is fitted to exactly this many columns.
    """
    scr = self.stdscr
    if scr is None:
        return
    try:
        if self.modal:
            return  # Don't draw footer over modal or distract

        # Hints for a focused window, keyed by window key.
        window_hints = {
            "staging": "h:Back v:Select c:Discard e:Edit s:Send d:Dup",
            "pending": "h:Back v:Select b:Stage c:Cancel J/K:Reorder Space:Log",
            "running": "h:Back v:Select Space:Log c:Cancel p:Pause d:Dup",
            "completed": "h:Back v:Select Space:Log r:Retry x:Delete d:Dup",
        }

        editing = self.edit_mode_active
        if editing:
            hints = (
                "e:Save Staging Esc:Cancel h/l:Field "
                "j/k:GPUs Enter:command editor"
            )
        elif self.mode == "NAV":
            hints = "j/k:Select l:Focus n:New Job Tab:Collapse"
        else:
            # Context-aware help based on active window
            win = self.windows[self.active_win_idx]
            if self.select_mode_active and self.select_mode_window_key == win.key:
                hints = "j/k:Extend Selection v:Done Esc:Cancel"
            else:
                hints = window_hints.get(win.key, "h:Back Space:Log d:Dup n:New")
        help_str = " Q:Quit " + hints

        # Mode display on the right
        if editing:
            mode_label = "EDIT"
        elif self.select_mode_active:
            mode_label = "SELECT"
        else:
            mode_label = self.mode
        mode_s = f" MODE: {mode_label} "

        room = w - len(mode_s)
        if room <= 0:
            # Too narrow for anything but the (clipped) mode label.
            full_str = _fit_text_field(mode_s, w)
        else:
            help_str = _fit_text_field(help_str, room).rstrip()
            padding = " " * max(0, room - len(help_str))
            full_str = help_str + padding + mode_s

        # Blue background (Pair 5)
        scr.addstr(y, 0, full_str, curses.color_pair(5) | curses.A_REVERSE)
    except Exception:
        pass
def action_open_external_logs(self):
    """Open the selected job's log file in ``less``, suspending curses meanwhile.

    Only acts in ``ACTION`` mode and when the active window has a selected
    job. The log is read from ``~/.gpu_queue/logs/<job id>.log``; running
    jobs are opened with ``less +F`` (follow), everything else with
    ``less +G`` (jump to end).

    Outcomes are reported through ``self.action_msg`` /
    ``self.msg_clear_time``: a missing log file, or a ``less`` binary that
    cannot be started, shows a message instead of raising out of the key
    handler.
    """
    if self.mode != "ACTION":
        return
    win = self.windows[self.active_win_idx]
    job = win.get_selected()
    if not job:
        return

    log_path = Path.home() / f".gpu_queue/logs/{job['id']}.log"
    if not log_path.exists():
        self.action_msg = "Log file not found"
        self.msg_clear_time = time.time() + 2.0
        return

    # Follow the file while the job is still running, otherwise just open it
    # at the end. ``.get`` tolerates jobs that carry no ``_type`` tag.
    follow = job.get("_type") == "running"
    cmd = ["less", "+F" if follow else "+G", str(log_path)]

    # Temporarily leave curses. Save the *program* tty state so it can be
    # restored afterwards; saving it as the "shell" state would overwrite
    # the state that endwin() hands back to the user's shell on exit.
    if self.stdscr:
        curses.def_prog_mode()
        self.stdscr.clear()
        self.stdscr.refresh()
        curses.endwin()

    try:
        # Ignore SIGINT in the parent so Ctrl+C only interrupts 'less'.
        old_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            subprocess.run(cmd)
        finally:
            signal.signal(signal.SIGINT, old_handler)
    except OSError as exc:
        # e.g. 'less' is not installed: report it instead of crashing the UI.
        self.action_msg = f"Cannot run less: {exc.strerror or exc}"
        self.msg_clear_time = time.time() + 2.0
    finally:
        # Re-enter curses and restore the input settings main() configured.
        if self.stdscr:
            curses.reset_prog_mode()
            self.stdscr.refresh()
            self.stdscr.keypad(True)
            self.stdscr.nodelay(True)
curses.COLOR_GREEN, -1) + curses.init_pair(3, curses.COLOR_YELLOW, -1) + curses.init_pair(4, curses.COLOR_CYAN, -1) + curses.init_pair(5, curses.COLOR_BLUE, -1) + curses.init_pair(6, curses.COLOR_WHITE, -1) + curses.curs_set(0) + self.stdscr.nodelay(True) + self.stdscr.keypad(True) + + self.start() + + try: + while True: + self.draw() + + ch = self.stdscr.getch() + if ch == -1: + time.sleep(0.05) + continue + + if self.modal: + m = self.modal + if m["type"] == "CONFIRM": + if ch == ord("y") or ch == 10: # Yes + if m["on_confirm"]: + m["on_confirm"]() + self.modal = None + elif ch == ord("n") or ch == 27: # No / Esc + if m["on_cancel"]: + m["on_cancel"]() + if ch == 27 and self._has_selected_ids(): + self._cancel_select_mode() + self.modal = None + + elif m["type"] == "INPUT": + cpos = m.get("cursor_pos", len(m["value"])) + val = m["value"] + + if ch == 27: # Esc + if m["on_cancel"]: + m["on_cancel"]() + self.modal = None + elif ch == 10: # Enter + if m["on_confirm"]: + m["on_confirm"](val) + self.modal = None + elif ch == 127 or ch == curses.KEY_BACKSPACE: # Backspace + if cpos > 0: + m["value"] = val[: cpos - 1] + val[cpos:] + m["cursor_pos"] = cpos - 1 + elif ch == curses.KEY_DC: # Delete + if cpos < len(val): + m["value"] = val[:cpos] + val[cpos + 1 :] + elif ch == curses.KEY_LEFT: + m["cursor_pos"] = max(0, cpos - 1) + elif ch == curses.KEY_RIGHT: + m["cursor_pos"] = min(len(val), cpos + 1) + elif ch == curses.KEY_UP: + # Move up one line (width 74 = 80 - 6) + width = 80 - 6 + cpos_int = cast(int, cpos) + m["cursor_pos"] = max(0, cpos_int - width) + elif ch == curses.KEY_DOWN: + width = 80 - 6 + cpos_int = cast(int, cpos) + m["cursor_pos"] = min(len(str(val)), cpos_int + width) + elif ch == curses.KEY_HOME: + m["cursor_pos"] = 0 + elif ch == curses.KEY_END: + m["cursor_pos"] = len(val) + elif ch >= 32 and ch <= 126: # Printable + cpos_int = cast(int, cpos) + m["value"] = ( + str(val)[:cpos_int] + chr(ch) + str(val)[cpos_int:] + ) + m["cursor_pos"] = cpos_int + 
1 + continue + + if self.edit_mode_active: + # Force redraw of footer/status + # self.stdscr.touchwin() + + if ch == 27: # Esc -> Cancel + self.edit_mode_active = False + self.edit_job = None + elif ch == ord("e"): # Confirm + if self.edit_job is None: + continue + self.execute_action( + "update_staging", + self.edit_job["id"], + cmd=self.edit_job["cmd"], + gpus=self.edit_job["gpus"], + ) + self.edit_mode_active = False + self.edit_job = None + + elif ch == ord("h"): # Cycle Left + self.edit_field_idx = max(0, self.edit_field_idx - 1) + elif ch == ord("l"): # Cycle Right + self.edit_field_idx = min(1, self.edit_field_idx + 1) + + elif ch == ord("j"): # Decrease Value + if self.edit_job is None: + continue + if self.edit_field_idx == 0: # GPUS + self.edit_job["gpus"] = max( + 1, int(self.edit_job["gpus"]) - 1 + ) + + elif ch == ord("k"): # Increase Value + if self.edit_job is None: + continue + if self.edit_field_idx == 0: # GPUS + self.edit_job["gpus"] = int(self.edit_job["gpus"]) + 1 + + elif ch == 10 or ch == ord("i"): # Enter/Edit Text + if self.edit_field_idx == 1: # Command + self.prompt_edit_command() + + continue + + # Log Viewing Mode + if self.viewing_logs: + if ch == ord("h") or ch == ord("q") or ch == 27: # h or q or Esc + self.viewing_logs = False + elif ch == ord("k"): + self.log_scroll = max(0, self.log_scroll - 1) + elif ch == ord("j"): + max_scroll = max( + 0, len(self.log_content) - (self.stdscr.getmaxyx()[0] - 6) + ) + self.log_scroll = min(max_scroll, self.log_scroll + 1) + continue + + # Normal Mode + if ch == ord("q"): + break + + if self.mode == "NAV": + if ch == ord("k"): + self.active_win_idx = max(0, self.active_win_idx - 1) + elif ch == ord("j"): + self.active_win_idx = min( + len(self.windows) - 1, self.active_win_idx + 1 + ) + elif ch == ord("l"): # l + # Disable l for non-interactive windows + curr_win = self.windows[self.active_win_idx] + if curr_win.key not in ["gpu_status", "job_details"]: + self._enter_action_window(curr_win) + elif 
ch == 10: # Enter + curr_win = self.windows[self.active_win_idx] + if curr_win.key not in ["gpu_status", "job_details"]: + self._enter_action_window(curr_win) + elif ch == 9: # Tab + self.windows[self.active_win_idx].collapsed = not self.windows[ + self.active_win_idx + ].collapsed + elif ch == ord("n"): # New Job (Global context) + self.prompt_new_job() + + elif self.mode == "ACTION": + win = self.windows[self.active_win_idx] + window_h = getattr(win, "height", None) + + if ch == ord("h") or ch == 27: # h or Esc + if self.select_mode_active: + self._cancel_select_mode() + continue + if ch == 27 and self._has_selected_ids(): + self._cancel_select_mode() + continue + self._exit_action_window(win) + elif ch == ord("k"): + if self.select_mode_active: + self._select_mode_move(win, -1, window_h) + else: + win.scroll(-1, window_h) + elif ch == ord("j"): + if self.select_mode_active: + self._select_mode_move(win, 1, window_h) + else: + win.scroll(1, window_h) + elif ch == ord("v"): + if self.select_mode_active: + self._exit_select_mode() + else: + self._enter_select_mode(win) + elif ch == ord(" "): + self.action_view_logs() + elif ch == ord("L"): + self.action_open_external_logs() + + # Actions (context-aware per window type) + elif ch == ord("c"): + if win.key in ["staging", "pending", "running"]: + self.do_action("cancel") + elif ch == ord("x"): # Remove/Delete (completed only) + if win.key == "completed": + self.do_action("remove") + elif ch == ord("d"): # Dup (all windows) + self.do_action("dup") + elif ch == ord("n"): # New + self.prompt_new_job() + elif ch == ord("p"): # Pause (running only) + if win.key == "running": + self.do_action("pause") + elif ch == ord("e"): # Edit (staging only) + if win.key == "staging": + self.do_action("edit") + elif ch == ord("r"): # Retry (completed only) + if win.key == "completed": + self.do_action("retry") + elif ch == ord("s"): # Send staged job to pending + if win.key == "staging": + self.do_action("send_to_pending") + elif ch == 
def prompt_edit_command(self):
    """Edit the staged job's command in the user's external editor.

    The current ``self.edit_job["cmd"]`` is written to a temporary ``.sh``
    file, ``$EDITOR`` (default ``nano``) is run on it with curses suspended,
    and the file content is read back with all whitespace runs (including
    newlines) collapsed to single spaces and stored via
    ``self._update_edit_cmd``.

    ``$EDITOR`` may contain arguments (e.g. ``code --wait``); it is split
    shell-style. The temporary file is always removed. If the editor cannot
    be started or the file cannot be read, the command is left unchanged and
    the failure is reported through ``self.action_msg``.
    """
    if self.edit_job is None:
        return

    import shlex  # local import: only needed here

    # Resolve the text first so a bad job dict cannot leave a stray file.
    current_cmd = str(self.edit_job["cmd"])

    # 1. Write current cmd to temp file
    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".sh") as tf:
        tf.write(current_cmd)
        tf_path = tf.name

    # 2. Leave curses. Save the *program* tty state; saving it as the
    #    "shell" state would overwrite what endwin() restores on exit.
    if self.stdscr:
        curses.def_prog_mode()
        self.stdscr.clear()
        self.stdscr.refresh()
        curses.endwin()

    try:
        editor_argv = shlex.split(os.environ.get("EDITOR") or "nano")
        subprocess.run([*editor_argv, tf_path])

        # 3. Read back (collapse newlines/whitespace to single line)
        with open(tf_path, "r") as f:
            new_cmd = " ".join(f.read().split())
        self._update_edit_cmd(new_cmd)
    except (OSError, ValueError) as exc:
        # OSError: editor missing / file unreadable; ValueError: malformed
        # $EDITOR quoting or undecodable file content.
        self.action_msg = f"Edit failed: {exc}"
        self.msg_clear_time = time.time() + 2.0
    finally:
        try:
            os.unlink(tf_path)
        except OSError:
            pass  # already gone; nothing to clean up
        # Re-enter curses and restore the input settings main() configured.
        if self.stdscr:
            curses.reset_prog_mode()
            self.stdscr.refresh()
            self.stdscr.keypad(True)
            self.stdscr.nodelay(True)
else: + jids = self._action_ids_for_window(win) + if not jids: + return + + jid = jids[0] + count = len(jids) + target = f"{count} jobs" if count > 1 else f"job {jid}" + + # Actions that require modals + if action == "cancel": + if win.key not in ["staging", "pending", "running"]: + return + if win.key == "staging": + self.modal = { + "type": "CONFIRM", + "title": "Cancel Staged Job", + "text": f"Cancel staged {target}?", + "on_confirm": lambda jids=jids: self.execute_bulk_action( + "cancel", jids + ), + "on_cancel": None, + } + return + + self.modal = { + "type": "CONFIRM", + "title": "Cancel Job", + "text": f"Cancel {target}?", + "on_confirm": lambda jids=jids: self.execute_bulk_action( + "cancel", jids + ), + "on_cancel": None, + } + return + + elif action == "remove": + # Remove only works for completed (already checked in keybinding) + self.modal = { + "type": "CONFIRM", + "title": "Delete Job", + "text": f"Permanently delete {target}?", + "on_confirm": lambda jids=jids: self.execute_bulk_action( + "delete", jids + ), + "on_cancel": None, + } + return + + elif action == "edit": + # Edit only works for staging (already checked in keybinding) + if not job: + return + self.edit_job = copy.deepcopy(job) + self.edit_job["_type"] = "staging" + self.edit_is_new = False + self.edit_mode_active = True + self.edit_field_idx = 0 + return + + elif action == "dup": + self.execute_bulk_action("dup", jids) + return + elif action == "send_to_pending": + if win.key != "staging": + return + self.modal = { + "type": "CONFIRM", + "title": "Send To Pending", + "text": f"Send {target} to pending queue?", + "on_confirm": lambda jids=jids: self.execute_bulk_action( + "send_to_pending", jids + ), + "on_cancel": None, + } + return + elif action == "back_to_staging": + if win.key != "pending": + return + self.execute_bulk_action("back_to_staging", jids) + return + elif action == "retry": + if win.key != "completed": + return + self.modal = { + "type": "CONFIRM", + "title": "Retry 
Completed Job", + "text": ( + f"Retry {target}? This may replace or overwrite access to old logs." + ), + "on_confirm": lambda jids=jids: self.execute_bulk_action("retry", jids), + "on_cancel": None, + } + return + + # Immediate actions + self.execute_bulk_action(action, jids, focus_jid=focus_jid) + + def execute_bulk_action(self, action, jids, **kwargs): + jids = list(dict.fromkeys(str(jid) for jid in jids)) + if not jids: + return + if len(jids) == 1: + self.execute_action(action, jids[0], **kwargs) + return + + if action in ["edit", "change_gpus"]: + self.action_msg = "Action only supports the cursor row" + self.msg_clear_time = time.time() + 2.0 + return + + if action in ["move_pending_up", "move_pending_down"]: + offset = -1 if action == "move_pending_up" else 1 + focus_jid = ( + str(kwargs.get("focus_jid")) if kwargs.get("focus_jid") else None + ) + queue_snapshot = None + with locked_queue() as q: + if move_pending_jobs(q, jids, offset): + queue_snapshot = copy.deepcopy(q) + else: + self.action_msg = "Cannot move further" + self.msg_clear_time = time.time() + 2.0 + return + if queue_snapshot is not None: + with self.lock: + self._set_data_snapshot(queue_snapshot) + for w in self.windows: + if w.key != "pending": + continue + target_idx = None + if focus_jid is not None: + for idx, pending_job in enumerate(w.items): + if str(pending_job.get("id")) == focus_jid: + target_idx = idx + break + if target_idx is not None: + w.selected_idx = target_idx + w.ensure_selected_visible() + break + direction = "up" if offset < 0 else "down" + self.action_msg = ( + f"Moved {len(jids)} selected jobs {direction}" + if len(jids) > 1 + else f"Moved {jids[0]} {direction}" + ) + self.msg_clear_time = time.time() + 2.0 + return + + if action == "dup": + created = [] + with locked_queue() as q: + jobs_by_id = { + str(job.get("id")): job + for key in ["running", "pending", "staging", "completed"] + for job in q.get(key, []) + } + for jid in jids: + job = jobs_by_id.get(jid) + if job 
is None: + continue + dup_job = make_staged_job( + generate_job_id(), job.get("cmd", ""), job.get("gpus", 1) + ) + insert_staged_job(q, dup_job) + created.append(dup_job["id"]) + self.action_msg = f"Duplicated {len(created)} jobs" + self.msg_clear_time = time.time() + 2.0 + return + + if action == "back_to_staging": + moved = [] + queue_snapshot = None + with locked_queue() as q: + for jid in reversed(jids): + if move_pending_job_to_staging(q, jid): + moved.append(jid) + if moved: + queue_snapshot = copy.deepcopy(q) + if queue_snapshot is not None: + with self.lock: + self._set_data_snapshot(queue_snapshot) + self._clear_selected_ids(jids) + self.action_msg = f"Moved {len(moved)} jobs to staging" + self.msg_clear_time = time.time() + 2.0 + return + + messages = [] + for jid in jids: + self.execute_action(action, jid, **kwargs) + if self.action_msg: + messages.append(self.action_msg) + + action_labels = { + "cancel": "Cancelled", + "delete": "Deleted", + "pause": "Paused", + "retry": "Staged retries for", + "dup": "Duplicated", + "send_to_pending": "Sent", + "back_to_staging": "Moved to staging", + "discard_staging": "Discarded", + } + label = action_labels.get(action, "Updated") + self.action_msg = f"{label} {len(messages) or len(jids)} jobs" + self._clear_selected_ids(jids) + self.msg_clear_time = time.time() + 2.0 + + def execute_action(self, action, jid, **kwargs): + # Re-use existing cmd functions if possible, or call logic directly + msg = "" + try: + if action == "cancel": + # If it's a pending job, move to completed with status 'cancelled' + # If it's running, call the external tool + is_pending = False + is_staging = False + with locked_queue() as q: + for j in q["staging"]: + if j["id"] == jid: + is_staging = True + break + for j in q["pending"]: + if j["id"] == jid: + is_pending = True + break + + if is_staging: + with locked_queue() as q: + if cancel_staged_job(q, jid): + msg = f"Cancelled staged {jid}" + elif is_pending: + with locked_queue() as q: + 
for i, j in enumerate(q["pending"]): + if j["id"] == jid: + job = q["pending"].pop(i) + job["status"] = "cancelled" + job["ended"] = datetime.now().isoformat() + q["completed"].insert(0, job) + msg = f"Cancelled {jid}" + break + else: + # Running job + subprocess.Popen( + ["gpu-queue", "cancel", jid], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + msg = f"Cancelling {jid}..." + + elif action == "delete": + with locked_queue() as q: + for i, j in enumerate(q["completed"]): + if j["id"] == jid: + q["completed"].pop(i) + msg = f"Deleted {jid}" + break + + elif action == "pause": + subprocess.Popen( + ["gpu-queue", "pause", jid], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + msg = f"Pausing {jid}..." + + elif action == "dup": + with locked_queue() as q: + jobs_by_id = { + str(job.get("id")): job + for key in ["running", "pending", "staging", "completed"] + for job in q.get(key, []) + } + job = jobs_by_id.get(str(jid)) + if job is not None: + dup_job = make_staged_job( + generate_job_id(), job.get("cmd", ""), job.get("gpus", 1) + ) + insert_staged_job(q, dup_job) + msg = f"Duplicated {jid} to {dup_job['id']}" + + elif action == "retry": + with locked_queue() as q: + for job in q["completed"]: + if job["id"] == jid: + new_job = make_staged_job( + generate_job_id(), job["cmd"], job.get("gpus", 1) + ) + if stage_completed_retry(q, jid, new_job): + msg = f"Staged retry for {jid}" + break + + elif action == "update_staging": + cmd = kwargs.get("cmd") + gpus = kwargs.get("gpus") + + with locked_queue() as q: + for job in q["staging"]: + if job["id"] == jid: + if cmd is not None: + job["cmd"] = cmd + if gpus is not None: + job["gpus"] = gpus + msg = f"Updated staged job {jid}" + break + + elif action == "send_to_pending": + with locked_queue() as q: + if send_staged_job_to_pending(q, jid): + msg = f"Sent {jid} to pending" + + elif action == "back_to_staging": + queue_snapshot = None + with locked_queue() as q: + if 
move_pending_job_to_staging(q, jid): + msg = f"Moved {jid} to staging" + queue_snapshot = copy.deepcopy(q) + if queue_snapshot is not None: + with self.lock: + self._set_data_snapshot(queue_snapshot) + + elif action == "discard_staging": + with locked_queue() as q: + if cancel_staged_job(q, jid): + msg = f"Discarded staged job {jid}" + if self.edit_mode_active and self.edit_job is not None: + if self.edit_job.get("id") == jid: + self.edit_mode_active = False + self.edit_job = None + + elif action in ["move_pending_up", "move_pending_down"]: + offset = -1 if action == "move_pending_up" else 1 + new_idx = None + queue_snapshot = None + with locked_queue() as q: + if move_pending_job(q, jid, offset): + msg = f"Moved {jid} {'up' if offset < 0 else 'down'}" + for idx, job in enumerate(q["pending"]): + if job.get("id") == jid: + new_idx = idx + break + queue_snapshot = copy.deepcopy(q) + else: + msg = "Cannot move further" + + if queue_snapshot is not None: + with self.lock: + self._set_data_snapshot(queue_snapshot) + for w in self.windows: + if w.key == "pending": + if new_idx is not None: + w.selected_idx = new_idx + w.ensure_selected_visible() + break + + except Exception as e: + msg = f"Err: {str(e)}" + + if action not in [ + "dup", + "move_pending_up", + "move_pending_down", + "update_staging", + ]: + self._clear_selected_ids([jid]) + self.action_msg = msg + self.msg_clear_time = time.time() + 2.0 diff --git a/tests/test_queue_core.py b/tests/test_queue_core.py index 215a575..2d65033 100644 --- a/tests/test_queue_core.py +++ b/tests/test_queue_core.py @@ -6,9 +6,16 @@ from unittest.mock import patch from gpu_queue import main as mod +from gpu_queue import paths, scheduler -def _job(job_id: str, *, gpus: int = 1, cmd: str = "echo hi", added: str = "2026-01-01T00:00:00") -> dict: +def _job( + job_id: str, + *, + gpus: int = 1, + cmd: str = "echo hi", + added: str = "2026-01-01T00:00:00", +) -> dict: return {"id": job_id, "cmd": cmd, "gpus": gpus, "added": added} @@ 
-18,15 +25,17 @@ def setUp(self): self.addCleanup(self.tmp.cleanup) self.queue_dir = Path(self.tmp.name) / ".gpu_queue" self.stack = ExitStack() - for name, value in { + path_values = { "QUEUE_DIR": self.queue_dir, "QUEUE_FILE": self.queue_dir / "jobs.json", "PID_FILE": self.queue_dir / "daemon.pid", "DAEMON_LOG": self.queue_dir / "daemon.log", "LOG_DIR": self.queue_dir / "logs", "LOCK_FILE": self.queue_dir / "queue.lock", - }.items(): + } + for name, value in path_values.items(): self.stack.enter_context(patch.object(mod, name, value)) + self.stack.enter_context(patch.object(paths, name, value)) self.addCleanup(self.stack.close) def write_queue(self, data: dict) -> None: @@ -45,7 +54,9 @@ def test_load_queue_backfills_staging(self): data = mod.load_queue_raw() - self.assertEqual(list(data.keys()), ["staging", "pending", "running", "completed"]) + self.assertEqual( + list(data.keys()), ["staging", "pending", "running", "completed"] + ) self.assertEqual(data["staging"], []) self.assertEqual([j["id"] for j in data["pending"]], ["p1"]) @@ -74,10 +85,14 @@ def test_daemon_loop_starts_eligible_job(self): ) with ( - patch.object(mod, "cleanup_dead_jobs"), - patch.object(mod, "get_free_gpus", return_value=[{"index": 0, "free": True}, {"index": 1, "free": True}]), - patch.object(mod, "run_job", return_value=4321), - patch.object(mod.time, "sleep", side_effect=KeyboardInterrupt), + patch.object(scheduler, "cleanup_dead_jobs"), + patch.object( + scheduler, + "get_free_gpus", + return_value=[{"index": 0, "free": True}, {"index": 1, "free": True}], + ), + patch.object(scheduler, "run_job", return_value=4321), + patch.object(scheduler.time, "sleep", side_effect=KeyboardInterrupt), ): mod.daemon_loop(min_free=1) @@ -87,12 +102,179 @@ def test_daemon_loop_starts_eligible_job(self): self.assertEqual(data["pending"][0]["id"], "p2") self.assertEqual(data["running"][0]["assigned_gpus"], [0]) + def test_daemon_loop_keeps_min_free_gpus_physically_idle(self): + self.write_queue( + { + 
"staging": [], + "pending": [_job("p1", gpus=1)], + "running": [], + "completed": [], + } + ) + + gpu_status = [ + {"index": 0, "free": True}, + {"index": 1, "free": True}, + {"index": 2, "free": False}, + {"index": 3, "free": False}, + ] + with ( + patch.object(scheduler, "cleanup_dead_jobs"), + patch.object(scheduler, "get_free_gpus", return_value=gpu_status), + patch.object(scheduler, "run_job", return_value=4321) as run_job, + patch.object(scheduler.time, "sleep", side_effect=KeyboardInterrupt), + ): + mod.daemon_loop(min_free=2) + + data = mod.load_queue_raw() + self.assertEqual(data["running"], []) + self.assertEqual([j["id"] for j in data["pending"]], ["p1"]) + run_job.assert_not_called() + + def test_daemon_loop_respects_max_use_cap(self): + self.write_queue( + { + "staging": [], + "pending": [ + _job("p1", gpus=1), + _job("p2", gpus=1), + _job("p3", gpus=1), + ], + "running": [], + "completed": [], + } + ) + + gpu_status = [ + {"index": 0, "free": True}, + {"index": 1, "free": True}, + {"index": 2, "free": True}, + {"index": 3, "free": True}, + ] + with ( + patch.object(scheduler, "cleanup_dead_jobs"), + patch.object(scheduler, "get_free_gpus", return_value=gpu_status), + patch.object(scheduler, "run_job", return_value=4321), + patch.object(scheduler.time, "sleep", side_effect=KeyboardInterrupt), + ): + mod.daemon_loop(min_free=0, max_use=2) + + data = mod.load_queue_raw() + self.assertEqual([j["id"] for j in data["running"]], ["p1", "p2"]) + self.assertEqual([j["id"] for j in data["pending"]], ["p3"]) + self.assertEqual(data["running"][0]["assigned_gpus"], [0]) + self.assertEqual(data["running"][1]["assigned_gpus"], [1]) + + def test_daemon_loop_counts_existing_running_jobs_against_max_use(self): + self.write_queue( + { + "staging": [], + "pending": [_job("p1", gpus=1), _job("p2", gpus=1)], + "running": [ + { + **_job("r1", gpus=1), + "pid": 1234, + "assigned_gpus": [0], + "status": "running", + } + ], + "completed": [], + } + ) + + gpu_status = [ + 
{"index": 0, "free": False}, + {"index": 1, "free": True}, + {"index": 2, "free": True}, + {"index": 3, "free": True}, + ] + with ( + patch.object(scheduler, "cleanup_dead_jobs"), + patch.object(scheduler, "get_free_gpus", return_value=gpu_status), + patch.object(scheduler, "run_job", return_value=4321), + patch.object(scheduler.time, "sleep", side_effect=KeyboardInterrupt), + ): + mod.daemon_loop(min_free=0, max_use=2) + + data = mod.load_queue_raw() + self.assertEqual([j["id"] for j in data["running"]], ["r1", "p1"]) + self.assertEqual([j["id"] for j in data["pending"]], ["p2"]) + self.assertEqual(data["running"][1]["assigned_gpus"], [1]) + + def test_daemon_loop_does_not_count_busy_excluded_gpus_as_free_reserve(self): + self.write_queue( + { + "staging": [], + "pending": [_job("p1", gpus=1), _job("p2", gpus=1), _job("p3", gpus=1)], + "running": [], + "completed": [], + } + ) + + gpu_status = [ + {"index": 0, "free": False}, + {"index": 1, "free": True}, + {"index": 2, "free": True}, + {"index": 3, "free": True}, + ] + with ( + patch.object(scheduler, "cleanup_dead_jobs"), + patch.object(scheduler, "get_free_gpus", return_value=gpu_status), + patch.object(scheduler, "run_job", return_value=4321), + patch.object(scheduler.time, "sleep", side_effect=KeyboardInterrupt), + ): + mod.daemon_loop(min_free=1, excluded_gpus={0}) + + data = mod.load_queue_raw() + self.assertEqual([j["id"] for j in data["running"]], ["p1", "p2"]) + self.assertEqual([j["id"] for j in data["pending"]], ["p3"]) + self.assertEqual(data["running"][0]["assigned_gpus"], [1]) + self.assertEqual(data["running"][1]["assigned_gpus"], [2]) + + def test_daemon_loop_counts_idle_excluded_gpus_as_free_reserve(self): + self.write_queue( + { + "staging": [], + "pending": [_job("p1", gpus=1), _job("p2", gpus=1), _job("p3", gpus=1)], + "running": [], + "completed": [], + } + ) + + gpu_status = [ + {"index": 0, "free": True}, + {"index": 1, "free": True}, + {"index": 2, "free": True}, + {"index": 3, "free": 
True}, + ] + with ( + patch.object(scheduler, "cleanup_dead_jobs"), + patch.object(scheduler, "get_free_gpus", return_value=gpu_status), + patch.object(scheduler, "run_job", return_value=4321), + patch.object(scheduler.time, "sleep", side_effect=KeyboardInterrupt), + ): + mod.daemon_loop(min_free=1, excluded_gpus={0}) + + data = mod.load_queue_raw() + self.assertEqual([j["id"] for j in data["running"]], ["p1", "p2", "p3"]) + self.assertEqual(data["pending"], []) + self.assertEqual(data["running"][0]["assigned_gpus"], [1]) + self.assertEqual(data["running"][1]["assigned_gpus"], [2]) + self.assertEqual(data["running"][2]["assigned_gpus"], [3]) + def test_cmd_add_front_inserts_at_head(self): - args = type("Args", (), {"command": "cmd-a", "gpus": 2, "priority": "medium", "front": False})() + args = type( + "Args", + (), + {"command": "cmd-a", "gpus": 2, "priority": "medium", "front": False}, + )() mod.cmd_add(args) - front_args = type("Args", (), {"command": "cmd-b", "gpus": 1, "priority": "medium", "front": True})() + front_args = type( + "Args", + (), + {"command": "cmd-b", "gpus": 1, "priority": "medium", "front": True}, + )() mod.cmd_add(front_args) data = mod.load_queue_raw() self.assertEqual([j["cmd"] for j in data["pending"]], ["cmd-b", "cmd-a"]) - diff --git a/tests/test_tui_staging.py b/tests/test_tui_staging.py index a9308af..6e90943 100644 --- a/tests/test_tui_staging.py +++ b/tests/test_tui_staging.py @@ -3,13 +3,22 @@ import unittest from contextlib import ExitStack from pathlib import Path -from typing import Optional +from typing import Any, Optional, cast from unittest.mock import patch from gpu_queue import main as mod - - -def _job(job_id: str, *, cmd: str = "echo hi", gpus: int = 1, added: str = "2026-01-01T00:00:00", status: Optional[str] = None) -> dict: +from gpu_queue import paths +from gpu_queue.tui import app as tui_app + + +def _job( + job_id: str, + *, + cmd: str = "echo hi", + gpus: int = 1, + added: str = "2026-01-01T00:00:00", + status: 
Optional[str] = None, +) -> dict: job = {"id": job_id, "cmd": cmd, "gpus": gpus, "added": added} if status is not None: job["status"] = status @@ -22,15 +31,17 @@ def setUp(self): self.addCleanup(self.tmp.cleanup) self.queue_dir = Path(self.tmp.name) / ".gpu_queue" self.stack = ExitStack() - for name, value in { + path_values = { "QUEUE_DIR": self.queue_dir, "QUEUE_FILE": self.queue_dir / "jobs.json", "PID_FILE": self.queue_dir / "daemon.pid", "DAEMON_LOG": self.queue_dir / "daemon.log", "LOG_DIR": self.queue_dir / "logs", "LOCK_FILE": self.queue_dir / "queue.lock", - }.items(): + } + for name, value in path_values.items(): self.stack.enter_context(patch.object(mod, name, value)) + self.stack.enter_context(patch.object(paths, name, value)) self.addCleanup(self.stack.close) def write_queue(self, data: dict) -> None: @@ -61,6 +72,16 @@ def pending_idx(self, tui: mod.GPUQueueTUI) -> int: return i raise AssertionError("pending window not found") + def completed_idx(self, tui: mod.GPUQueueTUI) -> int: + for i, win in enumerate(tui.windows): + if win.key == "completed": + return i + raise AssertionError("completed window not found") + + def require_modal(self, tui: mod.GPUQueueTUI) -> dict[str, Any]: + self.assertIsNotNone(tui.modal) + return cast(dict[str, Any], tui.modal) + def test_prompt_new_job_inserts_on_top(self): tui = self.load_tui( { @@ -95,8 +116,10 @@ def test_edit_staged_job_enters_edit_mode(self): tui.do_action("edit") self.assertTrue(tui.edit_mode_active) - self.assertEqual(tui.edit_job["id"], "s1") - self.assertEqual(tui.edit_job["cmd"], "python train.py") + self.assertIsNotNone(tui.edit_job) + edit_job = cast(dict[str, Any], tui.edit_job) + self.assertEqual(edit_job["id"], "s1") + self.assertEqual(edit_job["cmd"], "python train.py") def test_cancel_staged_job_moves_to_completed(self): tui = self.load_tui( @@ -112,7 +135,7 @@ def test_cancel_staged_job_moves_to_completed(self): tui.windows[idx].selected_idx = 0 tui.do_action("cancel") - 
tui.modal["on_confirm"]() + self.require_modal(tui)["on_confirm"]() data = mod.load_queue_raw() self.assertEqual(data["staging"], []) @@ -128,17 +151,66 @@ def test_retry_stages_completed_job_on_top(self): "completed": [_job("c1", cmd="python eval.py", status="failed")], } ) - completed_idx = next(i for i, win in enumerate(tui.windows) if win.key == "completed") + completed_idx = next( + i for i, win in enumerate(tui.windows) if win.key == "completed" + ) tui.active_win_idx = completed_idx tui.windows[completed_idx].selected_idx = 0 tui.do_action("retry") + modal = self.require_modal(tui) + self.assertEqual(modal["type"], "CONFIRM") + self.assertIn("old logs", modal["text"]) + modal["on_confirm"]() data = mod.load_queue_raw() self.assertEqual(data["completed"], []) self.assertEqual(data["staging"][0]["cmd"], "python eval.py") self.assertNotEqual(data["staging"][0]["id"], "c1") + def test_duplicate_does_not_enter_edit_mode(self): + tui = self.load_tui( + { + "staging": [], + "pending": [_job("p1", cmd="python train.py", gpus=2)], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + tui.active_win_idx = pending_idx + tui.windows[pending_idx].selected_idx = 0 + + tui.do_action("dup") + + data = mod.load_queue_raw() + self.assertEqual(len(data["staging"]), 1) + self.assertEqual(data["staging"][0]["cmd"], "python train.py") + self.assertEqual(data["staging"][0]["gpus"], 2) + self.assertFalse(tui.edit_mode_active) + self.assertIsNone(tui.edit_job) + + def test_duplicate_keeps_current_window_and_cursor(self): + tui = self.load_tui( + { + "staging": [], + "pending": [_job("p1"), _job("p2"), _job("p3")], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + win = tui.windows[pending_idx] + tui.active_win_idx = pending_idx + win.selected_idx = 1 + win.scroll_offset = 1 + + tui.do_action("dup") + + self.assertEqual(tui.active_win_idx, pending_idx) + self.assertEqual(win.selected_idx, 1) + 
self.assertEqual(win.scroll_offset, 1) + def test_send_staged_job_to_pending_requires_confirmation(self): tui = self.load_tui( { @@ -153,8 +225,9 @@ def test_send_staged_job_to_pending_requires_confirmation(self): tui.windows[idx].selected_idx = 0 tui.do_action("send_to_pending") - self.assertEqual(tui.modal["type"], "CONFIRM") - tui.modal["on_confirm"]() + modal = self.require_modal(tui) + self.assertEqual(modal["type"], "CONFIRM") + modal["on_confirm"]() data = mod.load_queue_raw() self.assertEqual(data["staging"], []) @@ -169,7 +242,9 @@ def test_pending_reorder_moves_selected_job(self): "completed": [], } ) - pending_idx = next(i for i, win in enumerate(tui.windows) if win.key == "pending") + pending_idx = next( + i for i, win in enumerate(tui.windows) if win.key == "pending" + ) tui.active_win_idx = pending_idx tui.windows[pending_idx].selected_idx = 1 @@ -198,7 +273,9 @@ def test_pending_reorder_keeps_moved_job_visible_when_moving_up(self): tui.execute_action("move_pending_up", "p3") data = mod.load_queue_raw() - self.assertEqual([j["id"] for j in data["pending"][:4]], ["p0", "p1", "p3", "p2"]) + self.assertEqual( + [j["id"] for j in data["pending"][:4]], ["p0", "p1", "p3", "p2"] + ) self.assertEqual(win.selected_idx, 2) self.assertEqual(win.scroll_offset, 2) @@ -222,6 +299,231 @@ def test_pending_reorder_updates_window_items_for_rapid_repeated_moves(self): self.assertEqual([j["id"] for j in data["pending"]], ["p1", "p3", "p4", "p2"]) self.assertEqual(tui.windows[pending_idx].selected_idx, 3) + def test_select_mode_selects_rows_while_moving(self): + tui = self.load_tui( + { + "staging": [], + "pending": [_job("p1"), _job("p2"), _job("p3")], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + win = tui.windows[pending_idx] + tui.active_win_idx = pending_idx + win.selected_idx = 0 + + tui._enter_select_mode(win) + tui._select_mode_move(win, 1) + tui._select_mode_move(win, 1) + + self.assertTrue(tui.select_mode_active) + 
self.assertEqual(win.selected_idx, 2) + self.assertEqual(tui._selected_ids_for_window(win), ["p1", "p2", "p3"]) + + tui._exit_select_mode() + self.assertFalse(tui.select_mode_active) + self.assertEqual(tui._selected_ids_for_window(win), ["p1", "p2", "p3"]) + + def test_select_mode_move_up_deselects_rows(self): + tui = self.load_tui( + { + "staging": [], + "pending": [_job("p1"), _job("p2"), _job("p3"), _job("p4")], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + win = tui.windows[pending_idx] + tui.active_win_idx = pending_idx + win.selected_idx = 0 + + tui._enter_select_mode(win) + tui._select_mode_move(win, 1) + tui._select_mode_move(win, 1) + self.assertEqual(tui._selected_ids_for_window(win), ["p1", "p2", "p3"]) + + tui._select_mode_move(win, -1) + self.assertEqual(win.selected_idx, 1) + self.assertEqual(tui._selected_ids_for_window(win), ["p1", "p2"]) + + def test_escape_clears_select_mode_selection(self): + tui = self.load_tui( + { + "staging": [], + "pending": [_job("p1"), _job("p2")], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + win = tui.windows[pending_idx] + tui.active_win_idx = pending_idx + win.selected_idx = 0 + + tui._enter_select_mode(win) + tui._select_mode_move(win, 1) + tui._cancel_select_mode() + + self.assertFalse(tui.select_mode_active) + self.assertEqual(tui._selected_ids_for_window(win), []) + + def test_escape_clears_selection_even_after_leaving_select_mode(self): + tui = self.load_tui( + { + "staging": [_job("s1")], + "pending": [_job("p1")], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + win = tui.windows[pending_idx] + tui.active_win_idx = pending_idx + tui._enter_select_mode(win) + tui._exit_select_mode() + tui.selected_job_ids["staging"] = {"s1"} + + tui._cancel_select_mode() + + self.assertFalse(tui.select_mode_active) + self.assertEqual(tui.selected_job_ids["pending"], set()) + 
self.assertEqual(tui.selected_job_ids["staging"], set()) + + def test_bulk_cancel_pending_uses_selected_rows(self): + tui = self.load_tui( + { + "staging": [], + "pending": [_job("p1"), _job("p2"), _job("p3")], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + tui.active_win_idx = pending_idx + tui.selected_job_ids["pending"] = {"p1", "p3"} + + tui.do_action("cancel") + modal = self.require_modal(tui) + self.assertEqual(modal["type"], "CONFIRM") + modal["on_confirm"]() + + data = mod.load_queue_raw() + self.assertEqual([j["id"] for j in data["pending"]], ["p2"]) + self.assertEqual([j["id"] for j in data["completed"]], ["p3", "p1"]) + self.assertEqual(tui.selected_job_ids["pending"], set()) + + def test_move_pending_job_back_to_staging_top(self): + tui = self.load_tui( + { + "staging": [_job("s1")], + "pending": [_job("p1"), _job("p2")], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + win = tui.windows[pending_idx] + tui.active_win_idx = pending_idx + win.selected_idx = 1 + + tui.do_action("back_to_staging") + + data = mod.load_queue_raw() + self.assertEqual([j["id"] for j in data["staging"]], ["p2", "s1"]) + self.assertEqual([j["id"] for j in data["pending"]], ["p1"]) + self.assertEqual(win.selected_idx, 0) + + def test_bulk_move_pending_jobs_back_to_staging_preserves_order(self): + tui = self.load_tui( + { + "staging": [_job("s1")], + "pending": [_job("p1"), _job("p2"), _job("p3"), _job("p4")], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + tui.active_win_idx = pending_idx + tui.selected_job_ids["pending"] = {"p2", "p4"} + + tui.do_action("back_to_staging") + + data = mod.load_queue_raw() + self.assertEqual([j["id"] for j in data["staging"]], ["p2", "p4", "s1"]) + self.assertEqual([j["id"] for j in data["pending"]], ["p1", "p3"]) + self.assertEqual(tui.selected_job_ids["pending"], set()) + + def test_bulk_send_staged_jobs_to_pending(self): + tui = 
self.load_tui( + { + "staging": [_job("s1"), _job("s2"), _job("s3")], + "pending": [_job("p1")], + "running": [], + "completed": [], + } + ) + staging_idx = self.staging_idx(tui) + tui.active_win_idx = staging_idx + tui.selected_job_ids["staging"] = {"s1", "s3"} + + tui.do_action("send_to_pending") + self.require_modal(tui)["on_confirm"]() + + data = mod.load_queue_raw() + self.assertEqual([j["id"] for j in data["staging"]], ["s2"]) + self.assertEqual([j["id"] for j in data["pending"]], ["p1", "s1", "s3"]) + + def test_bulk_retry_completed_jobs_to_staging(self): + tui = self.load_tui( + { + "staging": [], + "pending": [], + "running": [], + "completed": [ + _job("c1", cmd="python a.py", status="failed"), + _job("c2", cmd="python b.py", status="failed"), + ], + } + ) + completed_idx = self.completed_idx(tui) + tui.active_win_idx = completed_idx + tui.selected_job_ids["completed"] = {"c1", "c2"} + + tui.do_action("retry") + modal = self.require_modal(tui) + self.assertEqual(modal["type"], "CONFIRM") + modal["on_confirm"]() + + data = mod.load_queue_raw() + self.assertEqual(data["completed"], []) + self.assertEqual( + [j["cmd"] for j in data["staging"]], + ["python b.py", "python a.py"], + ) + + def test_reorder_moves_bulk_selection_when_present(self): + tui = self.load_tui( + { + "staging": [], + "pending": [_job("p1"), _job("p2"), _job("p3")], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + win = tui.windows[pending_idx] + tui.active_win_idx = pending_idx + win.selected_idx = 1 + tui.selected_job_ids["pending"] = {"p2", "p3"} + + tui.do_action("move_pending_up") + + data = mod.load_queue_raw() + self.assertEqual([j["id"] for j in data["pending"]], ["p2", "p3", "p1"]) + self.assertEqual(win.selected_idx, 0) + def test_scroll_offset_clamps_to_viewport_after_height_change(self): win = mod.Window("PENDING", "pending") win.height = 5 @@ -234,7 +536,7 @@ def test_scroll_offset_clamps_to_viewport_after_height_change(self): 
self.assertEqual(win.scroll_offset, 0) self.assertEqual(win.selected_idx, 4) - def test_selected_job_panel_renders_current_job_in_nav_mode(self): + def test_selected_job_panel_renders_current_job_in_action_mode(self): tui = self.load_tui( { "staging": [], @@ -245,8 +547,7 @@ def test_selected_job_panel_renders_current_job_in_nav_mode(self): ) pending_idx = self.pending_idx(tui) tui.active_win_idx = pending_idx - tui.mode = "NAV" - tui.has_selected_job_context = True + tui._enter_action_window(tui.windows[pending_idx]) screen = FakeScreen(10, 80) tui.stdscr = screen @@ -256,6 +557,32 @@ def test_selected_job_panel_renders_current_job_in_nav_mode(self): self.assertIn("ID: p1", text) self.assertIn("python train.py", text) + def test_selected_job_panel_clears_after_exiting_action_window(self): + tui = self.load_tui( + { + "staging": [], + "pending": [_job("p1", cmd="python train.py")], + "running": [], + "completed": [], + } + ) + pending_idx = self.pending_idx(tui) + tui.active_win_idx = pending_idx + tui._enter_action_window(tui.windows[pending_idx]) + screen = FakeScreen(10, 80) + tui.stdscr = screen + + tui.draw_job_details(1, 0, 6, 80) + self.assertIn("ID: p1", screen.text()) + + screen.calls = [] + tui._exit_action_window(tui.windows[pending_idx]) + tui.draw_job_details(1, 0, 6, 80) + + text = screen.text() + self.assertIn("No job selected.", text) + self.assertNotIn("ID: p1", text) + def test_selected_job_panel_is_empty_until_a_panel_is_entered(self): tui = self.load_tui( { @@ -296,13 +623,19 @@ def test_draw_layout_stays_onscreen_with_collapsed_queue_window(self): tui.mode = "ACTION" tui.active_win_idx = self.pending_idx(tui) tui.gpu_status = [ - {"index": i, "used_mb": 1000, "total_mb": 24000, "util": 50, "processes": []} + { + "index": i, + "used_mb": 1000, + "total_mb": 24000, + "util": 50, + "processes": [], + } for i in range(8) ] next(win for win in tui.windows if win.key == "staging").collapsed = True tui.stdscr = FakeScreen(20, 80) - with 
patch.object(mod.curses, "color_pair", side_effect=lambda n: n): + with patch.object(tui_app.curses, "color_pair", side_effect=lambda n: n): tui.draw() self.assertEqual(tui.stdscr.errors, [])