From bde82965ac9ddd55d185b11e89fd1d4de538dbaf Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Wed, 1 Jul 2026 17:09:39 +0800 Subject: [PATCH 01/28] fix: add backend service watchdog recovery --- flocks/cli/main.py | 26 +++ flocks/cli/service_manager.py | 287 ++++++++++++++++++++++++++++- tests/cli/test_service_commands.py | 2 +- tests/cli/test_service_manager.py | 221 ++++++++++++++++++++++ 4 files changed, 533 insertions(+), 3 deletions(-) diff --git a/flocks/cli/main.py b/flocks/cli/main.py index bdfa8d8d5..7a8973a9e 100644 --- a/flocks/cli/main.py +++ b/flocks/cli/main.py @@ -33,9 +33,11 @@ from flocks.cli.service_manager import ( ServiceConfig, ServiceError, + WATCHDOG_CHECK_INTERVAL_SECONDS, read_runtime_record, resolve_flocks_cli_command, restart_all, + run_service_watchdog, runtime_paths, show_logs, show_status, @@ -401,6 +403,30 @@ def serve( ) +@app.command(name="service-watchdog", hidden=True) +def service_watchdog( + server_host: str = typer.Option("127.0.0.1", "--server-host", help="Backend server host"), + server_port: int = typer.Option(8000, "--server-port", help="Backend server port"), + webui_host: str = typer.Option("127.0.0.1", "--webui-host", help="WebUI host"), + webui_port: int = typer.Option(5173, "--webui-port", help="WebUI port"), + interval: float = typer.Option(WATCHDOG_CHECK_INTERVAL_SECONDS, "--interval", help="Health check interval"), +): + """ + Monitor daemon services and recover unhealthy backend listeners. + """ + run_service_watchdog( + ServiceConfig( + backend_host=server_host, + backend_port=server_port, + frontend_host=webui_host, + frontend_port=webui_port, + no_browser=True, + skip_frontend_build=True, + ), + interval=interval, + ) + + @app.command() def tui( directory: Optional[Path] = typer.Option(None, "--directory", "-d", help="Project directory"), diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index df534ffe7..0ef4a25b6 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -48,6 +48,10 @@ "src\\win\\async.c", "src/win/async.c", ) +WATCHDOG_CHECK_INTERVAL_SECONDS = 5.0 +WATCHDOG_HEALTH_FAILURE_THRESHOLD = 2 +WATCHDOG_PID_FILENAME = "watchdog.pid" +WATCHDOG_LOG_FILENAME = "watchdog.log" class ServiceError(RuntimeError): @@ -104,6 +108,15 @@ def has_artifacts(self) -> bool: return self.payload_present or self.pid_file_present +@dataclass(frozen=True) +class WatchdogProbeResult: + restart_needed: bool + health_failure: bool + reason: str + host: str + port: int + + def repo_root() -> Path: """Return the installed repository root.""" override = os.getenv("FLOCKS_REPO_ROOT") @@ -144,6 +157,16 @@ def ensure_runtime_dirs(paths: RuntimePaths | None = None) -> RuntimePaths: return current +def watchdog_pid_path(paths: RuntimePaths) -> Path: + """Return the watchdog runtime record path.""" + return paths.run_dir / WATCHDOG_PID_FILENAME + + +def watchdog_log_path(paths: RuntimePaths) -> Path: + """Return the watchdog log path.""" + return paths.log_dir / WATCHDOG_LOG_FILENAME + + def ensure_install_layout(root: Path | None = None) -> Path: """Validate that the installed repo still contains backend and WebUI code.""" current = root or repo_root() @@ -412,8 +435,8 @@ def write_runtime_record(pid_file: Path, record: RuntimeRecord) -> None: def process_runtime_record( process: subprocess.Popen, *, - host: str, - port: int, + host: str | None, + port: int | None, command: Sequence[str], ) -> RuntimeRecord: """Build runtime metadata for a freshly started service process.""" @@ -877,6 +900,17 @@ def _is_running_status_response(response: httpx.Response) -> bool: return isinstance(payload, dict) and payload.get("status") == "running" +def _is_healthy_status_response(response: httpx.Response) -> bool: + """Return True when the backend health endpoint reports healthy.""" + if response.status_code != 200: + return False + try: + payload = response.json() + except ValueError: + return False + return isinstance(payload, dict) and payload.get("status") == "healthy" + + def wait_for_http( urls: Sequence[str], name: str, @@ -902,6 +936,156 @@ def wait_for_http( raise ServiceError(f"{name} 启动超时,请检查日志。") +class _StdoutConsole: + """Console adapter for daemon logs redirected to a file.""" + + def print(self, *args, **_kwargs) -> None: + sys.stdout.write(" ".join(str(arg) for arg in args) + "\n") + sys.stdout.flush() + + +def _watchdog_log(event: str, details: dict[str, object] | None = None) -> None: + timestamp = datetime.datetime.now().isoformat(timespec="seconds") + suffix = "" + if details: + suffix = " " + json.dumps(details, ensure_ascii=True, sort_keys=True) + sys.stdout.write(f"[{timestamp}] watchdog.{event}{suffix}\n") + sys.stdout.flush() + + +def _backend_health_url(host: str, port: int) -> str: + return f"http://{_format_host_for_url(access_host(host))}:{port}/api/health" + + +def _watchdog_backend_endpoint(config: ServiceConfig, paths: RuntimePaths) -> tuple[RuntimeRecord | None, str, int]: + record = read_runtime_record(paths.backend_pid) + host = record.host if record is not None and record.host else config.backend_host + port = record.port if record is not None and record.port is not None else config.backend_port + return record, host, port + + +def _watchdog_backend_config(config: ServiceConfig, paths: RuntimePaths) -> ServiceConfig: + record, host, port = _watchdog_backend_endpoint(config, paths) + return ServiceConfig( + backend_host=host, + backend_port=port, + frontend_host=config.frontend_host, + frontend_port=config.frontend_port, + no_browser=True, + skip_frontend_build=True, + ) + + +def _watchdog_probe_backend( + config: ServiceConfig, + paths: RuntimePaths, + client: httpx.Client, +) -> WatchdogProbeResult: + record, host, port = _watchdog_backend_endpoint(config, paths) + if record is None: + return WatchdogProbeResult(False, False, "backend runtime record missing", host, port) + + backend_running = runtime_record_is_running(record) + listeners = port_owner_pids(port) + port_in_use = port_is_in_use(port, listeners) + if not backend_running: + if port_in_use: + return WatchdogProbeResult( + False, + False, + f"backend runtime record is not running but port {port} is occupied", + host, + port, + ) + return WatchdogProbeResult(True, False, "backend runtime record is not running", host, port) + if not port_in_use: + return WatchdogProbeResult(True, False, f"backend process alive but port {port} is not listening", host, port) + + runtime_pids = set(_runtime_record_pids(record)) + listener_pids = set(listeners) + if listener_pids and runtime_pids and listener_pids.isdisjoint(runtime_pids): + reason = ( + f"backend process alive but port {port} is owned by unexpected pid(s): " + f"{_join_pids(sorted(listener_pids))}" + ) + return WatchdogProbeResult(False, False, reason, host, port) + + url = _backend_health_url(host, port) + try: + response = client.get(url) + except Exception as exc: + return WatchdogProbeResult(True, True, f"backend health check failed: {exc}", host, port) + if not _is_healthy_status_response(response): + return WatchdogProbeResult( + True, + True, + f"backend health check unhealthy: status={response.status_code}", + host, + port, + ) + return WatchdogProbeResult(False, False, "backend healthy", host, port) + + +def _recover_unhealthy_backend(config: ServiceConfig, paths: RuntimePaths, reason: str) -> None: + try: + with service_lock(paths): + effective_config = _watchdog_backend_config(config, paths) + with httpx.Client(timeout=2.0, trust_env=False) as client: + probe = _watchdog_probe_backend(effective_config, paths, client) + if not probe.restart_needed: + _watchdog_log("backend_recovery_skipped", {"reason": probe.reason}) + return + + console = _StdoutConsole() + _watchdog_log( + "backend_recovery_start", + { + "reason": reason, + "host": probe.host, + "port": probe.port, + }, + ) + stop_one(probe.port, paths.backend_pid, "后端", console) + start_backend(effective_config, console) + _watchdog_log("backend_recovery_done", {"host": probe.host, "port": probe.port}) + except ServiceError as exc: + _watchdog_log("backend_recovery_failed", {"reason": reason, "error": str(exc)}) + except Exception as exc: + _watchdog_log("backend_recovery_crashed", {"reason": reason, "error": repr(exc)}) + + +def _watchdog_tick( + config: ServiceConfig, + paths: RuntimePaths, + health_failure_count: int, + *, + failure_threshold: int = WATCHDOG_HEALTH_FAILURE_THRESHOLD, +) -> int: + with httpx.Client(timeout=2.0, trust_env=False) as client: + probe = _watchdog_probe_backend(config, paths, client) + + if not probe.restart_needed: + return 0 + + if probe.health_failure: + health_failure_count += 1 + _watchdog_log( + "backend_health_failed", + { + "count": health_failure_count, + "threshold": failure_threshold, + "reason": probe.reason, + "host": probe.host, + "port": probe.port, + }, + ) + if health_failure_count < failure_threshold: + return health_failure_count + + _recover_unhealthy_backend(config, paths, probe.reason) + return 0 + + def start_backend(config: ServiceConfig, console) -> None: """Start the backend API service if needed.""" root = ensure_install_layout() @@ -1099,6 +1283,95 @@ def start_frontend(config: ServiceConfig, console) -> None: console.print(f"[flocks] WebUI 已启动,日志: {paths.frontend_log}") +def start_watchdog(config: ServiceConfig, console) -> None: + """Start the service watchdog daemon if needed.""" + root = ensure_install_layout() + paths = ensure_runtime_dirs() + pid_file = watchdog_pid_path(paths) + log_path = watchdog_log_path(paths) + cleanup_stale_pid_file(pid_file) + + runtime_record = read_runtime_record(pid_file) + if runtime_record is not None and runtime_record_is_running(runtime_record): + console.print(f"[flocks] Watchdog 已在运行,PID={runtime_record.pid}") + return + if runtime_record is not None: + pid_file.unlink(missing_ok=True) + + command = resolve_flocks_cli_command(root) + [ + "service-watchdog", + "--server-host", + config.backend_host, + "--server-port", + str(config.backend_port), + "--webui-host", + config.frontend_host, + "--webui-port", + str(config.frontend_port), + "--interval", + str(WATCHDOG_CHECK_INTERVAL_SECONDS), + ] + env = os.environ.copy() + env["PYTHONUNBUFFERED"] = "1" + + console.print("[flocks] 启动服务 Watchdog...") + process = _spawn_process(command, cwd=root, log_path=log_path, env=env) + write_runtime_record( + pid_file, + process_runtime_record( + process, + host=None, + port=None, + command=command, + ), + ) + _log_startup_config(log_path, "watchdog", config.backend_host, config.backend_port, read_runtime_record(pid_file)) + console.print(f"[flocks] Watchdog 已启动,日志: {log_path}") + + +def stop_watchdog(paths: RuntimePaths, console) -> None: + """Stop the service watchdog without touching backend/frontend ports.""" + pid_file = watchdog_pid_path(paths) + cleanup_stale_pid_file(pid_file) + if read_runtime_record(pid_file) is None: + return + stop_one(0, pid_file, "Watchdog", console) + + +def run_service_watchdog( + config: ServiceConfig, + *, + interval: float = WATCHDOG_CHECK_INTERVAL_SECONDS, + failure_threshold: int = WATCHDOG_HEALTH_FAILURE_THRESHOLD, +) -> None: + """Run the backend health watchdog loop.""" + paths = ensure_runtime_dirs() + _watchdog_log( + "started", + { + "backend_host": config.backend_host, + "backend_port": config.backend_port, + "interval": interval, + "failure_threshold": failure_threshold, + }, + ) + health_failure_count = 0 + while True: + try: + health_failure_count = _watchdog_tick( + config, + paths, + health_failure_count, + failure_threshold=failure_threshold, + ) + except KeyboardInterrupt: + _watchdog_log("stopped") + return + except Exception as exc: + _watchdog_log("tick_failed", {"error": repr(exc)}) + time.sleep(interval) + + def _tracked_processes_stopped( port: int, record: RuntimeRecord | None, @@ -1323,6 +1596,7 @@ def _stop_all_locked( fe_port, be_port = _resolve_stop_ports(paths, config) try: _resolve_upgrade_runtime(console, frontend_port=fe_port, attempt_recover=False) + stop_watchdog(paths, console) stop_one(fe_port, paths.frontend_pid, "WebUI", console) stop_one(be_port, paths.backend_pid, "后端", console) finally: @@ -1341,6 +1615,7 @@ def _start_all_without_stop(config: ServiceConfig, console) -> None: ensure_runtime_dirs() start_backend(config, console) start_frontend(config, console) + start_watchdog(config, console) show_start_summary(config, console) if not config.no_browser: open_default_browser(config.frontend_url, console) @@ -1367,9 +1642,11 @@ def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: current = paths or runtime_paths() cleanup_stale_pid_file(current.backend_pid) cleanup_stale_pid_file(current.frontend_pid) + cleanup_stale_pid_file(watchdog_pid_path(current)) backend_record = read_runtime_record(current.backend_pid) frontend_record = read_runtime_record(current.frontend_pid) + watchdog_record = read_runtime_record(watchdog_pid_path(current)) backend_port = _recorded_port(current.backend_pid, ServiceConfig.backend_port) frontend_port = _recorded_port(current.frontend_pid, ServiceConfig.frontend_port) backend_host = _loopback_host(_recorded_host(current.backend_pid, ServiceConfig.backend_host)) @@ -1417,11 +1694,17 @@ def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: else: lines.append("[flocks] WebUI 未运行") + if runtime_record_is_running(watchdog_record): + lines.append(f"[flocks] Watchdog 运行中: PID={watchdog_record.pid}") + else: + lines.append("[flocks] Watchdog 未运行") + if upgrade_info.payload_present: lines.append("[flocks] 检测到未完成的升级恢复状态") lines.append(f"[flocks] 后端日志: {current.backend_log}") lines.append(f"[flocks] WebUI 日志: {current.frontend_log}") + lines.append(f"[flocks] Watchdog 日志: {watchdog_log_path(current)}") return lines diff --git a/tests/cli/test_service_commands.py b/tests/cli/test_service_commands.py index 99a267c17..49d92a200 100644 --- a/tests/cli/test_service_commands.py +++ b/tests/cli/test_service_commands.py @@ -31,7 +31,7 @@ def test_cli_help_lists_service_commands(monkeypatch, tmp_path) -> None: assert result.exit_code == 0 for command in ("start", "stop", "restart", "status", "logs", "session", "mcp", "task", "skills"): assert _help_contains_command(result.stdout, command) - for command in ("agent", "acp", "debug", "run", "serve", "auth", "models"): + for command in ("agent", "acp", "debug", "run", "serve", "service-watchdog", "auth", "models"): assert not _help_contains_command(result.stdout, command) diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index ba42f890d..286f9541a 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -19,6 +19,18 @@ def print(self, *args, **kwargs) -> None: self.messages.append(" ".join(str(arg) for arg in args)) +def _make_runtime_paths(tmp_path: Path) -> service_manager.RuntimePaths: + return service_manager.RuntimePaths( + root=tmp_path, + run_dir=tmp_path / "run", + log_dir=tmp_path / "logs", + backend_pid=tmp_path / "run" / "backend.pid", + frontend_pid=tmp_path / "run" / "webui.pid", + backend_log=tmp_path / "logs" / "backend.log", + frontend_log=tmp_path / "logs" / "webui.log", + ) + + def test_runtime_paths_follow_flocks_root_env(monkeypatch, tmp_path: Path) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path)) @@ -797,6 +809,26 @@ def test_restart_all_stops_then_starts_under_lock(monkeypatch) -> None: ] +def test_start_all_without_stop_starts_watchdog_after_frontend(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + calls: list[str] = [] + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "start_backend", lambda _config, _console: calls.append("backend")) + monkeypatch.setattr(service_manager, "start_frontend", lambda _config, _console: calls.append("webui")) + monkeypatch.setattr(service_manager, "start_watchdog", lambda _config, _console: calls.append("watchdog")) + monkeypatch.setattr(service_manager, "show_start_summary", lambda _config, _console: calls.append("summary")) + monkeypatch.setattr( + service_manager, + "open_default_browser", + lambda _url, _console: calls.append("browser"), + ) + + service_manager._start_all_without_stop(service_manager.ServiceConfig(no_browser=True), DummyConsole()) + + assert calls == ["backend", "webui", "watchdog", "summary"] + + def test_start_all_stops_on_failure_before_restart(monkeypatch) -> None: paths = service_manager.RuntimePaths( root=Path("/tmp"), @@ -1209,6 +1241,195 @@ def fake_spawn(command, **kwargs): assert record.port == 5174 +def test_start_watchdog_writes_runtime_metadata(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + paths.run_dir.mkdir(parents=True) + paths.log_dir.mkdir(parents=True) + console = DummyConsole() + spawn_calls: list[dict[str, object]] = [] + + monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) + monkeypatch.setattr(service_manager, "runtime_record_is_running", lambda _record: False) + monkeypatch.setattr( + service_manager, + "resolve_flocks_cli_command", + lambda root=None: ["python", "-m", "flocks.cli.main"], + ) + monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: pid) + monkeypatch.setattr( + service_manager, + "_spawn_process", + lambda *args, **kwargs: spawn_calls.append({"args": args, "kwargs": kwargs}) or SimpleNamespace(pid=2468), + ) + + service_manager.start_watchdog( + service_manager.ServiceConfig(backend_host="0.0.0.0", backend_port=9000), + console, + ) + + record = service_manager.read_runtime_record(service_manager.watchdog_pid_path(paths)) + assert record is not None + assert record.pid == 2468 + assert record.port is None + assert record.command == ( + "python", + "-m", + "flocks.cli.main", + "service-watchdog", + "--server-host", + "0.0.0.0", + "--server-port", + "9000", + "--webui-host", + "127.0.0.1", + "--webui-port", + "5173", + "--interval", + str(service_manager.WATCHDOG_CHECK_INTERVAL_SECONDS), + ) + assert spawn_calls[0]["kwargs"]["log_path"] == service_manager.watchdog_log_path(paths) + + +def test_watchdog_recovers_backend_when_process_alive_but_port_not_listening(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + paths.run_dir.mkdir(parents=True) + service_manager.write_runtime_record( + paths.backend_pid, + service_manager.RuntimeRecord(pid=111, pgid=222, host="0.0.0.0", port=9995), + ) + calls: list[tuple[str, int]] = [] + + monkeypatch.setattr(service_manager, "runtime_record_is_running", lambda _record: True) + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) + monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: False) + monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) + monkeypatch.setattr( + service_manager, + "stop_one", + lambda port, _pid_file, _name, _console: calls.append(("stop", port)), + ) + monkeypatch.setattr( + service_manager, + "start_backend", + lambda config, _console: calls.append(("start", config.backend_port)), + ) + + next_count = service_manager._watchdog_tick( + service_manager.ServiceConfig(backend_port=8000), + paths, + 0, + ) + + assert next_count == 0 + assert calls == [("stop", 9995), ("start", 9995)] + + +def test_watchdog_does_not_recover_when_port_owned_by_unexpected_pid(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + paths.run_dir.mkdir(parents=True) + service_manager.write_runtime_record( + paths.backend_pid, + service_manager.RuntimeRecord(pid=111, pgid=222, host="0.0.0.0", port=9995), + ) + calls: list[str] = [] + + monkeypatch.setattr(service_manager, "runtime_record_is_running", lambda _record: True) + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [999]) + monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: True) + monkeypatch.setattr(service_manager, "_runtime_record_pids", lambda _record: [111]) + monkeypatch.setattr(service_manager, "_recover_unhealthy_backend", lambda *_args: calls.append("recover")) + + next_count = service_manager._watchdog_tick( + service_manager.ServiceConfig(backend_port=9995), + paths, + 0, + ) + + assert next_count == 0 + assert calls == [] + + +def test_watchdog_recovers_backend_when_runtime_record_is_dead(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + paths.run_dir.mkdir(parents=True) + service_manager.write_runtime_record( + paths.backend_pid, + service_manager.RuntimeRecord(pid=111, pgid=222, host="127.0.0.1", port=9995), + ) + calls: list[tuple[str, int]] = [] + + monkeypatch.setattr(service_manager, "runtime_record_is_running", lambda _record: False) + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) + monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: False) + monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) + monkeypatch.setattr( + service_manager, + "stop_one", + lambda port, _pid_file, _name, _console: calls.append(("stop", port)), + ) + monkeypatch.setattr( + service_manager, + "start_backend", + lambda config, _console: calls.append(("start", config.backend_port)), + ) + + next_count = service_manager._watchdog_tick( + service_manager.ServiceConfig(backend_port=9995), + paths, + 0, + ) + + assert next_count == 0 + assert calls == [("stop", 9995), ("start", 9995)] + + +def test_watchdog_waits_for_second_health_failure_before_restart(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + paths.run_dir.mkdir(parents=True) + service_manager.write_runtime_record( + paths.backend_pid, + service_manager.RuntimeRecord(pid=111, pgid=222, host="127.0.0.1", port=9995), + ) + calls: list[str] = [] + + class FakeClient: + def __init__(self, *_args, **_kwargs) -> None: + pass + + def __enter__(self): + return self + + def __exit__(self, *_args) -> None: + return None + + def get(self, _url): + return httpx.Response(503, json={"status": "unhealthy"}) + + monkeypatch.setattr(service_manager.httpx, "Client", FakeClient) + monkeypatch.setattr(service_manager, "runtime_record_is_running", lambda _record: True) + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [111]) + monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: True) + monkeypatch.setattr(service_manager, "_runtime_record_pids", lambda _record: [111]) + monkeypatch.setattr(service_manager, "_recover_unhealthy_backend", lambda *_args: calls.append("recover")) + + first_count = service_manager._watchdog_tick( + service_manager.ServiceConfig(backend_port=9995), + paths, + 0, + ) + second_count = service_manager._watchdog_tick( + service_manager.ServiceConfig(backend_port=9995), + paths, + first_count, + ) + + assert first_count == 1 + assert second_count == 0 + assert calls == ["recover"] + + def test_start_frontend_tolerates_windows_node_assertion_after_build(monkeypatch, tmp_path: Path) -> None: paths = service_manager.RuntimePaths( root=tmp_path, From b16d71442ea6ec3bee6a1628195bda5552a26b54 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Wed, 1 Jul 2026 18:30:21 +0800 Subject: [PATCH 02/28] Refactor service supervisor control plane --- flocks/cli/main.py | 46 +- flocks/cli/service_control.py | 116 +++ flocks/cli/service_manager.py | 968 +++++++---------------- flocks/cli/service_supervisor.py | 608 +++++++++++++++ flocks/server/app.py | 2 +- flocks/updater/restart_handoff.py | 58 +- flocks/updater/updater.py | 50 +- tests/cli/test_service_commands.py | 2 +- tests/cli/test_service_manager.py | 972 +++++------------------- tests/server/test_server_port_config.py | 72 +- tests/updater/test_restart_handoff.py | 29 +- tests/updater/test_updater.py | 135 ++-- 12 files changed, 1387 insertions(+), 1671 deletions(-) create mode 100644 flocks/cli/service_control.py create mode 100644 flocks/cli/service_supervisor.py diff --git a/flocks/cli/main.py b/flocks/cli/main.py index 7a8973a9e..58e5c2888 100644 --- a/flocks/cli/main.py +++ b/flocks/cli/main.py @@ -33,17 +33,16 @@ from flocks.cli.service_manager import ( ServiceConfig, ServiceError, - WATCHDOG_CHECK_INTERVAL_SECONDS, - read_runtime_record, resolve_flocks_cli_command, restart_all, - run_service_watchdog, runtime_paths, show_logs, show_status, start_all, stop_all, ) +from flocks.cli.service_control import read_supervisor_status +from flocks.cli.service_supervisor import run_service_daemon from flocks.config.config import Config from flocks.utils.log import Log, LogLevel @@ -213,21 +212,21 @@ def _resolve_port( def _restart_runtime_defaults() -> dict[str, Any]: - """Load host/port defaults from the last recorded service runtime.""" - paths = runtime_paths() - backend = read_runtime_record(paths.backend_pid) - frontend = read_runtime_record(paths.frontend_pid) + """Load host/port defaults from the running supervisor when available.""" defaults: dict[str, Any] = {} - if backend is not None: - if backend.host: - defaults["default_server_host"] = backend.host - if backend.port is not None: - defaults["default_server_port"] = backend.port - if frontend is not None: - if frontend.host: - defaults["default_webui_host"] = frontend.host - if frontend.port is not None: - defaults["default_webui_port"] = frontend.port + try: + payload = read_supervisor_status(paths=runtime_paths(), timeout=1.0) + except Exception: + return defaults + config = payload.get("config") if isinstance(payload.get("config"), dict) else {} + if isinstance(config.get("backend_host"), str): + defaults["default_server_host"] = config["backend_host"] + if isinstance(config.get("backend_port"), int): + defaults["default_server_port"] = config["backend_port"] + if isinstance(config.get("frontend_host"), str): + defaults["default_webui_host"] = config["frontend_host"] + if isinstance(config.get("frontend_port"), int): + defaults["default_webui_port"] = config["frontend_port"] return defaults @@ -403,27 +402,26 @@ def serve( ) -@app.command(name="service-watchdog", hidden=True) -def service_watchdog( +@app.command(name="service-daemon", hidden=True) +def service_daemon( server_host: str = typer.Option("127.0.0.1", "--server-host", help="Backend server host"), server_port: int = typer.Option(8000, "--server-port", help="Backend server port"), webui_host: str = typer.Option("127.0.0.1", "--webui-host", help="WebUI host"), webui_port: int = typer.Option(5173, "--webui-port", help="WebUI port"), - interval: float = typer.Option(WATCHDOG_CHECK_INTERVAL_SECONDS, "--interval", help="Health check interval"), + skip_webui_build: bool = typer.Option(False, "--skip-webui-build", help="Skip WebUI build before preview start"), ): """ - Monitor daemon services and recover unhealthy backend listeners. + Run the Flocks service supervisor daemon. """ - run_service_watchdog( + run_service_daemon( ServiceConfig( backend_host=server_host, backend_port=server_port, frontend_host=webui_host, frontend_port=webui_port, no_browser=True, - skip_frontend_build=True, + skip_frontend_build=skip_webui_build, ), - interval=interval, ) diff --git a/flocks/cli/service_control.py b/flocks/cli/service_control.py new file mode 100644 index 000000000..866afb796 --- /dev/null +++ b/flocks/cli/service_control.py @@ -0,0 +1,116 @@ +"""Local supervisor control API client helpers.""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path +from typing import Any + +import httpx + +SUPERVISOR_CONTROL_PORT = 48765 +SUPERVISOR_LOG_FILENAME = "supervisor.log" +SUPERVISOR_SOCKET_FILENAME = "service-daemon.sock" + + +def _default_runtime_paths(): + from flocks.cli.service_manager import runtime_paths + + return runtime_paths() + + +def supervisor_log_path(paths) -> Path: + """Return the supervisor daemon log path.""" + return paths.log_dir / SUPERVISOR_LOG_FILENAME + + +def supervisor_socket_path(paths) -> Path: + """Return the Unix control socket path for the supervisor daemon.""" + return paths.run_dir / SUPERVISOR_SOCKET_FILENAME + + +def supervisor_control_port() -> int: + """Return the local TCP control port used on Windows.""" + raw = os.getenv("FLOCKS_CONTROL_PORT") + if raw and raw.isdigit(): + value = int(raw) + if 0 < value < 65536: + return value + return SUPERVISOR_CONTROL_PORT + + +def supervisor_control_client(paths=None, timeout: float | None = 2.0) -> httpx.Client: + """Create a client for the local daemon control API.""" + if sys.platform == "win32": + return httpx.Client( + base_url=f"http://127.0.0.1:{supervisor_control_port()}", + timeout=timeout, + trust_env=False, + ) + current = paths or _default_runtime_paths() + transport = httpx.HTTPTransport(uds=str(supervisor_socket_path(current))) + return httpx.Client(base_url="http://flocks.local", timeout=timeout, trust_env=False, transport=transport) + + +def control_api_request( + method: str, + path: str, + *, + paths=None, + timeout: float | None = 2.0, + **kwargs, +) -> httpx.Response: + """Send one local control API request.""" + with supervisor_control_client(paths, timeout=timeout) as client: + response = client.request(method, path, **kwargs) + response.raise_for_status() + return response + + +def supervisor_is_running(paths=None) -> bool: + """Return True when the local supervisor control API responds.""" + try: + control_api_request("GET", "/status", paths=paths, timeout=0.75) + return True + except Exception: + return False + + +def read_control_json(path: str, *, paths=None, timeout: float | None = 2.0) -> dict[str, Any]: + response = control_api_request("GET", path, paths=paths, timeout=timeout) + payload = response.json() + if not isinstance(payload, dict): + raise RuntimeError("daemon control API returned an invalid response.") + return payload + + +def read_supervisor_status(paths=None, timeout: float | None = 2.0) -> dict[str, Any]: + """Read the current supervisor status from the local control API.""" + return read_control_json("/status", paths=paths, timeout=timeout) + + +def post_control_json( + path: str, + *, + payload: dict[str, Any] | None = None, + paths=None, + timeout: float | None = 5.0, +) -> dict[str, Any]: + response = control_api_request("POST", path, paths=paths, timeout=timeout, json=payload or {}) + data = response.json() + if not isinstance(data, dict): + raise RuntimeError("daemon control API returned an invalid response.") + return data + + +def service_config_payload(config) -> dict[str, object]: + """Serialize a ServiceConfig-like object for the supervisor control API.""" + return { + "backend_host": config.backend_host, + "backend_port": config.backend_port, + "frontend_host": config.frontend_host, + "frontend_port": config.frontend_port, + "no_browser": config.no_browser, + "skip_frontend_build": config.skip_frontend_build, + } diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index 0ef4a25b6..917754b72 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -21,16 +21,19 @@ from dataclasses import dataclass from pathlib import Path from shutil import which -from typing import Iterable, Sequence +from typing import Any, Iterable, Sequence import httpx -from flocks.browser.admin import stop_all_daemons as stop_all_browser_daemons - -try: - import fcntl -except ImportError: # pragma: no cover - unavailable on Windows - fcntl = None +from flocks.cli.service_control import ( + post_control_json, + read_control_json, + service_config_payload, + supervisor_control_client, + supervisor_is_running, + supervisor_log_path, + supervisor_socket_path, +) MIN_NODE_MAJOR = 22 FOLLOW_POLL_INTERVAL = 0.5 @@ -48,10 +51,8 @@ "src\\win\\async.c", "src/win/async.c", ) -WATCHDOG_CHECK_INTERVAL_SECONDS = 5.0 -WATCHDOG_HEALTH_FAILURE_THRESHOLD = 2 WATCHDOG_PID_FILENAME = "watchdog.pid" -WATCHDOG_LOG_FILENAME = "watchdog.log" +SUPERVISOR_START_TIMEOUT_SECONDS = 180.0 class ServiceError(RuntimeError): @@ -108,15 +109,6 @@ def has_artifacts(self) -> bool: return self.payload_present or self.pid_file_present -@dataclass(frozen=True) -class WatchdogProbeResult: - restart_needed: bool - health_failure: bool - reason: str - host: str - port: int - - def repo_root() -> Path: """Return the installed repository root.""" override = os.getenv("FLOCKS_REPO_ROOT") @@ -162,11 +154,6 @@ def watchdog_pid_path(paths: RuntimePaths) -> Path: return paths.run_dir / WATCHDOG_PID_FILENAME -def watchdog_log_path(paths: RuntimePaths) -> Path: - """Return the watchdog log path.""" - return paths.log_dir / WATCHDOG_LOG_FILENAME - - def ensure_install_layout(root: Path | None = None) -> Path: """Validate that the installed repo still contains backend and WebUI code.""" current = root or repo_root() @@ -416,22 +403,6 @@ def read_runtime_record(pid_file: Path) -> RuntimeRecord | None: return _parse_runtime_record(raw) -def write_runtime_record(pid_file: Path, record: RuntimeRecord) -> None: - """Persist runtime metadata in a backward-compatible JSON format.""" - payload: dict[str, object] = {"pid": record.pid} - if record.pgid is not None: - payload["pgid"] = record.pgid - if record.host is not None: - payload["host"] = record.host - if record.port is not None: - payload["port"] = record.port - if record.command: - payload["command"] = list(record.command) - if record.started_at is not None: - payload["started_at"] = record.started_at - pid_file.write_text(json.dumps(payload, ensure_ascii=True, sort_keys=True), encoding="utf-8") - - def process_runtime_record( process: subprocess.Popen, *, @@ -462,11 +433,6 @@ def read_pid(pid_file: Path) -> int | None: return record.pid if record else None -def write_pid(pid_file: Path, pid: int) -> None: - """Persist a process id.""" - write_runtime_record(pid_file, RuntimeRecord(pid=pid)) - - def _unix_process_stat(pid: int) -> str | None: """Return the Unix process status code for a pid, if available.""" if sys.platform == "win32" or pid <= 0: @@ -789,12 +755,6 @@ def _resolve_upgrade_runtime(console, *, frontend_port: int, attempt_recover: bo return result -def _effective_frontend_port(paths: RuntimePaths, default: int) -> int: - recorded_port = _recorded_port(paths.frontend_pid, default) - upgrade_info = _read_upgrade_runtime_info(recorded_port) - return upgrade_info.frontend_port or recorded_port - - def cleanup_stale_pid_file(pid_file: Path) -> None: """Remove pid files that no longer point to running processes.""" if not pid_file.exists(): @@ -810,20 +770,6 @@ def cleanup_stale_pid_file(pid_file: Path) -> None: pid_file.unlink(missing_ok=True) -def backend_is_running(config: ServiceConfig, paths: RuntimePaths | None = None) -> bool: - """Return True if the tracked backend process is running.""" - current = paths or runtime_paths() - cleanup_stale_pid_file(current.backend_pid) - return runtime_record_is_running(read_runtime_record(current.backend_pid)) or port_is_in_use(config.backend_port) - - -def frontend_is_running(config: ServiceConfig, paths: RuntimePaths | None = None) -> bool: - """Return True if the tracked frontend process is running.""" - current = paths or runtime_paths() - cleanup_stale_pid_file(current.frontend_pid) - return runtime_record_is_running(read_runtime_record(current.frontend_pid)) or port_is_in_use(config.frontend_port) - - def _port_owner_lookup_available() -> bool: """Return True when the current platform can resolve listener pids.""" return sys.platform == "win32" or bool(which("lsof") or which("fuser")) @@ -944,164 +890,80 @@ def print(self, *args, **_kwargs) -> None: sys.stdout.flush() -def _watchdog_log(event: str, details: dict[str, object] | None = None) -> None: - timestamp = datetime.datetime.now().isoformat(timespec="seconds") - suffix = "" - if details: - suffix = " " + json.dumps(details, ensure_ascii=True, sort_keys=True) - sys.stdout.write(f"[{timestamp}] watchdog.{event}{suffix}\n") - sys.stdout.flush() - - def _backend_health_url(host: str, port: int) -> str: return f"http://{_format_host_for_url(access_host(host))}:{port}/api/health" -def _watchdog_backend_endpoint(config: ServiceConfig, paths: RuntimePaths) -> tuple[RuntimeRecord | None, str, int]: - record = read_runtime_record(paths.backend_pid) - host = record.host if record is not None and record.host else config.backend_host - port = record.port if record is not None and record.port is not None else config.backend_port - return record, host, port - - -def _watchdog_backend_config(config: ServiceConfig, paths: RuntimePaths) -> ServiceConfig: - record, host, port = _watchdog_backend_endpoint(config, paths) - return ServiceConfig( - backend_host=host, - backend_port=port, - frontend_host=config.frontend_host, - frontend_port=config.frontend_port, - no_browser=True, - skip_frontend_build=True, - ) - +def _terminate_process( + process: subprocess.Popen | None, + name: str, + console, + *, + timeout: float = 10.0, +) -> None: + """Terminate a process and its process group without scanning service ports.""" + if process is None: + return + if process.poll() is not None: + return -def _watchdog_probe_backend( - config: ServiceConfig, - paths: RuntimePaths, - client: httpx.Client, -) -> WatchdogProbeResult: - record, host, port = _watchdog_backend_endpoint(config, paths) - if record is None: - return WatchdogProbeResult(False, False, "backend runtime record missing", host, port) - - backend_running = runtime_record_is_running(record) - listeners = port_owner_pids(port) - port_in_use = port_is_in_use(port, listeners) - if not backend_running: - if port_in_use: - return WatchdogProbeResult( - False, - False, - f"backend runtime record is not running but port {port} is occupied", - host, - port, - ) - return WatchdogProbeResult(True, False, "backend runtime record is not running", host, port) - if not port_in_use: - return WatchdogProbeResult(True, False, f"backend process alive but port {port} is not listening", host, port) - - runtime_pids = set(_runtime_record_pids(record)) - listener_pids = set(listeners) - if listener_pids and runtime_pids and listener_pids.isdisjoint(runtime_pids): - reason = ( - f"backend process alive but port {port} is owned by unexpected pid(s): " - f"{_join_pids(sorted(listener_pids))}" - ) - return WatchdogProbeResult(False, False, reason, host, port) + record = process_runtime_record(process, host=None, port=None, command=()) + console.print(f"[flocks] 停止 {name}(PID={process.pid})...") + if sys.platform == "win32": + subprocess.run(["taskkill", "/PID", str(process.pid), "/T", "/F"], check=False, capture_output=True) + else: + if record.pgid is not None: + signal_process_group(signal.SIGTERM, record.pgid) + else: + signal_pid_list(signal.SIGTERM, collect_process_tree_pids(process.pid)) - url = _backend_health_url(host, port) - try: - response = client.get(url) - except Exception as exc: - return WatchdogProbeResult(True, True, f"backend health check failed: {exc}", host, port) - if not _is_healthy_status_response(response): - return WatchdogProbeResult( - True, - True, - f"backend health check unhealthy: status={response.status_code}", - host, - port, - ) - return WatchdogProbeResult(False, False, "backend healthy", host, port) + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if process.poll() is not None and not process_group_is_running(record.pgid): + return + time.sleep(0.25) + console.print(f"[flocks] {name} 未在预期时间内退出,强制终止...") + if sys.platform == "win32": + subprocess.run(["taskkill", "/PID", str(process.pid), "/T", "/F"], check=False, capture_output=True) + else: + if record.pgid is not None: + signal_process_group(signal.SIGKILL, record.pgid) + signal_pid_list(signal.SIGKILL, collect_process_tree_pids(process.pid)) -def _recover_unhealthy_backend(config: ServiceConfig, paths: RuntimePaths, reason: str) -> None: - try: - with service_lock(paths): - effective_config = _watchdog_backend_config(config, paths) - with httpx.Client(timeout=2.0, trust_env=False) as client: - probe = _watchdog_probe_backend(effective_config, paths, client) - if not probe.restart_needed: - _watchdog_log("backend_recovery_skipped", {"reason": probe.reason}) - return - console = _StdoutConsole() - _watchdog_log( - "backend_recovery_start", - { - "reason": reason, - "host": probe.host, - "port": probe.port, - }, - ) - stop_one(probe.port, paths.backend_pid, "后端", console) - start_backend(effective_config, console) - _watchdog_log("backend_recovery_done", {"host": probe.host, "port": probe.port}) - except ServiceError as exc: - _watchdog_log("backend_recovery_failed", {"reason": reason, "error": str(exc)}) - except Exception as exc: - _watchdog_log("backend_recovery_crashed", {"reason": reason, "error": repr(exc)}) +def _backend_command_and_env(root: Path, config: ServiceConfig) -> tuple[list[str], dict[str, str]]: + """Build the backend service command and environment.""" + command = resolve_flocks_cli_command(root) + [ + "serve", + "--host", + config.backend_host, + "--port", + str(config.backend_port), + ] + env = os.environ.copy() + env["_FLOCKS_WEBUI_HOST"] = config.frontend_host + env["_FLOCKS_WEBUI_PORT"] = str(config.frontend_port) + env["PYTHONUNBUFFERED"] = "1" + env.setdefault("FLOCKS_CONSOLE_BASE_URL", DEFAULT_FLOCKS_CONSOLE_BASE_URL) + return command, env -def _watchdog_tick( +def _start_backend_process( config: ServiceConfig, - paths: RuntimePaths, - health_failure_count: int, + console, *, - failure_threshold: int = WATCHDOG_HEALTH_FAILURE_THRESHOLD, -) -> int: - with httpx.Client(timeout=2.0, trust_env=False) as client: - probe = _watchdog_probe_backend(config, paths, client) - - if not probe.restart_needed: - return 0 - - if probe.health_failure: - health_failure_count += 1 - _watchdog_log( - "backend_health_failed", - { - "count": health_failure_count, - "threshold": failure_threshold, - "reason": probe.reason, - "host": probe.host, - "port": probe.port, - }, - ) - if health_failure_count < failure_threshold: - return health_failure_count - - _recover_unhealthy_backend(config, paths, probe.reason) - return 0 - - -def start_backend(config: ServiceConfig, console) -> None: - """Start the backend API service if needed.""" + paths: RuntimePaths | None = None, +) -> subprocess.Popen: + """Start the backend child process for the supervisor.""" root = ensure_install_layout() - paths = ensure_runtime_dirs() - cleanup_stale_pid_file(paths.backend_pid) + current = paths if paths is not None else ensure_runtime_dirs() - runtime_record = read_runtime_record(paths.backend_pid) - tracked_pid = runtime_record.pid if runtime_record else None listeners = port_owner_pids(config.backend_port) if listeners: - if tracked_pid and tracked_pid in listeners: - console.print(f"[flocks] 后端已在运行,PID={tracked_pid}") - return raise ServiceError( f"后端端口 {config.backend_port} 已被占用 (PID: {_join_pids(listeners)})," - "与当前运行时记录不一致,请先执行 `flocks stop` 或手动清理残留进程。" + "请先执行 `flocks stop` 或手动清理残留进程。" ) if port_is_in_use(config.backend_port, listeners): raise ServiceError( @@ -1109,45 +971,16 @@ def start_backend(config: ServiceConfig, console) -> None: "请先安装 lsof 或手动清理残留进程。" ) - if runtime_record is not None and runtime_record_is_running(runtime_record): - raise ServiceError( - "后端运行记录仍存活,但端口未监听;请先执行 `flocks stop` 清理异常状态后重试。" - ) - - if runtime_record is not None: - paths.backend_pid.unlink(missing_ok=True) - - command = resolve_flocks_cli_command(root) + [ - "serve", - "--host", - config.backend_host, - "--port", - str(config.backend_port), - ] - - backend_env = os.environ.copy() - backend_env["_FLOCKS_WEBUI_HOST"] = config.frontend_host - backend_env["_FLOCKS_WEBUI_PORT"] = str(config.frontend_port) - backend_env["PYTHONUNBUFFERED"] = "1" - backend_env.setdefault("FLOCKS_CONSOLE_BASE_URL", DEFAULT_FLOCKS_CONSOLE_BASE_URL) - + command, env = _backend_command_and_env(root, config) console.print("[flocks] 启动后端服务...") - process = _spawn_process( - command, - cwd=root, - log_path=paths.backend_log, - env=backend_env, - ) - write_runtime_record( - paths.backend_pid, - process_runtime_record( - process, - host=config.backend_host, - port=config.backend_port, - command=command, - ), + process = _spawn_process(command, cwd=root, log_path=current.backend_log, env=env) + record = process_runtime_record( + process, + host=config.backend_host, + port=config.backend_port, + command=command, ) - _log_startup_config(paths.backend_log, "backend", config.backend_host, config.backend_port, read_runtime_record(paths.backend_pid)) + _log_startup_config(current.backend_log, "backend", config.backend_host, config.backend_port, record) try: wait_for_http( @@ -1157,27 +990,24 @@ def start_backend(config: ServiceConfig, console) -> None: validator=_is_running_status_response, ) except ServiceError: - _emit_service_log_tail(console, paths.backend_log, "后端") - stop_one(config.backend_port, paths.backend_pid, "后端", console) + _emit_service_log_tail(console, current.backend_log, "后端") + _terminate_process(process, "后端", console) raise + return process - console.print(f"[flocks] 后端已启动,日志: {paths.backend_log}") - -def start_frontend(config: ServiceConfig, console) -> None: - """Build and start the WebUI preview service if needed.""" +def _start_frontend_process( + config: ServiceConfig, + console, + *, + paths: RuntimePaths | None = None, +) -> subprocess.Popen: + """Build and start the WebUI child process.""" root = ensure_install_layout() - paths = ensure_runtime_dirs() - cleanup_stale_pid_file(paths.frontend_pid) + current = paths if paths is not None else ensure_runtime_dirs() - runtime_record = read_runtime_record(paths.frontend_pid) - tracked_pid = runtime_record.pid if runtime_record else None listeners = port_owner_pids(config.frontend_port) if listeners: - if tracked_pid and tracked_pid in listeners: - console.print(f"[flocks] WebUI 已在运行,PID={tracked_pid}") - return - upgrade_info = _read_upgrade_runtime_info(config.frontend_port) if upgrade_info.page_active: _resolve_upgrade_runtime( @@ -1185,25 +1015,17 @@ def start_frontend(config: ServiceConfig, console) -> None: frontend_port=upgrade_info.frontend_port or config.frontend_port, attempt_recover=False, ) - cleanup_stale_pid_file(paths.frontend_pid) - runtime_record = read_runtime_record(paths.frontend_pid) - tracked_pid = runtime_record.pid if runtime_record else None listeners = port_owner_pids(config.frontend_port) - if tracked_pid and tracked_pid in listeners: - console.print(f"[flocks] WebUI 已在运行,PID={tracked_pid}") - return - if not listeners: - tracked_pid = runtime_record.pid if runtime_record else None - else: + if listeners: raise ServiceError( f"WebUI 端口 {config.frontend_port} 已被占用 (PID: {_join_pids(listeners)})," - "与当前运行时记录不一致,请先执行 `flocks stop` 或手动清理残留进程。" + "请先执行 `flocks stop` 或手动清理残留进程。" ) else: raise ServiceError( f"WebUI 端口 {config.frontend_port} 已被占用 (PID: {_join_pids(listeners)})," - "与当前运行时记录不一致,请先执行 `flocks stop` 或手动清理残留进程。" + "请先执行 `flocks stop` 或手动清理残留进程。" ) elif port_is_in_use(config.frontend_port, listeners): raise ServiceError( @@ -1211,14 +1033,6 @@ def start_frontend(config: ServiceConfig, console) -> None: "请先安装 lsof 或手动清理残留进程。" ) - if runtime_record is not None and runtime_record_is_running(runtime_record): - raise ServiceError( - "WebUI 运行记录仍存活,但端口未监听;请先执行 `flocks stop` 清理异常状态后重试。" - ) - - if runtime_record is not None: - paths.frontend_pid.unlink(missing_ok=True) - npm = resolve_npm_executable() if not npm: raise ServiceError("未检测到 npm,请先安装 Node.js 22+(包含 npm)后重试。") @@ -1256,158 +1070,53 @@ def start_frontend(config: ServiceConfig, console) -> None: ] console.print("[flocks] 启动 WebUI...") - process = _spawn_process( - command, - cwd=webui_dir, - log_path=paths.frontend_log, - env=frontend_env, - ) - write_runtime_record( - paths.frontend_pid, - process_runtime_record( - process, - host=config.frontend_host, - port=config.frontend_port, - command=command, - ), + process = _spawn_process(command, cwd=webui_dir, log_path=current.frontend_log, env=frontend_env) + record = process_runtime_record( + process, + host=config.frontend_host, + port=config.frontend_port, + command=command, ) - _log_startup_config(paths.frontend_log, "webui", config.frontend_host, config.frontend_port, read_runtime_record(paths.frontend_pid)) + _log_startup_config(current.frontend_log, "webui", config.frontend_host, config.frontend_port, record) try: wait_for_http([config.frontend_url], "WebUI") except ServiceError: - _emit_service_log_tail(console, paths.frontend_log, "WebUI") - stop_one(config.frontend_port, paths.frontend_pid, "WebUI", console) + _emit_service_log_tail(console, current.frontend_log, "WebUI") + _terminate_process(process, "WebUI", console) raise - console.print(f"[flocks] WebUI 已启动,日志: {paths.frontend_log}") + return process -def start_watchdog(config: ServiceConfig, console) -> None: - """Start the service watchdog daemon if needed.""" - root = ensure_install_layout() - paths = ensure_runtime_dirs() - pid_file = watchdog_pid_path(paths) - log_path = watchdog_log_path(paths) +def stop_runtime_record_process(pid_file: Path, name: str, console) -> None: + """Stop a legacy pid/runtime record without scanning ports.""" cleanup_stale_pid_file(pid_file) - - runtime_record = read_runtime_record(pid_file) - if runtime_record is not None and runtime_record_is_running(runtime_record): - console.print(f"[flocks] Watchdog 已在运行,PID={runtime_record.pid}") - return - if runtime_record is not None: + record = read_runtime_record(pid_file) + if record is None: pid_file.unlink(missing_ok=True) - - command = resolve_flocks_cli_command(root) + [ - "service-watchdog", - "--server-host", - config.backend_host, - "--server-port", - str(config.backend_port), - "--webui-host", - config.frontend_host, - "--webui-port", - str(config.frontend_port), - "--interval", - str(WATCHDOG_CHECK_INTERVAL_SECONDS), - ] - env = os.environ.copy() - env["PYTHONUNBUFFERED"] = "1" - - console.print("[flocks] 启动服务 Watchdog...") - process = _spawn_process(command, cwd=root, log_path=log_path, env=env) - write_runtime_record( - pid_file, - process_runtime_record( - process, - host=None, - port=None, - command=command, - ), - ) - _log_startup_config(log_path, "watchdog", config.backend_host, config.backend_port, read_runtime_record(pid_file)) - console.print(f"[flocks] Watchdog 已启动,日志: {log_path}") - - -def stop_watchdog(paths: RuntimePaths, console) -> None: - """Stop the service watchdog without touching backend/frontend ports.""" - pid_file = watchdog_pid_path(paths) - cleanup_stale_pid_file(pid_file) - if read_runtime_record(pid_file) is None: return - stop_one(0, pid_file, "Watchdog", console) - - -def run_service_watchdog( - config: ServiceConfig, - *, - interval: float = WATCHDOG_CHECK_INTERVAL_SECONDS, - failure_threshold: int = WATCHDOG_HEALTH_FAILURE_THRESHOLD, -) -> None: - """Run the backend health watchdog loop.""" - paths = ensure_runtime_dirs() - _watchdog_log( - "started", - { - "backend_host": config.backend_host, - "backend_port": config.backend_port, - "interval": interval, - "failure_threshold": failure_threshold, - }, - ) - health_failure_count = 0 - while True: - try: - health_failure_count = _watchdog_tick( - config, - paths, - health_failure_count, - failure_threshold=failure_threshold, - ) - except KeyboardInterrupt: - _watchdog_log("stopped") - return - except Exception as exc: - _watchdog_log("tick_failed", {"error": repr(exc)}) - time.sleep(interval) - - -def _tracked_processes_stopped( - port: int, - record: RuntimeRecord | None, - tracked_pids: Iterable[int], -) -> bool: - """Return True when the tracked service no longer has running processes.""" - listeners = port_owner_pids(port) - if port_is_in_use(port, listeners): - return False - if runtime_record_is_running(record): - return False - return not any(pid_is_running(pid) for pid in tracked_pids) - - -def _runtime_record_pids(record: RuntimeRecord | None) -> list[int]: - """Collect the latest pids implied by a runtime record.""" - if record is None: - return [] - - result: list[int] = [] - if record.pid > 0: - result = append_unique_pids(result, collect_process_tree_pids(record.pid)) - if record.pgid is not None and sys.platform != "win32": - result = append_unique_pids(result, _process_group_member_pids(record.pgid)) - return result + targets = collect_process_tree_pids(record.pid) + console.print(f"[flocks] 清理旧 {name} 进程(PID={record.pid})...") + if sys.platform == "win32": + subprocess.run(["taskkill", "/PID", str(record.pid), "/T", "/F"], check=False, capture_output=True) + else: + if record.pgid is not None: + signal_process_group(signal.SIGTERM, record.pgid) + else: + signal_pid_list(signal.SIGTERM, targets) + deadline = time.monotonic() + 5.0 + while time.monotonic() < deadline: + if not runtime_record_is_running(record): + pid_file.unlink(missing_ok=True) + return + time.sleep(0.25) + if record.pgid is not None: + signal_process_group(signal.SIGKILL, record.pgid) + signal_pid_list(signal.SIGKILL, targets) -def _current_stop_targets( - port: int, - record: RuntimeRecord | None, - tracked_pids: Iterable[int], -) -> list[int]: - """Refresh the pid list that stop_one() should verify or force kill.""" - result = append_unique_pids([], tracked_pids) - result = append_unique_pids(result, _runtime_record_pids(record)) - return append_unique_pids(result, port_owner_pids(port)) + pid_file.unlink(missing_ok=True) def signal_process_group(sig: signal.Signals, pgid: int | None) -> None: @@ -1420,137 +1129,6 @@ def signal_process_group(sig: signal.Signals, pgid: int | None) -> None: pass -def stop_one(port: int, pid_file: Path, name: str, console) -> None: - """Stop a single service by tracked pid and/or listening port.""" - cleanup_stale_pid_file(pid_file) - runtime_record = read_runtime_record(pid_file) - tracked_pid = runtime_record.pid if runtime_record else None - listeners = port_owner_pids(port) - - target_pids: list[int] = [] - if tracked_pid is not None: - target_pids = append_unique_pids(target_pids, collect_process_tree_pids(tracked_pid)) - target_pids = append_unique_pids(target_pids, listeners) - if sys.platform == "win32" and runtime_record is not None: - filtered_targets: list[int] = [] - for pid in target_pids: - if pid in listeners: - filtered_targets = append_unique_pids(filtered_targets, [pid]) - continue - if pid == runtime_record.pid and not _windows_runtime_record_matches_pid(runtime_record, pid, listeners): - continue - filtered_targets = append_unique_pids(filtered_targets, [pid]) - target_pids = filtered_targets - - group_running = process_group_is_running(runtime_record.pgid if runtime_record else None) - if not target_pids and not group_running: - if port_is_in_use(port, listeners): - raise ServiceError( - f"{name} 端口 {port} 已被占用,但当前环境无法识别占用 PID;" - "请先安装 lsof 或手动处理该进程。" - ) - pid_file.unlink(missing_ok=True) - console.print(f"[flocks] {name} 未运行。") - return - - details = _join_pids(target_pids) if target_pids else "none" - if runtime_record and runtime_record.pgid is not None and sys.platform != "win32": - details = f"{details}; PGID={runtime_record.pgid}" - console.print(f"[flocks] 停止 {name}(端口 {port},PID: {details})...") - - if sys.platform == "win32": - for pid in target_pids: - subprocess.run(["taskkill", "/PID", str(pid), "/T", "/F"], check=False, capture_output=True) - else: - if runtime_record and runtime_record.pgid is not None: - signal_process_group(signal.SIGTERM, runtime_record.pgid) - else: - signal_pid_list(signal.SIGTERM, target_pids) - for _ in range(10): - current_targets = _current_stop_targets(port, runtime_record, target_pids) - if _tracked_processes_stopped(port, runtime_record, current_targets): - pid_file.unlink(missing_ok=True) - console.print(f"[flocks] {name} 已停止。") - return - time.sleep(1) - - console.print(f"[flocks] {name} 未在预期时间内退出,强制终止...") - force_targets = _current_stop_targets(port, runtime_record, target_pids) - if runtime_record and runtime_record.pgid is not None: - signal_process_group(signal.SIGKILL, runtime_record.pgid) - signal_pid_list(signal.SIGKILL, force_targets) - - for _ in range(10): - force_targets = _current_stop_targets(port, runtime_record, target_pids) - if _tracked_processes_stopped(port, runtime_record, force_targets): - pid_file.unlink(missing_ok=True) - console.print(f"[flocks] {name} 已停止。") - return - if sys.platform == "win32": - for pid in force_targets: - subprocess.run(["taskkill", "/PID", str(pid), "/T", "/F"], check=False, capture_output=True) - else: - if runtime_record and runtime_record.pgid is not None: - signal_process_group(signal.SIGKILL, runtime_record.pgid) - signal_pid_list(signal.SIGKILL, force_targets) - time.sleep(1) - - raise ServiceError(f"{name} 未在预期时间内退出,请手动检查端口 {port}。") - - -def _recorded_port(pid_file: Path, default: int) -> int: - """Return the port from a runtime record, falling back to *default*.""" - record = read_runtime_record(pid_file) - if record is not None and record.port is not None: - return record.port - return default - - -def _recorded_host(pid_file: Path, default: str) -> str: - """Return the host from a runtime record, falling back to *default*.""" - record = read_runtime_record(pid_file) - if record is not None and record.host: - return record.host - return default - - -@contextlib.contextmanager -def service_lock(paths: RuntimePaths): - """Serialize lifecycle commands with a cross-process lock file.""" - lock_path = paths.run_dir / "service.lock" - lock_path.parent.mkdir(parents=True, exist_ok=True) - handle = lock_path.open("a+", encoding="utf-8") - unlock_windows = None - try: - try: - if sys.platform == "win32": - import msvcrt - - handle.seek(0) - handle.write("0") - handle.flush() - handle.seek(0) - msvcrt.locking(handle.fileno(), msvcrt.LK_NBLCK, 1) - unlock_windows = msvcrt - else: - if fcntl is None: # pragma: no cover - defensive - raise OSError("fcntl unavailable") - fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB) - except OSError as error: - raise ServiceError("另一个 flocks 命令正在执行,请稍后重试。") from error - yield - finally: - try: - if unlock_windows is not None: - handle.seek(0) - unlock_windows.locking(handle.fileno(), unlock_windows.LK_UNLCK, 1) - elif fcntl is not None and sys.platform != "win32": - fcntl.flock(handle, fcntl.LOCK_UN) - except OSError: - pass - handle.close() - - def _log_startup_config( log_path: Path, name: str, @@ -1568,144 +1146,173 @@ def _log_startup_config( handle.write(line) -def _resolve_stop_ports( +def _wait_for_supervisor_ready( paths: RuntimePaths, - config: ServiceConfig | None = None, -) -> tuple[int, int]: - """Resolve frontend/backend ports for stop flows. - - When a runtime record is missing or uses the legacy pid-only format, - ``start`` and ``restart`` should fall back to the current CLI config - rather than the static default ports. - """ - frontend_default = config.frontend_port if config is not None else ServiceConfig.frontend_port - backend_default = config.backend_port if config is not None else ServiceConfig.backend_port - return ( - _effective_frontend_port(paths, frontend_default), - _recorded_port(paths.backend_pid, backend_default), - ) + *, + process: subprocess.Popen | None = None, + timeout: float = SUPERVISOR_START_TIMEOUT_SECONDS, +) -> dict[str, Any]: + """Wait for the supervisor control API and managed services to become ready.""" + deadline = time.monotonic() + timeout + last_payload: dict[str, Any] | None = None + while time.monotonic() < deadline: + if process is not None and process.poll() is not None: + raise ServiceError(f"Supervisor 启动失败,退出码: {process.returncode}") + try: + payload = read_control_json("/status", paths=paths, timeout=1.0) + last_payload = payload + backend_state = ((payload.get("backend") or {}).get("state") if isinstance(payload.get("backend"), dict) else None) + webui_state = ((payload.get("webui") or {}).get("state") if isinstance(payload.get("webui"), dict) else None) + if backend_state == "healthy" and webui_state == "healthy": + return payload + if backend_state == "degraded" or webui_state == "degraded": + return payload + except Exception: + pass + time.sleep(0.5) + if last_payload is not None: + return last_payload + raise ServiceError("Supervisor 启动超时,请检查日志。") -def _stop_all_locked( - paths: RuntimePaths, - console, - *, - config: ServiceConfig | None = None, -) -> None: - """Stop frontend then backend while reusing the caller's lock.""" - fe_port, be_port = _resolve_stop_ports(paths, config) - try: - _resolve_upgrade_runtime(console, frontend_port=fe_port, attempt_recover=False) - stop_watchdog(paths, console) - stop_one(fe_port, paths.frontend_pid, "WebUI", console) - stop_one(be_port, paths.backend_pid, "后端", console) - finally: - stop_all_browser_daemons() +def _start_supervisor_process(config: ServiceConfig, paths: RuntimePaths, console) -> subprocess.Popen: + """Spawn the detached service supervisor daemon.""" + root = ensure_install_layout() + log_path = supervisor_log_path(paths) + if sys.platform != "win32": + supervisor_socket_path(paths).unlink(missing_ok=True) + command = resolve_flocks_cli_command(root) + [ + "service-daemon", + "--server-host", + config.backend_host, + "--server-port", + str(config.backend_port), + "--webui-host", + config.frontend_host, + "--webui-port", + str(config.frontend_port), + ] + if config.skip_frontend_build: + command.append("--skip-webui-build") + env = os.environ.copy() + env["PYTHONUNBUFFERED"] = "1" + console.print("[flocks] 启动 Supervisor daemon...") + return _spawn_process(command, cwd=root, log_path=log_path, env=env) def stop_all(console) -> None: - """Stop frontend then backend using ports persisted in runtime records.""" + """Stop managed services through the supervisor control API.""" paths = ensure_runtime_dirs() - with service_lock(paths): - _stop_all_locked(paths, console) + if not supervisor_is_running(paths): + console.print("[flocks] Supervisor 未运行。") + return + try: + post_control_json("/stop", paths=paths, timeout=2.0) + except Exception as exc: + raise ServiceError(f"无法请求 Supervisor 停止: {exc}") from exc + + deadline = time.monotonic() + 20.0 + while time.monotonic() < deadline: + if not supervisor_is_running(paths): + console.print("[flocks] Supervisor 已停止。") + return + time.sleep(0.5) + raise ServiceError("Supervisor 未在预期时间内退出。") def _start_all_without_stop(config: ServiceConfig, console) -> None: - """Start backend and frontend, then print access summary.""" - ensure_runtime_dirs() - start_backend(config, console) - start_frontend(config, console) - start_watchdog(config, console) + """Start the supervisor daemon, then print access summary.""" + paths = ensure_runtime_dirs() + process = _start_supervisor_process(config, paths, console) + payload = _wait_for_supervisor_ready(paths, process=process) show_start_summary(config, console) + _print_status_payload(payload, console) if not config.no_browser: open_default_browser(config.frontend_url, console) def start_all(config: ServiceConfig, console) -> None: - """Ensure backend and frontend are restarted with a clean state.""" + """Ensure the supervisor daemon is running.""" paths = ensure_runtime_dirs() - with service_lock(paths): - _stop_all_locked(paths, console, config=config) - _start_all_without_stop(config, console) + if supervisor_is_running(paths): + console.print("[flocks] Supervisor 已在运行。") + show_status(console) + if not config.no_browser: + try: + payload = read_control_json("/status", paths=paths, timeout=1.0) + url = _frontend_url_from_status_payload(payload, config.frontend_url) + except Exception: + url = config.frontend_url + open_default_browser(url, console) + return + _start_all_without_stop(config, console) def restart_all(config: ServiceConfig, console) -> None: - """Restart backend and frontend.""" + """Restart backend and frontend through the supervisor control API.""" paths = ensure_runtime_dirs() - with service_lock(paths): - _stop_all_locked(paths, console, config=config) - _start_all_without_stop(config, console) + if not supervisor_is_running(paths): + start_all(config, console) + return + try: + payload = post_control_json("/restart", payload=service_config_payload(config), paths=paths, timeout=180.0) + except Exception as exc: + raise ServiceError(f"无法请求 Supervisor 重启: {exc}") from exc + _print_status_payload(payload, console) def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: - """Return a human-readable status summary.""" + """Return a human-readable status summary from the supervisor control API.""" current = paths or runtime_paths() - cleanup_stale_pid_file(current.backend_pid) - cleanup_stale_pid_file(current.frontend_pid) - cleanup_stale_pid_file(watchdog_pid_path(current)) - - backend_record = read_runtime_record(current.backend_pid) - frontend_record = read_runtime_record(current.frontend_pid) - watchdog_record = read_runtime_record(watchdog_pid_path(current)) - backend_port = _recorded_port(current.backend_pid, ServiceConfig.backend_port) - frontend_port = _recorded_port(current.frontend_pid, ServiceConfig.frontend_port) - backend_host = _loopback_host(_recorded_host(current.backend_pid, ServiceConfig.backend_host)) - frontend_host = _loopback_host(_recorded_host(current.frontend_pid, ServiceConfig.frontend_host)) - upgrade_info = _read_upgrade_runtime_info(frontend_port) - if frontend_record is None and upgrade_info.frontend_port is not None: - frontend_port = upgrade_info.frontend_port - if frontend_record is None and upgrade_info.frontend_host: - frontend_host = _loopback_host(upgrade_info.frontend_host) - backend_pid = backend_record.pid if backend_record else None - frontend_pid = frontend_record.pid if frontend_record else None - backend_listeners = port_owner_pids(backend_port) - frontend_listeners = port_owner_pids(frontend_port) - backend_in_use = port_is_in_use(backend_port, backend_listeners) - frontend_in_use = port_is_in_use(frontend_port, frontend_listeners) - - lines: list[str] = [] - if backend_listeners: - lines.append( - f"[flocks] 后端运行中: PID={_join_pids(backend_listeners)} URL=http://{backend_host}:{backend_port}" - ) - elif backend_in_use: - lines.append(f"[flocks] 后端运行中: PID=unknown URL=http://{backend_host}:{backend_port}") - elif pid_is_running(backend_pid): - lines.append(f"[flocks] 后端主进程仍在运行,但端口 {backend_port} 未监听: PID={backend_pid}") - elif process_group_is_running(backend_record.pgid if backend_record else None): - lines.append(f"[flocks] 后端进程组仍在运行,但端口 {backend_port} 未监听: PGID={backend_record.pgid}") - else: - lines.append("[flocks] 后端未运行") + try: + payload = read_control_json("/status", paths=current) + except Exception: + return [ + "[flocks] Supervisor 未运行", + f"[flocks] Supervisor 日志: {supervisor_log_path(current)}", + ] + return _status_lines_from_payload(payload) + + +def _status_lines_from_payload(payload: dict[str, Any]) -> list[str]: + daemon = payload.get("daemon") if isinstance(payload.get("daemon"), dict) else {} + backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} + webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} + lines = [ + f"[flocks] Supervisor 运行中: PID={daemon.get('pid')} state={daemon.get('state')}", + _service_status_line("后端", backend), + _service_status_line("WebUI", webui), + f"[flocks] Supervisor 日志: {daemon.get('log_path')}", + ] + for service in (backend, webui): + log_path = service.get("log_path") + if log_path: + lines.append(f"[flocks] {service.get('state')} 日志: {log_path}") + return lines - if upgrade_info.page_active: - lines.append( - f"[flocks] WebUI 临时升级页运行中: PID={_join_pids(upgrade_info.listener_pids)} URL=http://{frontend_host}:{frontend_port}" - ) - elif frontend_listeners: - lines.append( - f"[flocks] WebUI 运行中: PID={_join_pids(frontend_listeners)} URL=http://{frontend_host}:{frontend_port}" - ) - elif frontend_in_use: - lines.append(f"[flocks] WebUI 运行中: PID=unknown URL=http://{frontend_host}:{frontend_port}") - elif pid_is_running(frontend_pid): - lines.append(f"[flocks] WebUI 主进程仍在运行,但端口 {frontend_port} 未监听: PID={frontend_pid}") - elif process_group_is_running(frontend_record.pgid if frontend_record else None): - lines.append(f"[flocks] WebUI 进程组仍在运行,但端口 {frontend_port} 未监听: PGID={frontend_record.pgid}") - else: - lines.append("[flocks] WebUI 未运行") - if runtime_record_is_running(watchdog_record): - lines.append(f"[flocks] Watchdog 运行中: PID={watchdog_record.pid}") - else: - lines.append("[flocks] Watchdog 未运行") +def _service_status_line(label: str, payload: dict[str, Any]) -> str: + host = _loopback_host(str(payload.get("host") or "127.0.0.1")) + port = payload.get("port") + pid = payload.get("pid") + state = payload.get("state") or "unknown" + error = payload.get("last_error") + suffix = f" last_error={error}" if error else "" + return f"[flocks] {label}: state={state} PID={pid} URL=http://{host}:{port}{suffix}" - if upgrade_info.payload_present: - lines.append("[flocks] 检测到未完成的升级恢复状态") - lines.append(f"[flocks] 后端日志: {current.backend_log}") - lines.append(f"[flocks] WebUI 日志: {current.frontend_log}") - lines.append(f"[flocks] Watchdog 日志: {watchdog_log_path(current)}") - return lines +def _frontend_url_from_status_payload(payload: dict[str, Any], fallback: str) -> str: + webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} + host = webui.get("host") + port = webui.get("port") + if isinstance(host, str) and isinstance(port, int): + return f"http://{_format_host_for_url(_loopback_host(host))}:{port}" + return fallback + + +def _print_status_payload(payload: dict[str, Any], console) -> None: + for line in _status_lines_from_payload(payload): + console.print(line) def show_status(console) -> None: @@ -1737,42 +1344,39 @@ def show_logs( follow: bool = True, lines: int = 50, ) -> None: - """Print recent service logs and optionally follow them.""" + """Print recent service logs through the supervisor control API.""" paths = ensure_runtime_dirs() - selections = selected_log_paths(paths, backend=backend, webui=webui) - prefixes = {paths.backend_log: "backend", paths.frontend_log: "webui"} - - for path in selections: - path.touch(exist_ok=True) - console.print(f"[{prefixes[path]}] --- {path} ---") - for line in tail_lines(path, lines): - console.print(f"[{prefixes[path]}] {line}") - + service = "all" + if backend and not webui: + service = "backend" + elif webui and not backend: + service = "webui" + params = {"service": service, "lines": str(lines), "follow": "true" if follow else "false"} if not follow: + try: + payload = read_control_json(f"/logs?service={service}&lines={lines}&follow=false", paths=paths, timeout=5.0) + except Exception as exc: + raise ServiceError(f"无法通过 Supervisor 读取日志: {exc}") from exc + logs = payload.get("logs") if isinstance(payload.get("logs"), dict) else {} + for prefix, entry in logs.items(): + if not isinstance(entry, dict): + continue + console.print(f"[{prefix}] --- {entry.get('path')} ---") + for line in entry.get("lines") or []: + console.print(f"[{prefix}] {line}") return console.print("[flocks] 按 Ctrl+C 退出日志跟随。") - handles = {} try: - for path in selections: - handle = path.open("r", encoding="utf-8", errors="replace") - handle.seek(0, os.SEEK_END) - handles[path] = handle - - while True: - emitted = False - for path, handle in handles.items(): - while True: - line = handle.readline() - if not line: - break - emitted = True - console.print(f"[{prefixes[path]}] {line.rstrip()}") - if not emitted: - time.sleep(FOLLOW_POLL_INTERVAL) - finally: - for handle in handles.values(): - handle.close() + with supervisor_control_client(paths, timeout=None) as client: + with client.stream("GET", "/logs", params=params) as response: + response.raise_for_status() + for line in response.iter_lines(): + console.print(line) + except KeyboardInterrupt: + return + except Exception as exc: + raise ServiceError(f"无法通过 Supervisor 跟随日志: {exc}") from exc def selected_log_paths( diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py new file mode 100644 index 000000000..67793186c --- /dev/null +++ b/flocks/cli/service_supervisor.py @@ -0,0 +1,608 @@ +"""Supervisor daemon for the local Flocks service.""" + +from __future__ import annotations + +import datetime +import json +import os +import signal +import socket +import subprocess +import sys +import threading +import time +from dataclasses import dataclass +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path +from typing import Any +from urllib.parse import parse_qs, urlparse + +import httpx + +from flocks.browser.admin import stop_all_daemons as stop_all_browser_daemons +from flocks.cli.service_control import ( + service_config_payload, + supervisor_control_port, + supervisor_log_path, + supervisor_socket_path, +) + +SUPERVISOR_CHECK_INTERVAL_SECONDS = 5.0 +SUPERVISOR_HEALTH_FAILURE_THRESHOLD = 2 +SUPERVISOR_BACKOFF_SECONDS = (1.0, 2.0, 5.0, 10.0, 30.0) + + +@dataclass +class ManagedService: + name: str + label: str + host: str + port: int + log_path: Path + process: subprocess.Popen | None = None + command: tuple[str, ...] = () + state: str = "stopped" + last_error: str | None = None + restart_count: int = 0 + last_restart_at: float | None = None + health_failure_count: int = 0 + next_restart_at: float = 0.0 + built_once: bool = False + + @property + def pid(self) -> int | None: + return self.process.pid if self.process is not None else None + + +def _daemon_log(event: str, details: dict[str, object] | None = None) -> None: + """Write a structured supervisor log line to stdout.""" + timestamp = datetime.datetime.now().isoformat(timespec="seconds") + suffix = "" + if details: + suffix = " " + json.dumps(details, ensure_ascii=True, sort_keys=True) + sys.stdout.write(f"[{timestamp}] supervisor.{event}{suffix}\n") + sys.stdout.flush() + + +def _config_from_payload(payload: dict[str, Any], default): + from flocks.cli.service_manager import ServiceConfig + + def _string(name: str, fallback: str) -> str: + value = payload.get(name) + return value if isinstance(value, str) and value else fallback + + def _int(name: str, fallback: int) -> int: + value = payload.get(name) + return value if isinstance(value, int) and not isinstance(value, bool) and value > 0 else fallback + + return ServiceConfig( + backend_host=_string("backend_host", default.backend_host), + backend_port=_int("backend_port", default.backend_port), + frontend_host=_string("frontend_host", default.frontend_host), + frontend_port=_int("frontend_port", default.frontend_port), + no_browser=bool(payload.get("no_browser", default.no_browser)), + skip_frontend_build=bool(payload.get("skip_frontend_build", default.skip_frontend_build)), + ) + + +def _tcp_port_accepts_connections(host: str, port: int) -> bool: + """Return True when a local service accepts TCP connections.""" + from flocks.cli.service_manager import access_host + + try: + with socket.create_connection((access_host(host), port), timeout=1.0): + return True + except OSError: + return False + + +def _health_status_from_service_state(state: str) -> str: + if state in {"healthy", "starting", "restarting", "stopped", "paused"}: + return state + return "degraded" + + +def _service_payload(service: ManagedService, *, paused: bool = False) -> dict[str, object]: + return { + "pid": service.pid, + "host": service.host, + "port": service.port, + "state": "paused" if paused else service.state, + "health": _health_status_from_service_state("paused" if paused else service.state), + "last_error": service.last_error, + "restart_count": service.restart_count, + "last_restart_at": service.last_restart_at, + "log_path": str(service.log_path), + "command": list(service.command), + "paused": paused, + } + + +class _UnixControlServer(ThreadingHTTPServer): + address_family = socket.AF_UNIX + + +class SupervisorDaemon: + """Owns backend/WebUI child processes and exposes a local control API.""" + + def __init__( + self, + config, + *, + interval: float = SUPERVISOR_CHECK_INTERVAL_SECONDS, + failure_threshold: int = SUPERVISOR_HEALTH_FAILURE_THRESHOLD, + ) -> None: + from flocks.cli.service_manager import ensure_runtime_dirs + + self.config = config + self.paths = ensure_runtime_dirs() + self.interval = interval + self.failure_threshold = failure_threshold + self.started_at = time.time() + self._lock = threading.RLock() + self._shutdown_requested = threading.Event() + self._server: ThreadingHTTPServer | None = None + self._server_thread: threading.Thread | None = None + self._webui_paused = False + self.backend = ManagedService( + name="backend", + label="后端", + host=config.backend_host, + port=config.backend_port, + log_path=self.paths.backend_log, + ) + self.webui = ManagedService( + name="webui", + label="WebUI", + host=config.frontend_host, + port=config.frontend_port, + log_path=self.paths.frontend_log, + ) + + def run(self) -> None: + """Run the supervisor until the control API asks it to stop.""" + self._install_signal_handlers() + self._cleanup_legacy_runtime() + self._start_control_server() + try: + self.restart_all(reason="startup") + while not self._shutdown_requested.wait(self.interval): + self.tick() + finally: + self.shutdown_children() + self._stop_control_server() + stop_all_browser_daemons() + _daemon_log("stopped") + + def _install_signal_handlers(self) -> None: + if threading.current_thread() is not threading.main_thread(): + return + + def _handle(_signum, _frame) -> None: + self.request_stop() + + for sig in (signal.SIGINT, signal.SIGTERM): + try: + signal.signal(sig, _handle) + except (OSError, ValueError): # pragma: no cover - platform defensive + pass + + def _cleanup_legacy_runtime(self) -> None: + from flocks.cli import service_manager + + console = service_manager._StdoutConsole() + for pid_file, name in ( + (service_manager.watchdog_pid_path(self.paths), "watchdog"), + (self.paths.frontend_pid, "WebUI"), + (self.paths.backend_pid, "backend"), + ): + record = service_manager.read_runtime_record(pid_file) + if record is not None and service_manager.runtime_record_is_running(record): + service_manager.stop_runtime_record_process(pid_file, name, console) + else: + pid_file.unlink(missing_ok=True) + + def _start_control_server(self) -> None: + handler = self._handler_class() + if sys.platform == "win32": + server: ThreadingHTTPServer = ThreadingHTTPServer(("127.0.0.1", supervisor_control_port()), handler) + else: + socket_path = supervisor_socket_path(self.paths) + socket_path.parent.mkdir(parents=True, exist_ok=True) + socket_path.unlink(missing_ok=True) + server = _UnixControlServer(str(socket_path), handler) + self._server = server + self._server_thread = threading.Thread(target=server.serve_forever, name="flocks-supervisor-control", daemon=True) + self._server_thread.start() + _daemon_log("control_started", {"platform": sys.platform}) + + def _stop_control_server(self) -> None: + if self._server is not None: + self._server.shutdown() + self._server.server_close() + if self._server_thread is not None: + self._server_thread.join(timeout=5.0) + if sys.platform != "win32": + supervisor_socket_path(self.paths).unlink(missing_ok=True) + + def _handler_class(self): + daemon = self + + class ControlHandler(BaseHTTPRequestHandler): + protocol_version = "HTTP/1.0" + + def log_message(self, _format, *_args) -> None: + return + + def _send_json(self, payload: dict[str, object], status: int = 200) -> None: + body = json.dumps(payload, ensure_ascii=False, sort_keys=True).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _read_json(self) -> dict[str, Any]: + length = int(self.headers.get("Content-Length") or "0") + if length <= 0: + return {} + try: + payload = json.loads(self.rfile.read(length).decode("utf-8")) + except (UnicodeDecodeError, json.JSONDecodeError): + return {} + return payload if isinstance(payload, dict) else {} + + def do_GET(self) -> None: + parsed = urlparse(self.path) + try: + if parsed.path == "/status": + self._send_json(daemon.status_payload()) + return + if parsed.path == "/logs": + daemon.handle_logs_request(self, parse_qs(parsed.query)) + return + self._send_json({"error": "not found"}, status=404) + except Exception as exc: # pragma: no cover - defensive control path + self._send_json({"error": str(exc)}, status=500) + + def do_POST(self) -> None: + parsed = urlparse(self.path) + payload = self._read_json() + try: + if parsed.path == "/stop": + daemon.request_stop() + self._send_json({"status": "stopping"}) + return + if parsed.path == "/restart": + daemon.update_config(payload) + daemon.restart_all(reason="control restart") + self._send_json(daemon.status_payload()) + return + if parsed.path == "/restart/backend": + daemon.restart_backend(reason="control restart") + self._send_json(daemon.status_payload()) + return + if parsed.path == "/restart/webui": + daemon.update_config(payload) + daemon.restart_webui( + reason="control restart", + force_frontend_build=bool(payload.get("force_frontend_build")), + ) + self._send_json(daemon.status_payload()) + return + if parsed.path == "/stop/webui": + daemon.stop_webui(reason="control stop") + self._send_json(daemon.status_payload()) + return + self._send_json({"error": "not found"}, status=404) + except Exception as exc: # pragma: no cover - defensive control path + self._send_json({"error": str(exc)}, status=500) + + return ControlHandler + + def update_config(self, payload: dict[str, Any]) -> None: + with self._lock: + self.config = _config_from_payload(payload, self.config) + self.backend.host = self.config.backend_host + self.backend.port = self.config.backend_port + self.webui.host = self.config.frontend_host + self.webui.port = self.config.frontend_port + + def request_stop(self) -> None: + self._shutdown_requested.set() + + def status_payload(self) -> dict[str, object]: + try: + from flocks import __version__ + except Exception: # pragma: no cover - defensive + __version__ = "unknown" + with self._lock: + return { + "daemon": { + "pid": os.getpid(), + "uptime": time.time() - self.started_at, + "version": __version__, + "state": "stopping" if self._shutdown_requested.is_set() else "running", + "log_path": str(supervisor_log_path(self.paths)), + }, + "backend": _service_payload(self.backend), + "webui": _service_payload(self.webui, paused=self._webui_paused), + "config": service_config_payload(self.config), + } + + def handle_logs_request(self, handler: BaseHTTPRequestHandler, query: dict[str, list[str]]) -> None: + from flocks.cli.service_manager import FOLLOW_POLL_INTERVAL, _coerce_positive_int, tail_lines + + service_name = (query.get("service") or ["backend"])[0] + lines = _coerce_positive_int((query.get("lines") or ["50"])[0]) or 50 + follow = (query.get("follow") or ["false"])[0].lower() == "true" + selections = self._log_paths_for_service(service_name) + if not selections: + body = json.dumps({"error": "unknown service"}, ensure_ascii=False).encode("utf-8") + handler.send_response(400) + handler.send_header("Content-Type", "application/json; charset=utf-8") + handler.send_header("Content-Length", str(len(body))) + handler.end_headers() + handler.wfile.write(body) + return + + for _prefix, log_path in selections: + log_path.touch(exist_ok=True) + if not follow: + body = json.dumps( + { + "service": service_name, + "logs": { + prefix: { + "path": str(log_path), + "lines": tail_lines(log_path, lines), + } + for prefix, log_path in selections + }, + }, + ensure_ascii=False, + ).encode("utf-8") + handler.send_response(200) + handler.send_header("Content-Type", "application/json; charset=utf-8") + handler.send_header("Content-Length", str(len(body))) + handler.end_headers() + handler.wfile.write(body) + return + + handler.send_response(200) + handler.send_header("Content-Type", "text/plain; charset=utf-8") + handler.end_headers() + for prefix, log_path in selections: + handler.wfile.write((f"[{prefix}] --- {log_path} ---\n").encode("utf-8", errors="replace")) + for line in tail_lines(log_path, lines): + handler.wfile.write((f"[{prefix}] {line}\n").encode("utf-8", errors="replace")) + handler.wfile.flush() + handles = {} + try: + for prefix, log_path in selections: + handle = log_path.open("r", encoding="utf-8", errors="replace") + handle.seek(0, os.SEEK_END) + handles[prefix] = handle + while not self._shutdown_requested.is_set(): + emitted = False + for prefix, handle in handles.items(): + while True: + line = handle.readline() + if not line: + break + emitted = True + handler.wfile.write((f"[{prefix}] {line}").encode("utf-8", errors="replace")) + if emitted: + handler.wfile.flush() + else: + time.sleep(FOLLOW_POLL_INTERVAL) + finally: + for handle in handles.values(): + handle.close() + + def _log_paths_for_service(self, service_name: str) -> list[tuple[str, Path]]: + if service_name == "backend": + return [("backend", self.paths.backend_log)] + if service_name == "webui": + return [("webui", self.paths.frontend_log)] + if service_name == "supervisor": + return [("supervisor", supervisor_log_path(self.paths))] + if service_name == "all": + return [ + ("backend", self.paths.backend_log), + ("webui", self.paths.frontend_log), + ("supervisor", supervisor_log_path(self.paths)), + ] + return [] + + def restart_all(self, *, reason: str) -> None: + with self._lock: + self._webui_paused = False + self._restart_service(self.webui, reason=reason, immediate=True) + self._restart_service(self.backend, reason=reason, immediate=True) + self._start_backend_locked(immediate=True) + self._start_webui_locked(immediate=True) + + def restart_backend(self, *, reason: str) -> None: + with self._lock: + self._restart_service(self.backend, reason=reason, immediate=True) + self._start_backend_locked(immediate=True) + + def restart_webui(self, *, reason: str, force_frontend_build: bool = False) -> None: + with self._lock: + self._webui_paused = False + if force_frontend_build: + self.webui.built_once = False + self._restart_service(self.webui, reason=reason, immediate=True) + self._start_webui_locked(immediate=True) + + def stop_webui(self, *, reason: str) -> None: + with self._lock: + self._webui_paused = True + _daemon_log("service_pause", {"service": "webui", "reason": reason}) + self._stop_service(self.webui) + self.webui.last_error = reason + + def shutdown_children(self) -> None: + with self._lock: + self._stop_service(self.webui) + self._stop_service(self.backend) + + def tick(self) -> None: + with self._lock: + self._probe_backend_locked() + if not self._webui_paused: + self._probe_webui_locked() + self._start_backend_locked(immediate=False) + if not self._webui_paused: + self._start_webui_locked(immediate=False) + + def _restart_service(self, service: ManagedService, *, reason: str, immediate: bool) -> None: + _daemon_log("service_restart", {"service": service.name, "reason": reason}) + self._stop_service(service) + service.state = "restarting" + service.last_error = reason + service.health_failure_count = 0 + service.restart_count += 1 + service.last_restart_at = time.time() + service.next_restart_at = time.monotonic() if immediate else self._next_restart_time(service.restart_count) + + def _stop_service(self, service: ManagedService) -> None: + from flocks.cli.service_manager import _StdoutConsole, _terminate_process + + _terminate_process(service.process, service.label, _StdoutConsole()) + service.process = None + service.command = () + service.state = "stopped" + + def _start_backend_locked(self, *, immediate: bool) -> None: + from flocks.cli.service_manager import _StdoutConsole, _start_backend_process + + if self.backend.process is not None and self.backend.process.poll() is None: + return + if not immediate and time.monotonic() < self.backend.next_restart_at: + return + self.backend.state = "starting" + try: + process = _start_backend_process(self.config, _StdoutConsole(), paths=self.paths) + except Exception as exc: + self._mark_start_failed(self.backend, exc) + return + self.backend.process = process + self.backend.command = tuple(str(item) for item in process.args) + self.backend.state = "healthy" + self.backend.last_error = None + self.backend.health_failure_count = 0 + + def _start_webui_locked(self, *, immediate: bool) -> None: + from flocks.cli.service_manager import ServiceConfig, _StdoutConsole, _start_frontend_process + + if self.webui.process is not None and self.webui.process.poll() is None: + return + if not immediate and time.monotonic() < self.webui.next_restart_at: + return + self.webui.state = "starting" + config = self.config + if self.webui.built_once: + config = ServiceConfig( + backend_host=config.backend_host, + backend_port=config.backend_port, + frontend_host=config.frontend_host, + frontend_port=config.frontend_port, + no_browser=config.no_browser, + skip_frontend_build=True, + ) + try: + process = _start_frontend_process(config, _StdoutConsole(), paths=self.paths) + except Exception as exc: + self._mark_start_failed(self.webui, exc) + return + self.webui.process = process + self.webui.command = tuple(str(item) for item in process.args) + self.webui.state = "healthy" + self.webui.last_error = None + self.webui.health_failure_count = 0 + self.webui.built_once = True + + def _mark_start_failed(self, service: ManagedService, error: Exception) -> None: + service.process = None + service.state = "degraded" + service.last_error = str(error) + service.next_restart_at = self._next_restart_time(service.restart_count) + _daemon_log( + "service_start_failed", + {"service": service.name, "error": str(error), "retry_at": service.next_restart_at}, + ) + + def _next_restart_time(self, restart_count: int) -> float: + index = min(max(restart_count, 1) - 1, len(SUPERVISOR_BACKOFF_SECONDS) - 1) + return time.monotonic() + SUPERVISOR_BACKOFF_SECONDS[index] + + def _probe_backend_locked(self) -> None: + from flocks.cli.service_manager import _backend_health_url, _is_healthy_status_response + + process = self.backend.process + if process is None: + self.backend.state = "stopped" + return + if process.poll() is not None: + self._restart_service(self.backend, reason=f"process exited with code {process.returncode}", immediate=True) + return + if not _tcp_port_accepts_connections(self.backend.host, self.backend.port): + self._restart_service(self.backend, reason=f"port {self.backend.port} is not listening", immediate=True) + return + + url = _backend_health_url(self.backend.host, self.backend.port) + try: + with httpx.Client(timeout=2.0, trust_env=False) as client: + response = client.get(url) + healthy = _is_healthy_status_response(response) + reason = f"health status={response.status_code}" + except Exception as exc: + healthy = False + reason = f"health failed: {exc}" + if healthy: + self.backend.state = "healthy" + self.backend.health_failure_count = 0 + self.backend.last_error = None + return + + self.backend.health_failure_count += 1 + self.backend.state = "degraded" + self.backend.last_error = reason + if self.backend.health_failure_count >= self.failure_threshold: + self._restart_service(self.backend, reason=reason, immediate=True) + + def _probe_webui_locked(self) -> None: + process = self.webui.process + if process is None: + self.webui.state = "stopped" + return + if process.poll() is not None: + self._restart_service(self.webui, reason=f"process exited with code {process.returncode}", immediate=True) + return + if not _tcp_port_accepts_connections(self.webui.host, self.webui.port): + self._restart_service(self.webui, reason=f"port {self.webui.port} is not listening", immediate=True) + return + self.webui.state = "healthy" + self.webui.health_failure_count = 0 + self.webui.last_error = None + + +def run_service_daemon( + config, + *, + interval: float = SUPERVISOR_CHECK_INTERVAL_SECONDS, + failure_threshold: int = SUPERVISOR_HEALTH_FAILURE_THRESHOLD, +) -> None: + """Run the local supervisor daemon.""" + _daemon_log( + "started", + { + "backend_host": config.backend_host, + "backend_port": config.backend_port, + "frontend_host": config.frontend_host, + "frontend_port": config.frontend_port, + }, + ) + SupervisorDaemon(config, interval=interval, failure_threshold=failure_threshold).run() diff --git a/flocks/server/app.py b/flocks/server/app.py index 8c7cb1b24..aa03ebc60 100644 --- a/flocks/server/app.py +++ b/flocks/server/app.py @@ -681,7 +681,7 @@ def _should_log_request(path: str, status_code: int) -> bool: # CORS Configuration # # Priority order: -# 1. Runtime env vars exported by ``start_backend()`` → add the concrete +# 1. Runtime env vars exported by the supervised backend launcher → add the concrete # ``_FLOCKS_WEBUI_*`` origin inferred from the current CLI launch. # 2. Explicit ``server.cors`` in flocks.json → append user-configured # origins without discarding the runtime ones. diff --git a/flocks/updater/restart_handoff.py b/flocks/updater/restart_handoff.py index f6350b400..dc639b02e 100644 --- a/flocks/updater/restart_handoff.py +++ b/flocks/updater/restart_handoff.py @@ -26,11 +26,6 @@ DEFAULT_POLL_INTERVAL_SECONDS = 0.25 -class _NullConsole: - def print(self, *args, **kwargs) -> None: - return None - - def _record_handoff_log(message: str) -> None: append_upgrade_text_log(f"restart_handoff {message}") @@ -68,52 +63,14 @@ def _wait_for_backend_port_free( return not _backend_port_in_use(port) -def _ensure_backend_port_free(backend_port: int, backend_pid_file: Path) -> bool: +def _ensure_backend_port_free(backend_port: int) -> bool: if _wait_for_backend_port_free(backend_port): return True - _record_handoff_log(f"backend_port_still_in_use port={backend_port}; stopping backend") - try: - service_manager.stop_one(backend_port, backend_pid_file, "backend", _NullConsole()) - except Exception as exc: - _record_handoff_log(f"backend_stop_failed port={backend_port} error={exc}") - return False - + _record_handoff_log(f"backend_port_still_in_use port={backend_port}") return _wait_for_backend_port_free(backend_port, timeout_seconds=POST_STOP_PORT_TIMEOUT_SECONDS) -def _cli_subcommand(argv: Sequence[str]) -> str | None: - for index, value in enumerate(argv[:-2]): - if value == "-m" and argv[index + 1] == "flocks.cli.main": - return argv[index + 2] - return None - - -def _record_backend_runtime_if_direct_serve( - process: subprocess.Popen, - restart_argv: Sequence[str], - *, - backend_host: str, - backend_port: int, - backend_pid_file: Path, -) -> None: - if _cli_subcommand(restart_argv) != "serve": - return - - try: - service_manager.write_runtime_record( - backend_pid_file, - service_manager.process_runtime_record( - process, - host=backend_host, - port=backend_port, - command=restart_argv, - ), - ) - except Exception as exc: - _record_handoff_log(f"backend_runtime_record_failed error={exc}") - - def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description="Flocks restart handoff helper") parser.add_argument("--parent-pid", type=int, required=True) @@ -121,7 +78,6 @@ def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: parser.add_argument("--backend-port", type=int, required=True) parser.add_argument("--frontend-host", required=True) parser.add_argument("--frontend-port", type=int, required=True) - parser.add_argument("--backend-pid-file", required=True) parser.add_argument("--install-root", required=True) parser.add_argument("--uv-path", required=True) parser.add_argument("--sync-timeout", type=int, required=True) @@ -200,8 +156,7 @@ def run(argv: Sequence[str] | None = None) -> int: _cleanup_dir(args.cleanup_dir) return 1 - backend_pid_file = Path(args.backend_pid_file) - if not _ensure_backend_port_free(args.backend_port, backend_pid_file): + if not _ensure_backend_port_free(args.backend_port): _record_handoff_log(f"backend_port_unavailable port={args.backend_port}") _cleanup_dir(args.cleanup_dir) return 1 @@ -226,13 +181,6 @@ def run(argv: Sequence[str] | None = None) -> int: _cleanup_dir(args.cleanup_dir) return 1 - _record_backend_runtime_if_direct_serve( - process, - restart_argv, - backend_host=args.backend_host, - backend_port=args.backend_port, - backend_pid_file=backend_pid_file, - ) _record_handoff_log(f"restart_spawned pid={process.pid}") _cleanup_dir(args.cleanup_dir) return 0 diff --git a/flocks/updater/updater.py b/flocks/updater/updater.py index d076af229..9f5a75552 100644 --- a/flocks/updater/updater.py +++ b/flocks/updater/updater.py @@ -1888,13 +1888,19 @@ def print(self, *args, **kwargs) -> None: def _current_service_config(): from flocks.cli import service_manager + from flocks.cli.service_control import read_supervisor_status - paths = service_manager.ensure_runtime_dirs() + try: + payload = read_supervisor_status(paths=service_manager.runtime_paths(), timeout=1.0) + except Exception: + payload = {} + + config = payload.get("config") if isinstance(payload.get("config"), dict) else {} return service_manager.ServiceConfig( - backend_host=service_manager._recorded_host(paths.backend_pid, service_manager.ServiceConfig.backend_host), - backend_port=service_manager._recorded_port(paths.backend_pid, service_manager.ServiceConfig.backend_port), - frontend_host=service_manager._recorded_host(paths.frontend_pid, service_manager.ServiceConfig.frontend_host), - frontend_port=service_manager._recorded_port(paths.frontend_pid, service_manager.ServiceConfig.frontend_port), + backend_host=str(config.get("backend_host") or service_manager.ServiceConfig.backend_host), + backend_port=int(config.get("backend_port") or service_manager.ServiceConfig.backend_port), + frontend_host=str(config.get("frontend_host") or service_manager.ServiceConfig.frontend_host), + frontend_port=int(config.get("frontend_port") or service_manager.ServiceConfig.frontend_port), no_browser=True, skip_frontend_build=True, ) @@ -2068,6 +2074,7 @@ def _stop_upgrade_page_server(*, frontend_port: int | None = None) -> None: def _prepare_upgrade_handover(version: str) -> dict[str, Any]: from flocks.cli import service_manager + from flocks.cli.service_control import post_control_json config = _current_service_config() payload: dict[str, Any] = { @@ -2082,9 +2089,8 @@ def _prepare_upgrade_handover(version: str) -> dict[str, Any]: _persist_upgrade_state(payload, last_error=None) console = _NullConsole() - paths = service_manager.ensure_runtime_dirs() - frontend_port = service_manager._recorded_port(paths.frontend_pid, config.frontend_port) - service_manager.stop_one(frontend_port, paths.frontend_pid, "WebUI", console) + paths = service_manager.runtime_paths() + post_control_json("/stop/webui", paths=paths, timeout=30.0) try: payload.update(_start_upgrade_page_server(config, version)) @@ -2097,7 +2103,7 @@ def _prepare_upgrade_handover(version: str) -> dict[str, Any]: _stop_upgrade_page_server(frontend_port=config.frontend_port) _clear_upgrade_state() try: - service_manager.start_frontend(config, console) + _start_frontend_with_fallback(config, console, allow_build_fallback=False) except Exception as restart_error: log.error("updater.frontend.restore_failed", {"error": str(restart_error)}) raise @@ -2181,15 +2187,25 @@ def read_upgrade_runtime_state(frontend_port: int | None = None) -> dict[str, An def _start_frontend_with_fallback(config, console, *, allow_build_fallback: bool) -> None: - from flocks.cli import service_manager + from flocks.cli.service_control import post_control_json, service_config_payload try: - service_manager.start_frontend(config, console) + payload = post_control_json( + "/restart/webui", + payload=service_config_payload(config), + paths=None, + timeout=180.0, + ) + webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} + if webui.get("state") != "healthy": + raise RuntimeError(str(webui.get("last_error") or "WebUI restart did not become healthy")) return except Exception: if not allow_build_fallback or not config.skip_frontend_build: raise + from flocks.cli import service_manager + rebuilt_config = service_manager.ServiceConfig( backend_host=config.backend_host, backend_port=config.backend_port, @@ -2198,7 +2214,12 @@ def _start_frontend_with_fallback(config, console, *, allow_build_fallback: bool no_browser=config.no_browser, skip_frontend_build=False, ) - service_manager.start_frontend(rebuilt_config, console) + payload = service_config_payload(rebuilt_config) + payload["force_frontend_build"] = True + result = post_control_json("/restart/webui", payload=payload, paths=None, timeout=180.0) + webui = result.get("webui") if isinstance(result.get("webui"), dict) else {} + if webui.get("state") != "healthy": + raise RuntimeError(str(webui.get("last_error") or "WebUI restart did not become healthy")) def cleanup_orphan_upgrade_state(*, frontend_port: int | None = None) -> bool: @@ -3442,13 +3463,10 @@ def _build_restart_handoff_argv( cleanup_dir: Path | None = None, ) -> list[str]: """Wrap the real restart command in a helper that finishes upgrade work.""" - from flocks.cli import service_manager - if not restart_argv: raise ValueError("restart command is empty") config = _current_service_config() - paths = service_manager.ensure_runtime_dirs() argv = [ restart_argv[0], "-m", @@ -3463,8 +3481,6 @@ def _build_restart_handoff_argv( str(config.frontend_host), "--frontend-port", str(config.frontend_port), - "--backend-pid-file", - str(paths.backend_pid), "--install-root", str(install_root), "--uv-path", diff --git a/tests/cli/test_service_commands.py b/tests/cli/test_service_commands.py index 49d92a200..d80928bc5 100644 --- a/tests/cli/test_service_commands.py +++ b/tests/cli/test_service_commands.py @@ -31,7 +31,7 @@ def test_cli_help_lists_service_commands(monkeypatch, tmp_path) -> None: assert result.exit_code == 0 for command in ("start", "stop", "restart", "status", "logs", "session", "mcp", "task", "skills"): assert _help_contains_command(result.stdout, command) - for command in ("agent", "acp", "debug", "run", "serve", "service-watchdog", "auth", "models"): + for command in ("agent", "acp", "debug", "run", "serve", "service-watchdog", "service-daemon", "auth", "models"): assert not _help_contains_command(result.stdout, command) diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 286f9541a..e56e75e7c 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -1,6 +1,4 @@ -import contextlib import json -import signal import sys from pathlib import Path from types import SimpleNamespace @@ -9,6 +7,7 @@ import pytest from flocks.cli import service_manager +from flocks.cli import service_supervisor class DummyConsole: @@ -31,6 +30,21 @@ def _make_runtime_paths(tmp_path: Path) -> service_manager.RuntimePaths: ) +def _write_legacy_runtime_record(pid_file: Path, record: service_manager.RuntimeRecord) -> None: + payload: dict[str, object] = {"pid": record.pid} + if record.pgid is not None: + payload["pgid"] = record.pgid + if record.host is not None: + payload["host"] = record.host + if record.port is not None: + payload["port"] = record.port + if record.command: + payload["command"] = list(record.command) + if record.started_at is not None: + payload["started_at"] = record.started_at + pid_file.write_text(json.dumps(payload, ensure_ascii=True, sort_keys=True), encoding="utf-8") + + def test_runtime_paths_follow_flocks_root_env(monkeypatch, tmp_path: Path) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path)) @@ -253,7 +267,7 @@ def test_runtime_record_round_trip_preserves_metadata(tmp_path: Path) -> None: started_at=1234.5, ) - service_manager.write_runtime_record(pid_file, record) + _write_legacy_runtime_record(pid_file, record) assert json.loads(pid_file.read_text(encoding="utf-8")) == { "command": ["python", "-m", "uvicorn"], @@ -276,7 +290,7 @@ def test_runtime_record_round_trip_preserves_host(tmp_path: Path) -> None: started_at=1234.5, ) - service_manager.write_runtime_record(pid_file, record) + _write_legacy_runtime_record(pid_file, record) assert json.loads(pid_file.read_text(encoding="utf-8")) == { "command": ["python", "-m", "uvicorn"], @@ -298,7 +312,7 @@ def test_read_runtime_record_rejects_invalid_content(tmp_path: Path) -> None: def test_cleanup_stale_pid_file_keeps_live_process_group(monkeypatch, tmp_path: Path) -> None: pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( + _write_legacy_runtime_record( pid_file, service_manager.RuntimeRecord(pid=1001, pgid=2002, port=8000), ) @@ -313,7 +327,7 @@ def test_cleanup_stale_pid_file_keeps_live_process_group(monkeypatch, tmp_path: def test_cleanup_stale_pid_file_removes_reused_windows_pid(monkeypatch, tmp_path: Path) -> None: pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( + _write_legacy_runtime_record( pid_file, service_manager.RuntimeRecord( pid=1232, @@ -588,7 +602,7 @@ def _client_factory(*, timeout, trust_env): captured["trust_env"] = trust_env return _FakeClient() - monkeypatch.setattr(service_manager.httpx, "Client", _client_factory) + monkeypatch.setattr(service_supervisor.httpx, "Client", _client_factory) service_manager.wait_for_http( ["http://127.0.0.1:8000/api/health"], @@ -685,139 +699,117 @@ def test_resolve_flocks_cli_command_falls_back_to_python_module(monkeypatch, tmp ] -def test_build_status_lines_reports_running_and_idle_services(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - paths.backend_pid.write_text("111", encoding="utf-8") - paths.frontend_pid.write_text("222", encoding="utf-8") +def _supervisor_status_payload() -> dict[str, object]: + return { + "daemon": { + "pid": 100, + "state": "running", + "log_path": "/tmp/logs/supervisor.log", + }, + "backend": { + "pid": 111, + "host": "0.0.0.0", + "port": 9000, + "state": "healthy", + "last_error": None, + "log_path": "/tmp/logs/backend.log", + }, + "webui": { + "pid": 222, + "host": "0.0.0.0", + "port": 5174, + "state": "healthy", + "last_error": None, + "log_path": "/tmp/logs/webui.log", + }, + } - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _: None) - monkeypatch.setattr( - service_manager, - "port_owner_pids", - lambda port: [111] if port == 8000 else [], - ) - monkeypatch.setattr(service_manager, "pid_is_running", lambda pid: pid == 222) + +def test_build_status_lines_reports_supervisor_control_status(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + monkeypatch.setattr(service_manager, "read_control_json", lambda *_args, **_kwargs: _supervisor_status_payload()) lines = service_manager.build_status_lines(paths) - assert "后端运行中" in lines[0] - assert "WebUI 主进程仍在运行" in lines[1] + assert "Supervisor 运行中" in lines[0] + assert "http://127.0.0.1:9000" in lines[1] + assert "http://127.0.0.1:5174" in lines[2] -def test_build_status_lines_uses_custom_server_and_webui_ports(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - service_manager.write_runtime_record( - paths.backend_pid, - service_manager.RuntimeRecord(pid=111, host="0.0.0.0", port=9000), - ) - service_manager.write_runtime_record( - paths.frontend_pid, - service_manager.RuntimeRecord(pid=222, host="0.0.0.0", port=5174), - ) +def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + calls: list[str] = [] - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _: None) monkeypatch.setattr( service_manager, - "port_owner_pids", - lambda port: [111] if port in {9000, 5174} else [], + "read_control_json", + lambda *_args, **_kwargs: (_ for _ in ()).throw(service_manager.ServiceError("down")), ) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: calls.append("port_owner") or []) + monkeypatch.setattr(service_manager, "port_is_in_use", lambda *_args, **_kwargs: calls.append("port_in_use") or False) lines = service_manager.build_status_lines(paths) - assert "http://127.0.0.1:9000" in lines[0] - assert "http://127.0.0.1:5174" in lines[1] + assert lines[0] == "[flocks] Supervisor 未运行" + assert calls == [] -def test_start_all_stops_services_before_starting(monkeypatch) -> None: +def test_start_all_starts_supervisor_when_control_api_is_down(monkeypatch) -> None: call_order: list[str] = [] - paths = service_manager.RuntimePaths( - root=Path("/tmp"), - run_dir=Path("/tmp/run"), - log_dir=Path("/tmp/logs"), - backend_pid=Path("/tmp/run/backend.pid"), - frontend_pid=Path("/tmp/run/webui.pid"), - backend_log=Path("/tmp/logs/backend.log"), - frontend_log=Path("/tmp/logs/webui.log"), - ) + paths = _make_runtime_paths(Path("/tmp/flocks-test")) monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: (call_order.append("ensure_runtime_dirs"), paths)[1]) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call(call_order, "service_lock")) - monkeypatch.setattr(service_manager, "stop_one", lambda port, _pid_file, _name, _console: call_order.append(f"stop_one:{port}")) - monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: call_order.append("stop_browser") or []) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: False) monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda _config, _console: call_order.append("_start_all_without_stop")) service_manager.start_all(service_manager.ServiceConfig(), console=None) - assert call_order == [ - "ensure_runtime_dirs", - "service_lock", - "stop_one:5173", - "stop_one:8000", - "stop_browser", - "_start_all_without_stop", - ] + assert call_order == ["ensure_runtime_dirs", "_start_all_without_stop"] + + +def test_start_all_does_not_duplicate_running_supervisor(monkeypatch) -> None: + calls: list[str] = [] + console = DummyConsole() + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: _make_runtime_paths(Path("/tmp/flocks-test"))) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) + monkeypatch.setattr(service_manager, "show_status", lambda _console: calls.append("status")) + monkeypatch.setattr(service_manager, "open_default_browser", lambda _url, _console: calls.append("browser")) + monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda *_args: calls.append("start")) -def test_restart_all_stops_then_starts_under_lock(monkeypatch) -> None: + service_manager.start_all(service_manager.ServiceConfig(no_browser=True), console=console) + + assert calls == ["status"] + assert "[flocks] Supervisor 已在运行。" in console.messages + + +def test_restart_all_uses_supervisor_control_api(monkeypatch) -> None: call_order: list[str] = [] - paths = service_manager.RuntimePaths( - root=Path("/tmp"), - run_dir=Path("/tmp/run"), - log_dir=Path("/tmp/logs"), - backend_pid=Path("/tmp/run/backend.pid"), - frontend_pid=Path("/tmp/run/webui.pid"), - backend_log=Path("/tmp/logs/backend.log"), - frontend_log=Path("/tmp/logs/webui.log"), - ) + paths = _make_runtime_paths(Path("/tmp/flocks-test")) monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: (call_order.append("ensure_runtime_dirs"), paths)[1]) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call(call_order, "service_lock")) - monkeypatch.setattr(service_manager, "stop_one", lambda port, _pid_file, _name, _console: call_order.append(f"stop_one:{port}")) - monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: call_order.append("stop_browser") or []) - monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda _config, _console: call_order.append("_start_all_without_stop")) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) + monkeypatch.setattr( + service_manager, + "post_control_json", + lambda path, **_kwargs: call_order.append(path) or _supervisor_status_payload(), + ) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console: call_order.append("print_status")) service_manager.restart_all(service_manager.ServiceConfig(), console=None) - assert call_order == [ - "ensure_runtime_dirs", - "service_lock", - "stop_one:5173", - "stop_one:8000", - "stop_browser", - "_start_all_without_stop", - ] + assert call_order == ["ensure_runtime_dirs", "/restart", "print_status"] -def test_start_all_without_stop_starts_watchdog_after_frontend(monkeypatch, tmp_path: Path) -> None: +def test_start_all_without_stop_starts_supervisor_daemon(monkeypatch, tmp_path: Path) -> None: paths = _make_runtime_paths(tmp_path) calls: list[str] = [] monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "start_backend", lambda _config, _console: calls.append("backend")) - monkeypatch.setattr(service_manager, "start_frontend", lambda _config, _console: calls.append("webui")) - monkeypatch.setattr(service_manager, "start_watchdog", lambda _config, _console: calls.append("watchdog")) + monkeypatch.setattr(service_manager, "_start_supervisor_process", lambda _config, _paths, _console: calls.append("daemon") or SimpleNamespace(poll=lambda: None)) + monkeypatch.setattr(service_manager, "_wait_for_supervisor_ready", lambda _paths, **_kwargs: calls.append("ready") or _supervisor_status_payload()) monkeypatch.setattr(service_manager, "show_start_summary", lambda _config, _console: calls.append("summary")) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console: calls.append("status")) monkeypatch.setattr( service_manager, "open_default_browser", @@ -826,37 +818,23 @@ def test_start_all_without_stop_starts_watchdog_after_frontend(monkeypatch, tmp_ service_manager._start_all_without_stop(service_manager.ServiceConfig(no_browser=True), DummyConsole()) - assert calls == ["backend", "webui", "watchdog", "summary"] + assert calls == ["daemon", "ready", "summary", "status"] -def test_start_all_stops_on_failure_before_restart(monkeypatch) -> None: - paths = service_manager.RuntimePaths( - root=Path("/tmp"), - run_dir=Path("/tmp/run"), - log_dir=Path("/tmp/logs"), - backend_pid=Path("/tmp/run/backend.pid"), - frontend_pid=Path("/tmp/run/webui.pid"), - backend_log=Path("/tmp/logs/backend.log"), - frontend_log=Path("/tmp/logs/webui.log"), - ) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr( - service_manager, - "stop_one", - lambda *_args: (_ for _ in ()).throw(service_manager.ServiceError("stop failed")), - ) +def test_start_all_propagates_supervisor_start_failure(monkeypatch) -> None: + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: _make_runtime_paths(Path("/tmp/flocks-test"))) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: False) monkeypatch.setattr( service_manager, "_start_all_without_stop", - lambda *_args: (_ for _ in ()).throw(AssertionError("should not start")), + lambda *_args: (_ for _ in ()).throw(service_manager.ServiceError("daemon failed")), ) - with pytest.raises(service_manager.ServiceError, match="stop failed"): + with pytest.raises(service_manager.ServiceError, match="daemon failed"): service_manager.start_all(service_manager.ServiceConfig(), console=None) -def test_start_backend_writes_runtime_metadata(monkeypatch, tmp_path: Path) -> None: +def test_start_backend_process_does_not_write_runtime_metadata(monkeypatch, tmp_path: Path) -> None: paths = service_manager.RuntimePaths( root=tmp_path, run_dir=tmp_path / "run", @@ -901,24 +879,10 @@ def _capture_spawn(*_args, **kwargs) -> SimpleNamespace: monkeypatch.setattr(service_manager, "_spawn_process", _capture_spawn) - service_manager.start_backend(service_manager.ServiceConfig(), console) + process = service_manager._start_backend_process(service_manager.ServiceConfig(), console) - record = service_manager.read_runtime_record(paths.backend_pid) - assert record is not None - assert record.pid == 2468 - assert record.pgid == 2468 - assert record.host == "127.0.0.1" - assert record.port == 8000 - assert record.command == ( - "python", - "-m", - "flocks.cli.main", - "serve", - "--host", - "127.0.0.1", - "--port", - "8000", - ) + assert process.pid == 2468 + assert not paths.backend_pid.exists() assert probe_calls == [{ "urls": ["http://127.0.0.1:8000"], "name": "后端服务", @@ -944,7 +908,7 @@ def test_start_backend_rolls_back_when_probe_fails(monkeypatch, tmp_path: Path) paths.log_dir.mkdir(parents=True) paths.backend_log.write_text("line1\nline2\nboot failed here\n", encoding="utf-8") console = DummyConsole() - stop_calls: list[tuple[int, Path, str]] = [] + stop_calls: list[str] = [] monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) @@ -959,7 +923,7 @@ def test_start_backend_rolls_back_when_probe_fails(monkeypatch, tmp_path: Path) monkeypatch.setattr( service_manager, "_spawn_process", - lambda *_args, **_kwargs: SimpleNamespace(pid=2468), + lambda *_args, **_kwargs: SimpleNamespace(pid=2468, poll=lambda: None), ) monkeypatch.setattr( service_manager, @@ -968,14 +932,14 @@ def test_start_backend_rolls_back_when_probe_fails(monkeypatch, tmp_path: Path) ) monkeypatch.setattr( service_manager, - "stop_one", - lambda port, pid_file, name, _console: stop_calls.append((port, pid_file, name)), + "_terminate_process", + lambda _process, name, _console: stop_calls.append(name), ) with pytest.raises(service_manager.ServiceError, match="启动超时"): - service_manager.start_backend(service_manager.ServiceConfig(), console) + service_manager._start_backend_process(service_manager.ServiceConfig(), console) - assert stop_calls == [(8000, paths.backend_pid, "后端")] + assert stop_calls == ["后端"] joined = "\n".join(console.messages) assert "近期日志" in joined assert "boot failed here" in joined @@ -1017,15 +981,12 @@ def test_start_backend_reports_started_after_probe_succeeds(monkeypatch, tmp_pat lambda *_args, **_kwargs: None, ) - service_manager.start_backend(service_manager.ServiceConfig(), console) + service_manager._start_backend_process(service_manager.ServiceConfig(), console) - record = service_manager.read_runtime_record(paths.backend_pid) - assert record is not None - assert record.pid == 2468 backend_env = spawn_calls[0]["kwargs"]["env"] assert backend_env["_FLOCKS_WEBUI_HOST"] == "127.0.0.1" assert backend_env["_FLOCKS_WEBUI_PORT"] == "5173" - assert console.messages[-1] == f"[flocks] 后端已启动,日志: {paths.backend_log}" + assert not paths.backend_pid.exists() assert backend_env["FLOCKS_CONSOLE_BASE_URL"] == service_manager.DEFAULT_FLOCKS_CONSOLE_BASE_URL @@ -1066,7 +1027,7 @@ def test_start_backend_allows_overriding_console_base_url(monkeypatch, tmp_path: ) monkeypatch.setenv("FLOCKS_CONSOLE_BASE_URL", "https://custom-console.example.com") - service_manager.start_backend(service_manager.ServiceConfig(), console) + service_manager._start_backend_process(service_manager.ServiceConfig(), console) backend_env = spawn_calls[0]["kwargs"]["env"] assert backend_env["FLOCKS_CONSOLE_BASE_URL"] == "https://custom-console.example.com" @@ -1213,7 +1174,7 @@ def fake_spawn(command, **kwargs): frontend_host="0.0.0.0", frontend_port=5174, ) - service_manager.start_frontend(config, console) + service_manager._start_frontend_process(config, console) assert build_calls[0]["command"] == ["/usr/bin/npm", "run", "build"] assert build_calls[0]["kwargs"]["env"]["FLOCKS_API_PROXY_TARGET"] == "http://10.0.0.8:9000" @@ -1235,164 +1196,49 @@ def fake_spawn(command, **kwargs): assert preview_calls[0]["kwargs"]["env"]["__VITE_ADDITIONAL_SERVER_ALLOWED_HOSTS"] == "preview.example.com" assert "VITE_API_BASE_URL" not in preview_calls[0]["kwargs"]["env"] assert "VITE_WS_BASE_URL" not in preview_calls[0]["kwargs"]["env"] - record = service_manager.read_runtime_record(paths.frontend_pid) - assert record is not None - assert record.host == "0.0.0.0" - assert record.port == 5174 - - -def test_start_watchdog_writes_runtime_metadata(monkeypatch, tmp_path: Path) -> None: - paths = _make_runtime_paths(tmp_path) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - console = DummyConsole() - spawn_calls: list[dict[str, object]] = [] - - monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "runtime_record_is_running", lambda _record: False) - monkeypatch.setattr( - service_manager, - "resolve_flocks_cli_command", - lambda root=None: ["python", "-m", "flocks.cli.main"], - ) - monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: pid) - monkeypatch.setattr( - service_manager, - "_spawn_process", - lambda *args, **kwargs: spawn_calls.append({"args": args, "kwargs": kwargs}) or SimpleNamespace(pid=2468), - ) + assert not paths.frontend_pid.exists() - service_manager.start_watchdog( - service_manager.ServiceConfig(backend_host="0.0.0.0", backend_port=9000), - console, - ) - record = service_manager.read_runtime_record(service_manager.watchdog_pid_path(paths)) - assert record is not None - assert record.pid == 2468 - assert record.port is None - assert record.command == ( - "python", - "-m", - "flocks.cli.main", - "service-watchdog", - "--server-host", - "0.0.0.0", - "--server-port", - "9000", - "--webui-host", - "127.0.0.1", - "--webui-port", - "5173", - "--interval", - str(service_manager.WATCHDOG_CHECK_INTERVAL_SECONDS), - ) - assert spawn_calls[0]["kwargs"]["log_path"] == service_manager.watchdog_log_path(paths) +def _fake_process(pid: int, args: list[str] | None = None, returncode: int | None = None): + return SimpleNamespace(pid=pid, args=args or [str(pid)], returncode=returncode, poll=lambda: returncode) -def test_watchdog_recovers_backend_when_process_alive_but_port_not_listening(monkeypatch, tmp_path: Path) -> None: +def test_supervisor_recovers_backend_when_port_disappears(monkeypatch, tmp_path: Path) -> None: paths = _make_runtime_paths(tmp_path) - paths.run_dir.mkdir(parents=True) - service_manager.write_runtime_record( - paths.backend_pid, - service_manager.RuntimeRecord(pid=111, pgid=222, host="0.0.0.0", port=9995), - ) - calls: list[tuple[str, int]] = [] - - monkeypatch.setattr(service_manager, "runtime_record_is_running", lambda _record: True) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: False) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr( - service_manager, - "stop_one", - lambda port, _pid_file, _name, _console: calls.append(("stop", port)), - ) - monkeypatch.setattr( - service_manager, - "start_backend", - lambda config, _console: calls.append(("start", config.backend_port)), - ) - - next_count = service_manager._watchdog_tick( - service_manager.ServiceConfig(backend_port=8000), - paths, - 0, - ) - - assert next_count == 0 - assert calls == [("stop", 9995), ("start", 9995)] - - -def test_watchdog_does_not_recover_when_port_owned_by_unexpected_pid(monkeypatch, tmp_path: Path) -> None: - paths = _make_runtime_paths(tmp_path) - paths.run_dir.mkdir(parents=True) - service_manager.write_runtime_record( - paths.backend_pid, - service_manager.RuntimeRecord(pid=111, pgid=222, host="0.0.0.0", port=9995), - ) calls: list[str] = [] - - monkeypatch.setattr(service_manager, "runtime_record_is_running", lambda _record: True) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [999]) - monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: True) - monkeypatch.setattr(service_manager, "_runtime_record_pids", lambda _record: [111]) - monkeypatch.setattr(service_manager, "_recover_unhealthy_backend", lambda *_args: calls.append("recover")) - - next_count = service_manager._watchdog_tick( - service_manager.ServiceConfig(backend_port=9995), - paths, - 0, - ) - - assert next_count == 0 - assert calls == [] - - -def test_watchdog_recovers_backend_when_runtime_record_is_dead(monkeypatch, tmp_path: Path) -> None: - paths = _make_runtime_paths(tmp_path) - paths.run_dir.mkdir(parents=True) - service_manager.write_runtime_record( - paths.backend_pid, - service_manager.RuntimeRecord(pid=111, pgid=222, host="127.0.0.1", port=9995), - ) - calls: list[tuple[str, int]] = [] - - monkeypatch.setattr(service_manager, "runtime_record_is_running", lambda _record: False) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: False) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr( - service_manager, - "stop_one", - lambda port, _pid_file, _name, _console: calls.append(("stop", port)), - ) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig(backend_port=9995, frontend_port=9996)) + daemon.paths = paths + daemon.backend.log_path = paths.backend_log + daemon.webui.log_path = paths.frontend_log + daemon.backend.process = _fake_process(111, ["backend"]) + daemon.webui.process = _fake_process(222, ["webui"]) + + monkeypatch.setattr(service_supervisor, "_tcp_port_accepts_connections", lambda _host, port: port != 9995) + monkeypatch.setattr(service_manager, "_terminate_process", lambda _process, name, _console: calls.append(f"stop:{name}")) monkeypatch.setattr( service_manager, - "start_backend", - lambda config, _console: calls.append(("start", config.backend_port)), + "_start_backend_process", + lambda *_args, **_kwargs: calls.append("start:backend") or _fake_process(333, ["backend-new"]), ) - next_count = service_manager._watchdog_tick( - service_manager.ServiceConfig(backend_port=9995), - paths, - 0, - ) + daemon.tick() - assert next_count == 0 - assert calls == [("stop", 9995), ("start", 9995)] + assert calls == ["stop:后端", "start:backend"] + assert daemon.backend.pid == 333 -def test_watchdog_waits_for_second_health_failure_before_restart(monkeypatch, tmp_path: Path) -> None: +def test_supervisor_waits_for_second_backend_health_failure(monkeypatch, tmp_path: Path) -> None: paths = _make_runtime_paths(tmp_path) - paths.run_dir.mkdir(parents=True) - service_manager.write_runtime_record( - paths.backend_pid, - service_manager.RuntimeRecord(pid=111, pgid=222, host="127.0.0.1", port=9995), - ) calls: list[str] = [] + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + daemon = service_supervisor.SupervisorDaemon( + service_manager.ServiceConfig(backend_port=9995, frontend_port=9996), + failure_threshold=2, + ) + daemon.paths = paths + daemon.backend.process = _fake_process(111, ["backend"]) + daemon.webui.process = _fake_process(222, ["webui"]) class FakeClient: def __init__(self, *_args, **_kwargs) -> None: @@ -1407,27 +1253,44 @@ def __exit__(self, *_args) -> None: def get(self, _url): return httpx.Response(503, json={"status": "unhealthy"}) - monkeypatch.setattr(service_manager.httpx, "Client", FakeClient) - monkeypatch.setattr(service_manager, "runtime_record_is_running", lambda _record: True) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [111]) - monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: True) - monkeypatch.setattr(service_manager, "_runtime_record_pids", lambda _record: [111]) - monkeypatch.setattr(service_manager, "_recover_unhealthy_backend", lambda *_args: calls.append("recover")) - - first_count = service_manager._watchdog_tick( - service_manager.ServiceConfig(backend_port=9995), - paths, - 0, + monkeypatch.setattr(service_supervisor.httpx, "Client", FakeClient) + monkeypatch.setattr(service_supervisor, "_tcp_port_accepts_connections", lambda *_args: True) + monkeypatch.setattr(service_manager, "_terminate_process", lambda _process, name, _console: calls.append(f"stop:{name}")) + monkeypatch.setattr( + service_manager, + "_start_backend_process", + lambda *_args, **_kwargs: calls.append("start:backend") or _fake_process(333, ["backend-new"]), ) - second_count = service_manager._watchdog_tick( - service_manager.ServiceConfig(backend_port=9995), - paths, - first_count, + + daemon.tick() + assert calls == [] + assert daemon.backend.state == "degraded" + + daemon.tick() + assert calls == ["stop:后端", "start:backend"] + + +def test_supervisor_recovers_webui_when_port_disappears(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + calls: list[str] = [] + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig(backend_port=9995, frontend_port=9996)) + daemon.paths = paths + daemon.backend.process = _fake_process(111, ["backend"]) + daemon.webui.process = _fake_process(222, ["webui"]) + + monkeypatch.setattr(service_supervisor, "_tcp_port_accepts_connections", lambda _host, port: port != 9996) + monkeypatch.setattr(service_manager, "_terminate_process", lambda _process, name, _console: calls.append(f"stop:{name}")) + monkeypatch.setattr( + service_manager, + "_start_frontend_process", + lambda *_args, **_kwargs: calls.append("start:webui") or _fake_process(444, ["webui-new"]), ) - assert first_count == 1 - assert second_count == 0 - assert calls == ["recover"] + daemon.tick() + + assert calls == ["stop:WebUI", "start:webui"] + assert daemon.webui.pid == 444 def test_start_frontend_tolerates_windows_node_assertion_after_build(monkeypatch, tmp_path: Path) -> None: @@ -1471,7 +1334,7 @@ def fake_spawn(command, **_kwargs): monkeypatch.setattr(service_manager.subprocess, "run", fake_run) monkeypatch.setattr(service_manager, "_spawn_process", fake_spawn) - service_manager.start_frontend(service_manager.ServiceConfig(), console) + service_manager._start_frontend_process(service_manager.ServiceConfig(), console) assert preview_calls[0][:3] == ["npm.cmd", "run", "preview"] assert "[flocks] WebUI 构建产物已生成,忽略 Windows Node.js 退出断言。" in console.messages @@ -1519,7 +1382,7 @@ def fake_spawn(command, **kwargs): frontend_host="0.0.0.0", frontend_port=5174, ) - service_manager.start_frontend(config, console) + service_manager._start_frontend_process(config, console) assert build_calls[0]["kwargs"]["env"]["VITE_API_BASE_URL"] == "http://10.0.0.8:9000" assert build_calls[0]["kwargs"]["env"]["VITE_WS_BASE_URL"] == "ws://10.0.0.8:9000" @@ -1557,12 +1420,12 @@ def fake_run(command, **_kwargs): monkeypatch.setattr(service_manager.subprocess, "run", fake_run) monkeypatch.setattr(service_manager, "_spawn_process", lambda *_args, **_kwargs: SimpleNamespace(pid=2468)) - service_manager.start_frontend(service_manager.ServiceConfig(), console) + service_manager._start_frontend_process(service_manager.ServiceConfig(), console) assert build_calls[0][0] == r"C:\Users\flocks\AppData\Local\Programs\Flocks\tools\node\npm.cmd" -def test_start_backend_raises_on_port_record_mismatch(monkeypatch, tmp_path: Path) -> None: +def test_start_backend_raises_when_port_has_listener(monkeypatch, tmp_path: Path) -> None: paths = service_manager.RuntimePaths( root=tmp_path, run_dir=tmp_path / "run", @@ -1574,15 +1437,15 @@ def test_start_backend_raises_on_port_record_mismatch(monkeypatch, tmp_path: Pat ) paths.run_dir.mkdir(parents=True) paths.log_dir.mkdir(parents=True) - service_manager.write_runtime_record(paths.backend_pid, service_manager.RuntimeRecord(pid=1111, port=8000)) + _write_legacy_runtime_record(paths.backend_pid, service_manager.RuntimeRecord(pid=1111, port=8000)) monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [9999]) - with pytest.raises(service_manager.ServiceError, match="运行时记录不一致"): - service_manager.start_backend(service_manager.ServiceConfig(), DummyConsole()) + with pytest.raises(service_manager.ServiceError, match="端口 8000 已被占用"): + service_manager._start_backend_process(service_manager.ServiceConfig(), DummyConsole()) def test_start_backend_raises_when_port_in_use_without_pid_lookup(monkeypatch, tmp_path: Path) -> None: @@ -1605,7 +1468,7 @@ def test_start_backend_raises_when_port_in_use_without_pid_lookup(monkeypatch, t monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: True) with pytest.raises(service_manager.ServiceError, match="无法识别占用 PID"): - service_manager.start_backend(service_manager.ServiceConfig(), DummyConsole()) + service_manager._start_backend_process(service_manager.ServiceConfig(), DummyConsole()) def test_spawn_process_uses_hidden_window_flags_on_windows(monkeypatch, tmp_path: Path) -> None: @@ -1736,309 +1599,11 @@ def fake_popen(*args, **kwargs): assert captured["kwargs"]["env"] == env -def test_stop_one_prefers_process_group_on_unix(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( - pid_file, - service_manager.RuntimeRecord(pid=111, pgid=222, port=8000), - ) - console = DummyConsole() - group_alive = {"value": True} - group_signals: list[tuple[signal.Signals, int | None]] = [] - pid_signals: list[tuple[signal.Signals, list[int]]] = [] - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111, 112]) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) - monkeypatch.setattr(service_manager, "process_group_is_running", lambda pgid: bool(pgid == 222 and group_alive["value"])) - - def fake_signal_group(sig, pgid): - group_signals.append((sig, pgid)) - if sig == signal.SIGTERM: - group_alive["value"] = False - - monkeypatch.setattr(service_manager, "signal_process_group", fake_signal_group) - monkeypatch.setattr( - service_manager, - "signal_pid_list", - lambda sig, pids: pid_signals.append((sig, list(pids))), - ) - - service_manager.stop_one(8000, pid_file, "后端", console) - - assert group_signals == [(signal.SIGTERM, 222)] - assert pid_signals == [] - assert not pid_file.exists() - - -def test_stop_one_falls_back_to_pid_signals_without_process_group(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - pid_file.write_text("111", encoding="utf-8") - console = DummyConsole() - pid_signals: list[tuple[signal.Signals, list[int]]] = [] - alive = {"value": True} - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111, 112]) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: alive["value"]) - monkeypatch.setattr(service_manager, "process_group_is_running", lambda _pgid: False) - monkeypatch.setattr( - service_manager, - "signal_pid_list", - lambda sig, pids: ( - pid_signals.append((sig, list(pids))), - alive.__setitem__("value", False), - ), - ) - - service_manager.stop_one(8000, pid_file, "后端", console) - - assert pid_signals[0] == (signal.SIGTERM, [111, 112]) - assert not pid_file.exists() - - -def test_stop_one_uses_taskkill_on_windows(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - pid_file.write_text("111", encoding="utf-8") - console = DummyConsole() - commands: list[list[str]] = [] - alive = {"value": True} - - monkeypatch.setattr(service_manager.sys, "platform", "win32") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111, 222]) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: alive["value"]) - - def fake_run(args, **kwargs): - commands.append(list(args)) - alive["value"] = False - return SimpleNamespace(returncode=0) - - monkeypatch.setattr(service_manager.subprocess, "run", fake_run) - - service_manager.stop_one(8000, pid_file, "后端", console) - - assert commands == [ - ["taskkill", "/PID", "111", "/T", "/F"], - ["taskkill", "/PID", "222", "/T", "/F"], - ] - - -def test_stop_one_skips_taskkill_for_reused_windows_pid(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( - pid_file, - service_manager.RuntimeRecord( - pid=111, - host="127.0.0.1", - port=8000, - command=("python.exe", "-m", "flocks.cli.main", "serve"), - ), - ) - console = DummyConsole() - - monkeypatch.setattr(service_manager.sys, "platform", "win32") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111]) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda pid: pid == 111) - monkeypatch.setattr( - service_manager, - "_windows_process_snapshot", - lambda _pid: { - "name": "svchost.exe", - "command_line": r"C:\Windows\System32\svchost.exe -k netsvcs", - "executable_path": r"C:\Windows\System32\svchost.exe", - }, - ) - monkeypatch.setattr( - service_manager.subprocess, - "run", - lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("taskkill should not run")), - ) - - service_manager.stop_one(8000, pid_file, "后端", console) - - assert console.messages[-1] == "[flocks] 后端 未运行。" - assert not pid_file.exists() - - -def test_stop_one_force_kill_refreshes_process_group_members(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( - pid_file, - service_manager.RuntimeRecord(pid=111, pgid=222, port=8000), - ) - console = DummyConsole() - pid_signals: list[tuple[signal.Signals, list[int]]] = [] - group_signals: list[tuple[signal.Signals, int | None]] = [] - alive_group_members = {333} - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111]) - monkeypatch.setattr(service_manager, "_process_group_member_pids", lambda pgid: [333] if pgid == 222 and alive_group_members else []) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda pid: pid in alive_group_members) - monkeypatch.setattr(service_manager, "process_group_is_running", lambda pgid: bool(pgid == 222 and alive_group_members)) - monkeypatch.setattr(service_manager.time, "sleep", lambda _delay: None) - - def fake_signal_group(sig, pgid): - group_signals.append((sig, pgid)) - def fake_signal_pid_list(sig, pids): - pid_list = list(pids) - pid_signals.append((sig, pid_list)) - if sig == signal.SIGKILL and 333 in pid_list: - alive_group_members.clear() - - monkeypatch.setattr(service_manager, "signal_process_group", fake_signal_group) - monkeypatch.setattr(service_manager, "signal_pid_list", fake_signal_pid_list) - - service_manager.stop_one(8000, pid_file, "后端", console) - - assert (signal.SIGTERM, 222) in group_signals - assert any(sig == signal.SIGKILL and 333 in pids for sig, pids in pid_signals) - assert not pid_file.exists() - assert console.messages[-1] == "[flocks] 后端 已停止。" - - -def test_stop_one_keeps_runtime_record_when_force_kill_still_times_out(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( - pid_file, - service_manager.RuntimeRecord(pid=111, pgid=222, port=8000), - ) - console = DummyConsole() - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111]) - monkeypatch.setattr(service_manager, "_process_group_member_pids", lambda pgid: [333] if pgid == 222 else []) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) - monkeypatch.setattr(service_manager, "process_group_is_running", lambda pgid: pgid == 222) - monkeypatch.setattr(service_manager, "signal_process_group", lambda *_args: None) - monkeypatch.setattr(service_manager, "signal_pid_list", lambda *_args: None) - monkeypatch.setattr(service_manager.time, "sleep", lambda _delay: None) - - with pytest.raises(service_manager.ServiceError, match="未在预期时间内退出"): - service_manager.stop_one(8000, pid_file, "后端", console) - - assert pid_file.exists() - - -@contextlib.contextmanager -def _record_call(call_order: list[str], name: str): - call_order.append(name) - yield - - -def test_stop_all_reads_port_from_runtime_record(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - service_manager.write_runtime_record(paths.backend_pid, service_manager.RuntimeRecord(pid=111, port=9995)) - service_manager.write_runtime_record(paths.frontend_pid, service_manager.RuntimeRecord(pid=222, port=9996)) - calls: list[tuple[int, Path, str]] = [] - - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: []) - monkeypatch.setattr( - service_manager, - "stop_one", - lambda port, pid_file, name, _console: calls.append((port, pid_file, name)), - ) - - service_manager.stop_all(console=None) - - assert calls == [ - (9996, paths.frontend_pid, "WebUI"), - (9995, paths.backend_pid, "后端"), - ] - - -def test_stop_all_falls_back_to_default_port_when_record_missing(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - calls: list[int] = [] - - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: []) - monkeypatch.setattr(service_manager, "stop_one", lambda port, *_args: calls.append(port)) - - service_manager.stop_all(console=None) - - assert calls == [5173, 8000] - - -def test_stop_all_falls_back_to_default_port_when_record_has_no_port(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.backend_pid.write_text("111", encoding="utf-8") - paths.frontend_pid.write_text("222", encoding="utf-8") - calls: list[int] = [] - - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: []) - monkeypatch.setattr(service_manager, "stop_one", lambda port, *_args: calls.append(port)) - - service_manager.stop_all(console=None) - - assert calls == [5173, 8000] - - -def test_stop_all_also_cleans_browser_daemons(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) +def test_stop_all_uses_supervisor_control_api(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) calls: list[str] = [] - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr( - service_manager, - "stop_one", - lambda _port, _pid_file, name, _console: calls.append(name), - ) - monkeypatch.setattr( - service_manager, - "stop_all_browser_daemons", - lambda: calls.append("browser") or ["default", "remote"], - ) - class FakeConsole: def __init__(self) -> None: self.messages = [] @@ -2046,159 +1611,42 @@ def __init__(self) -> None: def print(self, message) -> None: self.messages.append(message) + states = iter([True, False]) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: next(states)) + monkeypatch.setattr(service_manager, "post_control_json", lambda path, **_kwargs: calls.append(path) or {"status": "stopping"}) + console = FakeConsole() service_manager.stop_all(console=console) - assert calls == ["WebUI", "后端", "browser"] - assert console.messages == [] - - -def test_build_status_lines_reads_port_from_runtime_record(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - service_manager.write_runtime_record(paths.backend_pid, service_manager.RuntimeRecord(pid=111, port=9995)) - service_manager.write_runtime_record(paths.frontend_pid, service_manager.RuntimeRecord(pid=222, port=9996)) - - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda port: [port] if port in {9995, 9996} else []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) + assert calls == ["/stop"] + assert console.messages == ["[flocks] Supervisor 已停止。"] - lines = service_manager.build_status_lines(paths) - - assert "http://127.0.0.1:9995" in lines[0] - assert "http://127.0.0.1:9996" in lines[1] - - -def test_build_status_lines_uses_recorded_host(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - service_manager.write_runtime_record( - paths.backend_pid, - service_manager.RuntimeRecord(pid=111, host="10.0.0.8", port=9000), - ) - service_manager.write_runtime_record( - paths.frontend_pid, - service_manager.RuntimeRecord(pid=222, host="0.0.0.0", port=5174), - ) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda port: [111] if port == 9000 else [222]) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) +def test_stop_all_reports_when_supervisor_is_down(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + console = DummyConsole() - lines = service_manager.build_status_lines(paths) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: False) - assert "http://10.0.0.8:9000" in lines[0] - assert "http://127.0.0.1:5174" in lines[1] + service_manager.stop_all(console) + assert console.messages == ["[flocks] Supervisor 未运行。"] -def test_build_status_lines_uses_unknown_pid_when_bind_fallback_detects_listener( - monkeypatch, tmp_path: Path -) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: True) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) - monkeypatch.setattr(service_manager, "process_group_is_running", lambda _pgid: False) +def test_status_lines_include_control_api_errors(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + payload = _supervisor_status_payload() + payload["backend"]["state"] = "degraded" + payload["backend"]["last_error"] = "health failed" + monkeypatch.setattr(service_manager, "read_control_json", lambda *_args, **_kwargs: payload) lines = service_manager.build_status_lines(paths) - assert "PID=unknown" in lines[0] - assert "PID=unknown" in lines[1] - - -def test_service_lock_prevents_concurrent_operations(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - state = {"locked": False} - - class FakeFcntl: - LOCK_EX = 1 - LOCK_NB = 2 - LOCK_UN = 4 - - @staticmethod - def flock(_handle, operation): - if operation == FakeFcntl.LOCK_UN: - state["locked"] = False - return - if state["locked"]: - raise OSError("busy") - state["locked"] = True - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "fcntl", FakeFcntl) - - with service_manager.service_lock(paths): - with pytest.raises(service_manager.ServiceError, match="另一个 flocks 命令正在执行"): - with service_manager.service_lock(paths): - raise AssertionError("should not acquire nested lock") - - -def test_service_lock_releases_on_completion(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - operations: list[int] = [] - - class FakeFcntl: - LOCK_EX = 1 - LOCK_NB = 2 - LOCK_UN = 4 - - @staticmethod - def flock(_handle, operation): - operations.append(operation) - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "fcntl", FakeFcntl) - - with service_manager.service_lock(paths): - pass + assert "state=degraded" in lines[1] + assert "last_error=health failed" in lines[1] - assert operations == [FakeFcntl.LOCK_EX | FakeFcntl.LOCK_NB, FakeFcntl.LOCK_UN] def test_log_startup_config_appends_to_log_file(tmp_path: Path) -> None: diff --git a/tests/server/test_server_port_config.py b/tests/server/test_server_port_config.py index 059acca42..ca275ea6b 100644 --- a/tests/server/test_server_port_config.py +++ b/tests/server/test_server_port_config.py @@ -240,24 +240,28 @@ def fake_restart_all(config, _console): assert captured["config"].frontend_host == "127.0.0.1" assert captured["config"].frontend_port == 5273 - def test_restart_reuses_runtime_recorded_host_and_port(self, monkeypatch, tmp_path: Path): - """Test restart reuses last runtime host/port when CLI and env omit them.""" + def test_restart_reuses_supervisor_recorded_host_and_port(self, monkeypatch, tmp_path: Path): + """Test restart reuses supervisor host/port when CLI and env omit them.""" captured = {} - paths = SimpleNamespace( - backend_pid=tmp_path / "backend.pid", - frontend_pid=tmp_path / "webui.pid", - ) - records = { - paths.backend_pid: SimpleNamespace(host="0.0.0.0", port=9000), - paths.frontend_pid: SimpleNamespace(host="0.0.0.0", port=5174), - } + paths = SimpleNamespace(run_dir=tmp_path) def fake_restart_all(config, _console): captured["config"] = config monkeypatch.setattr(cli_main, "restart_all", fake_restart_all) monkeypatch.setattr(cli_main, "runtime_paths", lambda: paths) - monkeypatch.setattr(cli_main, "read_runtime_record", lambda path: records.get(path)) + monkeypatch.setattr( + cli_main, + "read_supervisor_status", + lambda **_kwargs: { + "config": { + "backend_host": "0.0.0.0", + "backend_port": 9000, + "frontend_host": "0.0.0.0", + "frontend_port": 5174, + } + }, + ) Config._global_config = None result = CliRunner().invoke(cli_main.app, ["restart"]) @@ -268,13 +272,10 @@ def fake_restart_all(config, _console): assert captured["config"].frontend_host == "0.0.0.0" assert captured["config"].frontend_port == 5174 - def test_restart_cli_options_override_runtime_record(self, monkeypatch, tmp_path: Path): - """Test explicit restart CLI options override runtime-recorded host/port.""" + def test_restart_cli_options_override_supervisor_record(self, monkeypatch, tmp_path: Path): + """Test explicit restart CLI options override supervisor host/port.""" captured = {} - paths = SimpleNamespace( - backend_pid=tmp_path / "backend.pid", - frontend_pid=tmp_path / "webui.pid", - ) + paths = SimpleNamespace(run_dir=tmp_path) def fake_restart_all(config, _console): captured["config"] = config @@ -283,11 +284,15 @@ def fake_restart_all(config, _console): monkeypatch.setattr(cli_main, "runtime_paths", lambda: paths) monkeypatch.setattr( cli_main, - "read_runtime_record", - lambda path: SimpleNamespace( - host="0.0.0.0", - port=9000 if Path(path) == paths.backend_pid else 5174, - ), + "read_supervisor_status", + lambda **_kwargs: { + "config": { + "backend_host": "0.0.0.0", + "backend_port": 9000, + "frontend_host": "0.0.0.0", + "frontend_port": 5174, + } + }, ) Config._global_config = None @@ -312,13 +317,10 @@ def fake_restart_all(config, _console): assert captured["config"].frontend_host == "127.0.0.1" assert captured["config"].frontend_port == 5273 - def test_restart_environment_overrides_runtime_record(self, monkeypatch, tmp_path: Path): - """Test restart environment variables still override runtime-recorded host/port.""" + def test_restart_environment_overrides_supervisor_record(self, monkeypatch, tmp_path: Path): + """Test restart environment variables still override supervisor host/port.""" captured = {} - paths = SimpleNamespace( - backend_pid=tmp_path / "backend.pid", - frontend_pid=tmp_path / "webui.pid", - ) + paths = SimpleNamespace(run_dir=tmp_path) def fake_restart_all(config, _console): captured["config"] = config @@ -327,11 +329,15 @@ def fake_restart_all(config, _console): monkeypatch.setattr(cli_main, "runtime_paths", lambda: paths) monkeypatch.setattr( cli_main, - "read_runtime_record", - lambda path: SimpleNamespace( - host="0.0.0.0", - port=9000 if Path(path) == paths.backend_pid else 5174, - ), + "read_supervisor_status", + lambda **_kwargs: { + "config": { + "backend_host": "0.0.0.0", + "backend_port": 9000, + "frontend_host": "0.0.0.0", + "frontend_port": 5174, + } + }, ) monkeypatch.setenv("FLOCKS_SERVER_HOST", "127.0.0.1") monkeypatch.setenv("FLOCKS_SERVER_PORT", "9101") diff --git a/tests/updater/test_restart_handoff.py b/tests/updater/test_restart_handoff.py index 6f9ceaab2..b3464a89c 100644 --- a/tests/updater/test_restart_handoff.py +++ b/tests/updater/test_restart_handoff.py @@ -16,8 +16,6 @@ def _handoff_args(tmp_path: Path, restart_argv: list[str]) -> list[str]: "127.0.0.1", "--frontend-port", "5173", - "--backend-pid-file", - str(tmp_path / "backend.pid"), "--install-root", str(tmp_path), "--uv-path", @@ -49,7 +47,7 @@ def test_run_waits_for_parent_and_backend_port_before_spawning( monkeypatch.setattr( restart_handoff, "_ensure_backend_port_free", - lambda backend_port, backend_pid_file: events.append(f"free-port:{backend_port}:{backend_pid_file.name}") or True, + lambda backend_port: events.append(f"free-port:{backend_port}") or True, ) monkeypatch.setattr( restart_handoff.subprocess, @@ -57,11 +55,6 @@ def test_run_waits_for_parent_and_backend_port_before_spawning( lambda argv, cwd=None, close_fds=False: events.append(f"spawn:{list(argv)}:{cwd}:{close_fds}") or SimpleNamespace(pid=4321), ) - monkeypatch.setattr( - restart_handoff, - "_record_backend_runtime_if_direct_serve", - lambda process, argv, **kwargs: events.append(f"record:{process.pid}:{list(argv)}:{kwargs['backend_port']}"), - ) monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: events.append("tasks") or None) code = restart_handoff.run(_handoff_args(tmp_path, restart_argv)) @@ -69,10 +62,9 @@ def test_run_waits_for_parent_and_backend_port_before_spawning( assert code == 0 assert events[1:] == [ "wait-parent:1234", - "free-port:8000:backend.pid", + "free-port:8000", "tasks", f"spawn:{restart_argv}:{tmp_path}:True", - f"record:4321:{restart_argv}:8000", "log:restart_spawned pid=4321", ] @@ -101,7 +93,7 @@ def test_run_does_not_spawn_when_upgrade_tasks_fail(monkeypatch, tmp_path: Path) monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) - monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port, backend_pid_file: True) + monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port: True) monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: "sync failed") monkeypatch.setattr(restart_handoff, "_rollback_failed_upgrade", lambda args, error: events.append(f"rollback:{error}")) monkeypatch.setattr( @@ -132,7 +124,7 @@ def crash(_args): monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) - monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port, backend_pid_file: True) + monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port: True) monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", crash) monkeypatch.setattr(restart_handoff, "_rollback_failed_upgrade", lambda args, error: events.append(f"rollback:{error}")) monkeypatch.setattr( @@ -149,10 +141,9 @@ def crash(_args): assert "spawn" not in events -def test_ensure_backend_port_free_stops_backend_after_wait_timeout(monkeypatch, tmp_path: Path) -> None: +def test_ensure_backend_port_free_waits_again_after_timeout(monkeypatch) -> None: events: list[str] = [] wait_results = iter([False, True]) - backend_pid_file = tmp_path / "backend.pid" monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) monkeypatch.setattr( @@ -160,16 +151,10 @@ def test_ensure_backend_port_free_stops_backend_after_wait_timeout(monkeypatch, "_wait_for_backend_port_free", lambda port, **kwargs: events.append(f"wait:{port}:{kwargs.get('timeout_seconds')}") or next(wait_results), ) - monkeypatch.setattr( - restart_handoff.service_manager, - "stop_one", - lambda port, pid_file, name, console: events.append(f"stop:{port}:{pid_file.name}:{name}"), - ) - assert restart_handoff._ensure_backend_port_free(8000, backend_pid_file) is True + assert restart_handoff._ensure_backend_port_free(8000) is True assert events == [ "wait:8000:None", - "log:backend_port_still_in_use port=8000; stopping backend", - "stop:8000:backend.pid:backend", + "log:backend_port_still_in_use port=8000", "wait:8000:20.0", ] diff --git a/tests/updater/test_updater.py b/tests/updater/test_updater.py index 1a176274c..4b8932822 100644 --- a/tests/updater/test_updater.py +++ b/tests/updater/test_updater.py @@ -11,6 +11,7 @@ import pytest from flocks.cli import service_manager +from flocks.cli import service_control from flocks.updater import updater @@ -39,6 +40,15 @@ def _prepare_real_restart_runtime(install_root: Path) -> None: python_path.chmod(0o755) +def _webui_control_payload(state: str = "healthy", last_error: str | None = None) -> dict[str, object]: + return { + "webui": { + "state": state, + "last_error": last_error, + }, + } + + def test_run_handles_none_process_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: def fake_run(*args, **kwargs): return subprocess.CompletedProcess(args=args[0], returncode=0, stdout=None, stderr=None) @@ -995,36 +1005,23 @@ def test_prepare_upgrade_handover_writes_state_and_stops_frontend( tmp_path: Path, ) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - paths = service_manager.RuntimePaths( - root=tmp_path / ".flocks", - run_dir=tmp_path / ".flocks" / "run", - log_dir=tmp_path / ".flocks" / "logs", - backend_pid=tmp_path / ".flocks" / "run" / "backend.pid", - frontend_pid=tmp_path / ".flocks" / "run" / "webui.pid", - backend_log=tmp_path / ".flocks" / "logs" / "backend.log", - frontend_log=tmp_path / ".flocks" / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - calls: list[tuple[int, str]] = [] + calls: list[str] = [] monkeypatch.setattr(updater, "_current_service_config", lambda: service_manager.ServiceConfig()) monkeypatch.setattr( updater, "_start_upgrade_page_server", lambda config, version: {"upgrade_server_pid": 321, "page_dir": str(tmp_path / "page"), "page_log": str(tmp_path / "upgrade.log")}, ) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "_recorded_port", lambda _pid_file, default: default) monkeypatch.setattr( - service_manager, - "stop_one", - lambda port, _pid_file, name, _console: calls.append((port, name)), + service_control, + "post_control_json", + lambda path, **_kwargs: calls.append(path) or _webui_control_payload(), ) payload = updater._prepare_upgrade_handover("2026.3.31.1") - assert calls == [(5173, "WebUI")] + assert calls == ["/stop/webui"] assert payload["upgrade_server_pid"] == 321 assert updater._read_upgrade_state()["version"] == "2026.3.31.1" @@ -1034,46 +1031,28 @@ def test_prepare_upgrade_handover_restores_frontend_when_upgrade_page_fails( tmp_path: Path, ) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - paths = service_manager.RuntimePaths( - root=tmp_path / ".flocks", - run_dir=tmp_path / ".flocks" / "run", - log_dir=tmp_path / ".flocks" / "logs", - backend_pid=tmp_path / ".flocks" / "run" / "backend.pid", - frontend_pid=tmp_path / ".flocks" / "run" / "webui.pid", - backend_log=tmp_path / ".flocks" / "logs" / "backend.log", - frontend_log=tmp_path / ".flocks" / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - - calls: list[tuple[str, bool]] = [] + calls: list[tuple[str, bool | None]] = [] monkeypatch.setattr(updater, "_current_service_config", lambda: service_manager.ServiceConfig()) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "_recorded_port", lambda _pid_file, default: default) - monkeypatch.setattr( - service_manager, - "stop_one", - lambda port, _pid_file, name, _console: calls.append((f"stop:{name}:{port}", True)), - ) - - def fake_start_frontend(config, _console) -> None: - calls.append(("start_frontend", config.skip_frontend_build)) - monkeypatch.setattr(service_manager, "start_frontend", fake_start_frontend) monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: calls.append(("stop_page", True))) monkeypatch.setattr( updater, "_start_upgrade_page_server", lambda _config, _version: (_ for _ in ()).throw(RuntimeError("page failed")), ) + monkeypatch.setattr( + service_control, + "post_control_json", + lambda path, payload=None, **_kwargs: calls.append((path, None if payload is None else payload.get("skip_frontend_build"))) or _webui_control_payload(), + ) with pytest.raises(RuntimeError, match="page failed"): updater._prepare_upgrade_handover("2026.3.31.1") assert calls == [ - ("stop:WebUI:5173", True), + ("/stop/webui", None), ("stop_page", True), - ("start_frontend", False), + ("/restart/webui", False), ] assert updater._read_upgrade_state() is None @@ -1083,14 +1062,14 @@ def test_recover_upgrade_state_restarts_frontend_and_clears_marker( tmp_path: Path, ) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - started: list[tuple[int, bool]] = [] + started: list[tuple[int, bool | None]] = [] stopped: list[str] = [] monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: stopped.append("stop")) monkeypatch.setattr( - service_manager, - "start_frontend", - lambda config, _console: started.append((config.frontend_port, config.skip_frontend_build)), + service_control, + "post_control_json", + lambda _path, payload=None, **_kwargs: started.append((payload["frontend_port"], payload.get("skip_frontend_build"))) or _webui_control_payload(), ) updater._write_upgrade_state( { @@ -1115,16 +1094,20 @@ def test_recover_upgrade_state_retries_frontend_with_build_when_dist_is_missing( tmp_path: Path, ) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - starts: list[bool] = [] + starts: list[tuple[bool | None, bool | None]] = [] monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: None) - def fake_start_frontend(config, _console) -> None: - starts.append(config.skip_frontend_build) - if config.skip_frontend_build: - raise service_manager.ServiceError("missing dist") + results = iter([ + _webui_control_payload("degraded", "missing dist"), + _webui_control_payload(), + ]) + + def fake_restart_webui(_path, payload=None, **_kwargs): + starts.append((payload.get("skip_frontend_build"), payload.get("force_frontend_build"))) + return next(results) - monkeypatch.setattr(service_manager, "start_frontend", fake_start_frontend) + monkeypatch.setattr(service_control, "post_control_json", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.3.31.1", @@ -1138,7 +1121,7 @@ def fake_start_frontend(config, _console) -> None: updater.recover_upgrade_state() - assert starts == [True, False] + assert starts == [(True, None), (False, True)] assert updater._read_upgrade_state() is None @@ -1147,15 +1130,15 @@ def test_recover_upgrade_state_restart_failure_clears_state_without_restarting_p tmp_path: Path, ) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - starts: list[bool] = [] + starts: list[tuple[bool | None, bool | None]] = [] monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: None) - def fake_start_frontend(config, _console) -> None: - starts.append(config.skip_frontend_build) - raise service_manager.ServiceError("still broken") + def fake_restart_webui(_path, payload=None, **_kwargs): + starts.append((payload.get("skip_frontend_build"), payload.get("force_frontend_build"))) + return _webui_control_payload("degraded", "still broken") - monkeypatch.setattr(service_manager, "start_frontend", fake_start_frontend) + monkeypatch.setattr(service_control, "post_control_json", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.3.31.1", @@ -1167,10 +1150,10 @@ def fake_start_frontend(config, _console) -> None: } ) - with pytest.raises(service_manager.ServiceError, match="still broken"): + with pytest.raises(RuntimeError, match="still broken"): updater.recover_upgrade_state() - assert starts == [True, False] + assert starts == [(True, None), (False, True)] assert updater._read_upgrade_state() is None @@ -1286,12 +1269,16 @@ def test_rollback_failed_update_restores_backup_and_rebuilds_frontend_if_needed( monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: events.append("stop_page")) monkeypatch.setattr(updater.shutil, "rmtree", lambda path, ignore_errors=True: events.append(f"rmtree:{Path(path).name}")) - def fake_start_frontend(config, _console) -> None: - events.append(f"start_frontend:{config.skip_frontend_build}") - if config.skip_frontend_build: - raise service_manager.ServiceError("missing dist") + results = iter([ + _webui_control_payload("degraded", "missing dist"), + _webui_control_payload(), + ]) + + def fake_restart_webui(_path, payload=None, **_kwargs) -> dict[str, object]: + events.append(f"restart_webui:{payload.get('skip_frontend_build')}:{payload.get('force_frontend_build')}") + return next(results) - monkeypatch.setattr(service_manager, "start_frontend", fake_start_frontend) + monkeypatch.setattr(service_control, "post_control_json", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.4.1", @@ -1311,8 +1298,8 @@ def fake_start_frontend(config, _console) -> None: "restore:backup.tar.gz:install", "marker:2026.3.31", "stop_page", - "start_frontend:True", - "start_frontend:False", + "restart_webui:True:None", + "restart_webui:False:True", "rmtree:upgrade-page", ] assert updater._read_upgrade_state() is None @@ -1334,11 +1321,11 @@ def test_rollback_failed_update_clears_state_when_restore_and_frontend_both_fail monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: events.append("stop_page")) monkeypatch.setattr(updater.shutil, "rmtree", lambda path, ignore_errors=True: events.append(f"rmtree:{Path(path).name}")) - def fake_start_frontend(config, _console) -> None: - events.append(f"start_frontend:{config.skip_frontend_build}") - raise service_manager.ServiceError("frontend still broken") + def fake_restart_webui(_path, payload=None, **_kwargs) -> dict[str, object]: + events.append(f"restart_webui:{payload.get('skip_frontend_build')}") + return _webui_control_payload("degraded", "frontend still broken") - monkeypatch.setattr(service_manager, "start_frontend", fake_start_frontend) + monkeypatch.setattr(service_control, "post_control_json", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.4.1", @@ -1357,7 +1344,7 @@ def fake_start_frontend(config, _console) -> None: assert events == [ "stop_page", - "start_frontend:True", + "restart_webui:True", "rmtree:upgrade-page", ] assert updater._read_upgrade_state() is None From 10a18dc25d6038340e752653d7d9dc51227c09b5 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 11:41:24 +0800 Subject: [PATCH 03/28] Refactor supervisor control and process adapters --- flocks/cli/main.py | 93 ++++---------- flocks/cli/service_config.py | 200 ++++++++++++++++++++++++++++++ flocks/cli/service_control.py | 186 ++++++++++++++++++++++++--- flocks/cli/service_manager.py | 73 ++++------- flocks/cli/service_process.py | 114 +++++++++++++++++ flocks/cli/service_supervisor.py | 109 ++++------------ flocks/updater/updater.py | 63 ++++------ tests/cli/test_service_manager.py | 28 +++-- tests/updater/test_updater.py | 58 +++++---- 9 files changed, 631 insertions(+), 293 deletions(-) create mode 100644 flocks/cli/service_config.py create mode 100644 flocks/cli/service_process.py diff --git a/flocks/cli/main.py b/flocks/cli/main.py index 58e5c2888..e5a09cdf6 100644 --- a/flocks/cli/main.py +++ b/flocks/cli/main.py @@ -5,7 +5,6 @@ """ import asyncio -import os import secrets as secrets_lib import sys from pathlib import Path @@ -30,8 +29,14 @@ task_app, ) from flocks.cli.commands.update import update_command -from flocks.cli.service_manager import ( +from flocks.cli.service_config import ( ServiceConfig, + ServiceConfigError, + build_service_config, + restart_defaults_from_status_payload, +) +from flocks.cli.service_control import read_supervisor_status +from flocks.cli.service_manager import ( ServiceError, resolve_flocks_cli_command, restart_all, @@ -41,7 +46,6 @@ start_all, stop_all, ) -from flocks.cli.service_control import read_supervisor_status from flocks.cli.service_supervisor import run_service_daemon from flocks.config.config import Config from flocks.utils.log import Log, LogLevel @@ -152,82 +156,27 @@ def _service_config( ) -> ServiceConfig: """Build service config from environment and CLI toggles.""" global_config = Config.get_global() - return ServiceConfig( - backend_host=_resolve_host( - cli_value=server_host, - env_names=("FLOCKS_SERVER_HOST", "FLOCKS_BACKEND_HOST"), - default=default_server_host or global_config.server_host, - ), - backend_port=_resolve_port( - cli_value=server_port, - env_names=("FLOCKS_SERVER_PORT", "FLOCKS_BACKEND_PORT"), - default=default_server_port or global_config.server_port, - label="server", - ), - frontend_host=_resolve_host( - cli_value=webui_host, - env_names=("FLOCKS_WEBUI_HOST", "FLOCKS_FRONTEND_HOST"), - default=default_webui_host or "127.0.0.1", - ), - frontend_port=_resolve_port( - cli_value=webui_port, - env_names=("FLOCKS_WEBUI_PORT", "FLOCKS_FRONTEND_PORT"), - default=default_webui_port or 5173, - label="webui", - ), + return build_service_config( no_browser=no_browser, - skip_frontend_build=skip_webui_build, + skip_webui_build=skip_webui_build, + server_host=server_host, + server_port=server_port, + webui_host=webui_host, + webui_port=webui_port, + default_server_host=default_server_host or global_config.server_host, + default_server_port=default_server_port or global_config.server_port, + default_webui_host=default_webui_host or "127.0.0.1", + default_webui_port=default_webui_port or 5173, ) -def _resolve_host(cli_value: Optional[str], env_names: tuple[str, ...], default: str) -> str: - """Resolve a host value from CLI, environment, and default values.""" - if cli_value is not None: - return cli_value - for env_name in env_names: - env_value = os.getenv(env_name) - if env_value: - return env_value - return default - - -def _resolve_port( - cli_value: Optional[int], - env_names: tuple[str, ...], - default: int, - label: str, -) -> int: - """Resolve a port value from CLI, environment, and default values.""" - if cli_value is not None: - return cli_value - for env_name in env_names: - env_value = os.getenv(env_name) - if not env_value: - continue - try: - return int(env_value) - except ValueError as error: - raise ServiceError(f"{label} port from {env_name} must be an integer.") from error - return default - - def _restart_runtime_defaults() -> dict[str, Any]: """Load host/port defaults from the running supervisor when available.""" - defaults: dict[str, Any] = {} try: - payload = read_supervisor_status(paths=runtime_paths(), timeout=1.0) + status = read_supervisor_status(paths=runtime_paths(), timeout=1.0) except Exception: - return defaults - config = payload.get("config") if isinstance(payload.get("config"), dict) else {} - if isinstance(config.get("backend_host"), str): - defaults["default_server_host"] = config["backend_host"] - if isinstance(config.get("backend_port"), int): - defaults["default_server_port"] = config["backend_port"] - if isinstance(config.get("frontend_host"), str): - defaults["default_webui_host"] = config["frontend_host"] - if isinstance(config.get("frontend_port"), int): - defaults["default_webui_port"] = config["frontend_port"] - return defaults + return {} + return restart_defaults_from_status_payload(status.raw) def _restart_service_config( @@ -327,7 +276,7 @@ def restart( ), console, ) - except ServiceError as error: + except (ServiceConfigError, ServiceError) as error: _handle_service_error(error) diff --git a/flocks/cli/service_config.py b/flocks/cli/service_config.py new file mode 100644 index 000000000..590fcd51f --- /dev/null +++ b/flocks/cli/service_config.py @@ -0,0 +1,200 @@ +"""Service configuration model and serialization helpers.""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import Any + + +class ServiceConfigError(ValueError): + """Raised when service configuration input is invalid.""" + + +@dataclass(frozen=True) +class ServiceConfig: + backend_host: str = "127.0.0.1" + backend_port: int = 8000 + frontend_host: str = "127.0.0.1" + frontend_port: int = 5173 + no_browser: bool = False + skip_frontend_build: bool = False + + @property + def frontend_url(self) -> str: + return f"http://{_format_host_for_url(loopback_host(self.frontend_host))}:{self.frontend_port}" + + +def loopback_host(host: str) -> str: + """Return a local access host for wildcard bind addresses.""" + return "127.0.0.1" if host in {"0.0.0.0", "::"} else host + + +def _format_host_for_url(host: str) -> str: + """Wrap IPv6 literals in brackets before composing URLs.""" + if ":" in host and not host.startswith("["): + return f"[{host}]" + return host + + +def service_config_payload(config: ServiceConfig) -> dict[str, object]: + """Serialize service config for the supervisor control API.""" + return { + "backend_host": config.backend_host, + "backend_port": config.backend_port, + "frontend_host": config.frontend_host, + "frontend_port": config.frontend_port, + "no_browser": config.no_browser, + "skip_frontend_build": config.skip_frontend_build, + } + + +def service_config_from_payload( + payload: dict[str, Any], + default: ServiceConfig | None = None, + *, + no_browser: bool | None = None, + skip_frontend_build: bool | None = None, +) -> ServiceConfig: + """Deserialize service config from a control or upgrade payload.""" + base = default or ServiceConfig() + resolved_skip_frontend_build = ( + _bool(payload.get("skip_frontend_build"), base.skip_frontend_build) + if skip_frontend_build is None + else skip_frontend_build + ) + resolved_no_browser = _bool(payload.get("no_browser"), base.no_browser) if no_browser is None else no_browser + return ServiceConfig( + backend_host=_string(payload.get("backend_host"), base.backend_host), + backend_port=_positive_int(payload.get("backend_port"), base.backend_port), + frontend_host=_string(payload.get("frontend_host"), base.frontend_host), + frontend_port=_positive_int(payload.get("frontend_port"), base.frontend_port), + no_browser=resolved_no_browser, + skip_frontend_build=resolved_skip_frontend_build, + ) + + +def service_config_from_status_payload( + payload: dict[str, Any], + *, + default: ServiceConfig | None = None, + no_browser: bool | None = None, + skip_frontend_build: bool | None = None, +) -> ServiceConfig: + """Extract service config from a supervisor status payload.""" + config = payload.get("config") if isinstance(payload.get("config"), dict) else {} + return service_config_from_payload( + config, + default=default, + no_browser=no_browser, + skip_frontend_build=skip_frontend_build, + ) + + +def restart_defaults_from_status_payload(payload: dict[str, Any]) -> dict[str, Any]: + """Return CLI default overrides from a supervisor status payload.""" + config = payload.get("config") if isinstance(payload.get("config"), dict) else {} + defaults: dict[str, Any] = {} + if isinstance(config.get("backend_host"), str): + defaults["default_server_host"] = config["backend_host"] + if _is_positive_int(config.get("backend_port")): + defaults["default_server_port"] = config["backend_port"] + if isinstance(config.get("frontend_host"), str): + defaults["default_webui_host"] = config["frontend_host"] + if _is_positive_int(config.get("frontend_port")): + defaults["default_webui_port"] = config["frontend_port"] + return defaults + + +def build_service_config( + *, + no_browser: bool = False, + skip_webui_build: bool = False, + server_host: str | None = None, + server_port: int | None = None, + webui_host: str | None = None, + webui_port: int | None = None, + default_server_host: str, + default_server_port: int, + default_webui_host: str = "127.0.0.1", + default_webui_port: int = 5173, +) -> ServiceConfig: + """Build service config from CLI values, environment, and defaults.""" + return ServiceConfig( + backend_host=_resolve_host( + cli_value=server_host, + env_names=("FLOCKS_SERVER_HOST", "FLOCKS_BACKEND_HOST"), + default=default_server_host, + ), + backend_port=_resolve_port( + cli_value=server_port, + env_names=("FLOCKS_SERVER_PORT", "FLOCKS_BACKEND_PORT"), + default=default_server_port, + label="server", + ), + frontend_host=_resolve_host( + cli_value=webui_host, + env_names=("FLOCKS_WEBUI_HOST", "FLOCKS_FRONTEND_HOST"), + default=default_webui_host, + ), + frontend_port=_resolve_port( + cli_value=webui_port, + env_names=("FLOCKS_WEBUI_PORT", "FLOCKS_FRONTEND_PORT"), + default=default_webui_port, + label="webui", + ), + no_browser=no_browser, + skip_frontend_build=skip_webui_build, + ) + + +def with_frontend_build(config: ServiceConfig, *, skip_frontend_build: bool) -> ServiceConfig: + """Return config with only the WebUI build behavior changed.""" + return ServiceConfig( + backend_host=config.backend_host, + backend_port=config.backend_port, + frontend_host=config.frontend_host, + frontend_port=config.frontend_port, + no_browser=config.no_browser, + skip_frontend_build=skip_frontend_build, + ) + + +def _resolve_host(*, cli_value: str | None, env_names: tuple[str, ...], default: str) -> str: + if cli_value is not None: + return cli_value + for env_name in env_names: + env_value = os.getenv(env_name) + if env_value: + return env_value + return default + + +def _resolve_port(*, cli_value: int | None, env_names: tuple[str, ...], default: int, label: str) -> int: + if cli_value is not None: + return cli_value + for env_name in env_names: + env_value = os.getenv(env_name) + if not env_value: + continue + try: + return int(env_value) + except ValueError as error: + raise ServiceConfigError(f"{label} port from {env_name} must be an integer.") from error + return default + + +def _string(value: Any, fallback: str) -> str: + return value if isinstance(value, str) and value else fallback + + +def _positive_int(value: Any, fallback: int) -> int: + return value if _is_positive_int(value) else fallback + + +def _is_positive_int(value: Any) -> bool: + return isinstance(value, int) and not isinstance(value, bool) and value > 0 + + +def _bool(value: Any, fallback: bool) -> bool: + return value if isinstance(value, bool) else fallback diff --git a/flocks/cli/service_control.py b/flocks/cli/service_control.py index 866afb796..06ad426fd 100644 --- a/flocks/cli/service_control.py +++ b/flocks/cli/service_control.py @@ -4,16 +4,52 @@ import os import sys +from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import Any, Iterator import httpx +from flocks.cli.service_config import ServiceConfig, service_config_from_status_payload, service_config_payload + SUPERVISOR_CONTROL_PORT = 48765 SUPERVISOR_LOG_FILENAME = "supervisor.log" SUPERVISOR_SOCKET_FILENAME = "service-daemon.sock" +@dataclass(frozen=True) +class DaemonStatus: + pid: int | None + uptime: float | None + version: str | None + state: str + log_path: str | None + + +@dataclass(frozen=True) +class ManagedServiceStatus: + pid: int | None + host: str + port: int | None + state: str + health: str + last_error: str | None + restart_count: int + last_restart_at: float | None + log_path: str | None + command: tuple[str, ...] + paused: bool = False + + +@dataclass(frozen=True) +class SupervisorStatus: + daemon: DaemonStatus + backend: ManagedServiceStatus + webui: ManagedServiceStatus + config: ServiceConfig + raw: dict[str, Any] + + def _default_runtime_paths(): from flocks.cli.service_manager import runtime_paths @@ -77,7 +113,7 @@ def supervisor_is_running(paths=None) -> bool: return False -def read_control_json(path: str, *, paths=None, timeout: float | None = 2.0) -> dict[str, Any]: +def _read_control_json(path: str, *, paths=None, timeout: float | None = 2.0) -> dict[str, Any]: response = control_api_request("GET", path, paths=paths, timeout=timeout) payload = response.json() if not isinstance(payload, dict): @@ -85,12 +121,7 @@ def read_control_json(path: str, *, paths=None, timeout: float | None = 2.0) -> return payload -def read_supervisor_status(paths=None, timeout: float | None = 2.0) -> dict[str, Any]: - """Read the current supervisor status from the local control API.""" - return read_control_json("/status", paths=paths, timeout=timeout) - - -def post_control_json( +def _post_control_json( path: str, *, payload: dict[str, Any] | None = None, @@ -104,13 +135,132 @@ def post_control_json( return data -def service_config_payload(config) -> dict[str, object]: - """Serialize a ServiceConfig-like object for the supervisor control API.""" - return { - "backend_host": config.backend_host, - "backend_port": config.backend_port, - "frontend_host": config.frontend_host, - "frontend_port": config.frontend_port, - "no_browser": config.no_browser, - "skip_frontend_build": config.skip_frontend_build, - } +def read_supervisor_status(paths=None, timeout: float | None = 2.0) -> SupervisorStatus: + """Read and parse the current supervisor status.""" + return parse_supervisor_status(_read_control_json("/status", paths=paths, timeout=timeout)) + + +def request_stop(paths=None, timeout: float | None = 2.0) -> dict[str, Any]: + """Ask the supervisor daemon to stop itself and its children.""" + return _post_control_json("/stop", paths=paths, timeout=timeout) + + +def request_restart( + config: ServiceConfig, + *, + paths=None, + timeout: float | None = 180.0, +) -> SupervisorStatus: + """Ask the supervisor daemon to restart all managed services.""" + payload = _post_control_json("/restart", payload=service_config_payload(config), paths=paths, timeout=timeout) + return parse_supervisor_status(payload) + + +def request_restart_backend(*, paths=None, timeout: float | None = 180.0) -> SupervisorStatus: + """Ask the supervisor daemon to restart backend.""" + payload = _post_control_json("/restart/backend", paths=paths, timeout=timeout) + return parse_supervisor_status(payload) + + +def request_restart_webui( + config: ServiceConfig, + *, + force_frontend_build: bool = False, + paths=None, + timeout: float | None = 180.0, +) -> SupervisorStatus: + """Ask the supervisor daemon to restart WebUI.""" + payload = service_config_payload(config) + if force_frontend_build: + payload["force_frontend_build"] = True + data = _post_control_json("/restart/webui", payload=payload, paths=paths, timeout=timeout) + return parse_supervisor_status(data) + + +def request_stop_webui(*, paths=None, timeout: float | None = 30.0) -> SupervisorStatus: + """Ask the supervisor daemon to stop WebUI only.""" + payload = _post_control_json("/stop/webui", paths=paths, timeout=timeout) + return parse_supervisor_status(payload) + + +def read_logs( + *, + service: str, + lines: int, + paths=None, + timeout: float | None = 5.0, +) -> dict[str, Any]: + """Read recent service logs through the supervisor control API.""" + return _read_control_json( + f"/logs?service={service}&lines={lines}&follow=false", + paths=paths, + timeout=timeout, + ) + + +def stream_logs( + *, + service: str, + lines: int, + paths=None, + timeout: float | None = None, +) -> Iterator[str]: + """Stream service logs through the supervisor control API.""" + params = {"service": service, "lines": str(lines), "follow": "true"} + with supervisor_control_client(paths, timeout=timeout) as client: + with client.stream("GET", "/logs", params=params) as response: + response.raise_for_status() + yield from response.iter_lines() + + +def parse_supervisor_status(payload: dict[str, Any]) -> SupervisorStatus: + """Parse a supervisor status payload into typed status objects.""" + daemon = payload.get("daemon") if isinstance(payload.get("daemon"), dict) else {} + backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} + webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} + return SupervisorStatus( + daemon=_parse_daemon_status(daemon), + backend=_parse_service_status(backend), + webui=_parse_service_status(webui), + config=service_config_from_status_payload(payload), + raw=payload, + ) + + +def _parse_daemon_status(payload: dict[str, Any]) -> DaemonStatus: + return DaemonStatus( + pid=_optional_int(payload.get("pid")), + uptime=_optional_float(payload.get("uptime")), + version=str(payload["version"]) if payload.get("version") is not None else None, + state=str(payload.get("state") or "unknown"), + log_path=str(payload["log_path"]) if payload.get("log_path") is not None else None, + ) + + +def _parse_service_status(payload: dict[str, Any]) -> ManagedServiceStatus: + command = payload.get("command") if isinstance(payload.get("command"), list) else [] + return ManagedServiceStatus( + pid=_optional_int(payload.get("pid")), + host=str(payload.get("host") or "127.0.0.1"), + port=_optional_int(payload.get("port")), + state=str(payload.get("state") or "unknown"), + health=str(payload.get("health") or payload.get("state") or "unknown"), + last_error=str(payload["last_error"]) if payload.get("last_error") is not None else None, + restart_count=_optional_int(payload.get("restart_count")) or 0, + last_restart_at=_optional_float(payload.get("last_restart_at")), + log_path=str(payload["log_path"]) if payload.get("log_path") is not None else None, + command=tuple(str(item) for item in command), + paused=bool(payload.get("paused")), + ) + + +def _optional_int(value: Any) -> int | None: + return value if isinstance(value, int) and not isinstance(value, bool) else None + + +def _optional_float(value: Any) -> float | None: + if isinstance(value, bool): + return None + if isinstance(value, (float, int)): + return float(value) + return None diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index 917754b72..1fdbebdbf 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -25,11 +25,13 @@ import httpx +from flocks.cli.service_config import ServiceConfig, loopback_host from flocks.cli.service_control import ( - post_control_json, - read_control_json, - service_config_payload, - supervisor_control_client, + read_logs, + read_supervisor_status, + request_restart, + request_stop, + stream_logs, supervisor_is_running, supervisor_log_path, supervisor_socket_path, @@ -59,20 +61,6 @@ class ServiceError(RuntimeError): """Raised when a service lifecycle action fails.""" -@dataclass(frozen=True) -class ServiceConfig: - backend_host: str = "127.0.0.1" - backend_port: int = 8000 - frontend_host: str = "127.0.0.1" - frontend_port: int = 5173 - no_browser: bool = False - skip_frontend_build: bool = False - - @property - def frontend_url(self) -> str: - return f"http://{_loopback_host(self.frontend_host)}:{self.frontend_port}" - - @dataclass(frozen=True) class RuntimePaths: root: Path @@ -1159,14 +1147,14 @@ def _wait_for_supervisor_ready( if process is not None and process.poll() is not None: raise ServiceError(f"Supervisor 启动失败,退出码: {process.returncode}") try: - payload = read_control_json("/status", paths=paths, timeout=1.0) - last_payload = payload - backend_state = ((payload.get("backend") or {}).get("state") if isinstance(payload.get("backend"), dict) else None) - webui_state = ((payload.get("webui") or {}).get("state") if isinstance(payload.get("webui"), dict) else None) + status = read_supervisor_status(paths=paths, timeout=1.0) + last_payload = status.raw + backend_state = status.backend.state + webui_state = status.webui.state if backend_state == "healthy" and webui_state == "healthy": - return payload + return status.raw if backend_state == "degraded" or webui_state == "degraded": - return payload + return status.raw except Exception: pass time.sleep(0.5) @@ -1207,7 +1195,7 @@ def stop_all(console) -> None: console.print("[flocks] Supervisor 未运行。") return try: - post_control_json("/stop", paths=paths, timeout=2.0) + request_stop(paths=paths, timeout=2.0) except Exception as exc: raise ServiceError(f"无法请求 Supervisor 停止: {exc}") from exc @@ -1239,8 +1227,8 @@ def start_all(config: ServiceConfig, console) -> None: show_status(console) if not config.no_browser: try: - payload = read_control_json("/status", paths=paths, timeout=1.0) - url = _frontend_url_from_status_payload(payload, config.frontend_url) + status = read_supervisor_status(paths=paths, timeout=1.0) + url = _frontend_url_from_status(status, config.frontend_url) except Exception: url = config.frontend_url open_default_browser(url, console) @@ -1255,23 +1243,23 @@ def restart_all(config: ServiceConfig, console) -> None: start_all(config, console) return try: - payload = post_control_json("/restart", payload=service_config_payload(config), paths=paths, timeout=180.0) + status = request_restart(config, paths=paths, timeout=180.0) except Exception as exc: raise ServiceError(f"无法请求 Supervisor 重启: {exc}") from exc - _print_status_payload(payload, console) + _print_status_payload(status.raw, console) def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: """Return a human-readable status summary from the supervisor control API.""" current = paths or runtime_paths() try: - payload = read_control_json("/status", paths=current) + status = read_supervisor_status(paths=current) except Exception: return [ "[flocks] Supervisor 未运行", f"[flocks] Supervisor 日志: {supervisor_log_path(current)}", ] - return _status_lines_from_payload(payload) + return _status_lines_from_payload(status.raw) def _status_lines_from_payload(payload: dict[str, Any]) -> list[str]: @@ -1301,12 +1289,9 @@ def _service_status_line(label: str, payload: dict[str, Any]) -> str: return f"[flocks] {label}: state={state} PID={pid} URL=http://{host}:{port}{suffix}" -def _frontend_url_from_status_payload(payload: dict[str, Any], fallback: str) -> str: - webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} - host = webui.get("host") - port = webui.get("port") - if isinstance(host, str) and isinstance(port, int): - return f"http://{_format_host_for_url(_loopback_host(host))}:{port}" +def _frontend_url_from_status(status, fallback: str) -> str: + if status.webui.port is not None: + return f"http://{_format_host_for_url(_loopback_host(status.webui.host))}:{status.webui.port}" return fallback @@ -1351,10 +1336,9 @@ def show_logs( service = "backend" elif webui and not backend: service = "webui" - params = {"service": service, "lines": str(lines), "follow": "true" if follow else "false"} if not follow: try: - payload = read_control_json(f"/logs?service={service}&lines={lines}&follow=false", paths=paths, timeout=5.0) + payload = read_logs(service=service, lines=lines, paths=paths, timeout=5.0) except Exception as exc: raise ServiceError(f"无法通过 Supervisor 读取日志: {exc}") from exc logs = payload.get("logs") if isinstance(payload.get("logs"), dict) else {} @@ -1368,11 +1352,8 @@ def show_logs( console.print("[flocks] 按 Ctrl+C 退出日志跟随。") try: - with supervisor_control_client(paths, timeout=None) as client: - with client.stream("GET", "/logs", params=params) as response: - response.raise_for_status() - for line in response.iter_lines(): - console.print(line) + for line in stream_logs(service=service, lines=lines, paths=paths, timeout=None): + console.print(line) except KeyboardInterrupt: return except Exception as exc: @@ -1494,7 +1475,7 @@ def open_default_browser(url: str, console) -> None: def access_host(host: str) -> str: """Return the host that local health checks and browser requests should use.""" - return _loopback_host(host) + return loopback_host(host) def _format_host_for_url(host: str) -> str: @@ -1679,7 +1660,7 @@ def _join_pids(pids: Iterable[int]) -> str: def _loopback_host(host: str) -> str: - return "127.0.0.1" if host in {"0.0.0.0", "::"} else host + return loopback_host(host) def _http_to_ws_url(url: str) -> str: diff --git a/flocks/cli/service_process.py b/flocks/cli/service_process.py new file mode 100644 index 000000000..3f2d50e7a --- /dev/null +++ b/flocks/cli/service_process.py @@ -0,0 +1,114 @@ +"""Process adapters used by the service supervisor.""" + +from __future__ import annotations + +import socket +import subprocess +from dataclasses import dataclass +from typing import Protocol + +import httpx + +from flocks.cli.service_config import ServiceConfig, with_frontend_build + + +@dataclass(frozen=True) +class ServiceProbeResult: + healthy: bool + reason: str | None = None + restart: bool = False + + +class ProcessAdapter(Protocol): + name: str + label: str + + def start(self, config: ServiceConfig, paths, *, built_once: bool = False) -> subprocess.Popen: + """Start the service process.""" + + def stop(self, process: subprocess.Popen | None) -> None: + """Stop the service process group.""" + + def probe(self, process: subprocess.Popen | None, host: str, port: int) -> ServiceProbeResult: + """Probe service process and listener health.""" + + +class BackendProcessAdapter: + name = "backend" + label = "后端" + + def start(self, config: ServiceConfig, paths, *, built_once: bool = False) -> subprocess.Popen: + del built_once + from flocks.cli.service_manager import _StdoutConsole, _start_backend_process + + return _start_backend_process(config, _StdoutConsole(), paths=paths) + + def stop(self, process: subprocess.Popen | None) -> None: + from flocks.cli.service_manager import _StdoutConsole, _terminate_process + + _terminate_process(process, self.label, _StdoutConsole()) + + def probe(self, process: subprocess.Popen | None, host: str, port: int) -> ServiceProbeResult: + if process is None: + return ServiceProbeResult(healthy=False, reason="stopped") + if process.poll() is not None: + return ServiceProbeResult( + healthy=False, + reason=f"process exited with code {process.returncode}", + restart=True, + ) + if not tcp_port_accepts_connections(host, port): + return ServiceProbeResult(healthy=False, reason=f"port {port} is not listening", restart=True) + + from flocks.cli.service_manager import _backend_health_url, _is_healthy_status_response + + url = _backend_health_url(host, port) + try: + with httpx.Client(timeout=2.0, trust_env=False) as client: + response = client.get(url) + healthy = _is_healthy_status_response(response) + reason = f"health status={response.status_code}" + except Exception as exc: + healthy = False + reason = f"health failed: {exc}" + return ServiceProbeResult(healthy=healthy, reason=reason) + + +class WebUIProcessAdapter: + name = "webui" + label = "WebUI" + + def start(self, config: ServiceConfig, paths, *, built_once: bool = False) -> subprocess.Popen: + from flocks.cli.service_manager import _StdoutConsole, _start_frontend_process + + resolved = with_frontend_build(config, skip_frontend_build=True) if built_once else config + return _start_frontend_process(resolved, _StdoutConsole(), paths=paths) + + def stop(self, process: subprocess.Popen | None) -> None: + from flocks.cli.service_manager import _StdoutConsole, _terminate_process + + _terminate_process(process, self.label, _StdoutConsole()) + + def probe(self, process: subprocess.Popen | None, host: str, port: int) -> ServiceProbeResult: + if process is None: + return ServiceProbeResult(healthy=False, reason="stopped") + if process.poll() is not None: + return ServiceProbeResult( + healthy=False, + reason=f"process exited with code {process.returncode}", + restart=True, + ) + if not tcp_port_accepts_connections(host, port): + return ServiceProbeResult(healthy=False, reason=f"port {port} is not listening", restart=True) + return ServiceProbeResult(healthy=True) + + +def tcp_port_accepts_connections(host: str, port: int) -> bool: + """Return True when a local service accepts TCP connections.""" + from flocks.cli.service_manager import access_host + + try: + with socket.create_connection((access_host(host), port), timeout=1.0): + return True + except OSError: + return False diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py index 67793186c..919049ee9 100644 --- a/flocks/cli/service_supervisor.py +++ b/flocks/cli/service_supervisor.py @@ -17,15 +17,14 @@ from typing import Any from urllib.parse import parse_qs, urlparse -import httpx - from flocks.browser.admin import stop_all_daemons as stop_all_browser_daemons +from flocks.cli.service_config import service_config_from_payload, service_config_payload from flocks.cli.service_control import ( - service_config_payload, supervisor_control_port, supervisor_log_path, supervisor_socket_path, ) +from flocks.cli.service_process import BackendProcessAdapter, ProcessAdapter, WebUIProcessAdapter SUPERVISOR_CHECK_INTERVAL_SECONDS = 5.0 SUPERVISOR_HEALTH_FAILURE_THRESHOLD = 2 @@ -64,38 +63,6 @@ def _daemon_log(event: str, details: dict[str, object] | None = None) -> None: sys.stdout.flush() -def _config_from_payload(payload: dict[str, Any], default): - from flocks.cli.service_manager import ServiceConfig - - def _string(name: str, fallback: str) -> str: - value = payload.get(name) - return value if isinstance(value, str) and value else fallback - - def _int(name: str, fallback: int) -> int: - value = payload.get(name) - return value if isinstance(value, int) and not isinstance(value, bool) and value > 0 else fallback - - return ServiceConfig( - backend_host=_string("backend_host", default.backend_host), - backend_port=_int("backend_port", default.backend_port), - frontend_host=_string("frontend_host", default.frontend_host), - frontend_port=_int("frontend_port", default.frontend_port), - no_browser=bool(payload.get("no_browser", default.no_browser)), - skip_frontend_build=bool(payload.get("skip_frontend_build", default.skip_frontend_build)), - ) - - -def _tcp_port_accepts_connections(host: str, port: int) -> bool: - """Return True when a local service accepts TCP connections.""" - from flocks.cli.service_manager import access_host - - try: - with socket.create_connection((access_host(host), port), timeout=1.0): - return True - except OSError: - return False - - def _health_status_from_service_state(state: str) -> str: if state in {"healthy", "starting", "restarting", "stopped", "paused"}: return state @@ -131,6 +98,8 @@ def __init__( *, interval: float = SUPERVISOR_CHECK_INTERVAL_SECONDS, failure_threshold: int = SUPERVISOR_HEALTH_FAILURE_THRESHOLD, + backend_adapter: ProcessAdapter | None = None, + webui_adapter: ProcessAdapter | None = None, ) -> None: from flocks.cli.service_manager import ensure_runtime_dirs @@ -138,6 +107,8 @@ def __init__( self.paths = ensure_runtime_dirs() self.interval = interval self.failure_threshold = failure_threshold + self.backend_adapter = backend_adapter or BackendProcessAdapter() + self.webui_adapter = webui_adapter or WebUIProcessAdapter() self.started_at = time.time() self._lock = threading.RLock() self._shutdown_requested = threading.Event() @@ -302,7 +273,7 @@ def do_POST(self) -> None: def update_config(self, payload: dict[str, Any]) -> None: with self._lock: - self.config = _config_from_payload(payload, self.config) + self.config = service_config_from_payload(payload, self.config) self.backend.host = self.config.backend_host self.backend.port = self.config.backend_port self.webui.host = self.config.frontend_host @@ -468,23 +439,20 @@ def _restart_service(self, service: ManagedService, *, reason: str, immediate: b service.next_restart_at = time.monotonic() if immediate else self._next_restart_time(service.restart_count) def _stop_service(self, service: ManagedService) -> None: - from flocks.cli.service_manager import _StdoutConsole, _terminate_process - - _terminate_process(service.process, service.label, _StdoutConsole()) + adapter = self._adapter_for(service) + adapter.stop(service.process) service.process = None service.command = () service.state = "stopped" def _start_backend_locked(self, *, immediate: bool) -> None: - from flocks.cli.service_manager import _StdoutConsole, _start_backend_process - if self.backend.process is not None and self.backend.process.poll() is None: return if not immediate and time.monotonic() < self.backend.next_restart_at: return self.backend.state = "starting" try: - process = _start_backend_process(self.config, _StdoutConsole(), paths=self.paths) + process = self.backend_adapter.start(self.config, self.paths) except Exception as exc: self._mark_start_failed(self.backend, exc) return @@ -495,25 +463,13 @@ def _start_backend_locked(self, *, immediate: bool) -> None: self.backend.health_failure_count = 0 def _start_webui_locked(self, *, immediate: bool) -> None: - from flocks.cli.service_manager import ServiceConfig, _StdoutConsole, _start_frontend_process - if self.webui.process is not None and self.webui.process.poll() is None: return if not immediate and time.monotonic() < self.webui.next_restart_at: return self.webui.state = "starting" - config = self.config - if self.webui.built_once: - config = ServiceConfig( - backend_host=config.backend_host, - backend_port=config.backend_port, - frontend_host=config.frontend_host, - frontend_port=config.frontend_port, - no_browser=config.no_browser, - skip_frontend_build=True, - ) try: - process = _start_frontend_process(config, _StdoutConsole(), paths=self.paths) + process = self.webui_adapter.start(self.config, self.paths, built_once=self.webui.built_once) except Exception as exc: self._mark_start_failed(self.webui, exc) return @@ -539,29 +495,14 @@ def _next_restart_time(self, restart_count: int) -> float: return time.monotonic() + SUPERVISOR_BACKOFF_SECONDS[index] def _probe_backend_locked(self) -> None: - from flocks.cli.service_manager import _backend_health_url, _is_healthy_status_response - - process = self.backend.process - if process is None: + result = self.backend_adapter.probe(self.backend.process, self.backend.host, self.backend.port) + if self.backend.process is None: self.backend.state = "stopped" return - if process.poll() is not None: - self._restart_service(self.backend, reason=f"process exited with code {process.returncode}", immediate=True) + if result.restart: + self._restart_service(self.backend, reason=result.reason or "backend probe failed", immediate=True) return - if not _tcp_port_accepts_connections(self.backend.host, self.backend.port): - self._restart_service(self.backend, reason=f"port {self.backend.port} is not listening", immediate=True) - return - - url = _backend_health_url(self.backend.host, self.backend.port) - try: - with httpx.Client(timeout=2.0, trust_env=False) as client: - response = client.get(url) - healthy = _is_healthy_status_response(response) - reason = f"health status={response.status_code}" - except Exception as exc: - healthy = False - reason = f"health failed: {exc}" - if healthy: + if result.healthy: self.backend.state = "healthy" self.backend.health_failure_count = 0 self.backend.last_error = None @@ -569,25 +510,25 @@ def _probe_backend_locked(self) -> None: self.backend.health_failure_count += 1 self.backend.state = "degraded" - self.backend.last_error = reason + self.backend.last_error = result.reason if self.backend.health_failure_count >= self.failure_threshold: - self._restart_service(self.backend, reason=reason, immediate=True) + self._restart_service(self.backend, reason=result.reason or "backend health failed", immediate=True) def _probe_webui_locked(self) -> None: - process = self.webui.process - if process is None: + result = self.webui_adapter.probe(self.webui.process, self.webui.host, self.webui.port) + if self.webui.process is None: self.webui.state = "stopped" return - if process.poll() is not None: - self._restart_service(self.webui, reason=f"process exited with code {process.returncode}", immediate=True) - return - if not _tcp_port_accepts_connections(self.webui.host, self.webui.port): - self._restart_service(self.webui, reason=f"port {self.webui.port} is not listening", immediate=True) + if result.restart: + self._restart_service(self.webui, reason=result.reason or "webui probe failed", immediate=True) return self.webui.state = "healthy" self.webui.health_failure_count = 0 self.webui.last_error = None + def _adapter_for(self, service: ManagedService) -> ProcessAdapter: + return self.backend_adapter if service.name == "backend" else self.webui_adapter + def run_service_daemon( config, diff --git a/flocks/updater/updater.py b/flocks/updater/updater.py index 9f5a75552..7831b3978 100644 --- a/flocks/updater/updater.py +++ b/flocks/updater/updater.py @@ -1888,19 +1888,18 @@ def print(self, *args, **kwargs) -> None: def _current_service_config(): from flocks.cli import service_manager + from flocks.cli.service_config import service_config_from_status_payload from flocks.cli.service_control import read_supervisor_status try: - payload = read_supervisor_status(paths=service_manager.runtime_paths(), timeout=1.0) + status = read_supervisor_status(paths=service_manager.runtime_paths(), timeout=1.0) except Exception: - payload = {} - - config = payload.get("config") if isinstance(payload.get("config"), dict) else {} - return service_manager.ServiceConfig( - backend_host=str(config.get("backend_host") or service_manager.ServiceConfig.backend_host), - backend_port=int(config.get("backend_port") or service_manager.ServiceConfig.backend_port), - frontend_host=str(config.get("frontend_host") or service_manager.ServiceConfig.frontend_host), - frontend_port=int(config.get("frontend_port") or service_manager.ServiceConfig.frontend_port), + return service_manager.ServiceConfig( + no_browser=True, + skip_frontend_build=True, + ) + return service_config_from_status_payload( + status.raw, no_browser=True, skip_frontend_build=True, ) @@ -2074,7 +2073,7 @@ def _stop_upgrade_page_server(*, frontend_port: int | None = None) -> None: def _prepare_upgrade_handover(version: str) -> dict[str, Any]: from flocks.cli import service_manager - from flocks.cli.service_control import post_control_json + from flocks.cli.service_control import request_stop_webui config = _current_service_config() payload: dict[str, Any] = { @@ -2090,7 +2089,7 @@ def _prepare_upgrade_handover(version: str) -> dict[str, Any]: console = _NullConsole() paths = service_manager.runtime_paths() - post_control_json("/stop/webui", paths=paths, timeout=30.0) + request_stop_webui(paths=paths, timeout=30.0) try: payload.update(_start_upgrade_page_server(config, version)) @@ -2116,16 +2115,13 @@ def _service_config_from_payload( *, skip_frontend_build: bool | None = None, ): - from flocks.cli import service_manager + from flocks.cli.service_config import service_config_from_payload resolved_skip_frontend_build = ( bool(payload.get("skip_frontend_build", True)) if skip_frontend_build is None else skip_frontend_build ) - return service_manager.ServiceConfig( - backend_host=str(payload.get("backend_host") or service_manager.ServiceConfig.backend_host), - backend_port=int(payload.get("backend_port") or service_manager.ServiceConfig.backend_port), - frontend_host=str(payload.get("frontend_host") or service_manager.ServiceConfig.frontend_host), - frontend_port=int(payload.get("frontend_port") or service_manager.ServiceConfig.frontend_port), + return service_config_from_payload( + payload, no_browser=True, skip_frontend_build=resolved_skip_frontend_build, ) @@ -2187,39 +2183,26 @@ def read_upgrade_runtime_state(frontend_port: int | None = None) -> dict[str, An def _start_frontend_with_fallback(config, console, *, allow_build_fallback: bool) -> None: - from flocks.cli.service_control import post_control_json, service_config_payload + from flocks.cli.service_config import with_frontend_build + from flocks.cli.service_control import request_restart_webui try: - payload = post_control_json( - "/restart/webui", - payload=service_config_payload(config), + status = request_restart_webui( + config, paths=None, timeout=180.0, ) - webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} - if webui.get("state") != "healthy": - raise RuntimeError(str(webui.get("last_error") or "WebUI restart did not become healthy")) + if status.webui.state != "healthy": + raise RuntimeError(status.webui.last_error or "WebUI restart did not become healthy") return except Exception: if not allow_build_fallback or not config.skip_frontend_build: raise - from flocks.cli import service_manager - - rebuilt_config = service_manager.ServiceConfig( - backend_host=config.backend_host, - backend_port=config.backend_port, - frontend_host=config.frontend_host, - frontend_port=config.frontend_port, - no_browser=config.no_browser, - skip_frontend_build=False, - ) - payload = service_config_payload(rebuilt_config) - payload["force_frontend_build"] = True - result = post_control_json("/restart/webui", payload=payload, paths=None, timeout=180.0) - webui = result.get("webui") if isinstance(result.get("webui"), dict) else {} - if webui.get("state") != "healthy": - raise RuntimeError(str(webui.get("last_error") or "WebUI restart did not become healthy")) + rebuilt_config = with_frontend_build(config, skip_frontend_build=False) + result = request_restart_webui(rebuilt_config, force_frontend_build=True, paths=None, timeout=180.0) + if result.webui.state != "healthy": + raise RuntimeError(result.webui.last_error or "WebUI restart did not become healthy") def cleanup_orphan_upgrade_state(*, frontend_port: int | None = None) -> bool: diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index e56e75e7c..d5c51c647 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -8,6 +8,8 @@ from flocks.cli import service_manager from flocks.cli import service_supervisor +from flocks.cli import service_control +from flocks.cli import service_process class DummyConsole: @@ -602,7 +604,7 @@ def _client_factory(*, timeout, trust_env): captured["trust_env"] = trust_env return _FakeClient() - monkeypatch.setattr(service_supervisor.httpx, "Client", _client_factory) + monkeypatch.setattr(service_manager.httpx, "Client", _client_factory) service_manager.wait_for_http( ["http://127.0.0.1:8000/api/health"], @@ -725,9 +727,13 @@ def _supervisor_status_payload() -> dict[str, object]: } +def _supervisor_status(payload: dict[str, object] | None = None) -> service_control.SupervisorStatus: + return service_control.parse_supervisor_status(payload or _supervisor_status_payload()) + + def test_build_status_lines_reports_supervisor_control_status(monkeypatch, tmp_path: Path) -> None: paths = _make_runtime_paths(tmp_path) - monkeypatch.setattr(service_manager, "read_control_json", lambda *_args, **_kwargs: _supervisor_status_payload()) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status()) lines = service_manager.build_status_lines(paths) @@ -742,7 +748,7 @@ def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, monkeypatch.setattr( service_manager, - "read_control_json", + "read_supervisor_status", lambda *_args, **_kwargs: (_ for _ in ()).throw(service_manager.ServiceError("down")), ) monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: calls.append("port_owner") or []) @@ -791,8 +797,8 @@ def test_restart_all_uses_supervisor_control_api(monkeypatch) -> None: monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) monkeypatch.setattr( service_manager, - "post_control_json", - lambda path, **_kwargs: call_order.append(path) or _supervisor_status_payload(), + "request_restart", + lambda _config, **_kwargs: call_order.append("/restart") or _supervisor_status(), ) monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console: call_order.append("print_status")) @@ -1214,7 +1220,7 @@ def test_supervisor_recovers_backend_when_port_disappears(monkeypatch, tmp_path: daemon.backend.process = _fake_process(111, ["backend"]) daemon.webui.process = _fake_process(222, ["webui"]) - monkeypatch.setattr(service_supervisor, "_tcp_port_accepts_connections", lambda _host, port: port != 9995) + monkeypatch.setattr(service_process, "tcp_port_accepts_connections", lambda _host, port: port != 9995) monkeypatch.setattr(service_manager, "_terminate_process", lambda _process, name, _console: calls.append(f"stop:{name}")) monkeypatch.setattr( service_manager, @@ -1253,8 +1259,8 @@ def __exit__(self, *_args) -> None: def get(self, _url): return httpx.Response(503, json={"status": "unhealthy"}) - monkeypatch.setattr(service_supervisor.httpx, "Client", FakeClient) - monkeypatch.setattr(service_supervisor, "_tcp_port_accepts_connections", lambda *_args: True) + monkeypatch.setattr(service_process.httpx, "Client", FakeClient) + monkeypatch.setattr(service_process, "tcp_port_accepts_connections", lambda *_args: True) monkeypatch.setattr(service_manager, "_terminate_process", lambda _process, name, _console: calls.append(f"stop:{name}")) monkeypatch.setattr( service_manager, @@ -1279,7 +1285,7 @@ def test_supervisor_recovers_webui_when_port_disappears(monkeypatch, tmp_path: P daemon.backend.process = _fake_process(111, ["backend"]) daemon.webui.process = _fake_process(222, ["webui"]) - monkeypatch.setattr(service_supervisor, "_tcp_port_accepts_connections", lambda _host, port: port != 9996) + monkeypatch.setattr(service_process, "tcp_port_accepts_connections", lambda _host, port: port != 9996) monkeypatch.setattr(service_manager, "_terminate_process", lambda _process, name, _console: calls.append(f"stop:{name}")) monkeypatch.setattr( service_manager, @@ -1614,7 +1620,7 @@ def print(self, message) -> None: states = iter([True, False]) monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: next(states)) - monkeypatch.setattr(service_manager, "post_control_json", lambda path, **_kwargs: calls.append(path) or {"status": "stopping"}) + monkeypatch.setattr(service_manager, "request_stop", lambda **_kwargs: calls.append("/stop") or {"status": "stopping"}) console = FakeConsole() service_manager.stop_all(console=console) @@ -1640,7 +1646,7 @@ def test_status_lines_include_control_api_errors(monkeypatch, tmp_path: Path) -> payload = _supervisor_status_payload() payload["backend"]["state"] = "degraded" payload["backend"]["last_error"] = "health failed" - monkeypatch.setattr(service_manager, "read_control_json", lambda *_args, **_kwargs: payload) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status(payload)) lines = service_manager.build_status_lines(paths) diff --git a/tests/updater/test_updater.py b/tests/updater/test_updater.py index 4b8932822..55460b92b 100644 --- a/tests/updater/test_updater.py +++ b/tests/updater/test_updater.py @@ -49,6 +49,13 @@ def _webui_control_payload(state: str = "healthy", last_error: str | None = None } +def _webui_control_status( + state: str = "healthy", + last_error: str | None = None, +) -> service_control.SupervisorStatus: + return service_control.parse_supervisor_status(_webui_control_payload(state, last_error)) + + def test_run_handles_none_process_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: def fake_run(*args, **kwargs): return subprocess.CompletedProcess(args=args[0], returncode=0, stdout=None, stderr=None) @@ -1015,8 +1022,8 @@ def test_prepare_upgrade_handover_writes_state_and_stops_frontend( ) monkeypatch.setattr( service_control, - "post_control_json", - lambda path, **_kwargs: calls.append(path) or _webui_control_payload(), + "request_stop_webui", + lambda **_kwargs: calls.append("/stop/webui") or _webui_control_status(), ) payload = updater._prepare_upgrade_handover("2026.3.31.1") @@ -1042,8 +1049,14 @@ def test_prepare_upgrade_handover_restores_frontend_when_upgrade_page_fails( ) monkeypatch.setattr( service_control, - "post_control_json", - lambda path, payload=None, **_kwargs: calls.append((path, None if payload is None else payload.get("skip_frontend_build"))) or _webui_control_payload(), + "request_stop_webui", + lambda **_kwargs: calls.append(("/stop/webui", None)) or _webui_control_status(), + ) + monkeypatch.setattr( + service_control, + "request_restart_webui", + lambda config, **_kwargs: calls.append(("/restart/webui", config.skip_frontend_build)) + or _webui_control_status(), ) with pytest.raises(RuntimeError, match="page failed"): @@ -1068,8 +1081,9 @@ def test_recover_upgrade_state_restarts_frontend_and_clears_marker( monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: stopped.append("stop")) monkeypatch.setattr( service_control, - "post_control_json", - lambda _path, payload=None, **_kwargs: started.append((payload["frontend_port"], payload.get("skip_frontend_build"))) or _webui_control_payload(), + "request_restart_webui", + lambda config, **_kwargs: started.append((config.frontend_port, config.skip_frontend_build)) + or _webui_control_status(), ) updater._write_upgrade_state( { @@ -1103,11 +1117,11 @@ def test_recover_upgrade_state_retries_frontend_with_build_when_dist_is_missing( _webui_control_payload(), ]) - def fake_restart_webui(_path, payload=None, **_kwargs): - starts.append((payload.get("skip_frontend_build"), payload.get("force_frontend_build"))) - return next(results) + def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs): + starts.append((config.skip_frontend_build, force_frontend_build or None)) + return service_control.parse_supervisor_status(next(results)) - monkeypatch.setattr(service_control, "post_control_json", fake_restart_webui) + monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.3.31.1", @@ -1134,11 +1148,11 @@ def test_recover_upgrade_state_restart_failure_clears_state_without_restarting_p monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: None) - def fake_restart_webui(_path, payload=None, **_kwargs): - starts.append((payload.get("skip_frontend_build"), payload.get("force_frontend_build"))) - return _webui_control_payload("degraded", "still broken") + def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs): + starts.append((config.skip_frontend_build, force_frontend_build or None)) + return _webui_control_status("degraded", "still broken") - monkeypatch.setattr(service_control, "post_control_json", fake_restart_webui) + monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.3.31.1", @@ -1274,11 +1288,11 @@ def test_rollback_failed_update_restores_backup_and_rebuilds_frontend_if_needed( _webui_control_payload(), ]) - def fake_restart_webui(_path, payload=None, **_kwargs) -> dict[str, object]: - events.append(f"restart_webui:{payload.get('skip_frontend_build')}:{payload.get('force_frontend_build')}") - return next(results) + def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs) -> service_control.SupervisorStatus: + events.append(f"restart_webui:{config.skip_frontend_build}:{force_frontend_build or None}") + return service_control.parse_supervisor_status(next(results)) - monkeypatch.setattr(service_control, "post_control_json", fake_restart_webui) + monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.4.1", @@ -1321,11 +1335,11 @@ def test_rollback_failed_update_clears_state_when_restore_and_frontend_both_fail monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: events.append("stop_page")) monkeypatch.setattr(updater.shutil, "rmtree", lambda path, ignore_errors=True: events.append(f"rmtree:{Path(path).name}")) - def fake_restart_webui(_path, payload=None, **_kwargs) -> dict[str, object]: - events.append(f"restart_webui:{payload.get('skip_frontend_build')}") - return _webui_control_payload("degraded", "frontend still broken") + def fake_restart_webui(config, **_kwargs) -> service_control.SupervisorStatus: + events.append(f"restart_webui:{config.skip_frontend_build}") + return _webui_control_status("degraded", "frontend still broken") - monkeypatch.setattr(service_control, "post_control_json", fake_restart_webui) + monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.4.1", From 8563950444e6b96511e3c0d5b8034f2dd68895a9 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 12:55:24 +0800 Subject: [PATCH 04/28] fix(updater): coordinate handoff with supervisor --- flocks/cli/service_control.py | 6 + flocks/cli/service_supervisor.py | 25 ++- flocks/updater/restart_handoff.py | 31 ++++ flocks/updater/updater.py | 60 ++++++-- tests/cli/test_service_manager.py | 52 +++++++ tests/helpers/__init__.py | 1 + tests/helpers/service_supervisor.py | 100 ++++++++++++ tests/updater/test_restart_handoff.py | 56 +++++++ tests/updater/test_updater.py | 212 +++++++++++++++++++------- 9 files changed, 471 insertions(+), 72 deletions(-) create mode 100644 tests/helpers/__init__.py create mode 100644 tests/helpers/service_supervisor.py diff --git a/flocks/cli/service_control.py b/flocks/cli/service_control.py index 06ad426fd..b34791784 100644 --- a/flocks/cli/service_control.py +++ b/flocks/cli/service_control.py @@ -183,6 +183,12 @@ def request_stop_webui(*, paths=None, timeout: float | None = 30.0) -> Superviso return parse_supervisor_status(payload) +def request_prepare_upgrade(*, paths=None, timeout: float | None = 30.0) -> SupervisorStatus: + """Ask the supervisor daemon to pause managed services for upgrade handoff.""" + payload = _post_control_json("/upgrade/prepare", paths=paths, timeout=timeout) + return parse_supervisor_status(payload) + + def read_logs( *, service: str, diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py index 919049ee9..8c6e08565 100644 --- a/flocks/cli/service_supervisor.py +++ b/flocks/cli/service_supervisor.py @@ -114,6 +114,7 @@ def __init__( self._shutdown_requested = threading.Event() self._server: ThreadingHTTPServer | None = None self._server_thread: threading.Thread | None = None + self._backend_paused = False self._webui_paused = False self.backend = ManagedService( name="backend", @@ -265,6 +266,10 @@ def do_POST(self) -> None: daemon.stop_webui(reason="control stop") self._send_json(daemon.status_payload()) return + if parsed.path == "/upgrade/prepare": + daemon.prepare_upgrade(reason="control upgrade prepare") + self._send_json(daemon.status_payload()) + return self._send_json({"error": "not found"}, status=404) except Exception as exc: # pragma: no cover - defensive control path self._send_json({"error": str(exc)}, status=500) @@ -296,7 +301,7 @@ def status_payload(self) -> dict[str, object]: "state": "stopping" if self._shutdown_requested.is_set() else "running", "log_path": str(supervisor_log_path(self.paths)), }, - "backend": _service_payload(self.backend), + "backend": _service_payload(self.backend, paused=self._backend_paused), "webui": _service_payload(self.webui, paused=self._webui_paused), "config": service_config_payload(self.config), } @@ -388,6 +393,7 @@ def _log_paths_for_service(self, service_name: str) -> list[tuple[str, Path]]: def restart_all(self, *, reason: str) -> None: with self._lock: + self._backend_paused = False self._webui_paused = False self._restart_service(self.webui, reason=reason, immediate=True) self._restart_service(self.backend, reason=reason, immediate=True) @@ -396,6 +402,7 @@ def restart_all(self, *, reason: str) -> None: def restart_backend(self, *, reason: str) -> None: with self._lock: + self._backend_paused = False self._restart_service(self.backend, reason=reason, immediate=True) self._start_backend_locked(immediate=True) @@ -414,6 +421,16 @@ def stop_webui(self, *, reason: str) -> None: self._stop_service(self.webui) self.webui.last_error = reason + def prepare_upgrade(self, *, reason: str) -> None: + with self._lock: + self._backend_paused = True + self._webui_paused = True + _daemon_log("service_pause", {"service": "backend", "reason": reason}) + _daemon_log("service_pause", {"service": "webui", "reason": reason}) + self.backend.last_error = reason + self.webui.last_error = reason + self._stop_service(self.webui) + def shutdown_children(self) -> None: with self._lock: self._stop_service(self.webui) @@ -421,10 +438,12 @@ def shutdown_children(self) -> None: def tick(self) -> None: with self._lock: - self._probe_backend_locked() + if not self._backend_paused: + self._probe_backend_locked() if not self._webui_paused: self._probe_webui_locked() - self._start_backend_locked(immediate=False) + if not self._backend_paused: + self._start_backend_locked(immediate=False) if not self._webui_paused: self._start_webui_locked(immediate=False) diff --git a/flocks/updater/restart_handoff.py b/flocks/updater/restart_handoff.py index dc639b02e..9ca6c90e8 100644 --- a/flocks/updater/restart_handoff.py +++ b/flocks/updater/restart_handoff.py @@ -23,6 +23,7 @@ DEFAULT_PARENT_TIMEOUT_SECONDS = 20.0 DEFAULT_PORT_TIMEOUT_SECONDS = 10.0 POST_STOP_PORT_TIMEOUT_SECONDS = 20.0 +SUPERVISOR_STOP_TIMEOUT_SECONDS = 20.0 DEFAULT_POLL_INTERVAL_SECONDS = 0.25 @@ -71,6 +72,31 @@ def _ensure_backend_port_free(backend_port: int) -> bool: return _wait_for_backend_port_free(backend_port, timeout_seconds=POST_STOP_PORT_TIMEOUT_SECONDS) +def _stop_supervisor_before_restart( + *, + timeout_seconds: float = SUPERVISOR_STOP_TIMEOUT_SECONDS, + poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS, +) -> bool: + from flocks.cli import service_control + + paths = service_manager.runtime_paths() + if not service_control.supervisor_is_running(paths): + return True + + try: + service_control.request_stop(paths=paths, timeout=timeout_seconds) + except Exception as exc: + _record_handoff_log(f"supervisor_stop_request_failed error={exc}") + return False + + deadline = time.monotonic() + timeout_seconds + while time.monotonic() < deadline: + if not service_control.supervisor_is_running(paths): + return True + time.sleep(poll_interval_seconds) + return not service_control.supervisor_is_running(paths) + + def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description="Flocks restart handoff helper") parser.add_argument("--parent-pid", type=int, required=True) @@ -170,6 +196,11 @@ def run(argv: Sequence[str] | None = None) -> int: _cleanup_dir(args.cleanup_dir) return 1 + if not _stop_supervisor_before_restart(): + _record_handoff_log("supervisor_stop_timeout") + _cleanup_dir(args.cleanup_dir) + return 1 + try: process = subprocess.Popen( restart_argv, diff --git a/flocks/updater/updater.py b/flocks/updater/updater.py index 7831b3978..218f9f033 100644 --- a/flocks/updater/updater.py +++ b/flocks/updater/updater.py @@ -1893,11 +1893,8 @@ def _current_service_config(): try: status = read_supervisor_status(paths=service_manager.runtime_paths(), timeout=1.0) - except Exception: - return service_manager.ServiceConfig( - no_browser=True, - skip_frontend_build=True, - ) + except Exception as exc: + raise RuntimeError("Supervisor control API is unavailable; cannot perform managed upgrade restart.") from exc return service_config_from_status_payload( status.raw, no_browser=True, @@ -2073,7 +2070,7 @@ def _stop_upgrade_page_server(*, frontend_port: int | None = None) -> None: def _prepare_upgrade_handover(version: str) -> dict[str, Any]: from flocks.cli import service_manager - from flocks.cli.service_control import request_stop_webui + from flocks.cli.service_control import request_prepare_upgrade config = _current_service_config() payload: dict[str, Any] = { @@ -2089,7 +2086,7 @@ def _prepare_upgrade_handover(version: str) -> dict[str, Any]: console = _NullConsole() paths = service_manager.runtime_paths() - request_stop_webui(paths=paths, timeout=30.0) + request_prepare_upgrade(paths=paths, timeout=30.0) try: payload.update(_start_upgrade_page_server(config, version)) @@ -2110,6 +2107,22 @@ def _prepare_upgrade_handover(version: str) -> dict[str, Any]: return payload +def _spawn_restart_handoff(command: list[str], *, cwd: Path) -> subprocess.Popen: + creationflags = 0 + kwargs: dict[str, object] = {} + if sys.platform == "win32": + creationflags = getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0) | getattr(subprocess, "CREATE_NO_WINDOW", 0) + startupinfo_cls = getattr(subprocess, "STARTUPINFO", None) + if startupinfo_cls is not None: + startupinfo = startupinfo_cls() + startupinfo.dwFlags |= getattr(subprocess, "STARTF_USESHOWWINDOW", 0) + startupinfo.wShowWindow = getattr(subprocess, "SW_HIDE", 0) + kwargs["startupinfo"] = startupinfo + else: + kwargs["start_new_session"] = True + return subprocess.Popen(command, cwd=cwd, close_fds=True, creationflags=creationflags, **kwargs) + + def _service_config_from_payload( payload: dict[str, Any], *, @@ -2127,6 +2140,13 @@ def _service_config_from_payload( ) +def _handoff_service_config(): + payload = _read_upgrade_state() + if payload is not None: + return _service_config_from_payload(payload, skip_frontend_build=True) + return _current_service_config() + + def _read_upgrade_server_pid() -> tuple[int | None, bool]: pid_path = _upgrade_server_pid_path() if not pid_path.exists(): @@ -3309,11 +3329,7 @@ async def _restore_after_apply_failure() -> None: "restart_argv": restart_argv, }, ) - subprocess.Popen( - handoff_argv, - cwd=install_root, - close_fds=True, - ) + _spawn_restart_handoff(handoff_argv, cwd=install_root) os._exit(0) except Exception as exc: log.error("updater.restart.handoff_spawn_failed", {"error": str(exc)}) @@ -3449,7 +3465,23 @@ def _build_restart_handoff_argv( if not restart_argv: raise ValueError("restart command is empty") - config = _current_service_config() + config = _handoff_service_config() + managed_restart_argv = [ + restart_argv[0], + "-m", + "flocks.cli.main", + "start", + "--no-browser", + "--skip-webui-build", + "--server-host", + str(config.backend_host), + "--server-port", + str(config.backend_port), + "--webui-host", + str(config.frontend_host), + "--webui-port", + str(config.frontend_port), + ] argv = [ restart_argv[0], "-m", @@ -3489,7 +3521,7 @@ def _build_restart_handoff_argv( argv.extend(["--bundle-sha256", bundle_sha256]) if cleanup_dir is not None: argv.extend(["--cleanup-dir", str(cleanup_dir)]) - argv.extend(["--", *restart_argv]) + argv.extend(["--", *managed_restart_argv]) return argv diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index d5c51c647..4d57ce3e1 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -1,4 +1,5 @@ import json +import shutil import sys from pathlib import Path from types import SimpleNamespace @@ -10,6 +11,11 @@ from flocks.cli import service_supervisor from flocks.cli import service_control from flocks.cli import service_process +from tests.helpers.service_supervisor import ( + SleeperProcessAdapter, + make_short_runtime_root, + wait_for_process_exit, +) class DummyConsole: @@ -1299,6 +1305,52 @@ def test_supervisor_recovers_webui_when_port_disappears(monkeypatch, tmp_path: P assert daemon.webui.pid == 444 +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_supervisor_upgrade_prepare_control_api_pauses_real_child_restart(monkeypatch, tmp_path: Path) -> None: + del tmp_path + short_root = make_short_runtime_root("flocks-supervisor-") + paths = _make_runtime_paths(short_root) + paths.run_dir.mkdir(parents=True) + paths.log_dir.mkdir(parents=True) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + backend_adapter = SleeperProcessAdapter() + webui_adapter = SleeperProcessAdapter() + daemon = service_supervisor.SupervisorDaemon( + service_manager.ServiceConfig(backend_port=9995, frontend_port=9996), + backend_adapter=backend_adapter, + webui_adapter=webui_adapter, + ) + daemon._start_control_server() + + try: + daemon.restart_all(reason="test startup") + backend_process = daemon.backend.process + webui_process = daemon.webui.process + assert backend_process is not None + assert webui_process is not None + + status = service_control.request_prepare_upgrade(paths=paths) + + wait_for_process_exit(webui_process) + assert status.backend.paused is True + assert status.webui.paused is True + assert daemon.backend.process is backend_process + assert backend_process.poll() is None + assert webui_process.pid in webui_adapter.stopped + + backend_process.terminate() + backend_process.wait(timeout=5) + daemon.tick() + + assert len(backend_adapter.started) == 1 + assert daemon.backend.process is backend_process + assert daemon.status_payload()["backend"]["paused"] is True + finally: + daemon.shutdown_children() + daemon._stop_control_server() + shutil.rmtree(short_root, ignore_errors=True) + + def test_start_frontend_tolerates_windows_node_assertion_after_build(monkeypatch, tmp_path: Path) -> None: paths = service_manager.RuntimePaths( root=tmp_path, diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/tests/helpers/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/helpers/service_supervisor.py b/tests/helpers/service_supervisor.py new file mode 100644 index 000000000..a859deb1a --- /dev/null +++ b/tests/helpers/service_supervisor.py @@ -0,0 +1,100 @@ +"""Helpers for service supervisor integration-style tests.""" + +from __future__ import annotations + +import subprocess +import sys +import tempfile +import threading +import time +from pathlib import Path + +from flocks.cli import service_control, service_manager, service_process, service_supervisor + + +class SleeperProcessAdapter: + """Process adapter that starts a real, lightweight child process.""" + + def __init__(self) -> None: + self.started: list[subprocess.Popen] = [] + self.stopped: list[int] = [] + + def start(self, _config, _paths, *, built_once: bool = False) -> subprocess.Popen: + del built_once + process = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"]) + self.started.append(process) + return process + + def stop(self, process: subprocess.Popen | None) -> None: + if process is None: + return + self.stopped.append(process.pid) + process.terminate() + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + process.wait(timeout=5) + + def probe(self, process: subprocess.Popen | None, _host: str, _port: int) -> service_process.ServiceProbeResult: + if process is None: + return service_process.ServiceProbeResult(healthy=False, reason="stopped") + if process.poll() is not None: + return service_process.ServiceProbeResult(healthy=False, reason="process exited", restart=True) + return service_process.ServiceProbeResult(healthy=True) + + +def make_short_runtime_root(prefix: str) -> Path: + """Create a short runtime root so Unix domain socket paths fit on macOS.""" + return Path(tempfile.mkdtemp(prefix=prefix, dir="/tmp")) + + +def make_runtime_paths(root: Path) -> service_manager.RuntimePaths: + return service_manager.RuntimePaths( + root=root, + run_dir=root / "run", + log_dir=root / "logs", + backend_pid=root / "run" / "backend.pid", + frontend_pid=root / "run" / "webui.pid", + backend_log=root / "logs" / "backend.log", + frontend_log=root / "logs" / "webui.log", + ) + + +def wait_for_process_exit(process: subprocess.Popen, timeout: float = 5.0) -> None: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if process.poll() is not None: + return + time.sleep(0.05) + raise AssertionError(f"process {process.pid} did not exit") + + +def wait_for_supervisor(paths: service_manager.RuntimePaths, *, running: bool, timeout: float = 5.0) -> None: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if service_control.supervisor_is_running(paths) is running: + return + time.sleep(0.05) + raise AssertionError(f"supervisor running={running} was not observed") + + +def start_supervisor( + config: service_manager.ServiceConfig, +) -> tuple[service_supervisor.SupervisorDaemon, threading.Thread]: + daemon = service_supervisor.SupervisorDaemon( + config, + interval=0.05, + backend_adapter=SleeperProcessAdapter(), + webui_adapter=SleeperProcessAdapter(), + ) + thread = threading.Thread(target=daemon.run, daemon=True) + thread.start() + return daemon, thread + + +def stop_supervisor(daemon: service_supervisor.SupervisorDaemon, thread: threading.Thread) -> None: + daemon.request_stop() + thread.join(timeout=5) + daemon.shutdown_children() + daemon._stop_control_server() diff --git a/tests/updater/test_restart_handoff.py b/tests/updater/test_restart_handoff.py index b3464a89c..a2a81f701 100644 --- a/tests/updater/test_restart_handoff.py +++ b/tests/updater/test_restart_handoff.py @@ -1,7 +1,13 @@ +import shutil +import sys from pathlib import Path from types import SimpleNamespace +import pytest + +from flocks.cli import service_manager from flocks.updater import restart_handoff +from tests.helpers.service_supervisor import make_short_runtime_root, start_supervisor, stop_supervisor, wait_for_supervisor def _handoff_args(tmp_path: Path, restart_argv: list[str]) -> list[str]: @@ -56,6 +62,11 @@ def test_run_waits_for_parent_and_backend_port_before_spawning( or SimpleNamespace(pid=4321), ) monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: events.append("tasks") or None) + monkeypatch.setattr( + restart_handoff, + "_stop_supervisor_before_restart", + lambda: events.append("stop-supervisor") or True, + ) code = restart_handoff.run(_handoff_args(tmp_path, restart_argv)) @@ -64,6 +75,7 @@ def test_run_waits_for_parent_and_backend_port_before_spawning( "wait-parent:1234", "free-port:8000", "tasks", + "stop-supervisor", f"spawn:{restart_argv}:{tmp_path}:True", "log:restart_spawned pid=4321", ] @@ -141,6 +153,50 @@ def crash(_args): assert "spawn" not in events +def test_run_does_not_spawn_when_supervisor_stop_fails(monkeypatch, tmp_path: Path) -> None: + events: list[str] = [] + restart_argv = ["python.exe", "-m", "flocks.cli.main", "start"] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) + monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port: True) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: None) + monkeypatch.setattr(restart_handoff, "_stop_supervisor_before_restart", lambda: False) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda *_args, **_kwargs: events.append("spawn"), + ) + + code = restart_handoff.run(_handoff_args(tmp_path, restart_argv)) + + assert code == 1 + assert "log:supervisor_stop_timeout" in events + assert "spawn" not in events + + +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_stop_supervisor_before_restart_waits_until_real_control_api_stops(monkeypatch) -> None: + short_root = make_short_runtime_root("flocks-handoff-") + monkeypatch.setenv("FLOCKS_ROOT", str(short_root)) + paths = service_manager.runtime_paths() + daemon, thread = start_supervisor( + service_manager.ServiceConfig(backend_port=9995, frontend_port=9996), + ) + + try: + wait_for_supervisor(paths, running=True) + + assert restart_handoff._stop_supervisor_before_restart(timeout_seconds=5.0, poll_interval_seconds=0.05) is True + + wait_for_supervisor(paths, running=False) + thread.join(timeout=5) + assert not thread.is_alive() + finally: + stop_supervisor(daemon, thread) + shutil.rmtree(short_root, ignore_errors=True) + + def test_ensure_backend_port_free_waits_again_after_timeout(monkeypatch) -> None: events: list[str] = [] wait_results = iter([False, True]) diff --git a/tests/updater/test_updater.py b/tests/updater/test_updater.py index 55460b92b..7e30d45d1 100644 --- a/tests/updater/test_updater.py +++ b/tests/updater/test_updater.py @@ -10,9 +10,9 @@ import pytest -from flocks.cli import service_manager -from flocks.cli import service_control +from flocks.cli import service_control, service_manager from flocks.updater import updater +from tests.helpers.service_supervisor import make_short_runtime_root, start_supervisor, stop_supervisor, wait_for_supervisor def _write_pyproject_version(pyproject_path: Path, version: str) -> None: @@ -56,6 +56,17 @@ def _webui_control_status( return service_control.parse_supervisor_status(_webui_control_payload(state, last_error)) +def test_current_service_config_requires_supervisor_control_api(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + service_control, + "read_supervisor_status", + lambda **_kwargs: (_ for _ in ()).throw(RuntimeError("control down")), + ) + + with pytest.raises(RuntimeError, match="Supervisor control API is unavailable"): + updater._current_service_config() + + def test_run_handles_none_process_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: def fake_run(*args, **kwargs): return subprocess.CompletedProcess(args=args[0], returncode=0, stdout=None, stderr=None) @@ -789,6 +800,46 @@ def test_build_restart_argv_uses_venv_python_on_non_windows( ] +def test_build_restart_handoff_argv_rewrites_serve_to_managed_start( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + config = service_manager.ServiceConfig( + backend_host="0.0.0.0", + backend_port=9000, + frontend_host="10.0.0.8", + frontend_port=5273, + ) + monkeypatch.setattr(updater, "_handoff_service_config", lambda: config) + monkeypatch.setattr(updater.os, "getpid", lambda: 1234) + + argv = updater._build_restart_handoff_argv( + ["python", "-m", "flocks.cli.main", "serve", "--host", "0.0.0.0", "--port", "9000"], + tmp_path, + uv_path="uv", + sync_timeout=300, + version="2026.4.1", + current_version="2026.3.31", + ) + + assert argv[argv.index("--") + 1 :] == [ + "python", + "-m", + "flocks.cli.main", + "start", + "--no-browser", + "--skip-webui-build", + "--server-host", + "0.0.0.0", + "--server-port", + "9000", + "--webui-host", + "10.0.0.8", + "--webui-port", + "5273", + ] + + def test_refresh_global_cli_entry_creates_symlink_on_unix( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -1007,67 +1058,83 @@ def test_safe_remove_renames_locked_directory_on_windows( assert (leftovers[0] / "dist" / "index.html").exists() -def test_prepare_upgrade_handover_writes_state_and_stops_frontend( +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_prepare_upgrade_handover_writes_state_and_stops_frontend_with_real_control_api( monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, ) -> None: - monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) + short_root = make_short_runtime_root("flocks-updater-") + monkeypatch.setenv("FLOCKS_ROOT", str(short_root)) + paths = service_manager.runtime_paths() + config = service_manager.ServiceConfig( + backend_host="127.0.0.1", + backend_port=9995, + frontend_host="127.0.0.1", + frontend_port=9996, + ) + daemon, thread = start_supervisor(config) + wait_for_supervisor(paths, running=True) - calls: list[str] = [] - monkeypatch.setattr(updater, "_current_service_config", lambda: service_manager.ServiceConfig()) monkeypatch.setattr( updater, "_start_upgrade_page_server", - lambda config, version: {"upgrade_server_pid": 321, "page_dir": str(tmp_path / "page"), "page_log": str(tmp_path / "upgrade.log")}, - ) - monkeypatch.setattr( - service_control, - "request_stop_webui", - lambda **_kwargs: calls.append("/stop/webui") or _webui_control_status(), + lambda _config, _version: { + "upgrade_server_pid": 321, + "page_dir": str(short_root / "page"), + "page_log": str(short_root / "logs" / "upgrade.log"), + }, ) - payload = updater._prepare_upgrade_handover("2026.3.31.1") - - assert calls == ["/stop/webui"] - assert payload["upgrade_server_pid"] == 321 - assert updater._read_upgrade_state()["version"] == "2026.3.31.1" - - -def test_prepare_upgrade_handover_restores_frontend_when_upgrade_page_fails( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - calls: list[tuple[str, bool | None]] = [] - monkeypatch.setattr(updater, "_current_service_config", lambda: service_manager.ServiceConfig()) + try: + payload = updater._prepare_upgrade_handover("2026.3.31.1") + + status = service_control.read_supervisor_status(paths) + assert status.backend.paused is True + assert status.webui.paused is True + assert payload["upgrade_server_pid"] == 321 + assert payload["backend_port"] == 9995 + assert payload["frontend_port"] == 9996 + assert updater._read_upgrade_state()["version"] == "2026.3.31.1" + finally: + stop_supervisor(daemon, thread) + shutil.rmtree(short_root, ignore_errors=True) + + +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_prepare_upgrade_handover_restores_frontend_when_upgrade_page_fails_with_real_control_api( + monkeypatch: pytest.MonkeyPatch, +) -> None: + short_root = make_short_runtime_root("flocks-updater-") + monkeypatch.setenv("FLOCKS_ROOT", str(short_root)) + paths = service_manager.runtime_paths() + config = service_manager.ServiceConfig( + backend_host="127.0.0.1", + backend_port=9995, + frontend_host="127.0.0.1", + frontend_port=9996, + ) + daemon, thread = start_supervisor(config) + wait_for_supervisor(paths, running=True) + calls: list[str] = [] - monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: calls.append(("stop_page", True))) + monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **_kw: calls.append("stop_page")) monkeypatch.setattr( updater, "_start_upgrade_page_server", lambda _config, _version: (_ for _ in ()).throw(RuntimeError("page failed")), ) - monkeypatch.setattr( - service_control, - "request_stop_webui", - lambda **_kwargs: calls.append(("/stop/webui", None)) or _webui_control_status(), - ) - monkeypatch.setattr( - service_control, - "request_restart_webui", - lambda config, **_kwargs: calls.append(("/restart/webui", config.skip_frontend_build)) - or _webui_control_status(), - ) - with pytest.raises(RuntimeError, match="page failed"): - updater._prepare_upgrade_handover("2026.3.31.1") + try: + with pytest.raises(RuntimeError, match="page failed"): + updater._prepare_upgrade_handover("2026.3.31.1") - assert calls == [ - ("/stop/webui", None), - ("stop_page", True), - ("/restart/webui", False), - ] - assert updater._read_upgrade_state() is None + status = service_control.read_supervisor_status(paths) + assert calls == ["stop_page"] + assert status.webui.paused is False + assert status.webui.pid is not None + assert updater._read_upgrade_state() is None + finally: + stop_supervisor(daemon, thread) + shutil.rmtree(short_root, ignore_errors=True) def test_recover_upgrade_state_restarts_frontend_and_clears_marker( @@ -1567,6 +1634,7 @@ async def fake_sleep(_seconds) -> None: lambda name: "/usr/bin/npm" if name in {"npm", "npm.cmd"} else "/usr/bin/uv", ) monkeypatch.setattr(updater, "_prepare_upgrade_handover", lambda _version: events.append("handover") or {}) + monkeypatch.setattr(updater, "_handoff_service_config", lambda: service_manager.ServiceConfig()) monkeypatch.setattr( updater, "_replace_install_dir", @@ -1577,7 +1645,11 @@ async def fake_sleep(_seconds) -> None: monkeypatch.setattr(updater.asyncio, "sleep", fake_sleep) monkeypatch.setattr(updater, "_rollback_failed_update", lambda *_args: events.append("rollback")) monkeypatch.setattr(updater, "rollback_upgrade_handover", lambda *_args: events.append("rollback_handover")) - monkeypatch.setattr(updater.subprocess, "Popen", lambda argv, **_kwargs: popen_calls.append(list(argv)) or SimpleNamespace(pid=4321)) + monkeypatch.setattr( + updater, + "_spawn_restart_handoff", + lambda argv, **_kwargs: popen_calls.append(list(argv)) or SimpleNamespace(pid=4321), + ) monkeypatch.setattr(updater.os, "_exit", lambda code: (_ for _ in ()).throw(SystemExit(code))) with pytest.raises(SystemExit, match="0"): @@ -1596,6 +1668,16 @@ async def fake_sleep(_seconds) -> None: "-m", "flocks.cli.main", "start", + "--no-browser", + "--skip-webui-build", + "--server-host", + "127.0.0.1", + "--server-port", + "8000", + "--webui-host", + "127.0.0.1", + "--webui-port", + "5173", ] @@ -2771,11 +2853,16 @@ def fake_replace_install_dir(*_args, **_kwargs): lambda name: "/usr/bin/npm" if name in {"npm", "npm.cmd"} else "/usr/bin/uv", ) monkeypatch.setattr(updater, "_prepare_upgrade_handover", lambda _version: events.append("handover")) + monkeypatch.setattr(updater, "_handoff_service_config", lambda: service_manager.ServiceConfig()) monkeypatch.setattr(updater, "_replace_install_dir", fake_replace_install_dir) monkeypatch.setattr(updater, "_rollback_failed_update", lambda *_args: events.append("rollback")) monkeypatch.setattr(updater, "_restore_backup_if_possible", lambda *_args: events.append("restore")) monkeypatch.setattr(updater, "_build_restart_argv", lambda install_root=None: [r"C:\tool\python.exe", "-m", "flocks.cli.main", "start"]) - monkeypatch.setattr(updater.subprocess, "Popen", lambda *_args, **_kwargs: events.append("popen") or SimpleNamespace(pid=4321)) + monkeypatch.setattr( + updater, + "_spawn_restart_handoff", + lambda *_args, **_kwargs: events.append("popen") or SimpleNamespace(pid=4321), + ) monkeypatch.setattr(updater.os, "_exit", lambda code: (_ for _ in ()).throw(SystemExit(code))) with pytest.raises(SystemExit, match="0"): @@ -3018,7 +3105,7 @@ async def test_perform_update_spawns_restart_process_on_windows( (staged_webui / "dist").mkdir() (staged_webui / "dist" / "index.html").write_text("", encoding="utf-8") - popen_calls: list[tuple[list[str], Path, bool]] = [] + popen_calls: list[tuple[list[str], Path]] = [] events: list[str] = [] async def fake_get_updater_config(): @@ -3058,7 +3145,12 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): monkeypatch.setattr(updater, "_refresh_global_cli_entry", lambda _root: None) monkeypatch.setattr(updater, "_build_restart_argv", lambda install_root=None: [r"C:\tool\python.exe", "-m", "flocks.cli.main", "start"]) monkeypatch.setattr(updater, "_prepare_upgrade_handover", lambda _version: events.append("handover")) - monkeypatch.setattr(updater.subprocess, "Popen", lambda argv, cwd=None, close_fds=False: popen_calls.append((list(argv), cwd, close_fds)) or SimpleNamespace(pid=4321)) + monkeypatch.setattr(updater, "_handoff_service_config", lambda: service_manager.ServiceConfig()) + monkeypatch.setattr( + updater, + "_spawn_restart_handoff", + lambda argv, cwd=None: popen_calls.append((list(argv), cwd)) or SimpleNamespace(pid=4321), + ) monkeypatch.setattr(updater.os, "_exit", lambda code: (_ for _ in ()).throw(SystemExit(code))) monkeypatch.setattr(updater.os, "execv", lambda *_args: events.append("execv")) @@ -3067,9 +3159,8 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): pass assert len(popen_calls) == 1 - handoff_argv, cwd, close_fds = popen_calls[0] + handoff_argv, cwd = popen_calls[0] assert cwd == tmp_path / "install-root" - assert close_fds is True assert handoff_argv[:3] == [r"C:\tool\python.exe", "-m", "flocks.updater.restart_handoff"] assert "--parent-pid" in handoff_argv assert "--backend-port" in handoff_argv @@ -3078,6 +3169,16 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): "-m", "flocks.cli.main", "start", + "--no-browser", + "--skip-webui-build", + "--server-host", + "127.0.0.1", + "--server-port", + "8000", + "--webui-host", + "127.0.0.1", + "--webui-port", + "5173", ] assert events == ["handover"] assert "execv" not in events @@ -3260,10 +3361,11 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): monkeypatch.setattr(updater, "_refresh_global_cli_entry", lambda _root: None) monkeypatch.setattr(updater, "_build_restart_argv", lambda install_root=None: [r"C:\tool\python.exe", "-m", "flocks.cli.main"]) monkeypatch.setattr(updater, "_prepare_upgrade_handover", lambda _version: events.append("handover")) + monkeypatch.setattr(updater, "_handoff_service_config", lambda: service_manager.ServiceConfig()) monkeypatch.setattr(updater, "rollback_upgrade_handover", lambda: events.append("rollback_handover")) monkeypatch.setattr( - updater.subprocess, - "Popen", + updater, + "_spawn_restart_handoff", lambda *_args, **_kwargs: (_ for _ in ()).throw(OSError("spawn failed")), ) From 04c800ec42cdaa6e22fc129cef72aed534e6c2d1 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 13:12:47 +0800 Subject: [PATCH 05/28] fix(supervisor): resume services after failed upgrade --- flocks/cli/service_control.py | 11 ++++ flocks/cli/service_supervisor.py | 16 ++++++ flocks/updater/updater.py | 4 +- tests/updater/test_updater.py | 88 ++++++++++++++++++++++++++++---- 4 files changed, 108 insertions(+), 11 deletions(-) diff --git a/flocks/cli/service_control.py b/flocks/cli/service_control.py index b34791784..c8e2e9280 100644 --- a/flocks/cli/service_control.py +++ b/flocks/cli/service_control.py @@ -189,6 +189,17 @@ def request_prepare_upgrade(*, paths=None, timeout: float | None = 30.0) -> Supe return parse_supervisor_status(payload) +def request_resume_upgrade( + config: ServiceConfig, + *, + paths=None, + timeout: float | None = 180.0, +) -> SupervisorStatus: + """Ask the supervisor daemon to resume managed services after upgrade handoff.""" + payload = _post_control_json("/upgrade/resume", payload=service_config_payload(config), paths=paths, timeout=timeout) + return parse_supervisor_status(payload) + + def read_logs( *, service: str, diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py index 8c6e08565..2eccefc31 100644 --- a/flocks/cli/service_supervisor.py +++ b/flocks/cli/service_supervisor.py @@ -270,6 +270,11 @@ def do_POST(self) -> None: daemon.prepare_upgrade(reason="control upgrade prepare") self._send_json(daemon.status_payload()) return + if parsed.path == "/upgrade/resume": + daemon.update_config(payload) + daemon.resume_upgrade(reason="control upgrade resume") + self._send_json(daemon.status_payload()) + return self._send_json({"error": "not found"}, status=404) except Exception as exc: # pragma: no cover - defensive control path self._send_json({"error": str(exc)}, status=500) @@ -431,6 +436,17 @@ def prepare_upgrade(self, *, reason: str) -> None: self.webui.last_error = reason self._stop_service(self.webui) + def resume_upgrade(self, *, reason: str) -> None: + with self._lock: + self._backend_paused = False + self._webui_paused = False + _daemon_log("service_resume", {"service": "backend", "reason": reason}) + _daemon_log("service_resume", {"service": "webui", "reason": reason}) + self._probe_backend_locked() + self._probe_webui_locked() + self._start_backend_locked(immediate=True) + self._start_webui_locked(immediate=True) + def shutdown_children(self) -> None: with self._lock: self._stop_service(self.webui) diff --git a/flocks/updater/updater.py b/flocks/updater/updater.py index 218f9f033..a5456de45 100644 --- a/flocks/updater/updater.py +++ b/flocks/updater/updater.py @@ -2204,10 +2204,10 @@ def read_upgrade_runtime_state(frontend_port: int | None = None) -> dict[str, An def _start_frontend_with_fallback(config, console, *, allow_build_fallback: bool) -> None: from flocks.cli.service_config import with_frontend_build - from flocks.cli.service_control import request_restart_webui + from flocks.cli.service_control import request_restart_webui, request_resume_upgrade try: - status = request_restart_webui( + status = request_resume_upgrade( config, paths=None, timeout=180.0, diff --git a/tests/updater/test_updater.py b/tests/updater/test_updater.py index 7e30d45d1..1528d9c0e 100644 --- a/tests/updater/test_updater.py +++ b/tests/updater/test_updater.py @@ -1129,6 +1129,8 @@ def test_prepare_upgrade_handover_restores_frontend_when_upgrade_page_fails_with status = service_control.read_supervisor_status(paths) assert calls == ["stop_page"] + assert status.backend.paused is False + assert status.backend.pid is not None assert status.webui.paused is False assert status.webui.pid is not None assert updater._read_upgrade_state() is None @@ -1137,6 +1139,54 @@ def test_prepare_upgrade_handover_restores_frontend_when_upgrade_page_fails_with shutil.rmtree(short_root, ignore_errors=True) +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_rollback_failed_update_resumes_backend_when_handoff_tasks_fail( + monkeypatch: pytest.MonkeyPatch, +) -> None: + short_root = make_short_runtime_root("flocks-updater-") + monkeypatch.setenv("FLOCKS_ROOT", str(short_root)) + paths = service_manager.runtime_paths() + config = service_manager.ServiceConfig( + backend_host="127.0.0.1", + backend_port=9995, + frontend_host="127.0.0.1", + frontend_port=9996, + ) + daemon, thread = start_supervisor(config) + wait_for_supervisor(paths, running=True) + monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **_kw: None) + + try: + updater._write_upgrade_state( + { + "version": "2026.4.1", + "backend_host": "127.0.0.1", + "backend_port": 9995, + "frontend_host": "127.0.0.1", + "frontend_port": 9996, + "skip_frontend_build": True, + } + ) + service_control.request_prepare_upgrade(paths=paths) + old_backend = daemon.backend.process + assert old_backend is not None + old_backend.terminate() + old_backend.wait(timeout=5) + + updater._rollback_failed_update(None, short_root / "install", "2026.3.31") + + status = service_control.read_supervisor_status(paths) + assert status.backend.paused is False + assert status.webui.paused is False + assert status.backend.pid is not None + assert status.backend.pid != old_backend.pid + assert status.webui.pid is not None + assert updater._read_upgrade_state() is None + finally: + stop_supervisor(daemon, thread) + shutil.rmtree(short_root, ignore_errors=True) + + def test_recover_upgrade_state_restarts_frontend_and_clears_marker( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -1148,7 +1198,7 @@ def test_recover_upgrade_state_restarts_frontend_and_clears_marker( monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: stopped.append("stop")) monkeypatch.setattr( service_control, - "request_restart_webui", + "request_resume_upgrade", lambda config, **_kwargs: started.append((config.frontend_port, config.skip_frontend_build)) or _webui_control_status(), ) @@ -1175,7 +1225,7 @@ def test_recover_upgrade_state_retries_frontend_with_build_when_dist_is_missing( tmp_path: Path, ) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - starts: list[tuple[bool | None, bool | None]] = [] + starts: list[tuple[str, bool | None, bool | None]] = [] monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: None) @@ -1184,10 +1234,15 @@ def test_recover_upgrade_state_retries_frontend_with_build_when_dist_is_missing( _webui_control_payload(), ]) + def fake_resume_upgrade(config, **_kwargs): + starts.append(("resume", config.skip_frontend_build, None)) + return service_control.parse_supervisor_status(next(results)) + def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs): - starts.append((config.skip_frontend_build, force_frontend_build or None)) + starts.append(("restart_webui", config.skip_frontend_build, force_frontend_build or None)) return service_control.parse_supervisor_status(next(results)) + monkeypatch.setattr(service_control, "request_resume_upgrade", fake_resume_upgrade) monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { @@ -1202,7 +1257,7 @@ def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs): updater.recover_upgrade_state() - assert starts == [(True, None), (False, True)] + assert starts == [("resume", True, None), ("restart_webui", False, True)] assert updater._read_upgrade_state() is None @@ -1211,14 +1266,19 @@ def test_recover_upgrade_state_restart_failure_clears_state_without_restarting_p tmp_path: Path, ) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - starts: list[tuple[bool | None, bool | None]] = [] + starts: list[tuple[str, bool | None, bool | None]] = [] monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: None) + def fake_resume_upgrade(config, **_kwargs): + starts.append(("resume", config.skip_frontend_build, None)) + return _webui_control_status("degraded", "still broken") + def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs): - starts.append((config.skip_frontend_build, force_frontend_build or None)) + starts.append(("restart_webui", config.skip_frontend_build, force_frontend_build or None)) return _webui_control_status("degraded", "still broken") + monkeypatch.setattr(service_control, "request_resume_upgrade", fake_resume_upgrade) monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { @@ -1234,7 +1294,7 @@ def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs): with pytest.raises(RuntimeError, match="still broken"): updater.recover_upgrade_state() - assert starts == [(True, None), (False, True)] + assert starts == [("resume", True, None), ("restart_webui", False, True)] assert updater._read_upgrade_state() is None @@ -1355,10 +1415,15 @@ def test_rollback_failed_update_restores_backup_and_rebuilds_frontend_if_needed( _webui_control_payload(), ]) + def fake_resume_upgrade(config, **_kwargs) -> service_control.SupervisorStatus: + events.append(f"resume:{config.skip_frontend_build}") + return service_control.parse_supervisor_status(next(results)) + def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs) -> service_control.SupervisorStatus: events.append(f"restart_webui:{config.skip_frontend_build}:{force_frontend_build or None}") return service_control.parse_supervisor_status(next(results)) + monkeypatch.setattr(service_control, "request_resume_upgrade", fake_resume_upgrade) monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { @@ -1379,7 +1444,7 @@ def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs) -> serv "restore:backup.tar.gz:install", "marker:2026.3.31", "stop_page", - "restart_webui:True:None", + "resume:True", "restart_webui:False:True", "rmtree:upgrade-page", ] @@ -1402,10 +1467,15 @@ def test_rollback_failed_update_clears_state_when_restore_and_frontend_both_fail monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: events.append("stop_page")) monkeypatch.setattr(updater.shutil, "rmtree", lambda path, ignore_errors=True: events.append(f"rmtree:{Path(path).name}")) + def fake_resume_upgrade(config, **_kwargs) -> service_control.SupervisorStatus: + events.append(f"resume:{config.skip_frontend_build}") + return _webui_control_status("degraded", "frontend still broken") + def fake_restart_webui(config, **_kwargs) -> service_control.SupervisorStatus: events.append(f"restart_webui:{config.skip_frontend_build}") return _webui_control_status("degraded", "frontend still broken") + monkeypatch.setattr(service_control, "request_resume_upgrade", fake_resume_upgrade) monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { @@ -1425,7 +1495,7 @@ def fake_restart_webui(config, **_kwargs) -> service_control.SupervisorStatus: assert events == [ "stop_page", - "restart_webui:True", + "resume:True", "rmtree:upgrade-page", ] assert updater._read_upgrade_state() is None From 0e8cad1b8ebfdceef9f08814beffff51d5dcb94c Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 13:56:53 +0800 Subject: [PATCH 06/28] chore(cli): clarify daemon status output --- flocks/cli/service_manager.py | 60 +++++++++++++------------------ tests/cli/test_service_manager.py | 27 ++++++++------ 2 files changed, 41 insertions(+), 46 deletions(-) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index 1fdbebdbf..bc0e40262 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -1145,7 +1145,7 @@ def _wait_for_supervisor_ready( last_payload: dict[str, Any] | None = None while time.monotonic() < deadline: if process is not None and process.poll() is not None: - raise ServiceError(f"Supervisor 启动失败,退出码: {process.returncode}") + raise ServiceError(f"Flocks daemon 启动失败,退出码: {process.returncode}") try: status = read_supervisor_status(paths=paths, timeout=1.0) last_payload = status.raw @@ -1160,7 +1160,7 @@ def _wait_for_supervisor_ready( time.sleep(0.5) if last_payload is not None: return last_payload - raise ServiceError("Supervisor 启动超时,请检查日志。") + raise ServiceError("Flocks daemon 启动超时,请检查日志。") def _start_supervisor_process(config: ServiceConfig, paths: RuntimePaths, console) -> subprocess.Popen: @@ -1184,7 +1184,7 @@ def _start_supervisor_process(config: ServiceConfig, paths: RuntimePaths, consol command.append("--skip-webui-build") env = os.environ.copy() env["PYTHONUNBUFFERED"] = "1" - console.print("[flocks] 启动 Supervisor daemon...") + console.print("[flocks] 启动 Flocks daemon...") return _spawn_process(command, cwd=root, log_path=log_path, env=env) @@ -1192,20 +1192,20 @@ def stop_all(console) -> None: """Stop managed services through the supervisor control API.""" paths = ensure_runtime_dirs() if not supervisor_is_running(paths): - console.print("[flocks] Supervisor 未运行。") + console.print("[flocks] Flocks daemon 未运行。") return try: request_stop(paths=paths, timeout=2.0) except Exception as exc: - raise ServiceError(f"无法请求 Supervisor 停止: {exc}") from exc + raise ServiceError(f"无法请求 Flocks daemon 停止: {exc}") from exc deadline = time.monotonic() + 20.0 while time.monotonic() < deadline: if not supervisor_is_running(paths): - console.print("[flocks] Supervisor 已停止。") + console.print("[flocks] Flocks daemon 已停止。") return time.sleep(0.5) - raise ServiceError("Supervisor 未在预期时间内退出。") + raise ServiceError("Flocks daemon 未在预期时间内退出。") def _start_all_without_stop(config: ServiceConfig, console) -> None: @@ -1213,7 +1213,6 @@ def _start_all_without_stop(config: ServiceConfig, console) -> None: paths = ensure_runtime_dirs() process = _start_supervisor_process(config, paths, console) payload = _wait_for_supervisor_ready(paths, process=process) - show_start_summary(config, console) _print_status_payload(payload, console) if not config.no_browser: open_default_browser(config.frontend_url, console) @@ -1223,7 +1222,7 @@ def start_all(config: ServiceConfig, console) -> None: """Ensure the supervisor daemon is running.""" paths = ensure_runtime_dirs() if supervisor_is_running(paths): - console.print("[flocks] Supervisor 已在运行。") + console.print("[flocks] Flocks daemon 已在运行。") show_status(console) if not config.no_browser: try: @@ -1245,7 +1244,7 @@ def restart_all(config: ServiceConfig, console) -> None: try: status = request_restart(config, paths=paths, timeout=180.0) except Exception as exc: - raise ServiceError(f"无法请求 Supervisor 重启: {exc}") from exc + raise ServiceError(f"无法请求 Flocks daemon 重启: {exc}") from exc _print_status_payload(status.raw, console) @@ -1256,8 +1255,8 @@ def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: status = read_supervisor_status(paths=current) except Exception: return [ - "[flocks] Supervisor 未运行", - f"[flocks] Supervisor 日志: {supervisor_log_path(current)}", + "[flocks] Flocks daemon 未运行", + f"[flocks] 日志: {supervisor_log_path(current)}", ] return _status_lines_from_payload(status.raw) @@ -1267,15 +1266,21 @@ def _status_lines_from_payload(payload: dict[str, Any]) -> list[str]: backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} lines = [ - f"[flocks] Supervisor 运行中: PID={daemon.get('pid')} state={daemon.get('state')}", + "[flocks] Flocks daemon", + f"[flocks] PID: {daemon.get('pid')}", + f"[flocks] 状态: {daemon.get('state')}", + "", + "[flocks] 服务", _service_status_line("后端", backend), _service_status_line("WebUI", webui), - f"[flocks] Supervisor 日志: {daemon.get('log_path')}", + "", + "[flocks] 日志", + f"[flocks] daemon: {daemon.get('log_path')}", ] - for service in (backend, webui): + for label, service in (("后端", backend), ("WebUI", webui)): log_path = service.get("log_path") if log_path: - lines.append(f"[flocks] {service.get('state')} 日志: {log_path}") + lines.append(f"[flocks] {label}: {log_path}") return lines @@ -1286,7 +1291,7 @@ def _service_status_line(label: str, payload: dict[str, Any]) -> str: state = payload.get("state") or "unknown" error = payload.get("last_error") suffix = f" last_error={error}" if error else "" - return f"[flocks] {label}: state={state} PID={pid} URL=http://{host}:{port}{suffix}" + return f"[flocks] {label}: state={state} PID={pid} URL=http://{host}:{port}{suffix}" def _frontend_url_from_status(status, fallback: str) -> str: @@ -1306,21 +1311,6 @@ def show_status(console) -> None: console.print(line) -def show_start_summary(config: ServiceConfig, console) -> None: - """Print URLs and log locations after startup.""" - paths = ensure_runtime_dirs() - console.print() - console.print("[flocks] 日志:") - console.print(f"[flocks] 后端: {paths.backend_log}") - console.print(f"[flocks] WebUI: {paths.frontend_log}") - console.print() - console.print("[flocks] 后端接口:") - console.print(f"[flocks] http://{_loopback_host(config.backend_host)}:{config.backend_port}") - console.print() - console.print("[flocks] 打开浏览器访问:") - console.print(f"[flocks] {config.frontend_url}") - - def show_logs( console, *, @@ -1340,7 +1330,7 @@ def show_logs( try: payload = read_logs(service=service, lines=lines, paths=paths, timeout=5.0) except Exception as exc: - raise ServiceError(f"无法通过 Supervisor 读取日志: {exc}") from exc + raise ServiceError(f"无法通过 Flocks daemon 读取日志: {exc}") from exc logs = payload.get("logs") if isinstance(payload.get("logs"), dict) else {} for prefix, entry in logs.items(): if not isinstance(entry, dict): @@ -1357,7 +1347,7 @@ def show_logs( except KeyboardInterrupt: return except Exception as exc: - raise ServiceError(f"无法通过 Supervisor 跟随日志: {exc}") from exc + raise ServiceError(f"无法通过 Flocks daemon 跟随日志: {exc}") from exc def selected_log_paths( @@ -1466,7 +1456,7 @@ def open_default_browser(url: str, console) -> None: """Best-effort browser open.""" try: if webbrowser.open(url): - console.print(f"[flocks] 已使用默认浏览器打开: {url}") + console.print(f"[flocks] 浏览器已打开: {url}") return except Exception: pass diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 4d57ce3e1..33583beaa 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -743,9 +743,14 @@ def test_build_status_lines_reports_supervisor_control_status(monkeypatch, tmp_p lines = service_manager.build_status_lines(paths) - assert "Supervisor 运行中" in lines[0] - assert "http://127.0.0.1:9000" in lines[1] - assert "http://127.0.0.1:5174" in lines[2] + assert lines[0] == "[flocks] Flocks daemon" + assert lines[1] == "[flocks] PID: 100" + assert lines[2] == "[flocks] 状态: running" + assert "http://127.0.0.1:9000" in lines[5] + assert "http://127.0.0.1:5174" in lines[6] + assert lines[9] == "[flocks] daemon: /tmp/logs/supervisor.log" + assert lines[10] == "[flocks] 后端: /tmp/logs/backend.log" + assert lines[11] == "[flocks] WebUI: /tmp/logs/webui.log" def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, tmp_path: Path) -> None: @@ -762,7 +767,7 @@ def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, lines = service_manager.build_status_lines(paths) - assert lines[0] == "[flocks] Supervisor 未运行" + assert lines[0] == "[flocks] Flocks daemon 未运行" assert calls == [] @@ -792,7 +797,7 @@ def test_start_all_does_not_duplicate_running_supervisor(monkeypatch) -> None: service_manager.start_all(service_manager.ServiceConfig(no_browser=True), console=console) assert calls == ["status"] - assert "[flocks] Supervisor 已在运行。" in console.messages + assert "[flocks] Flocks daemon 已在运行。" in console.messages def test_restart_all_uses_supervisor_control_api(monkeypatch) -> None: @@ -820,7 +825,6 @@ def test_start_all_without_stop_starts_supervisor_daemon(monkeypatch, tmp_path: monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "_start_supervisor_process", lambda _config, _paths, _console: calls.append("daemon") or SimpleNamespace(poll=lambda: None)) monkeypatch.setattr(service_manager, "_wait_for_supervisor_ready", lambda _paths, **_kwargs: calls.append("ready") or _supervisor_status_payload()) - monkeypatch.setattr(service_manager, "show_start_summary", lambda _config, _console: calls.append("summary")) monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console: calls.append("status")) monkeypatch.setattr( service_manager, @@ -830,7 +834,7 @@ def test_start_all_without_stop_starts_supervisor_daemon(monkeypatch, tmp_path: service_manager._start_all_without_stop(service_manager.ServiceConfig(no_browser=True), DummyConsole()) - assert calls == ["daemon", "ready", "summary", "status"] + assert calls == ["daemon", "ready", "status"] def test_start_all_propagates_supervisor_start_failure(monkeypatch) -> None: @@ -1678,7 +1682,7 @@ def print(self, message) -> None: service_manager.stop_all(console=console) assert calls == ["/stop"] - assert console.messages == ["[flocks] Supervisor 已停止。"] + assert console.messages == ["[flocks] Flocks daemon 已停止。"] def test_stop_all_reports_when_supervisor_is_down(monkeypatch, tmp_path: Path) -> None: @@ -1690,7 +1694,7 @@ def test_stop_all_reports_when_supervisor_is_down(monkeypatch, tmp_path: Path) - service_manager.stop_all(console) - assert console.messages == ["[flocks] Supervisor 未运行。"] + assert console.messages == ["[flocks] Flocks daemon 未运行。"] def test_status_lines_include_control_api_errors(monkeypatch, tmp_path: Path) -> None: @@ -1702,8 +1706,9 @@ def test_status_lines_include_control_api_errors(monkeypatch, tmp_path: Path) -> lines = service_manager.build_status_lines(paths) - assert "state=degraded" in lines[1] - assert "last_error=health failed" in lines[1] + backend_line = next(line for line in lines if "后端:" in line) + assert "state=degraded" in backend_line + assert "last_error=health failed" in backend_line From f689e91d0e67f08f2374fa76141936a43e40a557 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 17:10:05 +0800 Subject: [PATCH 07/28] fix(cli): clean trusted orphan service ports --- flocks/cli/service_control.py | 2 +- flocks/cli/service_manager.py | 142 ++++++++++++++++++++++++++---- tests/cli/test_service_manager.py | 84 ++++++++++++++---- 3 files changed, 194 insertions(+), 34 deletions(-) diff --git a/flocks/cli/service_control.py b/flocks/cli/service_control.py index c8e2e9280..c3a98672e 100644 --- a/flocks/cli/service_control.py +++ b/flocks/cli/service_control.py @@ -13,7 +13,7 @@ from flocks.cli.service_config import ServiceConfig, service_config_from_status_payload, service_config_payload SUPERVISOR_CONTROL_PORT = 48765 -SUPERVISOR_LOG_FILENAME = "supervisor.log" +SUPERVISOR_LOG_FILENAME = "daemon.log" SUPERVISOR_SOCKET_FILENAME = "service-daemon.sock" diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index bc0e40262..20bc336ee 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -29,7 +29,6 @@ from flocks.cli.service_control import ( read_logs, read_supervisor_status, - request_restart, request_stop, stream_logs, supervisor_is_running, @@ -818,6 +817,76 @@ def port_is_in_use(port: int, listeners: Sequence[int] | None = None) -> bool: return not _bind_port_available(port) +def _process_command_line(pid: int) -> str: + """Return a process command line for best-effort orphan detection.""" + if pid <= 0: + return "" + if sys.platform == "win32": + snapshot = _windows_process_snapshot(pid) + return str(snapshot.get("command_line") or "") if snapshot else "" + completed = subprocess.run( + ["ps", "-p", str(pid), "-o", "command="], + check=False, + capture_output=True, + text=True, + ) + return completed.stdout.strip() + + +def _trusted_flocks_port_owner(pid: int, *, service: str, root: Path) -> bool: + """Return True only for port owners that look like Flocks leftovers.""" + command_line = _process_command_line(pid).lower() + if not command_line: + return False + root_text = str(root).lower() + webui_text = str(root / "webui").lower() + if service == "backend": + return ( + ("flocks.cli.main" in command_line and "serve" in command_line) + or ("flocks" in command_line and "serve" in command_line and root_text in command_line) + ) + if service == "webui": + looks_like_preview = ("vite" in command_line and "preview" in command_line) or ( + "npm" in command_line and "preview" in command_line + ) + return looks_like_preview and (webui_text in command_line or root_text in command_line) + return False + + +def _terminate_orphan_pid(pid: int, label: str, console, *, timeout: float = 5.0) -> None: + """Terminate a trusted orphan process tree by pid.""" + console.print(f"[flocks] 清理残留 {label} 进程(PID={pid})...") + if sys.platform == "win32": + subprocess.run(["taskkill", "/PID", str(pid), "/T", "/F"], check=False, capture_output=True) + return + + targets = collect_process_tree_pids(pid) + signal_pid_list(signal.SIGTERM, targets) + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if not any(pid_is_running(target) for target in targets): + return + time.sleep(0.25) + signal_pid_list(signal.SIGKILL, targets) + + +def cleanup_trusted_port_owners(port: int, *, service: str, label: str, console, root: Path | None = None) -> list[int]: + """Clean Flocks-owned orphan processes that are still occupying a service port.""" + current_root = root or ensure_install_layout() + listeners = port_owner_pids(port) + trusted = [pid for pid in listeners if _trusted_flocks_port_owner(pid, service=service, root=current_root)] + for pid in trusted: + _terminate_orphan_pid(pid, label, console) + if trusted: + deadline = time.monotonic() + 5.0 + while time.monotonic() < deadline: + current = port_owner_pids(port) + if not any(pid in trusted for pid in current): + break + time.sleep(0.25) + return trusted + + def _is_reachable_response(response: httpx.Response) -> bool: """Return True when an HTTP endpoint is reachable enough for startup checks.""" return response.status_code < 500 @@ -949,10 +1018,19 @@ def _start_backend_process( listeners = port_owner_pids(config.backend_port) if listeners: - raise ServiceError( - f"后端端口 {config.backend_port} 已被占用 (PID: {_join_pids(listeners)})," - "请先执行 `flocks stop` 或手动清理残留进程。" + cleanup_trusted_port_owners( + config.backend_port, + service="backend", + label="后端", + console=console, + root=root, ) + listeners = port_owner_pids(config.backend_port) + if listeners: + raise ServiceError( + f"后端端口 {config.backend_port} 已被占用 (PID: {_join_pids(listeners)})," + "请先执行 `flocks stop` 或手动清理残留进程。" + ) if port_is_in_use(config.backend_port, listeners): raise ServiceError( f"后端端口 {config.backend_port} 已被占用,但当前环境无法识别占用 PID;" @@ -1011,11 +1089,20 @@ def _start_frontend_process( ) else: - raise ServiceError( - f"WebUI 端口 {config.frontend_port} 已被占用 (PID: {_join_pids(listeners)})," - "请先执行 `flocks stop` 或手动清理残留进程。" + cleanup_trusted_port_owners( + config.frontend_port, + service="webui", + label="WebUI", + console=console, + root=root, ) - elif port_is_in_use(config.frontend_port, listeners): + listeners = port_owner_pids(config.frontend_port) + if listeners: + raise ServiceError( + f"WebUI 端口 {config.frontend_port} 已被占用 (PID: {_join_pids(listeners)})," + "请先执行 `flocks stop` 或手动清理残留进程。" + ) + if port_is_in_use(config.frontend_port, listeners): raise ServiceError( f"WebUI 端口 {config.frontend_port} 已被占用,但当前环境无法识别占用 PID;" "请先安装 lsof 或手动清理残留进程。" @@ -1191,9 +1278,15 @@ def _start_supervisor_process(config: ServiceConfig, paths: RuntimePaths, consol def stop_all(console) -> None: """Stop managed services through the supervisor control API.""" paths = ensure_runtime_dirs() + cleanup_config = ServiceConfig() if not supervisor_is_running(paths): console.print("[flocks] Flocks daemon 未运行。") + cleanup_orphan_service_ports(cleanup_config, console) return + try: + cleanup_config = read_supervisor_status(paths=paths, timeout=1.0).config + except Exception: + pass try: request_stop(paths=paths, timeout=2.0) except Exception as exc: @@ -1202,6 +1295,7 @@ def stop_all(console) -> None: deadline = time.monotonic() + 20.0 while time.monotonic() < deadline: if not supervisor_is_running(paths): + cleanup_orphan_service_ports(cleanup_config, console) console.print("[flocks] Flocks daemon 已停止。") return time.sleep(0.5) @@ -1236,16 +1330,28 @@ def start_all(config: ServiceConfig, console) -> None: def restart_all(config: ServiceConfig, console) -> None: - """Restart backend and frontend through the supervisor control API.""" - paths = ensure_runtime_dirs() - if not supervisor_is_running(paths): - start_all(config, console) - return - try: - status = request_restart(config, paths=paths, timeout=180.0) - except Exception as exc: - raise ServiceError(f"无法请求 Flocks daemon 重启: {exc}") from exc - _print_status_payload(status.raw, console) + """Restart by stopping the daemon first, then starting a fresh daemon.""" + stop_all(console) + start_all(config, console) + + +def cleanup_orphan_service_ports(config: ServiceConfig, console) -> None: + """Clean trusted Flocks leftovers on configured backend/WebUI ports.""" + root = ensure_install_layout() + cleanup_trusted_port_owners( + config.backend_port, + service="backend", + label="后端", + console=console, + root=root, + ) + cleanup_trusted_port_owners( + config.frontend_port, + service="webui", + label="WebUI", + console=console, + root=root, + ) def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 33583beaa..40962ecb7 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -712,7 +712,7 @@ def _supervisor_status_payload() -> dict[str, object]: "daemon": { "pid": 100, "state": "running", - "log_path": "/tmp/logs/supervisor.log", + "log_path": "/tmp/logs/daemon.log", }, "backend": { "pid": 111, @@ -748,7 +748,7 @@ def test_build_status_lines_reports_supervisor_control_status(monkeypatch, tmp_p assert lines[2] == "[flocks] 状态: running" assert "http://127.0.0.1:9000" in lines[5] assert "http://127.0.0.1:5174" in lines[6] - assert lines[9] == "[flocks] daemon: /tmp/logs/supervisor.log" + assert lines[9] == "[flocks] daemon: /tmp/logs/daemon.log" assert lines[10] == "[flocks] 后端: /tmp/logs/backend.log" assert lines[11] == "[flocks] WebUI: /tmp/logs/webui.log" @@ -800,22 +800,15 @@ def test_start_all_does_not_duplicate_running_supervisor(monkeypatch) -> None: assert "[flocks] Flocks daemon 已在运行。" in console.messages -def test_restart_all_uses_supervisor_control_api(monkeypatch) -> None: +def test_restart_all_stops_then_starts_daemon(monkeypatch) -> None: call_order: list[str] = [] - paths = _make_runtime_paths(Path("/tmp/flocks-test")) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: (call_order.append("ensure_runtime_dirs"), paths)[1]) - monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) - monkeypatch.setattr( - service_manager, - "request_restart", - lambda _config, **_kwargs: call_order.append("/restart") or _supervisor_status(), - ) - monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console: call_order.append("print_status")) + monkeypatch.setattr(service_manager, "stop_all", lambda _console: call_order.append("stop")) + monkeypatch.setattr(service_manager, "start_all", lambda _config, _console: call_order.append("start")) service_manager.restart_all(service_manager.ServiceConfig(), console=None) - assert call_order == ["ensure_runtime_dirs", "/restart", "print_status"] + assert call_order == ["stop", "start"] def test_start_all_without_stop_starts_supervisor_daemon(monkeypatch, tmp_path: Path) -> None: @@ -1510,6 +1503,31 @@ def test_start_backend_raises_when_port_has_listener(monkeypatch, tmp_path: Path service_manager._start_backend_process(service_manager.ServiceConfig(), DummyConsole()) +def test_start_backend_cleans_trusted_orphan_port_owner(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + paths.run_dir.mkdir(parents=True) + paths.log_dir.mkdir(parents=True) + owners = iter([[9999], [9999], [], []]) + cleaned: list[int] = [] + + monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: next(owners)) + monkeypatch.setattr(service_manager, "_process_command_line", lambda _pid: f"{tmp_path}/.venv/bin/python -m flocks.cli.main serve") + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + monkeypatch.setattr(service_manager, "port_is_in_use", lambda *_args, **_kwargs: False) + monkeypatch.setattr(service_manager, "resolve_flocks_cli_command", lambda _root: ["/env/bin/python", "-m", "flocks.cli.main"]) + monkeypatch.setattr(service_manager, "_spawn_process", lambda command, **_kwargs: SimpleNamespace(pid=1234, args=command)) + monkeypatch.setattr(service_manager, "process_runtime_record", lambda *_args, **_kwargs: service_manager.RuntimeRecord(pid=1234)) + monkeypatch.setattr(service_manager, "_log_startup_config", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) + + process = service_manager._start_backend_process(service_manager.ServiceConfig(), DummyConsole(), paths=paths) + + assert process.pid == 1234 + assert cleaned == [9999] + + def test_start_backend_raises_when_port_in_use_without_pid_lookup(monkeypatch, tmp_path: Path) -> None: paths = service_manager.RuntimePaths( root=tmp_path, @@ -1533,6 +1551,40 @@ def test_start_backend_raises_when_port_in_use_without_pid_lookup(monkeypatch, t service_manager._start_backend_process(service_manager.ServiceConfig(), DummyConsole()) +def test_start_webui_cleans_trusted_orphan_port_owner(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + paths.run_dir.mkdir(parents=True) + paths.log_dir.mkdir(parents=True) + webui_dir = tmp_path / "webui" + webui_dir.mkdir() + owners = iter([[52372], [52372], [], []]) + cleaned: list[int] = [] + + monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: next(owners)) + monkeypatch.setattr(service_manager, "_read_upgrade_runtime_info", lambda _port: service_manager.UpgradeRuntimeInfo()) + monkeypatch.setattr( + service_manager, + "_process_command_line", + lambda _pid: f"node {webui_dir}/node_modules/vite/bin/vite.js preview --host 127.0.0.1 --port 5173", + ) + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + monkeypatch.setattr(service_manager, "port_is_in_use", lambda *_args, **_kwargs: False) + monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: "/usr/bin/npm") + monkeypatch.setattr(service_manager, "node_version_satisfies_requirement", lambda: True) + monkeypatch.setattr(service_manager.subprocess, "run", lambda *_args, **_kwargs: SimpleNamespace(returncode=0)) + monkeypatch.setattr(service_manager, "_spawn_process", lambda command, **_kwargs: SimpleNamespace(pid=5678, args=command)) + monkeypatch.setattr(service_manager, "process_runtime_record", lambda *_args, **_kwargs: service_manager.RuntimeRecord(pid=5678)) + monkeypatch.setattr(service_manager, "_log_startup_config", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) + + process = service_manager._start_frontend_process(service_manager.ServiceConfig(), DummyConsole(), paths=paths) + + assert process.pid == 5678 + assert cleaned == [52372] + + def test_spawn_process_uses_hidden_window_flags_on_windows(monkeypatch, tmp_path: Path) -> None: captured = {} log_path = tmp_path / "logs" / "backend.log" @@ -1677,11 +1729,12 @@ def print(self, message) -> None: monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: next(states)) monkeypatch.setattr(service_manager, "request_stop", lambda **_kwargs: calls.append("/stop") or {"status": "stopping"}) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda _config, _console: calls.append("cleanup")) console = FakeConsole() service_manager.stop_all(console=console) - assert calls == ["/stop"] + assert calls == ["/stop", "cleanup"] assert console.messages == ["[flocks] Flocks daemon 已停止。"] @@ -1691,10 +1744,11 @@ def test_stop_all_reports_when_supervisor_is_down(monkeypatch, tmp_path: Path) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: False) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda _config, _console: console.messages.append("cleanup")) service_manager.stop_all(console) - assert console.messages == ["[flocks] Flocks daemon 未运行。"] + assert console.messages == ["[flocks] Flocks daemon 未运行。", "cleanup"] def test_status_lines_include_control_api_errors(monkeypatch, tmp_path: Path) -> None: From 4db629b233d9bded8c8edd8ad4e3f2f7a2303c74 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 17:48:13 +0800 Subject: [PATCH 08/28] fix(cli): preserve daemon lifecycle compatibility --- flocks/cli/service_manager.py | 360 ++++++++++++++++++++++++++---- flocks/cli/service_supervisor.py | 6 +- tests/cli/test_service_manager.py | 181 ++++++++++++++- 3 files changed, 499 insertions(+), 48 deletions(-) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index 20bc336ee..d56a68669 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -25,6 +25,7 @@ import httpx +from flocks.browser.admin import stop_all_daemons as stop_all_browser_daemons from flocks.cli.service_config import ServiceConfig, loopback_host from flocks.cli.service_control import ( read_logs, @@ -36,6 +37,11 @@ supervisor_socket_path, ) +try: + import fcntl +except ImportError: # pragma: no cover - unavailable on Windows + fcntl = None + MIN_NODE_MAJOR = 22 FOLLOW_POLL_INTERVAL = 0.5 MAX_SERVICE_LOG_BYTES = 1024 * 1024 * 1024 @@ -398,12 +404,7 @@ def process_runtime_record( command: Sequence[str], ) -> RuntimeRecord: """Build runtime metadata for a freshly started service process.""" - pgid = None - if sys.platform != "win32": - try: - pgid = os.getpgid(process.pid) - except OSError: - pgid = None + pgid = _process_group_id(process) return RuntimeRecord( pid=process.pid, pgid=pgid, @@ -414,6 +415,24 @@ def process_runtime_record( ) +def _process_group_id(process: subprocess.Popen) -> int | None: + """Return a cached or live Unix process group id for a managed process.""" + if sys.platform == "win32": + return None + cached = getattr(process, "_flocks_pgid", None) + if isinstance(cached, int) and cached > 0: + return cached + try: + pgid = os.getpgid(process.pid) + except OSError: + return None + try: + setattr(process, "_flocks_pgid", pgid) + except Exception: + pass + return pgid + + def read_pid(pid_file: Path) -> int | None: """Read a pid file if it exists and contains a valid integer.""" record = read_runtime_record(pid_file) @@ -887,6 +906,62 @@ def cleanup_trusted_port_owners(port: int, *, service: str, label: str, console, return trusted +def _process_list_pids() -> list[int]: + """Return process ids for best-effort trusted orphan cleanup.""" + if sys.platform == "win32": + completed = subprocess.run( + [ + "powershell", + "-NoProfile", + "-Command", + "Get-CimInstance Win32_Process | ForEach-Object { $_.ProcessId }", + ], + check=False, + capture_output=True, + text=True, + ) + else: + completed = subprocess.run( + ["ps", "-eo", "pid="], + check=False, + capture_output=True, + text=True, + ) + if completed.returncode != 0: + return [] + pids = [] + for line in completed.stdout.splitlines(): + value = line.strip() + if value.isdigit(): + pids.append(int(value)) + return sorted(dict.fromkeys(pids)) + + +def _trusted_flocks_daemon_owner(pid: int, *, root: Path) -> bool: + """Return True only for daemon processes that belong to this Flocks install.""" + if pid <= 0 or pid == os.getpid(): + return False + command_line = _process_command_line(pid).lower() + if not command_line: + return False + root_text = str(root).lower() + return "service-daemon" in command_line and "flocks" in command_line and root_text in command_line + + +def trusted_daemon_process_pids(*, root: Path | None = None) -> list[int]: + """Return trusted daemon pids for the current Flocks install.""" + current_root = root or ensure_install_layout() + return [pid for pid in _process_list_pids() if _trusted_flocks_daemon_owner(pid, root=current_root)] + + +def cleanup_trusted_daemon_processes(*, console, root: Path | None = None) -> list[int]: + """Clean trusted Flocks daemon processes whose control API is unavailable.""" + trusted = trusted_daemon_process_pids(root=root) + for pid in trusted: + _terminate_orphan_pid(pid, "daemon", console) + return trusted + + def _is_reachable_response(response: httpx.Response) -> bool: """Return True when an HTTP endpoint is reachable enough for startup checks.""" return response.status_code < 500 @@ -961,13 +1036,15 @@ def _terminate_process( """Terminate a process and its process group without scanning service ports.""" if process is None: return - if process.poll() is not None: - return record = process_runtime_record(process, host=None, port=None, command=()) + if process.poll() is not None and not process_group_is_running(record.pgid): + return + console.print(f"[flocks] 停止 {name}(PID={process.pid})...") if sys.platform == "win32": - subprocess.run(["taskkill", "/PID", str(process.pid), "/T", "/F"], check=False, capture_output=True) + if process.poll() is None: + subprocess.run(["taskkill", "/PID", str(process.pid), "/T", "/F"], check=False, capture_output=True) else: if record.pgid is not None: signal_process_group(signal.SIGTERM, record.pgid) @@ -982,7 +1059,8 @@ def _terminate_process( console.print(f"[flocks] {name} 未在预期时间内退出,强制终止...") if sys.platform == "win32": - subprocess.run(["taskkill", "/PID", str(process.pid), "/T", "/F"], check=False, capture_output=True) + if process.poll() is None: + subprocess.run(["taskkill", "/PID", str(process.pid), "/T", "/F"], check=False, capture_output=True) else: if record.pgid is not None: signal_process_group(signal.SIGKILL, record.pgid) @@ -1204,6 +1282,59 @@ def signal_process_group(sig: signal.Signals, pgid: int | None) -> None: pass +def _recorded_port(pid_file: Path, default: int) -> int: + """Return the port from a legacy runtime record, falling back to *default*.""" + record = read_runtime_record(pid_file) + if record is not None and record.port is not None: + return record.port + return default + + +def _recorded_host(pid_file: Path, default: str) -> str: + """Return the host from a legacy runtime record, falling back to *default*.""" + record = read_runtime_record(pid_file) + if record is not None and record.host: + return record.host + return default + + +@contextlib.contextmanager +def service_lock(paths: RuntimePaths): + """Serialize CLI lifecycle commands while starting/stopping the daemon.""" + lock_path = paths.run_dir / "service.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + handle = lock_path.open("a+", encoding="utf-8") + unlock_windows = None + try: + try: + if sys.platform == "win32": + import msvcrt + + handle.seek(0) + handle.write("0") + handle.flush() + handle.seek(0) + msvcrt.locking(handle.fileno(), msvcrt.LK_NBLCK, 1) + unlock_windows = msvcrt + else: + if fcntl is None: # pragma: no cover - defensive + raise OSError("fcntl unavailable") + fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB) + except OSError as error: + raise ServiceError("另一个 flocks 命令正在执行,请稍后重试。") from error + yield + finally: + try: + if unlock_windows is not None: + handle.seek(0) + unlock_windows.locking(handle.fileno(), unlock_windows.LK_UNLCK, 1) + elif fcntl is not None and sys.platform != "win32": + fcntl.flock(handle, fcntl.LOCK_UN) + except OSError: + pass + handle.close() + + def _log_startup_config( log_path: Path, name: str, @@ -1275,16 +1406,64 @@ def _start_supervisor_process(config: ServiceConfig, paths: RuntimePaths, consol return _spawn_process(command, cwd=root, log_path=log_path, env=env) -def stop_all(console) -> None: - """Stop managed services through the supervisor control API.""" - paths = ensure_runtime_dirs() +def _service_config_matches(left: ServiceConfig, right: ServiceConfig) -> bool: + """Return True when two configs manage the same service endpoints.""" + return ( + left.backend_host == right.backend_host + and left.backend_port == right.backend_port + and left.frontend_host == right.frontend_host + and left.frontend_port == right.frontend_port + ) + + +def _legacy_runtime_config(paths: RuntimePaths, fallback: ServiceConfig) -> ServiceConfig: + """Build cleanup config from legacy runtime records when present.""" + return ServiceConfig( + backend_host=_recorded_host(paths.backend_pid, fallback.backend_host), + backend_port=_recorded_port(paths.backend_pid, fallback.backend_port), + frontend_host=_recorded_host(paths.frontend_pid, fallback.frontend_host), + frontend_port=_recorded_port(paths.frontend_pid, fallback.frontend_port), + no_browser=fallback.no_browser, + skip_frontend_build=fallback.skip_frontend_build, + ) + + +def _unique_cleanup_configs(*configs: ServiceConfig) -> list[ServiceConfig]: + """Deduplicate cleanup configs by backend/WebUI ports.""" + result: list[ServiceConfig] = [] + seen: set[tuple[int, int]] = set() + for config in configs: + key = (config.backend_port, config.frontend_port) + if key in seen: + continue + seen.add(key) + result.append(config) + return result + + +def cleanup_legacy_runtime_processes(paths: RuntimePaths, console) -> None: + """Clean legacy pid/runtime records left by pre-daemon service starts.""" + for pid_file, name in ( + (watchdog_pid_path(paths), "watchdog"), + (paths.frontend_pid, "WebUI"), + (paths.backend_pid, "后端"), + ): + stop_runtime_record_process(pid_file, name, console) + + +def _stop_all_unlocked(console, *, paths: RuntimePaths) -> None: + """Stop managed services; caller must hold the lifecycle lock.""" cleanup_config = ServiceConfig() + legacy_config = _legacy_runtime_config(paths, cleanup_config) if not supervisor_is_running(paths): console.print("[flocks] Flocks daemon 未运行。") - cleanup_orphan_service_ports(cleanup_config, console) + cleanup_legacy_runtime_processes(paths, console) + cleanup_orphan_service_ports(cleanup_config, console, extra_configs=[legacy_config]) + stop_all_browser_daemons() return try: cleanup_config = read_supervisor_status(paths=paths, timeout=1.0).config + legacy_config = _legacy_runtime_config(paths, cleanup_config) except Exception: pass try: @@ -1295,13 +1474,22 @@ def stop_all(console) -> None: deadline = time.monotonic() + 20.0 while time.monotonic() < deadline: if not supervisor_is_running(paths): - cleanup_orphan_service_ports(cleanup_config, console) + cleanup_legacy_runtime_processes(paths, console) + cleanup_orphan_service_ports(cleanup_config, console, extra_configs=[legacy_config]) + stop_all_browser_daemons() console.print("[flocks] Flocks daemon 已停止。") return time.sleep(0.5) raise ServiceError("Flocks daemon 未在预期时间内退出。") +def stop_all(console) -> None: + """Stop managed services through the supervisor control API.""" + paths = ensure_runtime_dirs() + with service_lock(paths): + _stop_all_unlocked(console, paths=paths) + + def _start_all_without_stop(config: ServiceConfig, console) -> None: """Start the supervisor daemon, then print access summary.""" paths = ensure_runtime_dirs() @@ -1312,15 +1500,23 @@ def _start_all_without_stop(config: ServiceConfig, console) -> None: open_default_browser(config.frontend_url, console) -def start_all(config: ServiceConfig, console) -> None: - """Ensure the supervisor daemon is running.""" - paths = ensure_runtime_dirs() +def _start_all_unlocked(config: ServiceConfig, console, *, paths: RuntimePaths) -> None: + """Ensure the supervisor daemon is running; caller must hold lifecycle lock.""" if supervisor_is_running(paths): + status = None + try: + status = read_supervisor_status(paths=paths, timeout=1.0) + except Exception: + status = None + if status is not None and not _service_config_matches(config, status.config): + console.print("[flocks] Flocks daemon 已在运行,但配置已变化,正在按新配置重启...") + _stop_all_unlocked(console, paths=paths) + _start_all_without_stop(config, console) + return console.print("[flocks] Flocks daemon 已在运行。") show_status(console) if not config.no_browser: try: - status = read_supervisor_status(paths=paths, timeout=1.0) url = _frontend_url_from_status(status, config.frontend_url) except Exception: url = config.frontend_url @@ -1329,29 +1525,40 @@ def start_all(config: ServiceConfig, console) -> None: _start_all_without_stop(config, console) +def start_all(config: ServiceConfig, console) -> None: + """Ensure the supervisor daemon is running.""" + paths = ensure_runtime_dirs() + with service_lock(paths): + _start_all_unlocked(config, console, paths=paths) + + def restart_all(config: ServiceConfig, console) -> None: """Restart by stopping the daemon first, then starting a fresh daemon.""" - stop_all(console) - start_all(config, console) + paths = ensure_runtime_dirs() + with service_lock(paths): + _stop_all_unlocked(console, paths=paths) + _start_all_unlocked(config, console, paths=paths) -def cleanup_orphan_service_ports(config: ServiceConfig, console) -> None: +def cleanup_orphan_service_ports(config: ServiceConfig, console, *, extra_configs: Sequence[ServiceConfig] = ()) -> None: """Clean trusted Flocks leftovers on configured backend/WebUI ports.""" root = ensure_install_layout() - cleanup_trusted_port_owners( - config.backend_port, - service="backend", - label="后端", - console=console, - root=root, - ) - cleanup_trusted_port_owners( - config.frontend_port, - service="webui", - label="WebUI", - console=console, - root=root, - ) + cleanup_trusted_daemon_processes(console=console, root=root) + for cleanup_config in _unique_cleanup_configs(config, *extra_configs): + cleanup_trusted_port_owners( + cleanup_config.backend_port, + service="backend", + label="后端", + console=console, + root=root, + ) + cleanup_trusted_port_owners( + cleanup_config.frontend_port, + service="webui", + label="WebUI", + console=console, + root=root, + ) def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: @@ -1360,6 +1567,18 @@ def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: try: status = read_supervisor_status(paths=current) except Exception: + residual_daemons = [] + try: + residual_daemons = trusted_daemon_process_pids(root=ensure_install_layout()) + except Exception: + residual_daemons = [] + if residual_daemons: + return [ + "[flocks] Flocks daemon control API 未运行", + f"[flocks] 检测到残留 daemon 进程: PID={_join_pids(residual_daemons)}", + f"[flocks] 日志: {supervisor_log_path(current)}", + "[flocks] 可执行 `flocks stop` 清理残留进程。", + ] return [ "[flocks] Flocks daemon 未运行", f"[flocks] 日志: {supervisor_log_path(current)}", @@ -1436,7 +1655,9 @@ def show_logs( try: payload = read_logs(service=service, lines=lines, paths=paths, timeout=5.0) except Exception as exc: - raise ServiceError(f"无法通过 Flocks daemon 读取日志: {exc}") from exc + console.print(f"[flocks] Flocks daemon 日志接口不可用,改为读取本地日志文件: {exc}") + _show_local_logs(console, paths, backend=backend, webui=webui, follow=False, lines=lines) + return logs = payload.get("logs") if isinstance(payload.get("logs"), dict) else {} for prefix, entry in logs.items(): if not isinstance(entry, dict): @@ -1453,7 +1674,8 @@ def show_logs( except KeyboardInterrupt: return except Exception as exc: - raise ServiceError(f"无法通过 Flocks daemon 跟随日志: {exc}") from exc + console.print(f"[flocks] Flocks daemon 日志接口不可用,改为跟随本地日志文件: {exc}") + _show_local_logs(console, paths, backend=backend, webui=webui, follow=True, lines=lines) def selected_log_paths( @@ -1470,6 +1692,64 @@ def selected_log_paths( return [paths.backend_log, paths.frontend_log] +def _selected_log_entries(paths: RuntimePaths, *, backend: bool = False, webui: bool = False) -> list[tuple[str, Path]]: + """Return local log files selected by CLI flags.""" + if backend and not webui: + return [("backend", paths.backend_log)] + if webui and not backend: + return [("webui", paths.frontend_log)] + return [ + ("backend", paths.backend_log), + ("webui", paths.frontend_log), + ("daemon", supervisor_log_path(paths)), + ] + + +def _show_local_logs( + console, + paths: RuntimePaths, + *, + backend: bool = False, + webui: bool = False, + follow: bool = True, + lines: int = 50, +) -> None: + """Print local log files when the daemon control API is unavailable.""" + selections = _selected_log_entries(paths, backend=backend, webui=webui) + for prefix, path in selections: + path.parent.mkdir(parents=True, exist_ok=True) + path.touch(exist_ok=True) + console.print(f"[{prefix}] --- {path} ---") + for line in tail_lines(path, lines): + console.print(f"[{prefix}] {line}") + + if not follow: + return + + handles = {} + try: + for prefix, path in selections: + handle = path.open("r", encoding="utf-8", errors="replace") + handle.seek(0, os.SEEK_END) + handles[prefix] = handle + while True: + emitted = False + for prefix, handle in handles.items(): + while True: + line = handle.readline() + if not line: + break + emitted = True + console.print(f"[{prefix}] {line.rstrip()}") + if not emitted: + time.sleep(FOLLOW_POLL_INTERVAL) + except KeyboardInterrupt: + return + finally: + for handle in handles.values(): + handle.close() + + def tail_lines(path: Path, lines: int) -> list[str]: """Read the last N lines from a text file.""" with path.open("r", encoding="utf-8", errors="replace") as handle: @@ -1671,7 +1951,7 @@ def _spawn_process( _cap_service_log_file(log_path, MAX_SERVICE_LOG_BYTES) handle = log_path.open("a", encoding="utf-8") try: - return subprocess.Popen( + process = subprocess.Popen( list(command), cwd=cwd, env=env, @@ -1681,6 +1961,8 @@ def _spawn_process( creationflags=creationflags, **kwargs, ) + _process_group_id(process) + return process finally: handle.close() diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py index 2eccefc31..9864a7b42 100644 --- a/flocks/cli/service_supervisor.py +++ b/flocks/cli/service_supervisor.py @@ -386,13 +386,13 @@ def _log_paths_for_service(self, service_name: str) -> list[tuple[str, Path]]: return [("backend", self.paths.backend_log)] if service_name == "webui": return [("webui", self.paths.frontend_log)] - if service_name == "supervisor": - return [("supervisor", supervisor_log_path(self.paths))] + if service_name in {"daemon", "supervisor"}: + return [("daemon", supervisor_log_path(self.paths))] if service_name == "all": return [ ("backend", self.paths.backend_log), ("webui", self.paths.frontend_log), - ("supervisor", supervisor_log_path(self.paths)), + ("daemon", supervisor_log_path(self.paths)), ] return [] diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 40962ecb7..0c0b4cb27 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -379,6 +379,38 @@ def test_selected_log_paths_support_specific_targets(tmp_path: Path) -> None: assert service_manager.selected_log_paths(paths) == [paths.backend_log, paths.frontend_log] +def test_show_logs_falls_back_to_local_files_when_daemon_unavailable(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + paths.log_dir.mkdir(parents=True) + paths.backend_log.write_text("backend-one\nbackend-two\n", encoding="utf-8") + paths.frontend_log.write_text("webui-one\n", encoding="utf-8") + (paths.log_dir / "daemon.log").write_text("daemon-one\n", encoding="utf-8") + console = DummyConsole() + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr( + service_manager, + "read_logs", + lambda **_kwargs: (_ for _ in ()).throw(service_manager.ServiceError("down")), + ) + + service_manager.show_logs(console, follow=False, lines=1) + + assert any("改为读取本地日志文件" in message for message in console.messages) + assert "[backend] backend-two" in console.messages + assert "[webui] webui-one" in console.messages + assert "[daemon] daemon-one" in console.messages + + +def test_daemon_log_service_name_has_supervisor_alias(tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) + daemon.paths = paths + + assert daemon._log_paths_for_service("daemon") == [("daemon", paths.log_dir / "daemon.log")] + assert daemon._log_paths_for_service("supervisor") == [("daemon", paths.log_dir / "daemon.log")] + + def test_tail_lines_returns_recent_content(tmp_path: Path) -> None: log_file = tmp_path / "backend.log" log_file.write_text("a\nb\nc\n", encoding="utf-8") @@ -764,6 +796,7 @@ def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, ) monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: calls.append("port_owner") or []) monkeypatch.setattr(service_manager, "port_is_in_use", lambda *_args, **_kwargs: calls.append("port_in_use") or False) + monkeypatch.setattr(service_manager, "trusted_daemon_process_pids", lambda **_kwargs: []) lines = service_manager.build_status_lines(paths) @@ -771,6 +804,27 @@ def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, assert calls == [] +def test_build_status_lines_reports_residual_daemon_when_control_api_is_down(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + + monkeypatch.setattr( + service_manager, + "read_supervisor_status", + lambda *_args, **_kwargs: (_ for _ in ()).throw(service_manager.ServiceError("down")), + ) + monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) + monkeypatch.setattr(service_manager, "trusted_daemon_process_pids", lambda **_kwargs: [52058]) + + lines = service_manager.build_status_lines(paths) + + assert lines == [ + "[flocks] Flocks daemon control API 未运行", + "[flocks] 检测到残留 daemon 进程: PID=52058", + f"[flocks] 日志: {paths.log_dir / 'daemon.log'}", + "[flocks] 可执行 `flocks stop` 清理残留进程。", + ] + + def test_start_all_starts_supervisor_when_control_api_is_down(monkeypatch) -> None: call_order: list[str] = [] paths = _make_runtime_paths(Path("/tmp/flocks-test")) @@ -800,11 +854,46 @@ def test_start_all_does_not_duplicate_running_supervisor(monkeypatch) -> None: assert "[flocks] Flocks daemon 已在运行。" in console.messages +def test_start_all_restarts_running_daemon_when_config_changes(monkeypatch) -> None: + calls: list[str] = [] + console = DummyConsole() + paths = _make_runtime_paths(Path("/tmp/flocks-test")) + payload = _supervisor_status_payload() + payload["config"] = { + "backend_host": "127.0.0.1", + "backend_port": 8000, + "frontend_host": "127.0.0.1", + "frontend_port": 5173, + } + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status(payload)) + monkeypatch.setattr(service_manager, "_stop_all_unlocked", lambda _console, **_kwargs: calls.append("stop")) + monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda _config, _console: calls.append("start")) + + service_manager.start_all( + service_manager.ServiceConfig( + backend_host="0.0.0.0", + backend_port=9000, + frontend_host="0.0.0.0", + frontend_port=5273, + no_browser=True, + ), + console, + ) + + assert calls == ["stop", "start"] + assert "[flocks] Flocks daemon 已在运行,但配置已变化,正在按新配置重启..." in console.messages + + def test_restart_all_stops_then_starts_daemon(monkeypatch) -> None: call_order: list[str] = [] + paths = _make_runtime_paths(Path("/tmp/flocks-test")) - monkeypatch.setattr(service_manager, "stop_all", lambda _console: call_order.append("stop")) - monkeypatch.setattr(service_manager, "start_all", lambda _config, _console: call_order.append("start")) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "_stop_all_unlocked", lambda _console, **_kwargs: call_order.append("stop")) + monkeypatch.setattr(service_manager, "_start_all_unlocked", lambda _config, _console, **_kwargs: call_order.append("start")) service_manager.restart_all(service_manager.ServiceConfig(), console=None) @@ -1585,6 +1674,27 @@ def test_start_webui_cleans_trusted_orphan_port_owner(monkeypatch, tmp_path: Pat assert cleaned == [52372] +def test_cleanup_trusted_daemon_processes_cleans_current_install_only(monkeypatch, tmp_path: Path) -> None: + cleaned: list[int] = [] + + monkeypatch.setattr(service_manager, "_process_list_pids", lambda: [111, 222, 333]) + monkeypatch.setattr( + service_manager, + "_process_command_line", + lambda pid: { + 111: f"{tmp_path}/.venv/bin/python -m flocks.cli.main service-daemon --server-port 8000", + 222: "/other/flocks/.venv/bin/python -m flocks.cli.main service-daemon --server-port 8000", + 333: f"{tmp_path}/.venv/bin/python -m flocks.cli.main serve --port 8000", + }[pid], + ) + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + + result = service_manager.cleanup_trusted_daemon_processes(console=DummyConsole(), root=tmp_path) + + assert result == [111] + assert cleaned == [111] + + def test_spawn_process_uses_hidden_window_flags_on_windows(monkeypatch, tmp_path: Path) -> None: captured = {} log_path = tmp_path / "logs" / "backend.log" @@ -1633,10 +1743,12 @@ def fake_popen(*args, **kwargs): monkeypatch.setattr(service_manager.sys, "platform", "darwin") monkeypatch.setattr(service_manager.subprocess, "Popen", fake_popen) + monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: 4321 if pid == 9876 else pid) process = service_manager._spawn_process(["python", "-m", "uvicorn"], cwd=tmp_path, log_path=log_path) assert process.pid == 9876 + assert process._flocks_pgid == 4321 assert captured["args"] == (["python", "-m", "uvicorn"],) assert captured["kwargs"]["cwd"] == tmp_path assert captured["kwargs"]["creationflags"] == 0 @@ -1644,6 +1756,26 @@ def fake_popen(*args, **kwargs): assert "startupinfo" not in captured["kwargs"] +def test_terminate_process_stops_cached_process_group_after_root_exits(monkeypatch) -> None: + signals: list[tuple[str, int]] = [] + group_running = iter([True, False]) + process = SimpleNamespace(pid=9876, returncode=0, poll=lambda: 0, _flocks_pgid=4321) + + monkeypatch.setattr(service_manager.sys, "platform", "darwin") + monkeypatch.setattr(service_manager, "process_group_is_running", lambda _pgid: next(group_running)) + monkeypatch.setattr( + service_manager, + "signal_process_group", + lambda sig, pgid: signals.append((sig.name, pgid)), + ) + monkeypatch.setattr(service_manager, "signal_pid_list", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: []) + + service_manager._terminate_process(process, "WebUI", DummyConsole(), timeout=0.1) + + assert signals == [("SIGTERM", 4321)] + + def test_spawn_process_appends_without_rotated_suffix(monkeypatch, tmp_path: Path) -> None: log_path = tmp_path / "logs" / "backend.log" log_path.parent.mkdir(parents=True) @@ -1729,12 +1861,14 @@ def print(self, message) -> None: monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: next(states)) monkeypatch.setattr(service_manager, "request_stop", lambda **_kwargs: calls.append("/stop") or {"status": "stopping"}) - monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda _config, _console: calls.append("cleanup")) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda _paths, _console: calls.append("legacy")) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda _config, _console, **_kwargs: calls.append("cleanup")) + monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: calls.append("browser")) console = FakeConsole() service_manager.stop_all(console=console) - assert calls == ["/stop", "cleanup"] + assert calls == ["/stop", "legacy", "cleanup", "browser"] assert console.messages == ["[flocks] Flocks daemon 已停止。"] @@ -1744,11 +1878,46 @@ def test_stop_all_reports_when_supervisor_is_down(monkeypatch, tmp_path: Path) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: False) - monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda _config, _console: console.messages.append("cleanup")) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda _paths, _console: console.messages.append("legacy")) + monkeypatch.setattr( + service_manager, + "cleanup_orphan_service_ports", + lambda _config, _console, **_kwargs: console.messages.append("cleanup"), + ) + monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: console.messages.append("browser")) + + service_manager.stop_all(console) + + assert console.messages == ["[flocks] Flocks daemon 未运行。", "legacy", "cleanup", "browser"] + + +def test_stop_all_uses_legacy_runtime_ports_for_orphan_cleanup(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + console = DummyConsole() + captured: list[service_manager.ServiceConfig] = [] + paths.run_dir.mkdir(parents=True) + _write_legacy_runtime_record( + paths.backend_pid, + service_manager.RuntimeRecord(pid=111, host="0.0.0.0", port=9000), + ) + _write_legacy_runtime_record( + paths.frontend_pid, + service_manager.RuntimeRecord(pid=222, host="0.0.0.0", port=5273), + ) + + def fake_cleanup(config, _console, *, extra_configs=()): + captured.append(config) + captured.extend(extra_configs) + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: False) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", fake_cleanup) + monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: None) service_manager.stop_all(console) - assert console.messages == ["[flocks] Flocks daemon 未运行。", "cleanup"] + assert [(config.backend_port, config.frontend_port) for config in captured] == [(8000, 5173), (9000, 5273)] def test_status_lines_include_control_api_errors(monkeypatch, tmp_path: Path) -> None: From b69a681e18541440943e2507b53b28425de29c4b Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 17:57:04 +0800 Subject: [PATCH 09/28] fix(cli): remove supervisor log alias --- flocks/cli/service_supervisor.py | 2 +- tests/cli/test_service_manager.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py index 9864a7b42..a351b571f 100644 --- a/flocks/cli/service_supervisor.py +++ b/flocks/cli/service_supervisor.py @@ -386,7 +386,7 @@ def _log_paths_for_service(self, service_name: str) -> list[tuple[str, Path]]: return [("backend", self.paths.backend_log)] if service_name == "webui": return [("webui", self.paths.frontend_log)] - if service_name in {"daemon", "supervisor"}: + if service_name == "daemon": return [("daemon", supervisor_log_path(self.paths))] if service_name == "all": return [ diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 0c0b4cb27..523ff12de 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -402,13 +402,13 @@ def test_show_logs_falls_back_to_local_files_when_daemon_unavailable(monkeypatch assert "[daemon] daemon-one" in console.messages -def test_daemon_log_service_name_has_supervisor_alias(tmp_path: Path) -> None: +def test_daemon_log_service_name_uses_daemon_only(tmp_path: Path) -> None: paths = _make_runtime_paths(tmp_path) daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) daemon.paths = paths assert daemon._log_paths_for_service("daemon") == [("daemon", paths.log_dir / "daemon.log")] - assert daemon._log_paths_for_service("supervisor") == [("daemon", paths.log_dir / "daemon.log")] + assert daemon._log_paths_for_service("supervisor") == [] def test_tail_lines_returns_recent_content(tmp_path: Path) -> None: From a92bb37ab5f422ac43f27efbdc796f93ce75827f Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 18:07:27 +0800 Subject: [PATCH 10/28] fix(cli): clean orphan service port owners --- flocks/cli/service_manager.py | 28 +++++++++-- tests/cli/test_service_manager.py | 82 +++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 5 deletions(-) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index d56a68669..3abfcb0a4 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -860,15 +860,23 @@ def _trusted_flocks_port_owner(pid: int, *, service: str, root: Path) -> bool: root_text = str(root).lower() webui_text = str(root / "webui").lower() if service == "backend": + looks_like_uvicorn_backend = "uvicorn" in command_line and "flocks.server.app:app" in command_line return ( - ("flocks.cli.main" in command_line and "serve" in command_line) + looks_like_uvicorn_backend + or ("flocks.cli.main" in command_line and "serve" in command_line) or ("flocks" in command_line and "serve" in command_line and root_text in command_line) ) if service == "webui": - looks_like_preview = ("vite" in command_line and "preview" in command_line) or ( - "npm" in command_line and "preview" in command_line + looks_like_vite = "vite" in command_line and ( + "preview" in command_line or "--host" in command_line or "--port" in command_line ) - return looks_like_preview and (webui_text in command_line or root_text in command_line) + looks_like_flocks_webui = ( + webui_text in command_line + or root_text in command_line + or "/flocks/webui/" in command_line + or "\\flocks\\webui\\" in command_line + ) + return looks_like_vite and looks_like_flocks_webui return False @@ -879,13 +887,23 @@ def _terminate_orphan_pid(pid: int, label: str, console, *, timeout: float = 5.0 subprocess.run(["taskkill", "/PID", str(pid), "/T", "/F"], check=False, capture_output=True) return + pgid: int | None = None + try: + candidate_pgid = os.getpgid(pid) + if candidate_pgid != os.getpgrp(): + pgid = candidate_pgid + except OSError: + pgid = None + targets = collect_process_tree_pids(pid) + signal_process_group(signal.SIGTERM, pgid) signal_pid_list(signal.SIGTERM, targets) deadline = time.monotonic() + timeout while time.monotonic() < deadline: - if not any(pid_is_running(target) for target in targets): + if not any(pid_is_running(target) for target in targets) and not process_group_is_running(pgid): return time.sleep(0.25) + signal_process_group(signal.SIGKILL, pgid) signal_pid_list(signal.SIGKILL, targets) diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 523ff12de..860c566b2 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -1617,6 +1617,34 @@ def test_start_backend_cleans_trusted_orphan_port_owner(monkeypatch, tmp_path: P assert cleaned == [9999] +def test_backend_cleanup_trusts_cross_worktree_flocks_uvicorn_owner(monkeypatch, tmp_path: Path) -> None: + cleaned: list[int] = [] + owners = iter([[18787], []]) + + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: next(owners)) + monkeypatch.setattr( + service_manager, + "_process_command_line", + lambda _pid: ( + "/Users/zgy/.codex/worktrees/6be0/flocks/.venv/bin/python " + "/Users/zgy/.codex/worktrees/6be0/flocks/.venv/bin/uvicorn " + "flocks.server.app:app --host 127.0.0.1 --port 8000 --reload --reload-dir flocks" + ), + ) + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + + result = service_manager.cleanup_trusted_port_owners( + 8000, + service="backend", + label="后端", + console=DummyConsole(), + root=tmp_path, + ) + + assert result == [18787] + assert cleaned == [18787] + + def test_start_backend_raises_when_port_in_use_without_pid_lookup(monkeypatch, tmp_path: Path) -> None: paths = service_manager.RuntimePaths( root=tmp_path, @@ -1674,6 +1702,32 @@ def test_start_webui_cleans_trusted_orphan_port_owner(monkeypatch, tmp_path: Pat assert cleaned == [52372] +def test_webui_cleanup_trusts_cross_worktree_flocks_vite_owner(monkeypatch, tmp_path: Path) -> None: + cleaned: list[int] = [] + owners = iter([[18962], []]) + + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: next(owners)) + monkeypatch.setattr( + service_manager, + "_process_command_line", + lambda _pid: ( + "node /Users/zgy/.codex/worktrees/6be0/flocks/webui/node_modules/.bin/vite --host 127.0.0.1 --port 5173" + ), + ) + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + + result = service_manager.cleanup_trusted_port_owners( + 5173, + service="webui", + label="WebUI", + console=DummyConsole(), + root=tmp_path, + ) + + assert result == [18962] + assert cleaned == [18962] + + def test_cleanup_trusted_daemon_processes_cleans_current_install_only(monkeypatch, tmp_path: Path) -> None: cleaned: list[int] = [] @@ -1776,6 +1830,34 @@ def test_terminate_process_stops_cached_process_group_after_root_exits(monkeypat assert signals == [("SIGTERM", 4321)] +def test_terminate_orphan_pid_stops_process_group(monkeypatch) -> None: + signals: list[tuple[str, int | tuple[int, ...] | None]] = [] + + monkeypatch.setattr(service_manager.sys, "platform", "darwin") + monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: 18745 if pid == 18787 else pid) + monkeypatch.setattr(service_manager.os, "getpgrp", lambda: 99999) + monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [18787, 18873]) + monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) + monkeypatch.setattr(service_manager, "process_group_is_running", lambda _pgid: False) + monkeypatch.setattr( + service_manager, + "signal_process_group", + lambda sig, pgid: signals.append((sig.name, pgid)), + ) + monkeypatch.setattr( + service_manager, + "signal_pid_list", + lambda sig, pids: signals.append((sig.name, tuple(pids))), + ) + + service_manager._terminate_orphan_pid(18787, "后端", DummyConsole(), timeout=0.1) + + assert signals == [ + ("SIGTERM", 18745), + ("SIGTERM", (18787, 18873)), + ] + + def test_spawn_process_appends_without_rotated_suffix(monkeypatch, tmp_path: Path) -> None: log_path = tmp_path / "logs" / "backend.log" log_path.parent.mkdir(parents=True) From 517da1e3986832a417acb99bf2ce3ae1c8ef31ec Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 18:10:35 +0800 Subject: [PATCH 11/28] chore(cli): simplify startup status summary --- flocks/cli/service_manager.py | 39 +++++++++++++++++++++++++++++-- tests/cli/test_service_manager.py | 27 +++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index 3abfcb0a4..38645a47d 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -1420,7 +1420,6 @@ def _start_supervisor_process(config: ServiceConfig, paths: RuntimePaths, consol command.append("--skip-webui-build") env = os.environ.copy() env["PYTHONUNBUFFERED"] = "1" - console.print("[flocks] 启动 Flocks daemon...") return _spawn_process(command, cwd=root, log_path=log_path, env=env) @@ -1637,6 +1636,42 @@ def _service_status_line(label: str, payload: dict[str, Any]) -> str: return f"[flocks] {label}: state={state} PID={pid} URL=http://{host}:{port}{suffix}" +def _daemon_status_line(payload: dict[str, Any]) -> str: + pid = payload.get("pid") + state = payload.get("state") or "unknown" + error = payload.get("last_error") + suffix = f" last_error={error}" if error else "" + return f"[flocks] daemon: state={state} PID={pid}{suffix}" + + +def _startup_step_marker(state: object, *, ready_states: set[str]) -> str: + return "[x]" if str(state or "").lower() in ready_states else "[!]" + + +def _startup_status_lines_from_payload(payload: dict[str, Any]) -> list[str]: + daemon = payload.get("daemon") if isinstance(payload.get("daemon"), dict) else {} + backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} + webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} + lines = [ + f"[flocks] {_startup_step_marker(daemon.get('state'), ready_states={'running'})} 启动 Flocks daemon...", + f"[flocks] {_startup_step_marker(backend.get('state'), ready_states={'healthy'})} 启动 Flocks server...", + f"[flocks] {_startup_step_marker(webui.get('state'), ready_states={'healthy'})} 启动 Flocks webui...", + "", + "[flocks] 服务", + _daemon_status_line(daemon), + _service_status_line("server", backend), + _service_status_line("webui", webui), + "", + "[flocks] 日志", + f"[flocks] daemon: {daemon.get('log_path')}", + ] + for label, service in (("server", backend), ("webui", webui)): + log_path = service.get("log_path") + if log_path: + lines.append(f"[flocks] {label}: {log_path}") + return lines + + def _frontend_url_from_status(status, fallback: str) -> str: if status.webui.port is not None: return f"http://{_format_host_for_url(_loopback_host(status.webui.host))}:{status.webui.port}" @@ -1644,7 +1679,7 @@ def _frontend_url_from_status(status, fallback: str) -> str: def _print_status_payload(payload: dict[str, Any], console) -> None: - for line in _status_lines_from_payload(payload): + for line in _startup_status_lines_from_payload(payload): console.print(line) diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 860c566b2..4348271c7 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -785,6 +785,33 @@ def test_build_status_lines_reports_supervisor_control_status(monkeypatch, tmp_p assert lines[11] == "[flocks] WebUI: /tmp/logs/webui.log" +def test_startup_status_lines_use_progress_summary() -> None: + lines = service_manager._startup_status_lines_from_payload(_supervisor_status_payload()) + + assert lines[:3] == [ + "[flocks] [x] 启动 Flocks daemon...", + "[flocks] [x] 启动 Flocks server...", + "[flocks] [x] 启动 Flocks webui...", + ] + assert lines[5] == "[flocks] daemon: state=running PID=100" + assert lines[6] == "[flocks] server: state=healthy PID=111 URL=http://127.0.0.1:9000" + assert lines[7] == "[flocks] webui: state=healthy PID=222 URL=http://127.0.0.1:5174" + assert lines[10] == "[flocks] daemon: /tmp/logs/daemon.log" + assert lines[11] == "[flocks] server: /tmp/logs/backend.log" + assert lines[12] == "[flocks] webui: /tmp/logs/webui.log" + + +def test_startup_status_lines_mark_unhealthy_steps() -> None: + payload = _supervisor_status_payload() + payload["backend"]["state"] = "degraded" + payload["backend"]["last_error"] = "port occupied" + + lines = service_manager._startup_status_lines_from_payload(payload) + + assert lines[1] == "[flocks] [!] 启动 Flocks server..." + assert lines[6] == "[flocks] server: state=degraded PID=111 URL=http://127.0.0.1:9000 last_error=port occupied" + + def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, tmp_path: Path) -> None: paths = _make_runtime_paths(tmp_path) calls: list[str] = [] From 4005df4519d774f12513e66554e3231cf50ecde7 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 18:18:19 +0800 Subject: [PATCH 12/28] chore(cli): print daemon startup step immediately --- flocks/cli/service_manager.py | 17 ++++++++++------- tests/cli/test_service_manager.py | 18 ++++++++++++++++-- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index 38645a47d..a294f1bc5 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -1511,8 +1511,9 @@ def _start_all_without_stop(config: ServiceConfig, console) -> None: """Start the supervisor daemon, then print access summary.""" paths = ensure_runtime_dirs() process = _start_supervisor_process(config, paths, console) + console.print("[flocks] [x] 启动 Flocks daemon...") payload = _wait_for_supervisor_ready(paths, process=process) - _print_status_payload(payload, console) + _print_status_payload(payload, console, include_daemon_step=False) if not config.no_browser: open_default_browser(config.frontend_url, console) @@ -1648,12 +1649,14 @@ def _startup_step_marker(state: object, *, ready_states: set[str]) -> str: return "[x]" if str(state or "").lower() in ready_states else "[!]" -def _startup_status_lines_from_payload(payload: dict[str, Any]) -> list[str]: +def _startup_status_lines_from_payload(payload: dict[str, Any], *, include_daemon_step: bool = True) -> list[str]: daemon = payload.get("daemon") if isinstance(payload.get("daemon"), dict) else {} backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} - lines = [ - f"[flocks] {_startup_step_marker(daemon.get('state'), ready_states={'running'})} 启动 Flocks daemon...", + lines = [] + if include_daemon_step: + lines.append(f"[flocks] {_startup_step_marker(daemon.get('state'), ready_states={'running'})} 启动 Flocks daemon...") + lines.extend([ f"[flocks] {_startup_step_marker(backend.get('state'), ready_states={'healthy'})} 启动 Flocks server...", f"[flocks] {_startup_step_marker(webui.get('state'), ready_states={'healthy'})} 启动 Flocks webui...", "", @@ -1664,7 +1667,7 @@ def _startup_status_lines_from_payload(payload: dict[str, Any]) -> list[str]: "", "[flocks] 日志", f"[flocks] daemon: {daemon.get('log_path')}", - ] + ]) for label, service in (("server", backend), ("webui", webui)): log_path = service.get("log_path") if log_path: @@ -1678,8 +1681,8 @@ def _frontend_url_from_status(status, fallback: str) -> str: return fallback -def _print_status_payload(payload: dict[str, Any], console) -> None: - for line in _startup_status_lines_from_payload(payload): +def _print_status_payload(payload: dict[str, Any], console, *, include_daemon_step: bool = True) -> None: + for line in _startup_status_lines_from_payload(payload, include_daemon_step=include_daemon_step): console.print(line) diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 4348271c7..1b4b1bf3d 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -812,6 +812,18 @@ def test_startup_status_lines_mark_unhealthy_steps() -> None: assert lines[6] == "[flocks] server: state=degraded PID=111 URL=http://127.0.0.1:9000 last_error=port occupied" +def test_startup_status_lines_can_skip_daemon_step() -> None: + lines = service_manager._startup_status_lines_from_payload( + _supervisor_status_payload(), + include_daemon_step=False, + ) + + assert lines[:2] == [ + "[flocks] [x] 启动 Flocks server...", + "[flocks] [x] 启动 Flocks webui...", + ] + + def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, tmp_path: Path) -> None: paths = _make_runtime_paths(tmp_path) calls: list[str] = [] @@ -930,20 +942,22 @@ def test_restart_all_stops_then_starts_daemon(monkeypatch) -> None: def test_start_all_without_stop_starts_supervisor_daemon(monkeypatch, tmp_path: Path) -> None: paths = _make_runtime_paths(tmp_path) calls: list[str] = [] + console = DummyConsole() monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "_start_supervisor_process", lambda _config, _paths, _console: calls.append("daemon") or SimpleNamespace(poll=lambda: None)) monkeypatch.setattr(service_manager, "_wait_for_supervisor_ready", lambda _paths, **_kwargs: calls.append("ready") or _supervisor_status_payload()) - monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console: calls.append("status")) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console, **_kwargs: calls.append("status")) monkeypatch.setattr( service_manager, "open_default_browser", lambda _url, _console: calls.append("browser"), ) - service_manager._start_all_without_stop(service_manager.ServiceConfig(no_browser=True), DummyConsole()) + service_manager._start_all_without_stop(service_manager.ServiceConfig(no_browser=True), console) assert calls == ["daemon", "ready", "status"] + assert console.messages == ["[flocks] [x] 启动 Flocks daemon..."] def test_start_all_propagates_supervisor_start_failure(monkeypatch) -> None: From 533d8a5a7d0c7b5eb0133ece14e923ab3f474e75 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 18:20:56 +0800 Subject: [PATCH 13/28] chore(cli): group daemon in status services --- flocks/cli/service_manager.py | 5 +---- tests/cli/test_service_manager.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index a294f1bc5..ed932d775 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -1609,11 +1609,8 @@ def _status_lines_from_payload(payload: dict[str, Any]) -> list[str]: backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} lines = [ - "[flocks] Flocks daemon", - f"[flocks] PID: {daemon.get('pid')}", - f"[flocks] 状态: {daemon.get('state')}", - "", "[flocks] 服务", + _daemon_status_line(daemon), _service_status_line("后端", backend), _service_status_line("WebUI", webui), "", diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 1b4b1bf3d..fcefbb69c 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -775,14 +775,13 @@ def test_build_status_lines_reports_supervisor_control_status(monkeypatch, tmp_p lines = service_manager.build_status_lines(paths) - assert lines[0] == "[flocks] Flocks daemon" - assert lines[1] == "[flocks] PID: 100" - assert lines[2] == "[flocks] 状态: running" - assert "http://127.0.0.1:9000" in lines[5] - assert "http://127.0.0.1:5174" in lines[6] - assert lines[9] == "[flocks] daemon: /tmp/logs/daemon.log" - assert lines[10] == "[flocks] 后端: /tmp/logs/backend.log" - assert lines[11] == "[flocks] WebUI: /tmp/logs/webui.log" + assert lines[0] == "[flocks] 服务" + assert lines[1] == "[flocks] daemon: state=running PID=100" + assert "http://127.0.0.1:9000" in lines[2] + assert "http://127.0.0.1:5174" in lines[3] + assert lines[6] == "[flocks] daemon: /tmp/logs/daemon.log" + assert lines[7] == "[flocks] 后端: /tmp/logs/backend.log" + assert lines[8] == "[flocks] WebUI: /tmp/logs/webui.log" def test_startup_status_lines_use_progress_summary() -> None: From 112cedfce27d46996795fa1424b063a44efdbda8 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 18:26:30 +0800 Subject: [PATCH 14/28] fix(daemon): suppress disconnected control clients --- flocks/cli/service_supervisor.py | 17 ++++++++++++----- tests/cli/test_service_manager.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py index a351b571f..b72dfe24d 100644 --- a/flocks/cli/service_supervisor.py +++ b/flocks/cli/service_supervisor.py @@ -208,11 +208,14 @@ def log_message(self, _format, *_args) -> None: def _send_json(self, payload: dict[str, object], status: int = 200) -> None: body = json.dumps(payload, ensure_ascii=False, sort_keys=True).encode("utf-8") - self.send_response(status) - self.send_header("Content-Type", "application/json; charset=utf-8") - self.send_header("Content-Length", str(len(body))) - self.end_headers() - self.wfile.write(body) + try: + self.send_response(status) + self.send_header("Content-Type", "application/json; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + except (BrokenPipeError, ConnectionResetError): + return def _read_json(self) -> dict[str, Any]: length = int(self.headers.get("Content-Length") or "0") @@ -234,6 +237,8 @@ def do_GET(self) -> None: daemon.handle_logs_request(self, parse_qs(parsed.query)) return self._send_json({"error": "not found"}, status=404) + except (BrokenPipeError, ConnectionResetError): + return except Exception as exc: # pragma: no cover - defensive control path self._send_json({"error": str(exc)}, status=500) @@ -276,6 +281,8 @@ def do_POST(self) -> None: self._send_json(daemon.status_payload()) return self._send_json({"error": "not found"}, status=404) + except (BrokenPipeError, ConnectionResetError): + return except Exception as exc: # pragma: no cover - defensive control path self._send_json({"error": str(exc)}, status=500) diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index fcefbb69c..79446c675 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -411,6 +411,37 @@ def test_daemon_log_service_name_uses_daemon_only(tmp_path: Path) -> None: assert daemon._log_paths_for_service("supervisor") == [] +def test_supervisor_control_send_json_ignores_disconnected_client() -> None: + daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) + handler_class = daemon._handler_class() + handler = handler_class.__new__(handler_class) + calls: list[tuple[str, object]] = [] + + handler.send_response = lambda status: calls.append(("status", status)) + handler.send_header = lambda name, value: calls.append((name, value)) + handler.end_headers = lambda: calls.append(("end_headers", None)) + handler.wfile = SimpleNamespace(write=lambda _body: (_ for _ in ()).throw(BrokenPipeError())) + + handler._send_json({"ok": True}) + + assert calls[0] == ("status", 200) + + +def test_supervisor_control_get_ignores_logs_client_disconnect() -> None: + daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) + handler_class = daemon._handler_class() + handler = handler_class.__new__(handler_class) + sent: list[dict[str, object]] = [] + + daemon.handle_logs_request = lambda *_args, **_kwargs: (_ for _ in ()).throw(BrokenPipeError()) + handler.path = "/logs?service=daemon" + handler._send_json = lambda payload, **_kwargs: sent.append(payload) + + handler.do_GET() + + assert sent == [] + + def test_tail_lines_returns_recent_content(tmp_path: Path) -> None: log_file = tmp_path / "backend.log" log_file.write_text("a\nb\nc\n", encoding="utf-8") From 21cd80f998669e5c2daebc4f24b6c29b122e0f3b Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Thu, 2 Jul 2026 18:31:12 +0800 Subject: [PATCH 15/28] chore(cli): show stopped services --- flocks/cli/service_manager.py | 16 ++++++++++++++-- tests/cli/test_service_manager.py | 8 +++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index ed932d775..e7b0b32af 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -1472,6 +1472,7 @@ def _stop_all_unlocked(console, *, paths: RuntimePaths) -> None: """Stop managed services; caller must hold the lifecycle lock.""" cleanup_config = ServiceConfig() legacy_config = _legacy_runtime_config(paths, cleanup_config) + stop_status = None if not supervisor_is_running(paths): console.print("[flocks] Flocks daemon 未运行。") cleanup_legacy_runtime_processes(paths, console) @@ -1479,7 +1480,8 @@ def _stop_all_unlocked(console, *, paths: RuntimePaths) -> None: stop_all_browser_daemons() return try: - cleanup_config = read_supervisor_status(paths=paths, timeout=1.0).config + stop_status = read_supervisor_status(paths=paths, timeout=1.0) + cleanup_config = stop_status.config legacy_config = _legacy_runtime_config(paths, cleanup_config) except Exception: pass @@ -1494,7 +1496,7 @@ def _stop_all_unlocked(console, *, paths: RuntimePaths) -> None: cleanup_legacy_runtime_processes(paths, console) cleanup_orphan_service_ports(cleanup_config, console, extra_configs=[legacy_config]) stop_all_browser_daemons() - console.print("[flocks] Flocks daemon 已停止。") + _print_stop_summary(console, stop_status) return time.sleep(0.5) raise ServiceError("Flocks daemon 未在预期时间内退出。") @@ -1558,6 +1560,16 @@ def restart_all(config: ServiceConfig, console) -> None: _start_all_unlocked(config, console, paths=paths) +def _print_stop_summary(console, status) -> None: + """Print stopped services from the last available supervisor status.""" + if status is not None: + if status.backend.pid is not None: + console.print(f"[flocks] server 已停止(PID={status.backend.pid})。") + if status.webui.pid is not None: + console.print(f"[flocks] webui 已停止(PID={status.webui.pid})。") + console.print("[flocks] daemon 已停止。") + + def cleanup_orphan_service_ports(config: ServiceConfig, console, *, extra_configs: Sequence[ServiceConfig] = ()) -> None: """Clean trusted Flocks leftovers on configured backend/WebUI ports.""" root = ensure_install_layout() diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 79446c675..c5036c5e1 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -2011,8 +2011,10 @@ def print(self, message) -> None: self.messages.append(message) states = iter([True, False]) + payload = _supervisor_status_payload() monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: next(states)) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status(payload)) monkeypatch.setattr(service_manager, "request_stop", lambda **_kwargs: calls.append("/stop") or {"status": "stopping"}) monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda _paths, _console: calls.append("legacy")) monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda _config, _console, **_kwargs: calls.append("cleanup")) @@ -2022,7 +2024,11 @@ def print(self, message) -> None: service_manager.stop_all(console=console) assert calls == ["/stop", "legacy", "cleanup", "browser"] - assert console.messages == ["[flocks] Flocks daemon 已停止。"] + assert console.messages == [ + "[flocks] server 已停止(PID=111)。", + "[flocks] webui 已停止(PID=222)。", + "[flocks] daemon 已停止。", + ] def test_stop_all_reports_when_supervisor_is_down(monkeypatch, tmp_path: Path) -> None: From 5f538b5d8d7b48a68526453b86cbab27ab451668 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 11:43:11 +0800 Subject: [PATCH 16/28] refactor(cli): serve webui through supervised backend --- flocks/cli/main.py | 50 +++- flocks/cli/service_config.py | 102 +++++-- flocks/cli/service_control.py | 6 - flocks/cli/service_manager.py | 293 ++++++++++---------- flocks/cli/service_process.py | 43 +-- flocks/cli/service_supervisor.py | 111 ++++---- flocks/server/app.py | 15 +- flocks/server/static_webui.py | 129 +++++++++ flocks/updater/updater.py | 34 ++- pyproject.toml | 3 + tests/cli/test_service_manager.py | 352 +++++++----------------- tests/helpers/service_supervisor.py | 1 - tests/server/test_server_port_config.py | 168 ++++++++++- tests/server/test_static_webui.py | 120 ++++++++ tests/updater/test_updater.py | 49 ++-- webui/src/utils/restartPolling.test.ts | 13 +- webui/src/utils/restartPolling.ts | 22 -- 17 files changed, 891 insertions(+), 620 deletions(-) create mode 100644 flocks/server/static_webui.py create mode 100644 tests/server/test_static_webui.py diff --git a/flocks/cli/main.py b/flocks/cli/main.py index e5a09cdf6..c41478436 100644 --- a/flocks/cli/main.py +++ b/flocks/cli/main.py @@ -145,6 +145,8 @@ def main_callback( def _service_config( no_browser: bool = False, skip_webui_build: bool = False, + host: Optional[str] = None, + port: Optional[int] = None, server_host: Optional[str] = None, server_port: Optional[int] = None, webui_host: Optional[str] = None, @@ -159,6 +161,8 @@ def _service_config( return build_service_config( no_browser=no_browser, skip_webui_build=skip_webui_build, + public_host=host, + public_port=port, server_host=server_host, server_port=server_port, webui_host=webui_host, @@ -176,12 +180,14 @@ def _restart_runtime_defaults() -> dict[str, Any]: status = read_supervisor_status(paths=runtime_paths(), timeout=1.0) except Exception: return {} - return restart_defaults_from_status_payload(status.raw) + return restart_defaults_from_status_payload(getattr(status, "raw", status)) def _restart_service_config( no_browser: bool = False, skip_webui_build: bool = False, + host: Optional[str] = None, + port: Optional[int] = None, server_host: Optional[str] = None, server_port: Optional[int] = None, webui_host: Optional[str] = None, @@ -191,6 +197,8 @@ def _restart_service_config( return _service_config( no_browser=no_browser, skip_webui_build=skip_webui_build, + host=host, + port=port, server_host=server_host, server_port=server_port, webui_host=webui_host, @@ -211,21 +219,25 @@ def start( skip_webui_build: bool = typer.Option( False, "--skip-webui-build", - help="Skip `npm run build` before starting WebUI", + help="Skip WebUI static asset build before starting Flocks service", ), + host: Optional[str] = typer.Option(None, "--host", "-h", help="Public service host"), + port: Optional[int] = typer.Option(None, "--port", "-p", help="Public service port"), server_host: Optional[str] = typer.Option(None, "--server-host", help="Backend server host"), server_port: Optional[int] = typer.Option(None, "--server-port", help="Backend server port"), webui_host: Optional[str] = typer.Option(None, "--webui-host", help="WebUI host"), webui_port: Optional[int] = typer.Option(None, "--webui-port", help="WebUI port"), ): """ - Start backend and WebUI in daemon mode + Start Flocks service in daemon mode. """ try: start_all( _service_config( no_browser=no_browser, skip_webui_build=skip_webui_build, + host=host, + port=port, server_host=server_host, server_port=server_port, webui_host=webui_host, @@ -240,7 +252,7 @@ def start( @app.command() def stop(): """ - Stop backend and WebUI + Stop Flocks service. """ try: stop_all(console) @@ -254,21 +266,25 @@ def restart( skip_webui_build: bool = typer.Option( False, "--skip-webui-build", - help="Skip `npm run build` before starting WebUI", + help="Skip WebUI static asset build before starting Flocks service", ), + host: Optional[str] = typer.Option(None, "--host", "-h", help="Public service host"), + port: Optional[int] = typer.Option(None, "--port", "-p", help="Public service port"), server_host: Optional[str] = typer.Option(None, "--server-host", help="Backend server host"), server_port: Optional[int] = typer.Option(None, "--server-port", help="Backend server port"), webui_host: Optional[str] = typer.Option(None, "--webui-host", help="WebUI host"), webui_port: Optional[int] = typer.Option(None, "--webui-port", help="WebUI port"), ): """ - Restart backend and WebUI + Restart Flocks service. """ try: restart_all( _restart_service_config( no_browser=no_browser, skip_webui_build=skip_webui_build, + host=host, + port=port, server_host=server_host, server_port=server_port, webui_host=webui_host, @@ -283,7 +299,7 @@ def restart( @app.command() def status(): """ - Show backend and WebUI status + Show Flocks service status. """ try: show_status(console) @@ -293,13 +309,13 @@ def status(): @app.command() def logs( - backend: bool = typer.Option(False, "--backend", help="Only show backend logs"), - webui: bool = typer.Option(False, "--webui", help="Only show WebUI logs"), + backend: bool = typer.Option(False, "--backend", help="Only show service logs"), + webui: bool = typer.Option(False, "--webui", help="Only show service logs"), follow: bool = typer.Option(True, "--follow/--no-follow", help="Follow logs in real time"), lines: int = typer.Option(50, "--lines", "-n", min=0, help="Number of recent lines to show"), ): """ - Show backend and WebUI logs + Show Flocks service logs. """ try: show_logs(console, backend=backend, webui=webui, follow=follow, lines=lines) @@ -354,10 +370,17 @@ def serve( @app.command(name="service-daemon", hidden=True) def service_daemon( server_host: str = typer.Option("127.0.0.1", "--server-host", help="Backend server host"), - server_port: int = typer.Option(8000, "--server-port", help="Backend server port"), + server_port: int = typer.Option(5173, "--server-port", help="Public service port"), webui_host: str = typer.Option("127.0.0.1", "--webui-host", help="WebUI host"), webui_port: int = typer.Option(5173, "--webui-port", help="WebUI port"), - skip_webui_build: bool = typer.Option(False, "--skip-webui-build", help="Skip WebUI build before preview start"), + legacy_server_host: Optional[str] = typer.Option(None, "--legacy-server-host", help="Legacy backend host"), + legacy_server_port: Optional[int] = typer.Option(8000, "--legacy-server-port", help="Legacy backend port"), + server_port_migration_hint: bool = typer.Option( + False, + "--server-port-migration-hint", + help="Print server-port migration hint in parent CLI", + ), + skip_webui_build: bool = typer.Option(False, "--skip-webui-build", help="Skip WebUI static asset build"), ): """ Run the Flocks service supervisor daemon. @@ -368,6 +391,9 @@ def service_daemon( backend_port=server_port, frontend_host=webui_host, frontend_port=webui_port, + legacy_backend_host=legacy_server_host, + legacy_backend_port=legacy_server_port, + server_port_migration_hint=server_port_migration_hint, no_browser=True, skip_frontend_build=skip_webui_build, ), diff --git a/flocks/cli/service_config.py b/flocks/cli/service_config.py index 590fcd51f..7e6df10e2 100644 --- a/flocks/cli/service_config.py +++ b/flocks/cli/service_config.py @@ -14,15 +14,34 @@ class ServiceConfigError(ValueError): @dataclass(frozen=True) class ServiceConfig: backend_host: str = "127.0.0.1" - backend_port: int = 8000 + backend_port: int = 5173 frontend_host: str = "127.0.0.1" frontend_port: int = 5173 + legacy_backend_host: str | None = "127.0.0.1" + legacy_backend_port: int | None = 8000 + server_port_migration_hint: bool = False no_browser: bool = False skip_frontend_build: bool = False + @property + def backend_url(self) -> str: + return f"http://{_format_host_for_url(loopback_host(self.backend_host))}:{self.backend_port}" + @property def frontend_url(self) -> str: - return f"http://{_format_host_for_url(loopback_host(self.frontend_host))}:{self.frontend_port}" + return self.backend_url + + @property + def legacy_cleanup_config(self) -> "ServiceConfig": + return ServiceConfig( + backend_host=self.legacy_backend_host or self.backend_host, + backend_port=self.legacy_backend_port or self.backend_port, + frontend_host=self.frontend_host, + frontend_port=self.frontend_port, + no_browser=self.no_browser, + server_port_migration_hint=self.server_port_migration_hint, + skip_frontend_build=self.skip_frontend_build, + ) def loopback_host(host: str) -> str: @@ -44,6 +63,9 @@ def service_config_payload(config: ServiceConfig) -> dict[str, object]: "backend_port": config.backend_port, "frontend_host": config.frontend_host, "frontend_port": config.frontend_port, + "legacy_backend_host": config.legacy_backend_host, + "legacy_backend_port": config.legacy_backend_port, + "server_port_migration_hint": config.server_port_migration_hint, "no_browser": config.no_browser, "skip_frontend_build": config.skip_frontend_build, } @@ -69,6 +91,9 @@ def service_config_from_payload( backend_port=_positive_int(payload.get("backend_port"), base.backend_port), frontend_host=_string(payload.get("frontend_host"), base.frontend_host), frontend_port=_positive_int(payload.get("frontend_port"), base.frontend_port), + legacy_backend_host=_optional_string(payload.get("legacy_backend_host"), base.legacy_backend_host), + legacy_backend_port=_optional_positive_int(payload.get("legacy_backend_port"), base.legacy_backend_port), + server_port_migration_hint=_bool(payload.get("server_port_migration_hint"), base.server_port_migration_hint), no_browser=resolved_no_browser, skip_frontend_build=resolved_skip_frontend_build, ) @@ -110,6 +135,8 @@ def build_service_config( *, no_browser: bool = False, skip_webui_build: bool = False, + public_host: str | None = None, + public_port: int | None = None, server_host: str | None = None, server_port: int | None = None, webui_host: str | None = None, @@ -119,30 +146,36 @@ def build_service_config( default_webui_host: str = "127.0.0.1", default_webui_port: int = 5173, ) -> ServiceConfig: - """Build service config from CLI values, environment, and defaults.""" + """Build service config from CLI values, environment, and defaults. + + Static WebUI mode uses the old WebUI endpoint as the public FastAPI + listener so remote deployments keep their existing browser URL. + """ + explicit_public_host = _first_host(public_host, ("FLOCKS_HOST", "FLOCKS_PUBLIC_HOST")) + explicit_public_port = _first_port(public_port, ("FLOCKS_PORT", "FLOCKS_PUBLIC_PORT"), "public") + explicit_webui_host = _first_host(webui_host, ("FLOCKS_WEBUI_HOST", "FLOCKS_FRONTEND_HOST")) + explicit_webui_port = _first_port(webui_port, ("FLOCKS_WEBUI_PORT", "FLOCKS_FRONTEND_PORT"), "webui") + explicit_server_host = _first_host(server_host, ("FLOCKS_SERVER_HOST", "FLOCKS_BACKEND_HOST")) + explicit_server_port = _first_port(server_port, ("FLOCKS_SERVER_PORT", "FLOCKS_BACKEND_PORT"), "server") + + resolved_public_host = explicit_public_host or explicit_webui_host or explicit_server_host or default_webui_host + resolved_public_port = explicit_public_port or explicit_webui_port or explicit_server_port or default_webui_port + legacy_host = explicit_server_host or default_server_host + legacy_port = explicit_server_port or default_server_port + show_server_port_hint = ( + explicit_server_port is not None + and (explicit_public_port is not None or explicit_webui_port is not None) + and explicit_server_port != resolved_public_port + ) + return ServiceConfig( - backend_host=_resolve_host( - cli_value=server_host, - env_names=("FLOCKS_SERVER_HOST", "FLOCKS_BACKEND_HOST"), - default=default_server_host, - ), - backend_port=_resolve_port( - cli_value=server_port, - env_names=("FLOCKS_SERVER_PORT", "FLOCKS_BACKEND_PORT"), - default=default_server_port, - label="server", - ), - frontend_host=_resolve_host( - cli_value=webui_host, - env_names=("FLOCKS_WEBUI_HOST", "FLOCKS_FRONTEND_HOST"), - default=default_webui_host, - ), - frontend_port=_resolve_port( - cli_value=webui_port, - env_names=("FLOCKS_WEBUI_PORT", "FLOCKS_FRONTEND_PORT"), - default=default_webui_port, - label="webui", - ), + backend_host=resolved_public_host, + backend_port=resolved_public_port, + frontend_host=resolved_public_host, + frontend_port=resolved_public_port, + legacy_backend_host=legacy_host, + legacy_backend_port=legacy_port, + server_port_migration_hint=show_server_port_hint, no_browser=no_browser, skip_frontend_build=skip_webui_build, ) @@ -155,22 +188,25 @@ def with_frontend_build(config: ServiceConfig, *, skip_frontend_build: bool) -> backend_port=config.backend_port, frontend_host=config.frontend_host, frontend_port=config.frontend_port, + legacy_backend_host=config.legacy_backend_host, + legacy_backend_port=config.legacy_backend_port, + server_port_migration_hint=config.server_port_migration_hint, no_browser=config.no_browser, skip_frontend_build=skip_frontend_build, ) -def _resolve_host(*, cli_value: str | None, env_names: tuple[str, ...], default: str) -> str: +def _first_host(cli_value: str | None, env_names: tuple[str, ...]) -> str | None: if cli_value is not None: return cli_value for env_name in env_names: env_value = os.getenv(env_name) if env_value: return env_value - return default + return None -def _resolve_port(*, cli_value: int | None, env_names: tuple[str, ...], default: int, label: str) -> int: +def _first_port(cli_value: int | None, env_names: tuple[str, ...], label: str) -> int | None: if cli_value is not None: return cli_value for env_name in env_names: @@ -181,17 +217,25 @@ def _resolve_port(*, cli_value: int | None, env_names: tuple[str, ...], default: return int(env_value) except ValueError as error: raise ServiceConfigError(f"{label} port from {env_name} must be an integer.") from error - return default + return None def _string(value: Any, fallback: str) -> str: return value if isinstance(value, str) and value else fallback +def _optional_string(value: Any, fallback: str | None) -> str | None: + return value if isinstance(value, str) and value else fallback + + def _positive_int(value: Any, fallback: int) -> int: return value if _is_positive_int(value) else fallback +def _optional_positive_int(value: Any, fallback: int | None) -> int | None: + return value if _is_positive_int(value) else fallback + + def _is_positive_int(value: Any) -> bool: return isinstance(value, int) and not isinstance(value, bool) and value > 0 diff --git a/flocks/cli/service_control.py b/flocks/cli/service_control.py index c3a98672e..636dd78a9 100644 --- a/flocks/cli/service_control.py +++ b/flocks/cli/service_control.py @@ -177,12 +177,6 @@ def request_restart_webui( return parse_supervisor_status(data) -def request_stop_webui(*, paths=None, timeout: float | None = 30.0) -> SupervisorStatus: - """Ask the supervisor daemon to stop WebUI only.""" - payload = _post_control_json("/stop/webui", paths=paths, timeout=timeout) - return parse_supervisor_status(payload) - - def request_prepare_upgrade(*, paths=None, timeout: float | None = 30.0) -> SupervisorStatus: """Ask the supervisor daemon to pause managed services for upgrade handoff.""" payload = _post_control_json("/upgrade/prepare", paths=paths, timeout=timeout) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index e7b0b32af..4376d7fdb 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -150,10 +150,15 @@ def watchdog_pid_path(paths: RuntimePaths) -> Path: def ensure_install_layout(root: Path | None = None) -> Path: """Validate that the installed repo still contains backend and WebUI code.""" current = root or repo_root() + from flocks.server.static_webui import resolve_webui_dist_dir + if not (current / "pyproject.toml").exists(): - raise ServiceError(f"未找到安装目录中的 pyproject.toml: {current}") + if resolve_webui_dist_dir() is None: + raise ServiceError(f"未找到安装目录中的 pyproject.toml 或 WebUI 静态资源: {current}") + return current if not (current / "webui" / "package.json").exists(): - raise ServiceError("未找到 WebUI 源码,请重新安装 Flocks,或设置 FLOCKS_REPO_ROOT 指向有效安装目录。") + if resolve_webui_dist_dir() is None: + raise ServiceError("未找到 WebUI 静态资源,请重新安装 Flocks,或设置 FLOCKS_REPO_ROOT 指向有效安装目录。") return current @@ -1102,6 +1107,75 @@ def _backend_command_and_env(root: Path, config: ServiceConfig) -> tuple[list[st return command, env +def _build_webui_dist(root: Path, config: ServiceConfig, console) -> None: + """Build the production WebUI static bundle.""" + npm = resolve_npm_executable() + if not npm: + raise ServiceError("WebUI dist 不存在,且未检测到 npm;请先安装 Node.js 22+(包含 npm)后重试。") + if not node_version_satisfies_requirement(): + raise ServiceError(f"检测到的 Node.js 版本过低。构建 WebUI 至少需要 Node.js {MIN_NODE_MAJOR}+。") + + webui_dir = root / "webui" + if not (webui_dir / "package.json").exists(): + raise ServiceError("未找到 WebUI 源码,无法构建静态资源。") + + console.print("[flocks] 准备 Flocks 静态资源...") + frontend_env = build_frontend_env(config) + run_kwargs: dict[str, object] = {"cwd": webui_dir, "check": False, "env": frontend_env} + if sys.platform == "win32": + run_kwargs.update({"capture_output": True, "text": True, "encoding": "utf-8", "errors": "replace"}) + completed = subprocess.run([npm, "run", "build"], **run_kwargs) + if completed.returncode != 0: + output = "\n".join( + value for value in (getattr(completed, "stdout", None), getattr(completed, "stderr", None)) if value + ) + if windows_frontend_build_assertion_is_recoverable(webui_dir, output): + console.print("[flocks] WebUI 构建产物已生成,忽略 Windows Node.js 退出断言。") + else: + if output: + console.print(output) + raise ServiceError("WebUI 构建失败。") + + +def _ensure_webui_dist(root: Path, config: ServiceConfig, console) -> None: + """Ensure the FastAPI process can serve the production WebUI bundle.""" + from flocks.server.static_webui import WebUIDistMissingError, ensure_webui_dist_dir + + try: + ensure_webui_dist_dir() + return + except WebUIDistMissingError: + if config.skip_frontend_build: + raise + + _build_webui_dist(root, config, console) + ensure_webui_dist_dir() + + +def _cleanup_backend_start_port(port: int, console, *, root: Path) -> list[int]: + """Clean trusted leftovers that can occupy the unified public service port.""" + cleaned: list[int] = [] + cleaned.extend( + cleanup_trusted_port_owners( + port, + service="backend", + label="后端", + console=console, + root=root, + ) + ) + cleaned.extend( + cleanup_trusted_port_owners( + port, + service="webui", + label="WebUI", + console=console, + root=root, + ) + ) + return sorted(dict.fromkeys(cleaned)) + + def _start_backend_process( config: ServiceConfig, console, @@ -1111,30 +1185,25 @@ def _start_backend_process( """Start the backend child process for the supervisor.""" root = ensure_install_layout() current = paths if paths is not None else ensure_runtime_dirs() + _ensure_webui_dist(root, config, console) listeners = port_owner_pids(config.backend_port) if listeners: - cleanup_trusted_port_owners( - config.backend_port, - service="backend", - label="后端", - console=console, - root=root, - ) + _cleanup_backend_start_port(config.backend_port, console, root=root) listeners = port_owner_pids(config.backend_port) if listeners: raise ServiceError( - f"后端端口 {config.backend_port} 已被占用 (PID: {_join_pids(listeners)})," + f"server 端口 {config.backend_port} 已被占用 (PID: {_join_pids(listeners)})," "请先执行 `flocks stop` 或手动清理残留进程。" ) if port_is_in_use(config.backend_port, listeners): raise ServiceError( - f"后端端口 {config.backend_port} 已被占用,但当前环境无法识别占用 PID;" + f"server 端口 {config.backend_port} 已被占用,但当前环境无法识别占用 PID;" "请先安装 lsof 或手动清理残留进程。" ) command, env = _backend_command_and_env(root, config) - console.print("[flocks] 启动后端服务...") + console.print("[flocks] 启动 Flocks service...") process = _spawn_process(command, cwd=root, log_path=current.backend_log, env=env) record = process_runtime_record( process, @@ -1158,108 +1227,6 @@ def _start_backend_process( return process -def _start_frontend_process( - config: ServiceConfig, - console, - *, - paths: RuntimePaths | None = None, -) -> subprocess.Popen: - """Build and start the WebUI child process.""" - root = ensure_install_layout() - current = paths if paths is not None else ensure_runtime_dirs() - - listeners = port_owner_pids(config.frontend_port) - if listeners: - upgrade_info = _read_upgrade_runtime_info(config.frontend_port) - if upgrade_info.page_active: - _resolve_upgrade_runtime( - console, - frontend_port=upgrade_info.frontend_port or config.frontend_port, - attempt_recover=False, - ) - listeners = port_owner_pids(config.frontend_port) - if listeners: - raise ServiceError( - f"WebUI 端口 {config.frontend_port} 已被占用 (PID: {_join_pids(listeners)})," - "请先执行 `flocks stop` 或手动清理残留进程。" - ) - - else: - cleanup_trusted_port_owners( - config.frontend_port, - service="webui", - label="WebUI", - console=console, - root=root, - ) - listeners = port_owner_pids(config.frontend_port) - if listeners: - raise ServiceError( - f"WebUI 端口 {config.frontend_port} 已被占用 (PID: {_join_pids(listeners)})," - "请先执行 `flocks stop` 或手动清理残留进程。" - ) - if port_is_in_use(config.frontend_port, listeners): - raise ServiceError( - f"WebUI 端口 {config.frontend_port} 已被占用,但当前环境无法识别占用 PID;" - "请先安装 lsof 或手动清理残留进程。" - ) - - npm = resolve_npm_executable() - if not npm: - raise ServiceError("未检测到 npm,请先安装 Node.js 22+(包含 npm)后重试。") - if not node_version_satisfies_requirement(): - raise ServiceError(f"检测到的 Node.js 版本过低。启动 WebUI 至少需要 Node.js {MIN_NODE_MAJOR}+。") - - webui_dir = root / "webui" - frontend_env = build_frontend_env(config) - if not config.skip_frontend_build: - console.print("[flocks] 构建 WebUI...") - run_kwargs: dict[str, object] = {"cwd": webui_dir, "check": False, "env": frontend_env} - if sys.platform == "win32": - run_kwargs.update({"capture_output": True, "text": True, "encoding": "utf-8", "errors": "replace"}) - completed = subprocess.run([npm, "run", "build"], **run_kwargs) - if completed.returncode != 0: - output = "\n".join( - value for value in (getattr(completed, "stdout", None), getattr(completed, "stderr", None)) if value - ) - if windows_frontend_build_assertion_is_recoverable(webui_dir, output): - console.print("[flocks] WebUI 构建产物已生成,忽略 Windows Node.js 退出断言。") - else: - if output: - console.print(output) - raise ServiceError("WebUI 构建失败。") - - command = [ - npm, - "run", - "preview", - "--", - "--host", - config.frontend_host, - "--port", - str(config.frontend_port), - ] - - console.print("[flocks] 启动 WebUI...") - process = _spawn_process(command, cwd=webui_dir, log_path=current.frontend_log, env=frontend_env) - record = process_runtime_record( - process, - host=config.frontend_host, - port=config.frontend_port, - command=command, - ) - _log_startup_config(current.frontend_log, "webui", config.frontend_host, config.frontend_port, record) - - try: - wait_for_http([config.frontend_url], "WebUI") - except ServiceError: - _emit_service_log_tail(console, current.frontend_log, "WebUI") - _terminate_process(process, "WebUI", console) - raise - - return process - - def stop_runtime_record_process(pid_file: Path, name: str, console) -> None: """Stop a legacy pid/runtime record without scanning ports.""" cleanup_stale_pid_file(pid_file) @@ -1387,7 +1354,7 @@ def _wait_for_supervisor_ready( last_payload = status.raw backend_state = status.backend.state webui_state = status.webui.state - if backend_state == "healthy" and webui_state == "healthy": + if backend_state == "healthy" and webui_state in {"healthy", "static"}: return status.raw if backend_state == "degraded" or webui_state == "degraded": return status.raw @@ -1416,6 +1383,12 @@ def _start_supervisor_process(config: ServiceConfig, paths: RuntimePaths, consol "--webui-port", str(config.frontend_port), ] + if config.legacy_backend_host is not None: + command.extend(["--legacy-server-host", config.legacy_backend_host]) + if config.legacy_backend_port is not None: + command.extend(["--legacy-server-port", str(config.legacy_backend_port)]) + if config.server_port_migration_hint: + command.append("--server-port-migration-hint") if config.skip_frontend_build: command.append("--skip-webui-build") env = os.environ.copy() @@ -1440,6 +1413,8 @@ def _legacy_runtime_config(paths: RuntimePaths, fallback: ServiceConfig) -> Serv backend_port=_recorded_port(paths.backend_pid, fallback.backend_port), frontend_host=_recorded_host(paths.frontend_pid, fallback.frontend_host), frontend_port=_recorded_port(paths.frontend_pid, fallback.frontend_port), + legacy_backend_host=fallback.legacy_backend_host, + legacy_backend_port=fallback.legacy_backend_port, no_browser=fallback.no_browser, skip_frontend_build=fallback.skip_frontend_build, ) @@ -1448,9 +1423,9 @@ def _legacy_runtime_config(paths: RuntimePaths, fallback: ServiceConfig) -> Serv def _unique_cleanup_configs(*configs: ServiceConfig) -> list[ServiceConfig]: """Deduplicate cleanup configs by backend/WebUI ports.""" result: list[ServiceConfig] = [] - seen: set[tuple[int, int]] = set() + seen: set[tuple[int, int, int | None]] = set() for config in configs: - key = (config.backend_port, config.frontend_port) + key = (config.backend_port, config.frontend_port, config.legacy_backend_port) if key in seen: continue seen.add(key) @@ -1512,6 +1487,9 @@ def stop_all(console) -> None: def _start_all_without_stop(config: ServiceConfig, console) -> None: """Start the supervisor daemon, then print access summary.""" paths = ensure_runtime_dirs() + _print_static_port_migration_hint(config, console) + cleanup_legacy_runtime_processes(paths, console) + cleanup_orphan_service_ports(config, console) process = _start_supervisor_process(config, paths, console) console.print("[flocks] [x] 启动 Flocks daemon...") payload = _wait_for_supervisor_ready(paths, process=process) @@ -1560,13 +1538,25 @@ def restart_all(config: ServiceConfig, console) -> None: _start_all_unlocked(config, console, paths=paths) +def _print_static_port_migration_hint(config: ServiceConfig, console) -> None: + """Explain legacy server-port behavior when it differs from public WebUI port.""" + if ( + not config.server_port_migration_hint + or config.legacy_backend_port is None + or config.legacy_backend_port == config.backend_port + ): + return + console.print( + "[flocks] API 已与 WebUI 同源," + f"当前统一监听端口为 {config.backend_port};旧 server 端口 {config.legacy_backend_port} 仅用于残留清理。" + ) + + def _print_stop_summary(console, status) -> None: """Print stopped services from the last available supervisor status.""" if status is not None: if status.backend.pid is not None: - console.print(f"[flocks] server 已停止(PID={status.backend.pid})。") - if status.webui.pid is not None: - console.print(f"[flocks] webui 已停止(PID={status.webui.pid})。") + console.print(f"[flocks] flocks 已停止(PID={status.backend.pid})。") console.print("[flocks] daemon 已停止。") @@ -1574,7 +1564,11 @@ def cleanup_orphan_service_ports(config: ServiceConfig, console, *, extra_config """Clean trusted Flocks leftovers on configured backend/WebUI ports.""" root = ensure_install_layout() cleanup_trusted_daemon_processes(console=console, root=root) - for cleanup_config in _unique_cleanup_configs(config, *extra_configs): + candidates: list[ServiceConfig] = [] + for candidate in (config, config.legacy_cleanup_config, *extra_configs): + candidates.append(candidate) + candidates.append(candidate.legacy_cleanup_config) + for cleanup_config in _unique_cleanup_configs(*candidates): cleanup_trusted_port_owners( cleanup_config.backend_port, service="backend", @@ -1582,6 +1576,13 @@ def cleanup_orphan_service_ports(config: ServiceConfig, console, *, extra_config console=console, root=root, ) + cleanup_trusted_port_owners( + cleanup_config.backend_port, + service="webui", + label="WebUI", + console=console, + root=root, + ) cleanup_trusted_port_owners( cleanup_config.frontend_port, service="webui", @@ -1589,6 +1590,13 @@ def cleanup_orphan_service_ports(config: ServiceConfig, console, *, extra_config console=console, root=root, ) + cleanup_trusted_port_owners( + cleanup_config.frontend_port, + service="backend", + label="后端", + console=console, + root=root, + ) def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: @@ -1619,20 +1627,17 @@ def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: def _status_lines_from_payload(payload: dict[str, Any]) -> list[str]: daemon = payload.get("daemon") if isinstance(payload.get("daemon"), dict) else {} backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} - webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} lines = [ "[flocks] 服务", _daemon_status_line(daemon), - _service_status_line("后端", backend), - _service_status_line("WebUI", webui), + _service_status_line("flocks", backend), "", "[flocks] 日志", f"[flocks] daemon: {daemon.get('log_path')}", ] - for label, service in (("后端", backend), ("WebUI", webui)): - log_path = service.get("log_path") - if log_path: - lines.append(f"[flocks] {label}: {log_path}") + log_path = backend.get("log_path") + if log_path: + lines.append(f"[flocks] flocks: {log_path}") return lines @@ -1643,7 +1648,8 @@ def _service_status_line(label: str, payload: dict[str, Any]) -> str: state = payload.get("state") or "unknown" error = payload.get("last_error") suffix = f" last_error={error}" if error else "" - return f"[flocks] {label}: state={state} PID={pid} URL=http://{host}:{port}{suffix}" + pid_part = f" PID={pid}" if pid is not None else "" + return f"[flocks] {label}: state={state}{pid_part} URL=http://{host}:{port}{suffix}" def _daemon_status_line(payload: dict[str, Any]) -> str: @@ -1661,32 +1667,28 @@ def _startup_step_marker(state: object, *, ready_states: set[str]) -> str: def _startup_status_lines_from_payload(payload: dict[str, Any], *, include_daemon_step: bool = True) -> list[str]: daemon = payload.get("daemon") if isinstance(payload.get("daemon"), dict) else {} backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} - webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} lines = [] if include_daemon_step: lines.append(f"[flocks] {_startup_step_marker(daemon.get('state'), ready_states={'running'})} 启动 Flocks daemon...") lines.extend([ - f"[flocks] {_startup_step_marker(backend.get('state'), ready_states={'healthy'})} 启动 Flocks server...", - f"[flocks] {_startup_step_marker(webui.get('state'), ready_states={'healthy'})} 启动 Flocks webui...", + f"[flocks] {_startup_step_marker(backend.get('state'), ready_states={'healthy'})} 启动 Flocks service...", "", "[flocks] 服务", _daemon_status_line(daemon), - _service_status_line("server", backend), - _service_status_line("webui", webui), + _service_status_line("flocks", backend), "", "[flocks] 日志", f"[flocks] daemon: {daemon.get('log_path')}", ]) - for label, service in (("server", backend), ("webui", webui)): - log_path = service.get("log_path") - if log_path: - lines.append(f"[flocks] {label}: {log_path}") + log_path = backend.get("log_path") + if log_path: + lines.append(f"[flocks] flocks: {log_path}") return lines def _frontend_url_from_status(status, fallback: str) -> str: - if status.webui.port is not None: - return f"http://{_format_host_for_url(_loopback_host(status.webui.host))}:{status.webui.port}" + if status.backend.port is not None: + return f"http://{_format_host_for_url(_loopback_host(status.backend.host))}:{status.backend.port}" return fallback @@ -1753,19 +1755,18 @@ def selected_log_paths( if backend and not webui: return [paths.backend_log] if webui and not backend: - return [paths.frontend_log] - return [paths.backend_log, paths.frontend_log] + return [paths.backend_log] + return [paths.backend_log] def _selected_log_entries(paths: RuntimePaths, *, backend: bool = False, webui: bool = False) -> list[tuple[str, Path]]: """Return local log files selected by CLI flags.""" if backend and not webui: - return [("backend", paths.backend_log)] + return [("flocks", paths.backend_log)] if webui and not backend: - return [("webui", paths.frontend_log)] + return [("flocks", paths.backend_log)] return [ - ("backend", paths.backend_log), - ("webui", paths.frontend_log), + ("flocks", paths.backend_log), ("daemon", supervisor_log_path(paths)), ] diff --git a/flocks/cli/service_process.py b/flocks/cli/service_process.py index 3f2d50e7a..d6407417e 100644 --- a/flocks/cli/service_process.py +++ b/flocks/cli/service_process.py @@ -9,7 +9,7 @@ import httpx -from flocks.cli.service_config import ServiceConfig, with_frontend_build +from flocks.cli.service_config import ServiceConfig @dataclass(frozen=True) @@ -60,47 +60,28 @@ def probe(self, process: subprocess.Popen | None, host: str, port: int) -> Servi if not tcp_port_accepts_connections(host, port): return ServiceProbeResult(healthy=False, reason=f"port {port} is not listening", restart=True) - from flocks.cli.service_manager import _backend_health_url, _is_healthy_status_response + from flocks.cli.service_manager import _backend_health_url, _is_healthy_status_response, backend_access_base_url url = _backend_health_url(host, port) try: with httpx.Client(timeout=2.0, trust_env=False) as client: response = client.get(url) - healthy = _is_healthy_status_response(response) - reason = f"health status={response.status_code}" + root_response = client.get( + backend_access_base_url(ServiceConfig(backend_host=host, backend_port=port)), + headers={"Accept": "text/html"}, + ) + healthy = _is_healthy_status_response(response) and _is_static_webui_response(root_response) + reason = f"health status={response.status_code}, root status={root_response.status_code}" except Exception as exc: healthy = False reason = f"health failed: {exc}" return ServiceProbeResult(healthy=healthy, reason=reason) -class WebUIProcessAdapter: - name = "webui" - label = "WebUI" - - def start(self, config: ServiceConfig, paths, *, built_once: bool = False) -> subprocess.Popen: - from flocks.cli.service_manager import _StdoutConsole, _start_frontend_process - - resolved = with_frontend_build(config, skip_frontend_build=True) if built_once else config - return _start_frontend_process(resolved, _StdoutConsole(), paths=paths) - - def stop(self, process: subprocess.Popen | None) -> None: - from flocks.cli.service_manager import _StdoutConsole, _terminate_process - - _terminate_process(process, self.label, _StdoutConsole()) - - def probe(self, process: subprocess.Popen | None, host: str, port: int) -> ServiceProbeResult: - if process is None: - return ServiceProbeResult(healthy=False, reason="stopped") - if process.poll() is not None: - return ServiceProbeResult( - healthy=False, - reason=f"process exited with code {process.returncode}", - restart=True, - ) - if not tcp_port_accepts_connections(host, port): - return ServiceProbeResult(healthy=False, reason=f"port {port} is not listening", restart=True) - return ServiceProbeResult(healthy=True) +def _is_static_webui_response(response: httpx.Response) -> bool: + """Return True only when the unified service serves the SPA index.""" + content_type = response.headers.get("content-type", "").lower() + return response.status_code == 200 and "text/html" in content_type def tcp_port_accepts_connections(host: str, port: int) -> bool: diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py index b72dfe24d..afb706926 100644 --- a/flocks/cli/service_supervisor.py +++ b/flocks/cli/service_supervisor.py @@ -24,7 +24,7 @@ supervisor_log_path, supervisor_socket_path, ) -from flocks.cli.service_process import BackendProcessAdapter, ProcessAdapter, WebUIProcessAdapter +from flocks.cli.service_process import BackendProcessAdapter, ProcessAdapter SUPERVISOR_CHECK_INTERVAL_SECONDS = 5.0 SUPERVISOR_HEALTH_FAILURE_THRESHOLD = 2 @@ -64,7 +64,7 @@ def _daemon_log(event: str, details: dict[str, object] | None = None) -> None: def _health_status_from_service_state(state: str) -> str: - if state in {"healthy", "starting", "restarting", "stopped", "paused"}: + if state in {"healthy", "static", "starting", "restarting", "stopped", "paused"}: return state return "degraded" @@ -99,7 +99,6 @@ def __init__( interval: float = SUPERVISOR_CHECK_INTERVAL_SECONDS, failure_threshold: int = SUPERVISOR_HEALTH_FAILURE_THRESHOLD, backend_adapter: ProcessAdapter | None = None, - webui_adapter: ProcessAdapter | None = None, ) -> None: from flocks.cli.service_manager import ensure_runtime_dirs @@ -108,7 +107,6 @@ def __init__( self.interval = interval self.failure_threshold = failure_threshold self.backend_adapter = backend_adapter or BackendProcessAdapter() - self.webui_adapter = webui_adapter or WebUIProcessAdapter() self.started_at = time.time() self._lock = threading.RLock() self._shutdown_requested = threading.Event() @@ -126,9 +124,10 @@ def __init__( self.webui = ManagedService( name="webui", label="WebUI", - host=config.frontend_host, - port=config.frontend_port, - log_path=self.paths.frontend_log, + host=config.backend_host, + port=config.backend_port, + log_path=self.paths.backend_log, + state="static", ) def run(self) -> None: @@ -268,8 +267,7 @@ def do_POST(self) -> None: self._send_json(daemon.status_payload()) return if parsed.path == "/stop/webui": - daemon.stop_webui(reason="control stop") - self._send_json(daemon.status_payload()) + self._send_json({"error": "static WebUI is served by Flocks service and cannot be stopped separately"}, status=409) return if parsed.path == "/upgrade/prepare": daemon.prepare_upgrade(reason="control upgrade prepare") @@ -293,8 +291,9 @@ def update_config(self, payload: dict[str, Any]) -> None: self.config = service_config_from_payload(payload, self.config) self.backend.host = self.config.backend_host self.backend.port = self.config.backend_port - self.webui.host = self.config.frontend_host - self.webui.port = self.config.frontend_port + self.webui.host = self.config.backend_host + self.webui.port = self.config.backend_port + self.webui.log_path = self.paths.backend_log def request_stop(self) -> None: self._shutdown_requested.set() @@ -390,15 +389,14 @@ def handle_logs_request(self, handler: BaseHTTPRequestHandler, query: dict[str, def _log_paths_for_service(self, service_name: str) -> list[tuple[str, Path]]: if service_name == "backend": - return [("backend", self.paths.backend_log)] + return [("flocks", self.paths.backend_log)] if service_name == "webui": - return [("webui", self.paths.frontend_log)] + return [("flocks", self.paths.backend_log)] if service_name == "daemon": return [("daemon", supervisor_log_path(self.paths))] if service_name == "all": return [ - ("backend", self.paths.backend_log), - ("webui", self.paths.frontend_log), + ("flocks", self.paths.backend_log), ("daemon", supervisor_log_path(self.paths)), ] return [] @@ -407,31 +405,27 @@ def restart_all(self, *, reason: str) -> None: with self._lock: self._backend_paused = False self._webui_paused = False - self._restart_service(self.webui, reason=reason, immediate=True) self._restart_service(self.backend, reason=reason, immediate=True) self._start_backend_locked(immediate=True) - self._start_webui_locked(immediate=True) + self._sync_static_webui_state() def restart_backend(self, *, reason: str) -> None: with self._lock: self._backend_paused = False self._restart_service(self.backend, reason=reason, immediate=True) self._start_backend_locked(immediate=True) + self._sync_static_webui_state() def restart_webui(self, *, reason: str, force_frontend_build: bool = False) -> None: with self._lock: self._webui_paused = False if force_frontend_build: - self.webui.built_once = False - self._restart_service(self.webui, reason=reason, immediate=True) - self._start_webui_locked(immediate=True) + from flocks.cli.service_config import with_frontend_build - def stop_webui(self, *, reason: str) -> None: - with self._lock: - self._webui_paused = True - _daemon_log("service_pause", {"service": "webui", "reason": reason}) - self._stop_service(self.webui) - self.webui.last_error = reason + self.config = with_frontend_build(self.config, skip_frontend_build=False) + self._restart_service(self.backend, reason=f"{reason}: static webui", immediate=True) + self._start_backend_locked(immediate=True) + self._sync_static_webui_state() def prepare_upgrade(self, *, reason: str) -> None: with self._lock: @@ -441,7 +435,8 @@ def prepare_upgrade(self, *, reason: str) -> None: _daemon_log("service_pause", {"service": "webui", "reason": reason}) self.backend.last_error = reason self.webui.last_error = reason - self._stop_service(self.webui) + self._stop_service(self.backend) + self.webui.state = "paused" def resume_upgrade(self, *, reason: str) -> None: with self._lock: @@ -450,25 +445,21 @@ def resume_upgrade(self, *, reason: str) -> None: _daemon_log("service_resume", {"service": "backend", "reason": reason}) _daemon_log("service_resume", {"service": "webui", "reason": reason}) self._probe_backend_locked() - self._probe_webui_locked() self._start_backend_locked(immediate=True) - self._start_webui_locked(immediate=True) + self._sync_static_webui_state() def shutdown_children(self) -> None: with self._lock: - self._stop_service(self.webui) self._stop_service(self.backend) + self.webui.state = "stopped" def tick(self) -> None: with self._lock: if not self._backend_paused: self._probe_backend_locked() - if not self._webui_paused: - self._probe_webui_locked() if not self._backend_paused: self._start_backend_locked(immediate=False) - if not self._webui_paused: - self._start_webui_locked(immediate=False) + self._sync_static_webui_state() def _restart_service(self, service: ManagedService, *, reason: str, immediate: bool) -> None: _daemon_log("service_restart", {"service": service.name, "reason": reason}) @@ -503,24 +494,7 @@ def _start_backend_locked(self, *, immediate: bool) -> None: self.backend.state = "healthy" self.backend.last_error = None self.backend.health_failure_count = 0 - - def _start_webui_locked(self, *, immediate: bool) -> None: - if self.webui.process is not None and self.webui.process.poll() is None: - return - if not immediate and time.monotonic() < self.webui.next_restart_at: - return - self.webui.state = "starting" - try: - process = self.webui_adapter.start(self.config, self.paths, built_once=self.webui.built_once) - except Exception as exc: - self._mark_start_failed(self.webui, exc) - return - self.webui.process = process - self.webui.command = tuple(str(item) for item in process.args) - self.webui.state = "healthy" - self.webui.last_error = None - self.webui.health_failure_count = 0 - self.webui.built_once = True + self._sync_static_webui_state() def _mark_start_failed(self, service: ManagedService, error: Exception) -> None: service.process = None @@ -556,20 +530,27 @@ def _probe_backend_locked(self) -> None: if self.backend.health_failure_count >= self.failure_threshold: self._restart_service(self.backend, reason=result.reason or "backend health failed", immediate=True) - def _probe_webui_locked(self) -> None: - result = self.webui_adapter.probe(self.webui.process, self.webui.host, self.webui.port) - if self.webui.process is None: - self.webui.state = "stopped" - return - if result.restart: - self._restart_service(self.webui, reason=result.reason or "webui probe failed", immediate=True) - return - self.webui.state = "healthy" - self.webui.health_failure_count = 0 - self.webui.last_error = None - def _adapter_for(self, service: ManagedService) -> ProcessAdapter: - return self.backend_adapter if service.name == "backend" else self.webui_adapter + return self.backend_adapter + + def _sync_static_webui_state(self) -> None: + self.webui.host = self.backend.host + self.webui.port = self.backend.port + self.webui.log_path = self.paths.backend_log + self.webui.process = None + self.webui.command = () + if self._webui_paused: + self.webui.state = "paused" + return + if self.backend.state == "healthy": + self.webui.state = "static" + self.webui.last_error = None + elif self.backend.state in {"starting", "restarting"}: + self.webui.state = self.backend.state + self.webui.last_error = self.backend.last_error + else: + self.webui.state = "degraded" + self.webui.last_error = self.backend.last_error or "server is not healthy" def run_service_daemon( diff --git a/flocks/server/app.py b/flocks/server/app.py index aa03ebc60..d5e9f128c 100644 --- a/flocks/server/app.py +++ b/flocks/server/app.py @@ -13,7 +13,7 @@ from pathlib import Path from typing import Any, Callable, Optional from contextlib import asynccontextmanager -from fastapi import FastAPI, Request, Response, status +from fastapi import FastAPI, Request, status from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse from fastapi.exceptions import RequestValidationError @@ -26,6 +26,7 @@ from flocks.auth.service import AuthService from flocks.extensions import ExtensionOptions, handler_name, normalize_fail_policy, normalize_timeout from flocks.server.auth import apply_auth_for_request, clear_auth_context +from flocks.server.static_webui import maybe_serve_static_webui # Load .env file at startup try: @@ -780,7 +781,6 @@ async def __call__(self, scope, receive, send): await self._inner(scope, receive, send) -# Instance Context Middleware @app.middleware("http") async def instance_context_middleware(request: Request, call_next): """ @@ -796,7 +796,7 @@ async def instance_context_middleware(request: Request, call_next): from urllib.parse import unquote from flocks.project.instance import Instance from flocks.project.bootstrap import instance_bootstrap - + # Skip instance context for global routes, static files, and simple endpoints skip_prefixes = { "/global", "/docs", "/redoc", "/openapi.json", "/health", @@ -891,6 +891,15 @@ async def auth_guard_middleware(request: Request, call_next): clear_auth_context(token) +@app.middleware("http") +async def static_webui_middleware(request: Request, call_next): + """Serve the SPA shell before auth for browser navigations.""" + static_response = await maybe_serve_static_webui(request) + if static_response is not None: + return static_response + return await call_next(request) + + # Error Handlers @app.exception_handler(RequestValidationError) async def validation_exception_handler(request: Request, exc: RequestValidationError): diff --git a/flocks/server/static_webui.py b/flocks/server/static_webui.py new file mode 100644 index 000000000..dede61320 --- /dev/null +++ b/flocks/server/static_webui.py @@ -0,0 +1,129 @@ +"""Static WebUI hosting helpers for the FastAPI server.""" + +from __future__ import annotations + +import os +import re +from pathlib import Path +from urllib.parse import unquote + +from fastapi import Request, Response +from fastapi.responses import FileResponse, PlainTextResponse + +_INDEX_CACHE_CONTROL = "no-store" +_ASSET_CACHE_CONTROL = "public, max-age=31536000, immutable" +_STATIC_CACHE_CONTROL = "no-cache" +_FINGERPRINT_RE = re.compile(r"(?:^|[.-])[0-9a-f]{8,}(?:[.-]|$)", re.IGNORECASE) +_PROTECTED_PREFIXES = ( + "/api", + "/event", + "/global", + "/docs", + "/redoc", + "/openapi.json", + "/health", +) + + +class WebUIDistMissingError(RuntimeError): + """Raised when the production WebUI build output is unavailable.""" + + +def source_webui_dist_dir() -> Path: + """Return the source-tree WebUI dist directory.""" + return Path(__file__).resolve().parents[2] / "webui" / "dist" + + +def packaged_webui_dist_dir() -> Path: + """Return the packaged WebUI static directory.""" + return Path(__file__).resolve().parents[1] / "webui_static" + + +def resolve_webui_dist_dir() -> Path | None: + """Return the first usable WebUI dist directory.""" + candidates: list[Path] = [] + override = os.getenv("FLOCKS_WEBUI_DIST_DIR") + if override: + candidates.append(Path(override).expanduser()) + candidates.extend([source_webui_dist_dir(), packaged_webui_dist_dir()]) + for candidate in candidates: + if (candidate / "index.html").is_file(): + return candidate.resolve() + return None + + +def ensure_webui_dist_dir() -> Path: + """Return the WebUI dist directory or raise a clear startup error.""" + dist_dir = resolve_webui_dist_dir() + if dist_dir is None: + raise WebUIDistMissingError( + "WebUI build output is missing. Run `cd webui && npm run build`, " + "or start without `--skip-webui-build` so Flocks can build it." + ) + return dist_dir + + +async def maybe_serve_static_webui(request: Request) -> Response | None: + """Serve SPA static files for browser navigations. + + API and TUI-compatible requests continue through the existing routers. Only + real static files and browser HTML navigation requests are handled here. + """ + if request.method not in {"GET", "HEAD"}: + return None + + path = request.url.path or "/" + dist_dir = resolve_webui_dist_dir() + if dist_dir is None: + return None + + file_path = _resolve_existing_static_file(dist_dir, path) + if file_path is not None: + return _file_response(file_path, cache_control=_cache_control_for_file(path, file_path)) + + if path.startswith("/assets/"): + return PlainTextResponse("Not found", status_code=404) + if _is_protected_backend_path(path): + return None + if not _accepts_html(request): + return None + + return _file_response(dist_dir / "index.html", cache_control=_INDEX_CACHE_CONTROL) + + +def _resolve_existing_static_file(dist_dir: Path, path: str) -> Path | None: + if path == "/": + return None + relative = unquote(path.lstrip("/")) + candidate = (dist_dir / relative).resolve() + try: + candidate.relative_to(dist_dir) + except ValueError: + return None + if candidate.is_file(): + return candidate + return None + + +def _file_response(path: Path, *, cache_control: str) -> FileResponse: + headers = {"Cache-Control": cache_control} + return FileResponse(path, headers=headers) + + +def _cache_control_for_file(path: str, file_path: Path) -> str: + if file_path.name == "index.html": + return _INDEX_CACHE_CONTROL + if path.startswith("/assets/") or _FINGERPRINT_RE.search(file_path.name): + return _ASSET_CACHE_CONTROL + return _STATIC_CACHE_CONTROL + + +def _is_protected_backend_path(path: str) -> bool: + return any(path == prefix or path.startswith(prefix + "/") for prefix in _PROTECTED_PREFIXES) + + +def _accepts_html(request: Request) -> bool: + accept = request.headers.get("accept", "") + if not accept or accept == "*/*": + return False + return "text/html" in accept or "application/xhtml+xml" in accept diff --git a/flocks/updater/updater.py b/flocks/updater/updater.py index a5456de45..121a3553a 100644 --- a/flocks/updater/updater.py +++ b/flocks/updater/updater.py @@ -2128,13 +2128,23 @@ def _service_config_from_payload( *, skip_frontend_build: bool | None = None, ): - from flocks.cli.service_config import service_config_from_payload + from flocks.cli.service_config import ServiceConfig, service_config_from_payload resolved_skip_frontend_build = ( bool(payload.get("skip_frontend_build", True)) if skip_frontend_build is None else skip_frontend_build ) + migrated_payload = dict(payload) + backend_port = migrated_payload.get("backend_port") + frontend_port = migrated_payload.get("frontend_port") + if isinstance(backend_port, int) and isinstance(frontend_port, int) and backend_port != frontend_port: + migrated_payload["legacy_backend_host"] = migrated_payload.get("backend_host") + migrated_payload["legacy_backend_port"] = backend_port + migrated_payload["backend_host"] = migrated_payload.get("frontend_host") or migrated_payload.get("backend_host") + migrated_payload["backend_port"] = frontend_port + migrated_payload["server_port_migration_hint"] = True return service_config_from_payload( - payload, + migrated_payload, + default=ServiceConfig(), no_browser=True, skip_frontend_build=resolved_skip_frontend_build, ) @@ -2202,6 +2212,10 @@ def read_upgrade_runtime_state(frontend_port: int | None = None) -> dict[str, An } +def _webui_runtime_ready(state: str) -> bool: + return state in {"healthy", "static"} + + def _start_frontend_with_fallback(config, console, *, allow_build_fallback: bool) -> None: from flocks.cli.service_config import with_frontend_build from flocks.cli.service_control import request_restart_webui, request_resume_upgrade @@ -2212,7 +2226,7 @@ def _start_frontend_with_fallback(config, console, *, allow_build_fallback: bool paths=None, timeout=180.0, ) - if status.webui.state != "healthy": + if not _webui_runtime_ready(status.webui.state): raise RuntimeError(status.webui.last_error or "WebUI restart did not become healthy") return except Exception: @@ -2221,7 +2235,7 @@ def _start_frontend_with_fallback(config, console, *, allow_build_fallback: bool rebuilt_config = with_frontend_build(config, skip_frontend_build=False) result = request_restart_webui(rebuilt_config, force_frontend_build=True, paths=None, timeout=180.0) - if result.webui.state != "healthy": + if not _webui_runtime_ready(result.webui.state): raise RuntimeError(result.webui.last_error or "WebUI restart did not become healthy") @@ -3473,15 +3487,15 @@ def _build_restart_handoff_argv( "start", "--no-browser", "--skip-webui-build", - "--server-host", + "--host", str(config.backend_host), - "--server-port", + "--port", str(config.backend_port), - "--webui-host", - str(config.frontend_host), - "--webui-port", - str(config.frontend_port), ] + if config.legacy_backend_host is not None: + managed_restart_argv.extend(["--server-host", str(config.legacy_backend_host)]) + if config.legacy_backend_port is not None: + managed_restart_argv.extend(["--server-port", str(config.legacy_backend_port)]) argv = [ restart_argv[0], "-m", diff --git a/pyproject.toml b/pyproject.toml index 81ebf74ef..ab558f626 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,6 +109,9 @@ build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["flocks"] +[tool.hatch.build.targets.wheel.force-include] +"webui/dist" = "flocks/webui_static" + [tool.ruff] line-length = 120 target-version = "py312" diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index c5036c5e1..69d634e8e 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -26,6 +26,11 @@ def print(self, *args, **kwargs) -> None: self.messages.append(" ".join(str(arg) for arg in args)) +@pytest.fixture(autouse=True) +def _skip_backend_webui_dist_check(monkeypatch) -> None: + monkeypatch.setattr(service_manager, "_ensure_webui_dist", lambda *_args, **_kwargs: None) + + def _make_runtime_paths(tmp_path: Path) -> service_manager.RuntimePaths: return service_manager.RuntimePaths( root=tmp_path, @@ -375,8 +380,8 @@ def test_selected_log_paths_support_specific_targets(tmp_path: Path) -> None: ) assert service_manager.selected_log_paths(paths, backend=True) == [paths.backend_log] - assert service_manager.selected_log_paths(paths, webui=True) == [paths.frontend_log] - assert service_manager.selected_log_paths(paths) == [paths.backend_log, paths.frontend_log] + assert service_manager.selected_log_paths(paths, webui=True) == [paths.backend_log] + assert service_manager.selected_log_paths(paths) == [paths.backend_log] def test_show_logs_falls_back_to_local_files_when_daemon_unavailable(monkeypatch, tmp_path: Path) -> None: @@ -397,8 +402,7 @@ def test_show_logs_falls_back_to_local_files_when_daemon_unavailable(monkeypatch service_manager.show_logs(console, follow=False, lines=1) assert any("改为读取本地日志文件" in message for message in console.messages) - assert "[backend] backend-two" in console.messages - assert "[webui] webui-one" in console.messages + assert "[flocks] backend-two" in console.messages assert "[daemon] daemon-one" in console.messages @@ -786,12 +790,11 @@ def _supervisor_status_payload() -> dict[str, object]: "log_path": "/tmp/logs/backend.log", }, "webui": { - "pid": 222, "host": "0.0.0.0", - "port": 5174, - "state": "healthy", + "port": 9000, + "state": "static", "last_error": None, - "log_path": "/tmp/logs/webui.log", + "log_path": "/tmp/logs/backend.log", }, } @@ -809,26 +812,21 @@ def test_build_status_lines_reports_supervisor_control_status(monkeypatch, tmp_p assert lines[0] == "[flocks] 服务" assert lines[1] == "[flocks] daemon: state=running PID=100" assert "http://127.0.0.1:9000" in lines[2] - assert "http://127.0.0.1:5174" in lines[3] - assert lines[6] == "[flocks] daemon: /tmp/logs/daemon.log" - assert lines[7] == "[flocks] 后端: /tmp/logs/backend.log" - assert lines[8] == "[flocks] WebUI: /tmp/logs/webui.log" + assert lines[5] == "[flocks] daemon: /tmp/logs/daemon.log" + assert lines[6] == "[flocks] flocks: /tmp/logs/backend.log" def test_startup_status_lines_use_progress_summary() -> None: lines = service_manager._startup_status_lines_from_payload(_supervisor_status_payload()) - assert lines[:3] == [ + assert lines[:2] == [ "[flocks] [x] 启动 Flocks daemon...", - "[flocks] [x] 启动 Flocks server...", - "[flocks] [x] 启动 Flocks webui...", + "[flocks] [x] 启动 Flocks service...", ] - assert lines[5] == "[flocks] daemon: state=running PID=100" - assert lines[6] == "[flocks] server: state=healthy PID=111 URL=http://127.0.0.1:9000" - assert lines[7] == "[flocks] webui: state=healthy PID=222 URL=http://127.0.0.1:5174" - assert lines[10] == "[flocks] daemon: /tmp/logs/daemon.log" - assert lines[11] == "[flocks] server: /tmp/logs/backend.log" - assert lines[12] == "[flocks] webui: /tmp/logs/webui.log" + assert lines[4] == "[flocks] daemon: state=running PID=100" + assert lines[5] == "[flocks] flocks: state=healthy PID=111 URL=http://127.0.0.1:9000" + assert lines[8] == "[flocks] daemon: /tmp/logs/daemon.log" + assert lines[9] == "[flocks] flocks: /tmp/logs/backend.log" def test_startup_status_lines_mark_unhealthy_steps() -> None: @@ -838,8 +836,8 @@ def test_startup_status_lines_mark_unhealthy_steps() -> None: lines = service_manager._startup_status_lines_from_payload(payload) - assert lines[1] == "[flocks] [!] 启动 Flocks server..." - assert lines[6] == "[flocks] server: state=degraded PID=111 URL=http://127.0.0.1:9000 last_error=port occupied" + assert lines[1] == "[flocks] [!] 启动 Flocks service..." + assert lines[5] == "[flocks] flocks: state=degraded PID=111 URL=http://127.0.0.1:9000 last_error=port occupied" def test_startup_status_lines_can_skip_daemon_step() -> None: @@ -848,10 +846,7 @@ def test_startup_status_lines_can_skip_daemon_step() -> None: include_daemon_step=False, ) - assert lines[:2] == [ - "[flocks] [x] 启动 Flocks server...", - "[flocks] [x] 启动 Flocks webui...", - ] + assert lines[:1] == ["[flocks] [x] 启动 Flocks service..."] def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, tmp_path: Path) -> None: @@ -975,6 +970,8 @@ def test_start_all_without_stop_starts_supervisor_daemon(monkeypatch, tmp_path: console = DummyConsole() monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda *_args, **_kwargs: None) monkeypatch.setattr(service_manager, "_start_supervisor_process", lambda _config, _paths, _console: calls.append("daemon") or SimpleNamespace(poll=lambda: None)) monkeypatch.setattr(service_manager, "_wait_for_supervisor_ready", lambda _paths, **_kwargs: calls.append("ready") or _supervisor_status_payload()) monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console, **_kwargs: calls.append("status")) @@ -1053,7 +1050,7 @@ def _capture_spawn(*_args, **kwargs) -> SimpleNamespace: assert process.pid == 2468 assert not paths.backend_pid.exists() assert probe_calls == [{ - "urls": ["http://127.0.0.1:8000"], + "urls": ["http://127.0.0.1:5173"], "name": "后端服务", "attempts": 30, "delay": 3.0, @@ -1301,73 +1298,6 @@ def test_build_frontend_env_allows_direct_backend_urls_when_opted_in(monkeypatch assert env["VITE_WS_BASE_URL"] == "ws://10.0.0.8:9000" -def test_start_frontend_passes_backend_urls_to_build_and_preview(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - console = DummyConsole() - build_calls: list[dict[str, object]] = [] - preview_calls: list[dict[str, object]] = [] - - def fake_run(command, **kwargs): - build_calls.append({"command": command, "kwargs": kwargs}) - return SimpleNamespace(returncode=0) - - def fake_spawn(command, **kwargs): - preview_calls.append({"command": command, "kwargs": kwargs}) - return SimpleNamespace(pid=2468) - - monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) - monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: pid) - monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: "/usr/bin/npm") - monkeypatch.setattr(service_manager, "node_version_satisfies_requirement", lambda: True) - monkeypatch.setattr(service_manager.subprocess, "run", fake_run) - monkeypatch.setattr(service_manager, "_spawn_process", fake_spawn) - monkeypatch.setenv("__VITE_ADDITIONAL_SERVER_ALLOWED_HOSTS", "preview.example.com") - - config = service_manager.ServiceConfig( - backend_host="10.0.0.8", - backend_port=9000, - frontend_host="0.0.0.0", - frontend_port=5174, - ) - service_manager._start_frontend_process(config, console) - - assert build_calls[0]["command"] == ["/usr/bin/npm", "run", "build"] - assert build_calls[0]["kwargs"]["env"]["FLOCKS_API_PROXY_TARGET"] == "http://10.0.0.8:9000" - assert build_calls[0]["kwargs"]["env"]["__VITE_ADDITIONAL_SERVER_ALLOWED_HOSTS"] == "preview.example.com" - assert "VITE_API_BASE_URL" not in build_calls[0]["kwargs"]["env"] - assert "VITE_WS_BASE_URL" not in build_calls[0]["kwargs"]["env"] - - assert preview_calls[0]["command"] == [ - "/usr/bin/npm", - "run", - "preview", - "--", - "--host", - "0.0.0.0", - "--port", - "5174", - ] - assert preview_calls[0]["kwargs"]["env"]["FLOCKS_API_PROXY_TARGET"] == "http://10.0.0.8:9000" - assert preview_calls[0]["kwargs"]["env"]["__VITE_ADDITIONAL_SERVER_ALLOWED_HOSTS"] == "preview.example.com" - assert "VITE_API_BASE_URL" not in preview_calls[0]["kwargs"]["env"] - assert "VITE_WS_BASE_URL" not in preview_calls[0]["kwargs"]["env"] - assert not paths.frontend_pid.exists() - - def _fake_process(pid: int, args: list[str] | None = None, returncode: int | None = None): return SimpleNamespace(pid=pid, args=args or [str(pid)], returncode=returncode, poll=lambda: returncode) @@ -1379,9 +1309,7 @@ def test_supervisor_recovers_backend_when_port_disappears(monkeypatch, tmp_path: daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig(backend_port=9995, frontend_port=9996)) daemon.paths = paths daemon.backend.log_path = paths.backend_log - daemon.webui.log_path = paths.frontend_log daemon.backend.process = _fake_process(111, ["backend"]) - daemon.webui.process = _fake_process(222, ["webui"]) monkeypatch.setattr(service_process, "tcp_port_accepts_connections", lambda _host, port: port != 9995) monkeypatch.setattr(service_manager, "_terminate_process", lambda _process, name, _console: calls.append(f"stop:{name}")) @@ -1407,7 +1335,6 @@ def test_supervisor_waits_for_second_backend_health_failure(monkeypatch, tmp_pat ) daemon.paths = paths daemon.backend.process = _fake_process(111, ["backend"]) - daemon.webui.process = _fake_process(222, ["webui"]) class FakeClient: def __init__(self, *_args, **_kwargs) -> None: @@ -1419,7 +1346,7 @@ def __enter__(self): def __exit__(self, *_args) -> None: return None - def get(self, _url): + def get(self, _url, **_kwargs): return httpx.Response(503, json={"status": "unhealthy"}) monkeypatch.setattr(service_process.httpx, "Client", FakeClient) @@ -1439,27 +1366,65 @@ def get(self, _url): assert calls == ["stop:后端", "start:backend"] -def test_supervisor_recovers_webui_when_port_disappears(monkeypatch, tmp_path: Path) -> None: +def test_backend_probe_rejects_api_root_when_static_webui_missing(monkeypatch) -> None: + class FakeClient: + def __init__(self, *_args, **_kwargs) -> None: + pass + + def __enter__(self): + return self + + def __exit__(self, *_args) -> None: + return None + + def get(self, url, **_kwargs): + if str(url).endswith("/api/health"): + return httpx.Response(200, json={"status": "healthy"}) + return httpx.Response(200, json={"status": "running"}) + + monkeypatch.setattr(service_process, "tcp_port_accepts_connections", lambda *_args: True) + monkeypatch.setattr(service_process.httpx, "Client", FakeClient) + + result = service_process.BackendProcessAdapter().probe(_fake_process(111, ["backend"]), "127.0.0.1", 5173) + + assert result.healthy is False + assert result.reason == "health status=200, root status=200" + + +def test_supervisor_reports_webui_as_static_endpoint(monkeypatch, tmp_path: Path) -> None: paths = _make_runtime_paths(tmp_path) calls: list[str] = [] monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig(backend_port=9995, frontend_port=9996)) daemon.paths = paths daemon.backend.process = _fake_process(111, ["backend"]) - daemon.webui.process = _fake_process(222, ["webui"]) - - monkeypatch.setattr(service_process, "tcp_port_accepts_connections", lambda _host, port: port != 9996) - monkeypatch.setattr(service_manager, "_terminate_process", lambda _process, name, _console: calls.append(f"stop:{name}")) - monkeypatch.setattr( - service_manager, - "_start_frontend_process", - lambda *_args, **_kwargs: calls.append("start:webui") or _fake_process(444, ["webui-new"]), - ) daemon.tick() - assert calls == ["stop:WebUI", "start:webui"] - assert daemon.webui.pid == 444 + assert calls == [] + assert daemon.webui.pid is None + assert daemon.webui.state == "static" + + +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_supervisor_rejects_static_webui_stop_control_api(monkeypatch, tmp_path: Path) -> None: + del tmp_path + short_root = make_short_runtime_root("flocks-supervisor-") + paths = _make_runtime_paths(short_root) + paths.run_dir.mkdir(parents=True) + paths.log_dir.mkdir(parents=True) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) + daemon._start_control_server() + + try: + with pytest.raises(httpx.HTTPStatusError) as exc_info: + service_control.control_api_request("POST", "/stop/webui", paths=paths) + finally: + daemon._stop_control_server() + shutil.rmtree(short_root, ignore_errors=True) + + assert exc_info.value.response.status_code == 409 @pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") @@ -1471,36 +1436,31 @@ def test_supervisor_upgrade_prepare_control_api_pauses_real_child_restart(monkey paths.log_dir.mkdir(parents=True) monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) backend_adapter = SleeperProcessAdapter() - webui_adapter = SleeperProcessAdapter() daemon = service_supervisor.SupervisorDaemon( service_manager.ServiceConfig(backend_port=9995, frontend_port=9996), backend_adapter=backend_adapter, - webui_adapter=webui_adapter, ) daemon._start_control_server() try: daemon.restart_all(reason="test startup") backend_process = daemon.backend.process - webui_process = daemon.webui.process assert backend_process is not None - assert webui_process is not None + assert daemon.webui.process is None + assert daemon.webui.state == "static" status = service_control.request_prepare_upgrade(paths=paths) - wait_for_process_exit(webui_process) + wait_for_process_exit(backend_process) assert status.backend.paused is True assert status.webui.paused is True - assert daemon.backend.process is backend_process - assert backend_process.poll() is None - assert webui_process.pid in webui_adapter.stopped + assert daemon.backend.process is None + assert backend_process.pid in backend_adapter.stopped - backend_process.terminate() - backend_process.wait(timeout=5) daemon.tick() assert len(backend_adapter.started) == 1 - assert daemon.backend.process is backend_process + assert daemon.backend.process is None assert daemon.status_payload()["backend"]["paused"] is True finally: daemon.shutdown_children() @@ -1508,23 +1468,12 @@ def test_supervisor_upgrade_prepare_control_api_pauses_real_child_restart(monkey shutil.rmtree(short_root, ignore_errors=True) -def test_start_frontend_tolerates_windows_node_assertion_after_build(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) +def test_build_webui_dist_tolerates_windows_node_assertion_after_build(monkeypatch, tmp_path: Path) -> None: webui_dir = tmp_path / "webui" webui_dist = webui_dir / "dist" webui_dist.mkdir(parents=True) + (webui_dir / "package.json").write_text("{}", encoding="utf-8") console = DummyConsole() - preview_calls: list[list[str]] = [] def fake_run(_command, **_kwargs): (webui_dist / "index.html").write_text("", encoding="utf-8") @@ -1534,89 +1483,20 @@ def fake_run(_command, **_kwargs): stderr="Assertion failed: !(handle->flags & UV_HANDLE_CLOSING), file src\\win\\async.c, line 76", ) - def fake_spawn(command, **_kwargs): - preview_calls.append(list(command)) - return SimpleNamespace(pid=2468) - monkeypatch.setattr(service_manager.sys, "platform", "win32") - monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: "npm.cmd") monkeypatch.setattr(service_manager, "node_version_satisfies_requirement", lambda: True) monkeypatch.setattr(service_manager.subprocess, "run", fake_run) - monkeypatch.setattr(service_manager, "_spawn_process", fake_spawn) - service_manager._start_frontend_process(service_manager.ServiceConfig(), console) + service_manager._build_webui_dist(tmp_path, service_manager.ServiceConfig(), console) - assert preview_calls[0][:3] == ["npm.cmd", "run", "preview"] assert "[flocks] WebUI 构建产物已生成,忽略 Windows Node.js 退出断言。" in console.messages -def test_start_frontend_passes_direct_backend_urls_when_opted_in(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - console = DummyConsole() - build_calls: list[dict[str, object]] = [] - preview_calls: list[dict[str, object]] = [] - - def fake_run(command, **kwargs): - build_calls.append({"command": command, "kwargs": kwargs}) - return SimpleNamespace(returncode=0) - - def fake_spawn(command, **kwargs): - preview_calls.append({"command": command, "kwargs": kwargs}) - return SimpleNamespace(pid=2468) - - monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) - monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: pid) - monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: "/usr/bin/npm") - monkeypatch.setattr(service_manager, "node_version_satisfies_requirement", lambda: True) - monkeypatch.setattr(service_manager.subprocess, "run", fake_run) - monkeypatch.setattr(service_manager, "_spawn_process", fake_spawn) - monkeypatch.setenv(service_manager.WEBUI_DIRECT_BACKEND_URLS_ENV, "true") - - config = service_manager.ServiceConfig( - backend_host="10.0.0.8", - backend_port=9000, - frontend_host="0.0.0.0", - frontend_port=5174, - ) - service_manager._start_frontend_process(config, console) - - assert build_calls[0]["kwargs"]["env"]["VITE_API_BASE_URL"] == "http://10.0.0.8:9000" - assert build_calls[0]["kwargs"]["env"]["VITE_WS_BASE_URL"] == "ws://10.0.0.8:9000" - assert preview_calls[0]["kwargs"]["env"]["VITE_API_BASE_URL"] == "http://10.0.0.8:9000" - assert preview_calls[0]["kwargs"]["env"]["VITE_WS_BASE_URL"] == "ws://10.0.0.8:9000" - - -def test_start_frontend_prefers_bundled_npm_over_path_lookup(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) +def test_build_webui_dist_prefers_bundled_npm_over_path_lookup(monkeypatch, tmp_path: Path) -> None: + webui_dir = tmp_path / "webui" + webui_dir.mkdir() + (webui_dir / "package.json").write_text("{}", encoding="utf-8") console = DummyConsole() build_calls: list[list[str]] = [] @@ -1624,18 +1504,11 @@ def fake_run(command, **_kwargs): build_calls.append(command) return SimpleNamespace(returncode=0) - monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) - monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: pid) monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: r"C:\Users\flocks\AppData\Local\Programs\Flocks\tools\node\npm.cmd") monkeypatch.setattr(service_manager, "node_version_satisfies_requirement", lambda: True) monkeypatch.setattr(service_manager.subprocess, "run", fake_run) - monkeypatch.setattr(service_manager, "_spawn_process", lambda *_args, **_kwargs: SimpleNamespace(pid=2468)) - service_manager._start_frontend_process(service_manager.ServiceConfig(), console) + service_manager._build_webui_dist(tmp_path, service_manager.ServiceConfig(), console) assert build_calls[0][0] == r"C:\Users\flocks\AppData\Local\Programs\Flocks\tools\node\npm.cmd" @@ -1659,7 +1532,7 @@ def test_start_backend_raises_when_port_has_listener(monkeypatch, tmp_path: Path monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [9999]) - with pytest.raises(service_manager.ServiceError, match="端口 8000 已被占用"): + with pytest.raises(service_manager.ServiceError, match="端口 5173 已被占用"): service_manager._start_backend_process(service_manager.ServiceConfig(), DummyConsole()) @@ -1667,7 +1540,7 @@ def test_start_backend_cleans_trusted_orphan_port_owner(monkeypatch, tmp_path: P paths = _make_runtime_paths(tmp_path) paths.run_dir.mkdir(parents=True) paths.log_dir.mkdir(parents=True) - owners = iter([[9999], [9999], [], []]) + owners = iter([[9999], [9999], [], [], []]) cleaned: list[int] = [] monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) @@ -1739,40 +1612,6 @@ def test_start_backend_raises_when_port_in_use_without_pid_lookup(monkeypatch, t service_manager._start_backend_process(service_manager.ServiceConfig(), DummyConsole()) -def test_start_webui_cleans_trusted_orphan_port_owner(monkeypatch, tmp_path: Path) -> None: - paths = _make_runtime_paths(tmp_path) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - webui_dir = tmp_path / "webui" - webui_dir.mkdir() - owners = iter([[52372], [52372], [], []]) - cleaned: list[int] = [] - - monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: next(owners)) - monkeypatch.setattr(service_manager, "_read_upgrade_runtime_info", lambda _port: service_manager.UpgradeRuntimeInfo()) - monkeypatch.setattr( - service_manager, - "_process_command_line", - lambda _pid: f"node {webui_dir}/node_modules/vite/bin/vite.js preview --host 127.0.0.1 --port 5173", - ) - monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) - monkeypatch.setattr(service_manager, "port_is_in_use", lambda *_args, **_kwargs: False) - monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: "/usr/bin/npm") - monkeypatch.setattr(service_manager, "node_version_satisfies_requirement", lambda: True) - monkeypatch.setattr(service_manager.subprocess, "run", lambda *_args, **_kwargs: SimpleNamespace(returncode=0)) - monkeypatch.setattr(service_manager, "_spawn_process", lambda command, **_kwargs: SimpleNamespace(pid=5678, args=command)) - monkeypatch.setattr(service_manager, "process_runtime_record", lambda *_args, **_kwargs: service_manager.RuntimeRecord(pid=5678)) - monkeypatch.setattr(service_manager, "_log_startup_config", lambda *_args, **_kwargs: None) - monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) - - process = service_manager._start_frontend_process(service_manager.ServiceConfig(), DummyConsole(), paths=paths) - - assert process.pid == 5678 - assert cleaned == [52372] - - def test_webui_cleanup_trusts_cross_worktree_flocks_vite_owner(monkeypatch, tmp_path: Path) -> None: cleaned: list[int] = [] owners = iter([[18962], []]) @@ -2025,8 +1864,7 @@ def print(self, message) -> None: assert calls == ["/stop", "legacy", "cleanup", "browser"] assert console.messages == [ - "[flocks] server 已停止(PID=111)。", - "[flocks] webui 已停止(PID=222)。", + "[flocks] flocks 已停止(PID=111)。", "[flocks] daemon 已停止。", ] @@ -2076,7 +1914,7 @@ def fake_cleanup(config, _console, *, extra_configs=()): service_manager.stop_all(console) - assert [(config.backend_port, config.frontend_port) for config in captured] == [(8000, 5173), (9000, 5273)] + assert [(config.backend_port, config.frontend_port) for config in captured] == [(5173, 5173), (9000, 5273)] def test_status_lines_include_control_api_errors(monkeypatch, tmp_path: Path) -> None: @@ -2088,7 +1926,7 @@ def test_status_lines_include_control_api_errors(monkeypatch, tmp_path: Path) -> lines = service_manager.build_status_lines(paths) - backend_line = next(line for line in lines if "后端:" in line) + backend_line = next(line for line in lines if "flocks:" in line) assert "state=degraded" in backend_line assert "last_error=health failed" in backend_line diff --git a/tests/helpers/service_supervisor.py b/tests/helpers/service_supervisor.py index a859deb1a..f51b19352 100644 --- a/tests/helpers/service_supervisor.py +++ b/tests/helpers/service_supervisor.py @@ -86,7 +86,6 @@ def start_supervisor( config, interval=0.05, backend_adapter=SleeperProcessAdapter(), - webui_adapter=SleeperProcessAdapter(), ) thread = threading.Thread(target=daemon.run, daemon=True) thread.start() diff --git a/tests/server/test_server_port_config.py b/tests/server/test_server_port_config.py index ca275ea6b..3964ff3d2 100644 --- a/tests/server/test_server_port_config.py +++ b/tests/server/test_server_port_config.py @@ -206,9 +206,74 @@ def fake_start_all(config, _console): assert result.exit_code == 0 assert captured["config"].backend_host == "0.0.0.0" - assert captured["config"].backend_port == 9000 + assert captured["config"].backend_port == 5174 assert captured["config"].frontend_host == "0.0.0.0" assert captured["config"].frontend_port == 5174 + assert captured["config"].legacy_backend_port == 9000 + assert captured["config"].server_port_migration_hint is True + + def test_start_accepts_public_host_and_port(self, monkeypatch): + """Test start command accepts the unified public host/port options.""" + captured = {} + + def fake_start_all(config, _console): + captured["config"] = config + + monkeypatch.setattr(cli_main, "start_all", fake_start_all) + + result = CliRunner().invoke( + cli_main.app, + [ + "start", + "--host", + "0.0.0.0", + "--port", + "8888", + ], + ) + + assert result.exit_code == 0 + assert captured["config"].backend_host == "0.0.0.0" + assert captured["config"].backend_port == 8888 + assert captured["config"].frontend_host == "0.0.0.0" + assert captured["config"].frontend_port == 8888 + assert captured["config"].legacy_backend_port == 8000 + + def test_public_host_and_port_override_legacy_options(self, monkeypatch): + """Test unified public host/port win over legacy server and WebUI options.""" + captured = {} + + def fake_start_all(config, _console): + captured["config"] = config + + monkeypatch.setattr(cli_main, "start_all", fake_start_all) + + result = CliRunner().invoke( + cli_main.app, + [ + "start", + "--host", + "0.0.0.0", + "--port", + "8888", + "--server-host", + "127.0.0.1", + "--server-port", + "9000", + "--webui-host", + "127.0.0.1", + "--webui-port", + "5174", + ], + ) + + assert result.exit_code == 0 + assert captured["config"].backend_host == "0.0.0.0" + assert captured["config"].backend_port == 8888 + assert captured["config"].frontend_host == "0.0.0.0" + assert captured["config"].frontend_port == 8888 + assert captured["config"].legacy_backend_host == "127.0.0.1" + assert captured["config"].legacy_backend_port == 9000 def test_restart_accepts_server_and_webui_options(self, monkeypatch): """Test restart command accepts explicit server and WebUI host/port options.""" @@ -218,6 +283,7 @@ def fake_restart_all(config, _console): captured["config"] = config monkeypatch.setattr(cli_main, "restart_all", fake_restart_all) + monkeypatch.setattr(cli_main, "read_supervisor_status", lambda **_kwargs: (_ for _ in ()).throw(RuntimeError("down"))) result = CliRunner().invoke( cli_main.app, @@ -236,9 +302,38 @@ def fake_restart_all(config, _console): assert result.exit_code == 0 assert captured["config"].backend_host == "127.0.0.1" - assert captured["config"].backend_port == 9100 + assert captured["config"].backend_port == 5273 assert captured["config"].frontend_host == "127.0.0.1" assert captured["config"].frontend_port == 5273 + assert captured["config"].legacy_backend_port == 9100 + + def test_restart_accepts_public_host_and_port(self, monkeypatch): + """Test restart command accepts the unified public host/port options.""" + captured = {} + + def fake_restart_all(config, _console): + captured["config"] = config + + monkeypatch.setattr(cli_main, "restart_all", fake_restart_all) + monkeypatch.setattr(cli_main, "read_supervisor_status", lambda **_kwargs: (_ for _ in ()).throw(RuntimeError("down"))) + + result = CliRunner().invoke( + cli_main.app, + [ + "restart", + "--host", + "0.0.0.0", + "--port", + "8888", + ], + ) + + assert result.exit_code == 0 + assert captured["config"].backend_host == "0.0.0.0" + assert captured["config"].backend_port == 8888 + assert captured["config"].frontend_host == "0.0.0.0" + assert captured["config"].frontend_port == 8888 + assert captured["config"].legacy_backend_port == 8000 def test_restart_reuses_supervisor_recorded_host_and_port(self, monkeypatch, tmp_path: Path): """Test restart reuses supervisor host/port when CLI and env omit them.""" @@ -268,9 +363,10 @@ def fake_restart_all(config, _console): assert result.exit_code == 0 assert captured["config"].backend_host == "0.0.0.0" - assert captured["config"].backend_port == 9000 + assert captured["config"].backend_port == 5174 assert captured["config"].frontend_host == "0.0.0.0" assert captured["config"].frontend_port == 5174 + assert captured["config"].legacy_backend_port == 9000 def test_restart_cli_options_override_supervisor_record(self, monkeypatch, tmp_path: Path): """Test explicit restart CLI options override supervisor host/port.""" @@ -313,9 +409,10 @@ def fake_restart_all(config, _console): assert result.exit_code == 0 assert captured["config"].backend_host == "127.0.0.1" - assert captured["config"].backend_port == 9100 + assert captured["config"].backend_port == 5273 assert captured["config"].frontend_host == "127.0.0.1" assert captured["config"].frontend_port == 5273 + assert captured["config"].legacy_backend_port == 9100 def test_restart_environment_overrides_supervisor_record(self, monkeypatch, tmp_path: Path): """Test restart environment variables still override supervisor host/port.""" @@ -349,9 +446,10 @@ def fake_restart_all(config, _console): assert result.exit_code == 0 assert captured["config"].backend_host == "127.0.0.1" - assert captured["config"].backend_port == 9101 + assert captured["config"].backend_port == 5275 assert captured["config"].frontend_host == "127.0.0.1" assert captured["config"].frontend_port == 5275 + assert captured["config"].legacy_backend_port == 9101 def test_service_config_prefers_cli_values(self, monkeypatch): """Test CLI values override environment and default values.""" @@ -368,10 +466,54 @@ def test_service_config_prefers_cli_values(self, monkeypatch): webui_port=5174, ) - assert config.backend_host == "0.0.0.0" - assert config.backend_port == 9000 + assert config.backend_host == "127.0.0.1" + assert config.backend_port == 5174 assert config.frontend_host == "127.0.0.1" assert config.frontend_port == 5174 + assert config.legacy_backend_host == "0.0.0.0" + assert config.legacy_backend_port == 9000 + + def test_service_config_default_public_port_is_webui_port(self, monkeypatch): + """Test service startup defaults to the public WebUI port.""" + monkeypatch.delenv("FLOCKS_HOST", raising=False) + monkeypatch.delenv("FLOCKS_PORT", raising=False) + monkeypatch.delenv("FLOCKS_PUBLIC_HOST", raising=False) + monkeypatch.delenv("FLOCKS_PUBLIC_PORT", raising=False) + monkeypatch.delenv("FLOCKS_SERVER_HOST", raising=False) + monkeypatch.delenv("FLOCKS_SERVER_PORT", raising=False) + monkeypatch.delenv("FLOCKS_WEBUI_HOST", raising=False) + monkeypatch.delenv("FLOCKS_WEBUI_PORT", raising=False) + Config._global_config = None + + config = cli_main._service_config() + + assert config.backend_host == "127.0.0.1" + assert config.backend_port == 5173 + assert config.frontend_host == "127.0.0.1" + assert config.frontend_port == 5173 + assert config.legacy_backend_port == 8000 + + def test_service_config_prefers_public_values(self, monkeypatch): + """Test unified public values override legacy CLI and environment values.""" + monkeypatch.setenv("FLOCKS_WEBUI_HOST", "10.0.0.2") + monkeypatch.setenv("FLOCKS_WEBUI_PORT", "5274") + Config._global_config = None + + config = cli_main._service_config( + host="0.0.0.0", + port=8888, + server_host="127.0.0.1", + server_port=9000, + webui_host="127.0.0.1", + webui_port=5174, + ) + + assert config.backend_host == "0.0.0.0" + assert config.backend_port == 8888 + assert config.frontend_host == "0.0.0.0" + assert config.frontend_port == 8888 + assert config.legacy_backend_host == "127.0.0.1" + assert config.legacy_backend_port == 9000 def test_service_config_uses_server_and_webui_environment(self, monkeypatch): """Test environment variables are used when CLI values are absent.""" @@ -384,9 +526,10 @@ def test_service_config_uses_server_and_webui_environment(self, monkeypatch): config = cli_main._service_config() assert config.backend_host == "0.0.0.0" - assert config.backend_port == 9001 + assert config.backend_port == 5175 assert config.frontend_host == "0.0.0.0" assert config.frontend_port == 5175 + assert config.legacy_backend_port == 9001 def test_service_config_keeps_legacy_env_fallbacks(self, monkeypatch): """Test legacy backend/frontend environment variables still work as fallback.""" @@ -403,9 +546,10 @@ def test_service_config_keeps_legacy_env_fallbacks(self, monkeypatch): config = cli_main._service_config() assert config.backend_host == "0.0.0.0" - assert config.backend_port == 9200 + assert config.backend_port == 5176 assert config.frontend_host == "0.0.0.0" assert config.frontend_port == 5176 + assert config.legacy_backend_port == 9200 def test_cli_tui_command_default_port(self): """Test that CLI tui command uses correct default port.""" @@ -581,13 +725,13 @@ def test_script_port_env_var(self): assert port == '7000' def test_script_port_env_var_default(self): - """Test FLOCKS_PORT defaults to 8000 when not set.""" + """Test FLOCKS_PORT defaults to the public service port when not set.""" # Temporarily remove the env var if it exists old_value = os.environ.pop('FLOCKS_PORT', None) try: - port = int(os.getenv('FLOCKS_PORT', '8000')) - assert port == 8000 + port = int(os.getenv('FLOCKS_PORT', '5173')) + assert port == 5173 finally: # Restore old value if it existed if old_value is not None: diff --git a/tests/server/test_static_webui.py b/tests/server/test_static_webui.py new file mode 100644 index 000000000..8e478c5ac --- /dev/null +++ b/tests/server/test_static_webui.py @@ -0,0 +1,120 @@ +from pathlib import Path + +import pytest +from fastapi import FastAPI +from httpx import ASGITransport, AsyncClient + +from flocks.server.static_webui import maybe_serve_static_webui + + +def _write_dist(root: Path) -> Path: + dist = root / "dist" + assets = dist / "assets" + assets.mkdir(parents=True) + (dist / "index.html").write_text("Flocks WebUI", encoding="utf-8") + (assets / "app.12345678.js").write_text("console.log('flocks');", encoding="utf-8") + return dist + + +def _app() -> FastAPI: + app = FastAPI() + + @app.middleware("http") + async def static_webui(request, call_next): + response = await maybe_serve_static_webui(request) + if response is not None: + return response + return await call_next(request) + + @app.get("/api/health") + async def health(): + return {"status": "healthy"} + + return app + + +@pytest.mark.asyncio +async def test_static_webui_serves_browser_root(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + async with AsyncClient(transport=ASGITransport(app=_app()), base_url="http://test") as client: + response = await client.get("/", headers={"Accept": "text/html"}) + + assert response.status_code == 200 + assert "Flocks WebUI" in response.text + assert response.headers["Cache-Control"] == "no-store" + + +@pytest.mark.asyncio +async def test_static_webui_serves_assets_with_immutable_cache(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + async with AsyncClient(transport=ASGITransport(app=_app()), base_url="http://test") as client: + response = await client.get("/assets/app.12345678.js") + + assert response.status_code == 200 + assert "console.log" in response.text + assert response.headers["Cache-Control"] == "public, max-age=31536000, immutable" + + +@pytest.mark.asyncio +async def test_static_webui_falls_back_for_browser_deep_link(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + async with AsyncClient(transport=ASGITransport(app=_app()), base_url="http://test") as client: + response = await client.get("/session/abc", headers={"Accept": "text/html"}) + + assert response.status_code == 200 + assert "Flocks WebUI" in response.text + + +@pytest.mark.asyncio +async def test_static_webui_falls_back_before_full_app_auth(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + from flocks.server.app import app + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get( + "/session/abc", + headers={ + "Accept": "text/html", + "User-Agent": "Mozilla/5.0", + }, + ) + + assert response.status_code == 200 + assert "Flocks WebUI" in response.text + + +@pytest.mark.asyncio +async def test_static_webui_does_not_bypass_full_app_api_auth(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + from flocks.server.app import app + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get( + "/api/session/abc", + headers={ + "Accept": "text/html", + "User-Agent": "Mozilla/5.0", + }, + ) + + assert response.status_code == 401 + assert "Flocks WebUI" not in response.text + + +@pytest.mark.asyncio +async def test_static_webui_does_not_intercept_api(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + async with AsyncClient(transport=ASGITransport(app=_app()), base_url="http://test") as client: + response = await client.get("/api/health", headers={"Accept": "text/html"}) + + assert response.status_code == 200 + assert response.json() == {"status": "healthy"} + + +@pytest.mark.asyncio +async def test_static_webui_does_not_fallback_for_non_get(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + async with AsyncClient(transport=ASGITransport(app=_app()), base_url="http://test") as client: + response = await client.post("/session/abc", headers={"Accept": "text/html"}) + + assert response.status_code == 404 diff --git a/tests/updater/test_updater.py b/tests/updater/test_updater.py index 1528d9c0e..4dd329691 100644 --- a/tests/updater/test_updater.py +++ b/tests/updater/test_updater.py @@ -12,7 +12,13 @@ from flocks.cli import service_control, service_manager from flocks.updater import updater -from tests.helpers.service_supervisor import make_short_runtime_root, start_supervisor, stop_supervisor, wait_for_supervisor +from tests.helpers.service_supervisor import ( + make_short_runtime_root, + start_supervisor, + stop_supervisor, + wait_for_process_exit, + wait_for_supervisor, +) def _write_pyproject_version(pyproject_path: Path, version: str) -> None: @@ -805,10 +811,12 @@ def test_build_restart_handoff_argv_rewrites_serve_to_managed_start( tmp_path: Path, ) -> None: config = service_manager.ServiceConfig( - backend_host="0.0.0.0", - backend_port=9000, + backend_host="10.0.0.8", + backend_port=5273, frontend_host="10.0.0.8", frontend_port=5273, + legacy_backend_host="0.0.0.0", + legacy_backend_port=9000, ) monkeypatch.setattr(updater, "_handoff_service_config", lambda: config) monkeypatch.setattr(updater.os, "getpid", lambda: 1234) @@ -829,14 +837,14 @@ def test_build_restart_handoff_argv_rewrites_serve_to_managed_start( "start", "--no-browser", "--skip-webui-build", + "--host", + "10.0.0.8", + "--port", + "5273", "--server-host", "0.0.0.0", "--server-port", "9000", - "--webui-host", - "10.0.0.8", - "--webui-port", - "5273", ] @@ -1132,7 +1140,8 @@ def test_prepare_upgrade_handover_restores_frontend_when_upgrade_page_fails_with assert status.backend.paused is False assert status.backend.pid is not None assert status.webui.paused is False - assert status.webui.pid is not None + assert status.webui.pid is None + assert status.webui.state == "static" assert updater._read_upgrade_state() is None finally: stop_supervisor(daemon, thread) @@ -1167,11 +1176,10 @@ def test_rollback_failed_update_resumes_backend_when_handoff_tasks_fail( "skip_frontend_build": True, } ) - service_control.request_prepare_upgrade(paths=paths) old_backend = daemon.backend.process assert old_backend is not None - old_backend.terminate() - old_backend.wait(timeout=5) + service_control.request_prepare_upgrade(paths=paths) + wait_for_process_exit(old_backend) updater._rollback_failed_update(None, short_root / "install", "2026.3.31") @@ -1180,7 +1188,8 @@ def test_rollback_failed_update_resumes_backend_when_handoff_tasks_fail( assert status.webui.paused is False assert status.backend.pid is not None assert status.backend.pid != old_backend.pid - assert status.webui.pid is not None + assert status.webui.pid is None + assert status.webui.state == "static" assert updater._read_upgrade_state() is None finally: stop_supervisor(daemon, thread) @@ -1740,14 +1749,14 @@ async def fake_sleep(_seconds) -> None: "start", "--no-browser", "--skip-webui-build", + "--host", + "127.0.0.1", + "--port", + "5173", "--server-host", "127.0.0.1", "--server-port", "8000", - "--webui-host", - "127.0.0.1", - "--webui-port", - "5173", ] @@ -3241,14 +3250,14 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): "start", "--no-browser", "--skip-webui-build", + "--host", + "127.0.0.1", + "--port", + "5173", "--server-host", "127.0.0.1", "--server-port", "8000", - "--webui-host", - "127.0.0.1", - "--webui-port", - "5173", ] assert events == ["handover"] assert "execv" not in events diff --git a/webui/src/utils/restartPolling.test.ts b/webui/src/utils/restartPolling.test.ts index 0f3cc54f5..5e456d8d2 100644 --- a/webui/src/utils/restartPolling.test.ts +++ b/webui/src/utils/restartPolling.test.ts @@ -20,21 +20,22 @@ describe('checkRestartReadiness', () => { vi.restoreAllMocks(); }); - it('falls back to the loopback backend health endpoint during static handover', async () => { + it('checks same-origin health without probing the legacy backend port', async () => { const fetchMock = vi.fn(async (input: RequestInfo | URL) => { const url = String(input); if (url === '/api/health') { return new Response('', { status: 404 }); } - if (url === 'http://127.0.0.1:8000/api/health') { - return new Response(JSON.stringify({ status: 'healthy' }), { status: 200 }); - } return new Response('', { status: 200 }); }); vi.stubGlobal('fetch', fetchMock); - await expect(checkRestartReadiness()).resolves.toEqual({ ready: true }); + await expect(checkRestartReadiness()).resolves.toEqual({ + ready: false, + reason: 'health check returned HTTP 404', + }); expect(fetchMock).toHaveBeenCalledWith('/api/health', { cache: 'no-store' }); - expect(fetchMock).toHaveBeenCalledWith('http://127.0.0.1:8000/api/health', { cache: 'no-store' }); + expect(fetchMock).toHaveBeenCalledWith('/', { cache: 'no-store' }); + expect(fetchMock).not.toHaveBeenCalledWith('http://127.0.0.1:8000/api/health', { cache: 'no-store' }); }); }); diff --git a/webui/src/utils/restartPolling.ts b/webui/src/utils/restartPolling.ts index 2b9142bc4..dc142d5c1 100644 --- a/webui/src/utils/restartPolling.ts +++ b/webui/src/utils/restartPolling.ts @@ -29,17 +29,6 @@ async function readUpgradePageState(): Promise { return null; } -function loopbackBackendHealthURL(): string | null { - if (typeof window === 'undefined') return null; - - const { protocol, hostname, port } = window.location; - if (!['localhost', '127.0.0.1', '::1'].includes(hostname)) return null; - if (!port || port === '8000') return null; - - const host = hostname === '::1' ? '[::1]' : hostname; - return `${protocol}//${host}:8000/api/health`; -} - async function checkHealth(url: string): Promise { try { return await fetch(url, { cache: 'no-store' }); @@ -54,22 +43,11 @@ export async function checkRestartReadiness(): Promise { return { ready: true }; } - const fallbackURL = loopbackBackendHealthURL(); - const fallbackResponse = fallbackURL ? await checkHealth(fallbackURL) : null; - if (fallbackResponse?.ok) { - return { ready: true }; - } - const pageReason = await readUpgradePageState(); return { ready: false, reason: [ healthResponse ? `health check returned HTTP ${healthResponse.status}` : 'health check failed', - fallbackURL && fallbackResponse - ? `loopback health check returned HTTP ${fallbackResponse.status}` - : fallbackURL - ? `loopback health check failed: ${fallbackURL}` - : null, pageReason, ].filter(Boolean).join('; '), }; From dcfdc9575061a3ee8765d6705dffa22d66709c9d Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 12:04:20 +0800 Subject: [PATCH 17/28] fix(cli): support supervisor control on Windows --- flocks/cli/service_control.py | 8 +++++++- flocks/cli/service_manager.py | 3 ++- flocks/cli/service_supervisor.py | 15 +++++++++++---- tests/cli/test_service_manager.py | 23 +++++++++++++++++++++++ 4 files changed, 43 insertions(+), 6 deletions(-) diff --git a/flocks/cli/service_control.py b/flocks/cli/service_control.py index 636dd78a9..34320fa9f 100644 --- a/flocks/cli/service_control.py +++ b/flocks/cli/service_control.py @@ -3,6 +3,7 @@ from __future__ import annotations import os +import socket import sys from dataclasses import dataclass from pathlib import Path @@ -76,9 +77,14 @@ def supervisor_control_port() -> int: return SUPERVISOR_CONTROL_PORT +def supervisor_uses_tcp_control() -> bool: + """Return True when the daemon control API should use localhost TCP.""" + return sys.platform == "win32" or not hasattr(socket, "AF_UNIX") + + def supervisor_control_client(paths=None, timeout: float | None = 2.0) -> httpx.Client: """Create a client for the local daemon control API.""" - if sys.platform == "win32": + if supervisor_uses_tcp_control(): return httpx.Client( base_url=f"http://127.0.0.1:{supervisor_control_port()}", timeout=timeout, diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index 4376d7fdb..096d8c198 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -35,6 +35,7 @@ supervisor_is_running, supervisor_log_path, supervisor_socket_path, + supervisor_uses_tcp_control, ) try: @@ -1370,7 +1371,7 @@ def _start_supervisor_process(config: ServiceConfig, paths: RuntimePaths, consol """Spawn the detached service supervisor daemon.""" root = ensure_install_layout() log_path = supervisor_log_path(paths) - if sys.platform != "win32": + if not supervisor_uses_tcp_control(): supervisor_socket_path(paths).unlink(missing_ok=True) command = resolve_flocks_cli_command(root) + [ "service-daemon", diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py index afb706926..efa7a4c8f 100644 --- a/flocks/cli/service_supervisor.py +++ b/flocks/cli/service_supervisor.py @@ -23,6 +23,7 @@ supervisor_control_port, supervisor_log_path, supervisor_socket_path, + supervisor_uses_tcp_control, ) from flocks.cli.service_process import BackendProcessAdapter, ProcessAdapter @@ -85,8 +86,13 @@ def _service_payload(service: ManagedService, *, paused: bool = False) -> dict[s } -class _UnixControlServer(ThreadingHTTPServer): - address_family = socket.AF_UNIX +if hasattr(socket, "AF_UNIX"): + + class _UnixControlServer(ThreadingHTTPServer): + address_family = socket.AF_UNIX + +else: # pragma: no cover - exercised by importing on Windows + _UnixControlServer = None class SupervisorDaemon: @@ -175,12 +181,13 @@ def _cleanup_legacy_runtime(self) -> None: def _start_control_server(self) -> None: handler = self._handler_class() - if sys.platform == "win32": + if supervisor_uses_tcp_control(): server: ThreadingHTTPServer = ThreadingHTTPServer(("127.0.0.1", supervisor_control_port()), handler) else: socket_path = supervisor_socket_path(self.paths) socket_path.parent.mkdir(parents=True, exist_ok=True) socket_path.unlink(missing_ok=True) + assert _UnixControlServer is not None server = _UnixControlServer(str(socket_path), handler) self._server = server self._server_thread = threading.Thread(target=server.serve_forever, name="flocks-supervisor-control", daemon=True) @@ -193,7 +200,7 @@ def _stop_control_server(self) -> None: self._server.server_close() if self._server_thread is not None: self._server_thread.join(timeout=5.0) - if sys.platform != "win32": + if not supervisor_uses_tcp_control(): supervisor_socket_path(self.paths).unlink(missing_ok=True) def _handler_class(self): diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 69d634e8e..ffb3d8c4c 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -1,5 +1,6 @@ import json import shutil +import subprocess import sys from pathlib import Path from types import SimpleNamespace @@ -58,6 +59,28 @@ def _write_legacy_runtime_record(pid_file: Path, record: service_manager.Runtime pid_file.write_text(json.dumps(payload, ensure_ascii=True, sort_keys=True), encoding="utf-8") +def test_supervisor_uses_tcp_control_when_af_unix_is_unavailable(monkeypatch) -> None: + monkeypatch.setattr(service_control.sys, "platform", "linux") + monkeypatch.delattr(service_control.socket, "AF_UNIX", raising=False) + + assert service_control.supervisor_uses_tcp_control() is True + + +def test_service_supervisor_imports_when_af_unix_is_unavailable() -> None: + code = "\n".join( + [ + "import socket", + "if hasattr(socket, 'AF_UNIX'):", + " delattr(socket, 'AF_UNIX')", + "import flocks.cli.service_supervisor", + ] + ) + + completed = subprocess.run([sys.executable, "-c", code], capture_output=True, text=True, check=False) + + assert completed.returncode == 0, completed.stderr + + def test_runtime_paths_follow_flocks_root_env(monkeypatch, tmp_path: Path) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path)) From 81ee06f95d31bb1f5906f7ec12a3b73b63ceeee9 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 12:53:58 +0800 Subject: [PATCH 18/28] fix(cli): improve Windows supervisor startup --- flocks/cli/service_manager.py | 50 +++++++++++++++++++ flocks/cli/service_supervisor.py | 7 +-- tests/cli/test_service_manager.py | 82 +++++++++++++++++++++++++++++-- 3 files changed, 133 insertions(+), 6 deletions(-) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index 096d8c198..d2839a379 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -961,6 +961,43 @@ def _process_list_pids() -> list[int]: return sorted(dict.fromkeys(pids)) +def _windows_trusted_daemon_process_pids(*, root: Path) -> list[int]: + """Return trusted Windows daemon pids with a single process query.""" + if sys.platform != "win32": + return [] + root_text = str(root).lower() + env = os.environ.copy() + env["FLOCKS_DAEMON_ROOT_MATCH"] = root_text + env["FLOCKS_DAEMON_CURRENT_PID"] = str(os.getpid()) + powershell = which("powershell") or which("powershell.exe") + if not powershell: + return [] + script = ( + "$root = [Environment]::GetEnvironmentVariable('FLOCKS_DAEMON_ROOT_MATCH'); " + "$currentPid = [int][Environment]::GetEnvironmentVariable('FLOCKS_DAEMON_CURRENT_PID'); " + "Get-CimInstance Win32_Process | Where-Object { " + "$_.ProcessId -ne $currentPid -and $_.CommandLine -and " + "$_.CommandLine.ToLowerInvariant().Contains('service-daemon') -and " + "$_.CommandLine.ToLowerInvariant().Contains('flocks') -and " + "$_.CommandLine.ToLowerInvariant().Contains($root) " + "} | ForEach-Object { $_.ProcessId }" + ) + completed = subprocess.run( + [powershell, "-NoProfile", "-Command", script], + check=False, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + env=env, + ) + if completed.returncode != 0: + return [] + return sorted( + dict.fromkeys(int(line.strip()) for line in completed.stdout.splitlines() if line.strip().isdigit()) + ) + + def _trusted_flocks_daemon_owner(pid: int, *, root: Path) -> bool: """Return True only for daemon processes that belong to this Flocks install.""" if pid <= 0 or pid == os.getpid(): @@ -975,6 +1012,8 @@ def _trusted_flocks_daemon_owner(pid: int, *, root: Path) -> bool: def trusted_daemon_process_pids(*, root: Path | None = None) -> list[int]: """Return trusted daemon pids for the current Flocks install.""" current_root = root or ensure_install_layout() + if sys.platform == "win32": + return _windows_trusted_daemon_process_pids(root=current_root) return [pid for pid in _process_list_pids() if _trusted_flocks_daemon_owner(pid, root=current_root)] @@ -1489,8 +1528,10 @@ def _start_all_without_stop(config: ServiceConfig, console) -> None: """Start the supervisor daemon, then print access summary.""" paths = ensure_runtime_dirs() _print_static_port_migration_hint(config, console) + console.print("[flocks] [ ] 启动 Flocks daemon...") cleanup_legacy_runtime_processes(paths, console) cleanup_orphan_service_ports(config, console) + _ensure_webui_dist(ensure_install_layout(), config, console) process = _start_supervisor_process(config, paths, console) console.print("[flocks] [x] 启动 Flocks daemon...") payload = _wait_for_supervisor_ready(paths, process=process) @@ -1907,6 +1948,15 @@ def signal_pid_list(sig: signal.Signals, pids: Iterable[int]) -> None: def open_default_browser(url: str, console) -> None: """Best-effort browser open.""" + if sys.platform == "win32": + startfile = getattr(os, "startfile", None) + if startfile is not None: + try: + startfile(url) + console.print(f"[flocks] 浏览器已打开: {url}") + return + except Exception: + pass try: if webbrowser.open(url): console.print(f"[flocks] 浏览器已打开: {url}") diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py index efa7a4c8f..d72fbaf5e 100644 --- a/flocks/cli/service_supervisor.py +++ b/flocks/cli/service_supervisor.py @@ -30,6 +30,7 @@ SUPERVISOR_CHECK_INTERVAL_SECONDS = 5.0 SUPERVISOR_HEALTH_FAILURE_THRESHOLD = 2 SUPERVISOR_BACKOFF_SECONDS = (1.0, 2.0, 5.0, 10.0, 30.0) +_CLIENT_DISCONNECT_ERRORS = (BrokenPipeError, ConnectionResetError, ConnectionAbortedError) @dataclass @@ -220,7 +221,7 @@ def _send_json(self, payload: dict[str, object], status: int = 200) -> None: self.send_header("Content-Length", str(len(body))) self.end_headers() self.wfile.write(body) - except (BrokenPipeError, ConnectionResetError): + except _CLIENT_DISCONNECT_ERRORS: return def _read_json(self) -> dict[str, Any]: @@ -243,7 +244,7 @@ def do_GET(self) -> None: daemon.handle_logs_request(self, parse_qs(parsed.query)) return self._send_json({"error": "not found"}, status=404) - except (BrokenPipeError, ConnectionResetError): + except _CLIENT_DISCONNECT_ERRORS: return except Exception as exc: # pragma: no cover - defensive control path self._send_json({"error": str(exc)}, status=500) @@ -286,7 +287,7 @@ def do_POST(self) -> None: self._send_json(daemon.status_payload()) return self._send_json({"error": "not found"}, status=404) - except (BrokenPipeError, ConnectionResetError): + except _CLIENT_DISCONNECT_ERRORS: return except Exception as exc: # pragma: no cover - defensive control path self._send_json({"error": str(exc)}, status=500) diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index ffb3d8c4c..33bf7d4b2 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -438,7 +438,8 @@ def test_daemon_log_service_name_uses_daemon_only(tmp_path: Path) -> None: assert daemon._log_paths_for_service("supervisor") == [] -def test_supervisor_control_send_json_ignores_disconnected_client() -> None: +@pytest.mark.parametrize("disconnect_error", [BrokenPipeError, ConnectionResetError, ConnectionAbortedError]) +def test_supervisor_control_send_json_ignores_disconnected_client(disconnect_error: type[Exception]) -> None: daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) handler_class = daemon._handler_class() handler = handler_class.__new__(handler_class) @@ -447,7 +448,7 @@ def test_supervisor_control_send_json_ignores_disconnected_client() -> None: handler.send_response = lambda status: calls.append(("status", status)) handler.send_header = lambda name, value: calls.append((name, value)) handler.end_headers = lambda: calls.append(("end_headers", None)) - handler.wfile = SimpleNamespace(write=lambda _body: (_ for _ in ()).throw(BrokenPipeError())) + handler.wfile = SimpleNamespace(write=lambda _body: (_ for _ in ()).throw(disconnect_error())) handler._send_json({"ok": True}) @@ -469,6 +470,24 @@ def test_supervisor_control_get_ignores_logs_client_disconnect() -> None: assert sent == [] +def test_open_default_browser_uses_windows_startfile(monkeypatch) -> None: + opened: list[str] = [] + console = DummyConsole() + + monkeypatch.setattr(service_manager.sys, "platform", "win32") + monkeypatch.setattr(service_manager.os, "startfile", lambda url: opened.append(url), raising=False) + monkeypatch.setattr( + service_manager.webbrowser, + "open", + lambda _url: (_ for _ in ()).throw(AssertionError("webbrowser should not be used on Windows when startfile exists")), + ) + + service_manager.open_default_browser("http://127.0.0.1:5173", console) + + assert opened == ["http://127.0.0.1:5173"] + assert console.messages == ["[flocks] 浏览器已打开: http://127.0.0.1:5173"] + + def test_tail_lines_returns_recent_content(tmp_path: Path) -> None: log_file = tmp_path / "backend.log" log_file.write_text("a\nb\nc\n", encoding="utf-8") @@ -1007,7 +1026,37 @@ def test_start_all_without_stop_starts_supervisor_daemon(monkeypatch, tmp_path: service_manager._start_all_without_stop(service_manager.ServiceConfig(no_browser=True), console) assert calls == ["daemon", "ready", "status"] - assert console.messages == ["[flocks] [x] 启动 Flocks daemon..."] + assert console.messages == [ + "[flocks] [ ] 启动 Flocks daemon...", + "[flocks] [x] 启动 Flocks daemon...", + ] + + +def test_start_all_without_stop_prints_before_cleanup(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + events: list[str] = [] + console = DummyConsole() + + def record_print(message: str) -> None: + events.append(f"print:{message}") + console.messages.append(message) + + console.print = record_print + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda *_args, **_kwargs: events.append("legacy")) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda *_args, **_kwargs: events.append("orphan")) + monkeypatch.setattr(service_manager, "_ensure_webui_dist", lambda *_args, **_kwargs: events.append("dist")) + monkeypatch.setattr( + service_manager, + "_start_supervisor_process", + lambda _config, _paths, _console: events.append("daemon") or SimpleNamespace(poll=lambda: None), + ) + monkeypatch.setattr(service_manager, "_wait_for_supervisor_ready", lambda _paths, **_kwargs: _supervisor_status_payload()) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda *_args, **_kwargs: None) + + service_manager._start_all_without_stop(service_manager.ServiceConfig(no_browser=True), console) + + assert events[:5] == ["print:[flocks] [ ] 启动 Flocks daemon...", "legacy", "orphan", "dist", "daemon"] def test_start_all_propagates_supervisor_start_failure(monkeypatch) -> None: @@ -1682,6 +1731,33 @@ def test_cleanup_trusted_daemon_processes_cleans_current_install_only(monkeypatc assert cleaned == [111] +def test_windows_cleanup_trusted_daemon_processes_uses_single_query(monkeypatch, tmp_path: Path) -> None: + cleaned: list[int] = [] + commands: list[list[str]] = [] + + def fail_per_pid_lookup(_pid: int) -> str: + raise AssertionError("Windows daemon cleanup should not query each pid separately") + + def fake_run(command, **kwargs): + commands.append(command) + assert command[:2] == ["powershell.exe", "-NoProfile"] + assert kwargs["env"]["FLOCKS_DAEMON_ROOT_MATCH"] == str(tmp_path).lower() + return SimpleNamespace(returncode=0, stdout="111\n222\n111\n") + + monkeypatch.setattr(service_manager.sys, "platform", "win32") + monkeypatch.setattr(service_manager, "which", lambda name: "powershell.exe" if name == "powershell" else None) + monkeypatch.setattr(service_manager, "_process_list_pids", lambda: [111, 222, 333]) + monkeypatch.setattr(service_manager, "_process_command_line", fail_per_pid_lookup) + monkeypatch.setattr(service_manager.subprocess, "run", fake_run) + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + + result = service_manager.cleanup_trusted_daemon_processes(console=DummyConsole(), root=tmp_path) + + assert result == [111, 222] + assert cleaned == [111, 222] + assert len(commands) == 1 + + def test_spawn_process_uses_hidden_window_flags_on_windows(monkeypatch, tmp_path: Path) -> None: captured = {} log_path = tmp_path / "logs" / "backend.log" From 0be619861d294cb32453c2314630aff58be5b1cb Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 13:44:39 +0800 Subject: [PATCH 19/28] fix(cli): recover upgrade handoff on start --- flocks/cli/service_manager.py | 29 ++++++- flocks/storage/storage.py | 10 ++- flocks/tool/device/models.py | 21 +++-- flocks/updater/updater.py | 22 ++++- tests/cli/test_service_manager.py | 91 +++++++++++++++++++++ tests/cli/test_update_command.py | 6 ++ tests/tool/test_device_schema_migration.py | 94 ++++++++++++++++++++++ tests/updater/test_updater.py | 45 +++++++++++ 8 files changed, 305 insertions(+), 13 deletions(-) create mode 100644 tests/tool/test_device_schema_migration.py diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index d2839a379..f782065f1 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -30,6 +30,8 @@ from flocks.cli.service_control import ( read_logs, read_supervisor_status, + request_restart, + request_resume_upgrade, request_stop, stream_logs, supervisor_is_running, @@ -1243,7 +1245,6 @@ def _start_backend_process( ) command, env = _backend_command_and_env(root, config) - console.print("[flocks] 启动 Flocks service...") process = _spawn_process(command, cwd=root, log_path=current.backend_log, env=env) record = process_runtime_record( process, @@ -1446,6 +1447,15 @@ def _service_config_matches(left: ServiceConfig, right: ServiceConfig) -> bool: ) +def _supervisor_backend_is_healthy(status) -> bool: + """Return whether a supervisor status represents an accessible Flocks service.""" + return ( + not status.backend.paused + and status.backend.state.lower() == "healthy" + and status.backend.health.lower() == "healthy" + ) + + def _legacy_runtime_config(paths: RuntimePaths, fallback: ServiceConfig) -> ServiceConfig: """Build cleanup config from legacy runtime records when present.""" return ServiceConfig( @@ -1542,6 +1552,7 @@ def _start_all_without_stop(config: ServiceConfig, console) -> None: def _start_all_unlocked(config: ServiceConfig, console, *, paths: RuntimePaths) -> None: """Ensure the supervisor daemon is running; caller must hold lifecycle lock.""" + _resolve_upgrade_runtime(console, frontend_port=config.frontend_port, attempt_recover=True) if supervisor_is_running(paths): status = None try: @@ -1553,9 +1564,23 @@ def _start_all_unlocked(config: ServiceConfig, console, *, paths: RuntimePaths) _stop_all_unlocked(console, paths=paths) _start_all_without_stop(config, console) return + if status is not None and (status.backend.paused or status.backend.state.lower() == "paused"): + console.print("[flocks] Flocks daemon 已在运行,但 Flocks service 处于暂停状态,正在恢复...") + status = request_resume_upgrade(config, paths=paths) + _print_status_payload(status.raw, console, include_daemon_step=False) + if not config.no_browser and _supervisor_backend_is_healthy(status): + open_default_browser(_frontend_url_from_status(status, config.frontend_url), console) + return + if status is not None and not _supervisor_backend_is_healthy(status): + console.print("[flocks] Flocks daemon 已在运行,但 Flocks service 不可用,正在重启...") + status = request_restart(config, paths=paths) + _print_status_payload(status.raw, console, include_daemon_step=False) + if not config.no_browser and _supervisor_backend_is_healthy(status): + open_default_browser(_frontend_url_from_status(status, config.frontend_url), console) + return console.print("[flocks] Flocks daemon 已在运行。") show_status(console) - if not config.no_browser: + if status is not None and not config.no_browser and _supervisor_backend_is_healthy(status): try: url = _frontend_url_from_status(status, config.frontend_url) except Exception: diff --git a/flocks/storage/storage.py b/flocks/storage/storage.py index bad02354f..455765235 100644 --- a/flocks/storage/storage.py +++ b/flocks/storage/storage.py @@ -21,6 +21,7 @@ T = TypeVar("T", bound=BaseModel) +DDLScript = str | Callable[[aiosqlite.Connection], Awaitable[None]] class NotFoundError(Exception): @@ -73,7 +74,7 @@ class Storage: # descriptors and ``_initialized=True`` flag are never silently inherited # — a known SQLite corruption vector. _init_pid: Optional[int] = None - _extension_ddls: List[str] = [] + _extension_ddls: List[DDLScript] = [] _sqlite_timeout_s = 5.0 _sqlite_busy_timeout_ms = 5000 _sqlite_journal_mode = "WAL" @@ -432,7 +433,7 @@ def connect_sync(cls, db_path: Optional[Path] = None) -> sqlite3.Connection: return cls.configure_sync_connection(conn) @classmethod - def register_ddl(cls, ddl: str) -> None: + def register_ddl(cls, ddl: DDLScript) -> None: """Register an extension DDL script to be executed during ``init()``. If init() has already completed the DDL is executed immediately @@ -823,7 +824,10 @@ async def _bootstrap_schema(cls) -> None: async def _run_extension_ddl() -> None: async with cls.connect(cls._db_path) as db: - await db.executescript(ddl) + if isinstance(ddl, str): + await db.executescript(ddl) + else: + await ddl(db) await db.commit() await cls._run_write_with_retry( diff --git a/flocks/tool/device/models.py b/flocks/tool/device/models.py index a3b5d15d1..e9bc1c27d 100644 --- a/flocks/tool/device/models.py +++ b/flocks/tool/device/models.py @@ -54,15 +54,24 @@ updated_at INTEGER NOT NULL ); CREATE INDEX IF NOT EXISTS idx_device_storage_key ON device_integrations(storage_key); -CREATE INDEX IF NOT EXISTS idx_device_group ON device_integrations(group_id); """) + # Upgrade hook for installations created before group_id was added. -# Storage wraps each DDL in try/except so the duplicate-column error on fresh -# installs is silently ignored. -Storage.register_ddl( - "ALTER TABLE device_integrations ADD COLUMN group_id TEXT NOT NULL DEFAULT '';" -) +async def _ensure_device_integrations_group_id(db: Any) -> None: + cursor = await db.execute("PRAGMA table_info(device_integrations)") + columns = {str(row[1]) for row in await cursor.fetchall()} + if "group_id" in columns: + return + await db.execute("ALTER TABLE device_integrations ADD COLUMN group_id TEXT NOT NULL DEFAULT '';") + + +Storage.register_ddl(_ensure_device_integrations_group_id) + +Storage.register_ddl(""" +CREATE INDEX IF NOT EXISTS idx_device_group ON device_integrations(group_id); +""") + # Per-device tool enabled/disabled overrides. # diff --git a/flocks/updater/updater.py b/flocks/updater/updater.py index 121a3553a..160d48bf1 100644 --- a/flocks/updater/updater.py +++ b/flocks/updater/updater.py @@ -2037,7 +2037,11 @@ def _stop_upgrade_page_server(*, frontend_port: int | None = None) -> None: from flocks.cli import service_manager - remaining = service_manager.port_owner_pids(frontend_port) + remaining = [ + pid + for pid in service_manager.port_owner_pids(frontend_port) + if _looks_like_upgrade_page_process(pid) + ] if remaining: log.info( "updater.upgrade_page.port_fallback_kill", @@ -2059,7 +2063,7 @@ def _stop_upgrade_page_server(*, frontend_port: int | None = None) -> None: wait_attempts = 40 wait_interval = 0.25 for _ in range(wait_attempts): - if not service_manager.port_owner_pids(frontend_port): + if not any(_looks_like_upgrade_page_process(pid) for pid in service_manager.port_owner_pids(frontend_port)): return time.sleep(wait_interval) return @@ -2068,6 +2072,20 @@ def _stop_upgrade_page_server(*, frontend_port: int | None = None) -> None: time.sleep(0.3) +def _looks_like_upgrade_page_process(pid: int) -> bool: + """Return True only for the temporary upgrade-page http.server process.""" + try: + from flocks.cli import service_manager + + command_line = service_manager._process_command_line(pid).lower() + except Exception: + return False + if not command_line: + return False + page_dir = str(_upgrade_page_dir()).lower() + return "http.server" in command_line and "upgrade-page" in command_line and page_dir in command_line + + def _prepare_upgrade_handover(version: str) -> dict[str, Any]: from flocks.cli import service_manager from flocks.cli.service_control import request_prepare_upgrade diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 33bf7d4b2..e4c6ab96f 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -30,6 +30,7 @@ def print(self, *args, **kwargs) -> None: @pytest.fixture(autouse=True) def _skip_backend_webui_dist_check(monkeypatch) -> None: monkeypatch.setattr(service_manager, "_ensure_webui_dist", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "_resolve_upgrade_runtime", lambda *_args, **_kwargs: {"action": "noop", "error": None}) def _make_runtime_paths(tmp_path: Path) -> service_manager.RuntimePaths: @@ -944,6 +945,29 @@ def test_start_all_starts_supervisor_when_control_api_is_down(monkeypatch) -> No assert call_order == ["ensure_runtime_dirs", "_start_all_without_stop"] +def test_start_all_resolves_upgrade_runtime_before_supervisor_status(monkeypatch) -> None: + events: list[str] = [] + console = DummyConsole() + paths = _make_runtime_paths(Path("/tmp/flocks-test")) + + def resolve_upgrade_runtime(_console, *, frontend_port: int, attempt_recover: bool) -> dict[str, object]: + events.append(f"upgrade:{frontend_port}:{attempt_recover}") + return {"action": "cleaned", "error": None} + + def supervisor_running(_paths) -> bool: + events.append("supervisor") + return False + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "_resolve_upgrade_runtime", resolve_upgrade_runtime) + monkeypatch.setattr(service_manager, "supervisor_is_running", supervisor_running) + monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda _config, _console: events.append("start")) + + service_manager.start_all(service_manager.ServiceConfig(frontend_port=5173), console) + + assert events == ["upgrade:5173:True", "supervisor", "start"] + + def test_start_all_does_not_duplicate_running_supervisor(monkeypatch) -> None: calls: list[str] = [] console = DummyConsole() @@ -960,6 +984,72 @@ def test_start_all_does_not_duplicate_running_supervisor(monkeypatch) -> None: assert "[flocks] Flocks daemon 已在运行。" in console.messages +def test_start_all_resumes_paused_supervisor_before_opening_browser(monkeypatch) -> None: + calls: list[str] = [] + console = DummyConsole() + paths = _make_runtime_paths(Path("/tmp/flocks-test")) + paused_payload = _supervisor_status_payload() + paused_payload["backend"].update({ + "pid": None, + "state": "paused", + "health": "paused", + "paused": True, + "last_error": "control upgrade prepare", + }) + paused_payload["webui"].update({ + "state": "paused", + "health": "paused", + "paused": True, + "last_error": "control upgrade prepare", + }) + resumed_status = _supervisor_status() + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status(paused_payload)) + monkeypatch.setattr( + service_manager, + "request_resume_upgrade", + lambda _config, **_kwargs: calls.append("resume") or resumed_status, + ) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda *_args, **_kwargs: calls.append("status")) + monkeypatch.setattr(service_manager, "open_default_browser", lambda url, _console: calls.append(f"browser:{url}")) + + service_manager.start_all(service_manager.ServiceConfig(), console) + + assert calls == ["resume", "status", "browser:http://127.0.0.1:9000"] + assert "[flocks] Flocks daemon 已在运行,但 Flocks service 处于暂停状态,正在恢复..." in console.messages + + +def test_start_all_does_not_open_browser_when_restarted_service_remains_unhealthy(monkeypatch) -> None: + calls: list[str] = [] + console = DummyConsole() + paths = _make_runtime_paths(Path("/tmp/flocks-test")) + degraded_payload = _supervisor_status_payload() + degraded_payload["backend"].update({ + "state": "degraded", + "health": "degraded", + "last_error": "port unavailable", + }) + degraded_status = _supervisor_status(degraded_payload) + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: degraded_status) + monkeypatch.setattr( + service_manager, + "request_restart", + lambda _config, **_kwargs: calls.append("restart") or degraded_status, + ) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda *_args, **_kwargs: calls.append("status")) + monkeypatch.setattr(service_manager, "open_default_browser", lambda *_args, **_kwargs: calls.append("browser")) + + service_manager.start_all(service_manager.ServiceConfig(), console) + + assert calls == ["restart", "status"] + assert "[flocks] Flocks daemon 已在运行,但 Flocks service 不可用,正在重启..." in console.messages + + def test_start_all_restarts_running_daemon_when_config_changes(monkeypatch) -> None: calls: list[str] = [] console = DummyConsole() @@ -1130,6 +1220,7 @@ def _capture_spawn(*_args, **kwargs) -> SimpleNamespace: }] assert spawn_env is not None assert spawn_env.get("PYTHONUNBUFFERED") == "1" + assert "[flocks] 启动 Flocks service..." not in console.messages def test_start_backend_rolls_back_when_probe_fails(monkeypatch, tmp_path: Path) -> None: diff --git a/tests/cli/test_update_command.py b/tests/cli/test_update_command.py index 6c3104b78..3d5253836 100644 --- a/tests/cli/test_update_command.py +++ b/tests/cli/test_update_command.py @@ -81,6 +81,8 @@ async def fake_perform_update( *, zipball_url: str | None = None, tarball_url: str | None = None, + bundle_sha256: str | None = None, + bundle_format: str | None = None, restart: bool = True, locale: str | None = None, region: str | None = None, @@ -238,6 +240,8 @@ async def fake_perform_update( *, zipball_url: str | None = None, tarball_url: str | None = None, + bundle_sha256: str | None = None, + bundle_format: str | None = None, restart: bool = True, locale: str | None = None, region: str | None = None, @@ -296,6 +300,8 @@ async def fake_perform_update( *, zipball_url: str | None = None, tarball_url: str | None = None, + bundle_sha256: str | None = None, + bundle_format: str | None = None, restart: bool = True, locale: str | None = None, region: str | None = None, diff --git a/tests/tool/test_device_schema_migration.py b/tests/tool/test_device_schema_migration.py new file mode 100644 index 000000000..40bd9be68 --- /dev/null +++ b/tests/tool/test_device_schema_migration.py @@ -0,0 +1,94 @@ +"""Device integration schema migration tests.""" + +import sqlite3 +from pathlib import Path +from typing import Any + +import pytest + +from flocks.storage.storage import Storage +from flocks.tool.device import models as device_models + + +def _reset_storage_state() -> None: + Storage._initialized = False + Storage._init_pid = None + Storage._db_path = None + + +async def _shutdown_storage() -> None: + await Storage.shutdown() + _reset_storage_state() + + +async def _device_columns(db_path: Path) -> set[str]: + async with Storage.connect(db_path) as db: + cursor = await db.execute("PRAGMA table_info(device_integrations)") + return {str(row[1]) for row in await cursor.fetchall()} + + +async def _device_indexes(db_path: Path) -> set[str]: + async with Storage.connect(db_path) as db: + cursor = await db.execute("PRAGMA index_list(device_integrations)") + return {str(row[1]) for row in await cursor.fetchall()} + + +def _capture_storage_warnings(monkeypatch) -> list[tuple[Any, Any]]: + warnings: list[tuple[Any, Any]] = [] + monkeypatch.setattr(Storage._log, "warn", lambda message=None, extra=None: warnings.append((message, extra))) + return warnings + + +def _extension_ddl_warnings(warnings: list[tuple[Any, Any]]) -> list[tuple[Any, Any]]: + return [entry for entry in warnings if entry[0] == "storage.extension_ddl.failed"] + + +@pytest.mark.asyncio +async def test_device_schema_fresh_init_does_not_warn_duplicate_group_id(monkeypatch, tmp_path: Path) -> None: + warnings = _capture_storage_warnings(monkeypatch) + db_path = tmp_path / "fresh.db" + + _reset_storage_state() + try: + await Storage.init(db_path) + + assert device_models.DEFAULT_GROUP_ID == "default-room" + assert "group_id" in await _device_columns(db_path) + assert "idx_device_group" in await _device_indexes(db_path) + assert _extension_ddl_warnings(warnings) == [] + finally: + await _shutdown_storage() + + +@pytest.mark.asyncio +async def test_device_schema_old_integrations_table_gets_group_id(monkeypatch, tmp_path: Path) -> None: + warnings = _capture_storage_warnings(monkeypatch) + db_path = tmp_path / "old.db" + with sqlite3.connect(db_path) as db: + db.executescript(""" + CREATE TABLE device_integrations ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + storage_key TEXT NOT NULL, + service_id TEXT NOT NULL, + enabled INTEGER NOT NULL DEFAULT 1, + verify_ssl INTEGER NOT NULL DEFAULT 0, + fields TEXT NOT NULL DEFAULT '{}', + status TEXT NOT NULL DEFAULT 'unknown', + message TEXT, + latency_ms INTEGER, + checked_at INTEGER, + created_at INTEGER NOT NULL, + updated_at INTEGER NOT NULL + ); + """) + + _reset_storage_state() + try: + await Storage.init(db_path) + + assert "group_id" in await _device_columns(db_path) + assert "idx_device_group" in await _device_indexes(db_path) + assert _extension_ddl_warnings(warnings) == [] + finally: + await _shutdown_storage() diff --git a/tests/updater/test_updater.py b/tests/updater/test_updater.py index 4dd329691..a3ed5f9b5 100644 --- a/tests/updater/test_updater.py +++ b/tests/updater/test_updater.py @@ -1353,6 +1353,51 @@ def test_start_upgrade_page_server_binds_configured_frontend_host( assert captured["wait_host"] == "0.0.0.0" +def test_stop_upgrade_page_server_does_not_kill_unified_flocks_service( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + flocks_root = tmp_path / ".flocks" + monkeypatch.setenv("FLOCKS_ROOT", str(flocks_root)) + killed: list[int] = [] + + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [111]) + monkeypatch.setattr( + service_manager, + "_process_command_line", + lambda _pid: "/env/bin/python -m flocks.cli.main serve --host 127.0.0.1 --port 5173", + ) + monkeypatch.setattr(updater.os, "kill", lambda pid, _sig: killed.append(pid)) + + updater._stop_upgrade_page_server(frontend_port=5173) + + assert killed == [] + + +def test_stop_upgrade_page_server_kills_only_upgrade_page_process( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + flocks_root = tmp_path / ".flocks" + page_dir = flocks_root / "run" / "upgrade-page" + monkeypatch.setenv("FLOCKS_ROOT", str(flocks_root)) + killed: list[int] = [] + + def fake_command_line(pid: int) -> str: + if pid == 222: + return f"/env/bin/python -m http.server 5173 --directory {page_dir}" + return "/env/bin/python -m flocks.cli.main serve --host 127.0.0.1 --port 5173" + + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [111, 222]) + monkeypatch.setattr(service_manager, "_process_command_line", fake_command_line) + monkeypatch.setattr(updater.os, "kill", lambda pid, _sig: killed.append(pid)) + monkeypatch.setattr(updater.time, "sleep", lambda _seconds: None) + + updater._stop_upgrade_page_server(frontend_port=5173) + + assert killed == [222] + + def test_wait_for_upgrade_page_uses_access_host_for_local_probe( monkeypatch: pytest.MonkeyPatch, ) -> None: From 3227f6bb7c9474c1f79d3317f2ba4e5e5b775a7a Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 14:45:11 +0800 Subject: [PATCH 20/28] Rename daemon logs and drop wheel force-include --- flocks/cli/service_supervisor.py | 4 ++-- pyproject.toml | 3 --- tests/cli/test_service_manager.py | 6 ++++++ tests/updater/test_updater.py | 6 ++++-- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py index d72fbaf5e..e94397300 100644 --- a/flocks/cli/service_supervisor.py +++ b/flocks/cli/service_supervisor.py @@ -56,12 +56,12 @@ def pid(self) -> int | None: def _daemon_log(event: str, details: dict[str, object] | None = None) -> None: - """Write a structured supervisor log line to stdout.""" + """Write a structured daemon log line to stdout.""" timestamp = datetime.datetime.now().isoformat(timespec="seconds") suffix = "" if details: suffix = " " + json.dumps(details, ensure_ascii=True, sort_keys=True) - sys.stdout.write(f"[{timestamp}] supervisor.{event}{suffix}\n") + sys.stdout.write(f"[{timestamp}] daemon.{event}{suffix}\n") sys.stdout.flush() diff --git a/pyproject.toml b/pyproject.toml index ab558f626..81ebf74ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,9 +109,6 @@ build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["flocks"] -[tool.hatch.build.targets.wheel.force-include] -"webui/dist" = "flocks/webui_static" - [tool.ruff] line-length = 120 target-version = "py312" diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index e4c6ab96f..f1e57b90b 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -439,6 +439,12 @@ def test_daemon_log_service_name_uses_daemon_only(tmp_path: Path) -> None: assert daemon._log_paths_for_service("supervisor") == [] +def test_daemon_log_event_prefix_uses_daemon(capsys) -> None: + service_supervisor._daemon_log("stopped") + + assert "daemon.stopped" in capsys.readouterr().out + + @pytest.mark.parametrize("disconnect_error", [BrokenPipeError, ConnectionResetError, ConnectionAbortedError]) def test_supervisor_control_send_json_ignores_disconnected_client(disconnect_error: type[Exception]) -> None: daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) diff --git a/tests/updater/test_updater.py b/tests/updater/test_updater.py index a3ed5f9b5..3cef6d5a0 100644 --- a/tests/updater/test_updater.py +++ b/tests/updater/test_updater.py @@ -538,12 +538,14 @@ def test_build_dependency_sync_command_keeps_project_install_on_non_windows( assert updater._build_dependency_sync_command("uv") == ["uv", "sync", "--frozen", "--no-python-downloads"] -def test_wheel_build_config_does_not_force_include_flockshub() -> None: +def test_wheel_build_config_does_not_force_include_runtime_or_build_outputs() -> None: pyproject_path = Path(__file__).resolve().parents[2] / "pyproject.toml" pyproject = tomllib.loads(pyproject_path.read_text(encoding="utf-8")) wheel_config = pyproject["tool"]["hatch"]["build"]["targets"]["wheel"] + forced_includes = wheel_config.get("force-include", {}) - assert ".flocks/flockshub" not in wheel_config.get("force-include", {}) + assert ".flocks/flockshub" not in forced_includes + assert "webui/dist" not in forced_includes def test_build_frontend_subprocess_env_prepends_bundled_node_on_windows( From 26c8f67bd57a27bec895ae9460eea7d3e57a5765 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 14:58:07 +0800 Subject: [PATCH 21/28] test(updater): add manual webui branch upgrade test --- tests/server/routes/test_update_routes.py | 119 ++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/tests/server/routes/test_update_routes.py b/tests/server/routes/test_update_routes.py index f673fca72..8be953356 100644 --- a/tests/server/routes/test_update_routes.py +++ b/tests/server/routes/test_update_routes.py @@ -1,5 +1,8 @@ from __future__ import annotations +import os +from urllib.parse import quote, unquote, urlparse + import pytest from fastapi import HTTPException, status from starlette.requests import Request @@ -7,11 +10,80 @@ pytestmark = pytest.mark.asyncio +_MANUAL_REAL_UPGRADE_ENV = "FLOCKS_RUN_REAL_WEBUI_UPGRADE_TEST" +_MANUAL_REAL_UPGRADE_BRANCH_ENV = "FLOCKS_REAL_WEBUI_UPGRADE_BRANCH" +_MANUAL_REAL_UPGRADE_TARGET_BRANCH = "" + def _request() -> Request: return Request({"type": "http", "method": "GET", "path": "/api/update/check", "headers": []}) +def _manual_real_upgrade_branch() -> str: + branch_input = ( + os.environ.get(_MANUAL_REAL_UPGRADE_BRANCH_ENV, "").strip() + or _MANUAL_REAL_UPGRADE_TARGET_BRANCH.strip() + ) + try: + if not branch_input: + branch_input = input("Target branch for the real WebUI upgrade test: ").strip() + if not branch_input: + pytest.skip("No target branch was provided for the real WebUI upgrade test") + + confirmation = input( + "This will trigger a real upgrade and may replace the current install tree. " + f"Type the branch name again to confirm ({branch_input}): " + ).strip() + except OSError as exc: + pytest.skip(f"Interactive confirmation is required: {exc}") + + if confirmation != branch_input: + pytest.skip("Real WebUI upgrade test was not confirmed") + return _normalize_manual_branch_target(branch_input) + + +def _normalize_manual_branch_target(target: str) -> str: + branch = target.strip() + parsed = urlparse(branch) + if parsed.scheme in {"http", "https"}: + path = parsed.path + github_marker = "/archive/refs/heads/" + gitee_marker = "/repository/archive/" + if github_marker in path: + branch = path.split(github_marker, 1)[1] + elif gitee_marker in path: + branch = path.split(gitee_marker, 1)[1] + branch = branch.removesuffix(".tar.gz").removesuffix(".zip") + branch = unquote(branch) + + for prefix in ("refs/heads/",): + if branch.startswith(prefix): + branch = branch[len(prefix):] + break + + if not branch: + pytest.skip("No target branch was provided for the real WebUI upgrade test") + return branch + + +def _manual_branch_version_label(branch: str) -> str: + return "branch-" + branch.replace("/", "-") + + +def _github_branch_archive_url(branch: str, extension: str) -> str: + encoded_branch = quote(branch, safe="/") + return f"https://github.com/AgentFlocks/flocks/archive/refs/heads/{encoded_branch}.{extension}" + + +async def test_normalize_manual_branch_target_accepts_archive_urls(): + assert _normalize_manual_branch_target( + "https://gitee.com/flocks/flocks/repository/archive/refactor/supervisor-control-adapters.zip" + ) == "refactor/supervisor-control-adapters" + assert _normalize_manual_branch_target( + "https://github.com/AgentFlocks/flocks/archive/refs/heads/fix/session-mixed-parts-read-merge.zip" + ) == "fix/session-mixed-parts-read-merge" + + async def test_check_version_requires_admin_for_flockspro(monkeypatch: pytest.MonkeyPatch): from flocks.server.routes import update as update_routes @@ -52,3 +124,50 @@ async def _fake_check_update(**kwargs): info = await update_routes.check_version(_request(), locale="zh-CN", edition="flocks") assert info.current_version == "v2026.5.9" + + +@pytest.mark.skipif( + os.environ.get(_MANUAL_REAL_UPGRADE_ENV) != "1", + reason=f"manual real upgrade test; set {_MANUAL_REAL_UPGRADE_ENV}=1 to enable", +) +async def test_manual_webui_apply_update_upgrades_to_confirmed_branch( + client, + monkeypatch: pytest.MonkeyPatch, +): + from flocks.config.config import UpdaterConfig + from flocks.server.routes import update as update_routes + from flocks.updater import updater as updater_module + from flocks.updater.models import VersionInfo + + branch = _manual_real_upgrade_branch() + version_label = _manual_branch_version_label(branch) + + async def _manual_github_updater_config(): + return UpdaterConfig( + repo="AgentFlocks/flocks", + gitee_repo=None, + sources=["github"], + archive_format="zip", + ) + + async def _manual_branch_update_info(**kwargs): + assert kwargs["force_console_manifest"] is False + return VersionInfo( + current_version="manual-real-upgrade-test", + latest_version=version_label, + has_update=True, + release_url=f"https://github.com/AgentFlocks/flocks/tree/{quote(branch, safe='/')}", + zipball_url=_github_branch_archive_url(branch, "zip"), + tarball_url=_github_branch_archive_url(branch, "tar.gz"), + ) + + monkeypatch.setattr(update_routes, "check_update", _manual_branch_update_info) + monkeypatch.setattr(updater_module, "_get_updater_config", _manual_github_updater_config) + + response = await client.post( + "/api/update/apply", + params={"edition": "flocks"}, + ) + + assert response.status_code == 200, response.text + assert '"stage":"error"' not in response.text From fc17ab5656dd95f3aff860a6bbca1b902c9548f1 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 15:23:22 +0800 Subject: [PATCH 22/28] fix(cli): simplify service start upgrade cleanup --- flocks/cli/service_manager.py | 23 ++++++++++------------ tests/cli/test_service_manager.py | 32 +++++++++++++------------------ 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index f782065f1..0309ae4e8 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -31,7 +31,6 @@ read_logs, read_supervisor_status, request_restart, - request_resume_upgrade, request_stop, stream_logs, supervisor_is_running, @@ -1538,12 +1537,12 @@ def _start_all_without_stop(config: ServiceConfig, console) -> None: """Start the supervisor daemon, then print access summary.""" paths = ensure_runtime_dirs() _print_static_port_migration_hint(config, console) - console.print("[flocks] [ ] 启动 Flocks daemon...") + console.print("[flocks] Flocks daemon 启动中...") cleanup_legacy_runtime_processes(paths, console) cleanup_orphan_service_ports(config, console) _ensure_webui_dist(ensure_install_layout(), config, console) process = _start_supervisor_process(config, paths, console) - console.print("[flocks] [x] 启动 Flocks daemon...") + console.print("[flocks] Flocks daemon 已启动。") payload = _wait_for_supervisor_ready(paths, process=process) _print_status_payload(payload, console, include_daemon_step=False) if not config.no_browser: @@ -1552,7 +1551,7 @@ def _start_all_without_stop(config: ServiceConfig, console) -> None: def _start_all_unlocked(config: ServiceConfig, console, *, paths: RuntimePaths) -> None: """Ensure the supervisor daemon is running; caller must hold lifecycle lock.""" - _resolve_upgrade_runtime(console, frontend_port=config.frontend_port, attempt_recover=True) + _resolve_upgrade_runtime(console, frontend_port=config.frontend_port, attempt_recover=False) if supervisor_is_running(paths): status = None try: @@ -1565,11 +1564,9 @@ def _start_all_unlocked(config: ServiceConfig, console, *, paths: RuntimePaths) _start_all_without_stop(config, console) return if status is not None and (status.backend.paused or status.backend.state.lower() == "paused"): - console.print("[flocks] Flocks daemon 已在运行,但 Flocks service 处于暂停状态,正在恢复...") - status = request_resume_upgrade(config, paths=paths) - _print_status_payload(status.raw, console, include_daemon_step=False) - if not config.no_browser and _supervisor_backend_is_healthy(status): - open_default_browser(_frontend_url_from_status(status, config.frontend_url), console) + console.print("[flocks] Flocks daemon 已在运行,但 Flocks service 处于暂停状态,正在重新启动...") + _stop_all_unlocked(console, paths=paths) + _start_all_without_stop(config, console) return if status is not None and not _supervisor_backend_is_healthy(status): console.print("[flocks] Flocks daemon 已在运行,但 Flocks service 不可用,正在重启...") @@ -1727,8 +1724,8 @@ def _daemon_status_line(payload: dict[str, Any]) -> str: return f"[flocks] daemon: state={state} PID={pid}{suffix}" -def _startup_step_marker(state: object, *, ready_states: set[str]) -> str: - return "[x]" if str(state or "").lower() in ready_states else "[!]" +def _startup_step_status(state: object, *, ready_states: set[str]) -> str: + return "已启动" if str(state or "").lower() in ready_states else "启动异常" def _startup_status_lines_from_payload(payload: dict[str, Any], *, include_daemon_step: bool = True) -> list[str]: @@ -1736,9 +1733,9 @@ def _startup_status_lines_from_payload(payload: dict[str, Any], *, include_daemo backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} lines = [] if include_daemon_step: - lines.append(f"[flocks] {_startup_step_marker(daemon.get('state'), ready_states={'running'})} 启动 Flocks daemon...") + lines.append(f"[flocks] Flocks daemon {_startup_step_status(daemon.get('state'), ready_states={'running'})}。") lines.extend([ - f"[flocks] {_startup_step_marker(backend.get('state'), ready_states={'healthy'})} 启动 Flocks service...", + f"[flocks] Flocks service {_startup_step_status(backend.get('state'), ready_states={'healthy'})}。", "", "[flocks] 服务", _daemon_status_line(daemon), diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index f1e57b90b..1346410cb 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -869,8 +869,8 @@ def test_startup_status_lines_use_progress_summary() -> None: lines = service_manager._startup_status_lines_from_payload(_supervisor_status_payload()) assert lines[:2] == [ - "[flocks] [x] 启动 Flocks daemon...", - "[flocks] [x] 启动 Flocks service...", + "[flocks] Flocks daemon 已启动。", + "[flocks] Flocks service 已启动。", ] assert lines[4] == "[flocks] daemon: state=running PID=100" assert lines[5] == "[flocks] flocks: state=healthy PID=111 URL=http://127.0.0.1:9000" @@ -885,7 +885,7 @@ def test_startup_status_lines_mark_unhealthy_steps() -> None: lines = service_manager._startup_status_lines_from_payload(payload) - assert lines[1] == "[flocks] [!] 启动 Flocks service..." + assert lines[1] == "[flocks] Flocks service 启动异常。" assert lines[5] == "[flocks] flocks: state=degraded PID=111 URL=http://127.0.0.1:9000 last_error=port occupied" @@ -895,7 +895,7 @@ def test_startup_status_lines_can_skip_daemon_step() -> None: include_daemon_step=False, ) - assert lines[:1] == ["[flocks] [x] 启动 Flocks service..."] + assert lines[:1] == ["[flocks] Flocks service 已启动。"] def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, tmp_path: Path) -> None: @@ -971,7 +971,7 @@ def supervisor_running(_paths) -> bool: service_manager.start_all(service_manager.ServiceConfig(frontend_port=5173), console) - assert events == ["upgrade:5173:True", "supervisor", "start"] + assert events == ["upgrade:5173:False", "supervisor", "start"] def test_start_all_does_not_duplicate_running_supervisor(monkeypatch) -> None: @@ -990,7 +990,7 @@ def test_start_all_does_not_duplicate_running_supervisor(monkeypatch) -> None: assert "[flocks] Flocks daemon 已在运行。" in console.messages -def test_start_all_resumes_paused_supervisor_before_opening_browser(monkeypatch) -> None: +def test_start_all_restarts_paused_supervisor(monkeypatch) -> None: calls: list[str] = [] console = DummyConsole() paths = _make_runtime_paths(Path("/tmp/flocks-test")) @@ -1008,23 +1008,17 @@ def test_start_all_resumes_paused_supervisor_before_opening_browser(monkeypatch) "paused": True, "last_error": "control upgrade prepare", }) - resumed_status = _supervisor_status() monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status(paused_payload)) - monkeypatch.setattr( - service_manager, - "request_resume_upgrade", - lambda _config, **_kwargs: calls.append("resume") or resumed_status, - ) - monkeypatch.setattr(service_manager, "_print_status_payload", lambda *_args, **_kwargs: calls.append("status")) - monkeypatch.setattr(service_manager, "open_default_browser", lambda url, _console: calls.append(f"browser:{url}")) + monkeypatch.setattr(service_manager, "_stop_all_unlocked", lambda _console, **_kwargs: calls.append("stop")) + monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda _config, _console: calls.append("start")) service_manager.start_all(service_manager.ServiceConfig(), console) - assert calls == ["resume", "status", "browser:http://127.0.0.1:9000"] - assert "[flocks] Flocks daemon 已在运行,但 Flocks service 处于暂停状态,正在恢复..." in console.messages + assert calls == ["stop", "start"] + assert "[flocks] Flocks daemon 已在运行,但 Flocks service 处于暂停状态,正在重新启动..." in console.messages def test_start_all_does_not_open_browser_when_restarted_service_remains_unhealthy(monkeypatch) -> None: @@ -1123,8 +1117,8 @@ def test_start_all_without_stop_starts_supervisor_daemon(monkeypatch, tmp_path: assert calls == ["daemon", "ready", "status"] assert console.messages == [ - "[flocks] [ ] 启动 Flocks daemon...", - "[flocks] [x] 启动 Flocks daemon...", + "[flocks] Flocks daemon 启动中...", + "[flocks] Flocks daemon 已启动。", ] @@ -1152,7 +1146,7 @@ def record_print(message: str) -> None: service_manager._start_all_without_stop(service_manager.ServiceConfig(no_browser=True), console) - assert events[:5] == ["print:[flocks] [ ] 启动 Flocks daemon...", "legacy", "orphan", "dist", "daemon"] + assert events[:5] == ["print:[flocks] Flocks daemon 启动中...", "legacy", "orphan", "dist", "daemon"] def test_start_all_propagates_supervisor_start_failure(monkeypatch) -> None: From 8d74380688353e9da0495427a49e09ddcd304f6f Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 15:47:37 +0800 Subject: [PATCH 23/28] fix(updater): defer supervisor handoff restart --- flocks/updater/restart_handoff.py | 66 ++++++++- flocks/updater/updater.py | 77 +---------- tests/updater/test_restart_handoff.py | 186 +++++++++++++++++++++++++- tests/updater/test_updater.py | 57 ++++---- 4 files changed, 278 insertions(+), 108 deletions(-) diff --git a/flocks/updater/restart_handoff.py b/flocks/updater/restart_handoff.py index 9ca6c90e8..1ad888b74 100644 --- a/flocks/updater/restart_handoff.py +++ b/flocks/updater/restart_handoff.py @@ -104,6 +104,7 @@ def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: parser.add_argument("--backend-port", type=int, required=True) parser.add_argument("--frontend-host", required=True) parser.add_argument("--frontend-port", type=int, required=True) + parser.add_argument("--backend-pid-file") parser.add_argument("--install-root", required=True) parser.add_argument("--uv-path", required=True) parser.add_argument("--sync-timeout", type=int, required=True) @@ -116,6 +117,7 @@ def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: parser.add_argument("--pro-bundle-manifest-path") parser.add_argument("--bundle-sha256") parser.add_argument("--cleanup-dir") + parser.add_argument("--prepare-handover", action="store_true") parser.add_argument("restart_argv", nargs=argparse.REMAINDER) args = parser.parse_args(argv) if args.restart_argv and args.restart_argv[0] == "--": @@ -158,15 +160,67 @@ def _rollback_failed_upgrade(args: argparse.Namespace, error: str) -> None: _record_handoff_log(f"rollback_failed error={exc}") +def _prepare_upgrade_handover(args: argparse.Namespace) -> bool: + from flocks.updater import updater + + try: + updater._prepare_upgrade_handover(args.version) + except Exception as exc: + _record_handoff_log(f"prepare_handover_failed error={exc}") + return False + return True + + +def _rollback_upgrade_handover() -> None: + from flocks.updater import updater + + try: + updater.rollback_upgrade_handover() + except Exception as exc: + _record_handoff_log(f"handover_rollback_failed error={exc}") + + def _cleanup_dir(path_value: str | None) -> None: if not path_value: return shutil.rmtree(Path(path_value), ignore_errors=True) +def _cli_subcommand(argv: Sequence[str]) -> str | None: + """Return the flocks.cli.main subcommand embedded in a Python argv.""" + for index, value in enumerate(argv[:-2]): + if value == "-m" and argv[index + 1] == "flocks.cli.main": + return argv[index + 2] + return None + + +def _restart_argv_for_current_runtime(args: argparse.Namespace, restart_argv: Sequence[str]) -> list[str]: + if _cli_subcommand(restart_argv) != "serve": + return list(restart_argv) + + argv = [ + restart_argv[0], + "-m", + "flocks.cli.main", + "start", + "--no-browser", + "--skip-webui-build", + "--host", + str(args.frontend_host), + "--port", + str(args.frontend_port), + "--server-host", + str(args.backend_host), + "--server-port", + str(args.backend_port), + ] + _record_handoff_log(f"legacy_serve_restart_migrated argv={argv}") + return argv + + def run(argv: Sequence[str] | None = None) -> int: args = _parse_args(argv) - restart_argv = list(args.restart_argv) + restart_argv = _restart_argv_for_current_runtime(args, args.restart_argv) if not restart_argv: _record_handoff_log("missing_restart_argv") return 2 @@ -182,7 +236,11 @@ def run(argv: Sequence[str] | None = None) -> int: _cleanup_dir(args.cleanup_dir) return 1 - if not _ensure_backend_port_free(args.backend_port): + if args.prepare_handover: + if not _prepare_upgrade_handover(args): + _cleanup_dir(args.cleanup_dir) + return 1 + elif not _ensure_backend_port_free(args.backend_port): _record_handoff_log(f"backend_port_unavailable port={args.backend_port}") _cleanup_dir(args.cleanup_dir) return 1 @@ -198,6 +256,8 @@ def run(argv: Sequence[str] | None = None) -> int: if not _stop_supervisor_before_restart(): _record_handoff_log("supervisor_stop_timeout") + if args.prepare_handover: + _rollback_upgrade_handover() _cleanup_dir(args.cleanup_dir) return 1 @@ -209,6 +269,8 @@ def run(argv: Sequence[str] | None = None) -> int: ) except OSError as exc: _record_handoff_log(f"restart_spawn_failed error={exc}") + if args.prepare_handover: + _rollback_upgrade_handover() _cleanup_dir(args.cleanup_dir) return 1 diff --git a/flocks/updater/updater.py b/flocks/updater/updater.py index 160d48bf1..5add0c0f7 100644 --- a/flocks/updater/updater.py +++ b/flocks/updater/updater.py @@ -199,17 +199,6 @@ def _looks_like_windows_python_launcher(entry: str) -> bool: return _windows_path_stem(entry) in {"python", "pythonw", "py"} -def _is_windows_file_in_use_error(exc: BaseException) -> bool: - """Return True when *exc* looks like a Windows file-lock failure.""" - if sys.platform != "win32": - return False - if isinstance(exc, OSError) and getattr(exc, "winerror", None) == 32: - return True - - text = str(exc).lower() - return "winerror 32" in text or "used by another process" in text - - def _is_uv_managed_python_runtime_error(text: str) -> bool: """Return True when uv reports a broken managed Python runtime cache.""" if not text: @@ -2970,7 +2959,6 @@ async def perform_update( current_version = get_current_version() effective_update_version = current_version skip_core_replace = False - handover_active = False console_manifest_info: ConsoleManifestRelease | None = None console_manifest_payload = console_manifest_payload if isinstance(console_manifest_payload, dict) else None fmt = _choose_archive_format(ucfg.archive_format) @@ -3162,20 +3150,7 @@ async def _queue_download_progress(progress: UpdateProgress) -> None: ) async def _restore_after_apply_failure() -> None: - nonlocal handover_active if backup_path is None: - if handover_active: - await asyncio.to_thread(rollback_upgrade_handover) - handover_active = False - return - if handover_active: - await asyncio.to_thread( - _rollback_failed_update, - backup_path, - install_root, - current_version, - ) - handover_active = False return await asyncio.to_thread( _restore_backup_if_possible, @@ -3193,28 +3168,6 @@ async def _restore_after_apply_failure() -> None: ) except Exception as exc: final_replace_error: Exception | None = exc - if ( - sys.platform == "win32" - and restart - and needs_handover - and not handover_active - and _is_windows_file_in_use_error(exc) - ): - log.warning("updater.replace.locked_retry_with_handover", {"error": str(exc)}) - try: - _prepare_upgrade_handover(latest_tag) - handover_active = True - if not skip_core_replace: - await asyncio.to_thread( - _replace_install_dir, - content_root, - install_root, - ) - except Exception as retry_exc: - final_replace_error = retry_exc - else: - final_replace_error = None - if final_replace_error is not None: shutil.rmtree(tmp_dir, ignore_errors=True) await _restore_after_apply_failure() @@ -3311,12 +3264,6 @@ async def _restore_after_apply_failure() -> None: restart_argv = _build_restart_argv(install_root) except Exception as exc: log.error("updater.restart.build_argv_failed", {"error": str(exc)}) - if handover_active: - try: - rollback_upgrade_handover() - except Exception: - pass - handover_active = False yield UpdateProgress( stage="error", message=f"Failed to build restart command: {exc}", @@ -3324,20 +3271,6 @@ async def _restore_after_apply_failure() -> None: ) return - if needs_handover and not handover_active: - try: - _prepare_upgrade_handover(latest_tag) - handover_active = True - except Exception as exc: - log.error("updater.handover.failed", {"error": str(exc)}) - await _restore_after_apply_failure() - yield UpdateProgress( - stage="error", - message=f"Failed to prepare WebUI handover: {exc}", - success=False, - ) - return - try: handoff_argv = _build_restart_handoff_argv( restart_argv, @@ -3353,6 +3286,7 @@ async def _restore_after_apply_failure() -> None: pro_bundle_manifest_path=pro_bundle_manifest_path, bundle_sha256=bundle_sha256, cleanup_dir=tmp_dir, + prepare_handover=needs_handover, ) log.info( "updater.restart.handoff_spawn", @@ -3366,12 +3300,6 @@ async def _restore_after_apply_failure() -> None: except Exception as exc: log.error("updater.restart.handoff_spawn_failed", {"error": str(exc)}) shutil.rmtree(tmp_dir, ignore_errors=True) - if handover_active: - try: - rollback_upgrade_handover() - except Exception: - pass - handover_active = False yield UpdateProgress( stage="error", message=f"Failed to restart service: {exc}", @@ -3492,6 +3420,7 @@ def _build_restart_handoff_argv( pro_bundle_manifest_path: Path | None = None, bundle_sha256: str | None = None, cleanup_dir: Path | None = None, + prepare_handover: bool = False, ) -> list[str]: """Wrap the real restart command in a helper that finishes upgrade work.""" if not restart_argv: @@ -3553,6 +3482,8 @@ def _build_restart_handoff_argv( argv.extend(["--bundle-sha256", bundle_sha256]) if cleanup_dir is not None: argv.extend(["--cleanup-dir", str(cleanup_dir)]) + if prepare_handover: + argv.append("--prepare-handover") argv.extend(["--", *managed_restart_argv]) return argv diff --git a/tests/updater/test_restart_handoff.py b/tests/updater/test_restart_handoff.py index a2a81f701..9190db208 100644 --- a/tests/updater/test_restart_handoff.py +++ b/tests/updater/test_restart_handoff.py @@ -10,8 +10,8 @@ from tests.helpers.service_supervisor import make_short_runtime_root, start_supervisor, stop_supervisor, wait_for_supervisor -def _handoff_args(tmp_path: Path, restart_argv: list[str]) -> list[str]: - return [ +def _handoff_args(tmp_path: Path, restart_argv: list[str], *, prepare_handover: bool = False) -> list[str]: + args = [ "--parent-pid", "1234", "--backend-host", @@ -32,9 +32,10 @@ def _handoff_args(tmp_path: Path, restart_argv: list[str]) -> list[str]: "2026.4.1", "--current-version", "2026.3.31", - "--", - *restart_argv, ] + if prepare_handover: + args.append("--prepare-handover") + return [*args, "--", *restart_argv] def test_run_waits_for_parent_and_backend_port_before_spawning( @@ -43,6 +44,22 @@ def test_run_waits_for_parent_and_backend_port_before_spawning( ) -> None: events: list[str] = [] restart_argv = ["python.exe", "-m", "flocks.cli.main", "serve", "--host", "127.0.0.1", "--port", "8000"] + expected_restart_argv = [ + "python.exe", + "-m", + "flocks.cli.main", + "start", + "--no-browser", + "--skip-webui-build", + "--host", + "127.0.0.1", + "--port", + "5173", + "--server-host", + "127.0.0.1", + "--server-port", + "8000", + ] monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) monkeypatch.setattr( @@ -71,11 +88,122 @@ def test_run_waits_for_parent_and_backend_port_before_spawning( code = restart_handoff.run(_handoff_args(tmp_path, restart_argv)) assert code == 0 - assert events[1:] == [ + assert events == [ + f"log:legacy_serve_restart_migrated argv={expected_restart_argv}", + "log:started parent_pid=1234 backend=127.0.0.1:8000 frontend=127.0.0.1:5173", "wait-parent:1234", "free-port:8000", "tasks", "stop-supervisor", + f"spawn:{expected_restart_argv}:{tmp_path}:True", + "log:restart_spawned pid=4321", + ] + + +def test_run_keeps_current_start_restart_argv(monkeypatch, tmp_path: Path) -> None: + events: list[str] = [] + restart_argv = [ + "python.exe", + "-m", + "flocks.cli.main", + "start", + "--no-browser", + "--skip-webui-build", + "--host", + "127.0.0.1", + "--port", + "5173", + ] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) + monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port: True) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: None) + monkeypatch.setattr(restart_handoff, "_stop_supervisor_before_restart", lambda: True) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda argv, cwd=None, close_fds=False: events.append(f"spawn:{list(argv)}:{cwd}:{close_fds}") + or SimpleNamespace(pid=4321), + ) + + code = restart_handoff.run(_handoff_args(tmp_path, restart_argv)) + + assert code == 0 + assert f"spawn:{restart_argv}:{tmp_path}:True" in events + + +def test_run_accepts_legacy_backend_pid_file_argument(monkeypatch, tmp_path: Path) -> None: + events: list[str] = [] + restart_argv = ["python.exe", "-m", "flocks.cli.main", "start"] + args = _handoff_args(tmp_path, restart_argv) + args[args.index("--install-root"):args.index("--install-root")] = [ + "--backend-pid-file", + str(tmp_path / "backend.pid"), + ] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) + monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port: True) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: None) + monkeypatch.setattr(restart_handoff, "_stop_supervisor_before_restart", lambda: True) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda argv, cwd=None, close_fds=False: events.append(f"spawn:{list(argv)}:{cwd}:{close_fds}") + or SimpleNamespace(pid=4321), + ) + + code = restart_handoff.run(args) + + assert code == 0 + assert f"spawn:{restart_argv}:{tmp_path}:True" in events + + +def test_run_prepares_handover_after_parent_exit_without_waiting_for_page_port( + monkeypatch, + tmp_path: Path, +) -> None: + events: list[str] = [] + restart_argv = ["python.exe", "-m", "flocks.cli.main", "start"] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr( + restart_handoff, + "_wait_for_parent_exit", + lambda parent_pid: events.append(f"wait-parent:{parent_pid}") or True, + ) + monkeypatch.setattr( + restart_handoff, + "_prepare_upgrade_handover", + lambda args: events.append(f"prepare:{args.version}") or True, + ) + monkeypatch.setattr( + restart_handoff, + "_ensure_backend_port_free", + lambda backend_port: events.append(f"free-port:{backend_port}") or True, + ) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: events.append("tasks") or None) + monkeypatch.setattr( + restart_handoff, + "_stop_supervisor_before_restart", + lambda: events.append("stop-supervisor") or True, + ) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda argv, cwd=None, close_fds=False: events.append(f"spawn:{list(argv)}:{cwd}:{close_fds}") + or SimpleNamespace(pid=4321), + ) + + code = restart_handoff.run(_handoff_args(tmp_path, restart_argv, prepare_handover=True)) + + assert code == 0 + assert events[1:] == [ + "wait-parent:1234", + "prepare:2026.4.1", + "tasks", + "stop-supervisor", f"spawn:{restart_argv}:{tmp_path}:True", "log:restart_spawned pid=4321", ] @@ -93,7 +221,7 @@ def test_run_does_not_spawn_when_parent_exit_times_out(monkeypatch, tmp_path: Pa ) monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: events.append("tasks") or None) - code = restart_handoff.run(_handoff_args(tmp_path, ["python.exe", "-m", "flocks.cli.main", "serve"])) + code = restart_handoff.run(_handoff_args(tmp_path, ["python.exe", "-m", "flocks.cli.main", "start"])) assert code == 1 assert events == ["log:started parent_pid=1234 backend=127.0.0.1:8000 frontend=127.0.0.1:5173", "log:parent_exit_timeout parent_pid=1234"] @@ -175,6 +303,52 @@ def test_run_does_not_spawn_when_supervisor_stop_fails(monkeypatch, tmp_path: Pa assert "spawn" not in events +def test_run_rolls_back_prepared_handover_when_supervisor_stop_fails(monkeypatch, tmp_path: Path) -> None: + events: list[str] = [] + restart_argv = ["python.exe", "-m", "flocks.cli.main", "start"] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) + monkeypatch.setattr(restart_handoff, "_prepare_upgrade_handover", lambda args: events.append("prepare") or True) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: None) + monkeypatch.setattr(restart_handoff, "_stop_supervisor_before_restart", lambda: False) + monkeypatch.setattr(restart_handoff, "_rollback_upgrade_handover", lambda: events.append("rollback-handover")) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda *_args, **_kwargs: events.append("spawn"), + ) + + code = restart_handoff.run(_handoff_args(tmp_path, restart_argv, prepare_handover=True)) + + assert code == 1 + assert "rollback-handover" in events + assert "spawn" not in events + + +def test_run_rolls_back_prepared_handover_when_restart_spawn_fails(monkeypatch, tmp_path: Path) -> None: + events: list[str] = [] + restart_argv = ["python.exe", "-m", "flocks.cli.main", "start"] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) + monkeypatch.setattr(restart_handoff, "_prepare_upgrade_handover", lambda args: events.append("prepare") or True) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: None) + monkeypatch.setattr(restart_handoff, "_stop_supervisor_before_restart", lambda: True) + monkeypatch.setattr(restart_handoff, "_rollback_upgrade_handover", lambda: events.append("rollback-handover")) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda *_args, **_kwargs: (_ for _ in ()).throw(OSError("spawn failed")), + ) + + code = restart_handoff.run(_handoff_args(tmp_path, restart_argv, prepare_handover=True)) + + assert code == 1 + assert "log:restart_spawn_failed error=spawn failed" in events + assert "rollback-handover" in events + + @pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") def test_stop_supervisor_before_restart_waits_until_real_control_api_stops(monkeypatch) -> None: short_root = make_short_runtime_root("flocks-handoff-") diff --git a/tests/updater/test_updater.py b/tests/updater/test_updater.py index 3cef6d5a0..cbeb4b043 100644 --- a/tests/updater/test_updater.py +++ b/tests/updater/test_updater.py @@ -259,13 +259,6 @@ def test_find_executable_checks_windows_cmd_suffixes( assert updater._find_executable("npm") == str(npm_cmd) -def test_is_windows_file_in_use_error_detects_winerror32(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(updater.sys, "platform", "win32") - - assert updater._is_windows_file_in_use_error(PermissionError("[WinError 32] file in use")) is True - assert updater._is_windows_file_in_use_error(PermissionError("[WinError 5] access denied")) is False - - def test_is_uv_managed_python_runtime_error_detects_virtualenv_creation_failure() -> None: text = ( "Failed to create temporary virtualenv\n" @@ -830,8 +823,10 @@ def test_build_restart_handoff_argv_rewrites_serve_to_managed_start( sync_timeout=300, version="2026.4.1", current_version="2026.3.31", + prepare_handover=True, ) + assert "--prepare-handover" in argv[: argv.index("--")] assert argv[argv.index("--") + 1 :] == [ "python", "-m", @@ -1707,7 +1702,7 @@ def test_replace_install_dir_copies_dot_flocks_plugins_from_source( @pytest.mark.asyncio -async def test_perform_update_schedules_handoff_after_handover( +async def test_perform_update_schedules_handoff_with_deferred_handover( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: @@ -1783,12 +1778,13 @@ async def fake_sleep(_seconds) -> None: pass assert events[:2] == ["replace", "sleep"] - assert "handover" in events + assert "handover" not in events assert len(popen_calls) == 1 handoff_argv = popen_calls[0] assert handoff_argv[:3] == ["/usr/bin/python3", "-m", "flocks.updater.restart_handoff"] assert "--uv-path" in handoff_argv assert "--version" in handoff_argv + assert "--prepare-handover" in handoff_argv[: handoff_argv.index("--")] assert handoff_argv[handoff_argv.index("--") + 1 :] == [ "/usr/bin/python3", "-m", @@ -1808,7 +1804,7 @@ async def fake_sleep(_seconds) -> None: @pytest.mark.asyncio -async def test_perform_update_errors_when_handover_fails_before_frontend_build( +async def test_perform_update_does_not_prepare_handover_before_spawning_handoff( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: @@ -1824,6 +1820,7 @@ async def test_perform_update_errors_when_handover_fails_before_frontend_build( install_root.mkdir() events: list[str] = [] + popen_calls: list[list[str]] = [] async def fake_get_updater_config(): return SimpleNamespace( @@ -1880,12 +1877,20 @@ async def fake_sleep(_seconds) -> None: lambda _version: (_ for _ in ()).throw(RuntimeError("handover boom")), ) monkeypatch.setattr(updater, "_restore_backup_if_possible", lambda *_args: events.append("restore")) + monkeypatch.setattr( + updater, + "_spawn_restart_handoff", + lambda argv, **_kwargs: popen_calls.append(list(argv)) or SimpleNamespace(pid=4321), + ) + monkeypatch.setattr(updater.os, "_exit", lambda code: (_ for _ in ()).throw(SystemExit(code))) - progresses = [step async for step in updater.perform_update("2026.4.1")] + with pytest.raises(SystemExit, match="0"): + async for _step in updater.perform_update("2026.4.1"): + pass - assert progresses[-1].stage == "error" - assert progresses[-1].message == "Failed to prepare WebUI handover: handover boom" - assert events == ["replace", "restore"] + assert events == ["replace"] + assert len(popen_calls) == 1 + assert "--prepare-handover" in popen_calls[0][: popen_calls[0].index("--")] @pytest.mark.asyncio @@ -2919,7 +2924,7 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): @pytest.mark.asyncio -async def test_perform_update_retries_after_windows_file_lock_and_rolls_back_handover_failures( +async def test_perform_update_reports_windows_file_lock_without_stopping_current_backend( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: @@ -2991,17 +2996,13 @@ def fake_replace_install_dir(*_args, **_kwargs): ) monkeypatch.setattr(updater.os, "_exit", lambda code: (_ for _ in ()).throw(SystemExit(code))) - with pytest.raises(SystemExit, match="0"): - async for _step in updater.perform_update("2026.4.1"): - pass + progresses = [step async for step in updater.perform_update("2026.4.1")] - assert events == [ - "replace-1", - "handover", - "replace-2", - "popen", - ] - assert "restore" not in events + assert progresses[-1].stage == "error" + assert "WinError 32" in progresses[-1].message + assert events == ["replace-1", "restore"] + assert "handover" not in events + assert "popen" not in events @pytest.mark.asyncio @@ -3290,6 +3291,7 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): assert handoff_argv[:3] == [r"C:\tool\python.exe", "-m", "flocks.updater.restart_handoff"] assert "--parent-pid" in handoff_argv assert "--backend-port" in handoff_argv + assert "--prepare-handover" in handoff_argv[: handoff_argv.index("--")] assert handoff_argv[handoff_argv.index("--") + 1 :] == [ r"C:\tool\python.exe", "-m", @@ -3306,7 +3308,7 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): "--server-port", "8000", ] - assert events == ["handover"] + assert events == [] assert "execv" not in events @@ -3499,4 +3501,5 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): assert progresses[-1].stage == "error" assert "Failed to restart service" in progresses[-1].message - assert "rollback_handover" in events + assert "handover" not in events + assert "rollback_handover" not in events From 7479b675258a5cbf7c7250984202e7d6f0fd129b Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 16:07:12 +0800 Subject: [PATCH 24/28] test(docker): remove obsolete runtime install assertion --- tests/docker/test_dockerfile_runtime_requirements.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/docker/test_dockerfile_runtime_requirements.py b/tests/docker/test_dockerfile_runtime_requirements.py index 1e2d5ccc1..bbb61e1c8 100644 --- a/tests/docker/test_dockerfile_runtime_requirements.py +++ b/tests/docker/test_dockerfile_runtime_requirements.py @@ -5,14 +5,6 @@ DOCKERFILE = REPO_ROOT / "docker" / "Dockerfile" -def test_runtime_image_installs_required_cli_tools() -> None: - dockerfile = DOCKERFILE.read_text(encoding="utf-8") - - assert "npm install --global agent-browser" in dockerfile - assert "agent-browser install --with-deps" in dockerfile - assert "curl -LsSf https://astral.sh/uv/install.sh | sh" in dockerfile - - def test_runtime_image_no_longer_bundles_system_chromium() -> None: dockerfile = DOCKERFILE.read_text(encoding="utf-8") From 6cd762009701e3855a941b8fc7f02bc7601c50d7 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 16:44:25 +0800 Subject: [PATCH 25/28] fix(installer): build webui after dependency install --- scripts/install.ps1 | 13 +++++++++++++ scripts/install.sh | 9 +++++++++ 2 files changed, 22 insertions(+) diff --git a/scripts/install.ps1 b/scripts/install.ps1 index aac4a1b4b..8dffbb00d 100644 --- a/scripts/install.ps1 +++ b/scripts/install.ps1 @@ -1293,6 +1293,19 @@ function Main { finally { Pop-Location } + Write-Info (Get-LocalizedText -English "Building WebUI static assets..." -Chinese "正在构建 WebUI 静态资源...") + Push-Location (Join-Path $RootDir "webui") + try { + $null = Invoke-NativeCommandOrFail ` + -Description "WebUI static asset build" ` + -FilePath "npm.cmd" ` + -ArgumentList @("run", "build") ` + -WorkingDirectory (Join-Path $RootDir "webui") ` + -StreamOutput + } + finally { + Pop-Location + } if ($InstallTui) { Install-Bun diff --git a/scripts/install.sh b/scripts/install.sh index 72246984b..35780c3dc 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1150,6 +1150,15 @@ main() { cd "$ROOT_DIR/webui" npm_config_registry="$NPM_REGISTRY" "$NPM_CMD" install ) + if is_zh_install; then + info "正在构建 WebUI 静态资源..." + else + info "Building WebUI static assets..." + fi + ( + cd "$ROOT_DIR/webui" + "$NPM_CMD" run build + ) if [[ "$INSTALL_TUI" -eq 1 ]]; then install_bun From 2f3d6c8ec6f91bdf4ac7628074d86b7747ba8e62 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 16:54:25 +0800 Subject: [PATCH 26/28] fix(doctor): detect current service status --- flocks/cli/commands/doctor.py | 13 +++++++++---- tests/cli/test_doctor_command.py | 25 ++++++++++++++++++++----- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/flocks/cli/commands/doctor.py b/flocks/cli/commands/doctor.py index 30b5deb35..812ff2618 100644 --- a/flocks/cli/commands/doctor.py +++ b/flocks/cli/commands/doctor.py @@ -101,20 +101,25 @@ def _print_service_diagnosis() -> None: status_lines = build_status_lines() except Exception as error: console.print(f"[yellow]服务状态检查失败:{error}[/yellow]") - console.print("[yellow]服务不正常,请执行 `flocks restart`[/yellow]") + console.print("[yellow]运行状态异常,请执行 `flocks restart`[/yellow]") return for line in status_lines: console.print(line) if _service_status_is_healthy(status_lines): - console.print("[green]服务正常[/green]") + console.print("[green]运行状态正常[/green]") else: - console.print("[yellow]服务不正常,请执行 `flocks restart`[/yellow]") + console.print("[yellow]运行状态异常,请执行 `flocks restart`[/yellow]") def _service_status_is_healthy(status_lines: list[str]) -> bool: - """Return whether backend and WebUI both look healthy from status lines.""" + """Return whether the current or legacy service status looks healthy.""" + daemon_running = any("daemon:" in line and "state=running" in line for line in status_lines) + flocks_healthy = any("flocks:" in line and "state=healthy" in line for line in status_lines) + if daemon_running and flocks_healthy: + return True + backend_running = any("后端运行中" in line for line in status_lines) webui_running = any("WebUI 运行中" in line for line in status_lines) return backend_running and webui_running diff --git a/tests/cli/test_doctor_command.py b/tests/cli/test_doctor_command.py index 28ef05c95..4220514cd 100644 --- a/tests/cli/test_doctor_command.py +++ b/tests/cli/test_doctor_command.py @@ -26,8 +26,8 @@ def fake_run(command, *, cwd, check, env): monkeypatch.setattr( "flocks.cli.service_manager.build_status_lines", lambda: [ - "[flocks] 后端运行中: PID=111 URL=http://127.0.0.1:8000", - "[flocks] WebUI 运行中: PID=222 URL=http://127.0.0.1:5173", + "[flocks] daemon: state=running PID=111", + "[flocks] flocks: state=healthy PID=222 URL=http://127.0.0.1:5173", ], ) @@ -37,7 +37,7 @@ def fake_run(command, *, cwd, check, env): assert "Flocks source directory:" in result.stdout assert "scripts/install.sh" in result.stdout assert "安装正常" in result.stdout - assert "服务正常" in result.stdout + assert "运行状态正常" in result.stdout assert len(calls) == 1 command, cwd, check = calls[0] @@ -71,10 +71,25 @@ def fake_run(command, *, cwd, check, env): assert isinstance(env, dict) assert env["FLOCKS_INSTALL_LANGUAGE"] == "zh-CN" assert env["FLOCKS_UV_DEFAULT_INDEX"] == "https://mirrors.aliyun.com/pypi/simple" - assert "服务不正常,请执行 `flocks restart`" in result.stdout + assert "运行状态异常,请执行 `flocks restart`" in result.stdout + + +def test_service_status_is_healthy_accepts_current_daemon_status() -> None: + assert doctor_cmd._service_status_is_healthy( + [ + "[flocks] daemon: state=running PID=111", + "[flocks] flocks: state=healthy PID=222 URL=http://127.0.0.1:5173", + ] + ) + assert not doctor_cmd._service_status_is_healthy( + [ + "[flocks] daemon: state=running PID=111", + "[flocks] flocks: state=degraded PID=222 URL=http://127.0.0.1:5173", + ] + ) -def test_service_status_is_healthy_requires_backend_and_webui() -> None: +def test_service_status_is_healthy_accepts_legacy_backend_and_webui() -> None: assert doctor_cmd._service_status_is_healthy( [ "[flocks] 后端运行中: PID=111 URL=http://127.0.0.1:8000", From fb12b0e0e98ced1a5d79169e20f57cf9b86e8250 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 18:49:47 +0800 Subject: [PATCH 27/28] fix(server): harden startup and auth security --- flocks/cli/service_manager.py | 33 +++++ flocks/server/app.py | 16 +++ flocks/server/auth.py | 3 - flocks/server/routes/auth.py | 123 +++++++++++++++++- flocks/server/routes/health.py | 7 - tests/cli/test_service_manager.py | 36 ++++- tests/server/routes/test_auth_audit_routes.py | 38 ++++++ tests/server/test_auth_compat.py | 3 + tests/server/test_server.py | 16 ++- 9 files changed, 259 insertions(+), 16 deletions(-) diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index 0309ae4e8..cc4ce8d44 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -1406,6 +1406,35 @@ def _wait_for_supervisor_ready( raise ServiceError("Flocks daemon 启动超时,请检查日志。") +def _startup_payload_is_ready(payload: dict[str, Any]) -> bool: + """Return whether startup status represents a usable Flocks service.""" + backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} + webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} + backend_state = str(backend.get("state") or "").lower() + webui_state = str(webui.get("state") or "").lower() + return backend_state == "healthy" and webui_state in {"healthy", "static"} + + +def _startup_failure_message(payload: dict[str, Any]) -> str: + """Build a concise error for failed startup status payloads.""" + daemon = payload.get("daemon") if isinstance(payload.get("daemon"), dict) else {} + backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} + webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} + details = [] + backend_error = backend.get("last_error") + webui_error = webui.get("last_error") + details.append(f"flocks state={backend.get('state') or 'unknown'}") + if backend_error: + details.append(f"last_error={backend_error}") + if webui.get("state") not in {"healthy", "static"}: + details.append(f"webui state={webui.get('state') or 'unknown'}") + if webui_error and webui_error != backend_error: + details.append(f"webui_error={webui_error}") + log_path = backend.get("log_path") or daemon.get("log_path") + suffix = f";日志: {log_path}" if log_path else "" + return f"Flocks service 启动失败({', '.join(details)}){suffix}" + + def _start_supervisor_process(config: ServiceConfig, paths: RuntimePaths, console) -> subprocess.Popen: """Spawn the detached service supervisor daemon.""" root = ensure_install_layout() @@ -1545,6 +1574,8 @@ def _start_all_without_stop(config: ServiceConfig, console) -> None: console.print("[flocks] Flocks daemon 已启动。") payload = _wait_for_supervisor_ready(paths, process=process) _print_status_payload(payload, console, include_daemon_step=False) + if not _startup_payload_is_ready(payload): + raise ServiceError(_startup_failure_message(payload)) if not config.no_browser: open_default_browser(config.frontend_url, console) @@ -1572,6 +1603,8 @@ def _start_all_unlocked(config: ServiceConfig, console, *, paths: RuntimePaths) console.print("[flocks] Flocks daemon 已在运行,但 Flocks service 不可用,正在重启...") status = request_restart(config, paths=paths) _print_status_payload(status.raw, console, include_daemon_step=False) + if not _startup_payload_is_ready(status.raw): + raise ServiceError(_startup_failure_message(status.raw)) if not config.no_browser and _supervisor_backend_is_healthy(status): open_default_browser(_frontend_url_from_status(status, config.frontend_url), console) return diff --git a/flocks/server/app.py b/flocks/server/app.py index d5e9f128c..c5d408c0b 100644 --- a/flocks/server/app.py +++ b/flocks/server/app.py @@ -660,6 +660,13 @@ async def _run_http_middleware_hooks(request: Request, context: dict[str, Any]) "/api/session/status", }) +_SECURITY_HEADERS = { + "X-Content-Type-Options": "nosniff", + "Referrer-Policy": "no-referrer", + "Content-Security-Policy": "frame-ancestors 'self'", + "Permissions-Policy": "camera=(), microphone=(), geolocation=()", +} + def _is_noisy_request_path(path: str) -> bool: """Return True for high-frequency polling endpoints that are noisy on success.""" @@ -781,6 +788,15 @@ async def __call__(self, scope, receive, send): await self._inner(scope, receive, send) +@app.middleware("http") +async def security_headers_middleware(request: Request, call_next): + """Attach baseline browser security headers to every HTTP response.""" + response = await call_next(request) + for name, value in _SECURITY_HEADERS.items(): + response.headers.setdefault(name, value) + return response + + @app.middleware("http") async def instance_context_middleware(request: Request, call_next): """ diff --git a/flocks/server/auth.py b/flocks/server/auth.py index 0e9f6f95c..593508eeb 100644 --- a/flocks/server/auth.py +++ b/flocks/server/auth.py @@ -23,9 +23,6 @@ PUBLIC_PATHS = frozenset({ "/", "/health", - "/docs", - "/redoc", - "/openapi.json", "/favicon.ico", "/api/health", "/api/config/ui-display", diff --git a/flocks/server/routes/auth.py b/flocks/server/routes/auth.py index 83ec3df9d..813ac1253 100644 --- a/flocks/server/routes/auth.py +++ b/flocks/server/routes/auth.py @@ -4,6 +4,8 @@ from __future__ import annotations +import threading +import time from typing import Any from fastapi import APIRouter, HTTPException, Request, Response, status @@ -21,6 +23,106 @@ router = APIRouter() +_LOGIN_FAILURE_WINDOW_SECONDS = 5 * 60 +_LOGIN_LOCKOUT_SECONDS = 15 * 60 +_LOGIN_MAX_FAILURES_PER_USER_AND_IP = 5 +_LOGIN_MAX_FAILURES_PER_IP = 20 + + +class _LoginRateLimiter: + """In-process failed-login limiter for local account authentication.""" + + def __init__(self) -> None: + self._lock = threading.Lock() + self._failures: dict[tuple[str, str], list[float]] = {} + self._locked_until: dict[tuple[str, str], float] = {} + + def check(self, *, username: str, ip: str | None) -> int | None: + """Return retry-after seconds when the login attempt is currently blocked.""" + now = time.monotonic() + with self._lock: + retry_after = self._retry_after(("user_ip", self._user_ip_key(username, ip)), now) + if retry_after is not None: + return retry_after + return self._retry_after(("ip", self._ip_key(ip)), now) + + def record_failure(self, *, username: str, ip: str | None) -> int | None: + """Record a failed login attempt and return retry-after when it locks out.""" + now = time.monotonic() + with self._lock: + user_retry = self._record_failure( + ("user_ip", self._user_ip_key(username, ip)), + limit=_LOGIN_MAX_FAILURES_PER_USER_AND_IP, + now=now, + ) + ip_retry = self._record_failure( + ("ip", self._ip_key(ip)), + limit=_LOGIN_MAX_FAILURES_PER_IP, + now=now, + ) + if user_retry is not None and ip_retry is not None: + return max(user_retry, ip_retry) + return user_retry if user_retry is not None else ip_retry + + def record_success(self, *, username: str, ip: str | None) -> None: + """Clear the exact user/IP failure bucket after a successful login.""" + with self._lock: + key = ("user_ip", self._user_ip_key(username, ip)) + self._failures.pop(key, None) + self._locked_until.pop(key, None) + + def reset(self) -> None: + """Clear limiter state for tests and process lifecycle resets.""" + with self._lock: + self._failures.clear() + self._locked_until.clear() + + def _retry_after(self, key: tuple[str, str], now: float) -> int | None: + locked_until = self._locked_until.get(key) + if locked_until is None: + return None + if locked_until <= now: + self._locked_until.pop(key, None) + self._failures.pop(key, None) + return None + return max(1, int(locked_until - now)) + + def _record_failure(self, key: tuple[str, str], *, limit: int, now: float) -> int | None: + if retry_after := self._retry_after(key, now): + return retry_after + cutoff = now - _LOGIN_FAILURE_WINDOW_SECONDS + failures = [timestamp for timestamp in self._failures.get(key, []) if timestamp >= cutoff] + failures.append(now) + self._failures[key] = failures + if len(failures) <= limit: + return None + locked_until = now + _LOGIN_LOCKOUT_SECONDS + self._locked_until[key] = locked_until + return _LOGIN_LOCKOUT_SECONDS + + @staticmethod + def _user_ip_key(username: str, ip: str | None) -> str: + return f"{(username or '').strip().casefold()}@{ip or 'unknown'}" + + @staticmethod + def _ip_key(ip: str | None) -> str: + return ip or "unknown" + + +_login_rate_limiter = _LoginRateLimiter() + + +def _request_ip(request: Request) -> str | None: + return getattr(getattr(request, "client", None), "host", None) + + +def _raise_login_rate_limited(retry_after: int) -> None: + raise HTTPException( + status_code=status.HTTP_429_TOO_MANY_REQUESTS, + detail="登录失败次数过多,请稍后再试", + headers={"Retry-After": str(retry_after)}, + ) + def _parse_event_type(event_type: str) -> tuple[str, str]: if "." in event_type: @@ -181,22 +283,39 @@ async def bootstrap_admin(payload: BootstrapAdminRequest, response: Response, re @router.post("/login", response_model=MeResponse, summary="登录本地账号") async def login(payload: LoginRequest, response: Response, request: Request) -> MeResponse: + ip = _request_ip(request) + retry_after = _login_rate_limiter.check(username=payload.username, ip=ip) + if retry_after is not None: + await _emit_auth_audit( + "account.login_rate_limited", + { + "username": payload.username, + "ip": ip, + "retry_after": retry_after, + }, + ) + _raise_login_rate_limited(retry_after) + try: user, session_id = await AuthService.login( payload.username, payload.password, ) except ValueError as exc: + retry_after = _login_rate_limiter.record_failure(username=payload.username, ip=ip) await _emit_auth_audit( "account.login_failed", { "username": payload.username, "reason": str(exc), - "ip": getattr(getattr(request, "client", None), "host", None), + "ip": ip, }, ) + if retry_after is not None: + _raise_login_rate_limited(retry_after) raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc + _login_rate_limiter.record_success(username=payload.username, ip=ip) set_session_cookie(response, session_id, secure=should_use_secure_cookie(request)) await _emit_auth_audit( "account.login", @@ -208,7 +327,7 @@ async def login(payload: LoginRequest, response: Response, request: Request) -> "username": user.username, "role": user.role, "session_id": session_id, - "ip": getattr(getattr(request, "client", None), "host", None), + "ip": ip, }, ) return _to_me_response(user) diff --git a/flocks/server/routes/health.py b/flocks/server/routes/health.py index f2539d038..84669c577 100644 --- a/flocks/server/routes/health.py +++ b/flocks/server/routes/health.py @@ -6,8 +6,6 @@ from pydantic import BaseModel from datetime import datetime -from flocks.config.config import Config - router = APIRouter() @@ -17,8 +15,6 @@ class HealthResponse(BaseModel): status: str version: str timestamp: str - config_dir: str - data_dir: str @router.get( @@ -35,15 +31,12 @@ async def health_check() -> HealthResponse: Returns server status and basic information """ from datetime import UTC - config = Config.get_global() from flocks.updater import get_current_version return HealthResponse( status="healthy", version=get_current_version(), timestamp=datetime.now(UTC).isoformat(), - config_dir=str(config.config_dir), - data_dir=str(config.data_dir), ) diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index 1346410cb..cef7885e4 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -1044,7 +1044,8 @@ def test_start_all_does_not_open_browser_when_restarted_service_remains_unhealth monkeypatch.setattr(service_manager, "_print_status_payload", lambda *_args, **_kwargs: calls.append("status")) monkeypatch.setattr(service_manager, "open_default_browser", lambda *_args, **_kwargs: calls.append("browser")) - service_manager.start_all(service_manager.ServiceConfig(), console) + with pytest.raises(service_manager.ServiceError, match="Flocks service 启动失败"): + service_manager.start_all(service_manager.ServiceConfig(), console) assert calls == ["restart", "status"] assert "[flocks] Flocks daemon 已在运行,但 Flocks service 不可用,正在重启..." in console.messages @@ -1122,6 +1123,39 @@ def test_start_all_without_stop_starts_supervisor_daemon(monkeypatch, tmp_path: ] +def test_start_all_without_stop_raises_when_service_starts_degraded(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + calls: list[str] = [] + console = DummyConsole() + degraded_payload = _supervisor_status_payload() + degraded_payload["backend"].update({ + "state": "degraded", + "health": "degraded", + "last_error": "port unavailable", + }) + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda *_args, **_kwargs: None) + monkeypatch.setattr( + service_manager, + "_start_supervisor_process", + lambda _config, _paths, _console: calls.append("daemon") or SimpleNamespace(poll=lambda: None), + ) + monkeypatch.setattr( + service_manager, + "_wait_for_supervisor_ready", + lambda _paths, **_kwargs: calls.append("ready") or degraded_payload, + ) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console, **_kwargs: calls.append("status")) + monkeypatch.setattr(service_manager, "open_default_browser", lambda *_args, **_kwargs: calls.append("browser")) + + with pytest.raises(service_manager.ServiceError, match="Flocks service 启动失败"): + service_manager._start_all_without_stop(service_manager.ServiceConfig(), console) + + assert calls == ["daemon", "ready", "status"] + + def test_start_all_without_stop_prints_before_cleanup(monkeypatch, tmp_path: Path) -> None: paths = _make_runtime_paths(tmp_path) events: list[str] = [] diff --git a/tests/server/routes/test_auth_audit_routes.py b/tests/server/routes/test_auth_audit_routes.py index 00f3bf908..c862c83f5 100644 --- a/tests/server/routes/test_auth_audit_routes.py +++ b/tests/server/routes/test_auth_audit_routes.py @@ -74,6 +74,44 @@ async def _emit(event_type: str, payload: dict): assert emitted[0][1]["username"] == "chenjie" +async def test_login_rate_limits_repeated_failures(monkeypatch: pytest.MonkeyPatch): + from flocks.server.routes import auth as auth_routes + + auth_routes._login_rate_limiter.reset() + calls = {"login": 0} + + async def _login(_username: str, _password: str): + calls["login"] += 1 + raise ValueError("用户名或密码错误") + + async def _emit(_event_type: str, _payload: dict): + return None + + monkeypatch.setattr(auth_routes.AuthService, "login", _login) + monkeypatch.setattr(auth_routes, "_emit_auth_audit", _emit) + + request = SimpleNamespace(client=SimpleNamespace(host="127.0.0.1")) + response = Response() + payload = auth_routes.LoginRequest(username="chenjie", password="bad") + try: + for _ in range(auth_routes._LOGIN_MAX_FAILURES_PER_USER_AND_IP): + with pytest.raises(HTTPException) as exc_info: + await auth_routes.login(payload, response, request) + assert exc_info.value.status_code == 400 + + with pytest.raises(HTTPException) as exc_info: + await auth_routes.login(payload, response, request) + assert exc_info.value.status_code == 429 + assert exc_info.value.headers["Retry-After"] + + with pytest.raises(HTTPException) as exc_info: + await auth_routes.login(payload, response, request) + assert exc_info.value.status_code == 429 + assert calls["login"] == auth_routes._LOGIN_MAX_FAILURES_PER_USER_AND_IP + 1 + finally: + auth_routes._login_rate_limiter.reset() + + async def test_logout_emits_audit_event(monkeypatch: pytest.MonkeyPatch): from flocks.server.routes import auth as auth_routes diff --git a/tests/server/test_auth_compat.py b/tests/server/test_auth_compat.py index d4014e2da..b92191bc9 100644 --- a/tests/server/test_auth_compat.py +++ b/tests/server/test_auth_compat.py @@ -295,6 +295,9 @@ def test_static_prefix_is_exempt(self): def test_protected_path_is_not_exempt(self): assert auth_module.auth_middleware_exempt("/api/session") is False assert auth_module.auth_middleware_exempt("/api/admin/users") is False + assert auth_module.auth_middleware_exempt("/docs") is False + assert auth_module.auth_middleware_exempt("/redoc") is False + assert auth_module.auth_middleware_exempt("/openapi.json") is False def test_channel_webhook_is_exempt_via_regex(self): # /api/channel/{channel_id}/webhook is the public callback entry for diff --git a/tests/server/test_server.py b/tests/server/test_server.py index 12ea20652..d326a7b69 100644 --- a/tests/server/test_server.py +++ b/tests/server/test_server.py @@ -53,8 +53,8 @@ async def test_health_check(client): assert data["status"] == "healthy" assert isinstance(data["version"], str) and data["version"] assert "timestamp" in data - assert "config_dir" in data - assert "data_dir" in data + assert "config_dir" not in data + assert "data_dir" not in data assert "task_manager_started" not in data assert "task_scheduler_running" not in data assert "task_scheduler_available" not in data @@ -66,6 +66,17 @@ async def test_health_check(client): assert "task_oldest_running_seconds" not in data +@pytest.mark.asyncio +async def test_security_headers_present(client): + """Baseline browser security headers should be present on HTTP responses.""" + response = await client.get("/api/health") + + assert response.headers["x-content-type-options"] == "nosniff" + assert response.headers["referrer-policy"] == "no-referrer" + assert response.headers["content-security-policy"] == "frame-ancestors 'self'" + assert response.headers["permissions-policy"] == "camera=(), microphone=(), geolocation=()" + + @pytest.mark.asyncio async def test_task_queue_status_includes_diagnostics(client): response = await client.get("/api/task-system/queue/status") @@ -555,4 +566,3 @@ async def test_question_pending_route_lists_session_requests(client): finally: clear_request_state(req1["id"]) clear_request_state(req2["id"]) - From 77b1e31e7770a33b2baf5f03cb2902b615dbf3ac Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Fri, 3 Jul 2026 18:57:25 +0800 Subject: [PATCH 28/28] fix(server): bound login rate limiter state --- flocks/server/routes/auth.py | 64 ++++++++++++++++++- tests/server/routes/test_auth_audit_routes.py | 31 +++++++++ 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/flocks/server/routes/auth.py b/flocks/server/routes/auth.py index 813ac1253..5cfcb30e2 100644 --- a/flocks/server/routes/auth.py +++ b/flocks/server/routes/auth.py @@ -27,6 +27,8 @@ _LOGIN_LOCKOUT_SECONDS = 15 * 60 _LOGIN_MAX_FAILURES_PER_USER_AND_IP = 5 _LOGIN_MAX_FAILURES_PER_IP = 20 +_LOGIN_PRUNE_INTERVAL_SECONDS = 60 +_LOGIN_MAX_TRACKED_BUCKETS = 2048 class _LoginRateLimiter: @@ -36,6 +38,7 @@ def __init__(self) -> None: self._lock = threading.Lock() self._failures: dict[tuple[str, str], list[float]] = {} self._locked_until: dict[tuple[str, str], float] = {} + self._last_pruned_at = 0.0 def check(self, *, username: str, ip: str | None) -> int | None: """Return retry-after seconds when the login attempt is currently blocked.""" @@ -50,16 +53,20 @@ def record_failure(self, *, username: str, ip: str | None) -> int | None: """Record a failed login attempt and return retry-after when it locks out.""" now = time.monotonic() with self._lock: + self._prune(now) + user_key = ("user_ip", self._user_ip_key(username, ip)) + ip_key = ("ip", self._ip_key(ip)) user_retry = self._record_failure( - ("user_ip", self._user_ip_key(username, ip)), + user_key, limit=_LOGIN_MAX_FAILURES_PER_USER_AND_IP, now=now, ) ip_retry = self._record_failure( - ("ip", self._ip_key(ip)), + ip_key, limit=_LOGIN_MAX_FAILURES_PER_IP, now=now, ) + self._enforce_capacity(now, preserve={user_key, ip_key}) if user_retry is not None and ip_retry is not None: return max(user_retry, ip_retry) return user_retry if user_retry is not None else ip_retry @@ -76,6 +83,7 @@ def reset(self) -> None: with self._lock: self._failures.clear() self._locked_until.clear() + self._last_pruned_at = 0.0 def _retry_after(self, key: tuple[str, str], now: float) -> int | None: locked_until = self._locked_until.get(key) @@ -100,6 +108,58 @@ def _record_failure(self, key: tuple[str, str], *, limit: int, now: float) -> in self._locked_until[key] = locked_until return _LOGIN_LOCKOUT_SECONDS + def _prune(self, now: float, *, force: bool = False) -> None: + if not force and ( + now - self._last_pruned_at < _LOGIN_PRUNE_INTERVAL_SECONDS + and self._tracked_bucket_count() <= _LOGIN_MAX_TRACKED_BUCKETS + ): + return + cutoff = now - _LOGIN_FAILURE_WINDOW_SECONDS + for key, locked_until in list(self._locked_until.items()): + if locked_until <= now: + self._locked_until.pop(key, None) + for key, failures in list(self._failures.items()): + if self._locked_until.get(key, 0) > now: + continue + active_failures = [timestamp for timestamp in failures if timestamp >= cutoff] + if active_failures: + self._failures[key] = active_failures + else: + self._failures.pop(key, None) + self._last_pruned_at = now + + def _enforce_capacity(self, now: float, *, preserve: set[tuple[str, str]]) -> None: + if self._tracked_bucket_count() <= _LOGIN_MAX_TRACKED_BUCKETS: + return + self._prune(now, force=True) + overflow = self._tracked_bucket_count() - _LOGIN_MAX_TRACKED_BUCKETS + if overflow <= 0: + return + candidates = [ + (max(failures, default=0.0), key) + for key, failures in self._failures.items() + if key not in preserve and self._locked_until.get(key, 0) <= now + ] + candidates.sort() + for _latest_failure, key in candidates[:overflow]: + self._failures.pop(key, None) + self._locked_until.pop(key, None) + overflow = self._tracked_bucket_count() - _LOGIN_MAX_TRACKED_BUCKETS + if overflow <= 0: + return + locked_candidates = [ + (locked_until, key) + for key, locked_until in self._locked_until.items() + if key not in preserve + ] + locked_candidates.sort() + for _locked_until, key in locked_candidates[:overflow]: + self._locked_until.pop(key, None) + self._failures.pop(key, None) + + def _tracked_bucket_count(self) -> int: + return len(set(self._failures) | set(self._locked_until)) + @staticmethod def _user_ip_key(username: str, ip: str | None) -> str: return f"{(username or '').strip().casefold()}@{ip or 'unknown'}" diff --git a/tests/server/routes/test_auth_audit_routes.py b/tests/server/routes/test_auth_audit_routes.py index c862c83f5..c01396eb9 100644 --- a/tests/server/routes/test_auth_audit_routes.py +++ b/tests/server/routes/test_auth_audit_routes.py @@ -112,6 +112,37 @@ async def _emit(_event_type: str, _payload: dict): auth_routes._login_rate_limiter.reset() +async def test_login_rate_limiter_prunes_expired_buckets(): + from flocks.server.routes import auth as auth_routes + + limiter = auth_routes._LoginRateLimiter() + now = auth_routes.time.monotonic() + stale_key = ("user_ip", "stale@127.0.0.1") + limiter._failures[stale_key] = [now - auth_routes._LOGIN_FAILURE_WINDOW_SECONDS - 1] + limiter._locked_until[stale_key] = now - 1 + limiter._last_pruned_at = now - auth_routes._LOGIN_PRUNE_INTERVAL_SECONDS - 1 + + limiter.record_failure(username="chenjie", ip="127.0.0.1") + + assert stale_key not in limiter._failures + assert stale_key not in limiter._locked_until + + +async def test_login_rate_limiter_caps_tracked_buckets(monkeypatch: pytest.MonkeyPatch): + from flocks.server.routes import auth as auth_routes + + monkeypatch.setattr(auth_routes, "_LOGIN_MAX_TRACKED_BUCKETS", 4) + monkeypatch.setattr(auth_routes, "_LOGIN_PRUNE_INTERVAL_SECONDS", 0) + limiter = auth_routes._LoginRateLimiter() + + for index in range(10): + limiter.record_failure(username=f"user{index}", ip="127.0.0.1") + + assert limiter._tracked_bucket_count() <= auth_routes._LOGIN_MAX_TRACKED_BUCKETS + assert ("user_ip", "user9@127.0.0.1") in limiter._failures + assert ("ip", "127.0.0.1") in limiter._failures + + async def test_logout_emits_audit_event(monkeypatch: pytest.MonkeyPatch): from flocks.server.routes import auth as auth_routes