diff --git a/flocks/cli/commands/doctor.py b/flocks/cli/commands/doctor.py index 30b5deb35..812ff2618 100644 --- a/flocks/cli/commands/doctor.py +++ b/flocks/cli/commands/doctor.py @@ -101,20 +101,25 @@ def _print_service_diagnosis() -> None: status_lines = build_status_lines() except Exception as error: console.print(f"[yellow]服务状态检查失败:{error}[/yellow]") - console.print("[yellow]服务不正常,请执行 `flocks restart`[/yellow]") + console.print("[yellow]运行状态异常,请执行 `flocks restart`[/yellow]") return for line in status_lines: console.print(line) if _service_status_is_healthy(status_lines): - console.print("[green]服务正常[/green]") + console.print("[green]运行状态正常[/green]") else: - console.print("[yellow]服务不正常,请执行 `flocks restart`[/yellow]") + console.print("[yellow]运行状态异常,请执行 `flocks restart`[/yellow]") def _service_status_is_healthy(status_lines: list[str]) -> bool: - """Return whether backend and WebUI both look healthy from status lines.""" + """Return whether the current or legacy service status looks healthy.""" + daemon_running = any("daemon:" in line and "state=running" in line for line in status_lines) + flocks_healthy = any("flocks:" in line and "state=healthy" in line for line in status_lines) + if daemon_running and flocks_healthy: + return True + backend_running = any("后端运行中" in line for line in status_lines) webui_running = any("WebUI 运行中" in line for line in status_lines) return backend_running and webui_running diff --git a/flocks/cli/main.py b/flocks/cli/main.py index 37069b6ab..8fce87905 100644 --- a/flocks/cli/main.py +++ b/flocks/cli/main.py @@ -5,7 +5,6 @@ """ import asyncio -import os import secrets as secrets_lib import sys from pathlib import Path @@ -31,10 +30,15 @@ task_app, ) from flocks.cli.commands.update import update_command -from flocks.cli.service_manager import ( +from flocks.cli.service_config import ( ServiceConfig, + ServiceConfigError, + build_service_config, + restart_defaults_from_status_payload, +) +from flocks.cli.service_control import 
read_supervisor_status +from flocks.cli.service_manager import ( ServiceError, - read_runtime_record, resolve_flocks_cli_command, restart_all, runtime_paths, @@ -43,6 +47,7 @@ start_all, stop_all, ) +from flocks.cli.service_supervisor import run_service_daemon from flocks.config.config import Config from flocks.utils.log import Log, LogLevel @@ -142,6 +147,8 @@ def main_callback( def _service_config( no_browser: bool = False, skip_webui_build: bool = False, + host: Optional[str] = None, + port: Optional[int] = None, server_host: Optional[str] = None, server_port: Optional[int] = None, webui_host: Optional[str] = None, @@ -153,87 +160,36 @@ def _service_config( ) -> ServiceConfig: """Build service config from environment and CLI toggles.""" global_config = Config.get_global() - return ServiceConfig( - backend_host=_resolve_host( - cli_value=server_host, - env_names=("FLOCKS_SERVER_HOST", "FLOCKS_BACKEND_HOST"), - default=default_server_host or global_config.server_host, - ), - backend_port=_resolve_port( - cli_value=server_port, - env_names=("FLOCKS_SERVER_PORT", "FLOCKS_BACKEND_PORT"), - default=default_server_port or global_config.server_port, - label="server", - ), - frontend_host=_resolve_host( - cli_value=webui_host, - env_names=("FLOCKS_WEBUI_HOST", "FLOCKS_FRONTEND_HOST"), - default=default_webui_host or "127.0.0.1", - ), - frontend_port=_resolve_port( - cli_value=webui_port, - env_names=("FLOCKS_WEBUI_PORT", "FLOCKS_FRONTEND_PORT"), - default=default_webui_port or 5173, - label="webui", - ), + return build_service_config( no_browser=no_browser, - skip_frontend_build=skip_webui_build, + skip_webui_build=skip_webui_build, + public_host=host, + public_port=port, + server_host=server_host, + server_port=server_port, + webui_host=webui_host, + webui_port=webui_port, + default_server_host=default_server_host or global_config.server_host, + default_server_port=default_server_port or global_config.server_port, + default_webui_host=default_webui_host or "127.0.0.1", 
+ default_webui_port=default_webui_port or 5173, ) -def _resolve_host(cli_value: Optional[str], env_names: tuple[str, ...], default: str) -> str: - """Resolve a host value from CLI, environment, and default values.""" - if cli_value is not None: - return cli_value - for env_name in env_names: - env_value = os.getenv(env_name) - if env_value: - return env_value - return default - - -def _resolve_port( - cli_value: Optional[int], - env_names: tuple[str, ...], - default: int, - label: str, -) -> int: - """Resolve a port value from CLI, environment, and default values.""" - if cli_value is not None: - return cli_value - for env_name in env_names: - env_value = os.getenv(env_name) - if not env_value: - continue - try: - return int(env_value) - except ValueError as error: - raise ServiceError(f"{label} port from {env_name} must be an integer.") from error - return default - - def _restart_runtime_defaults() -> dict[str, Any]: - """Load host/port defaults from the last recorded service runtime.""" - paths = runtime_paths() - backend = read_runtime_record(paths.backend_pid) - frontend = read_runtime_record(paths.frontend_pid) - defaults: dict[str, Any] = {} - if backend is not None: - if backend.host: - defaults["default_server_host"] = backend.host - if backend.port is not None: - defaults["default_server_port"] = backend.port - if frontend is not None: - if frontend.host: - defaults["default_webui_host"] = frontend.host - if frontend.port is not None: - defaults["default_webui_port"] = frontend.port - return defaults + """Load host/port defaults from the running supervisor when available.""" + try: + status = read_supervisor_status(paths=runtime_paths(), timeout=1.0) + except Exception: + return {} + return restart_defaults_from_status_payload(getattr(status, "raw", status)) def _restart_service_config( no_browser: bool = False, skip_webui_build: bool = False, + host: Optional[str] = None, + port: Optional[int] = None, server_host: Optional[str] = None, server_port: 
Optional[int] = None, webui_host: Optional[str] = None, @@ -243,6 +199,8 @@ def _restart_service_config( return _service_config( no_browser=no_browser, skip_webui_build=skip_webui_build, + host=host, + port=port, server_host=server_host, server_port=server_port, webui_host=webui_host, @@ -263,21 +221,25 @@ def start( skip_webui_build: bool = typer.Option( False, "--skip-webui-build", - help="Skip `npm run build` before starting WebUI", + help="Skip WebUI static asset build before starting Flocks service", ), + host: Optional[str] = typer.Option(None, "--host", "-h", help="Public service host"), + port: Optional[int] = typer.Option(None, "--port", "-p", help="Public service port"), server_host: Optional[str] = typer.Option(None, "--server-host", help="Backend server host"), server_port: Optional[int] = typer.Option(None, "--server-port", help="Backend server port"), webui_host: Optional[str] = typer.Option(None, "--webui-host", help="WebUI host"), webui_port: Optional[int] = typer.Option(None, "--webui-port", help="WebUI port"), ): """ - Start backend and WebUI in daemon mode + Start Flocks service in daemon mode. """ try: start_all( _service_config( no_browser=no_browser, skip_webui_build=skip_webui_build, + host=host, + port=port, server_host=server_host, server_port=server_port, webui_host=webui_host, @@ -292,7 +254,7 @@ def start( @app.command() def stop(): """ - Stop backend and WebUI + Stop Flocks service. 
""" try: stop_all(console) @@ -306,21 +268,25 @@ def restart( skip_webui_build: bool = typer.Option( False, "--skip-webui-build", - help="Skip `npm run build` before starting WebUI", + help="Skip WebUI static asset build before starting Flocks service", ), + host: Optional[str] = typer.Option(None, "--host", "-h", help="Public service host"), + port: Optional[int] = typer.Option(None, "--port", "-p", help="Public service port"), server_host: Optional[str] = typer.Option(None, "--server-host", help="Backend server host"), server_port: Optional[int] = typer.Option(None, "--server-port", help="Backend server port"), webui_host: Optional[str] = typer.Option(None, "--webui-host", help="WebUI host"), webui_port: Optional[int] = typer.Option(None, "--webui-port", help="WebUI port"), ): """ - Restart backend and WebUI + Restart Flocks service. """ try: restart_all( _restart_service_config( no_browser=no_browser, skip_webui_build=skip_webui_build, + host=host, + port=port, server_host=server_host, server_port=server_port, webui_host=webui_host, @@ -328,14 +294,14 @@ def restart( ), console, ) - except ServiceError as error: + except (ServiceConfigError, ServiceError) as error: _handle_service_error(error) @app.command() def status(): """ - Show backend and WebUI status + Show Flocks service status. """ try: show_status(console) @@ -345,13 +311,13 @@ def status(): @app.command() def logs( - backend: bool = typer.Option(False, "--backend", help="Only show backend logs"), - webui: bool = typer.Option(False, "--webui", help="Only show WebUI logs"), + backend: bool = typer.Option(False, "--backend", help="Only show service logs"), + webui: bool = typer.Option(False, "--webui", help="Only show service logs"), follow: bool = typer.Option(True, "--follow/--no-follow", help="Follow logs in real time"), lines: int = typer.Option(50, "--lines", "-n", min=0, help="Number of recent lines to show"), ): """ - Show backend and WebUI logs + Show Flocks service logs. 
""" try: show_logs(console, backend=backend, webui=webui, follow=follow, lines=lines) @@ -403,6 +369,39 @@ def serve( ) +@app.command(name="service-daemon", hidden=True) +def service_daemon( + server_host: str = typer.Option("127.0.0.1", "--server-host", help="Backend server host"), + server_port: int = typer.Option(5173, "--server-port", help="Public service port"), + webui_host: str = typer.Option("127.0.0.1", "--webui-host", help="WebUI host"), + webui_port: int = typer.Option(5173, "--webui-port", help="WebUI port"), + legacy_server_host: Optional[str] = typer.Option(None, "--legacy-server-host", help="Legacy backend host"), + legacy_server_port: Optional[int] = typer.Option(8000, "--legacy-server-port", help="Legacy backend port"), + server_port_migration_hint: bool = typer.Option( + False, + "--server-port-migration-hint", + help="Print server-port migration hint in parent CLI", + ), + skip_webui_build: bool = typer.Option(False, "--skip-webui-build", help="Skip WebUI static asset build"), +): + """ + Run the Flocks service supervisor daemon. 
+ """ + run_service_daemon( + ServiceConfig( + backend_host=server_host, + backend_port=server_port, + frontend_host=webui_host, + frontend_port=webui_port, + legacy_backend_host=legacy_server_host, + legacy_backend_port=legacy_server_port, + server_port_migration_hint=server_port_migration_hint, + no_browser=True, + skip_frontend_build=skip_webui_build, + ), + ) + + @app.command() def tui( directory: Optional[Path] = typer.Option(None, "--directory", "-d", help="Project directory"), diff --git a/flocks/cli/service_config.py b/flocks/cli/service_config.py new file mode 100644 index 000000000..7e6df10e2 --- /dev/null +++ b/flocks/cli/service_config.py @@ -0,0 +1,244 @@ +"""Service configuration model and serialization helpers.""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import Any + + +class ServiceConfigError(ValueError): + """Raised when service configuration input is invalid.""" + + +@dataclass(frozen=True) +class ServiceConfig: + backend_host: str = "127.0.0.1" + backend_port: int = 5173 + frontend_host: str = "127.0.0.1" + frontend_port: int = 5173 + legacy_backend_host: str | None = "127.0.0.1" + legacy_backend_port: int | None = 8000 + server_port_migration_hint: bool = False + no_browser: bool = False + skip_frontend_build: bool = False + + @property + def backend_url(self) -> str: + return f"http://{_format_host_for_url(loopback_host(self.backend_host))}:{self.backend_port}" + + @property + def frontend_url(self) -> str: + return self.backend_url + + @property + def legacy_cleanup_config(self) -> "ServiceConfig": + return ServiceConfig( + backend_host=self.legacy_backend_host or self.backend_host, + backend_port=self.legacy_backend_port or self.backend_port, + frontend_host=self.frontend_host, + frontend_port=self.frontend_port, + no_browser=self.no_browser, + server_port_migration_hint=self.server_port_migration_hint, + skip_frontend_build=self.skip_frontend_build, + ) + + +def loopback_host(host: 
str) -> str: + """Return a local access host for wildcard bind addresses.""" + return "127.0.0.1" if host in {"0.0.0.0", "::"} else host + + +def _format_host_for_url(host: str) -> str: + """Wrap IPv6 literals in brackets before composing URLs.""" + if ":" in host and not host.startswith("["): + return f"[{host}]" + return host + + +def service_config_payload(config: ServiceConfig) -> dict[str, object]: + """Serialize service config for the supervisor control API.""" + return { + "backend_host": config.backend_host, + "backend_port": config.backend_port, + "frontend_host": config.frontend_host, + "frontend_port": config.frontend_port, + "legacy_backend_host": config.legacy_backend_host, + "legacy_backend_port": config.legacy_backend_port, + "server_port_migration_hint": config.server_port_migration_hint, + "no_browser": config.no_browser, + "skip_frontend_build": config.skip_frontend_build, + } + + +def service_config_from_payload( + payload: dict[str, Any], + default: ServiceConfig | None = None, + *, + no_browser: bool | None = None, + skip_frontend_build: bool | None = None, +) -> ServiceConfig: + """Deserialize service config from a control or upgrade payload.""" + base = default or ServiceConfig() + resolved_skip_frontend_build = ( + _bool(payload.get("skip_frontend_build"), base.skip_frontend_build) + if skip_frontend_build is None + else skip_frontend_build + ) + resolved_no_browser = _bool(payload.get("no_browser"), base.no_browser) if no_browser is None else no_browser + return ServiceConfig( + backend_host=_string(payload.get("backend_host"), base.backend_host), + backend_port=_positive_int(payload.get("backend_port"), base.backend_port), + frontend_host=_string(payload.get("frontend_host"), base.frontend_host), + frontend_port=_positive_int(payload.get("frontend_port"), base.frontend_port), + legacy_backend_host=_optional_string(payload.get("legacy_backend_host"), base.legacy_backend_host), + 
legacy_backend_port=_optional_positive_int(payload.get("legacy_backend_port"), base.legacy_backend_port), + server_port_migration_hint=_bool(payload.get("server_port_migration_hint"), base.server_port_migration_hint), + no_browser=resolved_no_browser, + skip_frontend_build=resolved_skip_frontend_build, + ) + + +def service_config_from_status_payload( + payload: dict[str, Any], + *, + default: ServiceConfig | None = None, + no_browser: bool | None = None, + skip_frontend_build: bool | None = None, +) -> ServiceConfig: + """Extract service config from a supervisor status payload.""" + config = payload.get("config") if isinstance(payload.get("config"), dict) else {} + return service_config_from_payload( + config, + default=default, + no_browser=no_browser, + skip_frontend_build=skip_frontend_build, + ) + + +def restart_defaults_from_status_payload(payload: dict[str, Any]) -> dict[str, Any]: + """Return CLI default overrides from a supervisor status payload.""" + config = payload.get("config") if isinstance(payload.get("config"), dict) else {} + defaults: dict[str, Any] = {} + if isinstance(config.get("backend_host"), str): + defaults["default_server_host"] = config["backend_host"] + if _is_positive_int(config.get("backend_port")): + defaults["default_server_port"] = config["backend_port"] + if isinstance(config.get("frontend_host"), str): + defaults["default_webui_host"] = config["frontend_host"] + if _is_positive_int(config.get("frontend_port")): + defaults["default_webui_port"] = config["frontend_port"] + return defaults + + +def build_service_config( + *, + no_browser: bool = False, + skip_webui_build: bool = False, + public_host: str | None = None, + public_port: int | None = None, + server_host: str | None = None, + server_port: int | None = None, + webui_host: str | None = None, + webui_port: int | None = None, + default_server_host: str, + default_server_port: int, + default_webui_host: str = "127.0.0.1", + default_webui_port: int = 5173, +) -> ServiceConfig: + 
"""Build service config from CLI values, environment, and defaults. + + Static WebUI mode uses the old WebUI endpoint as the public FastAPI + listener so remote deployments keep their existing browser URL. + """ + explicit_public_host = _first_host(public_host, ("FLOCKS_HOST", "FLOCKS_PUBLIC_HOST")) + explicit_public_port = _first_port(public_port, ("FLOCKS_PORT", "FLOCKS_PUBLIC_PORT"), "public") + explicit_webui_host = _first_host(webui_host, ("FLOCKS_WEBUI_HOST", "FLOCKS_FRONTEND_HOST")) + explicit_webui_port = _first_port(webui_port, ("FLOCKS_WEBUI_PORT", "FLOCKS_FRONTEND_PORT"), "webui") + explicit_server_host = _first_host(server_host, ("FLOCKS_SERVER_HOST", "FLOCKS_BACKEND_HOST")) + explicit_server_port = _first_port(server_port, ("FLOCKS_SERVER_PORT", "FLOCKS_BACKEND_PORT"), "server") + + resolved_public_host = explicit_public_host or explicit_webui_host or explicit_server_host or default_webui_host + resolved_public_port = explicit_public_port or explicit_webui_port or explicit_server_port or default_webui_port + legacy_host = explicit_server_host or default_server_host + legacy_port = explicit_server_port or default_server_port + show_server_port_hint = ( + explicit_server_port is not None + and (explicit_public_port is not None or explicit_webui_port is not None) + and explicit_server_port != resolved_public_port + ) + + return ServiceConfig( + backend_host=resolved_public_host, + backend_port=resolved_public_port, + frontend_host=resolved_public_host, + frontend_port=resolved_public_port, + legacy_backend_host=legacy_host, + legacy_backend_port=legacy_port, + server_port_migration_hint=show_server_port_hint, + no_browser=no_browser, + skip_frontend_build=skip_webui_build, + ) + + +def with_frontend_build(config: ServiceConfig, *, skip_frontend_build: bool) -> ServiceConfig: + """Return config with only the WebUI build behavior changed.""" + return ServiceConfig( + backend_host=config.backend_host, + backend_port=config.backend_port, + 
frontend_host=config.frontend_host, + frontend_port=config.frontend_port, + legacy_backend_host=config.legacy_backend_host, + legacy_backend_port=config.legacy_backend_port, + server_port_migration_hint=config.server_port_migration_hint, + no_browser=config.no_browser, + skip_frontend_build=skip_frontend_build, + ) + + +def _first_host(cli_value: str | None, env_names: tuple[str, ...]) -> str | None: + if cli_value is not None: + return cli_value + for env_name in env_names: + env_value = os.getenv(env_name) + if env_value: + return env_value + return None + + +def _first_port(cli_value: int | None, env_names: tuple[str, ...], label: str) -> int | None: + if cli_value is not None: + return cli_value + for env_name in env_names: + env_value = os.getenv(env_name) + if not env_value: + continue + try: + return int(env_value) + except ValueError as error: + raise ServiceConfigError(f"{label} port from {env_name} must be an integer.") from error + return None + + +def _string(value: Any, fallback: str) -> str: + return value if isinstance(value, str) and value else fallback + + +def _optional_string(value: Any, fallback: str | None) -> str | None: + return value if isinstance(value, str) and value else fallback + + +def _positive_int(value: Any, fallback: int) -> int: + return value if _is_positive_int(value) else fallback + + +def _optional_positive_int(value: Any, fallback: int | None) -> int | None: + return value if _is_positive_int(value) else fallback + + +def _is_positive_int(value: Any) -> bool: + return isinstance(value, int) and not isinstance(value, bool) and value > 0 + + +def _bool(value: Any, fallback: bool) -> bool: + return value if isinstance(value, bool) else fallback diff --git a/flocks/cli/service_control.py b/flocks/cli/service_control.py new file mode 100644 index 000000000..34320fa9f --- /dev/null +++ b/flocks/cli/service_control.py @@ -0,0 +1,283 @@ +"""Local supervisor control API client helpers.""" + +from __future__ import annotations + 
+import os +import socket +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterator + +import httpx + +from flocks.cli.service_config import ServiceConfig, service_config_from_status_payload, service_config_payload + +SUPERVISOR_CONTROL_PORT = 48765 +SUPERVISOR_LOG_FILENAME = "daemon.log" +SUPERVISOR_SOCKET_FILENAME = "service-daemon.sock" + + +@dataclass(frozen=True) +class DaemonStatus: + pid: int | None + uptime: float | None + version: str | None + state: str + log_path: str | None + + +@dataclass(frozen=True) +class ManagedServiceStatus: + pid: int | None + host: str + port: int | None + state: str + health: str + last_error: str | None + restart_count: int + last_restart_at: float | None + log_path: str | None + command: tuple[str, ...] + paused: bool = False + + +@dataclass(frozen=True) +class SupervisorStatus: + daemon: DaemonStatus + backend: ManagedServiceStatus + webui: ManagedServiceStatus + config: ServiceConfig + raw: dict[str, Any] + + +def _default_runtime_paths(): + from flocks.cli.service_manager import runtime_paths + + return runtime_paths() + + +def supervisor_log_path(paths) -> Path: + """Return the supervisor daemon log path.""" + return paths.log_dir / SUPERVISOR_LOG_FILENAME + + +def supervisor_socket_path(paths) -> Path: + """Return the Unix control socket path for the supervisor daemon.""" + return paths.run_dir / SUPERVISOR_SOCKET_FILENAME + + +def supervisor_control_port() -> int: + """Return the local TCP control port used on Windows.""" + raw = os.getenv("FLOCKS_CONTROL_PORT") + if raw and raw.isdigit(): + value = int(raw) + if 0 < value < 65536: + return value + return SUPERVISOR_CONTROL_PORT + + +def supervisor_uses_tcp_control() -> bool: + """Return True when the daemon control API should use localhost TCP.""" + return sys.platform == "win32" or not hasattr(socket, "AF_UNIX") + + +def supervisor_control_client(paths=None, timeout: float | None = 2.0) -> httpx.Client: + """Create a 
client for the local daemon control API.""" + if supervisor_uses_tcp_control(): + return httpx.Client( + base_url=f"http://127.0.0.1:{supervisor_control_port()}", + timeout=timeout, + trust_env=False, + ) + current = paths or _default_runtime_paths() + transport = httpx.HTTPTransport(uds=str(supervisor_socket_path(current))) + return httpx.Client(base_url="http://flocks.local", timeout=timeout, trust_env=False, transport=transport) + + +def control_api_request( + method: str, + path: str, + *, + paths=None, + timeout: float | None = 2.0, + **kwargs, +) -> httpx.Response: + """Send one local control API request.""" + with supervisor_control_client(paths, timeout=timeout) as client: + response = client.request(method, path, **kwargs) + response.raise_for_status() + return response + + +def supervisor_is_running(paths=None) -> bool: + """Return True when the local supervisor control API responds.""" + try: + control_api_request("GET", "/status", paths=paths, timeout=0.75) + return True + except Exception: + return False + + +def _read_control_json(path: str, *, paths=None, timeout: float | None = 2.0) -> dict[str, Any]: + response = control_api_request("GET", path, paths=paths, timeout=timeout) + payload = response.json() + if not isinstance(payload, dict): + raise RuntimeError("daemon control API returned an invalid response.") + return payload + + +def _post_control_json( + path: str, + *, + payload: dict[str, Any] | None = None, + paths=None, + timeout: float | None = 5.0, +) -> dict[str, Any]: + response = control_api_request("POST", path, paths=paths, timeout=timeout, json=payload or {}) + data = response.json() + if not isinstance(data, dict): + raise RuntimeError("daemon control API returned an invalid response.") + return data + + +def read_supervisor_status(paths=None, timeout: float | None = 2.0) -> SupervisorStatus: + """Read and parse the current supervisor status.""" + return parse_supervisor_status(_read_control_json("/status", paths=paths, 
timeout=timeout)) + + +def request_stop(paths=None, timeout: float | None = 2.0) -> dict[str, Any]: + """Ask the supervisor daemon to stop itself and its children.""" + return _post_control_json("/stop", paths=paths, timeout=timeout) + + +def request_restart( + config: ServiceConfig, + *, + paths=None, + timeout: float | None = 180.0, +) -> SupervisorStatus: + """Ask the supervisor daemon to restart all managed services.""" + payload = _post_control_json("/restart", payload=service_config_payload(config), paths=paths, timeout=timeout) + return parse_supervisor_status(payload) + + +def request_restart_backend(*, paths=None, timeout: float | None = 180.0) -> SupervisorStatus: + """Ask the supervisor daemon to restart backend.""" + payload = _post_control_json("/restart/backend", paths=paths, timeout=timeout) + return parse_supervisor_status(payload) + + +def request_restart_webui( + config: ServiceConfig, + *, + force_frontend_build: bool = False, + paths=None, + timeout: float | None = 180.0, +) -> SupervisorStatus: + """Ask the supervisor daemon to restart WebUI.""" + payload = service_config_payload(config) + if force_frontend_build: + payload["force_frontend_build"] = True + data = _post_control_json("/restart/webui", payload=payload, paths=paths, timeout=timeout) + return parse_supervisor_status(data) + + +def request_prepare_upgrade(*, paths=None, timeout: float | None = 30.0) -> SupervisorStatus: + """Ask the supervisor daemon to pause managed services for upgrade handoff.""" + payload = _post_control_json("/upgrade/prepare", paths=paths, timeout=timeout) + return parse_supervisor_status(payload) + + +def request_resume_upgrade( + config: ServiceConfig, + *, + paths=None, + timeout: float | None = 180.0, +) -> SupervisorStatus: + """Ask the supervisor daemon to resume managed services after upgrade handoff.""" + payload = _post_control_json("/upgrade/resume", payload=service_config_payload(config), paths=paths, timeout=timeout) + return 
parse_supervisor_status(payload) + + +def read_logs( + *, + service: str, + lines: int, + paths=None, + timeout: float | None = 5.0, +) -> dict[str, Any]: + """Read recent service logs through the supervisor control API.""" + return _read_control_json( + f"/logs?service={service}&lines={lines}&follow=false", + paths=paths, + timeout=timeout, + ) + + +def stream_logs( + *, + service: str, + lines: int, + paths=None, + timeout: float | None = None, +) -> Iterator[str]: + """Stream service logs through the supervisor control API.""" + params = {"service": service, "lines": str(lines), "follow": "true"} + with supervisor_control_client(paths, timeout=timeout) as client: + with client.stream("GET", "/logs", params=params) as response: + response.raise_for_status() + yield from response.iter_lines() + + +def parse_supervisor_status(payload: dict[str, Any]) -> SupervisorStatus: + """Parse a supervisor status payload into typed status objects.""" + daemon = payload.get("daemon") if isinstance(payload.get("daemon"), dict) else {} + backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} + webui = payload.get("webui") if isinstance(payload.get("webui"), dict) else {} + return SupervisorStatus( + daemon=_parse_daemon_status(daemon), + backend=_parse_service_status(backend), + webui=_parse_service_status(webui), + config=service_config_from_status_payload(payload), + raw=payload, + ) + + +def _parse_daemon_status(payload: dict[str, Any]) -> DaemonStatus: + return DaemonStatus( + pid=_optional_int(payload.get("pid")), + uptime=_optional_float(payload.get("uptime")), + version=str(payload["version"]) if payload.get("version") is not None else None, + state=str(payload.get("state") or "unknown"), + log_path=str(payload["log_path"]) if payload.get("log_path") is not None else None, + ) + + +def _parse_service_status(payload: dict[str, Any]) -> ManagedServiceStatus: + command = payload.get("command") if isinstance(payload.get("command"), list) else [] + 
return ManagedServiceStatus( + pid=_optional_int(payload.get("pid")), + host=str(payload.get("host") or "127.0.0.1"), + port=_optional_int(payload.get("port")), + state=str(payload.get("state") or "unknown"), + health=str(payload.get("health") or payload.get("state") or "unknown"), + last_error=str(payload["last_error"]) if payload.get("last_error") is not None else None, + restart_count=_optional_int(payload.get("restart_count")) or 0, + last_restart_at=_optional_float(payload.get("last_restart_at")), + log_path=str(payload["log_path"]) if payload.get("log_path") is not None else None, + command=tuple(str(item) for item in command), + paused=bool(payload.get("paused")), + ) + + +def _optional_int(value: Any) -> int | None: + return value if isinstance(value, int) and not isinstance(value, bool) else None + + +def _optional_float(value: Any) -> float | None: + if isinstance(value, bool): + return None + if isinstance(value, (float, int)): + return float(value) + return None diff --git a/flocks/cli/service_manager.py b/flocks/cli/service_manager.py index df534ffe7..cc4ce8d44 100644 --- a/flocks/cli/service_manager.py +++ b/flocks/cli/service_manager.py @@ -21,11 +21,23 @@ from dataclasses import dataclass from pathlib import Path from shutil import which -from typing import Iterable, Sequence +from typing import Any, Iterable, Sequence import httpx from flocks.browser.admin import stop_all_daemons as stop_all_browser_daemons +from flocks.cli.service_config import ServiceConfig, loopback_host +from flocks.cli.service_control import ( + read_logs, + read_supervisor_status, + request_restart, + request_stop, + stream_logs, + supervisor_is_running, + supervisor_log_path, + supervisor_socket_path, + supervisor_uses_tcp_control, +) try: import fcntl @@ -48,26 +60,14 @@ "src\\win\\async.c", "src/win/async.c", ) +WATCHDOG_PID_FILENAME = "watchdog.pid" +SUPERVISOR_START_TIMEOUT_SECONDS = 180.0 class ServiceError(RuntimeError): """Raised when a service lifecycle action 
fails.""" -@dataclass(frozen=True) -class ServiceConfig: - backend_host: str = "127.0.0.1" - backend_port: int = 8000 - frontend_host: str = "127.0.0.1" - frontend_port: int = 5173 - no_browser: bool = False - skip_frontend_build: bool = False - - @property - def frontend_url(self) -> str: - return f"http://{_loopback_host(self.frontend_host)}:{self.frontend_port}" - - @dataclass(frozen=True) class RuntimePaths: root: Path @@ -144,13 +144,23 @@ def ensure_runtime_dirs(paths: RuntimePaths | None = None) -> RuntimePaths: return current +def watchdog_pid_path(paths: RuntimePaths) -> Path: + """Return the watchdog runtime record path.""" + return paths.run_dir / WATCHDOG_PID_FILENAME + + def ensure_install_layout(root: Path | None = None) -> Path: """Validate that the installed repo still contains backend and WebUI code.""" current = root or repo_root() + from flocks.server.static_webui import resolve_webui_dist_dir + if not (current / "pyproject.toml").exists(): - raise ServiceError(f"未找到安装目录中的 pyproject.toml: {current}") + if resolve_webui_dist_dir() is None: + raise ServiceError(f"未找到安装目录中的 pyproject.toml 或 WebUI 静态资源: {current}") + return current if not (current / "webui" / "package.json").exists(): - raise ServiceError("未找到 WebUI 源码,请重新安装 Flocks,或设置 FLOCKS_REPO_ROOT 指向有效安装目录。") + if resolve_webui_dist_dir() is None: + raise ServiceError("未找到 WebUI 静态资源,请重新安装 Flocks,或设置 FLOCKS_REPO_ROOT 指向有效安装目录。") return current @@ -393,36 +403,15 @@ def read_runtime_record(pid_file: Path) -> RuntimeRecord | None: return _parse_runtime_record(raw) -def write_runtime_record(pid_file: Path, record: RuntimeRecord) -> None: - """Persist runtime metadata in a backward-compatible JSON format.""" - payload: dict[str, object] = {"pid": record.pid} - if record.pgid is not None: - payload["pgid"] = record.pgid - if record.host is not None: - payload["host"] = record.host - if record.port is not None: - payload["port"] = record.port - if record.command: - payload["command"] = 
list(record.command) - if record.started_at is not None: - payload["started_at"] = record.started_at - pid_file.write_text(json.dumps(payload, ensure_ascii=True, sort_keys=True), encoding="utf-8") - - def process_runtime_record( process: subprocess.Popen, *, - host: str, - port: int, + host: str | None, + port: int | None, command: Sequence[str], ) -> RuntimeRecord: """Build runtime metadata for a freshly started service process.""" - pgid = None - if sys.platform != "win32": - try: - pgid = os.getpgid(process.pid) - except OSError: - pgid = None + pgid = _process_group_id(process) return RuntimeRecord( pid=process.pid, pgid=pgid, @@ -433,17 +422,30 @@ def process_runtime_record( ) +def _process_group_id(process: subprocess.Popen) -> int | None: + """Return a cached or live Unix process group id for a managed process.""" + if sys.platform == "win32": + return None + cached = getattr(process, "_flocks_pgid", None) + if isinstance(cached, int) and cached > 0: + return cached + try: + pgid = os.getpgid(process.pid) + except OSError: + return None + try: + setattr(process, "_flocks_pgid", pgid) + except Exception: + pass + return pgid + + def read_pid(pid_file: Path) -> int | None: """Read a pid file if it exists and contains a valid integer.""" record = read_runtime_record(pid_file) return record.pid if record else None -def write_pid(pid_file: Path, pid: int) -> None: - """Persist a process id.""" - write_runtime_record(pid_file, RuntimeRecord(pid=pid)) - - def _unix_process_stat(pid: int) -> str | None: """Return the Unix process status code for a pid, if available.""" if sys.platform == "win32" or pid <= 0: @@ -766,12 +768,6 @@ def _resolve_upgrade_runtime(console, *, frontend_port: int, attempt_recover: bo return result -def _effective_frontend_port(paths: RuntimePaths, default: int) -> int: - recorded_port = _recorded_port(paths.frontend_pid, default) - upgrade_info = _read_upgrade_runtime_info(recorded_port) - return upgrade_info.frontend_port or recorded_port - 
def _process_command_line(pid: int) -> str:
    """Return a process command line for best-effort orphan detection.

    Args:
        pid: Process id to inspect. Non-positive ids are never queried.

    Returns:
        The command line text, or ``""`` when the pid is invalid, the process
        cannot be inspected, or the platform tool (``ps``) is unavailable.
    """
    if pid <= 0:
        return ""
    if sys.platform == "win32":
        snapshot = _windows_process_snapshot(pid)
        return str(snapshot.get("command_line") or "") if snapshot else ""
    try:
        completed = subprocess.run(
            ["ps", "-p", str(pid), "-o", "command="],
            check=False,
            capture_output=True,
            text=True,
            # Command lines may contain non-UTF-8 bytes; never fail on decoding.
            errors="replace",
        )
    except OSError:
        # `ps` is missing or not executable (e.g. minimal containers). This is
        # a best-effort probe, so report "unknown" instead of crashing callers.
        return ""
    return completed.stdout.strip()
def _terminate_orphan_pid(pid: int, label: str, console, *, timeout: float = 5.0) -> None:
    """Terminate a trusted orphan process tree by pid.

    On Windows the tree is force-killed with ``taskkill /T /F`` and the
    function returns immediately. Elsewhere SIGTERM is sent first, the
    targets are polled every 0.25s for up to *timeout* seconds, and SIGKILL
    is sent if they are still alive after that.

    Args:
        pid: Root pid of the orphan to terminate.
        label: Human-readable service name used in the console message.
        console: Object exposing ``print``; receives the progress message.
        timeout: Seconds to wait after SIGTERM before escalating to SIGKILL.
    """
    console.print(f"[flocks] 清理残留 {label} 进程(PID={pid})...")
    if sys.platform == "win32":
        subprocess.run(["taskkill", "/PID", str(pid), "/T", "/F"], check=False, capture_output=True)
        return

    pgid: int | None = None
    try:
        candidate_pgid = os.getpgid(pid)
        # Never target our own process group: that would signal this CLI too.
        if candidate_pgid != os.getpgrp():
            pgid = candidate_pgid
    except OSError:
        pgid = None

    # The pid list is captured once, before signalling, and reused for the
    # SIGKILL pass below.
    targets = collect_process_tree_pids(pid)
    signal_process_group(signal.SIGTERM, pgid)
    signal_pid_list(signal.SIGTERM, targets)
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if not any(pid_is_running(target) for target in targets) and not process_group_is_running(pgid):
            return
        time.sleep(0.25)
    # Graceful shutdown timed out: escalate. No further wait is performed here.
    signal_process_group(signal.SIGKILL, pgid)
    signal_pid_list(signal.SIGKILL, targets)
def _process_list_pids() -> list[int]:
    """Return process ids for best-effort trusted orphan cleanup.

    Uses PowerShell (``Get-CimInstance Win32_Process``) on Windows and
    ``ps -eo pid=`` elsewhere.

    Returns:
        Sorted, de-duplicated pids. An empty list when the platform tool is
        missing, cannot be executed, or exits with a non-zero status.
    """
    if sys.platform == "win32":
        command = [
            "powershell",
            "-NoProfile",
            "-Command",
            "Get-CimInstance Win32_Process | ForEach-Object { $_.ProcessId }",
        ]
    else:
        command = ["ps", "-eo", "pid="]

    try:
        completed = subprocess.run(
            command,
            check=False,
            capture_output=True,
            text=True,
            # Tool output is only scanned for digits; never fail on decoding.
            errors="replace",
        )
    except OSError:
        # Tool not installed or not executable: cleanup is best-effort, so
        # report "no processes found" instead of aborting the caller.
        return []
    if completed.returncode != 0:
        return []

    stripped = (line.strip() for line in completed.stdout.splitlines())
    return sorted({int(value) for value in stripped if value.isdigit()})
def _trusted_flocks_daemon_owner(pid: int, *, root: Path) -> bool:
    """Tell whether *pid* looks like a service daemon of this Flocks install.

    The current process and non-positive pids are never trusted. Otherwise
    the lower-cased command line must contain ``service-daemon``, ``flocks``
    and the lower-cased *root* path.
    """
    if pid == os.getpid() or pid < 1:
        return False
    lowered = _process_command_line(pid).lower()
    required_markers = ("service-daemon", "flocks", str(root).lower())
    return bool(lowered) and all(marker in lowered for marker in required_markers)
class _StdoutConsole:
    """Minimal console stand-in that writes plain lines to ``sys.stdout``.

    Exposes the same ``print`` entry point the lifecycle helpers call on a
    console object; keyword arguments are accepted and ignored.
    """

    def print(self, *args, **_kwargs) -> None:
        """Write the space-joined ``str()`` of *args* as one line and flush."""
        line = " ".join(map(str, args))
        sys.stdout.write(line + "\n")
        sys.stdout.flush()
signal_pid_list(signal.SIGTERM, collect_process_tree_pids(process.pid)) + + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if process.poll() is not None and not process_group_is_running(record.pgid): return - raise ServiceError( - f"后端端口 {config.backend_port} 已被占用 (PID: {_join_pids(listeners)})," - "与当前运行时记录不一致,请先执行 `flocks stop` 或手动清理残留进程。" - ) - if port_is_in_use(config.backend_port, listeners): - raise ServiceError( - f"后端端口 {config.backend_port} 已被占用,但当前环境无法识别占用 PID;" - "请先安装 lsof 或手动清理残留进程。" - ) + time.sleep(0.25) - if runtime_record is not None and runtime_record_is_running(runtime_record): - raise ServiceError( - "后端运行记录仍存活,但端口未监听;请先执行 `flocks stop` 清理异常状态后重试。" - ) + console.print(f"[flocks] {name} 未在预期时间内退出,强制终止...") + if sys.platform == "win32": + if process.poll() is None: + subprocess.run(["taskkill", "/PID", str(process.pid), "/T", "/F"], check=False, capture_output=True) + else: + if record.pgid is not None: + signal_process_group(signal.SIGKILL, record.pgid) + signal_pid_list(signal.SIGKILL, collect_process_tree_pids(process.pid)) - if runtime_record is not None: - paths.backend_pid.unlink(missing_ok=True) +def _backend_command_and_env(root: Path, config: ServiceConfig) -> tuple[list[str], dict[str, str]]: + """Build the backend service command and environment.""" command = resolve_flocks_cli_command(root) + [ "serve", "--host", @@ -940,201 +1140,161 @@ def start_backend(config: ServiceConfig, console) -> None: "--port", str(config.backend_port), ] + env = os.environ.copy() + env["_FLOCKS_WEBUI_HOST"] = config.frontend_host + env["_FLOCKS_WEBUI_PORT"] = str(config.frontend_port) + env["PYTHONUNBUFFERED"] = "1" + env.setdefault("FLOCKS_CONSOLE_BASE_URL", DEFAULT_FLOCKS_CONSOLE_BASE_URL) + return command, env - backend_env = os.environ.copy() - backend_env["_FLOCKS_WEBUI_HOST"] = config.frontend_host - backend_env["_FLOCKS_WEBUI_PORT"] = str(config.frontend_port) - backend_env["PYTHONUNBUFFERED"] = "1" - 
backend_env.setdefault("FLOCKS_CONSOLE_BASE_URL", DEFAULT_FLOCKS_CONSOLE_BASE_URL) - - console.print("[flocks] 启动后端服务...") - process = _spawn_process( - command, - cwd=root, - log_path=paths.backend_log, - env=backend_env, - ) - write_runtime_record( - paths.backend_pid, - process_runtime_record( - process, - host=config.backend_host, - port=config.backend_port, - command=command, - ), - ) - _log_startup_config(paths.backend_log, "backend", config.backend_host, config.backend_port, read_runtime_record(paths.backend_pid)) - try: - wait_for_http( - [backend_access_base_url(config)], - "后端服务", - delay=3.0, - validator=_is_running_status_response, +def _build_webui_dist(root: Path, config: ServiceConfig, console) -> None: + """Build the production WebUI static bundle.""" + npm = resolve_npm_executable() + if not npm: + raise ServiceError("WebUI dist 不存在,且未检测到 npm;请先安装 Node.js 22+(包含 npm)后重试。") + if not node_version_satisfies_requirement(): + raise ServiceError(f"检测到的 Node.js 版本过低。构建 WebUI 至少需要 Node.js {MIN_NODE_MAJOR}+。") + + webui_dir = root / "webui" + if not (webui_dir / "package.json").exists(): + raise ServiceError("未找到 WebUI 源码,无法构建静态资源。") + + console.print("[flocks] 准备 Flocks 静态资源...") + frontend_env = build_frontend_env(config) + run_kwargs: dict[str, object] = {"cwd": webui_dir, "check": False, "env": frontend_env} + if sys.platform == "win32": + run_kwargs.update({"capture_output": True, "text": True, "encoding": "utf-8", "errors": "replace"}) + completed = subprocess.run([npm, "run", "build"], **run_kwargs) + if completed.returncode != 0: + output = "\n".join( + value for value in (getattr(completed, "stdout", None), getattr(completed, "stderr", None)) if value ) - except ServiceError: - _emit_service_log_tail(console, paths.backend_log, "后端") - stop_one(config.backend_port, paths.backend_pid, "后端", console) - raise + if windows_frontend_build_assertion_is_recoverable(webui_dir, output): + console.print("[flocks] WebUI 构建产物已生成,忽略 Windows Node.js 退出断言。") + 
def _cleanup_backend_start_port(port: int, console, *, root: Path) -> list[int]:
    """Remove trusted Flocks leftovers still bound to the public service port.

    Backend-style owners are cleaned first, then WebUI-style owners, both
    against the same *port*.

    Returns:
        The sorted, de-duplicated pids reported as cleaned by either pass.
    """
    cleaned: set[int] = set()
    for service, label in (("backend", "后端"), ("webui", "WebUI")):
        cleaned.update(
            cleanup_trusted_port_owners(
                port,
                service=service,
                label=label,
                console=console,
                root=root,
            )
        )
    return sorted(cleaned)
def _start_backend_process(
    config: ServiceConfig,
    console,
    *,
    paths: RuntimePaths | None = None,
) -> subprocess.Popen:
    """Start the backend child process for the supervisor.

    Steps, in order: validate the install layout, make sure the WebUI static
    bundle exists, free the backend port from trusted leftovers, spawn the
    backend, log the startup config, then wait for the backend to answer
    over HTTP.

    Args:
        config: Service endpoints and build toggles.
        console: Object exposing ``print``; receives progress and log tails.
        paths: Runtime paths to use; ``ensure_runtime_dirs()`` when ``None``.

    Returns:
        The spawned backend process once the HTTP readiness check passed.

    Raises:
        ServiceError: When the backend port stays occupied, or when the
            readiness wait fails (re-raised after the child is terminated).
    """
    root = ensure_install_layout()
    current = paths if paths is not None else ensure_runtime_dirs()
    _ensure_webui_dist(root, config, console)

    listeners = port_owner_pids(config.backend_port)
    if listeners:
        # Try to clean trusted leftovers first, then re-check who still
        # owns the port.
        _cleanup_backend_start_port(config.backend_port, console, root=root)
        listeners = port_owner_pids(config.backend_port)
        if listeners:
            raise ServiceError(
                f"server 端口 {config.backend_port} 已被占用 (PID: {_join_pids(listeners)}),"
                "请先执行 `flocks stop` 或手动清理残留进程。"
            )
    # No owner pid is known here, but the port may still be reported as busy.
    if port_is_in_use(config.backend_port, listeners):
        raise ServiceError(
            f"server 端口 {config.backend_port} 已被占用,但当前环境无法识别占用 PID;"
            "请先安装 lsof 或手动清理残留进程。"
        )

    command, env = _backend_command_and_env(root, config)
    process = _spawn_process(command, cwd=root, log_path=current.backend_log, env=env)
    # The record is only passed to the startup log; this function does not
    # write a pid file.
    record = process_runtime_record(
        process,
        host=config.backend_host,
        port=config.backend_port,
        command=command,
    )
    _log_startup_config(current.backend_log, "backend", config.backend_host, config.backend_port, record)

    try:
        wait_for_http(
            [backend_access_base_url(config)],
            "后端服务",
            delay=3.0,
            validator=_is_running_status_response,
        )
    except ServiceError:
        # Show the log tail, then terminate the spawned child so a failed
        # start leaves no half-started backend behind.
        _emit_service_log_tail(console, current.backend_log, "后端")
        _terminate_process(process, "后端", console)
        raise
    return process
signal_process_group(signal.SIGKILL, record.pgid) + signal_pid_list(signal.SIGKILL, targets) - -def _current_stop_targets( - port: int, - record: RuntimeRecord | None, - tracked_pids: Iterable[int], -) -> list[int]: - """Refresh the pid list that stop_one() should verify or force kill.""" - result = append_unique_pids([], tracked_pids) - result = append_unique_pids(result, _runtime_record_pids(record)) - return append_unique_pids(result, port_owner_pids(port)) + pid_file.unlink(missing_ok=True) def signal_process_group(sig: signal.Signals, pgid: int | None) -> None: @@ -1147,86 +1307,8 @@ def signal_process_group(sig: signal.Signals, pgid: int | None) -> None: pass -def stop_one(port: int, pid_file: Path, name: str, console) -> None: - """Stop a single service by tracked pid and/or listening port.""" - cleanup_stale_pid_file(pid_file) - runtime_record = read_runtime_record(pid_file) - tracked_pid = runtime_record.pid if runtime_record else None - listeners = port_owner_pids(port) - - target_pids: list[int] = [] - if tracked_pid is not None: - target_pids = append_unique_pids(target_pids, collect_process_tree_pids(tracked_pid)) - target_pids = append_unique_pids(target_pids, listeners) - if sys.platform == "win32" and runtime_record is not None: - filtered_targets: list[int] = [] - for pid in target_pids: - if pid in listeners: - filtered_targets = append_unique_pids(filtered_targets, [pid]) - continue - if pid == runtime_record.pid and not _windows_runtime_record_matches_pid(runtime_record, pid, listeners): - continue - filtered_targets = append_unique_pids(filtered_targets, [pid]) - target_pids = filtered_targets - - group_running = process_group_is_running(runtime_record.pgid if runtime_record else None) - if not target_pids and not group_running: - if port_is_in_use(port, listeners): - raise ServiceError( - f"{name} 端口 {port} 已被占用,但当前环境无法识别占用 PID;" - "请先安装 lsof 或手动处理该进程。" - ) - pid_file.unlink(missing_ok=True) - console.print(f"[flocks] {name} 未运行。") - return - 
- details = _join_pids(target_pids) if target_pids else "none" - if runtime_record and runtime_record.pgid is not None and sys.platform != "win32": - details = f"{details}; PGID={runtime_record.pgid}" - console.print(f"[flocks] 停止 {name}(端口 {port},PID: {details})...") - - if sys.platform == "win32": - for pid in target_pids: - subprocess.run(["taskkill", "/PID", str(pid), "/T", "/F"], check=False, capture_output=True) - else: - if runtime_record and runtime_record.pgid is not None: - signal_process_group(signal.SIGTERM, runtime_record.pgid) - else: - signal_pid_list(signal.SIGTERM, target_pids) - for _ in range(10): - current_targets = _current_stop_targets(port, runtime_record, target_pids) - if _tracked_processes_stopped(port, runtime_record, current_targets): - pid_file.unlink(missing_ok=True) - console.print(f"[flocks] {name} 已停止。") - return - time.sleep(1) - - console.print(f"[flocks] {name} 未在预期时间内退出,强制终止...") - force_targets = _current_stop_targets(port, runtime_record, target_pids) - if runtime_record and runtime_record.pgid is not None: - signal_process_group(signal.SIGKILL, runtime_record.pgid) - signal_pid_list(signal.SIGKILL, force_targets) - - for _ in range(10): - force_targets = _current_stop_targets(port, runtime_record, target_pids) - if _tracked_processes_stopped(port, runtime_record, force_targets): - pid_file.unlink(missing_ok=True) - console.print(f"[flocks] {name} 已停止。") - return - if sys.platform == "win32": - for pid in force_targets: - subprocess.run(["taskkill", "/PID", str(pid), "/T", "/F"], check=False, capture_output=True) - else: - if runtime_record and runtime_record.pgid is not None: - signal_process_group(signal.SIGKILL, runtime_record.pgid) - signal_pid_list(signal.SIGKILL, force_targets) - time.sleep(1) - - raise ServiceError(f"{name} 未在预期时间内退出,请手动检查端口 {port}。") - - def _recorded_port(pid_file: Path, default: int) -> int: - """Return the port from a runtime record, falling back to *default*.""" + """Return the port from a legacy 
runtime record, falling back to *default*.""" record = read_runtime_record(pid_file) if record is not None and record.port is not None: return record.port @@ -1234,7 +1316,7 @@ def _recorded_port(pid_file: Path, default: int) -> int: def _recorded_host(pid_file: Path, default: str) -> str: - """Return the host from a runtime record, falling back to *default*.""" + """Return the host from a legacy runtime record, falling back to *default*.""" record = read_runtime_record(pid_file) if record is not None and record.host: return record.host @@ -1243,7 +1325,7 @@ def _recorded_host(pid_file: Path, default: str) -> str: @contextlib.contextmanager def service_lock(paths: RuntimePaths): - """Serialize lifecycle commands with a cross-process lock file.""" + """Serialize CLI lifecycle commands while starting/stopping the daemon.""" lock_path = paths.run_dir / "service.lock" lock_path.parent.mkdir(parents=True, exist_ok=True) handle = lock_path.open("a+", encoding="utf-8") @@ -1295,157 +1377,429 @@ def _log_startup_config( handle.write(line) -def _resolve_stop_ports( +def _wait_for_supervisor_ready( paths: RuntimePaths, - config: ServiceConfig | None = None, -) -> tuple[int, int]: - """Resolve frontend/backend ports for stop flows. 
def _startup_payload_is_ready(payload: dict[str, Any]) -> bool:
    """Tell whether a startup status payload describes a usable service.

    Ready means the ``backend`` section reports state ``healthy`` and the
    ``webui`` section reports ``healthy`` or ``static``, compared
    case-insensitively. Missing or non-dict sections count as not ready.
    """

    def section_state(name: str) -> str:
        section = payload.get(name)
        if not isinstance(section, dict):
            return ""
        return str(section.get("state") or "").lower()

    if section_state("backend") != "healthy":
        return False
    return section_state("webui") in ("healthy", "static")
def _start_supervisor_process(config: ServiceConfig, paths: RuntimePaths, console) -> subprocess.Popen:
    """Launch the service supervisor daemon as a separate process.

    The daemon is started through the Flocks CLI ``service-daemon`` command
    with the endpoints from *config*. Unless the supervisor uses TCP control,
    any existing control socket file is removed first.

    Returns:
        The process handle returned by ``_spawn_process``.
    """
    root = ensure_install_layout()
    log_path = supervisor_log_path(paths)
    if not supervisor_uses_tcp_control():
        supervisor_socket_path(paths).unlink(missing_ok=True)

    command = resolve_flocks_cli_command(root) + [
        "service-daemon",
        "--server-host",
        config.backend_host,
        "--server-port",
        str(config.backend_port),
        "--webui-host",
        config.frontend_host,
        "--webui-port",
        str(config.frontend_port),
    ]
    # Optional flags are appended in a fixed order, each only when set.
    if config.legacy_backend_host is not None:
        command += ["--legacy-server-host", config.legacy_backend_host]
    if config.legacy_backend_port is not None:
        command += ["--legacy-server-port", str(config.legacy_backend_port)]
    if config.server_port_migration_hint:
        command += ["--server-port-migration-hint"]
    if config.skip_frontend_build:
        command += ["--skip-webui-build"]

    child_env = os.environ.copy()
    child_env["PYTHONUNBUFFERED"] = "1"
    return _spawn_process(command, cwd=root, log_path=log_path, env=child_env)
- """ - frontend_default = config.frontend_port if config is not None else ServiceConfig.frontend_port - backend_default = config.backend_port if config is not None else ServiceConfig.backend_port + +def _service_config_matches(left: ServiceConfig, right: ServiceConfig) -> bool: + """Return True when two configs manage the same service endpoints.""" return ( - _effective_frontend_port(paths, frontend_default), - _recorded_port(paths.backend_pid, backend_default), + left.backend_host == right.backend_host + and left.backend_port == right.backend_port + and left.frontend_host == right.frontend_host + and left.frontend_port == right.frontend_port ) -def _stop_all_locked( - paths: RuntimePaths, - console, - *, - config: ServiceConfig | None = None, -) -> None: - """Stop frontend then backend while reusing the caller's lock.""" - fe_port, be_port = _resolve_stop_ports(paths, config) - try: - _resolve_upgrade_runtime(console, frontend_port=fe_port, attempt_recover=False) - stop_one(fe_port, paths.frontend_pid, "WebUI", console) - stop_one(be_port, paths.backend_pid, "后端", console) - finally: +def _supervisor_backend_is_healthy(status) -> bool: + """Return whether a supervisor status represents an accessible Flocks service.""" + return ( + not status.backend.paused + and status.backend.state.lower() == "healthy" + and status.backend.health.lower() == "healthy" + ) + + +def _legacy_runtime_config(paths: RuntimePaths, fallback: ServiceConfig) -> ServiceConfig: + """Build cleanup config from legacy runtime records when present.""" + return ServiceConfig( + backend_host=_recorded_host(paths.backend_pid, fallback.backend_host), + backend_port=_recorded_port(paths.backend_pid, fallback.backend_port), + frontend_host=_recorded_host(paths.frontend_pid, fallback.frontend_host), + frontend_port=_recorded_port(paths.frontend_pid, fallback.frontend_port), + legacy_backend_host=fallback.legacy_backend_host, + legacy_backend_port=fallback.legacy_backend_port, + 
no_browser=fallback.no_browser, + skip_frontend_build=fallback.skip_frontend_build, + ) + + +def _unique_cleanup_configs(*configs: ServiceConfig) -> list[ServiceConfig]: + """Deduplicate cleanup configs by backend/WebUI ports.""" + result: list[ServiceConfig] = [] + seen: set[tuple[int, int, int | None]] = set() + for config in configs: + key = (config.backend_port, config.frontend_port, config.legacy_backend_port) + if key in seen: + continue + seen.add(key) + result.append(config) + return result + + +def cleanup_legacy_runtime_processes(paths: RuntimePaths, console) -> None: + """Clean legacy pid/runtime records left by pre-daemon service starts.""" + for pid_file, name in ( + (watchdog_pid_path(paths), "watchdog"), + (paths.frontend_pid, "WebUI"), + (paths.backend_pid, "后端"), + ): + stop_runtime_record_process(pid_file, name, console) + + +def _stop_all_unlocked(console, *, paths: RuntimePaths) -> None: + """Stop managed services; caller must hold the lifecycle lock.""" + cleanup_config = ServiceConfig() + legacy_config = _legacy_runtime_config(paths, cleanup_config) + stop_status = None + if not supervisor_is_running(paths): + console.print("[flocks] Flocks daemon 未运行。") + cleanup_legacy_runtime_processes(paths, console) + cleanup_orphan_service_ports(cleanup_config, console, extra_configs=[legacy_config]) stop_all_browser_daemons() + return + try: + stop_status = read_supervisor_status(paths=paths, timeout=1.0) + cleanup_config = stop_status.config + legacy_config = _legacy_runtime_config(paths, cleanup_config) + except Exception: + pass + try: + request_stop(paths=paths, timeout=2.0) + except Exception as exc: + raise ServiceError(f"无法请求 Flocks daemon 停止: {exc}") from exc + + deadline = time.monotonic() + 20.0 + while time.monotonic() < deadline: + if not supervisor_is_running(paths): + cleanup_legacy_runtime_processes(paths, console) + cleanup_orphan_service_ports(cleanup_config, console, extra_configs=[legacy_config]) + stop_all_browser_daemons() + 
_print_stop_summary(console, stop_status) + return + time.sleep(0.5) + raise ServiceError("Flocks daemon 未在预期时间内退出。") def stop_all(console) -> None: - """Stop frontend then backend using ports persisted in runtime records.""" + """Stop managed services through the supervisor control API.""" paths = ensure_runtime_dirs() with service_lock(paths): - _stop_all_locked(paths, console) + _stop_all_unlocked(console, paths=paths) def _start_all_without_stop(config: ServiceConfig, console) -> None: - """Start backend and frontend, then print access summary.""" - ensure_runtime_dirs() - start_backend(config, console) - start_frontend(config, console) - show_start_summary(config, console) + """Start the supervisor daemon, then print access summary.""" + paths = ensure_runtime_dirs() + _print_static_port_migration_hint(config, console) + console.print("[flocks] Flocks daemon 启动中...") + cleanup_legacy_runtime_processes(paths, console) + cleanup_orphan_service_ports(config, console) + _ensure_webui_dist(ensure_install_layout(), config, console) + process = _start_supervisor_process(config, paths, console) + console.print("[flocks] Flocks daemon 已启动。") + payload = _wait_for_supervisor_ready(paths, process=process) + _print_status_payload(payload, console, include_daemon_step=False) + if not _startup_payload_is_ready(payload): + raise ServiceError(_startup_failure_message(payload)) if not config.no_browser: open_default_browser(config.frontend_url, console) +def _start_all_unlocked(config: ServiceConfig, console, *, paths: RuntimePaths) -> None: + """Ensure the supervisor daemon is running; caller must hold lifecycle lock.""" + _resolve_upgrade_runtime(console, frontend_port=config.frontend_port, attempt_recover=False) + if supervisor_is_running(paths): + status = None + try: + status = read_supervisor_status(paths=paths, timeout=1.0) + except Exception: + status = None + if status is not None and not _service_config_matches(config, status.config): + console.print("[flocks] Flocks 
daemon 已在运行,但配置已变化,正在按新配置重启...") + _stop_all_unlocked(console, paths=paths) + _start_all_without_stop(config, console) + return + if status is not None and (status.backend.paused or status.backend.state.lower() == "paused"): + console.print("[flocks] Flocks daemon 已在运行,但 Flocks service 处于暂停状态,正在重新启动...") + _stop_all_unlocked(console, paths=paths) + _start_all_without_stop(config, console) + return + if status is not None and not _supervisor_backend_is_healthy(status): + console.print("[flocks] Flocks daemon 已在运行,但 Flocks service 不可用,正在重启...") + status = request_restart(config, paths=paths) + _print_status_payload(status.raw, console, include_daemon_step=False) + if not _startup_payload_is_ready(status.raw): + raise ServiceError(_startup_failure_message(status.raw)) + if not config.no_browser and _supervisor_backend_is_healthy(status): + open_default_browser(_frontend_url_from_status(status, config.frontend_url), console) + return + console.print("[flocks] Flocks daemon 已在运行。") + show_status(console) + if status is not None and not config.no_browser and _supervisor_backend_is_healthy(status): + try: + url = _frontend_url_from_status(status, config.frontend_url) + except Exception: + url = config.frontend_url + open_default_browser(url, console) + return + _start_all_without_stop(config, console) + + def start_all(config: ServiceConfig, console) -> None: - """Ensure backend and frontend are restarted with a clean state.""" + """Ensure the supervisor daemon is running.""" paths = ensure_runtime_dirs() with service_lock(paths): - _stop_all_locked(paths, console, config=config) - _start_all_without_stop(config, console) + _start_all_unlocked(config, console, paths=paths) def restart_all(config: ServiceConfig, console) -> None: - """Restart backend and frontend.""" + """Restart by stopping the daemon first, then starting a fresh daemon.""" paths = ensure_runtime_dirs() with service_lock(paths): - _stop_all_locked(paths, console, config=config) - 
_start_all_without_stop(config, console) + _stop_all_unlocked(console, paths=paths) + _start_all_unlocked(config, console, paths=paths) -def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: - """Return a human-readable status summary.""" - current = paths or runtime_paths() - cleanup_stale_pid_file(current.backend_pid) - cleanup_stale_pid_file(current.frontend_pid) - - backend_record = read_runtime_record(current.backend_pid) - frontend_record = read_runtime_record(current.frontend_pid) - backend_port = _recorded_port(current.backend_pid, ServiceConfig.backend_port) - frontend_port = _recorded_port(current.frontend_pid, ServiceConfig.frontend_port) - backend_host = _loopback_host(_recorded_host(current.backend_pid, ServiceConfig.backend_host)) - frontend_host = _loopback_host(_recorded_host(current.frontend_pid, ServiceConfig.frontend_host)) - upgrade_info = _read_upgrade_runtime_info(frontend_port) - if frontend_record is None and upgrade_info.frontend_port is not None: - frontend_port = upgrade_info.frontend_port - if frontend_record is None and upgrade_info.frontend_host: - frontend_host = _loopback_host(upgrade_info.frontend_host) - backend_pid = backend_record.pid if backend_record else None - frontend_pid = frontend_record.pid if frontend_record else None - backend_listeners = port_owner_pids(backend_port) - frontend_listeners = port_owner_pids(frontend_port) - backend_in_use = port_is_in_use(backend_port, backend_listeners) - frontend_in_use = port_is_in_use(frontend_port, frontend_listeners) - - lines: list[str] = [] - if backend_listeners: - lines.append( - f"[flocks] 后端运行中: PID={_join_pids(backend_listeners)} URL=http://{backend_host}:{backend_port}" - ) - elif backend_in_use: - lines.append(f"[flocks] 后端运行中: PID=unknown URL=http://{backend_host}:{backend_port}") - elif pid_is_running(backend_pid): - lines.append(f"[flocks] 后端主进程仍在运行,但端口 {backend_port} 未监听: PID={backend_pid}") - elif process_group_is_running(backend_record.pgid if 
backend_record else None): - lines.append(f"[flocks] 后端进程组仍在运行,但端口 {backend_port} 未监听: PGID={backend_record.pgid}") - else: - lines.append("[flocks] 后端未运行") +def _print_static_port_migration_hint(config: ServiceConfig, console) -> None: + """Explain legacy server-port behavior when it differs from public WebUI port.""" + if ( + not config.server_port_migration_hint + or config.legacy_backend_port is None + or config.legacy_backend_port == config.backend_port + ): + return + console.print( + "[flocks] API 已与 WebUI 同源," + f"当前统一监听端口为 {config.backend_port};旧 server 端口 {config.legacy_backend_port} 仅用于残留清理。" + ) + + +def _print_stop_summary(console, status) -> None: + """Print stopped services from the last available supervisor status.""" + if status is not None: + if status.backend.pid is not None: + console.print(f"[flocks] flocks 已停止(PID={status.backend.pid})。") + console.print("[flocks] daemon 已停止。") - if upgrade_info.page_active: - lines.append( - f"[flocks] WebUI 临时升级页运行中: PID={_join_pids(upgrade_info.listener_pids)} URL=http://{frontend_host}:{frontend_port}" + +def cleanup_orphan_service_ports(config: ServiceConfig, console, *, extra_configs: Sequence[ServiceConfig] = ()) -> None: + """Clean trusted Flocks leftovers on configured backend/WebUI ports.""" + root = ensure_install_layout() + cleanup_trusted_daemon_processes(console=console, root=root) + candidates: list[ServiceConfig] = [] + for candidate in (config, config.legacy_cleanup_config, *extra_configs): + candidates.append(candidate) + candidates.append(candidate.legacy_cleanup_config) + for cleanup_config in _unique_cleanup_configs(*candidates): + cleanup_trusted_port_owners( + cleanup_config.backend_port, + service="backend", + label="后端", + console=console, + root=root, ) - elif frontend_listeners: - lines.append( - f"[flocks] WebUI 运行中: PID={_join_pids(frontend_listeners)} URL=http://{frontend_host}:{frontend_port}" + cleanup_trusted_port_owners( + cleanup_config.backend_port, + service="webui", + 
label="WebUI", + console=console, + root=root, ) - elif frontend_in_use: - lines.append(f"[flocks] WebUI 运行中: PID=unknown URL=http://{frontend_host}:{frontend_port}") - elif pid_is_running(frontend_pid): - lines.append(f"[flocks] WebUI 主进程仍在运行,但端口 {frontend_port} 未监听: PID={frontend_pid}") - elif process_group_is_running(frontend_record.pgid if frontend_record else None): - lines.append(f"[flocks] WebUI 进程组仍在运行,但端口 {frontend_port} 未监听: PGID={frontend_record.pgid}") - else: - lines.append("[flocks] WebUI 未运行") + cleanup_trusted_port_owners( + cleanup_config.frontend_port, + service="webui", + label="WebUI", + console=console, + root=root, + ) + cleanup_trusted_port_owners( + cleanup_config.frontend_port, + service="backend", + label="后端", + console=console, + root=root, + ) + + +def build_status_lines(paths: RuntimePaths | None = None) -> list[str]: + """Return a human-readable status summary from the supervisor control API.""" + current = paths or runtime_paths() + try: + status = read_supervisor_status(paths=current) + except Exception: + residual_daemons = [] + try: + residual_daemons = trusted_daemon_process_pids(root=ensure_install_layout()) + except Exception: + residual_daemons = [] + if residual_daemons: + return [ + "[flocks] Flocks daemon control API 未运行", + f"[flocks] 检测到残留 daemon 进程: PID={_join_pids(residual_daemons)}", + f"[flocks] 日志: {supervisor_log_path(current)}", + "[flocks] 可执行 `flocks stop` 清理残留进程。", + ] + return [ + "[flocks] Flocks daemon 未运行", + f"[flocks] 日志: {supervisor_log_path(current)}", + ] + return _status_lines_from_payload(status.raw) + + +def _status_lines_from_payload(payload: dict[str, Any]) -> list[str]: + daemon = payload.get("daemon") if isinstance(payload.get("daemon"), dict) else {} + backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} + lines = [ + "[flocks] 服务", + _daemon_status_line(daemon), + _service_status_line("flocks", backend), + "", + "[flocks] 日志", + f"[flocks] daemon: 
{daemon.get('log_path')}", + ] + log_path = backend.get("log_path") + if log_path: + lines.append(f"[flocks] flocks: {log_path}") + return lines + + +def _service_status_line(label: str, payload: dict[str, Any]) -> str: + host = _loopback_host(str(payload.get("host") or "127.0.0.1")) + port = payload.get("port") + pid = payload.get("pid") + state = payload.get("state") or "unknown" + error = payload.get("last_error") + suffix = f" last_error={error}" if error else "" + pid_part = f" PID={pid}" if pid is not None else "" + return f"[flocks] {label}: state={state}{pid_part} URL=http://{host}:{port}{suffix}" - if upgrade_info.payload_present: - lines.append("[flocks] 检测到未完成的升级恢复状态") - lines.append(f"[flocks] 后端日志: {current.backend_log}") - lines.append(f"[flocks] WebUI 日志: {current.frontend_log}") +def _daemon_status_line(payload: dict[str, Any]) -> str: + pid = payload.get("pid") + state = payload.get("state") or "unknown" + error = payload.get("last_error") + suffix = f" last_error={error}" if error else "" + return f"[flocks] daemon: state={state} PID={pid}{suffix}" + + +def _startup_step_status(state: object, *, ready_states: set[str]) -> str: + return "已启动" if str(state or "").lower() in ready_states else "启动异常" + + +def _startup_status_lines_from_payload(payload: dict[str, Any], *, include_daemon_step: bool = True) -> list[str]: + daemon = payload.get("daemon") if isinstance(payload.get("daemon"), dict) else {} + backend = payload.get("backend") if isinstance(payload.get("backend"), dict) else {} + lines = [] + if include_daemon_step: + lines.append(f"[flocks] Flocks daemon {_startup_step_status(daemon.get('state'), ready_states={'running'})}。") + lines.extend([ + f"[flocks] Flocks service {_startup_step_status(backend.get('state'), ready_states={'healthy'})}。", + "", + "[flocks] 服务", + _daemon_status_line(daemon), + _service_status_line("flocks", backend), + "", + "[flocks] 日志", + f"[flocks] daemon: {daemon.get('log_path')}", + ]) + log_path = 
backend.get("log_path") + if log_path: + lines.append(f"[flocks] flocks: {log_path}") return lines +def _frontend_url_from_status(status, fallback: str) -> str: + if status.backend.port is not None: + return f"http://{_format_host_for_url(_loopback_host(status.backend.host))}:{status.backend.port}" + return fallback + + +def _print_status_payload(payload: dict[str, Any], console, *, include_daemon_step: bool = True) -> None: + for line in _startup_status_lines_from_payload(payload, include_daemon_step=include_daemon_step): + console.print(line) + + def show_status(console) -> None: """Print service status.""" for line in build_status_lines(): console.print(line) -def show_start_summary(config: ServiceConfig, console) -> None: - """Print URLs and log locations after startup.""" - paths = ensure_runtime_dirs() - console.print() - console.print("[flocks] 日志:") - console.print(f"[flocks] 后端: {paths.backend_log}") - console.print(f"[flocks] WebUI: {paths.frontend_log}") - console.print() - console.print("[flocks] 后端接口:") - console.print(f"[flocks] http://{_loopback_host(config.backend_host)}:{config.backend_port}") - console.print() - console.print("[flocks] 打开浏览器访问:") - console.print(f"[flocks] {config.frontend_url}") - - def show_logs( console, *, @@ -1454,58 +1808,111 @@ def show_logs( follow: bool = True, lines: int = 50, ) -> None: - """Print recent service logs and optionally follow them.""" + """Print recent service logs through the supervisor control API.""" paths = ensure_runtime_dirs() - selections = selected_log_paths(paths, backend=backend, webui=webui) - prefixes = {paths.backend_log: "backend", paths.frontend_log: "webui"} + service = "all" + if backend and not webui: + service = "backend" + elif webui and not backend: + service = "webui" + if not follow: + try: + payload = read_logs(service=service, lines=lines, paths=paths, timeout=5.0) + except Exception as exc: + console.print(f"[flocks] Flocks daemon 日志接口不可用,改为读取本地日志文件: {exc}") + 
_show_local_logs(console, paths, backend=backend, webui=webui, follow=False, lines=lines) + return + logs = payload.get("logs") if isinstance(payload.get("logs"), dict) else {} + for prefix, entry in logs.items(): + if not isinstance(entry, dict): + continue + console.print(f"[{prefix}] --- {entry.get('path')} ---") + for line in entry.get("lines") or []: + console.print(f"[{prefix}] {line}") + return - for path in selections: + console.print("[flocks] 按 Ctrl+C 退出日志跟随。") + try: + for line in stream_logs(service=service, lines=lines, paths=paths, timeout=None): + console.print(line) + except KeyboardInterrupt: + return + except Exception as exc: + console.print(f"[flocks] Flocks daemon 日志接口不可用,改为跟随本地日志文件: {exc}") + _show_local_logs(console, paths, backend=backend, webui=webui, follow=True, lines=lines) + + +def selected_log_paths( + paths: RuntimePaths, + *, + backend: bool = False, + webui: bool = False, +) -> list[Path]: + """Return the log files selected by CLI flags.""" + if backend and not webui: + return [paths.backend_log] + if webui and not backend: + return [paths.backend_log] + return [paths.backend_log] + + +def _selected_log_entries(paths: RuntimePaths, *, backend: bool = False, webui: bool = False) -> list[tuple[str, Path]]: + """Return local log files selected by CLI flags.""" + if backend and not webui: + return [("flocks", paths.backend_log)] + if webui and not backend: + return [("flocks", paths.backend_log)] + return [ + ("flocks", paths.backend_log), + ("daemon", supervisor_log_path(paths)), + ] + + +def _show_local_logs( + console, + paths: RuntimePaths, + *, + backend: bool = False, + webui: bool = False, + follow: bool = True, + lines: int = 50, +) -> None: + """Print local log files when the daemon control API is unavailable.""" + selections = _selected_log_entries(paths, backend=backend, webui=webui) + for prefix, path in selections: + path.parent.mkdir(parents=True, exist_ok=True) path.touch(exist_ok=True) - 
console.print(f"[{prefixes[path]}] --- {path} ---") + console.print(f"[{prefix}] --- {path} ---") for line in tail_lines(path, lines): - console.print(f"[{prefixes[path]}] {line}") + console.print(f"[{prefix}] {line}") if not follow: return - console.print("[flocks] 按 Ctrl+C 退出日志跟随。") handles = {} try: - for path in selections: + for prefix, path in selections: handle = path.open("r", encoding="utf-8", errors="replace") handle.seek(0, os.SEEK_END) - handles[path] = handle - + handles[prefix] = handle while True: emitted = False - for path, handle in handles.items(): + for prefix, handle in handles.items(): while True: line = handle.readline() if not line: break emitted = True - console.print(f"[{prefixes[path]}] {line.rstrip()}") + console.print(f"[{prefix}] {line.rstrip()}") if not emitted: time.sleep(FOLLOW_POLL_INTERVAL) + except KeyboardInterrupt: + return finally: for handle in handles.values(): handle.close() -def selected_log_paths( - paths: RuntimePaths, - *, - backend: bool = False, - webui: bool = False, -) -> list[Path]: - """Return the log files selected by CLI flags.""" - if backend and not webui: - return [paths.backend_log] - if webui and not backend: - return [paths.frontend_log] - return [paths.backend_log, paths.frontend_log] - - def tail_lines(path: Path, lines: int) -> list[str]: """Read the last N lines from a text file.""" with path.open("r", encoding="utf-8", errors="replace") as handle: @@ -1596,9 +2003,18 @@ def signal_pid_list(sig: signal.Signals, pids: Iterable[int]) -> None: def open_default_browser(url: str, console) -> None: """Best-effort browser open.""" + if sys.platform == "win32": + startfile = getattr(os, "startfile", None) + if startfile is not None: + try: + startfile(url) + console.print(f"[flocks] 浏览器已打开: {url}") + return + except Exception: + pass try: if webbrowser.open(url): - console.print(f"[flocks] 已使用默认浏览器打开: {url}") + console.print(f"[flocks] 浏览器已打开: {url}") return except Exception: pass @@ -1607,7 +2023,7 @@ def 
open_default_browser(url: str, console) -> None: def access_host(host: str) -> str: """Return the host that local health checks and browser requests should use.""" - return _loopback_host(host) + return loopback_host(host) def _format_host_for_url(host: str) -> str: @@ -1707,7 +2123,7 @@ def _spawn_process( _cap_service_log_file(log_path, MAX_SERVICE_LOG_BYTES) handle = log_path.open("a", encoding="utf-8") try: - return subprocess.Popen( + process = subprocess.Popen( list(command), cwd=cwd, env=env, @@ -1717,6 +2133,8 @@ def _spawn_process( creationflags=creationflags, **kwargs, ) + _process_group_id(process) + return process finally: handle.close() @@ -1792,7 +2210,7 @@ def _join_pids(pids: Iterable[int]) -> str: def _loopback_host(host: str) -> str: - return "127.0.0.1" if host in {"0.0.0.0", "::"} else host + return loopback_host(host) def _http_to_ws_url(url: str) -> str: diff --git a/flocks/cli/service_process.py b/flocks/cli/service_process.py new file mode 100644 index 000000000..d6407417e --- /dev/null +++ b/flocks/cli/service_process.py @@ -0,0 +1,95 @@ +"""Process adapters used by the service supervisor.""" + +from __future__ import annotations + +import socket +import subprocess +from dataclasses import dataclass +from typing import Protocol + +import httpx + +from flocks.cli.service_config import ServiceConfig + + +@dataclass(frozen=True) +class ServiceProbeResult: + healthy: bool + reason: str | None = None + restart: bool = False + + +class ProcessAdapter(Protocol): + name: str + label: str + + def start(self, config: ServiceConfig, paths, *, built_once: bool = False) -> subprocess.Popen: + """Start the service process.""" + + def stop(self, process: subprocess.Popen | None) -> None: + """Stop the service process group.""" + + def probe(self, process: subprocess.Popen | None, host: str, port: int) -> ServiceProbeResult: + """Probe service process and listener health.""" + + +class BackendProcessAdapter: + name = "backend" + label = "后端" + + def 
start(self, config: ServiceConfig, paths, *, built_once: bool = False) -> subprocess.Popen: + del built_once + from flocks.cli.service_manager import _StdoutConsole, _start_backend_process + + return _start_backend_process(config, _StdoutConsole(), paths=paths) + + def stop(self, process: subprocess.Popen | None) -> None: + from flocks.cli.service_manager import _StdoutConsole, _terminate_process + + _terminate_process(process, self.label, _StdoutConsole()) + + def probe(self, process: subprocess.Popen | None, host: str, port: int) -> ServiceProbeResult: + if process is None: + return ServiceProbeResult(healthy=False, reason="stopped") + if process.poll() is not None: + return ServiceProbeResult( + healthy=False, + reason=f"process exited with code {process.returncode}", + restart=True, + ) + if not tcp_port_accepts_connections(host, port): + return ServiceProbeResult(healthy=False, reason=f"port {port} is not listening", restart=True) + + from flocks.cli.service_manager import _backend_health_url, _is_healthy_status_response, backend_access_base_url + + url = _backend_health_url(host, port) + try: + with httpx.Client(timeout=2.0, trust_env=False) as client: + response = client.get(url) + root_response = client.get( + backend_access_base_url(ServiceConfig(backend_host=host, backend_port=port)), + headers={"Accept": "text/html"}, + ) + healthy = _is_healthy_status_response(response) and _is_static_webui_response(root_response) + reason = f"health status={response.status_code}, root status={root_response.status_code}" + except Exception as exc: + healthy = False + reason = f"health failed: {exc}" + return ServiceProbeResult(healthy=healthy, reason=reason) + + +def _is_static_webui_response(response: httpx.Response) -> bool: + """Return True only when the unified service serves the SPA index.""" + content_type = response.headers.get("content-type", "").lower() + return response.status_code == 200 and "text/html" in content_type + + +def 
tcp_port_accepts_connections(host: str, port: int) -> bool: + """Return True when a local service accepts TCP connections.""" + from flocks.cli.service_manager import access_host + + try: + with socket.create_connection((access_host(host), port), timeout=1.0): + return True + except OSError: + return False diff --git a/flocks/cli/service_supervisor.py b/flocks/cli/service_supervisor.py new file mode 100644 index 000000000..e94397300 --- /dev/null +++ b/flocks/cli/service_supervisor.py @@ -0,0 +1,580 @@ +"""Supervisor daemon for the local Flocks service.""" + +from __future__ import annotations + +import datetime +import json +import os +import signal +import socket +import subprocess +import sys +import threading +import time +from dataclasses import dataclass +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path +from typing import Any +from urllib.parse import parse_qs, urlparse + +from flocks.browser.admin import stop_all_daemons as stop_all_browser_daemons +from flocks.cli.service_config import service_config_from_payload, service_config_payload +from flocks.cli.service_control import ( + supervisor_control_port, + supervisor_log_path, + supervisor_socket_path, + supervisor_uses_tcp_control, +) +from flocks.cli.service_process import BackendProcessAdapter, ProcessAdapter + +SUPERVISOR_CHECK_INTERVAL_SECONDS = 5.0 +SUPERVISOR_HEALTH_FAILURE_THRESHOLD = 2 +SUPERVISOR_BACKOFF_SECONDS = (1.0, 2.0, 5.0, 10.0, 30.0) +_CLIENT_DISCONNECT_ERRORS = (BrokenPipeError, ConnectionResetError, ConnectionAbortedError) + + +@dataclass +class ManagedService: + name: str + label: str + host: str + port: int + log_path: Path + process: subprocess.Popen | None = None + command: tuple[str, ...] 
= () + state: str = "stopped" + last_error: str | None = None + restart_count: int = 0 + last_restart_at: float | None = None + health_failure_count: int = 0 + next_restart_at: float = 0.0 + built_once: bool = False + + @property + def pid(self) -> int | None: + return self.process.pid if self.process is not None else None + + +def _daemon_log(event: str, details: dict[str, object] | None = None) -> None: + """Write a structured daemon log line to stdout.""" + timestamp = datetime.datetime.now().isoformat(timespec="seconds") + suffix = "" + if details: + suffix = " " + json.dumps(details, ensure_ascii=True, sort_keys=True) + sys.stdout.write(f"[{timestamp}] daemon.{event}{suffix}\n") + sys.stdout.flush() + + +def _health_status_from_service_state(state: str) -> str: + if state in {"healthy", "static", "starting", "restarting", "stopped", "paused"}: + return state + return "degraded" + + +def _service_payload(service: ManagedService, *, paused: bool = False) -> dict[str, object]: + return { + "pid": service.pid, + "host": service.host, + "port": service.port, + "state": "paused" if paused else service.state, + "health": _health_status_from_service_state("paused" if paused else service.state), + "last_error": service.last_error, + "restart_count": service.restart_count, + "last_restart_at": service.last_restart_at, + "log_path": str(service.log_path), + "command": list(service.command), + "paused": paused, + } + + +if hasattr(socket, "AF_UNIX"): + + class _UnixControlServer(ThreadingHTTPServer): + address_family = socket.AF_UNIX + +else: # pragma: no cover - exercised by importing on Windows + _UnixControlServer = None + + +class SupervisorDaemon: + """Owns backend/WebUI child processes and exposes a local control API.""" + + def __init__( + self, + config, + *, + interval: float = SUPERVISOR_CHECK_INTERVAL_SECONDS, + failure_threshold: int = SUPERVISOR_HEALTH_FAILURE_THRESHOLD, + backend_adapter: ProcessAdapter | None = None, + ) -> None: + from 
flocks.cli.service_manager import ensure_runtime_dirs + + self.config = config + self.paths = ensure_runtime_dirs() + self.interval = interval + self.failure_threshold = failure_threshold + self.backend_adapter = backend_adapter or BackendProcessAdapter() + self.started_at = time.time() + self._lock = threading.RLock() + self._shutdown_requested = threading.Event() + self._server: ThreadingHTTPServer | None = None + self._server_thread: threading.Thread | None = None + self._backend_paused = False + self._webui_paused = False + self.backend = ManagedService( + name="backend", + label="后端", + host=config.backend_host, + port=config.backend_port, + log_path=self.paths.backend_log, + ) + self.webui = ManagedService( + name="webui", + label="WebUI", + host=config.backend_host, + port=config.backend_port, + log_path=self.paths.backend_log, + state="static", + ) + + def run(self) -> None: + """Run the supervisor until the control API asks it to stop.""" + self._install_signal_handlers() + self._cleanup_legacy_runtime() + self._start_control_server() + try: + self.restart_all(reason="startup") + while not self._shutdown_requested.wait(self.interval): + self.tick() + finally: + self.shutdown_children() + self._stop_control_server() + stop_all_browser_daemons() + _daemon_log("stopped") + + def _install_signal_handlers(self) -> None: + if threading.current_thread() is not threading.main_thread(): + return + + def _handle(_signum, _frame) -> None: + self.request_stop() + + for sig in (signal.SIGINT, signal.SIGTERM): + try: + signal.signal(sig, _handle) + except (OSError, ValueError): # pragma: no cover - platform defensive + pass + + def _cleanup_legacy_runtime(self) -> None: + from flocks.cli import service_manager + + console = service_manager._StdoutConsole() + for pid_file, name in ( + (service_manager.watchdog_pid_path(self.paths), "watchdog"), + (self.paths.frontend_pid, "WebUI"), + (self.paths.backend_pid, "backend"), + ): + record = 
def _handler_class(self):
    """Build the HTTP request handler class bound to this daemon.

    The returned ``BaseHTTPRequestHandler`` subclass closes over ``self`` (as
    ``daemon``) and maps control endpoints onto daemon methods:

    * ``GET /status`` and ``GET /logs``
    * ``POST /stop``, ``/restart``, ``/restart/backend``, ``/restart/webui``,
      ``/upgrade/prepare`` and ``/upgrade/resume``
    * ``POST /stop/webui`` always answers 409 because the static WebUI is
      served by the Flocks service itself.

    Unknown paths answer 404. Client disconnects are ignored, and any other
    exception is reported to the client as a 500 JSON error.

    Returns:
        The handler class (not an instance).
    """
    daemon = self

    class ControlHandler(BaseHTTPRequestHandler):
        protocol_version = "HTTP/1.0"

        def log_message(self, _format, *_args) -> None:
            # Suppress the base class's per-request logging.
            return

        def _send_json(self, payload: dict[str, object], status: int = 200) -> None:
            """Send *payload* as a UTF-8 JSON response with the given status."""
            body = json.dumps(payload, ensure_ascii=False, sort_keys=True).encode("utf-8")
            try:
                self.send_response(status)
                self.send_header("Content-Type", "application/json; charset=utf-8")
                self.send_header("Content-Length", str(len(body)))
                self.end_headers()
                self.wfile.write(body)
            except _CLIENT_DISCONNECT_ERRORS:
                return

        def _read_json(self) -> dict[str, Any]:
            """Return the request body as a dict, or ``{}`` when unusable.

            A missing, non-numeric, zero or negative ``Content-Length``, an
            undecodable body, invalid JSON, and a non-object JSON value all
            yield an empty dict.
            """
            try:
                length = int(self.headers.get("Content-Length") or "0")
            except ValueError:
                # A malformed header must not crash the handler thread.
                return {}
            if length <= 0:
                return {}
            try:
                payload = json.loads(self.rfile.read(length).decode("utf-8"))
            except (UnicodeDecodeError, json.JSONDecodeError):
                return {}
            return payload if isinstance(payload, dict) else {}

        def do_GET(self) -> None:
            parsed = urlparse(self.path)
            try:
                if parsed.path == "/status":
                    self._send_json(daemon.status_payload())
                    return
                if parsed.path == "/logs":
                    daemon.handle_logs_request(self, parse_qs(parsed.query))
                    return
                self._send_json({"error": "not found"}, status=404)
            except _CLIENT_DISCONNECT_ERRORS:
                return
            except Exception as exc:  # pragma: no cover - defensive control path
                self._send_json({"error": str(exc)}, status=500)

        def do_POST(self) -> None:
            parsed = urlparse(self.path)
            try:
                # Read the body inside the try so a client that disconnects
                # mid-body is handled like any other disconnect.
                payload = self._read_json()
                if parsed.path == "/stop":
                    daemon.request_stop()
                    self._send_json({"status": "stopping"})
                    return
                if parsed.path == "/restart":
                    daemon.update_config(payload)
                    daemon.restart_all(reason="control restart")
                    self._send_json(daemon.status_payload())
                    return
                if parsed.path == "/restart/backend":
                    daemon.restart_backend(reason="control restart")
                    self._send_json(daemon.status_payload())
                    return
                if parsed.path == "/restart/webui":
                    daemon.update_config(payload)
                    daemon.restart_webui(
                        reason="control restart",
                        force_frontend_build=bool(payload.get("force_frontend_build")),
                    )
                    self._send_json(daemon.status_payload())
                    return
                if parsed.path == "/stop/webui":
                    self._send_json({"error": "static WebUI is served by Flocks service and cannot be stopped separately"}, status=409)
                    return
                if parsed.path == "/upgrade/prepare":
                    daemon.prepare_upgrade(reason="control upgrade prepare")
                    self._send_json(daemon.status_payload())
                    return
                if parsed.path == "/upgrade/resume":
                    daemon.update_config(payload)
                    daemon.resume_upgrade(reason="control upgrade resume")
                    self._send_json(daemon.status_payload())
                    return
                self._send_json({"error": "not found"}, status=404)
            except _CLIENT_DISCONNECT_ERRORS:
                return
            except Exception as exc:  # pragma: no cover - defensive control path
                self._send_json({"error": str(exc)}, status=500)

    return ControlHandler
def _log_paths_for_service(self, service_name: str) -> list[tuple[str, Path]]:
    """Map a service selector to the ``(prefix, log file)`` pairs to show.

    ``"backend"`` and ``"webui"`` both resolve to the backend log under the
    ``"flocks"`` prefix, ``"daemon"`` to the supervisor log, and ``"all"`` to
    the backend log followed by the supervisor log. Any other selector
    returns an empty list.
    """
    if service_name in ("backend", "webui"):
        return [("flocks", self.paths.backend_log)]
    if service_name not in ("daemon", "all"):
        return []

    selected = [("daemon", supervisor_log_path(self.paths))]
    if service_name == "all":
        # "all" lists the backend log first, then the supervisor log.
        selected.insert(0, ("flocks", self.paths.backend_log))
    return selected
def _start_backend_locked(self, *, immediate: bool) -> None:
    """Start the backend process unless it is alive or still backing off.

    Nothing happens when the current process is still running, or when
    ``immediate`` is false and ``next_restart_at`` has not been reached.
    A successful start records the process and its command line, marks the
    backend ``"healthy"``, clears the error/failure counters and re-syncs
    the static WebUI state. A failed start is delegated to
    ``_mark_start_failed``.
    """
    backend = self.backend
    current = backend.process
    if current is not None and current.poll() is None:
        return
    if not immediate:
        if time.monotonic() < backend.next_restart_at:
            return

    backend.state = "starting"
    try:
        started = self.backend_adapter.start(self.config, self.paths)
    except Exception as exc:
        self._mark_start_failed(backend, exc)
    else:
        backend.process = started
        backend.command = tuple(map(str, started.args))
        backend.state = "healthy"
        backend.last_error = None
        backend.health_failure_count = 0
        self._sync_static_webui_state()
def _sync_static_webui_state(self) -> None:
    """Mirror the backend's address and health onto the WebUI record.

    The WebUI record always takes the backend's host, port and log path and
    never owns a process or command. Its state is ``"paused"`` while the
    WebUI is paused (``last_error`` untouched), ``"static"`` when the backend
    is healthy, the backend's own state while it is starting or restarting,
    and ``"degraded"`` otherwise.
    """
    webui, backend = self.webui, self.backend
    webui.host, webui.port = backend.host, backend.port
    webui.log_path = self.paths.backend_log
    webui.process, webui.command = None, ()

    if self._webui_paused:
        webui.state = "paused"
        return

    backend_state = backend.state
    if backend_state == "healthy":
        mirrored = ("static", None)
    elif backend_state in {"starting", "restarting"}:
        mirrored = (backend_state, backend.last_error)
    else:
        mirrored = ("degraded", backend.last_error or "server is not healthy")
    webui.state, webui.last_error = mirrored
"frontend_port": config.frontend_port, + }, + ) + SupervisorDaemon(config, interval=interval, failure_threshold=failure_threshold).run() diff --git a/flocks/server/app.py b/flocks/server/app.py index 8c7cb1b24..c5d408c0b 100644 --- a/flocks/server/app.py +++ b/flocks/server/app.py @@ -13,7 +13,7 @@ from pathlib import Path from typing import Any, Callable, Optional from contextlib import asynccontextmanager -from fastapi import FastAPI, Request, Response, status +from fastapi import FastAPI, Request, status from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse from fastapi.exceptions import RequestValidationError @@ -26,6 +26,7 @@ from flocks.auth.service import AuthService from flocks.extensions import ExtensionOptions, handler_name, normalize_fail_policy, normalize_timeout from flocks.server.auth import apply_auth_for_request, clear_auth_context +from flocks.server.static_webui import maybe_serve_static_webui # Load .env file at startup try: @@ -659,6 +660,13 @@ async def _run_http_middleware_hooks(request: Request, context: dict[str, Any]) "/api/session/status", }) +_SECURITY_HEADERS = { + "X-Content-Type-Options": "nosniff", + "Referrer-Policy": "no-referrer", + "Content-Security-Policy": "frame-ancestors 'self'", + "Permissions-Policy": "camera=(), microphone=(), geolocation=()", +} + def _is_noisy_request_path(path: str) -> bool: """Return True for high-frequency polling endpoints that are noisy on success.""" @@ -681,7 +689,7 @@ def _should_log_request(path: str, status_code: int) -> bool: # CORS Configuration # # Priority order: -# 1. Runtime env vars exported by ``start_backend()`` → add the concrete +# 1. Runtime env vars exported by the supervised backend launcher → add the concrete # ``_FLOCKS_WEBUI_*`` origin inferred from the current CLI launch. # 2. Explicit ``server.cors`` in flocks.json → append user-configured # origins without discarding the runtime ones. 
@app.middleware("http")
async def static_webui_middleware(request: Request, call_next):
    """Serve the SPA shell before auth for browser navigations.

    When ``maybe_serve_static_webui`` produces a response it is returned
    directly and the rest of the middleware stack is skipped; otherwise the
    request continues through ``call_next``.
    """
    static_response = await maybe_serve_static_webui(request)
    if static_response is None:
        return await call_next(request)
    # This middleware is registered after ``security_headers_middleware`` and
    # therefore wraps it. Responses returned here never pass through that
    # inner middleware, so the baseline security headers are applied here.
    for name, value in _SECURITY_HEADERS.items():
        static_response.headers.setdefault(name, value)
    return static_response
+4,8 @@ from __future__ import annotations +import threading +import time from typing import Any from fastapi import APIRouter, HTTPException, Request, Response, status @@ -21,6 +23,166 @@ router = APIRouter() +_LOGIN_FAILURE_WINDOW_SECONDS = 5 * 60 +_LOGIN_LOCKOUT_SECONDS = 15 * 60 +_LOGIN_MAX_FAILURES_PER_USER_AND_IP = 5 +_LOGIN_MAX_FAILURES_PER_IP = 20 +_LOGIN_PRUNE_INTERVAL_SECONDS = 60 +_LOGIN_MAX_TRACKED_BUCKETS = 2048 + + +class _LoginRateLimiter: + """In-process failed-login limiter for local account authentication.""" + + def __init__(self) -> None: + self._lock = threading.Lock() + self._failures: dict[tuple[str, str], list[float]] = {} + self._locked_until: dict[tuple[str, str], float] = {} + self._last_pruned_at = 0.0 + + def check(self, *, username: str, ip: str | None) -> int | None: + """Return retry-after seconds when the login attempt is currently blocked.""" + now = time.monotonic() + with self._lock: + retry_after = self._retry_after(("user_ip", self._user_ip_key(username, ip)), now) + if retry_after is not None: + return retry_after + return self._retry_after(("ip", self._ip_key(ip)), now) + + def record_failure(self, *, username: str, ip: str | None) -> int | None: + """Record a failed login attempt and return retry-after when it locks out.""" + now = time.monotonic() + with self._lock: + self._prune(now) + user_key = ("user_ip", self._user_ip_key(username, ip)) + ip_key = ("ip", self._ip_key(ip)) + user_retry = self._record_failure( + user_key, + limit=_LOGIN_MAX_FAILURES_PER_USER_AND_IP, + now=now, + ) + ip_retry = self._record_failure( + ip_key, + limit=_LOGIN_MAX_FAILURES_PER_IP, + now=now, + ) + self._enforce_capacity(now, preserve={user_key, ip_key}) + if user_retry is not None and ip_retry is not None: + return max(user_retry, ip_retry) + return user_retry if user_retry is not None else ip_retry + + def record_success(self, *, username: str, ip: str | None) -> None: + """Clear the exact user/IP failure bucket after a successful login.""" 
+ with self._lock: + key = ("user_ip", self._user_ip_key(username, ip)) + self._failures.pop(key, None) + self._locked_until.pop(key, None) + + def reset(self) -> None: + """Clear limiter state for tests and process lifecycle resets.""" + with self._lock: + self._failures.clear() + self._locked_until.clear() + self._last_pruned_at = 0.0 + + def _retry_after(self, key: tuple[str, str], now: float) -> int | None: + locked_until = self._locked_until.get(key) + if locked_until is None: + return None + if locked_until <= now: + self._locked_until.pop(key, None) + self._failures.pop(key, None) + return None + return max(1, int(locked_until - now)) + + def _record_failure(self, key: tuple[str, str], *, limit: int, now: float) -> int | None: + if retry_after := self._retry_after(key, now): + return retry_after + cutoff = now - _LOGIN_FAILURE_WINDOW_SECONDS + failures = [timestamp for timestamp in self._failures.get(key, []) if timestamp >= cutoff] + failures.append(now) + self._failures[key] = failures + if len(failures) <= limit: + return None + locked_until = now + _LOGIN_LOCKOUT_SECONDS + self._locked_until[key] = locked_until + return _LOGIN_LOCKOUT_SECONDS + + def _prune(self, now: float, *, force: bool = False) -> None: + if not force and ( + now - self._last_pruned_at < _LOGIN_PRUNE_INTERVAL_SECONDS + and self._tracked_bucket_count() <= _LOGIN_MAX_TRACKED_BUCKETS + ): + return + cutoff = now - _LOGIN_FAILURE_WINDOW_SECONDS + for key, locked_until in list(self._locked_until.items()): + if locked_until <= now: + self._locked_until.pop(key, None) + for key, failures in list(self._failures.items()): + if self._locked_until.get(key, 0) > now: + continue + active_failures = [timestamp for timestamp in failures if timestamp >= cutoff] + if active_failures: + self._failures[key] = active_failures + else: + self._failures.pop(key, None) + self._last_pruned_at = now + + def _enforce_capacity(self, now: float, *, preserve: set[tuple[str, str]]) -> None: + if 
self._tracked_bucket_count() <= _LOGIN_MAX_TRACKED_BUCKETS: + return + self._prune(now, force=True) + overflow = self._tracked_bucket_count() - _LOGIN_MAX_TRACKED_BUCKETS + if overflow <= 0: + return + candidates = [ + (max(failures, default=0.0), key) + for key, failures in self._failures.items() + if key not in preserve and self._locked_until.get(key, 0) <= now + ] + candidates.sort() + for _latest_failure, key in candidates[:overflow]: + self._failures.pop(key, None) + self._locked_until.pop(key, None) + overflow = self._tracked_bucket_count() - _LOGIN_MAX_TRACKED_BUCKETS + if overflow <= 0: + return + locked_candidates = [ + (locked_until, key) + for key, locked_until in self._locked_until.items() + if key not in preserve + ] + locked_candidates.sort() + for _locked_until, key in locked_candidates[:overflow]: + self._locked_until.pop(key, None) + self._failures.pop(key, None) + + def _tracked_bucket_count(self) -> int: + return len(set(self._failures) | set(self._locked_until)) + + @staticmethod + def _user_ip_key(username: str, ip: str | None) -> str: + return f"{(username or '').strip().casefold()}@{ip or 'unknown'}" + + @staticmethod + def _ip_key(ip: str | None) -> str: + return ip or "unknown" + + +_login_rate_limiter = _LoginRateLimiter() + + +def _request_ip(request: Request) -> str | None: + return getattr(getattr(request, "client", None), "host", None) + + +def _raise_login_rate_limited(retry_after: int) -> None: + raise HTTPException( + status_code=status.HTTP_429_TOO_MANY_REQUESTS, + detail="登录失败次数过多,请稍后再试", + headers={"Retry-After": str(retry_after)}, + ) + def _parse_event_type(event_type: str) -> tuple[str, str]: if "." 
in event_type: @@ -181,22 +343,39 @@ async def bootstrap_admin(payload: BootstrapAdminRequest, response: Response, re @router.post("/login", response_model=MeResponse, summary="登录本地账号") async def login(payload: LoginRequest, response: Response, request: Request) -> MeResponse: + ip = _request_ip(request) + retry_after = _login_rate_limiter.check(username=payload.username, ip=ip) + if retry_after is not None: + await _emit_auth_audit( + "account.login_rate_limited", + { + "username": payload.username, + "ip": ip, + "retry_after": retry_after, + }, + ) + _raise_login_rate_limited(retry_after) + try: user, session_id = await AuthService.login( payload.username, payload.password, ) except ValueError as exc: + retry_after = _login_rate_limiter.record_failure(username=payload.username, ip=ip) await _emit_auth_audit( "account.login_failed", { "username": payload.username, "reason": str(exc), - "ip": getattr(getattr(request, "client", None), "host", None), + "ip": ip, }, ) + if retry_after is not None: + _raise_login_rate_limited(retry_after) raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc + _login_rate_limiter.record_success(username=payload.username, ip=ip) set_session_cookie(response, session_id, secure=should_use_secure_cookie(request)) await _emit_auth_audit( "account.login", @@ -208,7 +387,7 @@ async def login(payload: LoginRequest, response: Response, request: Request) -> "username": user.username, "role": user.role, "session_id": session_id, - "ip": getattr(getattr(request, "client", None), "host", None), + "ip": ip, }, ) return _to_me_response(user) diff --git a/flocks/server/routes/health.py b/flocks/server/routes/health.py index f2539d038..84669c577 100644 --- a/flocks/server/routes/health.py +++ b/flocks/server/routes/health.py @@ -6,8 +6,6 @@ from pydantic import BaseModel from datetime import datetime -from flocks.config.config import Config - router = APIRouter() @@ -17,8 +15,6 @@ class HealthResponse(BaseModel): 
status: str version: str timestamp: str - config_dir: str - data_dir: str @router.get( @@ -35,15 +31,12 @@ async def health_check() -> HealthResponse: Returns server status and basic information """ from datetime import UTC - config = Config.get_global() from flocks.updater import get_current_version return HealthResponse( status="healthy", version=get_current_version(), timestamp=datetime.now(UTC).isoformat(), - config_dir=str(config.config_dir), - data_dir=str(config.data_dir), ) diff --git a/flocks/server/static_webui.py b/flocks/server/static_webui.py new file mode 100644 index 000000000..dede61320 --- /dev/null +++ b/flocks/server/static_webui.py @@ -0,0 +1,129 @@ +"""Static WebUI hosting helpers for the FastAPI server.""" + +from __future__ import annotations + +import os +import re +from pathlib import Path +from urllib.parse import unquote + +from fastapi import Request, Response +from fastapi.responses import FileResponse, PlainTextResponse + +_INDEX_CACHE_CONTROL = "no-store" +_ASSET_CACHE_CONTROL = "public, max-age=31536000, immutable" +_STATIC_CACHE_CONTROL = "no-cache" +_FINGERPRINT_RE = re.compile(r"(?:^|[.-])[0-9a-f]{8,}(?:[.-]|$)", re.IGNORECASE) +_PROTECTED_PREFIXES = ( + "/api", + "/event", + "/global", + "/docs", + "/redoc", + "/openapi.json", + "/health", +) + + +class WebUIDistMissingError(RuntimeError): + """Raised when the production WebUI build output is unavailable.""" + + +def source_webui_dist_dir() -> Path: + """Return the source-tree WebUI dist directory.""" + return Path(__file__).resolve().parents[2] / "webui" / "dist" + + +def packaged_webui_dist_dir() -> Path: + """Return the packaged WebUI static directory.""" + return Path(__file__).resolve().parents[1] / "webui_static" + + +def resolve_webui_dist_dir() -> Path | None: + """Return the first usable WebUI dist directory.""" + candidates: list[Path] = [] + override = os.getenv("FLOCKS_WEBUI_DIST_DIR") + if override: + candidates.append(Path(override).expanduser()) + 
candidates.extend([source_webui_dist_dir(), packaged_webui_dist_dir()]) + for candidate in candidates: + if (candidate / "index.html").is_file(): + return candidate.resolve() + return None + + +def ensure_webui_dist_dir() -> Path: + """Return the WebUI dist directory or raise a clear startup error.""" + dist_dir = resolve_webui_dist_dir() + if dist_dir is None: + raise WebUIDistMissingError( + "WebUI build output is missing. Run `cd webui && npm run build`, " + "or start without `--skip-webui-build` so Flocks can build it." + ) + return dist_dir + + +async def maybe_serve_static_webui(request: Request) -> Response | None: + """Serve SPA static files for browser navigations. + + API and TUI-compatible requests continue through the existing routers. Only + real static files and browser HTML navigation requests are handled here. + """ + if request.method not in {"GET", "HEAD"}: + return None + + path = request.url.path or "/" + dist_dir = resolve_webui_dist_dir() + if dist_dir is None: + return None + + file_path = _resolve_existing_static_file(dist_dir, path) + if file_path is not None: + return _file_response(file_path, cache_control=_cache_control_for_file(path, file_path)) + + if path.startswith("/assets/"): + return PlainTextResponse("Not found", status_code=404) + if _is_protected_backend_path(path): + return None + if not _accepts_html(request): + return None + + return _file_response(dist_dir / "index.html", cache_control=_INDEX_CACHE_CONTROL) + + +def _resolve_existing_static_file(dist_dir: Path, path: str) -> Path | None: + if path == "/": + return None + relative = unquote(path.lstrip("/")) + candidate = (dist_dir / relative).resolve() + try: + candidate.relative_to(dist_dir) + except ValueError: + return None + if candidate.is_file(): + return candidate + return None + + +def _file_response(path: Path, *, cache_control: str) -> FileResponse: + headers = {"Cache-Control": cache_control} + return FileResponse(path, headers=headers) + + +def 
def _accepts_html(request: Request) -> bool:
    """Return whether the request explicitly asks for an HTML document.

    A missing ``Accept`` header and the bare wildcard ``*/*`` do not count,
    so only requests that name ``text/html`` or ``application/xhtml+xml``
    qualify. The comparison is case-insensitive because media type names
    are case-insensitive.
    """
    accept = request.headers.get("accept", "").lower()
    if not accept or accept == "*/*":
        return False
    return "text/html" in accept or "application/xhtml+xml" in accept
async def _ensure_device_integrations_group_id(db: Any) -> None:
    """Add the ``group_id`` column to ``device_integrations`` when missing.

    Reads the existing column names via ``PRAGMA table_info`` (the name is
    the second field of each row) and runs the ``ALTER TABLE`` only when no
    ``group_id`` column is present.
    """
    table_info = await db.execute("PRAGMA table_info(device_integrations)")
    rows = await table_info.fetchall()
    has_group_id = any(str(row[1]) == "group_id" for row in rows)
    if not has_group_id:
        await db.execute("ALTER TABLE device_integrations ADD COLUMN group_id TEXT NOT NULL DEFAULT '';")
def _stop_supervisor_before_restart(
    *,
    timeout_seconds: float = SUPERVISOR_STOP_TIMEOUT_SECONDS,
    poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS,
) -> bool:
    """Ask a running supervisor to stop and wait until it is gone.

    Returns ``True`` when no supervisor is running to begin with, or when it
    is no longer running by the end of the wait. Returns ``False`` when the
    stop request raises (the error is written to the handoff log) or when
    the supervisor is still running after *timeout_seconds*.
    """
    from flocks.cli import service_control

    paths = service_manager.runtime_paths()

    def supervisor_stopped() -> bool:
        return not service_control.supervisor_is_running(paths)

    if supervisor_stopped():
        return True

    try:
        service_control.request_stop(paths=paths, timeout=timeout_seconds)
    except Exception as exc:
        _record_handoff_log(f"supervisor_stop_request_failed error={exc}")
        return False

    give_up_at = time.monotonic() + timeout_seconds
    while time.monotonic() < give_up_at:
        if supervisor_stopped():
            return True
        time.sleep(poll_interval_seconds)
    # One last check after the deadline has passed.
    return supervisor_stopped()
_record_handoff_log(f"rollback_failed error={exc}") +def _prepare_upgrade_handover(args: argparse.Namespace) -> bool: + from flocks.updater import updater + + try: + updater._prepare_upgrade_handover(args.version) + except Exception as exc: + _record_handoff_log(f"prepare_handover_failed error={exc}") + return False + return True + + +def _rollback_upgrade_handover() -> None: + from flocks.updater import updater + + try: + updater.rollback_upgrade_handover() + except Exception as exc: + _record_handoff_log(f"handover_rollback_failed error={exc}") + + def _cleanup_dir(path_value: str | None) -> None: if not path_value: return shutil.rmtree(Path(path_value), ignore_errors=True) +def _cli_subcommand(argv: Sequence[str]) -> str | None: + """Return the flocks.cli.main subcommand embedded in a Python argv.""" + for index, value in enumerate(argv[:-2]): + if value == "-m" and argv[index + 1] == "flocks.cli.main": + return argv[index + 2] + return None + + +def _restart_argv_for_current_runtime(args: argparse.Namespace, restart_argv: Sequence[str]) -> list[str]: + if _cli_subcommand(restart_argv) != "serve": + return list(restart_argv) + + argv = [ + restart_argv[0], + "-m", + "flocks.cli.main", + "start", + "--no-browser", + "--skip-webui-build", + "--host", + str(args.frontend_host), + "--port", + str(args.frontend_port), + "--server-host", + str(args.backend_host), + "--server-port", + str(args.backend_port), + ] + _record_handoff_log(f"legacy_serve_restart_migrated argv={argv}") + return argv + + def run(argv: Sequence[str] | None = None) -> int: args = _parse_args(argv) - restart_argv = list(args.restart_argv) + restart_argv = _restart_argv_for_current_runtime(args, args.restart_argv) if not restart_argv: _record_handoff_log("missing_restart_argv") return 2 @@ -200,8 +236,11 @@ def run(argv: Sequence[str] | None = None) -> int: _cleanup_dir(args.cleanup_dir) return 1 - backend_pid_file = Path(args.backend_pid_file) - if not _ensure_backend_port_free(args.backend_port, 
backend_pid_file): + if args.prepare_handover: + if not _prepare_upgrade_handover(args): + _cleanup_dir(args.cleanup_dir) + return 1 + elif not _ensure_backend_port_free(args.backend_port): _record_handoff_log(f"backend_port_unavailable port={args.backend_port}") _cleanup_dir(args.cleanup_dir) return 1 @@ -215,6 +254,13 @@ def run(argv: Sequence[str] | None = None) -> int: _cleanup_dir(args.cleanup_dir) return 1 + if not _stop_supervisor_before_restart(): + _record_handoff_log("supervisor_stop_timeout") + if args.prepare_handover: + _rollback_upgrade_handover() + _cleanup_dir(args.cleanup_dir) + return 1 + try: process = subprocess.Popen( restart_argv, @@ -223,16 +269,11 @@ def run(argv: Sequence[str] | None = None) -> int: ) except OSError as exc: _record_handoff_log(f"restart_spawn_failed error={exc}") + if args.prepare_handover: + _rollback_upgrade_handover() _cleanup_dir(args.cleanup_dir) return 1 - _record_backend_runtime_if_direct_serve( - process, - restart_argv, - backend_host=args.backend_host, - backend_port=args.backend_port, - backend_pid_file=backend_pid_file, - ) _record_handoff_log(f"restart_spawned pid={process.pid}") _cleanup_dir(args.cleanup_dir) return 0 diff --git a/flocks/updater/updater.py b/flocks/updater/updater.py index d076af229..5add0c0f7 100644 --- a/flocks/updater/updater.py +++ b/flocks/updater/updater.py @@ -199,17 +199,6 @@ def _looks_like_windows_python_launcher(entry: str) -> bool: return _windows_path_stem(entry) in {"python", "pythonw", "py"} -def _is_windows_file_in_use_error(exc: BaseException) -> bool: - """Return True when *exc* looks like a Windows file-lock failure.""" - if sys.platform != "win32": - return False - if isinstance(exc, OSError) and getattr(exc, "winerror", None) == 32: - return True - - text = str(exc).lower() - return "winerror 32" in text or "used by another process" in text - - def _is_uv_managed_python_runtime_error(text: str) -> bool: """Return True when uv reports a broken managed Python runtime 
cache.""" if not text: @@ -1888,13 +1877,15 @@ def print(self, *args, **kwargs) -> None: def _current_service_config(): from flocks.cli import service_manager + from flocks.cli.service_config import service_config_from_status_payload + from flocks.cli.service_control import read_supervisor_status - paths = service_manager.ensure_runtime_dirs() - return service_manager.ServiceConfig( - backend_host=service_manager._recorded_host(paths.backend_pid, service_manager.ServiceConfig.backend_host), - backend_port=service_manager._recorded_port(paths.backend_pid, service_manager.ServiceConfig.backend_port), - frontend_host=service_manager._recorded_host(paths.frontend_pid, service_manager.ServiceConfig.frontend_host), - frontend_port=service_manager._recorded_port(paths.frontend_pid, service_manager.ServiceConfig.frontend_port), + try: + status = read_supervisor_status(paths=service_manager.runtime_paths(), timeout=1.0) + except Exception as exc: + raise RuntimeError("Supervisor control API is unavailable; cannot perform managed upgrade restart.") from exc + return service_config_from_status_payload( + status.raw, no_browser=True, skip_frontend_build=True, ) @@ -2035,7 +2026,11 @@ def _stop_upgrade_page_server(*, frontend_port: int | None = None) -> None: from flocks.cli import service_manager - remaining = service_manager.port_owner_pids(frontend_port) + remaining = [ + pid + for pid in service_manager.port_owner_pids(frontend_port) + if _looks_like_upgrade_page_process(pid) + ] if remaining: log.info( "updater.upgrade_page.port_fallback_kill", @@ -2057,7 +2052,7 @@ def _stop_upgrade_page_server(*, frontend_port: int | None = None) -> None: wait_attempts = 40 wait_interval = 0.25 for _ in range(wait_attempts): - if not service_manager.port_owner_pids(frontend_port): + if not any(_looks_like_upgrade_page_process(pid) for pid in service_manager.port_owner_pids(frontend_port)): return time.sleep(wait_interval) return @@ -2066,8 +2061,23 @@ def _stop_upgrade_page_server(*, 
frontend_port: int | None = None) -> None: time.sleep(0.3) +def _looks_like_upgrade_page_process(pid: int) -> bool: + """Return True only for the temporary upgrade-page http.server process.""" + try: + from flocks.cli import service_manager + + command_line = service_manager._process_command_line(pid).lower() + except Exception: + return False + if not command_line: + return False + page_dir = str(_upgrade_page_dir()).lower() + return "http.server" in command_line and "upgrade-page" in command_line and page_dir in command_line + + def _prepare_upgrade_handover(version: str) -> dict[str, Any]: from flocks.cli import service_manager + from flocks.cli.service_control import request_prepare_upgrade config = _current_service_config() payload: dict[str, Any] = { @@ -2082,9 +2092,8 @@ def _prepare_upgrade_handover(version: str) -> dict[str, Any]: _persist_upgrade_state(payload, last_error=None) console = _NullConsole() - paths = service_manager.ensure_runtime_dirs() - frontend_port = service_manager._recorded_port(paths.frontend_pid, config.frontend_port) - service_manager.stop_one(frontend_port, paths.frontend_pid, "WebUI", console) + paths = service_manager.runtime_paths() + request_prepare_upgrade(paths=paths, timeout=30.0) try: payload.update(_start_upgrade_page_server(config, version)) @@ -2097,7 +2106,7 @@ def _prepare_upgrade_handover(version: str) -> dict[str, Any]: _stop_upgrade_page_server(frontend_port=config.frontend_port) _clear_upgrade_state() try: - service_manager.start_frontend(config, console) + _start_frontend_with_fallback(config, console, allow_build_fallback=False) except Exception as restart_error: log.error("updater.frontend.restore_failed", {"error": str(restart_error)}) raise @@ -2105,26 +2114,56 @@ def _prepare_upgrade_handover(version: str) -> dict[str, Any]: return payload +def _spawn_restart_handoff(command: list[str], *, cwd: Path) -> subprocess.Popen: + creationflags = 0 + kwargs: dict[str, object] = {} + if sys.platform == "win32": + 
creationflags = getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0) | getattr(subprocess, "CREATE_NO_WINDOW", 0) + startupinfo_cls = getattr(subprocess, "STARTUPINFO", None) + if startupinfo_cls is not None: + startupinfo = startupinfo_cls() + startupinfo.dwFlags |= getattr(subprocess, "STARTF_USESHOWWINDOW", 0) + startupinfo.wShowWindow = getattr(subprocess, "SW_HIDE", 0) + kwargs["startupinfo"] = startupinfo + else: + kwargs["start_new_session"] = True + return subprocess.Popen(command, cwd=cwd, close_fds=True, creationflags=creationflags, **kwargs) + + def _service_config_from_payload( payload: dict[str, Any], *, skip_frontend_build: bool | None = None, ): - from flocks.cli import service_manager + from flocks.cli.service_config import ServiceConfig, service_config_from_payload resolved_skip_frontend_build = ( bool(payload.get("skip_frontend_build", True)) if skip_frontend_build is None else skip_frontend_build ) - return service_manager.ServiceConfig( - backend_host=str(payload.get("backend_host") or service_manager.ServiceConfig.backend_host), - backend_port=int(payload.get("backend_port") or service_manager.ServiceConfig.backend_port), - frontend_host=str(payload.get("frontend_host") or service_manager.ServiceConfig.frontend_host), - frontend_port=int(payload.get("frontend_port") or service_manager.ServiceConfig.frontend_port), + migrated_payload = dict(payload) + backend_port = migrated_payload.get("backend_port") + frontend_port = migrated_payload.get("frontend_port") + if isinstance(backend_port, int) and isinstance(frontend_port, int) and backend_port != frontend_port: + migrated_payload["legacy_backend_host"] = migrated_payload.get("backend_host") + migrated_payload["legacy_backend_port"] = backend_port + migrated_payload["backend_host"] = migrated_payload.get("frontend_host") or migrated_payload.get("backend_host") + migrated_payload["backend_port"] = frontend_port + migrated_payload["server_port_migration_hint"] = True + return 
service_config_from_payload( + migrated_payload, + default=ServiceConfig(), no_browser=True, skip_frontend_build=resolved_skip_frontend_build, ) +def _handoff_service_config(): + payload = _read_upgrade_state() + if payload is not None: + return _service_config_from_payload(payload, skip_frontend_build=True) + return _current_service_config() + + def _read_upgrade_server_pid() -> tuple[int | None, bool]: pid_path = _upgrade_server_pid_path() if not pid_path.exists(): @@ -2180,25 +2219,31 @@ def read_upgrade_runtime_state(frontend_port: int | None = None) -> dict[str, An } +def _webui_runtime_ready(state: str) -> bool: + return state in {"healthy", "static"} + + def _start_frontend_with_fallback(config, console, *, allow_build_fallback: bool) -> None: - from flocks.cli import service_manager + from flocks.cli.service_config import with_frontend_build + from flocks.cli.service_control import request_restart_webui, request_resume_upgrade try: - service_manager.start_frontend(config, console) + status = request_resume_upgrade( + config, + paths=None, + timeout=180.0, + ) + if not _webui_runtime_ready(status.webui.state): + raise RuntimeError(status.webui.last_error or "WebUI restart did not become healthy") return except Exception: if not allow_build_fallback or not config.skip_frontend_build: raise - rebuilt_config = service_manager.ServiceConfig( - backend_host=config.backend_host, - backend_port=config.backend_port, - frontend_host=config.frontend_host, - frontend_port=config.frontend_port, - no_browser=config.no_browser, - skip_frontend_build=False, - ) - service_manager.start_frontend(rebuilt_config, console) + rebuilt_config = with_frontend_build(config, skip_frontend_build=False) + result = request_restart_webui(rebuilt_config, force_frontend_build=True, paths=None, timeout=180.0) + if not _webui_runtime_ready(result.webui.state): + raise RuntimeError(result.webui.last_error or "WebUI restart did not become healthy") def cleanup_orphan_upgrade_state(*, 
frontend_port: int | None = None) -> bool: @@ -2914,7 +2959,6 @@ async def perform_update( current_version = get_current_version() effective_update_version = current_version skip_core_replace = False - handover_active = False console_manifest_info: ConsoleManifestRelease | None = None console_manifest_payload = console_manifest_payload if isinstance(console_manifest_payload, dict) else None fmt = _choose_archive_format(ucfg.archive_format) @@ -3106,20 +3150,7 @@ async def _queue_download_progress(progress: UpdateProgress) -> None: ) async def _restore_after_apply_failure() -> None: - nonlocal handover_active if backup_path is None: - if handover_active: - await asyncio.to_thread(rollback_upgrade_handover) - handover_active = False - return - if handover_active: - await asyncio.to_thread( - _rollback_failed_update, - backup_path, - install_root, - current_version, - ) - handover_active = False return await asyncio.to_thread( _restore_backup_if_possible, @@ -3137,28 +3168,6 @@ async def _restore_after_apply_failure() -> None: ) except Exception as exc: final_replace_error: Exception | None = exc - if ( - sys.platform == "win32" - and restart - and needs_handover - and not handover_active - and _is_windows_file_in_use_error(exc) - ): - log.warning("updater.replace.locked_retry_with_handover", {"error": str(exc)}) - try: - _prepare_upgrade_handover(latest_tag) - handover_active = True - if not skip_core_replace: - await asyncio.to_thread( - _replace_install_dir, - content_root, - install_root, - ) - except Exception as retry_exc: - final_replace_error = retry_exc - else: - final_replace_error = None - if final_replace_error is not None: shutil.rmtree(tmp_dir, ignore_errors=True) await _restore_after_apply_failure() @@ -3255,12 +3264,6 @@ async def _restore_after_apply_failure() -> None: restart_argv = _build_restart_argv(install_root) except Exception as exc: log.error("updater.restart.build_argv_failed", {"error": str(exc)}) - if handover_active: - try: - 
rollback_upgrade_handover() - except Exception: - pass - handover_active = False yield UpdateProgress( stage="error", message=f"Failed to build restart command: {exc}", @@ -3268,20 +3271,6 @@ async def _restore_after_apply_failure() -> None: ) return - if needs_handover and not handover_active: - try: - _prepare_upgrade_handover(latest_tag) - handover_active = True - except Exception as exc: - log.error("updater.handover.failed", {"error": str(exc)}) - await _restore_after_apply_failure() - yield UpdateProgress( - stage="error", - message=f"Failed to prepare WebUI handover: {exc}", - success=False, - ) - return - try: handoff_argv = _build_restart_handoff_argv( restart_argv, @@ -3297,6 +3286,7 @@ async def _restore_after_apply_failure() -> None: pro_bundle_manifest_path=pro_bundle_manifest_path, bundle_sha256=bundle_sha256, cleanup_dir=tmp_dir, + prepare_handover=needs_handover, ) log.info( "updater.restart.handoff_spawn", @@ -3305,21 +3295,11 @@ async def _restore_after_apply_failure() -> None: "restart_argv": restart_argv, }, ) - subprocess.Popen( - handoff_argv, - cwd=install_root, - close_fds=True, - ) + _spawn_restart_handoff(handoff_argv, cwd=install_root) os._exit(0) except Exception as exc: log.error("updater.restart.handoff_spawn_failed", {"error": str(exc)}) shutil.rmtree(tmp_dir, ignore_errors=True) - if handover_active: - try: - rollback_upgrade_handover() - except Exception: - pass - handover_active = False yield UpdateProgress( stage="error", message=f"Failed to restart service: {exc}", @@ -3440,15 +3420,29 @@ def _build_restart_handoff_argv( pro_bundle_manifest_path: Path | None = None, bundle_sha256: str | None = None, cleanup_dir: Path | None = None, + prepare_handover: bool = False, ) -> list[str]: """Wrap the real restart command in a helper that finishes upgrade work.""" - from flocks.cli import service_manager - if not restart_argv: raise ValueError("restart command is empty") - config = _current_service_config() - paths = 
service_manager.ensure_runtime_dirs() + config = _handoff_service_config() + managed_restart_argv = [ + restart_argv[0], + "-m", + "flocks.cli.main", + "start", + "--no-browser", + "--skip-webui-build", + "--host", + str(config.backend_host), + "--port", + str(config.backend_port), + ] + if config.legacy_backend_host is not None: + managed_restart_argv.extend(["--server-host", str(config.legacy_backend_host)]) + if config.legacy_backend_port is not None: + managed_restart_argv.extend(["--server-port", str(config.legacy_backend_port)]) argv = [ restart_argv[0], "-m", @@ -3463,8 +3457,6 @@ def _build_restart_handoff_argv( str(config.frontend_host), "--frontend-port", str(config.frontend_port), - "--backend-pid-file", - str(paths.backend_pid), "--install-root", str(install_root), "--uv-path", @@ -3490,7 +3482,9 @@ def _build_restart_handoff_argv( argv.extend(["--bundle-sha256", bundle_sha256]) if cleanup_dir is not None: argv.extend(["--cleanup-dir", str(cleanup_dir)]) - argv.extend(["--", *restart_argv]) + if prepare_handover: + argv.append("--prepare-handover") + argv.extend(["--", *managed_restart_argv]) return argv diff --git a/scripts/install.ps1 b/scripts/install.ps1 index aac4a1b4b..8dffbb00d 100644 --- a/scripts/install.ps1 +++ b/scripts/install.ps1 @@ -1293,6 +1293,19 @@ function Main { finally { Pop-Location } + Write-Info (Get-LocalizedText -English "Building WebUI static assets..." 
-Chinese "正在构建 WebUI 静态资源...") + Push-Location (Join-Path $RootDir "webui") + try { + $null = Invoke-NativeCommandOrFail ` + -Description "WebUI static asset build" ` + -FilePath "npm.cmd" ` + -ArgumentList @("run", "build") ` + -WorkingDirectory (Join-Path $RootDir "webui") ` + -StreamOutput + } + finally { + Pop-Location + } if ($InstallTui) { Install-Bun diff --git a/scripts/install.sh b/scripts/install.sh index 72246984b..35780c3dc 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1150,6 +1150,15 @@ main() { cd "$ROOT_DIR/webui" npm_config_registry="$NPM_REGISTRY" "$NPM_CMD" install ) + if is_zh_install; then + info "正在构建 WebUI 静态资源..." + else + info "Building WebUI static assets..." + fi + ( + cd "$ROOT_DIR/webui" + "$NPM_CMD" run build + ) if [[ "$INSTALL_TUI" -eq 1 ]]; then install_bun diff --git a/tests/cli/test_doctor_command.py b/tests/cli/test_doctor_command.py index 28ef05c95..4220514cd 100644 --- a/tests/cli/test_doctor_command.py +++ b/tests/cli/test_doctor_command.py @@ -26,8 +26,8 @@ def fake_run(command, *, cwd, check, env): monkeypatch.setattr( "flocks.cli.service_manager.build_status_lines", lambda: [ - "[flocks] 后端运行中: PID=111 URL=http://127.0.0.1:8000", - "[flocks] WebUI 运行中: PID=222 URL=http://127.0.0.1:5173", + "[flocks] daemon: state=running PID=111", + "[flocks] flocks: state=healthy PID=222 URL=http://127.0.0.1:5173", ], ) @@ -37,7 +37,7 @@ def fake_run(command, *, cwd, check, env): assert "Flocks source directory:" in result.stdout assert "scripts/install.sh" in result.stdout assert "安装正常" in result.stdout - assert "服务正常" in result.stdout + assert "运行状态正常" in result.stdout assert len(calls) == 1 command, cwd, check = calls[0] @@ -71,10 +71,25 @@ def fake_run(command, *, cwd, check, env): assert isinstance(env, dict) assert env["FLOCKS_INSTALL_LANGUAGE"] == "zh-CN" assert env["FLOCKS_UV_DEFAULT_INDEX"] == "https://mirrors.aliyun.com/pypi/simple" - assert "服务不正常,请执行 `flocks restart`" in result.stdout + assert "运行状态异常,请执行 `flocks 
restart`" in result.stdout + + +def test_service_status_is_healthy_accepts_current_daemon_status() -> None: + assert doctor_cmd._service_status_is_healthy( + [ + "[flocks] daemon: state=running PID=111", + "[flocks] flocks: state=healthy PID=222 URL=http://127.0.0.1:5173", + ] + ) + assert not doctor_cmd._service_status_is_healthy( + [ + "[flocks] daemon: state=running PID=111", + "[flocks] flocks: state=degraded PID=222 URL=http://127.0.0.1:5173", + ] + ) -def test_service_status_is_healthy_requires_backend_and_webui() -> None: +def test_service_status_is_healthy_accepts_legacy_backend_and_webui() -> None: assert doctor_cmd._service_status_is_healthy( [ "[flocks] 后端运行中: PID=111 URL=http://127.0.0.1:8000", diff --git a/tests/cli/test_service_commands.py b/tests/cli/test_service_commands.py index 99a267c17..d80928bc5 100644 --- a/tests/cli/test_service_commands.py +++ b/tests/cli/test_service_commands.py @@ -31,7 +31,7 @@ def test_cli_help_lists_service_commands(monkeypatch, tmp_path) -> None: assert result.exit_code == 0 for command in ("start", "stop", "restart", "status", "logs", "session", "mcp", "task", "skills"): assert _help_contains_command(result.stdout, command) - for command in ("agent", "acp", "debug", "run", "serve", "auth", "models"): + for command in ("agent", "acp", "debug", "run", "serve", "service-watchdog", "service-daemon", "auth", "models"): assert not _help_contains_command(result.stdout, command) diff --git a/tests/cli/test_service_manager.py b/tests/cli/test_service_manager.py index ba42f890d..cef7885e4 100644 --- a/tests/cli/test_service_manager.py +++ b/tests/cli/test_service_manager.py @@ -1,6 +1,6 @@ -import contextlib import json -import signal +import shutil +import subprocess import sys from pathlib import Path from types import SimpleNamespace @@ -9,6 +9,14 @@ import pytest from flocks.cli import service_manager +from flocks.cli import service_supervisor +from flocks.cli import service_control +from flocks.cli import service_process 
+from tests.helpers.service_supervisor import ( + SleeperProcessAdapter, + make_short_runtime_root, + wait_for_process_exit, +) class DummyConsole: @@ -19,6 +27,61 @@ def print(self, *args, **kwargs) -> None: self.messages.append(" ".join(str(arg) for arg in args)) +@pytest.fixture(autouse=True) +def _skip_backend_webui_dist_check(monkeypatch) -> None: + monkeypatch.setattr(service_manager, "_ensure_webui_dist", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "_resolve_upgrade_runtime", lambda *_args, **_kwargs: {"action": "noop", "error": None}) + + +def _make_runtime_paths(tmp_path: Path) -> service_manager.RuntimePaths: + return service_manager.RuntimePaths( + root=tmp_path, + run_dir=tmp_path / "run", + log_dir=tmp_path / "logs", + backend_pid=tmp_path / "run" / "backend.pid", + frontend_pid=tmp_path / "run" / "webui.pid", + backend_log=tmp_path / "logs" / "backend.log", + frontend_log=tmp_path / "logs" / "webui.log", + ) + + +def _write_legacy_runtime_record(pid_file: Path, record: service_manager.RuntimeRecord) -> None: + payload: dict[str, object] = {"pid": record.pid} + if record.pgid is not None: + payload["pgid"] = record.pgid + if record.host is not None: + payload["host"] = record.host + if record.port is not None: + payload["port"] = record.port + if record.command: + payload["command"] = list(record.command) + if record.started_at is not None: + payload["started_at"] = record.started_at + pid_file.write_text(json.dumps(payload, ensure_ascii=True, sort_keys=True), encoding="utf-8") + + +def test_supervisor_uses_tcp_control_when_af_unix_is_unavailable(monkeypatch) -> None: + monkeypatch.setattr(service_control.sys, "platform", "linux") + monkeypatch.delattr(service_control.socket, "AF_UNIX", raising=False) + + assert service_control.supervisor_uses_tcp_control() is True + + +def test_service_supervisor_imports_when_af_unix_is_unavailable() -> None: + code = "\n".join( + [ + "import socket", + "if hasattr(socket, 'AF_UNIX'):", + " 
delattr(socket, 'AF_UNIX')", + "import flocks.cli.service_supervisor", + ] + ) + + completed = subprocess.run([sys.executable, "-c", code], capture_output=True, text=True, check=False) + + assert completed.returncode == 0, completed.stderr + + def test_runtime_paths_follow_flocks_root_env(monkeypatch, tmp_path: Path) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path)) @@ -241,7 +304,7 @@ def test_runtime_record_round_trip_preserves_metadata(tmp_path: Path) -> None: started_at=1234.5, ) - service_manager.write_runtime_record(pid_file, record) + _write_legacy_runtime_record(pid_file, record) assert json.loads(pid_file.read_text(encoding="utf-8")) == { "command": ["python", "-m", "uvicorn"], @@ -264,7 +327,7 @@ def test_runtime_record_round_trip_preserves_host(tmp_path: Path) -> None: started_at=1234.5, ) - service_manager.write_runtime_record(pid_file, record) + _write_legacy_runtime_record(pid_file, record) assert json.loads(pid_file.read_text(encoding="utf-8")) == { "command": ["python", "-m", "uvicorn"], @@ -286,7 +349,7 @@ def test_read_runtime_record_rejects_invalid_content(tmp_path: Path) -> None: def test_cleanup_stale_pid_file_keeps_live_process_group(monkeypatch, tmp_path: Path) -> None: pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( + _write_legacy_runtime_record( pid_file, service_manager.RuntimeRecord(pid=1001, pgid=2002, port=8000), ) @@ -301,7 +364,7 @@ def test_cleanup_stale_pid_file_keeps_live_process_group(monkeypatch, tmp_path: def test_cleanup_stale_pid_file_removes_reused_windows_pid(monkeypatch, tmp_path: Path) -> None: pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( + _write_legacy_runtime_record( pid_file, service_manager.RuntimeRecord( pid=1232, @@ -341,8 +404,95 @@ def test_selected_log_paths_support_specific_targets(tmp_path: Path) -> None: ) assert service_manager.selected_log_paths(paths, backend=True) == [paths.backend_log] - assert service_manager.selected_log_paths(paths, 
webui=True) == [paths.frontend_log] - assert service_manager.selected_log_paths(paths) == [paths.backend_log, paths.frontend_log] + assert service_manager.selected_log_paths(paths, webui=True) == [paths.backend_log] + assert service_manager.selected_log_paths(paths) == [paths.backend_log] + + +def test_show_logs_falls_back_to_local_files_when_daemon_unavailable(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + paths.log_dir.mkdir(parents=True) + paths.backend_log.write_text("backend-one\nbackend-two\n", encoding="utf-8") + paths.frontend_log.write_text("webui-one\n", encoding="utf-8") + (paths.log_dir / "daemon.log").write_text("daemon-one\n", encoding="utf-8") + console = DummyConsole() + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr( + service_manager, + "read_logs", + lambda **_kwargs: (_ for _ in ()).throw(service_manager.ServiceError("down")), + ) + + service_manager.show_logs(console, follow=False, lines=1) + + assert any("改为读取本地日志文件" in message for message in console.messages) + assert "[flocks] backend-two" in console.messages + assert "[daemon] daemon-one" in console.messages + + +def test_daemon_log_service_name_uses_daemon_only(tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) + daemon.paths = paths + + assert daemon._log_paths_for_service("daemon") == [("daemon", paths.log_dir / "daemon.log")] + assert daemon._log_paths_for_service("supervisor") == [] + + +def test_daemon_log_event_prefix_uses_daemon(capsys) -> None: + service_supervisor._daemon_log("stopped") + + assert "daemon.stopped" in capsys.readouterr().out + + +@pytest.mark.parametrize("disconnect_error", [BrokenPipeError, ConnectionResetError, ConnectionAbortedError]) +def test_supervisor_control_send_json_ignores_disconnected_client(disconnect_error: type[Exception]) -> None: + daemon = 
service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) + handler_class = daemon._handler_class() + handler = handler_class.__new__(handler_class) + calls: list[tuple[str, object]] = [] + + handler.send_response = lambda status: calls.append(("status", status)) + handler.send_header = lambda name, value: calls.append((name, value)) + handler.end_headers = lambda: calls.append(("end_headers", None)) + handler.wfile = SimpleNamespace(write=lambda _body: (_ for _ in ()).throw(disconnect_error())) + + handler._send_json({"ok": True}) + + assert calls[0] == ("status", 200) + + +def test_supervisor_control_get_ignores_logs_client_disconnect() -> None: + daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) + handler_class = daemon._handler_class() + handler = handler_class.__new__(handler_class) + sent: list[dict[str, object]] = [] + + daemon.handle_logs_request = lambda *_args, **_kwargs: (_ for _ in ()).throw(BrokenPipeError()) + handler.path = "/logs?service=daemon" + handler._send_json = lambda payload, **_kwargs: sent.append(payload) + + handler.do_GET() + + assert sent == [] + + +def test_open_default_browser_uses_windows_startfile(monkeypatch) -> None: + opened: list[str] = [] + console = DummyConsole() + + monkeypatch.setattr(service_manager.sys, "platform", "win32") + monkeypatch.setattr(service_manager.os, "startfile", lambda url: opened.append(url), raising=False) + monkeypatch.setattr( + service_manager.webbrowser, + "open", + lambda _url: (_ for _ in ()).throw(AssertionError("webbrowser should not be used on Windows when startfile exists")), + ) + + service_manager.open_default_browser("http://127.0.0.1:5173", console) + + assert opened == ["http://127.0.0.1:5173"] + assert console.messages == ["[flocks] 浏览器已打开: http://127.0.0.1:5173"] def test_tail_lines_returns_recent_content(tmp_path: Path) -> None: @@ -673,158 +823,380 @@ def test_resolve_flocks_cli_command_falls_back_to_python_module(monkeypatch, tmp ] -def 
test_build_status_lines_reports_running_and_idle_services(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", +def _supervisor_status_payload() -> dict[str, object]: + return { + "daemon": { + "pid": 100, + "state": "running", + "log_path": "/tmp/logs/daemon.log", + }, + "backend": { + "pid": 111, + "host": "0.0.0.0", + "port": 9000, + "state": "healthy", + "last_error": None, + "log_path": "/tmp/logs/backend.log", + }, + "webui": { + "host": "0.0.0.0", + "port": 9000, + "state": "static", + "last_error": None, + "log_path": "/tmp/logs/backend.log", + }, + } + + +def _supervisor_status(payload: dict[str, object] | None = None) -> service_control.SupervisorStatus: + return service_control.parse_supervisor_status(payload or _supervisor_status_payload()) + + +def test_build_status_lines_reports_supervisor_control_status(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status()) + + lines = service_manager.build_status_lines(paths) + + assert lines[0] == "[flocks] 服务" + assert lines[1] == "[flocks] daemon: state=running PID=100" + assert "http://127.0.0.1:9000" in lines[2] + assert lines[5] == "[flocks] daemon: /tmp/logs/daemon.log" + assert lines[6] == "[flocks] flocks: /tmp/logs/backend.log" + + +def test_startup_status_lines_use_progress_summary() -> None: + lines = service_manager._startup_status_lines_from_payload(_supervisor_status_payload()) + + assert lines[:2] == [ + "[flocks] Flocks daemon 已启动。", + "[flocks] Flocks service 已启动。", + ] + assert lines[4] == "[flocks] daemon: state=running PID=100" + assert lines[5] == "[flocks] flocks: 
state=healthy PID=111 URL=http://127.0.0.1:9000" + assert lines[8] == "[flocks] daemon: /tmp/logs/daemon.log" + assert lines[9] == "[flocks] flocks: /tmp/logs/backend.log" + + +def test_startup_status_lines_mark_unhealthy_steps() -> None: + payload = _supervisor_status_payload() + payload["backend"]["state"] = "degraded" + payload["backend"]["last_error"] = "port occupied" + + lines = service_manager._startup_status_lines_from_payload(payload) + + assert lines[1] == "[flocks] Flocks service 启动异常。" + assert lines[5] == "[flocks] flocks: state=degraded PID=111 URL=http://127.0.0.1:9000 last_error=port occupied" + + +def test_startup_status_lines_can_skip_daemon_step() -> None: + lines = service_manager._startup_status_lines_from_payload( + _supervisor_status_payload(), + include_daemon_step=False, ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - paths.backend_pid.write_text("111", encoding="utf-8") - paths.frontend_pid.write_text("222", encoding="utf-8") - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _: None) + assert lines[:1] == ["[flocks] Flocks service 已启动。"] + + +def test_build_status_lines_reports_daemon_down_without_port_scans(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + calls: list[str] = [] + monkeypatch.setattr( service_manager, - "port_owner_pids", - lambda port: [111] if port == 8000 else [], + "read_supervisor_status", + lambda *_args, **_kwargs: (_ for _ in ()).throw(service_manager.ServiceError("down")), ) - monkeypatch.setattr(service_manager, "pid_is_running", lambda pid: pid == 222) + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: calls.append("port_owner") or []) + monkeypatch.setattr(service_manager, "port_is_in_use", lambda *_args, **_kwargs: calls.append("port_in_use") or False) + monkeypatch.setattr(service_manager, "trusted_daemon_process_pids", lambda **_kwargs: []) lines = service_manager.build_status_lines(paths) - assert "后端运行中" in 
lines[0] - assert "WebUI 主进程仍在运行" in lines[1] + assert lines[0] == "[flocks] Flocks daemon 未运行" + assert calls == [] -def test_build_status_lines_uses_custom_server_and_webui_ports(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - service_manager.write_runtime_record( - paths.backend_pid, - service_manager.RuntimeRecord(pid=111, host="0.0.0.0", port=9000), - ) - service_manager.write_runtime_record( - paths.frontend_pid, - service_manager.RuntimeRecord(pid=222, host="0.0.0.0", port=5174), - ) +def test_build_status_lines_reports_residual_daemon_when_control_api_is_down(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _: None) monkeypatch.setattr( service_manager, - "port_owner_pids", - lambda port: [111] if port in {9000, 5174} else [], + "read_supervisor_status", + lambda *_args, **_kwargs: (_ for _ in ()).throw(service_manager.ServiceError("down")), ) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) + monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) + monkeypatch.setattr(service_manager, "trusted_daemon_process_pids", lambda **_kwargs: [52058]) lines = service_manager.build_status_lines(paths) - assert "http://127.0.0.1:9000" in lines[0] - assert "http://127.0.0.1:5174" in lines[1] + assert lines == [ + "[flocks] Flocks daemon control API 未运行", + "[flocks] 检测到残留 daemon 进程: PID=52058", + f"[flocks] 日志: {paths.log_dir / 'daemon.log'}", + "[flocks] 可执行 `flocks stop` 清理残留进程。", + ] -def test_start_all_stops_services_before_starting(monkeypatch) 
-> None: +def test_start_all_starts_supervisor_when_control_api_is_down(monkeypatch) -> None: call_order: list[str] = [] - paths = service_manager.RuntimePaths( - root=Path("/tmp"), - run_dir=Path("/tmp/run"), - log_dir=Path("/tmp/logs"), - backend_pid=Path("/tmp/run/backend.pid"), - frontend_pid=Path("/tmp/run/webui.pid"), - backend_log=Path("/tmp/logs/backend.log"), - frontend_log=Path("/tmp/logs/webui.log"), - ) + paths = _make_runtime_paths(Path("/tmp/flocks-test")) monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: (call_order.append("ensure_runtime_dirs"), paths)[1]) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call(call_order, "service_lock")) - monkeypatch.setattr(service_manager, "stop_one", lambda port, _pid_file, _name, _console: call_order.append(f"stop_one:{port}")) - monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: call_order.append("stop_browser") or []) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: False) monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda _config, _console: call_order.append("_start_all_without_stop")) service_manager.start_all(service_manager.ServiceConfig(), console=None) - assert call_order == [ - "ensure_runtime_dirs", - "service_lock", - "stop_one:5173", - "stop_one:8000", - "stop_browser", - "_start_all_without_stop", - ] + assert call_order == ["ensure_runtime_dirs", "_start_all_without_stop"] -def test_restart_all_stops_then_starts_under_lock(monkeypatch) -> None: - call_order: list[str] = [] - paths = service_manager.RuntimePaths( - root=Path("/tmp"), - run_dir=Path("/tmp/run"), - log_dir=Path("/tmp/logs"), - backend_pid=Path("/tmp/run/backend.pid"), - frontend_pid=Path("/tmp/run/webui.pid"), - backend_log=Path("/tmp/logs/backend.log"), - frontend_log=Path("/tmp/logs/webui.log"), +def test_start_all_resolves_upgrade_runtime_before_supervisor_status(monkeypatch) -> None: + events: list[str] = [] + 
console = DummyConsole() + paths = _make_runtime_paths(Path("/tmp/flocks-test")) + + def resolve_upgrade_runtime(_console, *, frontend_port: int, attempt_recover: bool) -> dict[str, object]: + events.append(f"upgrade:{frontend_port}:{attempt_recover}") + return {"action": "cleaned", "error": None} + + def supervisor_running(_paths) -> bool: + events.append("supervisor") + return False + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "_resolve_upgrade_runtime", resolve_upgrade_runtime) + monkeypatch.setattr(service_manager, "supervisor_is_running", supervisor_running) + monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda _config, _console: events.append("start")) + + service_manager.start_all(service_manager.ServiceConfig(frontend_port=5173), console) + + assert events == ["upgrade:5173:False", "supervisor", "start"] + + +def test_start_all_does_not_duplicate_running_supervisor(monkeypatch) -> None: + calls: list[str] = [] + console = DummyConsole() + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: _make_runtime_paths(Path("/tmp/flocks-test"))) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) + monkeypatch.setattr(service_manager, "show_status", lambda _console: calls.append("status")) + monkeypatch.setattr(service_manager, "open_default_browser", lambda _url, _console: calls.append("browser")) + monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda *_args: calls.append("start")) + + service_manager.start_all(service_manager.ServiceConfig(no_browser=True), console=console) + + assert calls == ["status"] + assert "[flocks] Flocks daemon 已在运行。" in console.messages + + +def test_start_all_restarts_paused_supervisor(monkeypatch) -> None: + calls: list[str] = [] + console = DummyConsole() + paths = _make_runtime_paths(Path("/tmp/flocks-test")) + paused_payload = _supervisor_status_payload() + 
paused_payload["backend"].update({ + "pid": None, + "state": "paused", + "health": "paused", + "paused": True, + "last_error": "control upgrade prepare", + }) + paused_payload["webui"].update({ + "state": "paused", + "health": "paused", + "paused": True, + "last_error": "control upgrade prepare", + }) + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status(paused_payload)) + monkeypatch.setattr(service_manager, "_stop_all_unlocked", lambda _console, **_kwargs: calls.append("stop")) + monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda _config, _console: calls.append("start")) + + service_manager.start_all(service_manager.ServiceConfig(), console) + + assert calls == ["stop", "start"] + assert "[flocks] Flocks daemon 已在运行,但 Flocks service 处于暂停状态,正在重新启动..." 
in console.messages + + +def test_start_all_does_not_open_browser_when_restarted_service_remains_unhealthy(monkeypatch) -> None: + calls: list[str] = [] + console = DummyConsole() + paths = _make_runtime_paths(Path("/tmp/flocks-test")) + degraded_payload = _supervisor_status_payload() + degraded_payload["backend"].update({ + "state": "degraded", + "health": "degraded", + "last_error": "port unavailable", + }) + degraded_status = _supervisor_status(degraded_payload) + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: degraded_status) + monkeypatch.setattr( + service_manager, + "request_restart", + lambda _config, **_kwargs: calls.append("restart") or degraded_status, ) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda *_args, **_kwargs: calls.append("status")) + monkeypatch.setattr(service_manager, "open_default_browser", lambda *_args, **_kwargs: calls.append("browser")) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: (call_order.append("ensure_runtime_dirs"), paths)[1]) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call(call_order, "service_lock")) - monkeypatch.setattr(service_manager, "stop_one", lambda port, _pid_file, _name, _console: call_order.append(f"stop_one:{port}")) - monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: call_order.append("stop_browser") or []) - monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda _config, _console: call_order.append("_start_all_without_stop")) + with pytest.raises(service_manager.ServiceError, match="Flocks service 启动失败"): + service_manager.start_all(service_manager.ServiceConfig(), console) + + assert calls == ["restart", "status"] + assert "[flocks] Flocks daemon 已在运行,但 Flocks service 不可用,正在重启..." 
in console.messages + + +def test_start_all_restarts_running_daemon_when_config_changes(monkeypatch) -> None: + calls: list[str] = [] + console = DummyConsole() + paths = _make_runtime_paths(Path("/tmp/flocks-test")) + payload = _supervisor_status_payload() + payload["config"] = { + "backend_host": "127.0.0.1", + "backend_port": 8000, + "frontend_host": "127.0.0.1", + "frontend_port": 5173, + } + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: True) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status(payload)) + monkeypatch.setattr(service_manager, "_stop_all_unlocked", lambda _console, **_kwargs: calls.append("stop")) + monkeypatch.setattr(service_manager, "_start_all_without_stop", lambda _config, _console: calls.append("start")) + + service_manager.start_all( + service_manager.ServiceConfig( + backend_host="0.0.0.0", + backend_port=9000, + frontend_host="0.0.0.0", + frontend_port=5273, + no_browser=True, + ), + console, + ) + + assert calls == ["stop", "start"] + assert "[flocks] Flocks daemon 已在运行,但配置已变化,正在按新配置重启..." 
in console.messages + + +def test_restart_all_stops_then_starts_daemon(monkeypatch) -> None: + call_order: list[str] = [] + paths = _make_runtime_paths(Path("/tmp/flocks-test")) + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "_stop_all_unlocked", lambda _console, **_kwargs: call_order.append("stop")) + monkeypatch.setattr(service_manager, "_start_all_unlocked", lambda _config, _console, **_kwargs: call_order.append("start")) service_manager.restart_all(service_manager.ServiceConfig(), console=None) - assert call_order == [ - "ensure_runtime_dirs", - "service_lock", - "stop_one:5173", - "stop_one:8000", - "stop_browser", - "_start_all_without_stop", + assert call_order == ["stop", "start"] + + +def test_start_all_without_stop_starts_supervisor_daemon(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + calls: list[str] = [] + console = DummyConsole() + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "_start_supervisor_process", lambda _config, _paths, _console: calls.append("daemon") or SimpleNamespace(poll=lambda: None)) + monkeypatch.setattr(service_manager, "_wait_for_supervisor_ready", lambda _paths, **_kwargs: calls.append("ready") or _supervisor_status_payload()) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console, **_kwargs: calls.append("status")) + monkeypatch.setattr( + service_manager, + "open_default_browser", + lambda _url, _console: calls.append("browser"), + ) + + service_manager._start_all_without_stop(service_manager.ServiceConfig(no_browser=True), console) + + assert calls == ["daemon", "ready", "status"] + assert console.messages == [ + 
"[flocks] Flocks daemon 启动中...", + "[flocks] Flocks daemon 已启动。", ] -def test_start_all_stops_on_failure_before_restart(monkeypatch) -> None: - paths = service_manager.RuntimePaths( - root=Path("/tmp"), - run_dir=Path("/tmp/run"), - log_dir=Path("/tmp/logs"), - backend_pid=Path("/tmp/run/backend.pid"), - frontend_pid=Path("/tmp/run/webui.pid"), - backend_log=Path("/tmp/logs/backend.log"), - frontend_log=Path("/tmp/logs/webui.log"), +def test_start_all_without_stop_raises_when_service_starts_degraded(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + calls: list[str] = [] + console = DummyConsole() + degraded_payload = _supervisor_status_payload() + degraded_payload["backend"].update({ + "state": "degraded", + "health": "degraded", + "last_error": "port unavailable", + }) + + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda *_args, **_kwargs: None) + monkeypatch.setattr( + service_manager, + "_start_supervisor_process", + lambda _config, _paths, _console: calls.append("daemon") or SimpleNamespace(poll=lambda: None), + ) + monkeypatch.setattr( + service_manager, + "_wait_for_supervisor_ready", + lambda _paths, **_kwargs: calls.append("ready") or degraded_payload, ) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda _payload, _console, **_kwargs: calls.append("status")) + monkeypatch.setattr(service_manager, "open_default_browser", lambda *_args, **_kwargs: calls.append("browser")) + + with pytest.raises(service_manager.ServiceError, match="Flocks service 启动失败"): + service_manager._start_all_without_stop(service_manager.ServiceConfig(), console) + + assert calls == ["daemon", "ready", "status"] + + +def test_start_all_without_stop_prints_before_cleanup(monkeypatch, tmp_path: Path) -> None: + paths = 
_make_runtime_paths(tmp_path) + events: list[str] = [] + console = DummyConsole() + + def record_print(message: str) -> None: + events.append(f"print:{message}") + console.messages.append(message) + + console.print = record_print monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda *_args, **_kwargs: events.append("legacy")) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda *_args, **_kwargs: events.append("orphan")) + monkeypatch.setattr(service_manager, "_ensure_webui_dist", lambda *_args, **_kwargs: events.append("dist")) monkeypatch.setattr( service_manager, - "stop_one", - lambda *_args: (_ for _ in ()).throw(service_manager.ServiceError("stop failed")), + "_start_supervisor_process", + lambda _config, _paths, _console: events.append("daemon") or SimpleNamespace(poll=lambda: None), ) + monkeypatch.setattr(service_manager, "_wait_for_supervisor_ready", lambda _paths, **_kwargs: _supervisor_status_payload()) + monkeypatch.setattr(service_manager, "_print_status_payload", lambda *_args, **_kwargs: None) + + service_manager._start_all_without_stop(service_manager.ServiceConfig(no_browser=True), console) + + assert events[:5] == ["print:[flocks] Flocks daemon 启动中...", "legacy", "orphan", "dist", "daemon"] + + +def test_start_all_propagates_supervisor_start_failure(monkeypatch) -> None: + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: _make_runtime_paths(Path("/tmp/flocks-test"))) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: False) monkeypatch.setattr( service_manager, "_start_all_without_stop", - lambda *_args: (_ for _ in ()).throw(AssertionError("should not start")), + lambda *_args: (_ for _ in ()).throw(service_manager.ServiceError("daemon failed")), ) - with 
pytest.raises(service_manager.ServiceError, match="stop failed"): + with pytest.raises(service_manager.ServiceError, match="daemon failed"): service_manager.start_all(service_manager.ServiceConfig(), console=None) -def test_start_backend_writes_runtime_metadata(monkeypatch, tmp_path: Path) -> None: +def test_start_backend_process_does_not_write_runtime_metadata(monkeypatch, tmp_path: Path) -> None: paths = service_manager.RuntimePaths( root=tmp_path, run_dir=tmp_path / "run", @@ -869,26 +1241,12 @@ def _capture_spawn(*_args, **kwargs) -> SimpleNamespace: monkeypatch.setattr(service_manager, "_spawn_process", _capture_spawn) - service_manager.start_backend(service_manager.ServiceConfig(), console) + process = service_manager._start_backend_process(service_manager.ServiceConfig(), console) - record = service_manager.read_runtime_record(paths.backend_pid) - assert record is not None - assert record.pid == 2468 - assert record.pgid == 2468 - assert record.host == "127.0.0.1" - assert record.port == 8000 - assert record.command == ( - "python", - "-m", - "flocks.cli.main", - "serve", - "--host", - "127.0.0.1", - "--port", - "8000", - ) + assert process.pid == 2468 + assert not paths.backend_pid.exists() assert probe_calls == [{ - "urls": ["http://127.0.0.1:8000"], + "urls": ["http://127.0.0.1:5173"], "name": "后端服务", "attempts": 30, "delay": 3.0, @@ -896,6 +1254,7 @@ def _capture_spawn(*_args, **kwargs) -> SimpleNamespace: }] assert spawn_env is not None assert spawn_env.get("PYTHONUNBUFFERED") == "1" + assert "[flocks] 启动 Flocks service..." 
not in console.messages def test_start_backend_rolls_back_when_probe_fails(monkeypatch, tmp_path: Path) -> None: @@ -912,7 +1271,7 @@ def test_start_backend_rolls_back_when_probe_fails(monkeypatch, tmp_path: Path) paths.log_dir.mkdir(parents=True) paths.backend_log.write_text("line1\nline2\nboot failed here\n", encoding="utf-8") console = DummyConsole() - stop_calls: list[tuple[int, Path, str]] = [] + stop_calls: list[str] = [] monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) @@ -927,7 +1286,7 @@ def test_start_backend_rolls_back_when_probe_fails(monkeypatch, tmp_path: Path) monkeypatch.setattr( service_manager, "_spawn_process", - lambda *_args, **_kwargs: SimpleNamespace(pid=2468), + lambda *_args, **_kwargs: SimpleNamespace(pid=2468, poll=lambda: None), ) monkeypatch.setattr( service_manager, @@ -936,14 +1295,14 @@ def test_start_backend_rolls_back_when_probe_fails(monkeypatch, tmp_path: Path) ) monkeypatch.setattr( service_manager, - "stop_one", - lambda port, pid_file, name, _console: stop_calls.append((port, pid_file, name)), + "_terminate_process", + lambda _process, name, _console: stop_calls.append(name), ) with pytest.raises(service_manager.ServiceError, match="启动超时"): - service_manager.start_backend(service_manager.ServiceConfig(), console) + service_manager._start_backend_process(service_manager.ServiceConfig(), console) - assert stop_calls == [(8000, paths.backend_pid, "后端")] + assert stop_calls == ["后端"] joined = "\n".join(console.messages) assert "近期日志" in joined assert "boot failed here" in joined @@ -985,15 +1344,12 @@ def test_start_backend_reports_started_after_probe_succeeds(monkeypatch, tmp_pat lambda *_args, **_kwargs: None, ) - service_manager.start_backend(service_manager.ServiceConfig(), console) + service_manager._start_backend_process(service_manager.ServiceConfig(), console) - record = 
service_manager.read_runtime_record(paths.backend_pid) - assert record is not None - assert record.pid == 2468 backend_env = spawn_calls[0]["kwargs"]["env"] assert backend_env["_FLOCKS_WEBUI_HOST"] == "127.0.0.1" assert backend_env["_FLOCKS_WEBUI_PORT"] == "5173" - assert console.messages[-1] == f"[flocks] 后端已启动,日志: {paths.backend_log}" + assert not paths.backend_pid.exists() assert backend_env["FLOCKS_CONSOLE_BASE_URL"] == service_manager.DEFAULT_FLOCKS_CONSOLE_BASE_URL @@ -1034,7 +1390,7 @@ def test_start_backend_allows_overriding_console_base_url(monkeypatch, tmp_path: ) monkeypatch.setenv("FLOCKS_CONSOLE_BASE_URL", "https://custom-console.example.com") - service_manager.start_backend(service_manager.ServiceConfig(), console) + service_manager._start_backend_process(service_manager.ServiceConfig(), console) backend_env = spawn_calls[0]["kwargs"]["env"] assert backend_env["FLOCKS_CONSOLE_BASE_URL"] == "https://custom-console.example.com" @@ -1139,185 +1495,205 @@ def test_build_frontend_env_allows_direct_backend_urls_when_opted_in(monkeypatch assert env["VITE_WS_BASE_URL"] == "ws://10.0.0.8:9000" -def test_start_frontend_passes_backend_urls_to_build_and_preview(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", +def _fake_process(pid: int, args: list[str] | None = None, returncode: int | None = None): + return SimpleNamespace(pid=pid, args=args or [str(pid)], returncode=returncode, poll=lambda: returncode) + + +def test_supervisor_recovers_backend_when_port_disappears(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + calls: list[str] = [] + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + daemon = 
service_supervisor.SupervisorDaemon(service_manager.ServiceConfig(backend_port=9995, frontend_port=9996)) + daemon.paths = paths + daemon.backend.log_path = paths.backend_log + daemon.backend.process = _fake_process(111, ["backend"]) + + monkeypatch.setattr(service_process, "tcp_port_accepts_connections", lambda _host, port: port != 9995) + monkeypatch.setattr(service_manager, "_terminate_process", lambda _process, name, _console: calls.append(f"stop:{name}")) + monkeypatch.setattr( + service_manager, + "_start_backend_process", + lambda *_args, **_kwargs: calls.append("start:backend") or _fake_process(333, ["backend-new"]), ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - console = DummyConsole() - build_calls: list[dict[str, object]] = [] - preview_calls: list[dict[str, object]] = [] - def fake_run(command, **kwargs): - build_calls.append({"command": command, "kwargs": kwargs}) - return SimpleNamespace(returncode=0) + daemon.tick() - def fake_spawn(command, **kwargs): - preview_calls.append({"command": command, "kwargs": kwargs}) - return SimpleNamespace(pid=2468) + assert calls == ["stop:后端", "start:backend"] + assert daemon.backend.pid == 333 - monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) + +def test_supervisor_waits_for_second_backend_health_failure(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + calls: list[str] = [] monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) - monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: pid) - monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: "/usr/bin/npm") - monkeypatch.setattr(service_manager, "node_version_satisfies_requirement", 
lambda: True) - monkeypatch.setattr(service_manager.subprocess, "run", fake_run) - monkeypatch.setattr(service_manager, "_spawn_process", fake_spawn) - monkeypatch.setenv("__VITE_ADDITIONAL_SERVER_ALLOWED_HOSTS", "preview.example.com") + daemon = service_supervisor.SupervisorDaemon( + service_manager.ServiceConfig(backend_port=9995, frontend_port=9996), + failure_threshold=2, + ) + daemon.paths = paths + daemon.backend.process = _fake_process(111, ["backend"]) - config = service_manager.ServiceConfig( - backend_host="10.0.0.8", - backend_port=9000, - frontend_host="0.0.0.0", - frontend_port=5174, - ) - service_manager.start_frontend(config, console) - - assert build_calls[0]["command"] == ["/usr/bin/npm", "run", "build"] - assert build_calls[0]["kwargs"]["env"]["FLOCKS_API_PROXY_TARGET"] == "http://10.0.0.8:9000" - assert build_calls[0]["kwargs"]["env"]["__VITE_ADDITIONAL_SERVER_ALLOWED_HOSTS"] == "preview.example.com" - assert "VITE_API_BASE_URL" not in build_calls[0]["kwargs"]["env"] - assert "VITE_WS_BASE_URL" not in build_calls[0]["kwargs"]["env"] - - assert preview_calls[0]["command"] == [ - "/usr/bin/npm", - "run", - "preview", - "--", - "--host", - "0.0.0.0", - "--port", - "5174", - ] - assert preview_calls[0]["kwargs"]["env"]["FLOCKS_API_PROXY_TARGET"] == "http://10.0.0.8:9000" - assert preview_calls[0]["kwargs"]["env"]["__VITE_ADDITIONAL_SERVER_ALLOWED_HOSTS"] == "preview.example.com" - assert "VITE_API_BASE_URL" not in preview_calls[0]["kwargs"]["env"] - assert "VITE_WS_BASE_URL" not in preview_calls[0]["kwargs"]["env"] - record = service_manager.read_runtime_record(paths.frontend_pid) - assert record is not None - assert record.host == "0.0.0.0" - assert record.port == 5174 + class FakeClient: + def __init__(self, *_args, **_kwargs) -> None: + pass + def __enter__(self): + return self -def test_start_frontend_tolerates_windows_node_assertion_after_build(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - 
run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - webui_dir = tmp_path / "webui" - webui_dist = webui_dir / "dist" - webui_dist.mkdir(parents=True) - console = DummyConsole() - preview_calls: list[list[str]] = [] + def __exit__(self, *_args) -> None: + return None - def fake_run(_command, **_kwargs): - (webui_dist / "index.html").write_text("", encoding="utf-8") - return SimpleNamespace( - returncode=3221226505, - stdout="built in 6.83s", - stderr="Assertion failed: !(handle->flags & UV_HANDLE_CLOSING), file src\\win\\async.c, line 76", - ) + def get(self, _url, **_kwargs): + return httpx.Response(503, json={"status": "unhealthy"}) - def fake_spawn(command, **_kwargs): - preview_calls.append(list(command)) - return SimpleNamespace(pid=2468) + monkeypatch.setattr(service_process.httpx, "Client", FakeClient) + monkeypatch.setattr(service_process, "tcp_port_accepts_connections", lambda *_args: True) + monkeypatch.setattr(service_manager, "_terminate_process", lambda _process, name, _console: calls.append(f"stop:{name}")) + monkeypatch.setattr( + service_manager, + "_start_backend_process", + lambda *_args, **_kwargs: calls.append("start:backend") or _fake_process(333, ["backend-new"]), + ) - monkeypatch.setattr(service_manager.sys, "platform", "win32") - monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) + daemon.tick() + assert calls == [] + assert daemon.backend.state == "degraded" + + daemon.tick() + assert calls == ["stop:后端", "start:backend"] + + +def test_backend_probe_rejects_api_root_when_static_webui_missing(monkeypatch) -> None: + class FakeClient: + def __init__(self, *_args, **_kwargs) -> None: + pass + + def __enter__(self): + return self + + def 
__exit__(self, *_args) -> None: + return None + + def get(self, url, **_kwargs): + if str(url).endswith("/api/health"): + return httpx.Response(200, json={"status": "healthy"}) + return httpx.Response(200, json={"status": "running"}) + + monkeypatch.setattr(service_process, "tcp_port_accepts_connections", lambda *_args: True) + monkeypatch.setattr(service_process.httpx, "Client", FakeClient) + + result = service_process.BackendProcessAdapter().probe(_fake_process(111, ["backend"]), "127.0.0.1", 5173) + + assert result.healthy is False + assert result.reason == "health status=200, root status=200" + + +def test_supervisor_reports_webui_as_static_endpoint(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + calls: list[str] = [] monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) - monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: "npm.cmd") - monkeypatch.setattr(service_manager, "node_version_satisfies_requirement", lambda: True) - monkeypatch.setattr(service_manager.subprocess, "run", fake_run) - monkeypatch.setattr(service_manager, "_spawn_process", fake_spawn) + daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig(backend_port=9995, frontend_port=9996)) + daemon.paths = paths + daemon.backend.process = _fake_process(111, ["backend"]) - service_manager.start_frontend(service_manager.ServiceConfig(), console) + daemon.tick() - assert preview_calls[0][:3] == ["npm.cmd", "run", "preview"] - assert "[flocks] WebUI 构建产物已生成,忽略 Windows Node.js 退出断言。" in console.messages + assert calls == [] + assert daemon.webui.pid is None + assert daemon.webui.state == "static" -def 
test_start_frontend_passes_direct_backend_urls_when_opted_in(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_supervisor_rejects_static_webui_stop_control_api(monkeypatch, tmp_path: Path) -> None: + del tmp_path + short_root = make_short_runtime_root("flocks-supervisor-") + paths = _make_runtime_paths(short_root) paths.run_dir.mkdir(parents=True) paths.log_dir.mkdir(parents=True) - console = DummyConsole() - build_calls: list[dict[str, object]] = [] - preview_calls: list[dict[str, object]] = [] + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + daemon = service_supervisor.SupervisorDaemon(service_manager.ServiceConfig()) + daemon._start_control_server() - def fake_run(command, **kwargs): - build_calls.append({"command": command, "kwargs": kwargs}) - return SimpleNamespace(returncode=0) + try: + with pytest.raises(httpx.HTTPStatusError) as exc_info: + service_control.control_api_request("POST", "/stop/webui", paths=paths) + finally: + daemon._stop_control_server() + shutil.rmtree(short_root, ignore_errors=True) - def fake_spawn(command, **kwargs): - preview_calls.append({"command": command, "kwargs": kwargs}) - return SimpleNamespace(pid=2468) + assert exc_info.value.response.status_code == 409 - monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) + +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_supervisor_upgrade_prepare_control_api_pauses_real_child_restart(monkeypatch, tmp_path: Path) -> None: + del tmp_path + short_root = 
make_short_runtime_root("flocks-supervisor-") + paths = _make_runtime_paths(short_root) + paths.run_dir.mkdir(parents=True) + paths.log_dir.mkdir(parents=True) monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) - monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: pid) - monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: "/usr/bin/npm") + backend_adapter = SleeperProcessAdapter() + daemon = service_supervisor.SupervisorDaemon( + service_manager.ServiceConfig(backend_port=9995, frontend_port=9996), + backend_adapter=backend_adapter, + ) + daemon._start_control_server() + + try: + daemon.restart_all(reason="test startup") + backend_process = daemon.backend.process + assert backend_process is not None + assert daemon.webui.process is None + assert daemon.webui.state == "static" + + status = service_control.request_prepare_upgrade(paths=paths) + + wait_for_process_exit(backend_process) + assert status.backend.paused is True + assert status.webui.paused is True + assert daemon.backend.process is None + assert backend_process.pid in backend_adapter.stopped + + daemon.tick() + + assert len(backend_adapter.started) == 1 + assert daemon.backend.process is None + assert daemon.status_payload()["backend"]["paused"] is True + finally: + daemon.shutdown_children() + daemon._stop_control_server() + shutil.rmtree(short_root, ignore_errors=True) + + +def test_build_webui_dist_tolerates_windows_node_assertion_after_build(monkeypatch, tmp_path: Path) -> None: + webui_dir = tmp_path / "webui" + webui_dist = webui_dir / "dist" + webui_dist.mkdir(parents=True) + (webui_dir / "package.json").write_text("{}", encoding="utf-8") + console = DummyConsole() + + def fake_run(_command, 
**_kwargs): + (webui_dist / "index.html").write_text("", encoding="utf-8") + return SimpleNamespace( + returncode=3221226505, + stdout="built in 6.83s", + stderr="Assertion failed: !(handle->flags & UV_HANDLE_CLOSING), file src\\win\\async.c, line 76", + ) + + monkeypatch.setattr(service_manager.sys, "platform", "win32") + monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: "npm.cmd") monkeypatch.setattr(service_manager, "node_version_satisfies_requirement", lambda: True) monkeypatch.setattr(service_manager.subprocess, "run", fake_run) - monkeypatch.setattr(service_manager, "_spawn_process", fake_spawn) - monkeypatch.setenv(service_manager.WEBUI_DIRECT_BACKEND_URLS_ENV, "true") - config = service_manager.ServiceConfig( - backend_host="10.0.0.8", - backend_port=9000, - frontend_host="0.0.0.0", - frontend_port=5174, - ) - service_manager.start_frontend(config, console) + service_manager._build_webui_dist(tmp_path, service_manager.ServiceConfig(), console) - assert build_calls[0]["kwargs"]["env"]["VITE_API_BASE_URL"] == "http://10.0.0.8:9000" - assert build_calls[0]["kwargs"]["env"]["VITE_WS_BASE_URL"] == "ws://10.0.0.8:9000" - assert preview_calls[0]["kwargs"]["env"]["VITE_API_BASE_URL"] == "http://10.0.0.8:9000" - assert preview_calls[0]["kwargs"]["env"]["VITE_WS_BASE_URL"] == "ws://10.0.0.8:9000" + assert "[flocks] WebUI 构建产物已生成,忽略 Windows Node.js 退出断言。" in console.messages -def test_start_frontend_prefers_bundled_npm_over_path_lookup(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) +def test_build_webui_dist_prefers_bundled_npm_over_path_lookup(monkeypatch, tmp_path: Path) -> None: 
+ webui_dir = tmp_path / "webui" + webui_dir.mkdir() + (webui_dir / "package.json").write_text("{}", encoding="utf-8") console = DummyConsole() build_calls: list[list[str]] = [] @@ -1325,23 +1701,16 @@ def fake_run(command, **_kwargs): build_calls.append(command) return SimpleNamespace(returncode=0) - monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) - monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: pid) monkeypatch.setattr(service_manager, "resolve_npm_executable", lambda: r"C:\Users\flocks\AppData\Local\Programs\Flocks\tools\node\npm.cmd") monkeypatch.setattr(service_manager, "node_version_satisfies_requirement", lambda: True) monkeypatch.setattr(service_manager.subprocess, "run", fake_run) - monkeypatch.setattr(service_manager, "_spawn_process", lambda *_args, **_kwargs: SimpleNamespace(pid=2468)) - service_manager.start_frontend(service_manager.ServiceConfig(), console) + service_manager._build_webui_dist(tmp_path, service_manager.ServiceConfig(), console) assert build_calls[0][0] == r"C:\Users\flocks\AppData\Local\Programs\Flocks\tools\node\npm.cmd" -def test_start_backend_raises_on_port_record_mismatch(monkeypatch, tmp_path: Path) -> None: +def test_start_backend_raises_when_port_has_listener(monkeypatch, tmp_path: Path) -> None: paths = service_manager.RuntimePaths( root=tmp_path, run_dir=tmp_path / "run", @@ -1353,15 +1722,68 @@ def test_start_backend_raises_on_port_record_mismatch(monkeypatch, tmp_path: Pat ) paths.run_dir.mkdir(parents=True) paths.log_dir.mkdir(parents=True) - service_manager.write_runtime_record(paths.backend_pid, service_manager.RuntimeRecord(pid=1111, port=8000)) + 
_write_legacy_runtime_record(paths.backend_pid, service_manager.RuntimeRecord(pid=1111, port=8000)) monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [9999]) - with pytest.raises(service_manager.ServiceError, match="运行时记录不一致"): - service_manager.start_backend(service_manager.ServiceConfig(), DummyConsole()) + with pytest.raises(service_manager.ServiceError, match="端口 5173 已被占用"): + service_manager._start_backend_process(service_manager.ServiceConfig(), DummyConsole()) + + +def test_start_backend_cleans_trusted_orphan_port_owner(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + paths.run_dir.mkdir(parents=True) + paths.log_dir.mkdir(parents=True) + owners = iter([[9999], [9999], [], [], []]) + cleaned: list[int] = [] + + monkeypatch.setattr(service_manager, "ensure_install_layout", lambda: tmp_path) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: next(owners)) + monkeypatch.setattr(service_manager, "_process_command_line", lambda _pid: f"{tmp_path}/.venv/bin/python -m flocks.cli.main serve") + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + monkeypatch.setattr(service_manager, "port_is_in_use", lambda *_args, **_kwargs: False) + monkeypatch.setattr(service_manager, "resolve_flocks_cli_command", lambda _root: ["/env/bin/python", "-m", "flocks.cli.main"]) + monkeypatch.setattr(service_manager, "_spawn_process", lambda command, **_kwargs: SimpleNamespace(pid=1234, args=command)) + monkeypatch.setattr(service_manager, "process_runtime_record", lambda *_args, **_kwargs: service_manager.RuntimeRecord(pid=1234)) + 
monkeypatch.setattr(service_manager, "_log_startup_config", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "wait_for_http", lambda *_args, **_kwargs: None) + + process = service_manager._start_backend_process(service_manager.ServiceConfig(), DummyConsole(), paths=paths) + + assert process.pid == 1234 + assert cleaned == [9999] + + +def test_backend_cleanup_trusts_cross_worktree_flocks_uvicorn_owner(monkeypatch, tmp_path: Path) -> None: + cleaned: list[int] = [] + owners = iter([[18787], []]) + + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: next(owners)) + monkeypatch.setattr( + service_manager, + "_process_command_line", + lambda _pid: ( + "/Users/zgy/.codex/worktrees/6be0/flocks/.venv/bin/python " + "/Users/zgy/.codex/worktrees/6be0/flocks/.venv/bin/uvicorn " + "flocks.server.app:app --host 127.0.0.1 --port 8000 --reload --reload-dir flocks" + ), + ) + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + + result = service_manager.cleanup_trusted_port_owners( + 8000, + service="backend", + label="后端", + console=DummyConsole(), + root=tmp_path, + ) + + assert result == [18787] + assert cleaned == [18787] def test_start_backend_raises_when_port_in_use_without_pid_lookup(monkeypatch, tmp_path: Path) -> None: @@ -1384,7 +1806,81 @@ def test_start_backend_raises_when_port_in_use_without_pid_lookup(monkeypatch, t monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: True) with pytest.raises(service_manager.ServiceError, match="无法识别占用 PID"): - service_manager.start_backend(service_manager.ServiceConfig(), DummyConsole()) + service_manager._start_backend_process(service_manager.ServiceConfig(), DummyConsole()) + + +def test_webui_cleanup_trusts_cross_worktree_flocks_vite_owner(monkeypatch, tmp_path: Path) -> None: + cleaned: list[int] = [] + owners = iter([[18962], []]) + + monkeypatch.setattr(service_manager, "port_owner_pids", 
lambda _port: next(owners)) + monkeypatch.setattr( + service_manager, + "_process_command_line", + lambda _pid: ( + "node /Users/zgy/.codex/worktrees/6be0/flocks/webui/node_modules/.bin/vite --host 127.0.0.1 --port 5173" + ), + ) + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + + result = service_manager.cleanup_trusted_port_owners( + 5173, + service="webui", + label="WebUI", + console=DummyConsole(), + root=tmp_path, + ) + + assert result == [18962] + assert cleaned == [18962] + + +def test_cleanup_trusted_daemon_processes_cleans_current_install_only(monkeypatch, tmp_path: Path) -> None: + cleaned: list[int] = [] + + monkeypatch.setattr(service_manager, "_process_list_pids", lambda: [111, 222, 333]) + monkeypatch.setattr( + service_manager, + "_process_command_line", + lambda pid: { + 111: f"{tmp_path}/.venv/bin/python -m flocks.cli.main service-daemon --server-port 8000", + 222: "/other/flocks/.venv/bin/python -m flocks.cli.main service-daemon --server-port 8000", + 333: f"{tmp_path}/.venv/bin/python -m flocks.cli.main serve --port 8000", + }[pid], + ) + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + + result = service_manager.cleanup_trusted_daemon_processes(console=DummyConsole(), root=tmp_path) + + assert result == [111] + assert cleaned == [111] + + +def test_windows_cleanup_trusted_daemon_processes_uses_single_query(monkeypatch, tmp_path: Path) -> None: + cleaned: list[int] = [] + commands: list[list[str]] = [] + + def fail_per_pid_lookup(_pid: int) -> str: + raise AssertionError("Windows daemon cleanup should not query each pid separately") + + def fake_run(command, **kwargs): + commands.append(command) + assert command[:2] == ["powershell.exe", "-NoProfile"] + assert kwargs["env"]["FLOCKS_DAEMON_ROOT_MATCH"] == str(tmp_path).lower() + return SimpleNamespace(returncode=0, stdout="111\n222\n111\n") + + 
monkeypatch.setattr(service_manager.sys, "platform", "win32") + monkeypatch.setattr(service_manager, "which", lambda name: "powershell.exe" if name == "powershell" else None) + monkeypatch.setattr(service_manager, "_process_list_pids", lambda: [111, 222, 333]) + monkeypatch.setattr(service_manager, "_process_command_line", fail_per_pid_lookup) + monkeypatch.setattr(service_manager.subprocess, "run", fake_run) + monkeypatch.setattr(service_manager, "_terminate_orphan_pid", lambda pid, *_args, **_kwargs: cleaned.append(pid)) + + result = service_manager.cleanup_trusted_daemon_processes(console=DummyConsole(), root=tmp_path) + + assert result == [111, 222] + assert cleaned == [111, 222] + assert len(commands) == 1 def test_spawn_process_uses_hidden_window_flags_on_windows(monkeypatch, tmp_path: Path) -> None: @@ -1435,10 +1931,12 @@ def fake_popen(*args, **kwargs): monkeypatch.setattr(service_manager.sys, "platform", "darwin") monkeypatch.setattr(service_manager.subprocess, "Popen", fake_popen) + monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: 4321 if pid == 9876 else pid) process = service_manager._spawn_process(["python", "-m", "uvicorn"], cwd=tmp_path, log_path=log_path) assert process.pid == 9876 + assert process._flocks_pgid == 4321 assert captured["args"] == (["python", "-m", "uvicorn"],) assert captured["kwargs"]["cwd"] == tmp_path assert captured["kwargs"]["creationflags"] == 0 @@ -1446,6 +1944,54 @@ def fake_popen(*args, **kwargs): assert "startupinfo" not in captured["kwargs"] +def test_terminate_process_stops_cached_process_group_after_root_exits(monkeypatch) -> None: + signals: list[tuple[str, int]] = [] + group_running = iter([True, False]) + process = SimpleNamespace(pid=9876, returncode=0, poll=lambda: 0, _flocks_pgid=4321) + + monkeypatch.setattr(service_manager.sys, "platform", "darwin") + monkeypatch.setattr(service_manager, "process_group_is_running", lambda _pgid: next(group_running)) + monkeypatch.setattr( + service_manager, + 
"signal_process_group", + lambda sig, pgid: signals.append((sig.name, pgid)), + ) + monkeypatch.setattr(service_manager, "signal_pid_list", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: []) + + service_manager._terminate_process(process, "WebUI", DummyConsole(), timeout=0.1) + + assert signals == [("SIGTERM", 4321)] + + +def test_terminate_orphan_pid_stops_process_group(monkeypatch) -> None: + signals: list[tuple[str, int | tuple[int, ...] | None]] = [] + + monkeypatch.setattr(service_manager.sys, "platform", "darwin") + monkeypatch.setattr(service_manager.os, "getpgid", lambda pid: 18745 if pid == 18787 else pid) + monkeypatch.setattr(service_manager.os, "getpgrp", lambda: 99999) + monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [18787, 18873]) + monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) + monkeypatch.setattr(service_manager, "process_group_is_running", lambda _pgid: False) + monkeypatch.setattr( + service_manager, + "signal_process_group", + lambda sig, pgid: signals.append((sig.name, pgid)), + ) + monkeypatch.setattr( + service_manager, + "signal_pid_list", + lambda sig, pids: signals.append((sig.name, tuple(pids))), + ) + + service_manager._terminate_orphan_pid(18787, "后端", DummyConsole(), timeout=0.1) + + assert signals == [ + ("SIGTERM", 18745), + ("SIGTERM", (18787, 18873)), + ] + + def test_spawn_process_appends_without_rotated_suffix(monkeypatch, tmp_path: Path) -> None: log_path = tmp_path / "logs" / "backend.log" log_path.parent.mkdir(parents=True) @@ -1515,309 +2061,11 @@ def fake_popen(*args, **kwargs): assert captured["kwargs"]["env"] == env -def test_stop_one_prefers_process_group_on_unix(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( - pid_file, - service_manager.RuntimeRecord(pid=111, pgid=222, port=8000), - ) - console = DummyConsole() - group_alive = 
{"value": True} - group_signals: list[tuple[signal.Signals, int | None]] = [] - pid_signals: list[tuple[signal.Signals, list[int]]] = [] - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111, 112]) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) - monkeypatch.setattr(service_manager, "process_group_is_running", lambda pgid: bool(pgid == 222 and group_alive["value"])) - - def fake_signal_group(sig, pgid): - group_signals.append((sig, pgid)) - if sig == signal.SIGTERM: - group_alive["value"] = False - - monkeypatch.setattr(service_manager, "signal_process_group", fake_signal_group) - monkeypatch.setattr( - service_manager, - "signal_pid_list", - lambda sig, pids: pid_signals.append((sig, list(pids))), - ) - - service_manager.stop_one(8000, pid_file, "后端", console) - - assert group_signals == [(signal.SIGTERM, 222)] - assert pid_signals == [] - assert not pid_file.exists() - - -def test_stop_one_falls_back_to_pid_signals_without_process_group(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - pid_file.write_text("111", encoding="utf-8") - console = DummyConsole() - pid_signals: list[tuple[signal.Signals, list[int]]] = [] - alive = {"value": True} - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111, 112]) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: alive["value"]) - monkeypatch.setattr(service_manager, "process_group_is_running", lambda _pgid: False) - monkeypatch.setattr( - service_manager, - "signal_pid_list", - lambda sig, pids: ( - pid_signals.append((sig, list(pids))), - alive.__setitem__("value", False), - ), - ) - - 
service_manager.stop_one(8000, pid_file, "后端", console) - - assert pid_signals[0] == (signal.SIGTERM, [111, 112]) - assert not pid_file.exists() - - -def test_stop_one_uses_taskkill_on_windows(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - pid_file.write_text("111", encoding="utf-8") - console = DummyConsole() - commands: list[list[str]] = [] - alive = {"value": True} - - monkeypatch.setattr(service_manager.sys, "platform", "win32") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111, 222]) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: alive["value"]) - - def fake_run(args, **kwargs): - commands.append(list(args)) - alive["value"] = False - return SimpleNamespace(returncode=0) - - monkeypatch.setattr(service_manager.subprocess, "run", fake_run) - service_manager.stop_one(8000, pid_file, "后端", console) - - assert commands == [ - ["taskkill", "/PID", "111", "/T", "/F"], - ["taskkill", "/PID", "222", "/T", "/F"], - ] - - -def test_stop_one_skips_taskkill_for_reused_windows_pid(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( - pid_file, - service_manager.RuntimeRecord( - pid=111, - host="127.0.0.1", - port=8000, - command=("python.exe", "-m", "flocks.cli.main", "serve"), - ), - ) - console = DummyConsole() - - monkeypatch.setattr(service_manager.sys, "platform", "win32") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111]) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda pid: pid == 111) - monkeypatch.setattr( - service_manager, - "_windows_process_snapshot", - lambda _pid: { - "name": "svchost.exe", - "command_line": r"C:\Windows\System32\svchost.exe -k netsvcs", - "executable_path": 
r"C:\Windows\System32\svchost.exe", - }, - ) - monkeypatch.setattr( - service_manager.subprocess, - "run", - lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("taskkill should not run")), - ) - - service_manager.stop_one(8000, pid_file, "后端", console) - - assert console.messages[-1] == "[flocks] 后端 未运行。" - assert not pid_file.exists() - - -def test_stop_one_force_kill_refreshes_process_group_members(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( - pid_file, - service_manager.RuntimeRecord(pid=111, pgid=222, port=8000), - ) - console = DummyConsole() - pid_signals: list[tuple[signal.Signals, list[int]]] = [] - group_signals: list[tuple[signal.Signals, int | None]] = [] - alive_group_members = {333} - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111]) - monkeypatch.setattr(service_manager, "_process_group_member_pids", lambda pgid: [333] if pgid == 222 and alive_group_members else []) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda pid: pid in alive_group_members) - monkeypatch.setattr(service_manager, "process_group_is_running", lambda pgid: bool(pgid == 222 and alive_group_members)) - monkeypatch.setattr(service_manager.time, "sleep", lambda _delay: None) - - def fake_signal_group(sig, pgid): - group_signals.append((sig, pgid)) - - def fake_signal_pid_list(sig, pids): - pid_list = list(pids) - pid_signals.append((sig, pid_list)) - if sig == signal.SIGKILL and 333 in pid_list: - alive_group_members.clear() - - monkeypatch.setattr(service_manager, "signal_process_group", fake_signal_group) - monkeypatch.setattr(service_manager, "signal_pid_list", fake_signal_pid_list) - - service_manager.stop_one(8000, pid_file, "后端", console) - - assert (signal.SIGTERM, 222) in group_signals - assert any(sig == 
signal.SIGKILL and 333 in pids for sig, pids in pid_signals) - assert not pid_file.exists() - assert console.messages[-1] == "[flocks] 后端 已停止。" - - -def test_stop_one_keeps_runtime_record_when_force_kill_still_times_out(monkeypatch, tmp_path: Path) -> None: - pid_file = tmp_path / "backend.pid" - service_manager.write_runtime_record( - pid_file, - service_manager.RuntimeRecord(pid=111, pgid=222, port=8000), - ) - console = DummyConsole() - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "collect_process_tree_pids", lambda _pid: [111]) - monkeypatch.setattr(service_manager, "_process_group_member_pids", lambda pgid: [333] if pgid == 222 else []) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) - monkeypatch.setattr(service_manager, "process_group_is_running", lambda pgid: pgid == 222) - monkeypatch.setattr(service_manager, "signal_process_group", lambda *_args: None) - monkeypatch.setattr(service_manager, "signal_pid_list", lambda *_args: None) - monkeypatch.setattr(service_manager.time, "sleep", lambda _delay: None) - - with pytest.raises(service_manager.ServiceError, match="未在预期时间内退出"): - service_manager.stop_one(8000, pid_file, "后端", console) - - assert pid_file.exists() - - -@contextlib.contextmanager -def _record_call(call_order: list[str], name: str): - call_order.append(name) - yield - - -def test_stop_all_reads_port_from_runtime_record(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - service_manager.write_runtime_record(paths.backend_pid, 
service_manager.RuntimeRecord(pid=111, port=9995)) - service_manager.write_runtime_record(paths.frontend_pid, service_manager.RuntimeRecord(pid=222, port=9996)) - calls: list[tuple[int, Path, str]] = [] - - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: []) - monkeypatch.setattr( - service_manager, - "stop_one", - lambda port, pid_file, name, _console: calls.append((port, pid_file, name)), - ) - - service_manager.stop_all(console=None) - - assert calls == [ - (9996, paths.frontend_pid, "WebUI"), - (9995, paths.backend_pid, "后端"), - ] - - -def test_stop_all_falls_back_to_default_port_when_record_missing(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - calls: list[int] = [] - - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: []) - monkeypatch.setattr(service_manager, "stop_one", lambda port, *_args: calls.append(port)) - - service_manager.stop_all(console=None) - - assert calls == [5173, 8000] - - -def test_stop_all_falls_back_to_default_port_when_record_has_no_port(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path 
/ "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.backend_pid.write_text("111", encoding="utf-8") - paths.frontend_pid.write_text("222", encoding="utf-8") - calls: list[int] = [] - - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: []) - monkeypatch.setattr(service_manager, "stop_one", lambda port, *_args: calls.append(port)) - - service_manager.stop_all(console=None) - - assert calls == [5173, 8000] - - -def test_stop_all_also_cleans_browser_daemons(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) +def test_stop_all_uses_supervisor_control_api(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) calls: list[str] = [] - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "service_lock", lambda _paths: _record_call([], "service_lock")) - monkeypatch.setattr( - service_manager, - "stop_one", - lambda _port, _pid_file, name, _console: calls.append(name), - ) - monkeypatch.setattr( - service_manager, - "stop_all_browser_daemons", - lambda: calls.append("browser") or ["default", "remote"], - ) - class FakeConsole: def __init__(self) -> None: self.messages = [] @@ -1825,159 +2073,87 @@ def __init__(self) -> None: def print(self, message) -> None: self.messages.append(message) + states = iter([True, False]) + payload = _supervisor_status_payload() + monkeypatch.setattr(service_manager, 
"ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: next(states)) + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status(payload)) + monkeypatch.setattr(service_manager, "request_stop", lambda **_kwargs: calls.append("/stop") or {"status": "stopping"}) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda _paths, _console: calls.append("legacy")) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", lambda _config, _console, **_kwargs: calls.append("cleanup")) + monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: calls.append("browser")) + console = FakeConsole() service_manager.stop_all(console=console) - assert calls == ["WebUI", "后端", "browser"] - assert console.messages == [] + assert calls == ["/stop", "legacy", "cleanup", "browser"] + assert console.messages == [ + "[flocks] flocks 已停止(PID=111)。", + "[flocks] daemon 已停止。", + ] -def test_build_status_lines_reads_port_from_runtime_record(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - service_manager.write_runtime_record(paths.backend_pid, service_manager.RuntimeRecord(pid=111, port=9995)) - service_manager.write_runtime_record(paths.frontend_pid, service_manager.RuntimeRecord(pid=222, port=9996)) +def test_stop_all_reports_when_supervisor_is_down(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + console = DummyConsole() - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - 
monkeypatch.setattr(service_manager, "port_owner_pids", lambda port: [port] if port in {9995, 9996} else []) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: False) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda _paths, _console: console.messages.append("legacy")) + monkeypatch.setattr( + service_manager, + "cleanup_orphan_service_ports", + lambda _config, _console, **_kwargs: console.messages.append("cleanup"), + ) + monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: console.messages.append("browser")) - lines = service_manager.build_status_lines(paths) + service_manager.stop_all(console) - assert "http://127.0.0.1:9995" in lines[0] - assert "http://127.0.0.1:9996" in lines[1] + assert console.messages == ["[flocks] Flocks daemon 未运行。", "legacy", "cleanup", "browser"] -def test_build_status_lines_uses_recorded_host(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) +def test_stop_all_uses_legacy_runtime_ports_for_orphan_cleanup(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + console = DummyConsole() + captured: list[service_manager.ServiceConfig] = [] paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - service_manager.write_runtime_record( + _write_legacy_runtime_record( paths.backend_pid, - service_manager.RuntimeRecord(pid=111, host="10.0.0.8", port=9000), + service_manager.RuntimeRecord(pid=111, host="0.0.0.0", port=9000), ) - service_manager.write_runtime_record( + 
_write_legacy_runtime_record( paths.frontend_pid, - service_manager.RuntimeRecord(pid=222, host="0.0.0.0", port=5174), + service_manager.RuntimeRecord(pid=222, host="0.0.0.0", port=5273), ) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda port: [111] if port == 9000 else [222]) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) + def fake_cleanup(config, _console, *, extra_configs=()): + captured.append(config) + captured.extend(extra_configs) - lines = service_manager.build_status_lines(paths) + monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) + monkeypatch.setattr(service_manager, "supervisor_is_running", lambda _paths: False) + monkeypatch.setattr(service_manager, "cleanup_legacy_runtime_processes", lambda *_args, **_kwargs: None) + monkeypatch.setattr(service_manager, "cleanup_orphan_service_ports", fake_cleanup) + monkeypatch.setattr(service_manager, "stop_all_browser_daemons", lambda: None) - assert "http://10.0.0.8:9000" in lines[0] - assert "http://127.0.0.1:5174" in lines[1] + service_manager.stop_all(console) + assert [(config.backend_port, config.frontend_port) for config in captured] == [(5173, 5173), (9000, 5273)] -def test_build_status_lines_uses_unknown_pid_when_bind_fallback_detects_listener( - monkeypatch, tmp_path: Path -) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - monkeypatch.setattr(service_manager, "cleanup_stale_pid_file", lambda _path: None) - monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: []) - 
monkeypatch.setattr(service_manager, "port_is_in_use", lambda _port, listeners=None: True) - monkeypatch.setattr(service_manager, "pid_is_running", lambda _pid: False) - monkeypatch.setattr(service_manager, "process_group_is_running", lambda _pgid: False) +def test_status_lines_include_control_api_errors(monkeypatch, tmp_path: Path) -> None: + paths = _make_runtime_paths(tmp_path) + payload = _supervisor_status_payload() + payload["backend"]["state"] = "degraded" + payload["backend"]["last_error"] = "health failed" + monkeypatch.setattr(service_manager, "read_supervisor_status", lambda *_args, **_kwargs: _supervisor_status(payload)) lines = service_manager.build_status_lines(paths) - assert "PID=unknown" in lines[0] - assert "PID=unknown" in lines[1] - - -def test_service_lock_prevents_concurrent_operations(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / "logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - state = {"locked": False} - - class FakeFcntl: - LOCK_EX = 1 - LOCK_NB = 2 - LOCK_UN = 4 - - @staticmethod - def flock(_handle, operation): - if operation == FakeFcntl.LOCK_UN: - state["locked"] = False - return - if state["locked"]: - raise OSError("busy") - state["locked"] = True - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "fcntl", FakeFcntl) - - with service_manager.service_lock(paths): - with pytest.raises(service_manager.ServiceError, match="另一个 flocks 命令正在执行"): - with service_manager.service_lock(paths): - raise AssertionError("should not acquire nested lock") - - -def test_service_lock_releases_on_completion(monkeypatch, tmp_path: Path) -> None: - paths = service_manager.RuntimePaths( - root=tmp_path, - run_dir=tmp_path / "run", - log_dir=tmp_path / 
"logs", - backend_pid=tmp_path / "run" / "backend.pid", - frontend_pid=tmp_path / "run" / "webui.pid", - backend_log=tmp_path / "logs" / "backend.log", - frontend_log=tmp_path / "logs" / "webui.log", - ) - operations: list[int] = [] - - class FakeFcntl: - LOCK_EX = 1 - LOCK_NB = 2 - LOCK_UN = 4 - - @staticmethod - def flock(_handle, operation): - operations.append(operation) - - monkeypatch.setattr(service_manager.sys, "platform", "darwin") - monkeypatch.setattr(service_manager, "fcntl", FakeFcntl) - - with service_manager.service_lock(paths): - pass + backend_line = next(line for line in lines if "flocks:" in line) + assert "state=degraded" in backend_line + assert "last_error=health failed" in backend_line - assert operations == [FakeFcntl.LOCK_EX | FakeFcntl.LOCK_NB, FakeFcntl.LOCK_UN] def test_log_startup_config_appends_to_log_file(tmp_path: Path) -> None: diff --git a/tests/docker/test_dockerfile_runtime_requirements.py b/tests/docker/test_dockerfile_runtime_requirements.py index 1e2d5ccc1..bbb61e1c8 100644 --- a/tests/docker/test_dockerfile_runtime_requirements.py +++ b/tests/docker/test_dockerfile_runtime_requirements.py @@ -5,14 +5,6 @@ DOCKERFILE = REPO_ROOT / "docker" / "Dockerfile" -def test_runtime_image_installs_required_cli_tools() -> None: - dockerfile = DOCKERFILE.read_text(encoding="utf-8") - - assert "npm install --global agent-browser" in dockerfile - assert "agent-browser install --with-deps" in dockerfile - assert "curl -LsSf https://astral.sh/uv/install.sh | sh" in dockerfile - - def test_runtime_image_no_longer_bundles_system_chromium() -> None: dockerfile = DOCKERFILE.read_text(encoding="utf-8") diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/tests/helpers/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/helpers/service_supervisor.py b/tests/helpers/service_supervisor.py new file mode 100644 index 000000000..f51b19352 --- /dev/null +++ 
b/tests/helpers/service_supervisor.py @@ -0,0 +1,99 @@ +"""Helpers for service supervisor integration-style tests.""" + +from __future__ import annotations + +import subprocess +import sys +import tempfile +import threading +import time +from pathlib import Path + +from flocks.cli import service_control, service_manager, service_process, service_supervisor + + +class SleeperProcessAdapter: + """Process adapter that starts a real, lightweight child process.""" + + def __init__(self) -> None: + self.started: list[subprocess.Popen] = [] + self.stopped: list[int] = [] + + def start(self, _config, _paths, *, built_once: bool = False) -> subprocess.Popen: + del built_once + process = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"]) + self.started.append(process) + return process + + def stop(self, process: subprocess.Popen | None) -> None: + if process is None: + return + self.stopped.append(process.pid) + process.terminate() + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + process.wait(timeout=5) + + def probe(self, process: subprocess.Popen | None, _host: str, _port: int) -> service_process.ServiceProbeResult: + if process is None: + return service_process.ServiceProbeResult(healthy=False, reason="stopped") + if process.poll() is not None: + return service_process.ServiceProbeResult(healthy=False, reason="process exited", restart=True) + return service_process.ServiceProbeResult(healthy=True) + + +def make_short_runtime_root(prefix: str) -> Path: + """Create a short runtime root so Unix domain socket paths fit on macOS.""" + return Path(tempfile.mkdtemp(prefix=prefix, dir="/tmp")) + + +def make_runtime_paths(root: Path) -> service_manager.RuntimePaths: + return service_manager.RuntimePaths( + root=root, + run_dir=root / "run", + log_dir=root / "logs", + backend_pid=root / "run" / "backend.pid", + frontend_pid=root / "run" / "webui.pid", + backend_log=root / "logs" / "backend.log", + frontend_log=root / 
"logs" / "webui.log", + ) + + +def wait_for_process_exit(process: subprocess.Popen, timeout: float = 5.0) -> None: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if process.poll() is not None: + return + time.sleep(0.05) + raise AssertionError(f"process {process.pid} did not exit") + + +def wait_for_supervisor(paths: service_manager.RuntimePaths, *, running: bool, timeout: float = 5.0) -> None: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if service_control.supervisor_is_running(paths) is running: + return + time.sleep(0.05) + raise AssertionError(f"supervisor running={running} was not observed") + + +def start_supervisor( + config: service_manager.ServiceConfig, +) -> tuple[service_supervisor.SupervisorDaemon, threading.Thread]: + daemon = service_supervisor.SupervisorDaemon( + config, + interval=0.05, + backend_adapter=SleeperProcessAdapter(), + ) + thread = threading.Thread(target=daemon.run, daemon=True) + thread.start() + return daemon, thread + + +def stop_supervisor(daemon: service_supervisor.SupervisorDaemon, thread: threading.Thread) -> None: + daemon.request_stop() + thread.join(timeout=5) + daemon.shutdown_children() + daemon._stop_control_server() diff --git a/tests/server/routes/test_auth_audit_routes.py b/tests/server/routes/test_auth_audit_routes.py index 00f3bf908..c01396eb9 100644 --- a/tests/server/routes/test_auth_audit_routes.py +++ b/tests/server/routes/test_auth_audit_routes.py @@ -74,6 +74,75 @@ async def _emit(event_type: str, payload: dict): assert emitted[0][1]["username"] == "chenjie" +async def test_login_rate_limits_repeated_failures(monkeypatch: pytest.MonkeyPatch): + from flocks.server.routes import auth as auth_routes + + auth_routes._login_rate_limiter.reset() + calls = {"login": 0} + + async def _login(_username: str, _password: str): + calls["login"] += 1 + raise ValueError("用户名或密码错误") + + async def _emit(_event_type: str, _payload: dict): + return None + + 
monkeypatch.setattr(auth_routes.AuthService, "login", _login) + monkeypatch.setattr(auth_routes, "_emit_auth_audit", _emit) + + request = SimpleNamespace(client=SimpleNamespace(host="127.0.0.1")) + response = Response() + payload = auth_routes.LoginRequest(username="chenjie", password="bad") + try: + for _ in range(auth_routes._LOGIN_MAX_FAILURES_PER_USER_AND_IP): + with pytest.raises(HTTPException) as exc_info: + await auth_routes.login(payload, response, request) + assert exc_info.value.status_code == 400 + + with pytest.raises(HTTPException) as exc_info: + await auth_routes.login(payload, response, request) + assert exc_info.value.status_code == 429 + assert exc_info.value.headers["Retry-After"] + + with pytest.raises(HTTPException) as exc_info: + await auth_routes.login(payload, response, request) + assert exc_info.value.status_code == 429 + assert calls["login"] == auth_routes._LOGIN_MAX_FAILURES_PER_USER_AND_IP + 1 + finally: + auth_routes._login_rate_limiter.reset() + + +async def test_login_rate_limiter_prunes_expired_buckets(): + from flocks.server.routes import auth as auth_routes + + limiter = auth_routes._LoginRateLimiter() + now = auth_routes.time.monotonic() + stale_key = ("user_ip", "stale@127.0.0.1") + limiter._failures[stale_key] = [now - auth_routes._LOGIN_FAILURE_WINDOW_SECONDS - 1] + limiter._locked_until[stale_key] = now - 1 + limiter._last_pruned_at = now - auth_routes._LOGIN_PRUNE_INTERVAL_SECONDS - 1 + + limiter.record_failure(username="chenjie", ip="127.0.0.1") + + assert stale_key not in limiter._failures + assert stale_key not in limiter._locked_until + + +async def test_login_rate_limiter_caps_tracked_buckets(monkeypatch: pytest.MonkeyPatch): + from flocks.server.routes import auth as auth_routes + + monkeypatch.setattr(auth_routes, "_LOGIN_MAX_TRACKED_BUCKETS", 4) + monkeypatch.setattr(auth_routes, "_LOGIN_PRUNE_INTERVAL_SECONDS", 0) + limiter = auth_routes._LoginRateLimiter() + + for index in range(10): + 
limiter.record_failure(username=f"user{index}", ip="127.0.0.1") + + assert limiter._tracked_bucket_count() <= auth_routes._LOGIN_MAX_TRACKED_BUCKETS + assert ("user_ip", "user9@127.0.0.1") in limiter._failures + assert ("ip", "127.0.0.1") in limiter._failures + + async def test_logout_emits_audit_event(monkeypatch: pytest.MonkeyPatch): from flocks.server.routes import auth as auth_routes diff --git a/tests/server/test_auth_compat.py b/tests/server/test_auth_compat.py index d4014e2da..b92191bc9 100644 --- a/tests/server/test_auth_compat.py +++ b/tests/server/test_auth_compat.py @@ -295,6 +295,9 @@ def test_static_prefix_is_exempt(self): def test_protected_path_is_not_exempt(self): assert auth_module.auth_middleware_exempt("/api/session") is False assert auth_module.auth_middleware_exempt("/api/admin/users") is False + assert auth_module.auth_middleware_exempt("/docs") is False + assert auth_module.auth_middleware_exempt("/redoc") is False + assert auth_module.auth_middleware_exempt("/openapi.json") is False def test_channel_webhook_is_exempt_via_regex(self): # /api/channel/{channel_id}/webhook is the public callback entry for diff --git a/tests/server/test_server.py b/tests/server/test_server.py index 12ea20652..d326a7b69 100644 --- a/tests/server/test_server.py +++ b/tests/server/test_server.py @@ -53,8 +53,8 @@ async def test_health_check(client): assert data["status"] == "healthy" assert isinstance(data["version"], str) and data["version"] assert "timestamp" in data - assert "config_dir" in data - assert "data_dir" in data + assert "config_dir" not in data + assert "data_dir" not in data assert "task_manager_started" not in data assert "task_scheduler_running" not in data assert "task_scheduler_available" not in data @@ -66,6 +66,17 @@ async def test_health_check(client): assert "task_oldest_running_seconds" not in data +@pytest.mark.asyncio +async def test_security_headers_present(client): + """Baseline browser security headers should be present on HTTP 
responses.""" + response = await client.get("/api/health") + + assert response.headers["x-content-type-options"] == "nosniff" + assert response.headers["referrer-policy"] == "no-referrer" + assert response.headers["content-security-policy"] == "frame-ancestors 'self'" + assert response.headers["permissions-policy"] == "camera=(), microphone=(), geolocation=()" + + @pytest.mark.asyncio async def test_task_queue_status_includes_diagnostics(client): response = await client.get("/api/task-system/queue/status") @@ -555,4 +566,3 @@ async def test_question_pending_route_lists_session_requests(client): finally: clear_request_state(req1["id"]) clear_request_state(req2["id"]) - diff --git a/tests/server/test_server_port_config.py b/tests/server/test_server_port_config.py index 059acca42..3964ff3d2 100644 --- a/tests/server/test_server_port_config.py +++ b/tests/server/test_server_port_config.py @@ -206,9 +206,74 @@ def fake_start_all(config, _console): assert result.exit_code == 0 assert captured["config"].backend_host == "0.0.0.0" - assert captured["config"].backend_port == 9000 + assert captured["config"].backend_port == 5174 assert captured["config"].frontend_host == "0.0.0.0" assert captured["config"].frontend_port == 5174 + assert captured["config"].legacy_backend_port == 9000 + assert captured["config"].server_port_migration_hint is True + + def test_start_accepts_public_host_and_port(self, monkeypatch): + """Test start command accepts the unified public host/port options.""" + captured = {} + + def fake_start_all(config, _console): + captured["config"] = config + + monkeypatch.setattr(cli_main, "start_all", fake_start_all) + + result = CliRunner().invoke( + cli_main.app, + [ + "start", + "--host", + "0.0.0.0", + "--port", + "8888", + ], + ) + + assert result.exit_code == 0 + assert captured["config"].backend_host == "0.0.0.0" + assert captured["config"].backend_port == 8888 + assert captured["config"].frontend_host == "0.0.0.0" + assert 
captured["config"].frontend_port == 8888 + assert captured["config"].legacy_backend_port == 8000 + + def test_public_host_and_port_override_legacy_options(self, monkeypatch): + """Test unified public host/port win over legacy server and WebUI options.""" + captured = {} + + def fake_start_all(config, _console): + captured["config"] = config + + monkeypatch.setattr(cli_main, "start_all", fake_start_all) + + result = CliRunner().invoke( + cli_main.app, + [ + "start", + "--host", + "0.0.0.0", + "--port", + "8888", + "--server-host", + "127.0.0.1", + "--server-port", + "9000", + "--webui-host", + "127.0.0.1", + "--webui-port", + "5174", + ], + ) + + assert result.exit_code == 0 + assert captured["config"].backend_host == "0.0.0.0" + assert captured["config"].backend_port == 8888 + assert captured["config"].frontend_host == "0.0.0.0" + assert captured["config"].frontend_port == 8888 + assert captured["config"].legacy_backend_host == "127.0.0.1" + assert captured["config"].legacy_backend_port == 9000 def test_restart_accepts_server_and_webui_options(self, monkeypatch): """Test restart command accepts explicit server and WebUI host/port options.""" @@ -218,6 +283,7 @@ def fake_restart_all(config, _console): captured["config"] = config monkeypatch.setattr(cli_main, "restart_all", fake_restart_all) + monkeypatch.setattr(cli_main, "read_supervisor_status", lambda **_kwargs: (_ for _ in ()).throw(RuntimeError("down"))) result = CliRunner().invoke( cli_main.app, @@ -236,45 +302,76 @@ def fake_restart_all(config, _console): assert result.exit_code == 0 assert captured["config"].backend_host == "127.0.0.1" - assert captured["config"].backend_port == 9100 + assert captured["config"].backend_port == 5273 assert captured["config"].frontend_host == "127.0.0.1" assert captured["config"].frontend_port == 5273 + assert captured["config"].legacy_backend_port == 9100 - def test_restart_reuses_runtime_recorded_host_and_port(self, monkeypatch, tmp_path: Path): - """Test restart reuses last 
runtime host/port when CLI and env omit them.""" + def test_restart_accepts_public_host_and_port(self, monkeypatch): + """Test restart command accepts the unified public host/port options.""" captured = {} - paths = SimpleNamespace( - backend_pid=tmp_path / "backend.pid", - frontend_pid=tmp_path / "webui.pid", + + def fake_restart_all(config, _console): + captured["config"] = config + + monkeypatch.setattr(cli_main, "restart_all", fake_restart_all) + monkeypatch.setattr(cli_main, "read_supervisor_status", lambda **_kwargs: (_ for _ in ()).throw(RuntimeError("down"))) + + result = CliRunner().invoke( + cli_main.app, + [ + "restart", + "--host", + "0.0.0.0", + "--port", + "8888", + ], ) - records = { - paths.backend_pid: SimpleNamespace(host="0.0.0.0", port=9000), - paths.frontend_pid: SimpleNamespace(host="0.0.0.0", port=5174), - } + + assert result.exit_code == 0 + assert captured["config"].backend_host == "0.0.0.0" + assert captured["config"].backend_port == 8888 + assert captured["config"].frontend_host == "0.0.0.0" + assert captured["config"].frontend_port == 8888 + assert captured["config"].legacy_backend_port == 8000 + + def test_restart_reuses_supervisor_recorded_host_and_port(self, monkeypatch, tmp_path: Path): + """Test restart reuses supervisor host/port when CLI and env omit them.""" + captured = {} + paths = SimpleNamespace(run_dir=tmp_path) def fake_restart_all(config, _console): captured["config"] = config monkeypatch.setattr(cli_main, "restart_all", fake_restart_all) monkeypatch.setattr(cli_main, "runtime_paths", lambda: paths) - monkeypatch.setattr(cli_main, "read_runtime_record", lambda path: records.get(path)) + monkeypatch.setattr( + cli_main, + "read_supervisor_status", + lambda **_kwargs: { + "config": { + "backend_host": "0.0.0.0", + "backend_port": 9000, + "frontend_host": "0.0.0.0", + "frontend_port": 5174, + } + }, + ) Config._global_config = None result = CliRunner().invoke(cli_main.app, ["restart"]) assert result.exit_code == 0 assert 
captured["config"].backend_host == "0.0.0.0" - assert captured["config"].backend_port == 9000 + assert captured["config"].backend_port == 5174 assert captured["config"].frontend_host == "0.0.0.0" assert captured["config"].frontend_port == 5174 + assert captured["config"].legacy_backend_port == 9000 - def test_restart_cli_options_override_runtime_record(self, monkeypatch, tmp_path: Path): - """Test explicit restart CLI options override runtime-recorded host/port.""" + def test_restart_cli_options_override_supervisor_record(self, monkeypatch, tmp_path: Path): + """Test explicit restart CLI options override supervisor host/port.""" captured = {} - paths = SimpleNamespace( - backend_pid=tmp_path / "backend.pid", - frontend_pid=tmp_path / "webui.pid", - ) + paths = SimpleNamespace(run_dir=tmp_path) def fake_restart_all(config, _console): captured["config"] = config @@ -283,11 +380,15 @@ def fake_restart_all(config, _console): monkeypatch.setattr(cli_main, "runtime_paths", lambda: paths) monkeypatch.setattr( cli_main, - "read_runtime_record", - lambda path: SimpleNamespace( - host="0.0.0.0", - port=9000 if Path(path) == paths.backend_pid else 5174, - ), + "read_supervisor_status", + lambda **_kwargs: { + "config": { + "backend_host": "0.0.0.0", + "backend_port": 9000, + "frontend_host": "0.0.0.0", + "frontend_port": 5174, + } + }, ) Config._global_config = None @@ -308,17 +409,15 @@ def fake_restart_all(config, _console): assert result.exit_code == 0 assert captured["config"].backend_host == "127.0.0.1" - assert captured["config"].backend_port == 9100 + assert captured["config"].backend_port == 5273 assert captured["config"].frontend_host == "127.0.0.1" assert captured["config"].frontend_port == 5273 + assert captured["config"].legacy_backend_port == 9100 - def test_restart_environment_overrides_runtime_record(self, monkeypatch, tmp_path: Path): - """Test restart environment variables still override runtime-recorded host/port.""" + def 
test_restart_environment_overrides_supervisor_record(self, monkeypatch, tmp_path: Path): + """Test restart environment variables still override supervisor host/port.""" captured = {} - paths = SimpleNamespace( - backend_pid=tmp_path / "backend.pid", - frontend_pid=tmp_path / "webui.pid", - ) + paths = SimpleNamespace(run_dir=tmp_path) def fake_restart_all(config, _console): captured["config"] = config @@ -327,11 +426,15 @@ def fake_restart_all(config, _console): monkeypatch.setattr(cli_main, "runtime_paths", lambda: paths) monkeypatch.setattr( cli_main, - "read_runtime_record", - lambda path: SimpleNamespace( - host="0.0.0.0", - port=9000 if Path(path) == paths.backend_pid else 5174, - ), + "read_supervisor_status", + lambda **_kwargs: { + "config": { + "backend_host": "0.0.0.0", + "backend_port": 9000, + "frontend_host": "0.0.0.0", + "frontend_port": 5174, + } + }, ) monkeypatch.setenv("FLOCKS_SERVER_HOST", "127.0.0.1") monkeypatch.setenv("FLOCKS_SERVER_PORT", "9101") @@ -343,9 +446,10 @@ def fake_restart_all(config, _console): assert result.exit_code == 0 assert captured["config"].backend_host == "127.0.0.1" - assert captured["config"].backend_port == 9101 + assert captured["config"].backend_port == 5275 assert captured["config"].frontend_host == "127.0.0.1" assert captured["config"].frontend_port == 5275 + assert captured["config"].legacy_backend_port == 9101 def test_service_config_prefers_cli_values(self, monkeypatch): """Test CLI values override environment and default values.""" @@ -362,10 +466,54 @@ def test_service_config_prefers_cli_values(self, monkeypatch): webui_port=5174, ) - assert config.backend_host == "0.0.0.0" - assert config.backend_port == 9000 + assert config.backend_host == "127.0.0.1" + assert config.backend_port == 5174 assert config.frontend_host == "127.0.0.1" assert config.frontend_port == 5174 + assert config.legacy_backend_host == "0.0.0.0" + assert config.legacy_backend_port == 9000 + + def 
test_service_config_default_public_port_is_webui_port(self, monkeypatch): + """Test service startup defaults to the public WebUI port.""" + monkeypatch.delenv("FLOCKS_HOST", raising=False) + monkeypatch.delenv("FLOCKS_PORT", raising=False) + monkeypatch.delenv("FLOCKS_PUBLIC_HOST", raising=False) + monkeypatch.delenv("FLOCKS_PUBLIC_PORT", raising=False) + monkeypatch.delenv("FLOCKS_SERVER_HOST", raising=False) + monkeypatch.delenv("FLOCKS_SERVER_PORT", raising=False) + monkeypatch.delenv("FLOCKS_WEBUI_HOST", raising=False) + monkeypatch.delenv("FLOCKS_WEBUI_PORT", raising=False) + Config._global_config = None + + config = cli_main._service_config() + + assert config.backend_host == "127.0.0.1" + assert config.backend_port == 5173 + assert config.frontend_host == "127.0.0.1" + assert config.frontend_port == 5173 + assert config.legacy_backend_port == 8000 + + def test_service_config_prefers_public_values(self, monkeypatch): + """Test unified public values override legacy CLI and environment values.""" + monkeypatch.setenv("FLOCKS_WEBUI_HOST", "10.0.0.2") + monkeypatch.setenv("FLOCKS_WEBUI_PORT", "5274") + Config._global_config = None + + config = cli_main._service_config( + host="0.0.0.0", + port=8888, + server_host="127.0.0.1", + server_port=9000, + webui_host="127.0.0.1", + webui_port=5174, + ) + + assert config.backend_host == "0.0.0.0" + assert config.backend_port == 8888 + assert config.frontend_host == "0.0.0.0" + assert config.frontend_port == 8888 + assert config.legacy_backend_host == "127.0.0.1" + assert config.legacy_backend_port == 9000 def test_service_config_uses_server_and_webui_environment(self, monkeypatch): """Test environment variables are used when CLI values are absent.""" @@ -378,9 +526,10 @@ def test_service_config_uses_server_and_webui_environment(self, monkeypatch): config = cli_main._service_config() assert config.backend_host == "0.0.0.0" - assert config.backend_port == 9001 + assert config.backend_port == 5175 assert config.frontend_host 
== "0.0.0.0" assert config.frontend_port == 5175 + assert config.legacy_backend_port == 9001 def test_service_config_keeps_legacy_env_fallbacks(self, monkeypatch): """Test legacy backend/frontend environment variables still work as fallback.""" @@ -397,9 +546,10 @@ def test_service_config_keeps_legacy_env_fallbacks(self, monkeypatch): config = cli_main._service_config() assert config.backend_host == "0.0.0.0" - assert config.backend_port == 9200 + assert config.backend_port == 5176 assert config.frontend_host == "0.0.0.0" assert config.frontend_port == 5176 + assert config.legacy_backend_port == 9200 def test_cli_tui_command_default_port(self): """Test that CLI tui command uses correct default port.""" @@ -575,13 +725,13 @@ def test_script_port_env_var(self): assert port == '7000' def test_script_port_env_var_default(self): - """Test FLOCKS_PORT defaults to 8000 when not set.""" + """Test FLOCKS_PORT defaults to the public service port when not set.""" # Temporarily remove the env var if it exists old_value = os.environ.pop('FLOCKS_PORT', None) try: - port = int(os.getenv('FLOCKS_PORT', '8000')) - assert port == 8000 + port = int(os.getenv('FLOCKS_PORT', '5173')) + assert port == 5173 finally: # Restore old value if it existed if old_value is not None: diff --git a/tests/server/test_static_webui.py b/tests/server/test_static_webui.py new file mode 100644 index 000000000..8e478c5ac --- /dev/null +++ b/tests/server/test_static_webui.py @@ -0,0 +1,120 @@ +from pathlib import Path + +import pytest +from fastapi import FastAPI +from httpx import ASGITransport, AsyncClient + +from flocks.server.static_webui import maybe_serve_static_webui + + +def _write_dist(root: Path) -> Path: + dist = root / "dist" + assets = dist / "assets" + assets.mkdir(parents=True) + (dist / "index.html").write_text("
Flocks WebUI", encoding="utf-8") + (assets / "app.12345678.js").write_text("console.log('flocks');", encoding="utf-8") + return dist + + +def _app() -> FastAPI: + app = FastAPI() + + @app.middleware("http") + async def static_webui(request, call_next): + response = await maybe_serve_static_webui(request) + if response is not None: + return response + return await call_next(request) + + @app.get("/api/health") + async def health(): + return {"status": "healthy"} + + return app + + +@pytest.mark.asyncio +async def test_static_webui_serves_browser_root(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + async with AsyncClient(transport=ASGITransport(app=_app()), base_url="http://test") as client: + response = await client.get("/", headers={"Accept": "text/html"}) + + assert response.status_code == 200 + assert "Flocks WebUI" in response.text + assert response.headers["Cache-Control"] == "no-store" + + +@pytest.mark.asyncio +async def test_static_webui_serves_assets_with_immutable_cache(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + async with AsyncClient(transport=ASGITransport(app=_app()), base_url="http://test") as client: + response = await client.get("/assets/app.12345678.js") + + assert response.status_code == 200 + assert "console.log" in response.text + assert response.headers["Cache-Control"] == "public, max-age=31536000, immutable" + + +@pytest.mark.asyncio +async def test_static_webui_falls_back_for_browser_deep_link(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + async with AsyncClient(transport=ASGITransport(app=_app()), base_url="http://test") as client: + response = await client.get("/session/abc", headers={"Accept": "text/html"}) + + assert response.status_code == 200 + assert "Flocks WebUI" in response.text + + +@pytest.mark.asyncio +async def 
test_static_webui_falls_back_before_full_app_auth(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + from flocks.server.app import app + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get( + "/session/abc", + headers={ + "Accept": "text/html", + "User-Agent": "Mozilla/5.0", + }, + ) + + assert response.status_code == 200 + assert "Flocks WebUI" in response.text + + +@pytest.mark.asyncio +async def test_static_webui_does_not_bypass_full_app_api_auth(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + from flocks.server.app import app + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get( + "/api/session/abc", + headers={ + "Accept": "text/html", + "User-Agent": "Mozilla/5.0", + }, + ) + + assert response.status_code == 401 + assert "Flocks WebUI" not in response.text + + +@pytest.mark.asyncio +async def test_static_webui_does_not_intercept_api(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + async with AsyncClient(transport=ASGITransport(app=_app()), base_url="http://test") as client: + response = await client.get("/api/health", headers={"Accept": "text/html"}) + + assert response.status_code == 200 + assert response.json() == {"status": "healthy"} + + +@pytest.mark.asyncio +async def test_static_webui_does_not_fallback_for_non_get(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("FLOCKS_WEBUI_DIST_DIR", str(_write_dist(tmp_path))) + async with AsyncClient(transport=ASGITransport(app=_app()), base_url="http://test") as client: + response = await client.post("/session/abc", headers={"Accept": "text/html"}) + + assert response.status_code == 404 diff --git a/tests/tool/test_device_schema_migration.py 
b/tests/tool/test_device_schema_migration.py new file mode 100644 index 000000000..40bd9be68 --- /dev/null +++ b/tests/tool/test_device_schema_migration.py @@ -0,0 +1,94 @@ +"""Device integration schema migration tests.""" + +import sqlite3 +from pathlib import Path +from typing import Any + +import pytest + +from flocks.storage.storage import Storage +from flocks.tool.device import models as device_models + + +def _reset_storage_state() -> None: + Storage._initialized = False + Storage._init_pid = None + Storage._db_path = None + + +async def _shutdown_storage() -> None: + await Storage.shutdown() + _reset_storage_state() + + +async def _device_columns(db_path: Path) -> set[str]: + async with Storage.connect(db_path) as db: + cursor = await db.execute("PRAGMA table_info(device_integrations)") + return {str(row[1]) for row in await cursor.fetchall()} + + +async def _device_indexes(db_path: Path) -> set[str]: + async with Storage.connect(db_path) as db: + cursor = await db.execute("PRAGMA index_list(device_integrations)") + return {str(row[1]) for row in await cursor.fetchall()} + + +def _capture_storage_warnings(monkeypatch) -> list[tuple[Any, Any]]: + warnings: list[tuple[Any, Any]] = [] + monkeypatch.setattr(Storage._log, "warn", lambda message=None, extra=None: warnings.append((message, extra))) + return warnings + + +def _extension_ddl_warnings(warnings: list[tuple[Any, Any]]) -> list[tuple[Any, Any]]: + return [entry for entry in warnings if entry[0] == "storage.extension_ddl.failed"] + + +@pytest.mark.asyncio +async def test_device_schema_fresh_init_does_not_warn_duplicate_group_id(monkeypatch, tmp_path: Path) -> None: + warnings = _capture_storage_warnings(monkeypatch) + db_path = tmp_path / "fresh.db" + + _reset_storage_state() + try: + await Storage.init(db_path) + + assert device_models.DEFAULT_GROUP_ID == "default-room" + assert "group_id" in await _device_columns(db_path) + assert "idx_device_group" in await _device_indexes(db_path) + assert 
_extension_ddl_warnings(warnings) == [] + finally: + await _shutdown_storage() + + +@pytest.mark.asyncio +async def test_device_schema_old_integrations_table_gets_group_id(monkeypatch, tmp_path: Path) -> None: + warnings = _capture_storage_warnings(monkeypatch) + db_path = tmp_path / "old.db" + with sqlite3.connect(db_path) as db: + db.executescript(""" + CREATE TABLE device_integrations ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + storage_key TEXT NOT NULL, + service_id TEXT NOT NULL, + enabled INTEGER NOT NULL DEFAULT 1, + verify_ssl INTEGER NOT NULL DEFAULT 0, + fields TEXT NOT NULL DEFAULT '{}', + status TEXT NOT NULL DEFAULT 'unknown', + message TEXT, + latency_ms INTEGER, + checked_at INTEGER, + created_at INTEGER NOT NULL, + updated_at INTEGER NOT NULL + ); + """) + + _reset_storage_state() + try: + await Storage.init(db_path) + + assert "group_id" in await _device_columns(db_path) + assert "idx_device_group" in await _device_indexes(db_path) + assert _extension_ddl_warnings(warnings) == [] + finally: + await _shutdown_storage() diff --git a/tests/updater/test_restart_handoff.py b/tests/updater/test_restart_handoff.py index 6f9ceaab2..9190db208 100644 --- a/tests/updater/test_restart_handoff.py +++ b/tests/updater/test_restart_handoff.py @@ -1,11 +1,17 @@ +import shutil +import sys from pathlib import Path from types import SimpleNamespace +import pytest + +from flocks.cli import service_manager from flocks.updater import restart_handoff +from tests.helpers.service_supervisor import make_short_runtime_root, start_supervisor, stop_supervisor, wait_for_supervisor -def _handoff_args(tmp_path: Path, restart_argv: list[str]) -> list[str]: - return [ +def _handoff_args(tmp_path: Path, restart_argv: list[str], *, prepare_handover: bool = False) -> list[str]: + args = [ "--parent-pid", "1234", "--backend-host", @@ -16,8 +22,6 @@ def _handoff_args(tmp_path: Path, restart_argv: list[str]) -> list[str]: "127.0.0.1", "--frontend-port", "5173", - 
"--backend-pid-file", - str(tmp_path / "backend.pid"), "--install-root", str(tmp_path), "--uv-path", @@ -28,9 +32,10 @@ def _handoff_args(tmp_path: Path, restart_argv: list[str]) -> list[str]: "2026.4.1", "--current-version", "2026.3.31", - "--", - *restart_argv, ] + if prepare_handover: + args.append("--prepare-handover") + return [*args, "--", *restart_argv] def test_run_waits_for_parent_and_backend_port_before_spawning( @@ -39,6 +44,22 @@ def test_run_waits_for_parent_and_backend_port_before_spawning( ) -> None: events: list[str] = [] restart_argv = ["python.exe", "-m", "flocks.cli.main", "serve", "--host", "127.0.0.1", "--port", "8000"] + expected_restart_argv = [ + "python.exe", + "-m", + "flocks.cli.main", + "start", + "--no-browser", + "--skip-webui-build", + "--host", + "127.0.0.1", + "--port", + "5173", + "--server-host", + "127.0.0.1", + "--server-port", + "8000", + ] monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) monkeypatch.setattr( @@ -49,7 +70,7 @@ def test_run_waits_for_parent_and_backend_port_before_spawning( monkeypatch.setattr( restart_handoff, "_ensure_backend_port_free", - lambda backend_port, backend_pid_file: events.append(f"free-port:{backend_port}:{backend_pid_file.name}") or True, + lambda backend_port: events.append(f"free-port:{backend_port}") or True, ) monkeypatch.setattr( restart_handoff.subprocess, @@ -57,22 +78,133 @@ def test_run_waits_for_parent_and_backend_port_before_spawning( lambda argv, cwd=None, close_fds=False: events.append(f"spawn:{list(argv)}:{cwd}:{close_fds}") or SimpleNamespace(pid=4321), ) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: events.append("tasks") or None) monkeypatch.setattr( restart_handoff, - "_record_backend_runtime_if_direct_serve", - lambda process, argv, **kwargs: events.append(f"record:{process.pid}:{list(argv)}:{kwargs['backend_port']}"), + "_stop_supervisor_before_restart", + lambda: 
events.append("stop-supervisor") or True, ) - monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: events.append("tasks") or None) code = restart_handoff.run(_handoff_args(tmp_path, restart_argv)) + assert code == 0 + assert events == [ + f"log:legacy_serve_restart_migrated argv={expected_restart_argv}", + "log:started parent_pid=1234 backend=127.0.0.1:8000 frontend=127.0.0.1:5173", + "wait-parent:1234", + "free-port:8000", + "tasks", + "stop-supervisor", + f"spawn:{expected_restart_argv}:{tmp_path}:True", + "log:restart_spawned pid=4321", + ] + + +def test_run_keeps_current_start_restart_argv(monkeypatch, tmp_path: Path) -> None: + events: list[str] = [] + restart_argv = [ + "python.exe", + "-m", + "flocks.cli.main", + "start", + "--no-browser", + "--skip-webui-build", + "--host", + "127.0.0.1", + "--port", + "5173", + ] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) + monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port: True) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: None) + monkeypatch.setattr(restart_handoff, "_stop_supervisor_before_restart", lambda: True) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda argv, cwd=None, close_fds=False: events.append(f"spawn:{list(argv)}:{cwd}:{close_fds}") + or SimpleNamespace(pid=4321), + ) + + code = restart_handoff.run(_handoff_args(tmp_path, restart_argv)) + + assert code == 0 + assert f"spawn:{restart_argv}:{tmp_path}:True" in events + + +def test_run_accepts_legacy_backend_pid_file_argument(monkeypatch, tmp_path: Path) -> None: + events: list[str] = [] + restart_argv = ["python.exe", "-m", "flocks.cli.main", "start"] + args = _handoff_args(tmp_path, restart_argv) + args[args.index("--install-root"):args.index("--install-root")] = [ + "--backend-pid-file", + 
str(tmp_path / "backend.pid"), + ] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) + monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port: True) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: None) + monkeypatch.setattr(restart_handoff, "_stop_supervisor_before_restart", lambda: True) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda argv, cwd=None, close_fds=False: events.append(f"spawn:{list(argv)}:{cwd}:{close_fds}") + or SimpleNamespace(pid=4321), + ) + + code = restart_handoff.run(args) + + assert code == 0 + assert f"spawn:{restart_argv}:{tmp_path}:True" in events + + +def test_run_prepares_handover_after_parent_exit_without_waiting_for_page_port( + monkeypatch, + tmp_path: Path, +) -> None: + events: list[str] = [] + restart_argv = ["python.exe", "-m", "flocks.cli.main", "start"] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr( + restart_handoff, + "_wait_for_parent_exit", + lambda parent_pid: events.append(f"wait-parent:{parent_pid}") or True, + ) + monkeypatch.setattr( + restart_handoff, + "_prepare_upgrade_handover", + lambda args: events.append(f"prepare:{args.version}") or True, + ) + monkeypatch.setattr( + restart_handoff, + "_ensure_backend_port_free", + lambda backend_port: events.append(f"free-port:{backend_port}") or True, + ) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: events.append("tasks") or None) + monkeypatch.setattr( + restart_handoff, + "_stop_supervisor_before_restart", + lambda: events.append("stop-supervisor") or True, + ) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda argv, cwd=None, close_fds=False: events.append(f"spawn:{list(argv)}:{cwd}:{close_fds}") + or 
SimpleNamespace(pid=4321), + ) + + code = restart_handoff.run(_handoff_args(tmp_path, restart_argv, prepare_handover=True)) + assert code == 0 assert events[1:] == [ "wait-parent:1234", - "free-port:8000:backend.pid", + "prepare:2026.4.1", "tasks", + "stop-supervisor", f"spawn:{restart_argv}:{tmp_path}:True", - f"record:4321:{restart_argv}:8000", "log:restart_spawned pid=4321", ] @@ -89,7 +221,7 @@ def test_run_does_not_spawn_when_parent_exit_times_out(monkeypatch, tmp_path: Pa ) monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: events.append("tasks") or None) - code = restart_handoff.run(_handoff_args(tmp_path, ["python.exe", "-m", "flocks.cli.main", "serve"])) + code = restart_handoff.run(_handoff_args(tmp_path, ["python.exe", "-m", "flocks.cli.main", "start"])) assert code == 1 assert events == ["log:started parent_pid=1234 backend=127.0.0.1:8000 frontend=127.0.0.1:5173", "log:parent_exit_timeout parent_pid=1234"] @@ -101,7 +233,7 @@ def test_run_does_not_spawn_when_upgrade_tasks_fail(monkeypatch, tmp_path: Path) monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) - monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port, backend_pid_file: True) + monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port: True) monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: "sync failed") monkeypatch.setattr(restart_handoff, "_rollback_failed_upgrade", lambda args, error: events.append(f"rollback:{error}")) monkeypatch.setattr( @@ -132,7 +264,7 @@ def crash(_args): monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) - monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda 
backend_port, backend_pid_file: True) + monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port: True) monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", crash) monkeypatch.setattr(restart_handoff, "_rollback_failed_upgrade", lambda args, error: events.append(f"rollback:{error}")) monkeypatch.setattr( @@ -149,10 +281,99 @@ def crash(_args): assert "spawn" not in events -def test_ensure_backend_port_free_stops_backend_after_wait_timeout(monkeypatch, tmp_path: Path) -> None: +def test_run_does_not_spawn_when_supervisor_stop_fails(monkeypatch, tmp_path: Path) -> None: + events: list[str] = [] + restart_argv = ["python.exe", "-m", "flocks.cli.main", "start"] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) + monkeypatch.setattr(restart_handoff, "_ensure_backend_port_free", lambda backend_port: True) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: None) + monkeypatch.setattr(restart_handoff, "_stop_supervisor_before_restart", lambda: False) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda *_args, **_kwargs: events.append("spawn"), + ) + + code = restart_handoff.run(_handoff_args(tmp_path, restart_argv)) + + assert code == 1 + assert "log:supervisor_stop_timeout" in events + assert "spawn" not in events + + +def test_run_rolls_back_prepared_handover_when_supervisor_stop_fails(monkeypatch, tmp_path: Path) -> None: + events: list[str] = [] + restart_argv = ["python.exe", "-m", "flocks.cli.main", "start"] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) + monkeypatch.setattr(restart_handoff, "_prepare_upgrade_handover", lambda args: events.append("prepare") or True) + 
monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: None) + monkeypatch.setattr(restart_handoff, "_stop_supervisor_before_restart", lambda: False) + monkeypatch.setattr(restart_handoff, "_rollback_upgrade_handover", lambda: events.append("rollback-handover")) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda *_args, **_kwargs: events.append("spawn"), + ) + + code = restart_handoff.run(_handoff_args(tmp_path, restart_argv, prepare_handover=True)) + + assert code == 1 + assert "rollback-handover" in events + assert "spawn" not in events + + +def test_run_rolls_back_prepared_handover_when_restart_spawn_fails(monkeypatch, tmp_path: Path) -> None: + events: list[str] = [] + restart_argv = ["python.exe", "-m", "flocks.cli.main", "start"] + + monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) + monkeypatch.setattr(restart_handoff, "_wait_for_parent_exit", lambda parent_pid: True) + monkeypatch.setattr(restart_handoff, "_prepare_upgrade_handover", lambda args: events.append("prepare") or True) + monkeypatch.setattr(restart_handoff, "_run_upgrade_tasks", lambda args: None) + monkeypatch.setattr(restart_handoff, "_stop_supervisor_before_restart", lambda: True) + monkeypatch.setattr(restart_handoff, "_rollback_upgrade_handover", lambda: events.append("rollback-handover")) + monkeypatch.setattr( + restart_handoff.subprocess, + "Popen", + lambda *_args, **_kwargs: (_ for _ in ()).throw(OSError("spawn failed")), + ) + + code = restart_handoff.run(_handoff_args(tmp_path, restart_argv, prepare_handover=True)) + + assert code == 1 + assert "log:restart_spawn_failed error=spawn failed" in events + assert "rollback-handover" in events + + +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_stop_supervisor_before_restart_waits_until_real_control_api_stops(monkeypatch) -> None: + short_root = 
make_short_runtime_root("flocks-handoff-") + monkeypatch.setenv("FLOCKS_ROOT", str(short_root)) + paths = service_manager.runtime_paths() + daemon, thread = start_supervisor( + service_manager.ServiceConfig(backend_port=9995, frontend_port=9996), + ) + + try: + wait_for_supervisor(paths, running=True) + + assert restart_handoff._stop_supervisor_before_restart(timeout_seconds=5.0, poll_interval_seconds=0.05) is True + + wait_for_supervisor(paths, running=False) + thread.join(timeout=5) + assert not thread.is_alive() + finally: + stop_supervisor(daemon, thread) + shutil.rmtree(short_root, ignore_errors=True) + + +def test_ensure_backend_port_free_waits_again_after_timeout(monkeypatch) -> None: events: list[str] = [] wait_results = iter([False, True]) - backend_pid_file = tmp_path / "backend.pid" monkeypatch.setattr(restart_handoff, "_record_handoff_log", lambda message: events.append(f"log:{message}")) monkeypatch.setattr( @@ -160,16 +381,10 @@ def test_ensure_backend_port_free_stops_backend_after_wait_timeout(monkeypatch, "_wait_for_backend_port_free", lambda port, **kwargs: events.append(f"wait:{port}:{kwargs.get('timeout_seconds')}") or next(wait_results), ) - monkeypatch.setattr( - restart_handoff.service_manager, - "stop_one", - lambda port, pid_file, name, console: events.append(f"stop:{port}:{pid_file.name}:{name}"), - ) - assert restart_handoff._ensure_backend_port_free(8000, backend_pid_file) is True + assert restart_handoff._ensure_backend_port_free(8000) is True assert events == [ "wait:8000:None", - "log:backend_port_still_in_use port=8000; stopping backend", - "stop:8000:backend.pid:backend", + "log:backend_port_still_in_use port=8000", "wait:8000:20.0", ] diff --git a/tests/updater/test_updater.py b/tests/updater/test_updater.py index 1a176274c..cbeb4b043 100644 --- a/tests/updater/test_updater.py +++ b/tests/updater/test_updater.py @@ -10,8 +10,15 @@ import pytest -from flocks.cli import service_manager +from flocks.cli import service_control, 
service_manager from flocks.updater import updater +from tests.helpers.service_supervisor import ( + make_short_runtime_root, + start_supervisor, + stop_supervisor, + wait_for_process_exit, + wait_for_supervisor, +) def _write_pyproject_version(pyproject_path: Path, version: str) -> None: @@ -39,6 +46,33 @@ def _prepare_real_restart_runtime(install_root: Path) -> None: python_path.chmod(0o755) +def _webui_control_payload(state: str = "healthy", last_error: str | None = None) -> dict[str, object]: + return { + "webui": { + "state": state, + "last_error": last_error, + }, + } + + +def _webui_control_status( + state: str = "healthy", + last_error: str | None = None, +) -> service_control.SupervisorStatus: + return service_control.parse_supervisor_status(_webui_control_payload(state, last_error)) + + +def test_current_service_config_requires_supervisor_control_api(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + service_control, + "read_supervisor_status", + lambda **_kwargs: (_ for _ in ()).throw(RuntimeError("control down")), + ) + + with pytest.raises(RuntimeError, match="Supervisor control API is unavailable"): + updater._current_service_config() + + def test_run_handles_none_process_output(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: def fake_run(*args, **kwargs): return subprocess.CompletedProcess(args=args[0], returncode=0, stdout=None, stderr=None) @@ -225,13 +259,6 @@ def test_find_executable_checks_windows_cmd_suffixes( assert updater._find_executable("npm") == str(npm_cmd) -def test_is_windows_file_in_use_error_detects_winerror32(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(updater.sys, "platform", "win32") - - assert updater._is_windows_file_in_use_error(PermissionError("[WinError 32] file in use")) is True - assert updater._is_windows_file_in_use_error(PermissionError("[WinError 5] access denied")) is False - - def test_is_uv_managed_python_runtime_error_detects_virtualenv_creation_failure() -> None: text = 
( "Failed to create temporary virtualenv\n" @@ -504,12 +531,14 @@ def test_build_dependency_sync_command_keeps_project_install_on_non_windows( assert updater._build_dependency_sync_command("uv") == ["uv", "sync", "--frozen", "--no-python-downloads"] -def test_wheel_build_config_does_not_force_include_flockshub() -> None: +def test_wheel_build_config_does_not_force_include_runtime_or_build_outputs() -> None: pyproject_path = Path(__file__).resolve().parents[2] / "pyproject.toml" pyproject = tomllib.loads(pyproject_path.read_text(encoding="utf-8")) wheel_config = pyproject["tool"]["hatch"]["build"]["targets"]["wheel"] + forced_includes = wheel_config.get("force-include", {}) - assert ".flocks/flockshub" not in wheel_config.get("force-include", {}) + assert ".flocks/flockshub" not in forced_includes + assert "webui/dist" not in forced_includes def test_build_frontend_subprocess_env_prepends_bundled_node_on_windows( @@ -772,6 +801,50 @@ def test_build_restart_argv_uses_venv_python_on_non_windows( ] +def test_build_restart_handoff_argv_rewrites_serve_to_managed_start( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + config = service_manager.ServiceConfig( + backend_host="10.0.0.8", + backend_port=5273, + frontend_host="10.0.0.8", + frontend_port=5273, + legacy_backend_host="0.0.0.0", + legacy_backend_port=9000, + ) + monkeypatch.setattr(updater, "_handoff_service_config", lambda: config) + monkeypatch.setattr(updater.os, "getpid", lambda: 1234) + + argv = updater._build_restart_handoff_argv( + ["python", "-m", "flocks.cli.main", "serve", "--host", "0.0.0.0", "--port", "9000"], + tmp_path, + uv_path="uv", + sync_timeout=300, + version="2026.4.1", + current_version="2026.3.31", + prepare_handover=True, + ) + + assert "--prepare-handover" in argv[: argv.index("--")] + assert argv[argv.index("--") + 1 :] == [ + "python", + "-m", + "flocks.cli.main", + "start", + "--no-browser", + "--skip-webui-build", + "--host", + "10.0.0.8", + "--port", + "5273", + 
"--server-host", + "0.0.0.0", + "--server-port", + "9000", + ] + + def test_refresh_global_cli_entry_creates_symlink_on_unix( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -990,92 +1063,134 @@ def test_safe_remove_renames_locked_directory_on_windows( assert (leftovers[0] / "dist" / "index.html").exists() -def test_prepare_upgrade_handover_writes_state_and_stops_frontend( +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_prepare_upgrade_handover_writes_state_and_stops_frontend_with_real_control_api( monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, ) -> None: - monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - paths = service_manager.RuntimePaths( - root=tmp_path / ".flocks", - run_dir=tmp_path / ".flocks" / "run", - log_dir=tmp_path / ".flocks" / "logs", - backend_pid=tmp_path / ".flocks" / "run" / "backend.pid", - frontend_pid=tmp_path / ".flocks" / "run" / "webui.pid", - backend_log=tmp_path / ".flocks" / "logs" / "backend.log", - frontend_log=tmp_path / ".flocks" / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - - calls: list[tuple[int, str]] = [] - monkeypatch.setattr(updater, "_current_service_config", lambda: service_manager.ServiceConfig()) + short_root = make_short_runtime_root("flocks-updater-") + monkeypatch.setenv("FLOCKS_ROOT", str(short_root)) + paths = service_manager.runtime_paths() + config = service_manager.ServiceConfig( + backend_host="127.0.0.1", + backend_port=9995, + frontend_host="127.0.0.1", + frontend_port=9996, + ) + daemon, thread = start_supervisor(config) + wait_for_supervisor(paths, running=True) + monkeypatch.setattr( updater, "_start_upgrade_page_server", - lambda config, version: {"upgrade_server_pid": 321, "page_dir": str(tmp_path / "page"), "page_log": str(tmp_path / "upgrade.log")}, - ) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, 
"_recorded_port", lambda _pid_file, default: default) - monkeypatch.setattr( - service_manager, - "stop_one", - lambda port, _pid_file, name, _console: calls.append((port, name)), + lambda _config, _version: { + "upgrade_server_pid": 321, + "page_dir": str(short_root / "page"), + "page_log": str(short_root / "logs" / "upgrade.log"), + }, ) - payload = updater._prepare_upgrade_handover("2026.3.31.1") - - assert calls == [(5173, "WebUI")] - assert payload["upgrade_server_pid"] == 321 - assert updater._read_upgrade_state()["version"] == "2026.3.31.1" - - -def test_prepare_upgrade_handover_restores_frontend_when_upgrade_page_fails( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - paths = service_manager.RuntimePaths( - root=tmp_path / ".flocks", - run_dir=tmp_path / ".flocks" / "run", - log_dir=tmp_path / ".flocks" / "logs", - backend_pid=tmp_path / ".flocks" / "run" / "backend.pid", - frontend_pid=tmp_path / ".flocks" / "run" / "webui.pid", - backend_log=tmp_path / ".flocks" / "logs" / "backend.log", - frontend_log=tmp_path / ".flocks" / "logs" / "webui.log", - ) - paths.run_dir.mkdir(parents=True) - paths.log_dir.mkdir(parents=True) - - calls: list[tuple[str, bool]] = [] - monkeypatch.setattr(updater, "_current_service_config", lambda: service_manager.ServiceConfig()) - monkeypatch.setattr(service_manager, "ensure_runtime_dirs", lambda: paths) - monkeypatch.setattr(service_manager, "_recorded_port", lambda _pid_file, default: default) - monkeypatch.setattr( - service_manager, - "stop_one", - lambda port, _pid_file, name, _console: calls.append((f"stop:{name}:{port}", True)), + try: + payload = updater._prepare_upgrade_handover("2026.3.31.1") + + status = service_control.read_supervisor_status(paths) + assert status.backend.paused is True + assert status.webui.paused is True + assert payload["upgrade_server_pid"] == 321 + assert payload["backend_port"] == 9995 + assert 
payload["frontend_port"] == 9996 + assert updater._read_upgrade_state()["version"] == "2026.3.31.1" + finally: + stop_supervisor(daemon, thread) + shutil.rmtree(short_root, ignore_errors=True) + + +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_prepare_upgrade_handover_restores_frontend_when_upgrade_page_fails_with_real_control_api( + monkeypatch: pytest.MonkeyPatch, +) -> None: + short_root = make_short_runtime_root("flocks-updater-") + monkeypatch.setenv("FLOCKS_ROOT", str(short_root)) + paths = service_manager.runtime_paths() + config = service_manager.ServiceConfig( + backend_host="127.0.0.1", + backend_port=9995, + frontend_host="127.0.0.1", + frontend_port=9996, ) + daemon, thread = start_supervisor(config) + wait_for_supervisor(paths, running=True) + calls: list[str] = [] - def fake_start_frontend(config, _console) -> None: - calls.append(("start_frontend", config.skip_frontend_build)) - - monkeypatch.setattr(service_manager, "start_frontend", fake_start_frontend) - monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: calls.append(("stop_page", True))) + monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **_kw: calls.append("stop_page")) monkeypatch.setattr( updater, "_start_upgrade_page_server", lambda _config, _version: (_ for _ in ()).throw(RuntimeError("page failed")), ) - with pytest.raises(RuntimeError, match="page failed"): - updater._prepare_upgrade_handover("2026.3.31.1") - - assert calls == [ - ("stop:WebUI:5173", True), - ("stop_page", True), - ("start_frontend", False), - ] - assert updater._read_upgrade_state() is None + try: + with pytest.raises(RuntimeError, match="page failed"): + updater._prepare_upgrade_handover("2026.3.31.1") + + status = service_control.read_supervisor_status(paths) + assert calls == ["stop_page"] + assert status.backend.paused is False + assert status.backend.pid is not None + assert status.webui.paused is False + assert 
status.webui.pid is None + assert status.webui.state == "static" + assert updater._read_upgrade_state() is None + finally: + stop_supervisor(daemon, thread) + shutil.rmtree(short_root, ignore_errors=True) + + +@pytest.mark.skipif(sys.platform == "win32", reason="uses the Unix domain socket control API") +def test_rollback_failed_update_resumes_backend_when_handoff_tasks_fail( + monkeypatch: pytest.MonkeyPatch, +) -> None: + short_root = make_short_runtime_root("flocks-updater-") + monkeypatch.setenv("FLOCKS_ROOT", str(short_root)) + paths = service_manager.runtime_paths() + config = service_manager.ServiceConfig( + backend_host="127.0.0.1", + backend_port=9995, + frontend_host="127.0.0.1", + frontend_port=9996, + ) + daemon, thread = start_supervisor(config) + wait_for_supervisor(paths, running=True) + monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **_kw: None) + + try: + updater._write_upgrade_state( + { + "version": "2026.4.1", + "backend_host": "127.0.0.1", + "backend_port": 9995, + "frontend_host": "127.0.0.1", + "frontend_port": 9996, + "skip_frontend_build": True, + } + ) + old_backend = daemon.backend.process + assert old_backend is not None + service_control.request_prepare_upgrade(paths=paths) + wait_for_process_exit(old_backend) + + updater._rollback_failed_update(None, short_root / "install", "2026.3.31") + + status = service_control.read_supervisor_status(paths) + assert status.backend.paused is False + assert status.webui.paused is False + assert status.backend.pid is not None + assert status.backend.pid != old_backend.pid + assert status.webui.pid is None + assert status.webui.state == "static" + assert updater._read_upgrade_state() is None + finally: + stop_supervisor(daemon, thread) + shutil.rmtree(short_root, ignore_errors=True) def test_recover_upgrade_state_restarts_frontend_and_clears_marker( @@ -1083,14 +1198,15 @@ def test_recover_upgrade_state_restarts_frontend_and_clears_marker( tmp_path: Path, ) -> None: 
monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - started: list[tuple[int, bool]] = [] + started: list[tuple[int, bool | None]] = [] stopped: list[str] = [] monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: stopped.append("stop")) monkeypatch.setattr( - service_manager, - "start_frontend", - lambda config, _console: started.append((config.frontend_port, config.skip_frontend_build)), + service_control, + "request_resume_upgrade", + lambda config, **_kwargs: started.append((config.frontend_port, config.skip_frontend_build)) + or _webui_control_status(), ) updater._write_upgrade_state( { @@ -1115,16 +1231,25 @@ def test_recover_upgrade_state_retries_frontend_with_build_when_dist_is_missing( tmp_path: Path, ) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - starts: list[bool] = [] + starts: list[tuple[str, bool | None, bool | None]] = [] monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: None) - def fake_start_frontend(config, _console) -> None: - starts.append(config.skip_frontend_build) - if config.skip_frontend_build: - raise service_manager.ServiceError("missing dist") + results = iter([ + _webui_control_payload("degraded", "missing dist"), + _webui_control_payload(), + ]) + + def fake_resume_upgrade(config, **_kwargs): + starts.append(("resume", config.skip_frontend_build, None)) + return service_control.parse_supervisor_status(next(results)) - monkeypatch.setattr(service_manager, "start_frontend", fake_start_frontend) + def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs): + starts.append(("restart_webui", config.skip_frontend_build, force_frontend_build or None)) + return service_control.parse_supervisor_status(next(results)) + + monkeypatch.setattr(service_control, "request_resume_upgrade", fake_resume_upgrade) + monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.3.31.1", @@ -1138,7 
+1263,7 @@ def fake_start_frontend(config, _console) -> None: updater.recover_upgrade_state() - assert starts == [True, False] + assert starts == [("resume", True, None), ("restart_webui", False, True)] assert updater._read_upgrade_state() is None @@ -1147,15 +1272,20 @@ def test_recover_upgrade_state_restart_failure_clears_state_without_restarting_p tmp_path: Path, ) -> None: monkeypatch.setenv("FLOCKS_ROOT", str(tmp_path / ".flocks")) - starts: list[bool] = [] + starts: list[tuple[str, bool | None, bool | None]] = [] monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: None) - def fake_start_frontend(config, _console) -> None: - starts.append(config.skip_frontend_build) - raise service_manager.ServiceError("still broken") + def fake_resume_upgrade(config, **_kwargs): + starts.append(("resume", config.skip_frontend_build, None)) + return _webui_control_status("degraded", "still broken") + + def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs): + starts.append(("restart_webui", config.skip_frontend_build, force_frontend_build or None)) + return _webui_control_status("degraded", "still broken") - monkeypatch.setattr(service_manager, "start_frontend", fake_start_frontend) + monkeypatch.setattr(service_control, "request_resume_upgrade", fake_resume_upgrade) + monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.3.31.1", @@ -1167,10 +1297,10 @@ def fake_start_frontend(config, _console) -> None: } ) - with pytest.raises(service_manager.ServiceError, match="still broken"): + with pytest.raises(RuntimeError, match="still broken"): updater.recover_upgrade_state() - assert starts == [True, False] + assert starts == [("resume", True, None), ("restart_webui", False, True)] assert updater._read_upgrade_state() is None @@ -1220,6 +1350,51 @@ def test_start_upgrade_page_server_binds_configured_frontend_host( assert captured["wait_host"] == "0.0.0.0" +def 
test_stop_upgrade_page_server_does_not_kill_unified_flocks_service( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + flocks_root = tmp_path / ".flocks" + monkeypatch.setenv("FLOCKS_ROOT", str(flocks_root)) + killed: list[int] = [] + + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [111]) + monkeypatch.setattr( + service_manager, + "_process_command_line", + lambda _pid: "/env/bin/python -m flocks.cli.main serve --host 127.0.0.1 --port 5173", + ) + monkeypatch.setattr(updater.os, "kill", lambda pid, _sig: killed.append(pid)) + + updater._stop_upgrade_page_server(frontend_port=5173) + + assert killed == [] + + +def test_stop_upgrade_page_server_kills_only_upgrade_page_process( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + flocks_root = tmp_path / ".flocks" + page_dir = flocks_root / "run" / "upgrade-page" + monkeypatch.setenv("FLOCKS_ROOT", str(flocks_root)) + killed: list[int] = [] + + def fake_command_line(pid: int) -> str: + if pid == 222: + return f"/env/bin/python -m http.server 5173 --directory {page_dir}" + return "/env/bin/python -m flocks.cli.main serve --host 127.0.0.1 --port 5173" + + monkeypatch.setattr(service_manager, "port_owner_pids", lambda _port: [111, 222]) + monkeypatch.setattr(service_manager, "_process_command_line", fake_command_line) + monkeypatch.setattr(updater.os, "kill", lambda pid, _sig: killed.append(pid)) + monkeypatch.setattr(updater.time, "sleep", lambda _seconds: None) + + updater._stop_upgrade_page_server(frontend_port=5173) + + assert killed == [222] + + def test_wait_for_upgrade_page_uses_access_host_for_local_probe( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -1286,12 +1461,21 @@ def test_rollback_failed_update_restores_backup_and_rebuilds_frontend_if_needed( monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: events.append("stop_page")) monkeypatch.setattr(updater.shutil, "rmtree", lambda path, ignore_errors=True: 
events.append(f"rmtree:{Path(path).name}")) - def fake_start_frontend(config, _console) -> None: - events.append(f"start_frontend:{config.skip_frontend_build}") - if config.skip_frontend_build: - raise service_manager.ServiceError("missing dist") + results = iter([ + _webui_control_payload("degraded", "missing dist"), + _webui_control_payload(), + ]) + + def fake_resume_upgrade(config, **_kwargs) -> service_control.SupervisorStatus: + events.append(f"resume:{config.skip_frontend_build}") + return service_control.parse_supervisor_status(next(results)) - monkeypatch.setattr(service_manager, "start_frontend", fake_start_frontend) + def fake_restart_webui(config, *, force_frontend_build=False, **_kwargs) -> service_control.SupervisorStatus: + events.append(f"restart_webui:{config.skip_frontend_build}:{force_frontend_build or None}") + return service_control.parse_supervisor_status(next(results)) + + monkeypatch.setattr(service_control, "request_resume_upgrade", fake_resume_upgrade) + monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.4.1", @@ -1311,8 +1495,8 @@ def fake_start_frontend(config, _console) -> None: "restore:backup.tar.gz:install", "marker:2026.3.31", "stop_page", - "start_frontend:True", - "start_frontend:False", + "resume:True", + "restart_webui:False:True", "rmtree:upgrade-page", ] assert updater._read_upgrade_state() is None @@ -1334,11 +1518,16 @@ def test_rollback_failed_update_clears_state_when_restore_and_frontend_both_fail monkeypatch.setattr(updater, "_stop_upgrade_page_server", lambda **kw: events.append("stop_page")) monkeypatch.setattr(updater.shutil, "rmtree", lambda path, ignore_errors=True: events.append(f"rmtree:{Path(path).name}")) - def fake_start_frontend(config, _console) -> None: - events.append(f"start_frontend:{config.skip_frontend_build}") - raise service_manager.ServiceError("frontend still broken") + def fake_resume_upgrade(config, **_kwargs) -> 
service_control.SupervisorStatus: + events.append(f"resume:{config.skip_frontend_build}") + return _webui_control_status("degraded", "frontend still broken") + + def fake_restart_webui(config, **_kwargs) -> service_control.SupervisorStatus: + events.append(f"restart_webui:{config.skip_frontend_build}") + return _webui_control_status("degraded", "frontend still broken") - monkeypatch.setattr(service_manager, "start_frontend", fake_start_frontend) + monkeypatch.setattr(service_control, "request_resume_upgrade", fake_resume_upgrade) + monkeypatch.setattr(service_control, "request_restart_webui", fake_restart_webui) updater._write_upgrade_state( { "version": "2026.4.1", @@ -1357,7 +1546,7 @@ def fake_start_frontend(config, _console) -> None: assert events == [ "stop_page", - "start_frontend:True", + "resume:True", "rmtree:upgrade-page", ] assert updater._read_upgrade_state() is None @@ -1513,7 +1702,7 @@ def test_replace_install_dir_copies_dot_flocks_plugins_from_source( @pytest.mark.asyncio -async def test_perform_update_schedules_handoff_after_handover( +async def test_perform_update_schedules_handoff_with_deferred_handover( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: @@ -1566,6 +1755,7 @@ async def fake_sleep(_seconds) -> None: lambda name: "/usr/bin/npm" if name in {"npm", "npm.cmd"} else "/usr/bin/uv", ) monkeypatch.setattr(updater, "_prepare_upgrade_handover", lambda _version: events.append("handover") or {}) + monkeypatch.setattr(updater, "_handoff_service_config", lambda: service_manager.ServiceConfig()) monkeypatch.setattr( updater, "_replace_install_dir", @@ -1576,7 +1766,11 @@ async def fake_sleep(_seconds) -> None: monkeypatch.setattr(updater.asyncio, "sleep", fake_sleep) monkeypatch.setattr(updater, "_rollback_failed_update", lambda *_args: events.append("rollback")) monkeypatch.setattr(updater, "rollback_upgrade_handover", lambda *_args: events.append("rollback_handover")) - monkeypatch.setattr(updater.subprocess, "Popen", lambda argv, 
**_kwargs: popen_calls.append(list(argv)) or SimpleNamespace(pid=4321)) + monkeypatch.setattr( + updater, + "_spawn_restart_handoff", + lambda argv, **_kwargs: popen_calls.append(list(argv)) or SimpleNamespace(pid=4321), + ) monkeypatch.setattr(updater.os, "_exit", lambda code: (_ for _ in ()).throw(SystemExit(code))) with pytest.raises(SystemExit, match="0"): @@ -1584,22 +1778,33 @@ async def fake_sleep(_seconds) -> None: pass assert events[:2] == ["replace", "sleep"] - assert "handover" in events + assert "handover" not in events assert len(popen_calls) == 1 handoff_argv = popen_calls[0] assert handoff_argv[:3] == ["/usr/bin/python3", "-m", "flocks.updater.restart_handoff"] assert "--uv-path" in handoff_argv assert "--version" in handoff_argv + assert "--prepare-handover" in handoff_argv[: handoff_argv.index("--")] assert handoff_argv[handoff_argv.index("--") + 1 :] == [ "/usr/bin/python3", "-m", "flocks.cli.main", "start", + "--no-browser", + "--skip-webui-build", + "--host", + "127.0.0.1", + "--port", + "5173", + "--server-host", + "127.0.0.1", + "--server-port", + "8000", ] @pytest.mark.asyncio -async def test_perform_update_errors_when_handover_fails_before_frontend_build( +async def test_perform_update_does_not_prepare_handover_before_spawning_handoff( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: @@ -1615,6 +1820,7 @@ async def test_perform_update_errors_when_handover_fails_before_frontend_build( install_root.mkdir() events: list[str] = [] + popen_calls: list[list[str]] = [] async def fake_get_updater_config(): return SimpleNamespace( @@ -1671,12 +1877,20 @@ async def fake_sleep(_seconds) -> None: lambda _version: (_ for _ in ()).throw(RuntimeError("handover boom")), ) monkeypatch.setattr(updater, "_restore_backup_if_possible", lambda *_args: events.append("restore")) + monkeypatch.setattr( + updater, + "_spawn_restart_handoff", + lambda argv, **_kwargs: popen_calls.append(list(argv)) or SimpleNamespace(pid=4321), + ) + 
monkeypatch.setattr(updater.os, "_exit", lambda code: (_ for _ in ()).throw(SystemExit(code))) - progresses = [step async for step in updater.perform_update("2026.4.1")] + with pytest.raises(SystemExit, match="0"): + async for _step in updater.perform_update("2026.4.1"): + pass - assert progresses[-1].stage == "error" - assert progresses[-1].message == "Failed to prepare WebUI handover: handover boom" - assert events == ["replace", "restore"] + assert events == ["replace"] + assert len(popen_calls) == 1 + assert "--prepare-handover" in popen_calls[0][: popen_calls[0].index("--")] @pytest.mark.asyncio @@ -2710,7 +2924,7 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): @pytest.mark.asyncio -async def test_perform_update_retries_after_windows_file_lock_and_rolls_back_handover_failures( +async def test_perform_update_reports_windows_file_lock_without_stopping_current_backend( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: @@ -2770,24 +2984,25 @@ def fake_replace_install_dir(*_args, **_kwargs): lambda name: "/usr/bin/npm" if name in {"npm", "npm.cmd"} else "/usr/bin/uv", ) monkeypatch.setattr(updater, "_prepare_upgrade_handover", lambda _version: events.append("handover")) + monkeypatch.setattr(updater, "_handoff_service_config", lambda: service_manager.ServiceConfig()) monkeypatch.setattr(updater, "_replace_install_dir", fake_replace_install_dir) monkeypatch.setattr(updater, "_rollback_failed_update", lambda *_args: events.append("rollback")) monkeypatch.setattr(updater, "_restore_backup_if_possible", lambda *_args: events.append("restore")) monkeypatch.setattr(updater, "_build_restart_argv", lambda install_root=None: [r"C:\tool\python.exe", "-m", "flocks.cli.main", "start"]) - monkeypatch.setattr(updater.subprocess, "Popen", lambda *_args, **_kwargs: events.append("popen") or SimpleNamespace(pid=4321)) + monkeypatch.setattr( + updater, + "_spawn_restart_handoff", + lambda *_args, **_kwargs: events.append("popen") or 
SimpleNamespace(pid=4321), + ) monkeypatch.setattr(updater.os, "_exit", lambda code: (_ for _ in ()).throw(SystemExit(code))) - with pytest.raises(SystemExit, match="0"): - async for _step in updater.perform_update("2026.4.1"): - pass + progresses = [step async for step in updater.perform_update("2026.4.1")] - assert events == [ - "replace-1", - "handover", - "replace-2", - "popen", - ] - assert "restore" not in events + assert progresses[-1].stage == "error" + assert "WinError 32" in progresses[-1].message + assert events == ["replace-1", "restore"] + assert "handover" not in events + assert "popen" not in events @pytest.mark.asyncio @@ -3017,7 +3232,7 @@ async def test_perform_update_spawns_restart_process_on_windows( (staged_webui / "dist").mkdir() (staged_webui / "dist" / "index.html").write_text("", encoding="utf-8") - popen_calls: list[tuple[list[str], Path, bool]] = [] + popen_calls: list[tuple[list[str], Path]] = [] events: list[str] = [] async def fake_get_updater_config(): @@ -3057,7 +3272,12 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): monkeypatch.setattr(updater, "_refresh_global_cli_entry", lambda _root: None) monkeypatch.setattr(updater, "_build_restart_argv", lambda install_root=None: [r"C:\tool\python.exe", "-m", "flocks.cli.main", "start"]) monkeypatch.setattr(updater, "_prepare_upgrade_handover", lambda _version: events.append("handover")) - monkeypatch.setattr(updater.subprocess, "Popen", lambda argv, cwd=None, close_fds=False: popen_calls.append((list(argv), cwd, close_fds)) or SimpleNamespace(pid=4321)) + monkeypatch.setattr(updater, "_handoff_service_config", lambda: service_manager.ServiceConfig()) + monkeypatch.setattr( + updater, + "_spawn_restart_handoff", + lambda argv, cwd=None: popen_calls.append((list(argv), cwd)) or SimpleNamespace(pid=4321), + ) monkeypatch.setattr(updater.os, "_exit", lambda code: (_ for _ in ()).throw(SystemExit(code))) monkeypatch.setattr(updater.os, "execv", lambda *_args: 
events.append("execv")) @@ -3066,19 +3286,29 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): pass assert len(popen_calls) == 1 - handoff_argv, cwd, close_fds = popen_calls[0] + handoff_argv, cwd = popen_calls[0] assert cwd == tmp_path / "install-root" - assert close_fds is True assert handoff_argv[:3] == [r"C:\tool\python.exe", "-m", "flocks.updater.restart_handoff"] assert "--parent-pid" in handoff_argv assert "--backend-port" in handoff_argv + assert "--prepare-handover" in handoff_argv[: handoff_argv.index("--")] assert handoff_argv[handoff_argv.index("--") + 1 :] == [ r"C:\tool\python.exe", "-m", "flocks.cli.main", "start", + "--no-browser", + "--skip-webui-build", + "--host", + "127.0.0.1", + "--port", + "5173", + "--server-host", + "127.0.0.1", + "--server-port", + "8000", ] - assert events == ["handover"] + assert events == [] assert "execv" not in events @@ -3259,10 +3489,11 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): monkeypatch.setattr(updater, "_refresh_global_cli_entry", lambda _root: None) monkeypatch.setattr(updater, "_build_restart_argv", lambda install_root=None: [r"C:\tool\python.exe", "-m", "flocks.cli.main"]) monkeypatch.setattr(updater, "_prepare_upgrade_handover", lambda _version: events.append("handover")) + monkeypatch.setattr(updater, "_handoff_service_config", lambda: service_manager.ServiceConfig()) monkeypatch.setattr(updater, "rollback_upgrade_handover", lambda: events.append("rollback_handover")) monkeypatch.setattr( - updater.subprocess, - "Popen", + updater, + "_spawn_restart_handoff", lambda *_args, **_kwargs: (_ for _ in ()).throw(OSError("spawn failed")), ) @@ -3270,4 +3501,5 @@ async def fake_run_async(cmd, cwd=None, timeout=None, env=None): assert progresses[-1].stage == "error" assert "Failed to restart service" in progresses[-1].message - assert "rollback_handover" in events + assert "handover" not in events + assert "rollback_handover" not in events diff --git 
a/webui/src/utils/restartPolling.test.ts b/webui/src/utils/restartPolling.test.ts index 0f3cc54f5..5e456d8d2 100644 --- a/webui/src/utils/restartPolling.test.ts +++ b/webui/src/utils/restartPolling.test.ts @@ -20,21 +20,22 @@ describe('checkRestartReadiness', () => { vi.restoreAllMocks(); }); - it('falls back to the loopback backend health endpoint during static handover', async () => { + it('checks same-origin health without probing the legacy backend port', async () => { const fetchMock = vi.fn(async (input: RequestInfo | URL) => { const url = String(input); if (url === '/api/health') { return new Response('', { status: 404 }); } - if (url === 'http://127.0.0.1:8000/api/health') { - return new Response(JSON.stringify({ status: 'healthy' }), { status: 200 }); - } return new Response('', { status: 200 }); }); vi.stubGlobal('fetch', fetchMock); - await expect(checkRestartReadiness()).resolves.toEqual({ ready: true }); + await expect(checkRestartReadiness()).resolves.toEqual({ + ready: false, + reason: 'health check returned HTTP 404', + }); expect(fetchMock).toHaveBeenCalledWith('/api/health', { cache: 'no-store' }); - expect(fetchMock).toHaveBeenCalledWith('http://127.0.0.1:8000/api/health', { cache: 'no-store' }); + expect(fetchMock).toHaveBeenCalledWith('/', { cache: 'no-store' }); + expect(fetchMock).not.toHaveBeenCalledWith('http://127.0.0.1:8000/api/health', { cache: 'no-store' }); }); }); diff --git a/webui/src/utils/restartPolling.ts b/webui/src/utils/restartPolling.ts index 2b9142bc4..dc142d5c1 100644 --- a/webui/src/utils/restartPolling.ts +++ b/webui/src/utils/restartPolling.ts @@ -29,17 +29,6 @@ async function readUpgradePageState(): Promise