diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c3e66892..4609fd80 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,6 +52,9 @@ jobs: docker compose -f docker-compose.test.yml run --rm pytest \ --cov-report=xml:/srv/api/coverage.xml + - name: Python format check (ruff) + run: docker compose -f docker-compose.test.yml run --rm ruff + - name: Run vitest with coverage # Override the default compose command (which runs `npm test`) to # run `npm run coverage` instead. Same apt-get / npm bootstrap as diff --git a/Dockerfile b/Dockerfile index a6511c59..4917668b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,9 @@ FROM node:24-bookworm-slim AS web-builder ARG NPM_VERSION=11.6.2 RUN npm install -g npm@${NPM_VERSION} WORKDIR /build -COPY app/package.json app/package-lock.json ./ +# .npmrc carries legacy-peer-deps=true (openapi-typescript's stale peer range +# vs TS 6) — it MUST be copied before `npm ci` or resolution fails with ERESOLVE. +COPY app/package.json app/package-lock.json app/.npmrc ./ RUN --mount=type=cache,target=/root/.npm \ npm ci --no-audit --no-fund COPY app/ ./ @@ -49,8 +51,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Python source COPY api/ ./api/ -# Built frontend → /srv/api/static (matches api/server.py default STATIC_DIR -# resolution: Path(__file__).parent / "static"). No env var needed. +# Built frontend → /srv/api/static (matches api/app.py DEFAULT_STATIC_DIR +# resolution: Path(__file__).resolve().parent / "static"). No env var needed. COPY --from=web-builder /build/dist /srv/api/static # pyproject.toml uses hatch-vcs (`source = "vcs"`) for dynamic versioning, @@ -81,6 +83,9 @@ HEALTHCHECK --interval=10s --timeout=2s --start-period=3s --retries=3 \ # re-sync that re-downloads dev deps and tries to reinstall the console # script into /srv/.venv/bin (read-only for the non-root runtime user). # Zombie reaping + signal propagation are handled by Docker's --init. 
+# `python -m api` launches a single uvicorn process (api.app:app) — single +# process by design, see api/security.py (the allowed_roots trust set is +# in-memory; multi-worker would split it). ENTRYPOINT ["/srv/.venv/bin/python", "-m", "api"] CMD ["--port", "8080"] diff --git a/README.md b/README.md index 1533f655..165d6059 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,9 @@ just run # run the local image like an end user Each worktree gets its own `.localhost` URL so source-picker recents stay isolated per project in localStorage. -The pre-push hook runs pytest, vitest, eslint, prettier, and typecheck before pushing to origin. Bypass with `git push --no-verify` if needed. Docker must be running. +The pre-push hook runs pytest, ruff (Python format), vitest, eslint, prettier, and typecheck before pushing to origin. Bypass with `git push --no-verify` if needed. Docker must be running. Apply Python formatting with `just fmt`. + +The backend is a [FastAPI](https://fastapi.tiangolo.com/) app on uvicorn — a single process by design, since the in-memory scan-root trust set (`api/security.py`) can't be split across workers. Scan progress streams over Server-Sent Events (`GET /api/manifest`). Interactive API docs render at `/api/docs` ([Scalar](https://github.com/scalar/scalar)), with the raw schema at `/api/openapi.json` — the source of truth for the generated frontend wire types (`just gen-types`, guarded against drift by `app/src/types/manifest.contract.ts`). ## Release diff --git a/api/__main__.py b/api/__main__.py index 3b486353..d0643570 100644 --- a/api/__main__.py +++ b/api/__main__.py @@ -1,28 +1,22 @@ """api CLI entrypoint. -Surface: - python -m api Serve on :8080. - python -m api --port 8000 Override port. - python -m api --reload Auto-reload on .py changes (dev only). - python -m api --version Print version. + python -m api Serve on :8080 (single uvicorn process). + python -m api --port 8000 Override port. 
+ python -m api --reload Auto-reload on source changes (dev only). + python -m api --version Print version. -The container ENTRYPOINT runs `python -m api`, so this is the only entrypoint -in production. Dev mode uses --reload via docker-compose.dev.yml. - -Port + browser-opening logic that lived in the old cli.py is now external: -Docker handles port mapping (-p HOST:CONTAINER), and end users open the URL -themselves. +SINGLE PROCESS by design — see api/security.py (the allowed_roots trust +set is in-memory; multi-worker would split it). No --workers flag. """ from __future__ import annotations import argparse -import os -import signal import sys -import threading from typing import Optional +import uvicorn + from api import __version__ @@ -32,65 +26,23 @@ def _build_parser() -> argparse.ArgumentParser: description="Visualize a codebase as an isometric 3D city.", ) p.add_argument("--version", action="version", version=f"codecity {__version__}") - p.add_argument( - "--port", - type=int, - default=8080, - help="HTTP port to listen on (default: 8080).", - ) - p.add_argument( - "--reload", - action="store_true", - help="Watch api/**/*.py and re-exec on change (dev only).", - ) + p.add_argument("--port", type=int, default=8080, help="HTTP port (default 8080).") + p.add_argument("--host", default="0.0.0.0", help="Bind host (default 0.0.0.0).") + p.add_argument("--reload", action="store_true", help="Auto-reload (dev only).") return p def main(argv: Optional[list[str]] = None) -> int: - if argv is None: - argv = sys.argv[1:] - args = _build_parser().parse_args(argv) - - if args.reload: - # Defer the import — keeps watchfiles off the cold-start import graph - # for `codecity --version` / `--help` / non-reload runs. - try: - from api._reload import run_with_reload - except ImportError: - print( - "error: --reload is not yet wired up. 
" - "Use docker compose -f docker-compose.dev.yml up for dev mode.", - file=sys.stderr, - ) - return 2 - return run_with_reload(port=args.port) - - return _serve(port=args.port) - - -def _serve(port: int) -> int: - from api.server import start_server - - _, bound, shutdown = start_server(port=port, host="0.0.0.0") - print( - f"[codecity] listening on http://0.0.0.0:{bound}/", - file=sys.stderr, - flush=True, + args = _build_parser().parse_args(sys.argv[1:] if argv is None else argv) + uvicorn.run( + "api.app:app", + host=args.host, + port=args.port, + reload=args.reload, + reload_dirs=["api"] if args.reload else None, + workers=1, + log_level="info", ) - print("[codecity] Ctrl-C to stop", file=sys.stderr, flush=True) - - stop_event = threading.Event() - - def _handle_signal(signum: int, _frame: object) -> None: - stop_event.set() - - signal.signal(signal.SIGINT, _handle_signal) - signal.signal(signal.SIGTERM, _handle_signal) - - try: - stop_event.wait() - finally: - shutdown() return 0 diff --git a/api/_reload.py b/api/_reload.py deleted file mode 100644 index 791bba6b..00000000 --- a/api/_reload.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Dev-only: watch api/**/*.py and re-exec the process on change. - -Used by `python -m api --reload` (i.e. docker-compose.dev.yml's api command). -Not imported in prod — keeps watchfiles out of the runtime hot path. - -Implementation: re-exec via os.execv. Simpler than child-process supervision -and Compose's `init: true` ensures we get a clean shutdown on SIGTERM. The -api/scan.py and api/server.py state is fully recreated on re-exec. 
-""" - -from __future__ import annotations - -import os -import signal -import sys -from pathlib import Path -from threading import Event, Thread - -WATCH_ROOT = Path(__file__).resolve().parent - - -def _is_python_source(path: Path) -> bool: - """True if `path` is a .py file under WATCH_ROOT, excluding __pycache__.""" - if path.suffix != ".py": - return False - if "__pycache__" in path.parts: - return False - try: - path.resolve().relative_to(WATCH_ROOT) - except ValueError: - return False - return True - - -def run_with_reload(port: int) -> int: - """Run the server with auto-reload on api/**/*.py changes. - - Returns the server's exit code. Re-execs on first detected change instead - of returning — execv replaces the process image, so the function never - returns in that path. - """ - from watchfiles import watch - - from api.server import start_server - - _, bound, shutdown = start_server(port=port, host="0.0.0.0") - print( - f"[codecity] listening on http://0.0.0.0:{bound}/ (reload enabled)", - file=sys.stderr, - flush=True, - ) - - # One Event drives both the watcher thread (stops the watch() loop) and - # the main thread (wakes from wait() on SIGTERM/SIGINT). The watcher - # never sets it — execv replaces the process before that matters — but - # the main signal handler does. - shutdown_event = Event() - - def _watcher() -> None: - for changes in watch(str(WATCH_ROOT), stop_event=shutdown_event): - relevant = [path for _change, path in changes if _is_python_source(Path(path))] - if not relevant: - continue - for p in relevant: - print(f"[codecity] reload triggered by {p}", file=sys.stderr, flush=True) - # Shut down the server cleanly, then re-exec ourselves with the - # same argv. execv replaces the process — no return. - try: - shutdown() - except Exception as e: # pylint: disable=broad-except - print( - f"[codecity] shutdown error before reload: {e}", - file=sys.stderr, - flush=True, - ) - # Re-exec with `python -m api `. 
sys.argv[1:] preserves - # --port and --reload because argparse doesn't consume args destructively. - # Revisit if we add subcommands or env-driven config that argparse mutates. - os.execv(sys.executable, [sys.executable, "-m", "api", *sys.argv[1:]]) - - Thread(target=_watcher, daemon=True, name="cc-reload-watcher").start() - - def _handle(signum: int, _frame: object) -> None: - shutdown_event.set() - - signal.signal(signal.SIGINT, _handle) - signal.signal(signal.SIGTERM, _handle) - - try: - shutdown_event.wait() - finally: - shutdown() - return 0 diff --git a/api/app.py b/api/app.py new file mode 100644 index 00000000..aead40fa --- /dev/null +++ b/api/app.py @@ -0,0 +1,75 @@ +"""FastAPI app factory. + +Order matters: API routers register first, the SPA static catch-all last +(it owns every non-/api path). Swagger + default ReDoc are disabled; +Scalar is mounted at /api/docs and OpenAPI JSON relocated to +/api/openapi.json (the source for the generated TS types).""" + +from __future__ import annotations + +from pathlib import Path + +from fastapi import FastAPI, HTTPException, Request +from fastapi.middleware.gzip import GZipMiddleware +from fastapi.responses import HTMLResponse, JSONResponse + +from api.config import GZIP_MIN_BYTES +from api.models.responses import ErrorResponse +from api.routers import commit, file, manifest, meta +from api.sse_compression import SSEGZipMiddleware +from api.static import make_static_router + +DEFAULT_STATIC_DIR = Path(__file__).resolve().parent / "static" + +_SCALAR_HTML = """CodeCity API + + + +""" + + +def _scalar_docs() -> HTMLResponse: + """Serve the Scalar API-reference UI at /api/docs.""" + return HTMLResponse(_SCALAR_HTML) + + +async def _api_error_handler(_request: Request, exc: Exception) -> JSONResponse: + """Render HTTPExceptions as a uniform ErrorResponse JSON body (so an + unknown /api/* path is a 404 JSON, not HTML).""" + status = exc.status_code if isinstance(exc, HTTPException) else 500 + detail = exc.detail if 
isinstance(exc, HTTPException) else "internal server error" + return JSONResponse( + status_code=status, content=ErrorResponse(error=detail).model_dump() + ) + + +def create_app(static_dir: Path | None = None) -> FastAPI: + # NB: the process-global TRUST set is intentionally NOT reset here — the + # factory must be side-effect-free on session auth state. A fresh process + # starts with an empty TRUST; tests isolate it via an autouse fixture. + app = FastAPI( + title="CodeCity API", + docs_url=None, # disable Swagger UI + redoc_url=None, # disable default ReDoc + openapi_url="/api/openapi.json", + ) + # GZip compresses ordinary responses (it skips text/event-stream); the SSE + # middleware stream-gzips the manifest event stream with per-event flush. + app.add_middleware(GZipMiddleware, minimum_size=GZIP_MIN_BYTES) + app.add_middleware(SSEGZipMiddleware) + + # Registered by reference (not as decorated nested functions) so they are + # plain module-level handlers — no pyright reportUnusedFunction ignore. + app.add_api_route("/api/docs", _scalar_docs, include_in_schema=False) + app.add_exception_handler(HTTPException, _api_error_handler) + + app.include_router(meta.router) + app.include_router(file.router) + app.include_router(commit.router) + app.include_router(manifest.router) + app.include_router(make_static_router(static_dir or DEFAULT_STATIC_DIR)) + return app + + +# Module-level instance for `uvicorn api.app:app` (prod + --reload). +app = create_app() diff --git a/api/config.py b/api/config.py new file mode 100644 index 00000000..85d5a134 --- /dev/null +++ b/api/config.py @@ -0,0 +1,47 @@ +"""Process configuration: env-driven flags and size limits. + +Replaces the old api/env.py. Functions that read env are intentionally +LIVE (re-read per call) so tests can monkeypatch os.environ without a +restart — notably CODECITY_ALLOW_LOCAL_REPOS, which gates local scans. 
+""" + +from __future__ import annotations + +import os +from pathlib import Path + +# Cap individual /api/file responses (stray symlink to a giant blob). +MAX_FILE_BYTES = 100 * 1024 * 1024 +# Bodies under this skip gzip — framing overhead exceeds the savings. +GZIP_MIN_BYTES = 256 + +# Root for every on-disk cache — the single source of truth for where codecity +# stores things. cache.py hangs its manifest/file-stat/git-history subdirs off +# this; clone.py its `clones/` dir. Read once at import (a fixed location, not a +# live flag); override with CODECITY_CACHE_ROOT (e.g. an XDG dir or a writable +# mount in containers). Tests monkeypatch the per-module copies. +CACHE_ROOT = Path( + os.environ.get("CODECITY_CACHE_ROOT") or Path.home() / ".cache" / "codecity" +) + +# Permissive truthy set (case-insensitive, trimmed) — matches the prior +# api/env.py semantics so e.g. `-e CODECITY_FOO=yes` keeps working. +_TRUTHY = frozenset({"1", "true", "yes", "on"}) + + +def env_bool(name: str, default: bool = False) -> bool: + """True if env var `name` is a truthy string (1/true/yes/on, any case).""" + raw = os.environ.get(name) + if raw is None: + return default + return raw.strip().lower() in _TRUTHY + + +def local_repos_allowed() -> bool: + """Live read of CODECITY_ALLOW_LOCAL_REPOS (re-read per call).""" + return env_bool("CODECITY_ALLOW_LOCAL_REPOS") + + +def quiet() -> bool: + """Live read of CODECITY_QUIET — silences disconnect/scan logs.""" + return env_bool("CODECITY_QUIET") diff --git a/api/env.py b/api/env.py deleted file mode 100644 index d938aff2..00000000 --- a/api/env.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Permissive boolean env-var parsing. - -Single helper shared by every codecity env-driven bool. Truthy values -(case-insensitive, whitespace-trimmed): "1", "true", "yes", "on". -Anything else — including unset, "", "0", "false", "no", "off" — is -False (or the supplied default for unset). 
- -Matches the convention used by Docker / Django / typical CLI tooling -so users setting ``-e CODECITY_FOO=true`` aren't surprised by silent -failure. -""" - -from __future__ import annotations - -import os - -_TRUTHY = frozenset({"1", "true", "yes", "on"}) - - -def env_bool(name: str, default: bool = False) -> bool: - """Read env var ``name`` as a permissive boolean. - - Returns ``default`` if the variable is unset. For any set value, - returns True only when the trimmed lower-case value is in the - truthy set above. - """ - raw = os.environ.get(name) - if raw is None: - return default - return raw.strip().lower() in _TRUTHY diff --git a/api/models/__init__.py b/api/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/api/models/events.py b/api/models/events.py new file mode 100644 index 00000000..c055fd8a --- /dev/null +++ b/api/models/events.py @@ -0,0 +1,57 @@ +"""SSE event payloads for /api/manifest. Each is the `data:` body of a named +SSE event. Event names describe what the event delivers (not its position in +the sequence): clone-progress / scan-progress / manifest-partial / +manifest-complete / error. These models also document the stream in OpenAPI as +schema components.""" + +from __future__ import annotations + +from typing import Annotated, Literal, Optional + +from pydantic import BaseModel, WithJsonSchema + +from api.models.manifest import Manifest, OptionalInt, OptionalStr + +# These progress fields are absent-or-value, never null on the wire — same +# optional-but-non-nullable treatment as the manifest's optional fields. 
+_OptionalStage = Annotated[ + Optional[Literal["receiving", "resolving", "counting"]], + WithJsonSchema({"enum": ["receiving", "resolving", "counting"], "type": "string"}), +] + + +class CloneProgressEvent(BaseModel): + """`clone-progress` — git source is being cloned; carries clone progress.""" + + display_root: OptionalStr = None + stage: _OptionalStage = None + percent: OptionalInt = None + + +class ScanProgressEvent(BaseModel): + """`scan-progress` — the working tree is being walked; carries the + heartbeat files-scanned count.""" + + display_root: OptionalStr = None + files_scanned: OptionalInt = None + + +class PartialManifestEvent(BaseModel): + """`manifest-partial` — a manifest with the real tree structure but + placeholder file metadata, sent so the UI can paint the city before + per-file metadata is resolved.""" + + manifest: Manifest + + +class CompleteManifestEvent(BaseModel): + """`manifest-complete` — a manifest with real, fully-populated metadata (a + fresh scan's final pass, or a warm cache hit).""" + + manifest: Manifest + + +class ErrorEvent(BaseModel): + """`error` — a failure after the stream began; carries the message.""" + + error: str diff --git a/api/models/manifest.py b/api/models/manifest.py new file mode 100644 index 00000000..05f4e04d --- /dev/null +++ b/api/models/manifest.py @@ -0,0 +1,124 @@ +"""Pydantic wire models for the scan manifest. Single source of truth for +the OpenAPI schema and the generated app/src/types/manifest.ts. JSON shape +is byte-compatible with the prior TypedDicts.""" + +from __future__ import annotations + +from typing import Annotated, Literal, Optional, Union + +from pydantic import BaseModel, Field, WithJsonSchema, model_validator + +# Optional-but-non-nullable: the field may be absent, but when present is never +# null. 
The Python type stays Optional (so the default is None and validators +# can check `is None`), while the emitted JSON schema is the bare non-nullable +# type — matching the true wire (absent-or-value, never null). Shared with +# api/models/events.py. +OptionalInt = Annotated[Optional[int], WithJsonSchema({"type": "integer"})] +OptionalStr = Annotated[Optional[str], WithJsonSchema({"type": "string"})] + + +# `created`/`modified` are required-nullable: the scanner always emits the keys +# (as an ISO string or null), so they're present-but-nullable on the wire, not +# optional. (No `= None` default → Pydantic treats them as required.) +class GitMeta(BaseModel): + created: Optional[str] = Field(description="ISO create date, or null") + modified: Optional[str] = Field(description="ISO modify date, or null") + + +class FileNode(BaseModel): + name: str + type: Literal["file"] + path: str + fullPath: str + extension: str + size: int + lines: int + binary: bool + created: str + modified: str + git: GitMeta + # Optional-but-non-nullable (absent for non-media files, a pixel count + # otherwise — never null); see OptionalInt above. 
+ media_width: OptionalInt = None + media_height: OptionalInt = None + + @model_validator(mode="after") + def _media_both_or_neither(self) -> "FileNode": + if (self.media_width is None) != (self.media_height is None): + raise ValueError( + "media_width and media_height must both be set or both absent" + ) + return self + + +class ExtBreakdownEntry(BaseModel): + ext: str + count: int + size: int + + +class DirNode(BaseModel): + name: str + type: Literal["directory"] + path: str + fullPath: str + children: list["TreeNode"] + children_count: int + children_file_count: int + children_dir_count: int + descendants_count: int + descendants_file_count: int + descendants_dir_count: int + descendants_size: int + descendants_ext_breakdown: list[ExtBreakdownEntry] + + +TreeNode = Annotated[Union[FileNode, DirNode], Field(discriminator="type")] + + +# All four string fields are required-nullable: the scanner always emits them +# (null for a fresh repo with no HEAD / no remote), so they're present-but- +# nullable on the wire, not optional. +class RepoInfo(BaseModel): + branch: Optional[str] + remote_url: Optional[str] + head_sha: Optional[str] + head_subject: Optional[str] + dirty: bool + + +class CommitEntry(BaseModel): + date: str = Field(description="YYYY-MM-DD") + files: int + sha: str + authors: list[str] + subject: str + same_day_total: int + + +class BusynessThresholds(BaseModel): + avg: int + busy: int + + +class Manifest(BaseModel): + root: str + scanned_at: str + signature: str + tree_signature: str + tree: DirNode + repo: RepoInfo + commits: list[CommitEntry] + busyness: BusynessThresholds + # Optional-but-non-nullable (absent for local sources, a label string for + # git sources — never null); see OptionalStr above. 
+ display_root: OptionalStr = None + + +class SignatureResponse(BaseModel): + root: str + scanned_at: str + signature: str + + +DirNode.model_rebuild() diff --git a/api/models/responses.py b/api/models/responses.py new file mode 100644 index 00000000..9025e054 --- /dev/null +++ b/api/models/responses.py @@ -0,0 +1,35 @@ +"""Non-streaming JSON response bodies.""" + +from __future__ import annotations + +from pydantic import BaseModel + + +class ErrorResponse(BaseModel): + error: str + + +class FileTooLargeResponse(BaseModel): + error: str + size: int + limit: int + + +class HealthResponse(BaseModel): + ok: bool + + +class ConfigResponse(BaseModel): + allowLocalRepos: bool + + +class CacheClearResponse(BaseModel): + deleted: int + + +class CommitDetailResponse(BaseModel): + sha: str + authors: list[str] + date: str # YYYY-MM-DD + subject: str + body: str diff --git a/api/routers/__init__.py b/api/routers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/api/routers/commit.py b/api/routers/commit.py new file mode 100644 index 00000000..ff98a6b6 --- /dev/null +++ b/api/routers/commit.py @@ -0,0 +1,62 @@ +"""GET /api/commit?sha= — commit detail from any registered scan root.""" + +from __future__ import annotations + +import re +import subprocess + +from fastapi import APIRouter, HTTPException, Query + +from api.models.responses import CommitDetailResponse +from api.security import TRUST +from api.services.scan import build_authors_list + +router = APIRouter(prefix="/api", tags=["commit"]) + +_SHA_RE = re.compile(r"^[0-9a-fA-F]{7,40}$") +_FMT = ( + "%H%x00%an%x00%aI%x00%s%x00" + "%(trailers:key=Co-authored-by,valueonly,separator=%x1f)%x00%b" +) + + +@router.get("/commit", response_model=CommitDetailResponse) +def get_commit(sha: str = Query(...)) -> CommitDetailResponse: + if not _SHA_RE.match(sha.strip()): + raise HTTPException(400, "invalid or missing sha") + roots = TRUST.snapshot() + if not roots: + raise HTTPException( + 404, "no scan root 
registered yet — fetch /api/manifest first" + ) + for root in roots: + try: + out = subprocess.check_output( + [ + "git", + "-c", + "safe.directory=*", + "-C", + str(root), + "show", + "-s", + f"--format={_FMT}", + sha.strip(), + ], + stderr=subprocess.DEVNULL, + text=True, + ) + except subprocess.CalledProcessError: + continue + parts = out.rstrip("\n").split("\x00", 5) + if len(parts) < 6: + continue + full_sha, author, iso_date, subject, trailers_raw, body = parts + return CommitDetailResponse( + sha=full_sha, + authors=build_authors_list(author, trailers_raw), + date=iso_date[:10], + subject=subject, + body=body, + ) + raise HTTPException(404, "sha not found in any registered scan root") diff --git a/api/routers/file.py b/api/routers/file.py new file mode 100644 index 00000000..b3056a29 --- /dev/null +++ b/api/routers/file.py @@ -0,0 +1,60 @@ +"""GET /api/file — serve a file from disk, restricted to scanned roots.""" + +from __future__ import annotations + +import mimetypes +from pathlib import Path + +from fastapi import APIRouter, HTTPException, Query, Response +from fastapi.responses import JSONResponse + +from api.config import MAX_FILE_BYTES +from api.models.responses import FileTooLargeResponse +from api.security import NoRootsRegisteredError, OutsideRootError, TRUST +from api.services.media import is_media + +router = APIRouter(prefix="/api", tags=["file"]) + + +@router.get("/file") +def get_file( + path: str = Query(..., description="Absolute path inside a scanned root"), +) -> Response: + try: + target = TRUST.assert_inside(Path(path)) + except NoRootsRegisteredError: + raise HTTPException( + 403, "no scan root registered yet — fetch /api/manifest first" + ) + except OutsideRootError: + raise HTTPException(403, "outside scan root") + except (OSError, RuntimeError): + raise HTTPException(404, "not found") + + if not target.is_file(): + raise HTTPException(404, "not a file") + + size = target.stat().st_size + if size > MAX_FILE_BYTES: + return 
JSONResponse( + status_code=413, + content=FileTooLargeResponse( + error="file too large", size=size, limit=MAX_FILE_BYTES + ).model_dump(), + ) + + guessed, _ = mimetypes.guess_type(str(target)) + body = target.read_bytes() + if is_media(guessed) and guessed: + # Already-compressed media (image/video/audio/pdf): a set Content- + # Encoding makes the app-wide GZipMiddleware skip it, so we don't burn + # CPU re-deflating incompressible bytes for ~0 benefit. 'identity' = + # no encoding, body sent as-is (RFC 9110 §12.5.3). + return Response( + content=body, + media_type=guessed, + headers={"Content-Encoding": "identity"}, + ) + # Non-media (code, configs, extensionless) → text/plain so the preview + # renders the bytes as code; GZipMiddleware compresses it (text gzips well). + return Response(content=body, media_type="text/plain; charset=utf-8") diff --git a/api/routers/manifest.py b/api/routers/manifest.py new file mode 100644 index 00000000..99b570fc --- /dev/null +++ b/api/routers/manifest.py @@ -0,0 +1,289 @@ +"""The manifest routes: GET /api/manifest (SSE stream), GET +/api/manifest/signature, DELETE /api/manifest/cache. + +Source classification/resolution lives in api.services.source; these are the +thin HTTP handlers over it. 
A ResolveError carries a status + message: the +signature/cache routes turn it into an HTTPException, while the SSE route turns +it into an `error` event (EventSource can't read 4xx bodies).""" + +from __future__ import annotations + +import asyncio +import json +import logging +import threading +from pathlib import Path +from typing import Any, AsyncIterator, Union + +from fastapi import APIRouter, HTTPException, Query, Request +from sse_starlette.sse import EventSourceResponse + +from api.models.events import ( + CloneProgressEvent, + CompleteManifestEvent, + ErrorEvent, + PartialManifestEvent, + ScanProgressEvent, +) +from api.models.manifest import SignatureResponse +from api.models.responses import CacheClearResponse +from api.security import TRUST +from api.services.cache import ( + cache_clear_manifests, + cache_load_manifest, + cache_save_manifest, +) +from api.services.clone import ( + BranchNotFoundError, + CloneError, + HostUnreachableError, + RepoNotFoundError, + clone_dir_for, + ensure_clone, +) +from api.services.scan import ScanCancelledError, scan_tree, signature_tree +from api.services.source import ( + ResolveError, + classify, + resolve_local, + resolve_source, +) + +router = APIRouter(prefix="/api", tags=["manifest"]) + +logger = logging.getLogger("codecity.manifest") + + +@router.get("/manifest/signature", response_model=SignatureResponse) +def signature( + src: str = Query(...), + branch: str | None = Query(None), + no_cache: bool = Query(False), +) -> SignatureResponse: + try: + resolved = resolve_source(src, branch) + except ResolveError as e: + raise HTTPException(e.status, e.message) + try: + sig = signature_tree(str(resolved.path), use_cache=not no_cache) + except Exception as e: # noqa: BLE001 + raise HTTPException(500, f"signature failed: {e}") + return SignatureResponse.model_validate(dict(sig)) + + +@router.delete("/manifest/cache", response_model=CacheClearResponse) +def clear_cache( + src: str = Query(...), + branch: str | None = 
Query(None), +) -> CacheClearResponse: + if not src: + raise HTTPException(400, "missing 'src' query param") + kind = classify(src) + if kind == "invalid": + raise HTTPException(400, "unrecognized source — pass a local path or a git URL") + if kind == "git": + abs_root = clone_dir_for(src, branch) + else: + # Non-strict resolve so a recents entry for a since-deleted path + # still drops its cache. + abs_root = Path(src).resolve(strict=False) + return CacheClearResponse(deleted=cache_clear_manifests(abs_root)) + + +def _sse(event: str, payload: dict[str, Any]) -> dict[str, Any]: + """sse-starlette event dict: {'event': name, 'data': json-string}.""" + return {"event": event, "data": json.dumps(payload)} + + +def _sse_error(message: str) -> dict[str, Any]: + """An `error` SSE event, single-sourced through the ErrorEvent model.""" + return _sse("error", ErrorEvent(error=message).model_dump()) + + +# Documented SSE event union: surfacing all five event models in the +# OpenAPI `responses` registers each as a schema component (richer Scalar +# docs) AND transitively pulls Manifest -> tree types via the manifest events. +SSEEvent = Union[ + CloneProgressEvent, + ScanProgressEvent, + PartialManifestEvent, + CompleteManifestEvent, + ErrorEvent, +] + + +@router.get( + "/manifest", + responses={ + 200: { + "description": ( + "Server-Sent Events stream (`text/event-stream`). Named events and " + "their JSON `data` payloads: `clone-progress` (CloneProgressEvent), " + "`scan-progress` (ScanProgressEvent), `manifest-partial` " + "(PartialManifestEvent), `manifest-complete` (CompleteManifestEvent), " + "`error` (ErrorEvent). The client closes the connection on " + "`manifest-complete`/`error`." 
+ ), + "model": SSEEvent, + }, + }, +) +async def manifest( + request: Request, + src: str = Query(""), + branch: str | None = Query(None), + no_cache: bool = Query(False), +) -> EventSourceResponse: + use_cache = not no_cache + + async def gen() -> AsyncIterator[dict[str, Any]]: + # Classify + (for local) validate WITHOUT cloning. The git clone runs + # on the worker thread below so its progress streams live and a + # mid-clone disconnect cancels it. Failures become error EVENTS, not + # 4xx (EventSource can't read 4xx bodies). + if not src: + yield _sse_error("missing 'src' query param") + return + kind = classify(src) + if kind == "invalid": + yield _sse_error("unrecognized source — pass a local path or a git URL") + return + local_path: Path | None = None + if kind == "git": + display = f"{src}@{branch}" if branch else src + else: + try: + local_path = await asyncio.to_thread(resolve_local, src) + except ResolveError as e: + yield _sse_error(e.message) + return + display = src + + cancel = threading.Event() + loop = asyncio.get_running_loop() + q: asyncio.Queue[dict[str, Any] | None] = asyncio.Queue() + holder: dict[str, Any] = {"manifest": None, "sig": None, "path": None} + + def _put(item: dict[str, Any] | None) -> None: + loop.call_soon_threadsafe(q.put_nowait, item) + + def _on_clone(payload: tuple[str, int]) -> None: + stage, percent = payload + _put( + _sse( + "clone-progress", + { + "display_root": display, + "stage": stage, + "percent": percent, + }, + ) + ) + + def _on_scan(files_scanned: int) -> None: + _put( + _sse( + "scan-progress", + {"display_root": display, "files_scanned": files_scanned}, + ) + ) + + def _run() -> None: + try: + # Clone phase (git only): emit `clone-progress` FIRST, then clone + # with live progress + cancel support. 
+ if kind == "git": + _put(_sse("clone-progress", {"display_root": display})) + try: + with TRUST.clone_lock: + path = ensure_clone( + src, + branch, + on_progress=_on_clone, + cancel_event=cancel, + ) + except ( + BranchNotFoundError, + RepoNotFoundError, + HostUnreachableError, + ) as e: + _put(_sse_error(str(e))) + return + except CloneError as e: + _put(_sse_error(str(e))) + return + else: + assert local_path is not None + path = local_path + + holder["path"] = path + TRUST.register(path) + _put(_sse("scan-progress", {"display_root": display})) + + # Signature (cache key) + warm-cache short-circuit. + sig = signature_tree(str(path), use_cache=use_cache)["signature"] + holder["sig"] = sig + if use_cache: + cached = cache_load_manifest(path.resolve(), sig) + if cached is not None: + if kind == "git": + cached["display_root"] = display + _put(_sse("manifest-complete", {"manifest": cached})) + return + + # Cold scan: partial + complete manifests, with heartbeat progress. + for ev in scan_tree( + str(path), + use_cache=use_cache, + cancel_event=cancel, + on_scan_progress=_on_scan, + ): + phase = ev["phase"] # "manifest-partial" | "manifest-complete" + m = ev["manifest"] + if kind == "git": + m["display_root"] = display + if phase == "manifest-complete": + holder["manifest"] = m + _put(_sse(phase, {"manifest": m})) + except ScanCancelledError: + pass # client disconnected mid-clone/scan; nothing to report + except Exception as e: # noqa: BLE001 + logger.exception("manifest scan failed for src=%s", src) + _put(_sse_error(f"scan failed: {e}")) + finally: + _put(None) # sentinel + + worker = threading.Thread(target=_run, daemon=True) + worker.start() + + disconnected = False + try: + while True: + if await request.is_disconnected(): + disconnected = True + break + try: + item = await asyncio.wait_for(q.get(), timeout=0.5) + except asyncio.TimeoutError: + continue + if item is None: + break + yield item + finally: + cancel.set() + await asyncio.to_thread(worker.join, 
2.0) + + # ALWAYS write cache on a clean final (read gated by no_cache; write is + # not). Skipped on disconnect, and on error (where manifest stays None). + final = holder["manifest"] + sig = holder["sig"] + path = holder["path"] + if ( + final is not None + and not disconnected + and sig is not None + and path is not None + ): + await asyncio.to_thread(cache_save_manifest, path.resolve(), sig, final) + + return EventSourceResponse(gen()) diff --git a/api/routers/meta.py b/api/routers/meta.py new file mode 100644 index 00000000..1105ffcd --- /dev/null +++ b/api/routers/meta.py @@ -0,0 +1,22 @@ +"""Server meta endpoints: GET /api/health (liveness) and GET /api/config +(boot-time feature flags). Grouped here because both are tiny server-info +routes, not domain logic.""" + +from __future__ import annotations + +from fastapi import APIRouter + +from api.config import local_repos_allowed +from api.models.responses import ConfigResponse, HealthResponse + +router = APIRouter(prefix="/api", tags=["meta"]) + + +@router.get("/health", response_model=HealthResponse) +def health() -> HealthResponse: + return HealthResponse(ok=True) + + +@router.get("/config", response_model=ConfigResponse) +def config() -> ConfigResponse: + return ConfigResponse(allowLocalRepos=local_repos_allowed()) diff --git a/api/security.py b/api/security.py new file mode 100644 index 00000000..fbaf2306 --- /dev/null +++ b/api/security.py @@ -0,0 +1,70 @@ +"""The session trust model. + +SINGLE-PROCESS INVARIANT: `allowed_roots` is in-memory module state. +The app MUST run as one process — multi-worker (gunicorn) would split +the trust set across workers and break /api/file and /api/commit. The +Dockerfile and __main__ run a single uvicorn process for this reason. + +Trust rule: every successful manifest scan registers its absolute root. +/api/file and /api/commit then validate that the requested path resolves +under at least one registered root — there is no global filesystem read. 
+""" + +from __future__ import annotations + +import threading +from pathlib import Path + + +class OutsideRootError(Exception): + """Requested path resolves outside every registered scan root.""" + + +class NoRootsRegisteredError(Exception): + """No scan root registered yet — caller must fetch /api/manifest first.""" + + +class TrustStore: + """Thread-safe set of absolute roots that have been scanned this session.""" + + def __init__(self) -> None: + self._roots: set[Path] = set() + self._lock = threading.Lock() + # Serializes clone-or-update so two concurrent manifest requests for + # the same URL don't race the working tree. + self.clone_lock = threading.Lock() + + def reset(self) -> None: + """Fresh trust set (per-process start; tests call between cases).""" + with self._lock: + self._roots = set() + + def register(self, root: Path) -> None: + with self._lock: + self._roots.add(root.resolve()) + + def snapshot(self) -> set[Path]: + with self._lock: + return set(self._roots) + + def assert_inside(self, raw: Path) -> Path: + """Resolve `raw` (strict) and confirm it sits under a registered root. + + Raises NoRootsRegisteredError if none registered, OutsideRootError + if the resolved path escapes every root. Returns the resolved path. + """ + roots = self.snapshot() + if not roots: + raise NoRootsRegisteredError + target = raw.resolve(strict=True) + for root in roots: + try: + target.relative_to(root) + except ValueError: + continue + return target + raise OutsideRootError + + +# Module-level singleton — the one trust set for the process. +TRUST = TrustStore() diff --git a/api/server.py b/api/server.py deleted file mode 100644 index 5c06343d..00000000 --- a/api/server.py +++ /dev/null @@ -1,1076 +0,0 @@ -"""Local HTTP server backing the browser-served frontend. - -Serves the Vite-built frontend from a static dir supplied by the Docker -entrypoint (or `static_dir=` kwarg to `start_server`) and computes a -scan manifest on demand at `/api/manifest?src=…[&branch=…]`. 
`src` is -either a local absolute path or a git URL; for git URLs, the repo is -cloned into `~/.cache/codecity/clones/` and scanned from there. - -Bind address is configurable via `start_server(host=...)`; production -(`python -m api`) binds 0.0.0.0 so Docker port-forwarding works. -Container isolation gates external access — the published `-p 8080:8080` -is what the host actually exposes. - -Threading: ``ThreadingHTTPServer`` so concurrent /api/file fetches and a -manifest scan don't serialize on each other. The server runs on a daemon -thread so the main thread can stay responsive (and let Ctrl-C land). - -Trust model: every successful manifest scan registers its absolute root -in ``_State.allowed_roots``. ``/api/file`` then validates that the -requested file resolves under at least one of those roots. This means a -client can only fetch files from directories it has previously asked the -server to scan — there's no global filesystem read. -""" - -from __future__ import annotations - -import gzip -import json -import mimetypes -import os -import queue -import re -import select -import socket as _socket -import subprocess -import sys -import threading -from http import HTTPStatus -from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer -from io import BufferedIOBase -from pathlib import Path -from typing import Any, Callable, Iterable, Literal -from urllib.parse import parse_qs, urlparse - -from api.env import env_bool -from api.clone import ( - CloneError, - BranchNotFoundError, - RepoNotFoundError, - HostUnreachableError, - ensure_clone, -) -from api.cache import ( - cache_clear_manifests, - cache_load_manifest, - cache_save_manifest, -) -from api.media import is_media -from api.scan import ( - ScanCancelledError, - _build_authors_list, - scan_tree, - signature_tree, -) -from api.types import ( - CacheClearResponse, - CommitDetailResponse, - ConfigResponse, - ErrorResponse, - FileTooLargeResponse, - HealthResponse, - Manifest, - ScanStreamEvent, - 
SignatureResponse, -) - -# Cap individual /api/file responses so a stray symlink to a giant blob -# doesn't try to load 10 GB into the browser. -MAX_FILE_BYTES = 100 * 1024 * 1024 - - -_LOCAL_PATH_PREFIX = re.compile(r"^(/|~|\./|\.\./|[A-Za-z]:[\\/])") -_GIT_SSH_FORM = re.compile(r"^[^@]+@[^:]+:") - - -def _classify_source(raw: str) -> Literal["local", "git", "invalid"]: - """Classify a raw `?src=` value as a local path, a git URL, or invalid. - - Path-like prefixes (absolute, home, relative, Windows drive) → 'local'. - URLs (scheme:// or git@host:path SSH form) → 'git'. - Anything else → 'invalid'. - """ - if not raw: - return "invalid" - if _LOCAL_PATH_PREFIX.match(raw): - return "local" - if "://" in raw or _GIT_SSH_FORM.match(raw): - return "git" - return "invalid" - - -def _is_git_working_tree(path: Path) -> bool: - """Return True if ``path`` is inside a git working tree. - - Runs ``git rev-parse --is-inside-work-tree`` with cwd=path: - - working tree (top-level OR subdir OR linked worktree) → "true" - - bare repo → "false" - - non-git directory → command fails with non-zero exit - - CodeCity is fundamentally git-aware: every local scan needs a real - working tree to walk. Bare repos have no working tree (just the .git - object database) so they're rejected here too. - - Failures (missing git binary, timeout, OS error) all fall through to - False — better to reject with a clear message than to scan a path we - can't verify.""" - try: - result = subprocess.run( - ["git", "rev-parse", "--is-inside-work-tree"], - cwd=str(path), - capture_output=True, - text=True, - check=False, - timeout=5, - # Defensive: a path nested inside a credentialed repo could - # otherwise prompt for a passphrase and hang the subprocess. - # We're only reading metadata, so no auth is ever needed. 
- env={**os.environ, "GIT_TERMINAL_PROMPT": "0"}, - ) - except (OSError, subprocess.TimeoutExpired): - return False - return result.returncode == 0 and result.stdout.strip() == "true" - - -_NOT_GIT_ERROR = ( - "path is not inside a git working tree. CodeCity requires a git " - "project — try `git init` inside the directory, or paste a git " - "URL instead." -) - -_LOCAL_DISABLED_ERROR = ( - "local repositories are disabled — restart codecity with " - "CODECITY_ALLOW_LOCAL_REPOS=1. " - "See https://github.com/thalida/codecity#local-directories" -) - - -# Bodies under this threshold skip compression — gzip's framing -# overhead (~20 bytes header + trailer) exceeds the savings on small -# responses. The typical hits are /api/health and small error JSON. -_GZIP_MIN_BYTES = 256 - - -def _maybe_gzip( - handler: BaseHTTPRequestHandler, body: bytes, -) -> tuple[bytes, str | None]: - """If the client advertised Accept-Encoding: gzip, gzip-encode body. - - Returns ``(encoded, "gzip")`` when compression applies, ``(body, - None)`` otherwise. Caller is responsible for setting the - Content-Encoding header from the second element when non-None. - - The Accept-Encoding parser is intentionally loose: a substring - check for "gzip" matches the typical "gzip, deflate" or - "gzip;q=1.0". It does not parse RFC 7231 q-values; ``q=0`` would - be misinterpreted as accept, but that's a vanishingly rare config - and the worst case is a successfully-decoded gzip response. - """ - accept = handler.headers.get("Accept-Encoding", "") - if "gzip" not in accept.lower() or len(body) < _GZIP_MIN_BYTES: - return body, None - return gzip.compress(body, compresslevel=6), "gzip" - - -# Where the Vite build output lives. Resolved at import time so tests can -# spin up a server without an installed wheel layout. 
-STATIC_DIR = Path(__file__).resolve().parent / "static" - - -class _State: - """Module-level state shared by every handler instance.""" - static_dir: Path = STATIC_DIR - # Every absolute path that has been successfully scanned this session. - # /api/file uses this as its trust set. - allowed_roots: set[Path] = set() - # Guards allowed_roots — ThreadingHTTPServer can run a manifest scan - # (writer) concurrently with a file fetch (reader), and CPython will - # raise RuntimeError: Set changed size during iteration if a write - # lands mid-read. - allowed_roots_lock: threading.Lock = threading.Lock() - # Serializes clone-or-update so two concurrent manifest requests for - # the same URL don't race the working tree. ensure_clone is the cache - # itself (filesystem-backed); the lock just keeps it consistent. - clone_lock: threading.Lock = threading.Lock() - - -JsonBody = ( - Manifest - | SignatureResponse - | ErrorResponse - | FileTooLargeResponse - | HealthResponse - | CacheClearResponse - | CommitDetailResponse - | ConfigResponse -) - - -def _send_json(handler: BaseHTTPRequestHandler, status: int, body: JsonBody) -> None: - payload = json.dumps(body).encode("utf-8") - payload, encoding = _maybe_gzip(handler, payload) - handler.send_response(status) - handler.send_header("Content-Type", "application/json; charset=utf-8") - if encoding: - handler.send_header("Content-Encoding", encoding) - handler.send_header("Content-Length", str(len(payload))) - handler.end_headers() - handler.wfile.write(payload) - - -def _stream_events( - handler: BaseHTTPRequestHandler, - events: Iterable[ScanStreamEvent | dict[str, Any]], - cancel_event: threading.Event, -) -> None: - """Stream NDJSON events over a chunked HTTP response. - - Each event becomes one line: `\\n`. Encoded via iterencode - so peak memory is bounded by one ~64 KB chunk, not the serialized - size of the manifest. Wraps wfile in gzip when the client - advertises it. 
- - Flushes after every event boundary: ``gz.flush()`` emits a - ``Z_SYNC_FLUSH`` DEFLATE block (so decompressors actually see the - bytes — ``GzipFile.write()`` buffers internally and would - otherwise emit nothing until close), then ``handler.wfile.flush()`` - pushes the BufferedWriter into the socket. Without this the - skeleton event would be stuck behind the final event in - production. - - Sets cancel_event on BrokenPipe/ConnectionReset (write-time AND - close-time) so a concurrently-running scan thread can stop ASAP. - Also checks cancel_event between events so a watchdog can - interrupt iteration without waiting for a write to fail.""" - accept = handler.headers.get("Accept-Encoding", "") - use_gzip = "gzip" in accept.lower() - - handler.send_response(HTTPStatus.OK) - handler.send_header("Content-Type", "application/x-ndjson") - if use_gzip: - handler.send_header("Content-Encoding", "gzip") - # No Content-Length → chunked transfer. - handler.end_headers() - - # Both BufferedWriter (handler.wfile) and GzipFile inherit from - # io.BufferedIOBase, so we can reassign without a `# type: ignore`. - sink: BufferedIOBase = handler.wfile - gz: gzip.GzipFile | None = None - if use_gzip: - # mtime=0 → deterministic bytes (helps tests). compresslevel=6 - # matches the existing _maybe_gzip path's choice. - gz = gzip.GzipFile(fileobj=sink, mode="wb", compresslevel=6, mtime=0) - sink = gz - - try: - encoder = json.JSONEncoder() - for event in events: - for chunk in encoder.iterencode(event): - sink.write(chunk.encode("utf-8")) - sink.write(b"\n") - # Boundary flush: emit a Z_SYNC_FLUSH DEFLATE block so the - # decompressor sees this event's bytes, then push from the - # BufferedWriter into the socket. - if gz is not None: - gz.flush() - handler.wfile.flush() - # Let a watchdog interrupt between events without needing a - # write to fail first. 
- if cancel_event.is_set(): - break - except (BrokenPipeError, ConnectionResetError): - cancel_event.set() - raise - finally: - if gz is not None: - try: - gz.close() - except (BrokenPipeError, ConnectionResetError): - # Gzip buffers most output, so a peer that already - # disconnected often only surfaces at close time. - # Mirror the write-path behavior: surface cancel to the - # surrounding scan, but don't re-raise (we're in - # finally; any real exception already propagated). - cancel_event.set() - - -_WATCHDOG_POLL_SEC = 0.5 - - -def _start_disconnect_watchdog( - handler: BaseHTTPRequestHandler, - cancel_event: threading.Event, -) -> threading.Thread: - """Spawn a daemon thread that watches `handler.connection` for - client-side EOF and sets `cancel_event` when seen. - - Polls every ~500ms via select(); when the socket becomes - readable, peeks one byte — an empty peek means the peer closed. - Loop also exits if cancel_event is set by anyone else (normal - scan completion, or the writer noticing a broken pipe first). - """ - sock = handler.connection - - def _loop() -> None: - while not cancel_event.is_set(): - try: - readable, _, _ = select.select( - [sock], [], [], _WATCHDOG_POLL_SEC, - ) - except (OSError, ValueError): - cancel_event.set() - return - if not readable: - continue - try: - peek = sock.recv(1, _socket.MSG_PEEK) - except OSError: - cancel_event.set() - return - if not peek: - # EOF — client closed its end. - cancel_event.set() - return - # Unexpected data from the client mid-scan (the browser - # isn't supposed to send anything until it reads the - # response). MSG_PEEK didn't consume the byte, so select() - # will keep waking us up on it forever — sleep one poll - # cycle to avoid spinning at 100% CPU. cancel_event.wait() - # both serves as the sleep AND lets the loop exit promptly - # if anyone sets the event during the wait. 
- cancel_event.wait(_WATCHDOG_POLL_SEC) - - t = threading.Thread(target=_loop, daemon=True, name="cc-disconnect-watchdog") - t.start() - return t - - -def _parse_no_cache(query: str) -> bool: - """Parse ?no_cache=… as a boolean. Strict: only 'true' (any case) - and '1' count as on; absent or anything else is off. Maps to - scan_tree(use_cache=not ).""" - raw = parse_qs(query).get("no_cache", [""])[0].strip().lower() - return raw in ("true", "1") - - -def _resolve_scan_target( - handler: BaseHTTPRequestHandler, query: str -) -> tuple[Path, str, str | None, Literal["local", "git"]] | None: - """Parse ?src=… [&branch=…] and resolve to a scan root. - - Returns (resolved_path, original_src, branch_or_None, kind) on success, or - None after sending the appropriate 4xx/5xx error response. - - Branch semantics: - - Local src: branch is silently ignored. Scan the live working tree. - - Git URL src: branch is passed through to ensure_clone. - """ - params = parse_qs(query) - raw_src = params.get("src", [""])[0] - raw_branch = params.get("branch", [""])[0] or None - - if not raw_src: - _send_json(handler, HTTPStatus.BAD_REQUEST, {"error": "missing 'src' query param"}) - return None - - kind = _classify_source(raw_src) - if kind == "invalid": - _send_json( - handler, - HTTPStatus.BAD_REQUEST, - {"error": "unrecognized source — pass a local path or a git URL"}, - ) - return None - - if kind == "git": - try: - with _State.clone_lock: - local = ensure_clone(raw_src, raw_branch) - return local, raw_src, raw_branch, "git" - except (BranchNotFoundError, RepoNotFoundError, HostUnreachableError) as e: - _send_json(handler, HTTPStatus.BAD_REQUEST, {"error": str(e)}) - return None - except CloneError as e: - _send_json(handler, HTTPStatus.BAD_GATEWAY, {"error": str(e)}) - return None - - # kind == "local" — ignore any &branch=, scan the working tree in place - if not _local_repos_allowed(): - _send_json( - handler, HTTPStatus.FORBIDDEN, {"error": _LOCAL_DISABLED_ERROR} - ) - return None 
- try: - scan_target = Path(raw_src).resolve(strict=True) - except (OSError, RuntimeError): - _send_json(handler, HTTPStatus.NOT_FOUND, {"error": "path not found"}) - return None - if not scan_target.is_dir(): - _send_json( - handler, HTTPStatus.BAD_REQUEST, {"error": "path is not a directory"} - ) - return None - if not _is_git_working_tree(scan_target): - _send_json( - handler, HTTPStatus.BAD_REQUEST, {"error": _NOT_GIT_ERROR} - ) - return None - return scan_target, raw_src, None, "local" - - -def _local_repos_allowed() -> bool: - """Return True if CODECITY_ALLOW_LOCAL_REPOS is set to a truthy - value. Read fresh on each call so tests can monkeypatch the env - var without restarting the server.""" - return env_bool("CODECITY_ALLOW_LOCAL_REPOS") - - -def _serve_config(handler: BaseHTTPRequestHandler) -> None: - """GET /api/config — server-side feature flags for the frontend.""" - body: ConfigResponse = {"allowLocalRepos": _local_repos_allowed()} - _send_json(handler, HTTPStatus.OK, body) - - -_COMMIT_SHA_RE = re.compile(r"^[0-9a-fA-F]{7,40}$") - - -def _serve_commit_detail(handler: BaseHTTPRequestHandler, query: str) -> None: - """GET /api/commit?sha=. Returns {sha, authors, date, subject, body} - for a commit inside any registered scan root. Validates the sha shape - locally before shelling out to ``git show``. - - Multi-root resolution: tries each allowed scan root in turn and - returns the first hit. Most deployments have one root; for the rare - multi-root case this picks the first repo that contains the sha. - - No email in the response. ``git show`` is the only git call here — no - diff is fetched. 
- """ - params = parse_qs(query) - sha = (params.get("sha") or [""])[0].strip() - if not _COMMIT_SHA_RE.match(sha): - _send_json(handler, HTTPStatus.BAD_REQUEST, - {"error": "invalid or missing sha"}) - return - - with _State.allowed_roots_lock: - roots_snapshot = set(_State.allowed_roots) - - if not roots_snapshot: - _send_json(handler, HTTPStatus.NOT_FOUND, - {"error": "no scan root registered yet — fetch /api/manifest first"}) - return - - fmt = ("%H%x00%an%x00%aI%x00%s%x00" - "%(trailers:key=Co-authored-by,valueonly,separator=%x1f)%x00%b") - for root in roots_snapshot: - try: - out = subprocess.check_output( - ["git", "-c", "safe.directory=*", "-C", str(root), - "show", "-s", f"--format={fmt}", sha], - stderr=subprocess.DEVNULL, - text=True, - ) - except subprocess.CalledProcessError: - continue - parts = out.rstrip("\n").split("\x00", 5) - if len(parts) < 6: - continue - full_sha, author, iso_date, subject, trailers_raw, body = parts - response: CommitDetailResponse = { - "sha": full_sha, - "authors": _build_authors_list(author, trailers_raw), - "date": iso_date[:10], - "subject": subject, - "body": body, - } - _send_json(handler, HTTPStatus.OK, response) - return - - _send_json(handler, HTTPStatus.NOT_FOUND, - {"error": "sha not found in any registered scan root"}) - - -def _serve_manifest(handler: BaseHTTPRequestHandler, query: str) -> None: - """Stream the scan manifest for the requested source as NDJSON. - - Event sequence for git sources: - cloning → scanning → skeleton → final (cold cache, cold clone) - cloning → scanning → final (warm manifest cache) - Local sources: - scanning → skeleton → final (cold cache) - scanning → final (warm manifest cache) - - The ``cloning`` / ``scanning`` events are lightweight phase markers - (no manifest payload) so the loading-overlay can advance its step - indicator from real server state instead of a wall-clock timer. 
- - On client disconnect, the watchdog sets the cancel event within - ~500ms; the scan exits via ScanCancelledError and no cache write - happens. - - Pre-stream validation (missing/invalid src, missing local path) - still returns 4xx because no response has started yet. Errors that - arise after the first event is emitted (clone failure, scan failure) - are surfaced as ``{phase: "error"}`` NDJSON events — the HTTP status - is already 200 by then.""" - # Pre-stream validation: param parsing + classify + local-path stat. - # Anything caught here still gets a clean 4xx response. - # - # NOTE: this re-validates the same things as _resolve_scan_target (used - # by /api/manifest/signature). Duplicated deliberately — _serve_manifest - # must emit a chunked NDJSON stream with phase events (`cloning`, - # `scanning`), but _resolve_scan_target calls ensure_clone synchronously - # which would block before any stream byte reaches the client. If you - # change the validation rules, update BOTH places. 
- params = parse_qs(query) - raw_src = params.get("src", [""])[0] - raw_branch = params.get("branch", [""])[0] or None - - if not raw_src: - _send_json(handler, HTTPStatus.BAD_REQUEST, {"error": "missing 'src' query param"}) - return - - kind = _classify_source(raw_src) - if kind == "invalid": - _send_json( - handler, - HTTPStatus.BAD_REQUEST, - {"error": "unrecognized source — pass a local path or a git URL"}, - ) - return - - local_target: Path | None = None - if kind == "local": - if not _local_repos_allowed(): - _send_json( - handler, - HTTPStatus.FORBIDDEN, - {"error": _LOCAL_DISABLED_ERROR}, - ) - return - try: - local_target = Path(raw_src).resolve(strict=True) - except (OSError, RuntimeError): - _send_json(handler, HTTPStatus.NOT_FOUND, {"error": "path not found"}) - return - if not local_target.is_dir(): - _send_json( - handler, HTTPStatus.BAD_REQUEST, {"error": "path is not a directory"} - ) - return - if not _is_git_working_tree(local_target): - _send_json( - handler, HTTPStatus.BAD_REQUEST, {"error": _NOT_GIT_ERROR} - ) - return - - use_cache = not _parse_no_cache(query) - - cancel_event = threading.Event() - watchdog = _start_disconnect_watchdog(handler, cancel_event) - - def _stamp_display_root(m: "Manifest") -> "Manifest": - if kind == "git": - m["display_root"] = display_root - return m - - # Captured by the closure below so we can decide whether to write - # the manifest cache after _stream_events returns. - state: dict[str, Any] = {"final_manifest": None, "scan_target": None, "sig": None} - - # Display label for the in-flight scan. Hoisted above the first - # yield so the cloning/scanning event can carry it — the client - # uses this to set "{label} (pending)" before any manifest exists. - if kind == "git": - display_root = f"{raw_src}@{raw_branch}" if raw_branch else raw_src - else: - display_root = raw_src - - def _events() -> Iterable[ScanStreamEvent | dict[str, Any]]: - # Git sources: emit cloning, run ensure_clone, then continue. 
- # Errors during the clone become NDJSON error events because the - # response has already begun streaming by the time this runs. - if kind == "git": - yield {"phase": "cloning", "display_root": display_root} - # ensure_clone is synchronous, but we want its progress - # callbacks to stream out as additional `cloning` events. - # Run it on a worker thread that pushes events into a queue; - # the generator drains the queue until a sentinel arrives, - # then collects the result (or re-raises any exception). - clone_q: queue.Queue[dict[str, Any] | None] = queue.Queue() - clone_result: dict[str, Any] = {"target": None, "error": None} - - def _on_clone_progress(payload: tuple[str, int]) -> None: - stage, percent = payload - clone_q.put({ - "phase": "cloning", - "display_root": display_root, - "stage": stage, - "percent": percent, - }) - - def _run_clone() -> None: - try: - with _State.clone_lock: - clone_result["target"] = ensure_clone( - raw_src, raw_branch, on_progress=_on_clone_progress - ) - except Exception as e: # pylint: disable=broad-except - clone_result["error"] = e - finally: - clone_q.put(None) # sentinel - - clone_thread = threading.Thread(target=_run_clone, daemon=True) - clone_thread.start() - try: - while True: - ev = clone_q.get() - if ev is None: - break - yield ev - finally: - clone_thread.join() - - err = clone_result["error"] - if isinstance(err, (BranchNotFoundError, RepoNotFoundError, HostUnreachableError)): - yield {"phase": "error", "error": str(err)} - return - if isinstance(err, CloneError): - yield {"phase": "error", "error": str(err)} - return - if err is not None: - # Unexpected exception type — surface as an error event so - # the client sees a clean message, then re-raise so the - # outer handler logs it server-side. 
- yield {"phase": "error", "error": str(err)} - raise err - scan_target = clone_result["target"] - else: - assert local_target is not None - scan_target = local_target - - state["scan_target"] = scan_target - # Register trust root before any file-bearing event reaches the - # client. From this point on /api/file can serve content under - # scan_target. - with _State.allowed_roots_lock: - _State.allowed_roots.add(scan_target.resolve()) - - # For git sources this is the second event (after `cloning`); - # for local sources it's the first. Either way, display_root - # rides along so the client can show the pending label - # immediately for local sources too. - yield {"phase": "scanning", "display_root": display_root} - - # Cheap signature probe — same call the live-poll endpoint uses. - try: - sig_response = signature_tree( - str(scan_target), - use_cache=use_cache, - ) - except Exception as e: # pylint: disable=broad-except - yield {"phase": "error", "error": f"scan failed: {e}"} - return - sig = sig_response["signature"] - state["sig"] = sig - - # Cache lookup. - cached: Manifest | None = None - if use_cache: - cached = cache_load_manifest(scan_target.resolve(), sig) - if cached is not None: - cached = _stamp_display_root(cached) - - if cached is not None: - yield {"phase": "final", "manifest": cached} - return - - # Cache miss — stream live scan. scan_tree is synchronous and - # also yields its own events (skeleton + final); we want the - # heartbeat-driven scanning progress events to interleave with - # those. Same queue/thread pattern as the cloning phase: the - # worker pushes both kinds of events into one queue, the - # generator drains and yields in arrival order. 
- scan_q: queue.Queue[ScanStreamEvent | dict[str, Any] | None] = queue.Queue() - scan_error: list[BaseException] = [] - - def _on_scan_progress(files_scanned: int) -> None: - scan_q.put({ - "phase": "scanning", - "display_root": display_root, - "files_scanned": files_scanned, - }) - - def _run_scan() -> None: - try: - for ev in scan_tree( - str(scan_target), - use_cache=use_cache, - cancel_event=cancel_event, - on_scan_progress=_on_scan_progress, - ): - scan_q.put(ev) # skeleton + final flow through the same queue - except Exception as e: # pylint: disable=broad-except - scan_error.append(e) - finally: - scan_q.put(None) # sentinel - - scan_thread = threading.Thread(target=_run_scan, daemon=True) - scan_thread.start() - try: - while True: - ev = scan_q.get() - if ev is None: - break - if ev.get("phase") in ("skeleton", "final"): - m = _stamp_display_root(ev["manifest"]) - if ev["phase"] == "final": - state["final_manifest"] = m - yield ev - finally: - scan_thread.join() - - if scan_error: - err = scan_error[0] - if isinstance(err, ScanCancelledError): - # Cancellation isn't an error to surface to the client — - # they disconnected, so there's nobody to read a message. - # Re-raise so the outer try skips the cache write and logs - # the disconnect. - raise err - # Unexpected mid-stream failure (e.g., disk read error - # during _populate_file_metadata). Emit one final error - # event so the client sees a clear message instead of a - # truncated stream / parse error. - yield {"phase": "error", "error": f"scan failed: {err}"} - - try: - _stream_events(handler, _events(), cancel_event) - - # Always write the cache on a successful scan — `use_cache` only - # controls whether we READ from it. A skip-cache (no_cache) scan still - # persists its fresh result, so the next normal load is served the - # up-to-date manifest instead of a stale one. 
- final_manifest = state["final_manifest"] - scan_target = state["scan_target"] - sig = state["sig"] - if ( - final_manifest is not None - and scan_target is not None - and sig is not None - ): - cache_save_manifest(scan_target.resolve(), sig, final_manifest) - - except ScanCancelledError: - _log_quiet("[scan] cancelled (client disconnected)") - except (BrokenPipeError, ConnectionResetError): - # Writer noticed the disconnect first. handle_error from - # fix #1 swallows the propagated exception at the socketserver - # layer; we just need to skip the cache write. - _log_quiet("[scan] client disconnected mid-stream") - raise - finally: - cancel_event.set() - watchdog.join(timeout=1.0) - - -def _delete_manifest_cache(handler: BaseHTTPRequestHandler, query: str) -> None: - """Clear every cached manifest for the given source. - - Used by the frontend when the user removes an entry from the recents - list — they're done with this source, so its disk cache should go - too. Resolves git URLs to their clone-dir without actually cloning; - resolves local paths non-strictly so cleanup still works for paths - that no longer exist on disk. - - Note: this route is intentionally NOT gated by - `CODECITY_ALLOW_LOCAL_REPOS`. The gate exists to prevent fresh - scans of arbitrary host paths; cache cleanup only manipulates - files under ``CODECITY_CACHE_ROOT`` (a path derived from the - source, not the source itself) and is safe to leave open.""" - params = parse_qs(query) - raw_src = params.get("src", [""])[0] - raw_branch = params.get("branch", [""])[0] or None - - if not raw_src: - _send_json(handler, HTTPStatus.BAD_REQUEST, {"error": "missing 'src' query param"}) - return - - kind = _classify_source(raw_src) - if kind == "invalid": - _send_json( - handler, - HTTPStatus.BAD_REQUEST, - {"error": "unrecognized source — pass a local path or a git URL"}, - ) - return - - if kind == "git": - # Pure path derivation — no clone, no network. 
- from api.clone import clone_dir_for - abs_root = clone_dir_for(raw_src, raw_branch) - else: - # Local source: non-strict resolve so a recents entry for a - # since-deleted path still drops its cache. - abs_root = Path(raw_src).resolve(strict=False) - - deleted = cache_clear_manifests(abs_root) - _send_json(handler, HTTPStatus.OK, {"deleted": deleted}) - - -def _log_quiet(msg: str) -> None: - """Same env-gated logger as scan._log, duplicated here so server - doesn't import a private from scan. CODECITY_QUIET=1 silences.""" - if not env_bool("CODECITY_QUIET"): - print(msg, file=sys.stderr, flush=True) - - -def _serve_manifest_signature(handler: BaseHTTPRequestHandler, query: str) -> None: - """Cheap variant of /api/manifest — returns just {root, scanned_at, signature}. - - Used by the frontend's live-update poll: hitting this every few - seconds avoids paying for per-file content reads and per-file git - history walks on every tick. The client only fetches the full - manifest when the signature changes. - """ - resolved = _resolve_scan_target(handler, query) - if resolved is None: - return - scan_target, _raw_src, _raw_branch, _kind = resolved - use_cache = not _parse_no_cache(query) - - try: - sig = signature_tree( - str(scan_target), - use_cache=use_cache, - ) - except Exception as e: # pylint: disable=broad-except - _send_json( - handler, - HTTPStatus.INTERNAL_SERVER_ERROR, - {"error": f"signature failed: {e}"}, - ) - return - - _send_json(handler, HTTPStatus.OK, sig) - - -def _serve_file_api(handler: BaseHTTPRequestHandler, query: str) -> None: - """Serve a file from the user's filesystem, restricted to paths inside - any directory that has been successfully scanned this session. 
- Path-traversal and symlink-escape attempts are caught by - ``Path.resolve()`` + ``relative_to()``.""" - params = parse_qs(query) - raw = params.get("path", [""])[0] - if not raw: - _send_json(handler, HTTPStatus.BAD_REQUEST, {"error": "missing 'path' param"}) - return - - # Snapshot under the lock so a concurrent scan can't mutate the set - # mid-iteration. Set copy is O(roots) — typically a handful. - with _State.allowed_roots_lock: - roots_snapshot = set(_State.allowed_roots) - - if not roots_snapshot: - _send_json( - handler, - HTTPStatus.FORBIDDEN, - {"error": "no scan root registered yet — fetch /api/manifest first"}, - ) - return - - try: - target = Path(raw).resolve(strict=True) - except (OSError, RuntimeError): - _send_json(handler, HTTPStatus.NOT_FOUND, {"error": "not found"}) - return - - # Allow if the target is under ANY registered root. - inside = False - for root in roots_snapshot: - try: - target.relative_to(root) - except ValueError: - continue - inside = True - break - if not inside: - _send_json(handler, HTTPStatus.FORBIDDEN, {"error": "outside scan root"}) - return - - if not target.is_file(): - _send_json(handler, HTTPStatus.NOT_FOUND, {"error": "not a file"}) - return - - size = target.stat().st_size - if size > MAX_FILE_BYTES: - _send_json( - handler, - HTTPStatus.REQUEST_ENTITY_TOO_LARGE, - {"error": "file too large", "size": size, "limit": MAX_FILE_BYTES}, - ) - return - - guessed, _ = mimetypes.guess_type(str(target)) - # Media types (image/video/audio/pdf) keep their guessed MIME so the - # browser can hand them to /