From fe7a7de5874ffbc4ba290e97aae6a863205dd4f0 Mon Sep 17 00:00:00 2001 From: cc Date: Thu, 4 Jun 2026 13:41:32 -0700 Subject: [PATCH 01/14] spec: openspec init --- .gitignore | 2 + docs/coding-agents/index.md | 1 + docs/development/openspec.md | 102 ++++++++++++++ docs/docs.json | 1 + openspec/config.yaml | 45 ++++++ openspec/schemas/dimos-capability/schema.yaml | 128 ++++++++++++++++++ .../dimos-capability/templates/design.md | 35 +++++ .../dimos-capability/templates/docs.md | 19 +++ .../dimos-capability/templates/proposal.md | 32 +++++ .../dimos-capability/templates/spec.md | 16 +++ .../dimos-capability/templates/tasks.md | 15 ++ 11 files changed, 396 insertions(+) create mode 100644 docs/development/openspec.md create mode 100644 openspec/config.yaml create mode 100644 openspec/schemas/dimos-capability/schema.yaml create mode 100644 openspec/schemas/dimos-capability/templates/design.md create mode 100644 openspec/schemas/dimos-capability/templates/docs.md create mode 100644 openspec/schemas/dimos-capability/templates/proposal.md create mode 100644 openspec/schemas/dimos-capability/templates/spec.md create mode 100644 openspec/schemas/dimos-capability/templates/tasks.md diff --git a/.gitignore b/.gitignore index 42bdddfa45..787163e787 100644 --- a/.gitignore +++ b/.gitignore @@ -63,8 +63,10 @@ yolo11n.pt # symlink one of .envrc.* if you'd like to use .envrc .claude +.opencode/ **/CLAUDE.md .direnv/ +.omo/ /logs diff --git a/docs/coding-agents/index.md b/docs/coding-agents/index.md index ff778ac5cf..5ac7c854a7 100644 --- a/docs/coding-agents/index.md +++ b/docs/coding-agents/index.md @@ -3,6 +3,7 @@ ├── worktrees.md (creating provisioned worktrees with `bin/worktree`) ├── style.md (code style guidelines for dimos) ├── testing.md (docs about writing tests) +├── ../development/openspec.md (OpenSpec behavior-spec workflow) ├── docs (these are docs about writing docs) │   ├── codeblocks.md │   ├── doclinks.md diff --git a/docs/development/openspec.md 
b/docs/development/openspec.md new file mode 100644 index 0000000000..280eb0f57e --- /dev/null +++ b/docs/development/openspec.md @@ -0,0 +1,102 @@ +# OpenSpec Workflow + +DimOS uses OpenSpec as the checked-in planning layer for behavior changes. OpenSpec artifacts live under `openspec/` and should describe what the system is supposed to do, why it is changing, and how contributors or agents should validate the work. + +## Terminology + +Keep these two meanings separate: + +- **OpenSpec capability spec**: Markdown requirements under `openspec/specs/<capability>/spec.md`. These describe observable behavior and acceptance scenarios. +- **DimOS Spec**: Python Protocol/RPC contracts in files like `dimos/navigation/navigation_spec.py` or `dimos/manipulation/control/arm_driver_spec.py`. These describe module interfaces for code wiring. + +Use "OpenSpec capability spec" in prose when there is any chance of confusion. + +## Schema + +The project uses the `dimos-capability` schema configured in `openspec/config.yaml`. + +The artifact flow is: + +```text +proposal + ├── specs + ├── design + └── docs + └── tasks +``` + +| Artifact | Purpose | +|---|---| +| `proposal.md` | Intent, scope, affected DimOS surfaces, and capability impact. | +| `specs/<capability>/spec.md` | Behavior-first requirements and scenarios. | +| `design.md` | Module, stream, blueprint, skill/MCP, safety, and rollout decisions. | +| `docs.md` | Documentation impact and doc validation plan. | +| `tasks.md` | Implementation, docs, verification, and manual QA checklist. | + +## When to create a change + +Create an OpenSpec change when work changes observable behavior, public CLI/API/MCP behavior, robot behavior, hardware/simulation/replay workflows, docs that users rely on, or cross-module architecture. + +Do not create a change for a purely mechanical refactor, typo fix, or internal cleanup unless it changes behavior or needs cross-session planning context. 
+ +## Writing specs + +OpenSpec capability specs are behavior contracts, not implementation plans. + +Good spec content: + +- User- or developer-visible behavior. +- Public CLI/API/MCP tool behavior. +- Stream or message behavior that downstream modules rely on. +- Robot safety constraints and hardware/simulation/replay expectations. +- Scenarios that can be tested or manually verified. + +Avoid in specs: + +- Private class/function names. +- Generated-file mechanics. +- Library choices and wiring details. +- Step-by-step implementation tasks. + +Put those details in `design.md` or `tasks.md`. + +## Capability names + +Prefer behavior-domain names over code names. Useful starting points: + +- `module-system` +- `blueprint-composition` +- `cli-lifecycle` +- `agent-skills-mcp` +- `configuration` +- `navigation-stack` +- `manipulation-stack` +- `hardware-adapters` +- `simulation-replay` +- `documentation-system` + +Add specs progressively as changes need them. Do not try to backfill the whole project at once. + +## Validation + +Use OpenSpec validation before implementation and before archiving: + +```bash skip +openspec schema validate dimos-capability +openspec validate +openspec templates --json +``` + +For documentation changes, also run the relevant doc checks from [Writing Docs](/docs/development/writing_docs.md): + +```bash skip +md-babel-py run +``` + +When a change touches blueprint names, module-level blueprint variables, or module registry inputs, run: + +```bash skip +pytest dimos/robot/test_all_blueprints_generation.py +``` + +Then run focused tests for the changed code and manually QA through the actual surface: CLI command, MCP tool, HTTP API, simulation/replay blueprint, hardware procedure, or library driver. 
diff --git a/docs/docs.json b/docs/docs.json index 58da2ff6a1..f0064c9ab9 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -144,6 +144,7 @@ "group": "Development", "pages": [ "development/conventions", + "development/openspec", "development/testing", "development/docker", "development/grid_testing", diff --git a/openspec/config.yaml b/openspec/config.yaml new file mode 100644 index 0000000000..62a72bba63 --- /dev/null +++ b/openspec/config.yaml @@ -0,0 +1,45 @@ +schema: dimos-capability + +context: | + DimOS is a robotics operating system for generalist robots. Modules communicate + through typed streams (`In[T]`, `Out[T]`) over LCM, SHM, ROS, DDS, or other + transports. Blueprints compose modules into runnable robot stacks. Skills are + `@skill`-annotated RPC methods exposed to agents and MCP clients. + + Terminology boundary: + - "OpenSpec spec" means a behavior specification under `openspec/specs/`. + - "DimOS Spec" means a Python Protocol/RPC contract in `*_spec.py` files, + usually inheriting `dimos.spec.utils.Spec` and `typing.Protocol`. + Keep these separate. OpenSpec specs describe observable behavior; DimOS Specs + describe code-level module interfaces. + + OpenSpec specs should capture current behavior, user/developer-visible + outcomes, public CLI/API/tool surfaces, robot safety constraints, and testable + scenarios. Put implementation choices, class names, module wiring, generated + registry updates, and rollout details in `design.md` or `tasks.md`. + + Documentation lives in: + - `docs/usage/` for user-facing concepts and APIs. + - `docs/capabilities/` for capability and platform guides. + - `docs/development/` for contributor process. + - `docs/coding-agents/` and `AGENTS.md` for coding-agent guidance. + +rules: + proposal: + - "Identify affected DimOS surfaces: modules, streams, blueprints, CLI, skills/MCP, docs, hardware, simulation, replay, or generated registries." + - Use capability names that match behavior domains, not Python class names. 
+ - Mark hardware safety or public API/CLI changes explicitly. + specs: + - Write behavior-first requirements; avoid implementation detail unless it is externally observable. + - Every requirement must include at least one `#### Scenario:` block with concrete observable outcomes. + - Use "OpenSpec capability spec" when prose might otherwise be confused with DimOS Python `Spec` Protocols. + design: + - Call out DimOS `Spec` Protocols, adapter Protocols, blueprint composition, stream names/types, and skill/MCP exposure when relevant. + - Mention generated files and required regeneration commands, especially `pytest dimos/robot/test_all_blueprints_generation.py` for blueprint registry changes. + - Include hardware/simulation/replay assumptions and safety constraints for robot-facing work. + docs: + - List user-facing docs, contributor docs, coding-agent docs, and AGENTS.md updates required by the change. + - Include documentation validation commands for changed docs, such as `doclinks` and `md-babel-py run ` where applicable. + tasks: + - Include verification tasks for OpenSpec validation, relevant pytest targets, type checks when needed, and manual QA through the user-facing surface. + - Add registry generation tasks when blueprint names, module classes, or generated registry inputs change. diff --git a/openspec/schemas/dimos-capability/schema.yaml b/openspec/schemas/dimos-capability/schema.yaml new file mode 100644 index 0000000000..fedb7964ee --- /dev/null +++ b/openspec/schemas/dimos-capability/schema.yaml @@ -0,0 +1,128 @@ +name: dimos-capability +version: 1 +description: DimOS capability workflow - proposal → specs/design/docs → tasks +artifacts: + - id: proposal + generates: proposal.md + description: DimOS change proposal covering intent, scope, capability impact, and affected robot/software surfaces + template: proposal.md + instruction: | + Create the proposal document that establishes WHY this change is needed and what DimOS behavior it affects. 
+ + Sections: + - **Why**: 1-2 concise paragraphs on the problem or opportunity. Explain why the change matters now. + - **What Changes**: Bullet list of added, modified, or removed behavior. Mark public API/CLI or hardware-safety breaking changes with **BREAKING**. + - **Affected DimOS Surfaces**: Identify modules, streams, blueprints, CLI commands, skills/MCP tools, docs, hardware, simulation, replay, generated registries, or external protocols touched by the change. + - **Capabilities**: Identify which OpenSpec capability specs will be created or modified: + - **New Capabilities**: List behavior domains introduced by the change. Each becomes `specs/<capability>/spec.md`. Use kebab-case names (for example, `agent-skills-mcp`, `blueprint-composition`, `manipulation-stack`). + - **Modified Capabilities**: List existing `openspec/specs/<capability>/` entries whose requirements change. Only include spec-level behavior changes, not implementation-only refactors. + - **Impact**: Summarize user/developer impact, compatibility risks, dependency changes, documentation updates, and test/QA scope. + + Keep proposals concise. Do not include line-by-line implementation details; put architecture and rollout decisions in `design.md`. + requires: [] + - id: specs + generates: specs/**/*.md + description: Behavior-first OpenSpec capability delta specifications + template: spec.md + instruction: | + Create OpenSpec capability specs that define WHAT DimOS should do, not how it is implemented. + + Create one delta spec file per capability listed in proposal.md: + - New capabilities: use `specs/<capability>/spec.md` with the exact kebab-case name from the proposal. + - Modified capabilities: use the existing folder from `openspec/specs/<capability>/`. + + Use these delta sections as `##` headers: + - **ADDED Requirements**: New externally observable behavior. + - **MODIFIED Requirements**: Changed behavior. Include the full updated requirement block, not a partial patch. + - **REMOVED Requirements**: Deprecated behavior. 
Include **Reason** and **Migration**. + - **RENAMED Requirements**: Name-only changes. Use FROM:/TO: format. + + Requirement format: + - Use `### Requirement: `. + - Use SHALL/MUST for normative requirements. + - Include at least one `#### Scenario: ` per requirement. Scenario headings MUST use exactly four `#` characters. + - Prefer `- **GIVEN**`, `- **WHEN**`, `- **THEN**`, and `- **AND**` bullets. + - Cover happy path plus meaningful edge/error/safety cases. + + DimOS-specific guidance: + - Specify user/developer-visible behavior, robot outcomes, CLI behavior, skill/MCP tool behavior, stream contracts, safety constraints, and compatibility expectations. + - Avoid Python class names, private module internals, transport implementation choices, and generated-file details unless those details are observable API contracts. + - Use "OpenSpec capability spec" in prose when needed to avoid confusion with DimOS Python `Spec` Protocols. + - If the behavior only changes implementation and not observable requirements, do not create a spec delta. + requires: + - proposal + - id: design + generates: design.md + description: DimOS technical design and architecture decisions + template: design.md + instruction: | + Create the design document that explains HOW the change should be implemented in DimOS. + + Include design.md for cross-module changes, new robot/hardware integration, new public interfaces, new dependencies, safety-sensitive behavior, generated registry changes, or unclear architecture. + + Sections: + - **Context**: Current state, relevant modules/blueprints/docs, and constraints. + - **Goals / Non-Goals**: What the design achieves and explicitly excludes. + - **DimOS Architecture**: Modules, streams, transports, blueprints, RPC/module refs, DimOS `Spec` Protocols, adapter Protocols, skills/MCP exposure, CLI entry points, and generated registries involved. + - **Decisions**: Key choices with rationale and alternatives considered. 
+ - **Safety / Simulation / Replay**: Hardware assumptions, sim/replay behavior, safety constraints, and manual QA surface. + - **Risks / Trade-offs**: Known risks and mitigations. + - **Migration / Rollout**: Compatibility, generated files, docs, and deployment steps. + - **Open Questions**: Outstanding decisions or unknowns. + + Reference proposal.md for intent and specs for behavior. Keep line-by-line work in tasks.md. + requires: + - proposal + - id: docs + generates: docs.md + description: Documentation impact plan for user, contributor, and coding-agent docs + template: docs.md + instruction: | + Create the documentation impact plan for the change. + + Sections: + - **User-Facing Docs**: Updates under `docs/usage/`, `docs/capabilities/`, `docs/platforms/`, or README files. + - **Contributor Docs**: Updates under `docs/development/`. + - **Coding-Agent Docs**: Updates under `docs/coding-agents/` or `AGENTS.md`. + - **Doc Validation**: Commands needed for changed docs, such as `doclinks`, `md-babel-py run `, and `bin/gen-diagrams`. + - **No Docs Needed**: If no docs are needed, explain why. + + Match `docs/development/writing_docs.md`: contributor-only docs belong in `docs/development`; user-facing behavior belongs in `docs/usage` or `docs/capabilities`. + requires: + - proposal + - id: tasks + generates: tasks.md + description: Implementation, validation, docs, and manual-QA checklist + template: tasks.md + instruction: | + Create the implementation checklist. The apply phase parses checkbox format, so every actionable task MUST use `- [ ]`. + + Guidelines: + - Group tasks under numbered `##` headings. + - Each task must be `- [ ] X.Y Task description`. + - Keep tasks small enough to complete in one focused session. + - Order tasks by dependency. + - Include docs and validation tasks from docs.md. + - Include generated registry tasks when blueprints or module registry inputs change. 
+ - Include manual QA through the actual user surface: CLI, TUI, HTTP API, MCP tool, simulation/replay blueprint, hardware procedure, or library driver. + + Typical DimOS validation tasks: + - Run `openspec validate `. + - Run focused pytest targets for changed modules. + - Run `pytest dimos/robot/test_all_blueprints_generation.py` when blueprint registry output may change. + - Run docs validation commands for changed docs. + - Run lints/types when the touched area requires them. + + Reference specs for WHAT, design for HOW, and docs.md for documentation work. + requires: + - specs + - design + - docs +apply: + requires: + - tasks + tracks: tasks.md + instruction: | + Read proposal.md, specs, design.md, docs.md, and tasks.md before editing code. + Work through pending tasks, mark checkboxes complete as they finish, and keep artifacts current when implementation changes the plan. + Verify with OpenSpec validation, focused tests, docs checks, and manual QA through the relevant DimOS surface. 
diff --git a/openspec/schemas/dimos-capability/templates/design.md b/openspec/schemas/dimos-capability/templates/design.md new file mode 100644 index 0000000000..25031ceb8b --- /dev/null +++ b/openspec/schemas/dimos-capability/templates/design.md @@ -0,0 +1,35 @@ +## Context + + + +## Goals / Non-Goals + +**Goals:** + + +**Non-Goals:** + + +## DimOS Architecture + + + +## Decisions + + + +## Safety / Simulation / Replay + + + +## Risks / Trade-offs + + + +## Migration / Rollout + + + +## Open Questions + + diff --git a/openspec/schemas/dimos-capability/templates/docs.md b/openspec/schemas/dimos-capability/templates/docs.md new file mode 100644 index 0000000000..d274aed653 --- /dev/null +++ b/openspec/schemas/dimos-capability/templates/docs.md @@ -0,0 +1,19 @@ +## User-Facing Docs + + + +## Contributor Docs + + + +## Coding-Agent Docs + + + +## Doc Validation + + + +## No Docs Needed + + diff --git a/openspec/schemas/dimos-capability/templates/proposal.md b/openspec/schemas/dimos-capability/templates/proposal.md new file mode 100644 index 0000000000..98d409e8de --- /dev/null +++ b/openspec/schemas/dimos-capability/templates/proposal.md @@ -0,0 +1,32 @@ +## Why + + + +## What Changes + + + +## Affected DimOS Surfaces + + +- Modules/streams: +- Blueprints/CLI: +- Skills/MCP: +- Hardware/simulation/replay: +- Docs/generated registries: + +## Capabilities + +### New Capabilities + +- ``: + +### Modified Capabilities + +- ``: + +## Impact + + diff --git a/openspec/schemas/dimos-capability/templates/spec.md b/openspec/schemas/dimos-capability/templates/spec.md new file mode 100644 index 0000000000..afc0c1ff58 --- /dev/null +++ b/openspec/schemas/dimos-capability/templates/spec.md @@ -0,0 +1,16 @@ +## ADDED Requirements + +### Requirement: + + +#### Scenario: +- **GIVEN** +- **WHEN** +- **THEN** +- **AND** + + diff --git a/openspec/schemas/dimos-capability/templates/tasks.md b/openspec/schemas/dimos-capability/templates/tasks.md new file mode 100644 index 
0000000000..b38fcdfabb --- /dev/null +++ b/openspec/schemas/dimos-capability/templates/tasks.md @@ -0,0 +1,15 @@ +## 1. Implementation + +- [ ] 1.1 +- [ ] 1.2 + +## 2. Documentation + +- [ ] 2.1 + +## 3. Verification + +- [ ] 3.1 Run `openspec validate ` +- [ ] 3.2 Run focused tests for changed code +- [ ] 3.3 Run docs validation commands for changed docs +- [ ] 3.4 Manually QA through the relevant DimOS surface (CLI, MCP, simulation/replay, hardware procedure, HTTP API, or library driver) From bccc44c580bbe052aaa4551b795ea36063969f42 Mon Sep 17 00:00:00 2001 From: cc Date: Wed, 10 Jun 2026 21:22:00 -0700 Subject: [PATCH 02/14] feat: add h264 image transport and memory2 storage --- dimos/core/transport.py | 21 + dimos/memory2/module.py | 6 +- dimos/memory2/store/base.py | 43 ++ dimos/memory2/store/sqlite.py | 61 ++- dimos/memory2/video/__init__.py | 15 + dimos/memory2/video/h264.py | 352 ++++++++++++++ dimos/memory2/video/test_h264_storage.py | 190 ++++++++ dimos/msgs/sensor_msgs/Image.py | 120 ++++- dimos/msgs/sensor_msgs/VideoPacket.py | 98 ++++ dimos/msgs/sensor_msgs/test_image.py | 57 +++ dimos/protocol/pubsub/impl/h264_lcm.py | 63 +++ dimos/protocol/pubsub/impl/test_h264_lcm.py | 179 +++++++ dimos/protocol/pubsub/registry.py | 17 +- dimos/protocol/pubsub/test_registry.py | 21 +- dimos/protocol/video/__init__.py | 15 + dimos/protocol/video/demo_h264_video_e2e.py | 278 +++++++++++ dimos/protocol/video/h264.py | 293 ++++++++++++ dimos/protocol/video/test_h264.py | 159 ++++++ dimos/robot/all_blueprints.py | 8 + docs/capabilities/memory/index.md | 76 +++ docs/coding-agents/style.md | 12 + docs/development/testing.md | 31 ++ docs/usage/blueprints.md | 22 + docs/usage/transports/index.md | 45 ++ .../.openspec.yaml | 2 + .../add-h264-codec-mem2-storage/design.md | 452 ++++++++++++++++++ .../add-h264-codec-mem2-storage/docs.md | 57 +++ .../add-h264-codec-mem2-storage/proposal.md | 40 ++ .../specs/h264-image-streams/spec.md | 85 ++++ 
.../specs/memory2-h264-storage/spec.md | 85 ++++ .../add-h264-codec-mem2-storage/tasks.md | 61 +++ 31 files changed, 2925 insertions(+), 39 deletions(-) create mode 100644 dimos/memory2/video/__init__.py create mode 100644 dimos/memory2/video/h264.py create mode 100644 dimos/memory2/video/test_h264_storage.py create mode 100644 dimos/msgs/sensor_msgs/VideoPacket.py create mode 100644 dimos/protocol/pubsub/impl/h264_lcm.py create mode 100644 dimos/protocol/pubsub/impl/test_h264_lcm.py create mode 100644 dimos/protocol/video/__init__.py create mode 100644 dimos/protocol/video/demo_h264_video_e2e.py create mode 100644 dimos/protocol/video/h264.py create mode 100644 dimos/protocol/video/test_h264.py create mode 100644 openspec/changes/add-h264-codec-mem2-storage/.openspec.yaml create mode 100644 openspec/changes/add-h264-codec-mem2-storage/design.md create mode 100644 openspec/changes/add-h264-codec-mem2-storage/docs.md create mode 100644 openspec/changes/add-h264-codec-mem2-storage/proposal.md create mode 100644 openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md create mode 100644 openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md create mode 100644 openspec/changes/add-h264-codec-mem2-storage/tasks.md diff --git a/dimos/core/transport.py b/dimos/core/transport.py index 6435003758..0e5d23ad4d 100644 --- a/dimos/core/transport.py +++ b/dimos/core/transport.py @@ -162,6 +162,27 @@ def stop(self) -> None: self._started = False +class H264LcmTransport(LCMTransport): # type: ignore[type-arg] + def __init__(self, topic: str, type: type, config: Any | None = None, **kwargs) -> None: # type: ignore[no-untyped-def] + from dimos.protocol.pubsub.impl.h264_lcm import H264LCM + from dimos.protocol.video.h264 import H264Config + + self.config = config or H264Config() + self.lcm = H264LCM(config=self.config, **kwargs) # type: ignore[assignment] + super().__init__(topic, type) + + def __reduce__(self): # type: 
ignore[no-untyped-def] + return (H264LcmTransport, (self.topic.topic, self.topic.lcm_type, self.config)) + + def start(self) -> None: + self.lcm.start() + self._started = True + + def stop(self) -> None: + self.lcm.stop() + self._started = False + + class pSHMTransport(PubSubTransport[T]): _started: bool = False diff --git a/dimos/memory2/module.py b/dimos/memory2/module.py index 9a3d90e164..d88ebc75ca 100644 --- a/dimos/memory2/module.py +++ b/dimos/memory2/module.py @@ -254,6 +254,7 @@ class RecorderConfig(MemoryModuleConfig): default_frame_id: str = "base_link" tf_tolerance: float = 0.5 db_path: str | Path = "recording.db" + image_storage: dict[str, Any] = Field(default_factory=dict) class Recorder(MemoryModule): @@ -303,7 +304,10 @@ def start(self) -> None: return for name, port in self.inputs.items(): - stream: Stream[Any] = self.store.stream(name, port.type) + stream_overrides: dict[str, Any] = {} + if name in self.config.image_storage: + stream_overrides["image_storage"] = self.config.image_storage[name] + stream: Stream[Any] = self.store.stream(name, port.type, **stream_overrides) self._port_to_stream(name, port, stream) logger.info("Recording %s (%s)", name, port.type.__name__) diff --git a/dimos/memory2/store/base.py b/dimos/memory2/store/base.py index 7a7162a6d1..35e698ad82 100644 --- a/dimos/memory2/store/base.py +++ b/dimos/memory2/store/base.py @@ -157,6 +157,49 @@ def _create_backend( self, name: str, payload_type: type[Any] | None = None, **config: Any ) -> Backend[Any]: """Create a Backend for the named stream. 
Called once per stream name.""" + image_storage = config.pop("image_storage", None) + if image_storage is not None: + from dimos.memory2.video.h264 import ( + H264FrameIndexStore, + H264ImageBackend, + storage_config_from_any, + ) + from dimos.msgs.sensor_msgs.Image import Image + + storage_config = storage_config_from_any(image_storage) + if ( + storage_config is not None + and payload_type is not None + and issubclass(payload_type, Image) + ): + bs = config.pop("blob_store", self.config.blob_store) + if bs is None: + raise TypeError("H.264 image storage requires a blob_store") + if isinstance(bs, type): + bs = bs() + obs = config.pop("observation_store", self.config.observation_store) + if obs is None or isinstance(obs, type): + obs = (obs or ListObservationStore)(name=name) + vs = config.pop("vector_store", self.config.vector_store) + if isinstance(vs, type): + vs = vs() + notifier = config.pop("notifier", self.config.notifier) + if notifier is None or isinstance(notifier, type): + notifier = (notifier or SubjectNotifier)() + frame_index = config.pop("frame_index", None) + if frame_index is None: + raise TypeError("H.264 image storage requires a frame_index") + if not isinstance(frame_index, H264FrameIndexStore): + raise TypeError("H.264 image storage frame_index must be H264FrameIndexStore") + return H264ImageBackend( + metadata_store=obs, + blob_store=bs, + frame_index=frame_index, + storage_config=storage_config, + vector_store=vs, + notifier=notifier, + eager_blobs=config.get("eager_blobs", False), + ) codec = self._resolve_codec(payload_type, config.pop("codec", None)) # Instantiate or use provided instances diff --git a/dimos/memory2/store/sqlite.py b/dimos/memory2/store/sqlite.py index bb2b735c1c..7ecd3c04ab 100644 --- a/dimos/memory2/store/sqlite.py +++ b/dimos/memory2/store/sqlite.py @@ -66,12 +66,23 @@ def _open_connection(self) -> sqlite3.Connection: def _assemble_backend(self, name: str, stored: dict[str, Any]) -> Backend[Any]: """Reconstruct a 
Backend from a stored config dict.""" from dimos.memory2.codecs.base import _resolve_payload_type, codec_from_id + from dimos.memory2.codecs.pickle import PickleCodec + from dimos.memory2.video.h264 import ( + H264FrameIndexStore, + H264ImageBackend, + storage_config_from_any, + ) payload_module = stored["payload_module"] - codec = codec_from_id(stored["codec_id"], payload_module) data_type = _resolve_payload_type(payload_module) eager_blobs = stored.get("eager_blobs", False) page_size = stored.get("page_size", self.config.page_size) + image_storage = storage_config_from_any(stored.get("image_storage")) + codec = ( + PickleCodec() + if image_storage is not None + else codec_from_id(stored["codec_id"], payload_module) + ) backend_conn = self._open_connection() @@ -113,15 +124,26 @@ def _assemble_backend(self, name: str, stored: dict[str, Any]) -> Backend[Any]: blob_store_conn_match=blob_store_conn_match and eager_blobs, page_size=page_size, ) - backend: Backend[Any] = Backend( - metadata_store=metadata_store, - codec=codec, - data_type=data_type, - blob_store=bs, - vector_store=vs, - notifier=notifier, - eager_blobs=eager_blobs, - ) + if image_storage is not None: + backend = H264ImageBackend( + metadata_store=metadata_store, + blob_store=bs, + frame_index=H264FrameIndexStore(backend_conn), + storage_config=image_storage, + vector_store=vs, + notifier=notifier, + eager_blobs=eager_blobs, + ) + else: + backend = Backend( + metadata_store=metadata_store, + codec=codec, + data_type=data_type, + blob_store=bs, + vector_store=vs, + notifier=notifier, + eager_blobs=eager_blobs, + ) return backend @staticmethod @@ -135,6 +157,9 @@ def _serialize_backend( "eager_blobs": backend.eager_blobs, "page_size": page_size, } + if hasattr(backend, "storage_config"): + cfg["codec_id"] = "h264" + cfg["image_storage"] = backend.storage_config.serialize() if backend.blob_store is not None: cfg["blob_store"] = backend.blob_store.serialize() if backend.vector_store is not None: @@ -166,14 
+191,26 @@ def _create_backend( backend_conn = self._open_connection() + image_storage = config.get("image_storage") + # Inject conn-shared instances unless user provided overrides if not isinstance(config.get("blob_store"), BlobStore): config["blob_store"] = SqliteBlobStore(conn=backend_conn) if not isinstance(config.get("vector_store"), VectorStore): config["vector_store"] = SqliteVectorStore(conn=backend_conn) - # Resolve codec early — needed for SqliteObservationStore - codec = self._resolve_codec(payload_type, config.get("codec")) + # Resolve codec early — needed for SqliteObservationStore. H.264 image + # streams own blob decoding in H264ImageBackend, so keep sqlite eager + # joins disabled and use a harmless metadata-store codec. + if image_storage is not None: + from dimos.memory2.codecs.pickle import PickleCodec + from dimos.memory2.video.h264 import H264FrameIndexStore + + codec = PickleCodec() + config["frame_index"] = H264FrameIndexStore(backend_conn) + config["eager_blobs"] = False + else: + codec = self._resolve_codec(payload_type, config.get("codec")) config["codec"] = codec # Create SqliteObservationStore with conn-sharing diff --git a/dimos/memory2/video/__init__.py b/dimos/memory2/video/__init__.py new file mode 100644 index 0000000000..86e17cecb4 --- /dev/null +++ b/dimos/memory2/video/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Video storage helpers for memory2.""" diff --git a/dimos/memory2/video/h264.py b/dimos/memory2/video/h264.py new file mode 100644 index 0000000000..9ba73d0f7a --- /dev/null +++ b/dimos/memory2/video/h264.py @@ -0,0 +1,352 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import asdict, dataclass, replace +import sqlite3 +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +from dimos.memory2.backend import Backend +from dimos.memory2.codecs.pickle import PickleCodec +from dimos.memory2.notifier.subject import SubjectNotifier +from dimos.memory2.type.observation import _UNLOADED +from dimos.msgs.sensor_msgs.Image import Image, ImageFormat +from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket +from dimos.protocol.video.h264 import ( + H264CodecAdapter, + H264Config, + H264Decoder, + H264Encoder, + VideoDecodeGapError, +) + +if TYPE_CHECKING: + from collections.abc import Iterator + + from dimos.memory2.blobstore.base import BlobStore + from dimos.memory2.notifier.base import Notifier + from dimos.memory2.observationstore.base import ObservationStore + from dimos.memory2.type.filter import StreamQuery + from dimos.memory2.type.observation import Observation + from dimos.memory2.vectorstore.base import VectorStore + +T = TypeVar("T") + + +@dataclass(frozen=True) +class H264ImageStorageConfig: + """Per-stream memory2 image storage mode for H.264-backed 
class H264FrameIndexStore:
    """Persistent GOP/keyframe index for H.264-backed image streams.

    One row per stored frame in ``h264_frames`` records the observation id of
    the keyframe that opens that frame's GOP, so a reader can look up exactly
    which packets have to be fed to a decoder to reconstruct a given frame.

    The store only issues statements on the connection it is given; it never
    commits, rolls back, or closes it.
    """

    def __init__(self, conn: sqlite3.Connection) -> None:
        self._conn = conn

    def start(self) -> None:
        """Create the ``h264_frames`` table and its keyframe index if absent."""
        self._conn.execute(
            """
            CREATE TABLE IF NOT EXISTS h264_frames (
                stream_name TEXT NOT NULL,
                observation_id INTEGER NOT NULL,
                seq INTEGER NOT NULL,
                keyframe_observation_id INTEGER NOT NULL,
                is_keyframe INTEGER NOT NULL,
                pts INTEGER NOT NULL,
                width INTEGER NOT NULL,
                height INTEGER NOT NULL,
                format TEXT NOT NULL,
                codec TEXT NOT NULL,
                bitstream TEXT NOT NULL,
                PRIMARY KEY (stream_name, observation_id)
            )
            """
        )
        self._conn.execute(
            """
            CREATE INDEX IF NOT EXISTS idx_h264_frames_stream_keyframe
            ON h264_frames(stream_name, is_keyframe, observation_id)
            """
        )

    def stop(self) -> None:
        """Do nothing; this store holds no resources of its own."""
        pass

    def insert(self, stream_name: str, observation_id: int, packet: VideoPacket) -> None:
        """Index one stored packet under ``(stream_name, observation_id)``.

        A keyframe points at itself. A delta frame points at the observation
        id of the keyframe whose ``seq`` equals ``packet.keyframe_seq``.

        Raises:
            VideoDecodeGapError: if ``packet`` is a delta frame and no keyframe
                with ``packet.keyframe_seq`` is indexed for the stream.
        """
        keyframe_observation_id = (
            observation_id
            if packet.is_keyframe
            else self._keyframe_observation_id(stream_name, packet.keyframe_seq)
        )
        self._conn.execute(
            """
            INSERT INTO h264_frames (
                stream_name, observation_id, seq, keyframe_observation_id, is_keyframe,
                pts, width, height, format, codec, bitstream
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                stream_name,
                observation_id,
                packet.seq,
                keyframe_observation_id,
                int(packet.is_keyframe),
                packet.pts,
                packet.width,
                packet.height,
                packet.format,
                packet.codec,
                packet.bitstream,
            ),
        )

    def packet_ids_for_decode(self, stream_name: str, observation_id: int) -> list[int]:
        """Return the observation ids to decode, in order, to reach ``observation_id``.

        The list runs from the GOP's keyframe up to and including the
        requested observation, in ascending id order.

        Raises:
            VideoDecodeGapError: if the observation is not indexed, or the
                indexed range does not start at its keyframe and end at the
                observation itself.
        """
        row = self._conn.execute(
            """
            SELECT keyframe_observation_id FROM h264_frames
            WHERE stream_name = ? AND observation_id = ?
            """,
            (stream_name, observation_id),
        ).fetchone()
        if row is None:
            raise VideoDecodeGapError(f"No H.264 GOP index for observation {observation_id}")
        keyframe_id = int(row[0])
        rows = self._conn.execute(
            """
            SELECT observation_id FROM h264_frames
            WHERE stream_name = ? AND observation_id BETWEEN ? AND ?
            ORDER BY observation_id ASC
            """,
            (stream_name, keyframe_id, observation_id),
        ).fetchall()
        ids = [int(item[0]) for item in rows]
        # Only the two endpoints are verified here; rows missing from the
        # middle of the range are not detected by this check.
        if not ids or ids[0] != keyframe_id or ids[-1] != observation_id:
            raise VideoDecodeGapError(
                f"Incomplete H.264 GOP index for observation {observation_id}"
            )
        return ids

    def rows(self, stream_name: str) -> list[H264FrameIndexRow]:
        """Return every index row of ``stream_name`` ordered by observation id."""
        rows = self._conn.execute(
            """
            SELECT stream_name, observation_id, seq, keyframe_observation_id, is_keyframe,
                   pts, width, height, format, codec, bitstream
            FROM h264_frames WHERE stream_name = ?
            ORDER BY observation_id ASC
            """,
            (stream_name,),
        ).fetchall()
        return [
            H264FrameIndexRow(
                stream_name=row[0],
                observation_id=int(row[1]),
                seq=int(row[2]),
                keyframe_observation_id=int(row[3]),
                is_keyframe=bool(row[4]),
                pts=int(row[5]),
                width=int(row[6]),
                height=int(row[7]),
                format=row[8],
                codec=row[9],
                bitstream=row[10],
            )
            for row in rows
        ]

    def _keyframe_observation_id(self, stream_name: str, keyframe_seq: int) -> int:
        """Return the observation id of the newest keyframe with ``keyframe_seq``.

        Raises:
            VideoDecodeGapError: if no such keyframe is indexed for the stream.
        """
        # Nothing in the schema makes (stream_name, seq) unique, so the same
        # seq can be indexed more than once (presumably when an encoder's
        # counter restarts on an existing stream -- TODO confirm). A delta
        # frame being inserted belongs to the most recently stored keyframe,
        # so pick the highest observation id rather than an arbitrary match.
        row = self._conn.execute(
            """
            SELECT observation_id FROM h264_frames
            WHERE stream_name = ? AND seq = ? AND is_keyframe = 1
            ORDER BY observation_id DESC
            LIMIT 1
            """,
            (stream_name, keyframe_seq),
        ).fetchone()
        if row is None:
            raise VideoDecodeGapError(f"No H.264 keyframe index for seq {keyframe_seq}")
        return int(row[0])
+ packet_ids = frame_index.packet_ids_for_decode(name, row_id) + decoder = H264Decoder(storage_config.codec, codec=storage_config.codec_adapter) + decoded: Image | None = None + for packet_id in packet_ids: + packet = VideoPacket.lcm_decode(bs.get(name, packet_id)) + decoded = decoder.decode(packet) + if decoded is None: + raise VideoDecodeGapError(f"No H.264 packet available for observation {row_id}") + return decoded + + return loader + + def append(self, obs: Observation[Image]) -> Observation[Image]: + payload = obs.data + if not isinstance(payload, Image): + raise TypeError(f"Stream expects Image, got {type(payload).__qualname__}") + obs.data_type = Image + packet = self._encoder.encode(payload) + encoded = packet.lcm_encode() + try: + row_id = self.metadata_store.insert(obs) + obs.id = row_id + assert self.blob_store is not None + self.blob_store.put(self.name, row_id, encoded) + self.frame_index.insert(self.name, row_id, packet) + obs._data = _UNLOADED + obs._loader = self._make_loader(row_id) + if self.vector_store is not None: + emb = getattr(obs, "embedding", None) + if emb is not None: + self.vector_store.put(self.name, row_id, emb) + if hasattr(self.metadata_store, "commit"): + self.metadata_store.commit() + except BaseException: + if hasattr(self.metadata_store, "rollback"): + self.metadata_store.rollback() + raise + self.notifier.notify(obs) + return obs + + def _attach_loaders(self, it: Iterator[Observation[Image]]) -> Iterator[Observation[Image]]: + for obs in it: + obs.data_type = Image + if obs._loader is None and isinstance(obs._data, type(_UNLOADED)): + obs._loader = self._make_loader(obs.id) + yield obs + + def _iterate_snapshot(self, query: StreamQuery) -> Iterator[Observation[Image]]: + it = self._attach_loaders(self.metadata_store.query(query)) + if self.eager_blobs: + for obs in it: + _ = obs.data + yield obs + else: + yield from it + + def serialize(self) -> dict[str, Any]: + cfg = super().serialize() + cfg["codec_id"] = "h264" + 
def storage_config_from_any(raw: Any) -> H264ImageStorageConfig | None:
    """Parse ``raw`` and return it only when it selects H.264 storage.

    ``None`` is returned both when ``raw`` is ``None`` and when the parsed
    config's ``mode`` is anything other than ``"h264"``.
    """
    if raw is None:
        return None
    parsed = H264ImageStorageConfig.parse(raw)
    return parsed if parsed.mode == "h264" else None
+ +from __future__ import annotations + +import sqlite3 + +import numpy as np +import pytest + +from dimos.memory2.blobstore.sqlite import SqliteBlobStore +from dimos.memory2.codecs.pickle import PickleCodec +from dimos.memory2.observationstore.sqlite import SqliteObservationStore +from dimos.memory2.store.sqlite import SqliteStore +from dimos.memory2.type.observation import _UNLOADED +from dimos.memory2.video.h264 import ( + H264FrameIndexStore, + H264ImageBackend, + H264ImageStorageConfig, + storage_config_from_any, +) +from dimos.msgs.sensor_msgs.Image import Image, ImageFormat +from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket +from dimos.protocol.video.h264 import UnsupportedVideoImageError, VideoDecodeGapError + + +class FakeH264CodecAdapter: + def encode_image(self, image: Image, *, force_keyframe: bool) -> tuple[bytes, int]: + return image.data.tobytes(), int(image.ts * 1000) + + def decode_packet(self, packet: VideoPacket) -> Image: + channels = 1 if packet.format == ImageFormat.GRAY.value else 3 + shape = ( + (packet.height, packet.width) + if channels == 1 + else (packet.height, packet.width, channels) + ) + arr = np.frombuffer(packet.data, dtype=np.uint8).copy().reshape(shape) + return Image.from_numpy( + arr, format=ImageFormat(packet.format), frame_id=packet.frame_id, ts=packet.ts + ) + + +def _image(seq: int, fmt: ImageFormat = ImageFormat.RGB) -> Image: + data = np.full((2, 2, 3), seq, dtype=np.uint8) + if fmt == ImageFormat.GRAY: + data = np.full((2, 2), seq, dtype=np.uint8) + return Image.from_numpy(data, format=fmt, frame_id="cam", ts=float(seq)) + + +def _make_backend( + conn: sqlite3.Connection, *, config: H264ImageStorageConfig | None = None +) -> H264ImageBackend: + frame_index = H264FrameIndexStore(conn) + blob_store = SqliteBlobStore(conn=conn) + obs_store = SqliteObservationStore( + conn=conn, name="cam", codec=PickleCodec(), blob_store_conn_match=False, page_size=256 + ) + backend = H264ImageBackend( + metadata_store=obs_store, + 
blob_store=blob_store, + frame_index=frame_index, + storage_config=config or H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()), + ) + backend.start() + return backend + + +def test_storage_config_parse_and_serialize() -> None: + config = H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + raw = config.serialize() + parsed = H264ImageStorageConfig.parse(raw) + assert parsed.mode == "h264" + assert parsed.codec == config.codec + assert storage_config_from_any(raw) == H264ImageStorageConfig(codec=config.codec) + assert storage_config_from_any({"mode": "jpeg", "codec": raw["codec"]}) is None + + +def test_store_creates_h264_backend_from_config(tmp_path) -> None: + store = SqliteStore(path=str(tmp_path / "h264.db")) + backend = store._create_backend( + "cam", + Image, + image_storage=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()), + ) + assert isinstance(backend, H264ImageBackend) + assert backend.storage_config.mode == "h264" + assert isinstance(backend.storage_config.codec_adapter, FakeH264CodecAdapter) + + +def test_h264_image_stream_keeps_default_jpeg_compatibility(tmp_path) -> None: + store = SqliteStore(path=str(tmp_path / "jpeg.db")) + stream = store.stream("rgb", Image) + obs = stream.append(_image(1)) + assert obs.data.format == ImageFormat.RGB + assert store.stream("rgb").count() == 1 + + +def test_h264_one_observation_and_one_blob_per_frame(tmp_path) -> None: + conn = sqlite3.connect(str(tmp_path / "frames.db")) + backend = _make_backend(conn) + from dimos.memory2.type.observation import Observation + + stored = backend.append(Observation(data_type=Image, _data=_image(1))) + assert stored.id == 1 + assert backend.blob_store is not None + assert backend.blob_store.get("cam", 1) + assert len(backend.frame_index.rows("cam")) == 1 + + +def test_h264_persistent_gop_index_and_lazy_decode(tmp_path) -> None: + db = tmp_path / "gop.db" + with SqliteStore(path=str(db)) as store: + stream = store.stream( + "cam", Image, 
image_storage=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + ) + stream.append(_image(1), ts=1.0) + stream.append(_image(2), ts=2.0) + obs = list(stream)[1] + assert obs._loader is not None + assert obs._data is _UNLOADED + assert obs.id == 2 + assert obs.ts == 2.0 + assert obs.data.data.shape == (2, 2, 3) + backend = stream._source + assert isinstance(backend, H264ImageBackend) + assert len(backend.frame_index.rows("cam")) == 2 + + with SqliteStore(path=str(db), must_exist=True) as reopened: + stream = reopened.stream("cam", Image) + assert stream.count() == 2 + backend = stream._source + assert isinstance(backend, H264ImageBackend) + assert backend.storage_config.mode == "h264" + backend.storage_config = H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + assert reopened.streams.cam.first().data.data.shape == (2, 2, 3) + + +def test_h264_mid_gop_decode_and_missing_gop_failure(tmp_path) -> None: + store = SqliteStore(path=str(tmp_path / "gap.db")) + stream = store.stream( + "cam", Image, image_storage=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + ) + stream.append(_image(1)) + stream.append(_image(2)) + stream.append(_image(3)) + obs = list(stream)[2] + assert obs.data.data[0, 0, 0] == 3 + + backend = stream._source + assert isinstance(backend, H264ImageBackend) + backend.frame_index._conn.execute("DELETE FROM h264_frames WHERE observation_id = 2") + gap_obs = list(stream)[1] + with pytest.raises(VideoDecodeGapError): + _ = gap_obs.data + + +def test_replay_iterate_returns_decoded_images(tmp_path) -> None: + store = SqliteStore(path=str(tmp_path / "replay.db")) + stream = store.stream( + "cam", Image, image_storage=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + ) + stream.append(_image(1), ts=1.0) + stream.append(_image(2), ts=2.0) + + replay = store.replay() + images = list(replay.streams.cam.iterate()) + assert [img.ts for img in images] == [1.0, 2.0] + assert [img.data[0, 0, 0] for img in images] == 
[1, 2] + + +def test_h264_rejects_unsupported_formats(tmp_path) -> None: + store = SqliteStore(path=str(tmp_path / "bad.db")) + stream = store.stream( + "cam", Image, image_storage=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + ) + rgba = np.zeros((2, 2, 4), dtype=np.uint8) + with pytest.raises(UnsupportedVideoImageError): + stream.append(Image.from_numpy(rgba, format=ImageFormat.RGBA)) diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py index 5eaca03886..b55060625c 100644 --- a/dimos/msgs/sensor_msgs/Image.py +++ b/dimos/msgs/sensor_msgs/Image.py @@ -15,7 +15,7 @@ from __future__ import annotations import base64 -from dataclasses import dataclass, field +from dataclasses import dataclass from enum import Enum import time from typing import TYPE_CHECKING, Any, Literal, TypedDict @@ -39,6 +39,16 @@ from reactivex.observable import Observable +_DEFAULT_IMAGE_DATA = object() + + +class _LazyPixelData: + pass + + +_UNLOADED_PIXELS = _LazyPixelData() + + class ImageFormat(Enum): BGR = "BGR" RGB = "RGB" @@ -82,24 +92,92 @@ class AgentImageMessage(TypedDict): data: str # Base64 encoded image data -@dataclass +@dataclass(init=False) class Image(Timestamped): """Simple NumPy-based image container.""" msg_name = "sensor_msgs.Image" - data: np.ndarray[Any, np.dtype[Any]] = field( - default_factory=lambda: np.zeros((1, 1, 3), dtype=np.uint8) - ) - format: ImageFormat = field(default=ImageFormat.BGR) - frame_id: str = field(default="") - ts: float = field(default_factory=time.time) - - def __post_init__(self) -> None: - if not isinstance(self.data, np.ndarray): - self.data = np.asarray(self.data) - if self.data.ndim < 2: + def __init__( + self, + data: Any = _DEFAULT_IMAGE_DATA, + format: ImageFormat = ImageFormat.BGR, + frame_id: str = "", + ts: float | None = None, + *, + pixel_loader: Callable[[], np.ndarray[Any, np.dtype[Any]]] | None = None, + height: int | None = None, + width: int | None = None, + channels: int | None = None, + 
dtype: np.dtype[Any] | type[Any] | None = None, + ) -> None: + self.format = format + self.frame_id = frame_id + self.ts = ts if ts is not None else time.time() + self._pixel_loader = pixel_loader + + if pixel_loader is None: + if data is _DEFAULT_IMAGE_DATA: + data = np.zeros((1, 1, 3), dtype=np.uint8) + self.data = data + return + + if height is None or width is None or dtype is None: + raise ValueError("Lazy Image construction requires height, width, and dtype metadata") + if height <= 0 or width <= 0: + raise ValueError("Lazy Image height and width must be positive") + self._data: np.ndarray[Any, np.dtype[Any]] | _LazyPixelData = _UNLOADED_PIXELS + self._height = int(height) + self._width = int(width) + self._channels = int(channels or 1) + self._dtype = np.dtype(dtype) + + @classmethod + def lazy( + cls, + *, + pixel_loader: Callable[[], np.ndarray[Any, np.dtype[Any]]], + height: int, + width: int, + format: ImageFormat = ImageFormat.BGR, + frame_id: str = "", + ts: float | None = None, + channels: int | None = None, + dtype: np.dtype[Any] | type[Any] = np.uint8, + ) -> Image: + """Construct an image whose pixels are materialized on first data access.""" + + return cls( + format=format, + frame_id=frame_id, + ts=ts, + pixel_loader=pixel_loader, + height=height, + width=width, + channels=channels, + dtype=dtype, + ) + + @property + def data(self) -> np.ndarray[Any, np.dtype[Any]]: + if isinstance(self._data, _LazyPixelData): + if self._pixel_loader is None: + raise ValueError("Lazy Image has no pixel loader") + self.data = self._pixel_loader() + self._pixel_loader = None + return self._data + + @data.setter + def data(self, value: Any) -> None: + arr = value if isinstance(value, np.ndarray) else np.asarray(value) + if arr.ndim < 2: raise ValueError("Image requires a 2D/3D NumPy array") + self._data = arr + self._height = int(arr.shape[0]) + self._width = int(arr.shape[1]) + self._channels = 1 if arr.ndim == 2 else int(arr.shape[2]) + self._dtype = arr.dtype + 
_MAGIC = b"DVP1"


@dataclass(frozen=True)
class VideoPacket:
    """One encoded video frame/access unit.

    The first supported shape is a complete H.264 Annex B access unit for
    exactly one source image frame. Delta frames are complete encoded-frame
    packets, but they are not necessarily independently decodable without the
    preceding GOP state.

    Wire format produced by :meth:`lcm_encode`: the 4-byte magic ``DVP1``, a
    big-endian unsigned 32-bit header length, a UTF-8 JSON object holding
    every field except ``data``, then the raw ``data`` bytes.
    """

    msg_name: ClassVar[str] = "sensor_msgs.VideoPacket"

    seq: int
    ts: float
    frame_id: str
    width: int
    height: int
    format: str
    codec: str
    bitstream: str
    is_keyframe: bool
    keyframe_seq: int
    pts: int
    data: bytes

    def __post_init__(self) -> None:
        """Validate field values and coerce ``data`` to ``bytes``.

        Raises:
            ValueError: on a negative ``seq``, non-positive dimensions, a codec
                other than ``"h264"``, a bitstream other than ``"annex_b"``, or
                empty ``data``.
        """
        if self.seq < 0:
            raise ValueError("seq must be non-negative")
        if self.width <= 0 or self.height <= 0:
            raise ValueError("width and height must be positive")
        if self.codec != "h264":
            raise ValueError(f"Unsupported video codec: {self.codec!r}")
        if self.bitstream != "annex_b":
            raise ValueError(f"Unsupported video bitstream: {self.bitstream!r}")
        if not isinstance(self.data, bytes):
            # The dataclass is frozen, so normal attribute assignment is blocked.
            object.__setattr__(self, "data", bytes(self.data))
        if len(self.data) == 0:
            raise ValueError("VideoPacket data must not be empty")

    def lcm_encode(self) -> bytes:
        """Encode into a compact self-describing binary envelope."""

        header = {
            "seq": self.seq,
            "ts": self.ts,
            "frame_id": self.frame_id,
            "width": self.width,
            "height": self.height,
            "format": self.format,
            "codec": self.codec,
            "bitstream": self.bitstream,
            "is_keyframe": self.is_keyframe,
            "keyframe_seq": self.keyframe_seq,
            "pts": self.pts,
        }
        header_bytes = json.dumps(header, separators=(",", ":")).encode("utf-8")
        return _MAGIC + struct.pack("!I", len(header_bytes)) + header_bytes + self.data

    @classmethod
    def lcm_decode(cls, payload: bytes) -> VideoPacket:
        """Decode a packet produced by :meth:`lcm_encode`.

        Raises:
            ValueError: for every malformed payload -- bad magic, truncated
                header, header that is not valid UTF-8 JSON, header that is not
                a JSON object, missing/unknown/wrong-typed header fields, or
                field values rejected by ``__post_init__``.
        """

        if len(payload) < 8 or payload[:4] != _MAGIC:
            raise ValueError("Invalid VideoPacket payload")
        header_len = struct.unpack("!I", payload[4:8])[0]
        header_start = 8
        header_end = header_start + header_len
        if header_end > len(payload):
            raise ValueError("Truncated VideoPacket header")
        # UnicodeDecodeError and json.JSONDecodeError are both ValueError
        # subclasses, so they already satisfy the documented contract.
        header: Any = json.loads(payload[header_start:header_end].decode("utf-8"))
        if not isinstance(header, dict):
            raise ValueError("VideoPacket header must be a JSON object")
        try:
            return cls(data=payload[header_end:], **header)
        except TypeError as exc:
            # Missing, unexpected, duplicated ("data") or wrong-typed fields
            # surface as TypeError from the constructor; callers only handle
            # ValueError for bad payloads, so normalise it here.
            raise ValueError(f"Malformed VideoPacket header: {exc}") from exc
class H264EncoderMixin(PubSubEncoderMixin[LCMTopicProto, Image, bytes]):
    """Pub/sub encoder mixin that carries Image messages as H.264 packets.

    The encoder and decoder are built on first use from ``h264_config`` and
    then reused for every later message handled by this instance.
    """

    def __init__(self, *, config: H264Config | None = None, **kwargs: object) -> None:
        super().__init__(**kwargs)  # type: ignore[misc]
        self.h264_config = config if config else H264Config()
        self._encoder: H264Encoder | None = None
        self._decoder: H264Decoder | None = None

    def encode(self, msg: Image, topic: LCMTopicProto) -> bytes:
        """Encode ``msg`` to a video packet and return its serialized bytes."""
        encoder = self._encoder
        if encoder is None:
            encoder = H264Encoder(self.h264_config)
            self._encoder = encoder
        packet = encoder.encode(msg)
        return packet.lcm_encode()

    def decode(self, msg: bytes, topic: LCMTopicProto) -> Image:
        """Parse ``msg`` as a video packet and decode it to an Image.

        Raises:
            DecodingError: for the ``LCM_SELF_TEST`` topic, for a topic typed
                as something other than Image, for a payload that is not a
                valid video packet, and when the decoder reports a decode gap.
        """
        if topic.topic == "LCM_SELF_TEST":
            raise DecodingError("Ignoring LCM_SELF_TEST topic")
        if not (topic.lcm_type is None or issubclass(topic.lcm_type, Image)):
            raise DecodingError(f"H.264 LCM topic {topic.topic!r} is not typed as Image")

        # The decoder is created before the payload is parsed, so it exists
        # even when the first message turns out to be invalid.
        if self._decoder is None:
            self._decoder = H264Decoder(self.h264_config)

        try:
            packet = VideoPacket.lcm_decode(msg)
        except ValueError as parse_error:
            raise DecodingError(str(parse_error)) from parse_error

        try:
            image = self._decoder.decode(packet)
        except VideoDecodeGapError as gap_error:
            raise DecodingError(str(gap_error)) from gap_error
        return image
+ +from __future__ import annotations + +from collections.abc import Callable +from dataclasses import dataclass + +import numpy as np +import pytest + +from dimos.msgs.protocol import DimosMsg +from dimos.msgs.sensor_msgs.Image import Image, ImageFormat +from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket +from dimos.protocol.pubsub.encoders import DecodingError, LCMTopicProto +from dimos.protocol.pubsub.impl.h264_lcm import H264LCM, H264EncoderMixin +from dimos.protocol.video.h264 import VideoDecodeGapError + + +@dataclass +class StubTopic: + topic: str + lcm_type: type[DimosMsg] | None = None + + +class FakeEncoder: + def encode(self, image: Image) -> VideoPacket: + return VideoPacket( + seq=0, + ts=image.ts, + frame_id=image.frame_id, + width=image.width, + height=image.height, + format=image.format.value, + codec="h264", + bitstream="annex_b", + is_keyframe=True, + keyframe_seq=0, + pts=90, + data=b"\x00\x00\x00\x01\x65", + ) + + +class FakeDecoder: + def __init__(self, *, fail: bool = False) -> None: + self.fail = fail + + def decode(self, packet: VideoPacket) -> Image: + if self.fail: + raise VideoDecodeGapError("waiting for keyframe") + return Image( + data=np.zeros((packet.height, packet.width, 3), dtype=np.uint8), + format=ImageFormat(packet.format), + frame_id=packet.frame_id, + ts=packet.ts, + ) + + +class InMemoryPubSubBase: + def __init__(self, **_: object) -> None: + self._subscribers: list[tuple[LCMTopicProto, Callable[[bytes, LCMTopicProto], None]]] = [] + + def publish(self, topic: LCMTopicProto, message: bytes) -> None: + for subscribed_topic, callback in self._subscribers: + if subscribed_topic.topic == topic.topic: + callback(message, topic) + + def subscribe( + self, topic: LCMTopicProto, callback: Callable[[bytes, LCMTopicProto], None] + ) -> Callable[[], None]: + item = (topic, callback) + self._subscribers.append(item) + + def unsubscribe() -> None: + self._subscribers.remove(item) + + return unsubscribe + + +class 
InMemoryH264PubSub(H264EncoderMixin, InMemoryPubSubBase): # type: ignore[misc] + pass + + +def test_h264_lcm_encodes_image_as_video_packet_bytes() -> None: + transport = H264LCM() + transport._encoder = FakeEncoder() # type: ignore[assignment] + image = Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") + + payload = transport.encode(image, StubTopic("/color", Image)) + packet = VideoPacket.lcm_decode(payload) + + assert packet.codec == "h264" + assert packet.bitstream == "annex_b" + assert packet.width == 3 + assert packet.height == 2 + assert packet.is_keyframe is True + + +def test_h264_lcm_decodes_video_packet_bytes_to_image() -> None: + transport = H264LCM() + transport._decoder = FakeDecoder() # type: ignore[assignment] + packet = FakeEncoder().encode( + Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") + ) + + image = transport.decode(packet.lcm_encode(), StubTopic("/color", Image)) + + assert image.frame_id == "cam" + assert image.shape == (2, 3, 3) + + +def test_h264_lcm_suppresses_decode_gap() -> None: + transport = H264LCM() + transport._decoder = FakeDecoder(fail=True) # type: ignore[assignment] + packet = FakeEncoder().encode( + Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") + ) + + with pytest.raises(DecodingError, match="waiting for keyframe"): + transport.decode(packet.lcm_encode(), StubTopic("/color", Image)) + + +def test_h264_lcm_suppresses_non_video_packet_payload() -> None: + transport = H264LCM() + + with pytest.raises(DecodingError, match="Invalid VideoPacket payload"): + transport.decode(b"not-a-video-packet", StubTopic("/color", Image)) + + +def test_h264_lcm_publish_subscribe_delivers_decoded_image() -> None: + transport = InMemoryH264PubSub() + transport._encoder = FakeEncoder() # type: ignore[assignment] + transport._decoder = FakeDecoder() # type: ignore[assignment] + topic = StubTopic("/color", Image) + received: 
list[Image] = [] + + transport.subscribe(topic, lambda image, _topic: received.append(image)) + transport.publish( + topic, + Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam"), + ) + + assert len(received) == 1 + assert received[0].frame_id == "cam" + assert received[0].shape == (2, 3, 3) + + +def test_h264_lcm_late_subscriber_waits_for_keyframe() -> None: + transport = InMemoryH264PubSub() + topic = StubTopic("/color", Image) + received: list[Image] = [] + decoder = FakeDecoder(fail=True) + transport._decoder = decoder # type: ignore[assignment] + + transport.subscribe(topic, lambda image, _topic: received.append(image)) + delta_packet = FakeEncoder().encode( + Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") + ) + InMemoryPubSubBase.publish(transport, topic, delta_packet.lcm_encode()) + + decoder.fail = False + keyframe_packet = FakeEncoder().encode( + Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") + ) + InMemoryPubSubBase.publish(transport, topic, keyframe_packet.lcm_encode()) + + assert len(received) == 1 + assert received[0].frame_id == "cam" diff --git a/dimos/protocol/pubsub/registry.py b/dimos/protocol/pubsub/registry.py index 82afd2cef6..9bbb47f5e9 100644 --- a/dimos/protocol/pubsub/registry.py +++ b/dimos/protocol/pubsub/registry.py @@ -22,13 +22,13 @@ :[#] -- ````: registry key, e.g. ``lcm``, ``jpeg_lcm``, ``plcm``, ``pshm``, - ``shm``, ``jpeg_shm``. +- ````: registry key, e.g. ``lcm``, ``jpeg_lcm``, ``h264_lcm``, + ``plcm``, ``pshm``, ``shm``, ``jpeg_shm``. - ````: channel/key, passed verbatim to the transport constructor. - ````: optional ``module.ClassName`` resolved via ``dimos.msgs.helpers.resolve_msg_type`` (e.g. ``sensor_msgs.Image``). 
-Typed protos (``lcm``, ``jpeg_lcm``) require a message type — either from the +Typed protos (``lcm``, ``jpeg_lcm``, ``h264_lcm``) require a message type — either from the ``#``-suffix or the ``msg_type`` kwarg. Pickled / self-describing protos (``plcm``, ``pshm``, ``shm``, ``jpeg_shm``) don't. """ @@ -61,6 +61,16 @@ def _make_jpeg_lcm(topic: str, msg_type: type | None) -> Any: return JpegLcmTransport(topic, msg_type) +def _make_h264_lcm(topic: str, msg_type: type | None) -> Any: + if msg_type is None: + raise ValueError( + "proto 'h264_lcm' requires a message type (URI '#suffix' or msg_type kwarg)" + ) + from dimos.core.transport import H264LcmTransport + + return H264LcmTransport(topic, msg_type) + + def _make_plcm(topic: str, msg_type: type | None) -> Any: # pickled LCM: receivers unpickle Python objects, no type registration needed. from dimos.core.transport import pLCMTransport @@ -92,6 +102,7 @@ def _make_jpeg_shm(topic: str, msg_type: type | None) -> Any: _REGISTRY: dict[str, Callable[[str, type | None], Any]] = { "lcm": _make_lcm, "jpeg_lcm": _make_jpeg_lcm, + "h264_lcm": _make_h264_lcm, "plcm": _make_plcm, "pshm": _make_pshm, "shm": _make_shm, diff --git a/dimos/protocol/pubsub/test_registry.py b/dimos/protocol/pubsub/test_registry.py index 9d7796c2ce..70d8a8f073 100644 --- a/dimos/protocol/pubsub/test_registry.py +++ b/dimos/protocol/pubsub/test_registry.py @@ -17,6 +17,7 @@ import pytest from dimos.core.transport import ( + H264LcmTransport, JpegLcmTransport, JpegShmTransport, LCMTransport, @@ -35,7 +36,15 @@ def test_supported_protos_includes_known_set() -> None: """Registry exposes the canonical proto names.""" - assert set(supported_protos()) >= {"lcm", "jpeg_lcm", "plcm", "pshm", "shm", "jpeg_shm"} + assert set(supported_protos()) >= { + "lcm", + "jpeg_lcm", + "h264_lcm", + "plcm", + "pshm", + "shm", + "jpeg_shm", + } @pytest.mark.parametrize( @@ -43,6 +52,7 @@ def test_supported_protos_includes_known_set() -> None: [ ("lcm:/color_image", ("lcm", 
"/color_image", None)), ("jpeg_lcm:/color_image", ("jpeg_lcm", "/color_image", None)), + ("h264_lcm:/color_image", ("h264_lcm", "/color_image", None)), ("pshm:color_image", ("pshm", "color_image", None)), ("shm:foo/bar", ("shm", "foo/bar", None)), ( @@ -53,6 +63,10 @@ def test_supported_protos_includes_known_set() -> None: "jpeg_lcm:/color_image#sensor_msgs.Image", ("jpeg_lcm", "/color_image", "sensor_msgs.Image"), ), + ( + "h264_lcm:/color_image#sensor_msgs.Image", + ("h264_lcm", "/color_image", "sensor_msgs.Image"), + ), ], ) def test_parse_pubsub_uri_happy_paths(uri: str, expected: tuple[str, str, str | None]) -> None: @@ -92,6 +106,11 @@ def test_make_pubsub_transport_jpeg_lcm_uses_JpegLcmTransport() -> None: assert isinstance(t, JpegLcmTransport) +def test_make_pubsub_transport_h264_lcm_uses_H264LcmTransport() -> None: + t = make_pubsub_transport("h264_lcm:/color_image", msg_type=Image) + assert isinstance(t, H264LcmTransport) + + def test_make_pubsub_transport_plcm_uses_pLCMTransport() -> None: t = make_pubsub_transport("plcm:/anything") assert isinstance(t, pLCMTransport) diff --git a/dimos/protocol/video/__init__.py b/dimos/protocol/video/__init__.py new file mode 100644 index 0000000000..4452bdd191 --- /dev/null +++ b/dimos/protocol/video/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Video codec helpers.""" diff --git a/dimos/protocol/video/demo_h264_video_e2e.py b/dimos/protocol/video/demo_h264_video_e2e.py new file mode 100644 index 0000000000..168d7339d2 --- /dev/null +++ b/dimos/protocol/video/demo_h264_video_e2e.py @@ -0,0 +1,278 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Synthetic end-to-end H.264 image transport and memory2 storage demo.""" + +from __future__ import annotations + +import threading +import time + +import numpy as np + +from dimos.core.coordination.blueprints import autoconnect +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.core.transport import H264LcmTransport +from dimos.hardware.sensors.camera.module import CameraModule +from dimos.hardware.sensors.camera.webcam import Webcam +from dimos.memory2.module import OnExisting, Recorder +from dimos.memory2.store.sqlite import SqliteStore +from dimos.memory2.video.h264 import H264ImageStorageConfig +from dimos.msgs.sensor_msgs.Image import Image, ImageFormat +from dimos.protocol.pubsub.impl.h264_lcm import H264LCM +from dimos.protocol.video.h264 import H264Config +from dimos.utils.logging_config import setup_logger +from dimos.visualization.vis_module import vis_module + +logger = setup_logger() + + +class SyntheticVideoSourceConfig(ModuleConfig): + width: int = 160 + height: int = 120 + fps: float = 10.0 + frame_count: int = 90 + 
output_frame_id: str = "h264_e2e_camera" + seed: int = 7 + + +class SyntheticVideoSource(Module): + """Deterministic RGB image source for H.264 transport/storage QA.""" + + config: SyntheticVideoSourceConfig + color_image: Out[Image] + + _thread: threading.Thread | None = None + _stop_event: threading.Event | None = None + + @rpc + def start(self) -> None: + super().start() + self._stop_event = threading.Event() + self._thread = threading.Thread(target=self._publish_loop, daemon=True) + self._thread.start() + logger.info( + "Started synthetic H.264 video source: %sx%s @ %.2f FPS for %s frames", + self.config.width, + self.config.height, + self.config.fps, + self.config.frame_count, + ) + + @rpc + def stop(self) -> None: + if self._stop_event is not None: + self._stop_event.set() + if self._thread is not None: + self._thread.join(timeout=2.0) + self._thread = None + super().stop() + + def _publish_loop(self) -> None: + assert self._stop_event is not None + period = 1.0 / max(self.config.fps, 0.1) + next_publish = time.monotonic() + for seq in range(self.config.frame_count): + if self._stop_event.is_set(): + break + frame = self._make_frame(seq) + self.color_image.publish(frame) + next_publish += period + time.sleep(max(0.0, next_publish - time.monotonic())) + logger.info("Synthetic H.264 source finished publishing frames") + + def _make_frame(self, seq: int) -> Image: + yy, xx = np.indices((self.config.height, self.config.width), dtype=np.uint16) + base = (xx + (yy * 3) + (seq * 5) + self.config.seed) % 256 + data = np.stack( + (base, (base + 85) % 256, (base + 170) % 256), + axis=2, + ).astype(np.uint8) + return Image( + data=data, + format=ImageFormat.RGB, + frame_id=self.config.output_frame_id, + ts=time.time(), + ) + + +class H264E2ERecorder(Recorder): + """Recorder with a typed image input for the synthetic H.264 demo.""" + + color_image: In[Image] + + +class H264WebcamRecorder(Recorder): + """Recorder with a typed image input for webcam H.264 QA.""" + + 
color_image: In[Image] + + +class H264MemoryReplayConfig(ModuleConfig): + db_path: str = "webcam_h264.db" + speed: float = 1.0 + seek: float | None = None + duration: float | None = None + loop: bool = False + + +class H264MemoryReplay(Module): + """Replay a memory2 H.264 image stream as normal `Image` frames.""" + + config: H264MemoryReplayConfig + color_image: Out[Image] + + @rpc + def start(self) -> None: + super().start() + store = self.register_disposable(SqliteStore(path=self.config.db_path, must_exist=True)) + replay = store.replay( + speed=self.config.speed, + seek=self.config.seek, + duration=self.config.duration, + loop=self.config.loop, + ) + + def on_error(error: Exception) -> None: + logger.error("H.264 replay pipeline error: %s", error, exc_info=True) + + self.register_disposable( + replay.streams.color_image.observable().subscribe( + on_next=self.color_image.publish, + on_error=on_error, + ) + ) + + +class H264VideoProbe(Module): + """Probe decoded H.264 `Image` delivery and report QA status.""" + + color_image: In[Image] + + _lock: threading.Lock + _received: int + _last_ts: float | None + _dimensions: tuple[int, int] | None + _frame_id: str | None + _failures: list[str] + + @rpc + def start(self) -> None: + super().start() + self._lock = threading.Lock() + self._received = 0 + self._last_ts = None + self._dimensions = None + self._frame_id = None + self._failures = [] + self.color_image.subscribe(self._on_image) + + def _on_image(self, image: Image) -> None: + with self._lock: + if self._last_ts is not None and image.ts < self._last_ts: + self._failures.append(f"timestamp regressed: {image.ts} < {self._last_ts}") + dims = (image.width, image.height) + if self._dimensions is None: + self._dimensions = dims + elif self._dimensions != dims: + self._failures.append(f"dimension changed: {dims} != {self._dimensions}") + if self._frame_id is None: + self._frame_id = image.frame_id + elif self._frame_id != image.frame_id: + self._failures.append(f"frame_id 
changed: {image.frame_id} != {self._frame_id}") + self._last_ts = image.ts + self._received += 1 + + if self._received % 10 == 0: + logger.info("H.264 video probe received %s decoded frames", self._received) + + @rpc + def summary(self) -> str: + """Return decoded-frame QA status for the synthetic H.264 demo.""" + with self._lock: + status = "ok" if not self._failures else "failed" + return ( + f"status={status} received={self._received} " + f"dimensions={self._dimensions} frame_id={self._frame_id!r} " + f"last_ts={self._last_ts} failures={self._failures}" + ) + + +_h264_config = H264Config(bitrate=1_000_000, target_fps=10, keyframe_interval=15) +_webcam_h264_config = H264Config(bitrate=2_000_000, target_fps=15, keyframe_interval=30) + + +def _webcam() -> Webcam: + return Webcam(camera_index=0, width=640, height=480, fps=15.0) + + +demo_h264_video_e2e = autoconnect( + SyntheticVideoSource.blueprint(), + H264E2ERecorder.blueprint( + db_path="h264_video_e2e.db", + on_existing=OnExisting.OVERWRITE, + image_storage={ + "color_image": H264ImageStorageConfig(codec=_h264_config), + }, + ), + H264VideoProbe.blueprint(), +).transports( + { + ("color_image", Image): H264LcmTransport( + "/demo_h264_video_e2e/color_image", + Image, + config=_h264_config, + ) + } +) + + +demo_h264_webcam_record = autoconnect( + CameraModule.blueprint(hardware=_webcam, transform=None, frequency=15.0), + H264WebcamRecorder.blueprint( + db_path="webcam_h264.db", + on_existing=OnExisting.OVERWRITE, + image_storage={ + "color_image": H264ImageStorageConfig(codec=_webcam_h264_config), + }, + ), +).transports( + { + ("color_image", Image): H264LcmTransport( + "/demo_h264_webcam_record/color_image", + Image, + config=_webcam_h264_config, + ) + } +) + + +demo_h264_webcam_replay = autoconnect( + H264MemoryReplay.blueprint(db_path="webcam_h264.db"), + H264VideoProbe.blueprint(), + vis_module( + "rerun", + rerun_config={"pubsubs": [H264LCM(config=_webcam_h264_config)]}, + ), +).transports( + { + 
("color_image", Image): H264LcmTransport( + "/demo_h264_webcam_replay/color_image", + Image, + config=_webcam_h264_config, + ) + } +) diff --git a/dimos/protocol/video/h264.py b/dimos/protocol/video/h264.py new file mode 100644 index 0000000000..52303d45c6 --- /dev/null +++ b/dimos/protocol/video/h264.py @@ -0,0 +1,293 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from dataclasses import dataclass, field +from fractions import Fraction +from typing import TYPE_CHECKING, Protocol + +import numpy as np + +from dimos.msgs.sensor_msgs.Image import Image, ImageFormat +from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket + +if TYPE_CHECKING: + import av + + +class MissingVideoDependencyError(ImportError): + """Raised when H.264 support is selected without required video packages.""" + + +class UnsupportedVideoImageError(ValueError): + """Raised when an image cannot be represented by the H.264 adapter.""" + + +class VideoDecodeGapError(RuntimeError): + """Raised when a decoder cannot safely decode because GOP state is invalid.""" + + +@dataclass(frozen=True) +class H264Config: + """Configuration for opt-in H.264 image encoding.""" + + bitrate: int = 2_000_000 + target_fps: int = 30 + keyframe_interval: int = 30 + profile: str = "baseline" + preset: str = "veryfast" + tune: str = "zerolatency" + max_gop_frames: int = 30 + pixel_format: str = 
"yuv420p" + supported_formats: tuple[ImageFormat, ...] = field( + default_factory=lambda: (ImageFormat.RGB, ImageFormat.BGR, ImageFormat.GRAY) + ) + + def __post_init__(self) -> None: + if self.bitrate <= 0: + raise ValueError("bitrate must be positive") + if self.target_fps <= 0: + raise ValueError("target_fps must be positive") + if self.keyframe_interval <= 0: + raise ValueError("keyframe_interval must be positive") + if self.max_gop_frames <= 0: + raise ValueError("max_gop_frames must be positive") + + +class H264CodecAdapter(Protocol): + """DimOS-facing codec adapter; hides aiortc/RTP details from public APIs.""" + + def encode_image(self, image: Image, *, force_keyframe: bool) -> tuple[bytes, int]: ... + + def decode_packet(self, packet: VideoPacket) -> Image: ... + + +@dataclass(frozen=True) +class H264AccessUnit: + """Complete Annex B access unit for one source frame.""" + + data: bytes + + @classmethod + def from_rtp_payloads( + cls, + payloads: Sequence[bytes], + depayload: Callable[[bytes], bytes], + ) -> H264AccessUnit: + """Assemble RTP-sized H.264 payloads into one Annex B access unit.""" + + if not payloads: + raise ValueError("H.264 encoder returned no payloads") + data = b"".join(depayload(payload) for payload in payloads) + if not data.startswith((b"\x00\x00\x01", b"\x00\x00\x00\x01")): + raise ValueError("H.264 access unit is not Annex B byte-stream data") + return cls(data=data) + + +def ensure_supported_image(image: Image, config: H264Config) -> None: + """Validate the first-version H.264 image input contract.""" + + if image.format not in config.supported_formats: + supported = ", ".join(fmt.value for fmt in config.supported_formats) + raise UnsupportedVideoImageError( + f"H.264 image encoding supports {supported}; got {image.format.value}" + ) + if image.dtype != np.dtype(np.uint8): + raise UnsupportedVideoImageError( + f"H.264 image encoding requires uint8 data; got {image.dtype}" + ) + if image.channels not in (1, 3): + raise 
UnsupportedVideoImageError( + f"H.264 image encoding requires 1 or 3 channels; got {image.channels}" + ) + + +class AiortcH264Codec: + """Small adapter around aiortc's H.264 encoder/decoder internals.""" + + def __init__(self, config: H264Config | None = None) -> None: + self.config = config or H264Config() + try: + from aiortc.codecs.h264 import ( + H264Decoder as AiortcDecoder, + H264Encoder as AiortcEncoder, + h264_depayload, + ) + from aiortc.jitterbuffer import JitterFrame + import av + except ImportError as exc: + raise MissingVideoDependencyError( + "H.264 image mode requires aiortc, PyAV, FFmpeg, and H.264 codec support" + ) from exc + + self._av = av + self._jitter_frame_type = JitterFrame + self._depayload = h264_depayload + self._encoder = AiortcEncoder() + self._decoder = AiortcDecoder() + self._frame_index = 0 + self._time_base = Fraction(1, self.config.target_fps) + if hasattr(self._encoder, "target_bitrate"): + self._encoder.target_bitrate = self.config.bitrate + + def encode_image(self, image: Image, *, force_keyframe: bool) -> tuple[bytes, int]: + ensure_supported_image(image, self.config) + frame = self._to_video_frame(image) + payloads, pts = self._encoder.encode(frame, force_keyframe=force_keyframe) + access_unit = H264AccessUnit.from_rtp_payloads(payloads, self._depayload) + return access_unit.data, int(pts) + + def decode_packet(self, packet: VideoPacket) -> Image: + frame = self._jitter_frame_type(data=packet.data, timestamp=packet.pts) + decoded_frames = self._decoder.decode(frame) + if not decoded_frames: + raise VideoDecodeGapError("H.264 decoder produced no frame") + return self._from_video_frame(decoded_frames[0], packet) + + def _to_video_frame(self, image: Image) -> av.VideoFrame: + fmt = _av_input_format(image.format) + frame = self._av.VideoFrame.from_ndarray(np.ascontiguousarray(image.data), format=fmt) + frame.pts = self._frame_index + frame.time_base = self._time_base + self._frame_index += 1 + return frame + + @staticmethod + def 
_from_video_frame(frame: av.VideoFrame, packet: VideoPacket) -> Image: + image_format = ImageFormat(packet.format) + arr = frame.to_ndarray(format=_av_input_format(image_format)) + return Image(data=arr, format=image_format, frame_id=packet.frame_id, ts=packet.ts) + + +class H264Encoder: + """Encode a normal DimOS Image stream into per-frame H.264 packets.""" + + def __init__( + self, + config: H264Config | None = None, + *, + codec: H264CodecAdapter | None = None, + ) -> None: + self.config = config or H264Config() + self._codec = codec or AiortcH264Codec(self.config) + self._seq = 0 + self._keyframe_seq = -1 + + def encode(self, image: Image, *, force_keyframe: bool = False) -> VideoPacket: + ensure_supported_image(image, self.config) + is_keyframe = self._should_force_keyframe(force_keyframe) + access_unit, pts = self._codec.encode_image(image, force_keyframe=is_keyframe) + if is_keyframe: + self._keyframe_seq = self._seq + packet = VideoPacket( + seq=self._seq, + ts=image.ts, + frame_id=image.frame_id, + width=image.width, + height=image.height, + format=image.format.value, + codec="h264", + bitstream="annex_b", + is_keyframe=is_keyframe, + keyframe_seq=self._keyframe_seq, + pts=pts, + data=access_unit, + ) + self._seq += 1 + return packet + + def _should_force_keyframe(self, requested: bool) -> bool: + if requested or self._seq == 0 or self._keyframe_seq < 0: + return True + since_keyframe = self._seq - self._keyframe_seq + return since_keyframe >= min(self.config.keyframe_interval, self.config.max_gop_frames) + + +class GopBuffer: + """Track H.264 GOP validity across a packet stream.""" + + def __init__(self) -> None: + self.expected_seq: int | None = None + self.keyframe_seq: int | None = None + self.valid = False + + def accept(self, packet: VideoPacket) -> bool: + """Return True when the packet can be safely decoded.""" + + if self.expected_seq is not None and packet.seq != self.expected_seq: + self.valid = False + self.expected_seq = packet.seq + 1 + + if 
packet.is_keyframe: + self.keyframe_seq = packet.seq + self.valid = True + return True + + if not self.valid: + return False + if self.keyframe_seq is None or packet.keyframe_seq != self.keyframe_seq: + self.valid = False + return False + return True + + +class H264Decoder: + """Decode per-frame H.264 packets into normal DimOS Images.""" + + def __init__( + self, + config: H264Config | None = None, + *, + codec: H264CodecAdapter | None = None, + gop_buffer: GopBuffer | None = None, + ) -> None: + self.config = config or H264Config() + self._codec = codec or AiortcH264Codec(self.config) + self._gop_buffer = gop_buffer or GopBuffer() + + def decode(self, packet: VideoPacket) -> Image: + if not self._gop_buffer.accept(packet): + raise VideoDecodeGapError( + f"Cannot decode H.264 packet seq={packet.seq}; waiting for next keyframe" + ) + return self._codec.decode_packet(packet) + + +def _av_input_format(format: ImageFormat) -> str: + match format: + case ImageFormat.RGB: + return "rgb24" + case ImageFormat.BGR: + return "bgr24" + case ImageFormat.GRAY: + return "gray" + case _: + raise UnsupportedVideoImageError(f"Unsupported H.264 image format: {format.value}") + + +__all__ = [ + "AiortcH264Codec", + "GopBuffer", + "H264AccessUnit", + "H264CodecAdapter", + "H264Config", + "H264Decoder", + "H264Encoder", + "MissingVideoDependencyError", + "UnsupportedVideoImageError", + "VideoDecodeGapError", + "ensure_supported_image", +] diff --git a/dimos/protocol/video/test_h264.py b/dimos/protocol/video/test_h264.py new file mode 100644 index 0000000000..6f3825db7f --- /dev/null +++ b/dimos/protocol/video/test_h264.py @@ -0,0 +1,159 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import builtins +from dataclasses import dataclass + +import numpy as np +import pytest + +from dimos.msgs.sensor_msgs.Image import Image, ImageFormat +from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket +from dimos.protocol.video.h264 import ( + AiortcH264Codec, + GopBuffer, + H264AccessUnit, + H264Config, + H264Decoder, + H264Encoder, + MissingVideoDependencyError, + UnsupportedVideoImageError, + VideoDecodeGapError, +) + + +@dataclass +class FakeCodec: + encoded_force_keyframes: list[bool] + decoded_packets: list[int] + + def encode_image(self, image: Image, *, force_keyframe: bool) -> tuple[bytes, int]: + self.encoded_force_keyframes.append(force_keyframe) + if force_keyframe: + return b"\x00\x00\x00\x01\x67sps\x00\x00\x00\x01\x68pps\x00\x00\x00\x01\x65idr", 90 + return b"\x00\x00\x00\x01\x41delta", 180 + + def decode_packet(self, packet: VideoPacket) -> Image: + self.decoded_packets.append(packet.seq) + return Image( + data=np.zeros((packet.height, packet.width, 3), dtype=np.uint8), + format=ImageFormat(packet.format), + frame_id=packet.frame_id, + ts=packet.ts, + ) + + +def _image(format: ImageFormat = ImageFormat.RGB, dtype: np.dtype = np.dtype(np.uint8)) -> Image: + return Image( + data=np.zeros((4, 6, 3), dtype=dtype), + format=format, + frame_id="cam", + ts=123.0, + ) + + +def _packet(seq: int, *, key: bool, keyframe_seq: int | None = None) -> VideoPacket: + return VideoPacket( + seq=seq, + ts=123.0 + seq, + frame_id="cam", + width=6, + height=4, + format=ImageFormat.RGB.value, + codec="h264", + 
bitstream="annex_b", + is_keyframe=key, + keyframe_seq=seq if key else (0 if keyframe_seq is None else keyframe_seq), + pts=seq * 90, + data=b"\x00\x00\x00\x01\x65" if key else b"\x00\x00\x00\x01\x41", + ) + + +def test_video_packet_serializes_complete_access_unit() -> None: + packet = _packet(0, key=True) + + decoded = VideoPacket.lcm_decode(packet.lcm_encode()) + + assert decoded == packet + assert decoded.codec == "h264" + assert decoded.bitstream == "annex_b" + assert decoded.data.startswith(b"\x00\x00\x00\x01") + + +def test_access_unit_assembles_depayloaded_annex_b_fragments() -> None: + unit = H264AccessUnit.from_rtp_payloads( + [b"payload-a", b"payload-b"], + lambda payload: b"\x00\x00\x00\x01" + payload, + ) + + assert unit.data == b"\x00\x00\x00\x01payload-a\x00\x00\x00\x01payload-b" + + +def test_encoder_emits_keyframe_metadata_and_periodic_keyframes() -> None: + codec = FakeCodec(encoded_force_keyframes=[], decoded_packets=[]) + encoder = H264Encoder(H264Config(keyframe_interval=2, max_gop_frames=2), codec=codec) + + p0 = encoder.encode(_image()) + p1 = encoder.encode(_image()) + p2 = encoder.encode(_image()) + + assert [p0.seq, p1.seq, p2.seq] == [0, 1, 2] + assert [p0.is_keyframe, p1.is_keyframe, p2.is_keyframe] == [True, False, True] + assert [p0.keyframe_seq, p1.keyframe_seq, p2.keyframe_seq] == [0, 0, 2] + assert codec.encoded_force_keyframes == [True, False, True] + assert b"\x67" in p0.data and b"\x68" in p0.data + + +def test_gop_buffer_suppresses_delta_after_sequence_gap_until_keyframe() -> None: + codec = FakeCodec(encoded_force_keyframes=[], decoded_packets=[]) + decoder = H264Decoder(codec=codec, gop_buffer=GopBuffer()) + + assert decoder.decode(_packet(0, key=True)).frame_id == "cam" + assert decoder.decode(_packet(1, key=False, keyframe_seq=0)).frame_id == "cam" + + with pytest.raises(VideoDecodeGapError): + decoder.decode(_packet(3, key=False, keyframe_seq=0)) + with pytest.raises(VideoDecodeGapError): + decoder.decode(_packet(4, 
key=False, keyframe_seq=0)) + + assert decoder.decode(_packet(5, key=True)).frame_id == "cam" + assert codec.decoded_packets == [0, 1, 5] + + +def test_unsupported_image_format_and_dtype_fail_explicitly() -> None: + codec = FakeCodec(encoded_force_keyframes=[], decoded_packets=[]) + encoder = H264Encoder(codec=codec) + + with pytest.raises(UnsupportedVideoImageError, match="RGBA"): + encoder.encode(_image(ImageFormat.RGBA)) + with pytest.raises(UnsupportedVideoImageError, match="uint8"): + encoder.encode(_image(dtype=np.dtype(np.uint16))) + + +def test_missing_aiortc_dependencies_raise_actionable_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + real_import = builtins.__import__ + + def fake_import(name: str, *args: object, **kwargs: object) -> object: + if name == "av" or name.startswith("aiortc"): + raise ImportError(name) + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + + with pytest.raises(MissingVideoDependencyError, match="H.264 image mode requires"): + AiortcH264Codec() diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index efd0f563ba..58649b6ed8 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -50,6 +50,9 @@ "demo-error-on-name-conflicts": "dimos.robot.unitree.demo_error_on_name_conflicts:demo_error_on_name_conflicts", "demo-google-maps-skill": "dimos.agents.skills.demo_google_maps_skill:demo_google_maps_skill", "demo-gps-nav": "dimos.agents.skills.demo_gps_nav:demo_gps_nav", + "demo-h264-video-e2e": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_video_e2e", + "demo-h264-webcam-record": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_webcam_record", + "demo-h264-webcam-replay": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_webcam_replay", "demo-mcp-stress-test": "dimos.core.tests.stress_test_blueprint:demo_mcp_stress_test", "demo-object-scene-registration": 
"dimos.perception.demo_object_scene_registration:demo_object_scene_registration", "demo-osm": "dimos.mapping.osm.demo_osm:demo_osm", @@ -165,6 +168,10 @@ "gps-nav-skill-container": "dimos.agents.skills.gps_nav_skill.GpsNavSkillContainer", "grasping-module": "dimos.manipulation.grasping.grasping.GraspingModule", "gstreamer-camera-module": "dimos.hardware.sensors.camera.gstreamer.gstreamer_camera.GstreamerCameraModule", + "h264-e2-e-recorder": "dimos.protocol.video.demo_h264_video_e2e.H264E2ERecorder", + "h264-memory-replay": "dimos.protocol.video.demo_h264_video_e2e.H264MemoryReplay", + "h264-video-probe": "dimos.protocol.video.demo_h264_video_e2e.H264VideoProbe", + "h264-webcam-recorder": "dimos.protocol.video.demo_h264_video_e2e.H264WebcamRecorder", "joint-trajectory-controller": "dimos.manipulation.control.trajectory_controller.joint_trajectory_controller.JointTrajectoryController", "joystick-module": "dimos.robot.unitree.b1.joystick_module.JoystickModule", "keyboard-teleop": "dimos.robot.unitree.keyboard_teleop.KeyboardTeleop", @@ -215,6 +222,7 @@ "simple-planner": "dimos.navigation.nav_stack.modules.simple_planner.simple_planner.SimplePlanner", "spatial-memory": "dimos.perception.spatial_perception.SpatialMemory", "speak-skill": "dimos.agents.skills.speak_skill.SpeakSkill", + "synthetic-video-source": "dimos.protocol.video.demo_h264_video_e2e.SyntheticVideoSource", "tare-planner": "dimos.navigation.nav_stack.modules.tare_planner.tare_planner.TarePlanner", "temporal-memory": "dimos.perception.experimental.temporal_memory.temporal_memory.TemporalMemory", "terrain-analysis": "dimos.navigation.nav_stack.modules.terrain_analysis.terrain_analysis.TerrainAnalysis", diff --git a/docs/capabilities/memory/index.md b/docs/capabilities/memory/index.md index c2fd3a35be..739ab81f28 100644 --- a/docs/capabilities/memory/index.md +++ b/docs/capabilities/memory/index.md @@ -206,3 +206,79 @@ plot_mosaic(matches.map(lambda obs: obs.data).to_list(), "assets/grid.png") ``` 
![output](assets/grid.png) + +## H.264 image storage + +memory2 stores `Image` streams with the default JPEG image codec unless a stream +opts into H.264. Use H.264 storage for high-rate camera streams when disk usage +matters and frame-to-frame compression is worth the dependency cost. + +```python skip +from dimos.memory2.store.sqlite import SqliteStore +from dimos.memory2.video.h264 import H264ImageStorageConfig +from dimos.msgs.sensor_msgs.Image import Image +from dimos.protocol.video.h264 import H264Config + +store = SqliteStore(path="robot_video.db") +color = store.stream( + "color_image", + Image, + image_storage=H264ImageStorageConfig( + codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + ), +) +``` + +Recorders can configure the same setting per input stream: + +```python skip +from dimos.memory2.module import Recorder +from dimos.memory2.video.h264 import H264ImageStorageConfig +from dimos.protocol.video.h264 import H264Config + +recorder = Recorder.blueprint( + db_path="robot_video.db", + image_storage={ + "color_image": H264ImageStorageConfig( + codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + ) + }, +) +``` + +H.264 storage keeps the normal memory2 shape: one observation row per source +frame. The blob for that observation stores one serialized video packet whose +payload is a complete H.264 Annex B access unit, not individual RTP fragments. +The store also writes GOP metadata so lazy decode and replay can start at the +nearest prior keyframe. + +Metadata queries do not decode pixels. You can inspect timestamps, poses, tags, +frame ids, and dimensions without paying decode cost. Accessing `obs.data` +decodes lazily from the nearest usable keyframe through the requested frame and +returns a normal `Image`. Replay emits decoded `Image` values in timestamp order. + +H.264 storage currently supports uint8 RGB, BGR, and grayscale images. 
It raises +an explicit error for depth images, 16-bit images, alpha formats, and other +unsupported pixel layouts. The default `store.stream("color_image", Image)` path +continues to use JPEG. + +### Synthetic H.264 QA blueprint + +The `demo-h264-video-e2e` blueprint exercises both live H.264 LCM transport and +H.264 memory2 storage without a robot or physical camera: + +```bash skip +dimos run demo-h264-video-e2e --daemon +dimos log -f +``` + +The blueprint publishes deterministic synthetic RGB frames, records them to +`h264_video_e2e.db`, and runs a probe that logs decoded-frame count, dimensions, +timestamp monotonicity, frame id stability, and validation failures. Use it after +codec or storage changes to inspect: + +- logs from the source, recorder, and probe; +- memory2 metadata queries that do not touch `obs.data`; +- lazy `obs.data` decode for both keyframe and mid-GOP observations; +- replay of the recorded stream; and +- sequence-gap behavior, if you inject packet loss in the transport tests. diff --git a/docs/coding-agents/style.md b/docs/coding-agents/style.md index 3e13faae9b..86ef7058e1 100644 --- a/docs/coding-agents/style.md +++ b/docs/coding-agents/style.md @@ -49,3 +49,15 @@ from dimos.memory2.stream import Stream from dimos.memory2.store.base import Store from dimos.memory2.stream import Stream ``` + +## H.264 image packet shape + +When editing H.264 image transport or memory2 storage, keep the public module +contract as `Out[Image]` and `In[Image]`. Do not expose RTP fragments to module +authors or memory2 observations. + +For LCM, DDS, and memory2 storage, each encoded packet must contain all H.264 NAL +units for exactly one source frame as one Annex B access unit. Store one memory2 +observation per source frame. P-frames still depend on earlier GOP state, so +decode from a valid keyframe and suppress output after sequence gaps until the +next keyframe. 
diff --git a/docs/development/testing.md b/docs/development/testing.md index 40e429797c..fd7631635b 100644 --- a/docs/development/testing.md +++ b/docs/development/testing.md @@ -63,6 +63,37 @@ When writing or debugging a specific self-hosted test, override `-m` yourself to pytest -m self_hosted dimos/path/to/test_something.py ``` +### H.264 image transport and storage tests + +The H.264 unit tests use fake codec adapters where possible, so they run in the +default suite without requiring FFmpeg/libx264. Run the focused tests after +changing video packet shape, lazy `Image` behavior, H.264 transport, memory2 +storage, or the demo blueprint: + +```bash +uv run pytest dimos/protocol/video/test_h264.py dimos/msgs/sensor_msgs/test_image.py -q +uv run pytest dimos/protocol/pubsub/impl/test_h264_lcm.py dimos/protocol/pubsub/test_registry.py -q +uv run pytest dimos/memory2/video/test_h264_storage.py -q +CI=1 uv run pytest dimos/robot/test_all_blueprints_generation.py -q +``` + +The runtime H.264 path uses `aiortc`, PyAV, FFmpeg, and libx264. If a test or +manual run instantiates the real codec and those dependencies are missing, H.264 +should fail with an actionable dependency error. Keep fake-adapter unit tests in +place so the default suite still covers packet semantics, GOP handling, and +memory2 behavior. + +When you add or rename a runnable demo blueprint, regenerate +`dimos/robot/all_blueprints.py` with: + +```bash +uv run pytest dimos/robot/test_all_blueprints_generation.py +``` + +Locally, that command may update `all_blueprints.py` and then fail as a reminder +to commit the generated file. Re-run it with `CI=1` after the file is current to +verify generation is clean.
+ ## Testing on a fresh Ubuntu install CI tests dimos with pre-built images and cached deps, so it can't catch gaps diff --git a/docs/usage/blueprints.md b/docs/usage/blueprints.md index bceb356cd7..237ade1e14 100644 --- a/docs/usage/blueprints.md +++ b/docs/usage/blueprints.md @@ -163,6 +163,28 @@ base_blueprint = base_blueprint.transports({ Note: `expanded_blueprint` does not get the transport overrides because it's created from the initial value of `base_blueprint`, not the second. +For compressed camera streams, opt into H.264 on the image edge while keeping the +module stream type as `Image`: + +```python skip +from dimos.core.transport import H264LcmTransport +from dimos.msgs.sensor_msgs.Image import Image +from dimos.protocol.video.h264 import H264Config + +camera_blueprint = camera_blueprint.transports( + { + ("color_image", Image): H264LcmTransport( + "/camera/color_image", + Image, + config=H264Config(bitrate=2_000_000, keyframe_interval=30), + ) + } +) +``` + +`H264LcmTransport` publishes one complete H.264 Annex B packet per source frame. +Downstream modules still receive decoded `Image` values. + ## Remapping connections Sometimes you need to rename a connection to match what other modules expect. You can use `remappings` to rename module connections: diff --git a/docs/usage/transports/index.md b/docs/usage/transports/index.md index 5415aab1d0..74dd14674b 100644 --- a/docs/usage/transports/index.md +++ b/docs/usage/transports/index.md @@ -114,6 +114,50 @@ ros = nav.transports( ) ``` +### H.264 image transport + +Use `H264LcmTransport` when a high-rate `Image` stream needs video compression +over LCM. The module API stays the same: publishers still call +`Out[Image].publish(image)`, and subscribers still receive `Image` values. The +transport encodes each source frame as one H.264 Annex B access unit on the wire +and decodes it at the subscriber. 
+ +```python skip +from dimos.core.transport import H264LcmTransport +from dimos.msgs.sensor_msgs.Image import Image +from dimos.protocol.video.h264 import H264Config + +blueprint = blueprint.transports( + { + ("color_image", Image): H264LcmTransport( + "/camera/color_image", + Image, + config=H264Config( + bitrate=2_000_000, + target_fps=30, + keyframe_interval=30, + ), + ) + } +) +``` + +H.264 transport is opt-in. The default image paths remain unchanged: normal LCM +uses the `Image` LCM encoding, and memory2 still stores images with the default +JPEG codec unless configured otherwise. + +H.264 is stateful. Keyframes bootstrap late subscribers and enable recovery after packet +loss. If an LCM subscriber detects a sequence gap in the middle of a GOP, it +suppresses decoded output until the next keyframe. Keyframes include decoder +parameter data, such as SPS/PPS, so a new subscriber can start decoding at a +keyframe. + +The first H.264 image path supports uint8 RGB, BGR, and grayscale images. It +raises an explicit error for depth, 16-bit, alpha, or other unsupported image +formats instead of silently converting pixels. Selecting H.264 requires the video +dependencies used by `aiortc`, PyAV, FFmpeg, and libx264; projects that do not +select H.264 do not need those dependencies at runtime.
+ --- ## Using transports with modules @@ -473,6 +517,7 @@ python -m pytest -svm tool -k "not bytes" dimos/protocol/pubsub/benchmark/test_b | `Memory` | Testing only, single process | No | No | Minimal reference impl | | `SharedMemory` | Multi-process on same machine | Yes | No | Highest throughput (IPC) | | `LCM` | Robot LAN broadcast (UDP multicast) | Yes | Yes | Best-effort; can drop packets on LAN | +| `H264LCM` | Opt-in compressed `Image` streams | Yes | Yes | H.264 Annex B access units over LCM | | `Redis` | Network pubsub via Redis server | Yes | Yes | Central broker; adds hop | | `ROS` | ROS 2 topic communication | Yes | Yes | Integrates with RViz/ROS tools | | `DDS` | Cyclone DDS without ROS (WIP) | Yes | Yes | WIP | diff --git a/openspec/changes/add-h264-codec-mem2-storage/.openspec.yaml b/openspec/changes/add-h264-codec-mem2-storage/.openspec.yaml new file mode 100644 index 0000000000..fb1ec77bfd --- /dev/null +++ b/openspec/changes/add-h264-codec-mem2-storage/.openspec.yaml @@ -0,0 +1,2 @@ +schema: dimos-capability +created: 2026-06-10 diff --git a/openspec/changes/add-h264-codec-mem2-storage/design.md b/openspec/changes/add-h264-codec-mem2-storage/design.md new file mode 100644 index 0000000000..7919a641db --- /dev/null +++ b/openspec/changes/add-h264-codec-mem2-storage/design.md @@ -0,0 +1,452 @@ +## Context + +DimOS transports currently move typed stream payloads and are mostly stateless per message. Image-specific compression already exists as JPEG transport adapters: `JpegLcmTransport` and `JpegShmTransport` wrap a carrier with an image encoder/decoder while subscribers still receive `Image` objects. This is the right precedent for H.264, but H.264 differs because decoding depends on GOP state rather than one independent compressed frame. + +memory2 currently stores image observations through the normal `Backend` path. `codec_for(Image)` selects `JpegCodec`, which is stateless per row. 
`Observation` already supports lazy payloads through `_UNLOADED` and `_loader`, which is the correct surface for memory2 H.264 decode. The current `Codec.encode(value) -> bytes` contract is not expressive enough for H.264 writes because the encoder is stateful, keyframes are periodic, and later packets depend on earlier packets. + +The design therefore introduces H.264 as an image codec layer that can sit above multiple carriers, not as a replacement for LCM, DDS, ROS, SHM, or WebRTC. Carrier adapters move compressed video packets between machines; endpoint adapters decode those packets back into `Image` objects for normal modules. + +The aiortc project is the preferred implementation source for the video codec layer now that DimOS already depends on it for WebRTC-related functionality. It implements Python WebRTC/ORTC video send/receive paths, including H.264 encode/decode, H.264 RTP packetization/depacketization, PyAV-backed `libx264` encoding, Baseline/zerolatency-style settings, and WebRTC loss-recovery mechanisms such as NACK/PLI. DimOS should directly wrap aiortc's H.264 encoder/decoder where practical, while converting aiortc RTP payload details into a Foxglove-style complete Annex B access unit before exposing packets to non-WebRTC carriers or memory2. + +Foxglove's `CompressedVideo` design is the right compatibility target for DimOS packet shape: one message contains the compressed video data needed for exactly one source frame, H.264 data is Annex B, B-frames are not supported, and every IDR keyframe includes parameter sets such as SPS/PPS. This does not remove the need for keyframes: P-frames still depend on prior decoded reference frames. It does remove the need for DimOS memory2/LCM/DDS consumers to reason about individual RTP fragments. + +## Goals / Non-Goals + +**Goals:** + +- Preserve `Out[Image]` and `In[Image]` as the public module stream contract. 
+- Add a carrier-neutral per-frame `VideoPacket` representation for complete H.264 Annex B access units, matching Foxglove's one compressed-video message per encoder input frame model. +- Add stateful H.264 encoder/decoder components with deterministic GOP/keyframe behavior. +- Add an LCM carrier adapter first, modeled after `JpegLcmTransport`. +- Add memory2 H.264 storage that keeps one observation per frame, stores one packet blob per frame, and lazily reconstructs `obs.data` as `Image`. +- Provide top-level opt-in configuration for live transports and recorder/store storage. +- Use aiortc directly for H.264 encode/decode through a DimOS adapter, and use aiortc public WebRTC APIs for the future WebRTC carrier. +- Keep JPEG transport and JPEG memory2 storage as defaults. + +**Non-Goals:** + +- Replacing underlying carriers such as LCM, DDS, ROS, SHM, or WebRTC. +- Making every transport support H.264 in the first implementation; DDS/SHM/WebRTC carriers are follow-ups. +- Exposing aiortc RTP payload fragments or WebRTC session state as public DimOS module, transport, or memory2 storage APIs. +- Supporting depth images, 16-bit images, alpha formats, or arbitrary pixel formats in the first implementation. +- Making `codec_for(Image)` return H.264 by default. +- Guaranteeing random access without decoding from a prior keyframe. +- Exposing video packets to normal module authors as the default stream type. + +## DimOS Architecture + +### Layering + +The design has three layers: + +```text +Module API layer + Out[Image] / In[Image] + │ + ▼ +Codec layer + H264Encoder / H264Decoder / GopBuffer + Image ⇄ VideoPacket + │ + ▼ +Carrier layer + H264LcmTransport first + DDS / SHM / WebRTC later +``` + +The carrier still performs inter-process or inter-machine communication. The H.264 layer only changes how image payloads are encoded before carrier publish and decoded after carrier receive. 
+ +### Proposed classes and locations + +Core packet and codec classes: + +- `dimos/msgs/sensor_msgs/VideoPacket.py` + - Carrier-neutral message for one encoded video frame/access unit. + - Fields: `seq`, `ts`, `frame_id`, `width`, `height`, `format`, `codec`, `bitstream`, `is_keyframe`, `keyframe_seq`, `pts`, `data`. + - First supported `codec`: `h264`. + - First supported `bitstream`: Annex B complete access unit for exactly one source frame, aligned with Foxglove `CompressedVideo` expectations: for every full-frame encoder input call, DimOS creates one `VideoPacket` containing all NAL units emitted for that input frame. + - A `VideoPacket` is a complete encoded-frame packet, not necessarily an independently decodable image. Keyframe packets must contain enough decoder bootstrap data for late join and random access, including SPS/PPS on every IDR; delta-frame packets require prior decoded GOP state. + +- `dimos/protocol/video/h264.py` + - `H264Config`: bitrate, target fps, keyframe interval, profile, preset/tune, max GOP frames, pixel format. + - `AiortcH264Codec`: small DimOS adapter around `aiortc.codecs.h264.H264Encoder`, `aiortc.codecs.h264.H264Decoder`, and `aiortc.codecs.h264.h264_depayload`. + - `H264Encoder`: DimOS-facing wrapper that runs at the publishing or recording endpoint and converts `Image` to ordered `VideoPacket` values using aiortc. + - `H264Decoder`: DimOS-facing wrapper that runs at the subscribing or replay/decode endpoint and converts ordered `VideoPacket` values to decoded `Image` values using aiortc. + - `GopBuffer`: tracks the latest keyframe and following delta packets, detects sequence gaps, and suppresses output until the next keyframe after a gap. + - `H264AccessUnit`: helper that converts aiortc RTP payload batches into a complete Annex B access unit before building a `VideoPacket`. + - `UnsupportedVideoImageError` / `VideoDecodeGapError`: explicit errors for unsupported image formats and unusable GOP state. 
+ +Implementation dependency: + +- aiortc's `src/aiortc/codecs/h264.py` provides the mechanics DimOS should call rather than reimplement initially: `H264Encoder.encode()` uses PyAV `libx264`, forces keyframes by setting frame picture type, emits Baseline/zerolatency H.264, and returns RTP-sized H.264 payloads plus timestamp; `h264_depayload()` converts RTP H.264 payloads back to Annex B bytes; `H264Decoder.decode()` decodes a depayloaded `JitterFrame` through PyAV. +- DimOS should assemble the aiortc payloads for one encoded source frame into a single Annex B `VideoPacket.data` value before publication/storage. This packet carries every NAL unit emitted for that encoder input frame, but only IDR/keyframe packets are expected to be independently bootstrappable. WebRTC carriers may keep aiortc RTP packetization internally, but LCM/DDS/memory2 should exchange complete access units. +- The adapter should avoid leaking aiortc classes such as `JitterFrame` and RTP payload descriptors into DimOS public APIs. If future aiortc versions change these codec internals, only `AiortcH264Codec` should need adjustment. + +Image lazy data support: + +- `dimos/msgs/sensor_msgs/Image.py` + - Add an explicit lazy pixel path mirroring `Observation`: metadata fields remain available, while `data` materializes pixels on access. + - `height`, `width`, `format`, `frame_id`, and `ts` must be available without forcing decode. + - Existing eager construction remains valid. + - This is needed for transport subscribers that inspect metadata or keep only the latest frame without always decoding pixels. + +LCM carrier classes: + +- `dimos/protocol/pubsub/impl/h264_lcm.py` + - `H264LCM`: LCM pubsub encoder/decoder that publishes serialized `VideoPacket` values on the wire and returns `Image` objects to subscribers. + - Holds one encoder per publisher instance and one `GopBuffer`/decoder per subscriber instance. 
+ +- `dimos/core/transport.py` + - `H264LcmTransport`: mirrors `JpegLcmTransport` and instantiates `H264LCM` lazily to avoid importing video dependencies at normal startup. + - Reduces to `(H264LcmTransport, (topic, type, config))` for worker serialization. + +WebRTC carrier classes, later: + +- `dimos/protocol/pubsub/impl/webrtc_video.py` + - Uses aiortc public APIs such as `RTCPeerConnection`, media tracks, RTP senders/receivers, and RTCP feedback. + - Lets WebRTC own packetization, jitter buffering, retransmission/NACK, PLI keyframe requests, bitrate adaptation, and NAT traversal. + - Bridges between DimOS `Image` and WebRTC `VideoFrame` at the module boundary. + - Optionally exports encoded packets into the DimOS `VideoPacket` format for memory2 recording when aiortc exposes a clean encoded-frame hook; otherwise the first WebRTC integration may decode to `Image` and let memory2 re-encode. + - If exporting, convert WebRTC RTP payloads into complete Annex B access units first; do not persist raw RTP fragments. + +memory2 storage classes: + +- `dimos/memory2/video/h264.py` + - `H264ImageStorageConfig`: mode/config object for opt-in memory2 H.264 image storage. + - `H264ImageBackend`: image-specific backend or payload strategy that owns encoder state and writes one observation row plus one `VideoPacket` blob per frame. + - `H264FrameIndexStore`: creates and queries a standalone GOP index table. + - `H264ObservationLoader`: reconstructs a requested frame by loading the nearest keyframe packet and ordered delta packets through the requested observation. + - `H264ReplayDecodeSession`: shares decoder state during sequential replay so adjacent frames decode once. + +Store/recorder integration: + +- `dimos/memory2/store/sqlite.py` + - Recognize image storage config when creating a stream. + - Route `Image` streams with `mode="h264"` to the H.264 image backend. + - Persist storage config in `_streams` so reopening the database selects the right loader. 
+ +- `dimos/memory2/module.py` + - Add recorder-level per-stream image storage configuration. + - Recorder still subscribes to `In[Image]`; storage mode controls how incoming images are persisted. + +### Where components run + +Live LCM path across machines: + +```text +Source machine / worker process + module Out[Image] + └─ H264LcmTransport.broadcast() + └─ H264Encoder encodes Image -> VideoPacket + └─ LCM publishes packet bytes + +Network / LCM multicast + carries VideoPacket bytes, not numpy pixels + +Subscriber machine / worker process + H264LcmTransport.subscribe() + └─ LCM receives packet bytes + └─ GopBuffer validates seq/keyframe state + └─ H264Decoder produces Image or lazy Image + └─ module In[Image] callback +``` + +memory2 recording path: + +```text +Recorder module process + In[Image] receives normal Image + └─ stream.append(Image) + └─ H264ImageBackend owns encoder state + ├─ observation table row: ts / pose / tags + ├─ blob row: serialized VideoPacket with complete Annex B access unit + └─ h264 frame index: seq / keyframe row / pts / format +``` + +memory2 replay/decode path: + +```text +Replay or query process + stream query returns Observation[Image] metadata + └─ obs.data + └─ H264ObservationLoader loads keyframe + delta packet chain + └─ H264Decoder reconstructs Image +``` + +The first implementation may re-encode images when recording a decoded `Image` stream that originally arrived over H.264 transport. Preserving incoming packet bytes end-to-end can be a later optimization via a packet side-channel; it is not required to make the public behavior correct. 
+ +WebRTC/aiortc path, later: + +```text +Source machine / async WebRTC worker + Image source track + └─ aiortc encodes VideoFrame -> RTP/H.264 + └─ WebRTC handles packetization, jitter, NACK/PLI, bandwidth + +Network / WebRTC session + carries RTP media packets + +Subscriber machine / async WebRTC worker + aiortc receives/decodes RTP media + └─ adapter converts VideoFrame -> Image + └─ module In[Image] callback +``` + +This path is intentionally different from LCM and memory2 storage. WebRTC is a session protocol with negotiated codecs and RTP packet state; memory2 still needs deterministic per-observation packet rows and GOP lookup independent of any active peer connection. + +### Top-level activation and configuration + +Live transport activation should use existing blueprint transport mapping: + +```python +from dimos.core.transport import H264LcmTransport +from dimos.protocol.video.h264 import H264Config + +blueprint = autoconnect(camera(), consumer()).transports( + { + ("color_image", Image): H264LcmTransport( + "/color_image", + Image, + config=H264Config( + bitrate=2_000_000, + keyframe_interval=30, + profile="baseline", + tune="zerolatency", + ), + ) + } +) +``` + +memory2 direct store activation: + +```python +from dimos.memory2.video.h264 import H264ImageStorageConfig +from dimos.protocol.video.h264 import H264Config + +stream = store.stream( + "color_image", + Image, + image_storage=H264ImageStorageConfig( + codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + ), +) +``` + +Recorder activation: + +```python +MyRecorder.blueprint( + image_storage={ + "color_image": H264ImageStorageConfig( + codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + ) + } +) +``` + +Default behavior stays unchanged: + +```python +store.stream("color_image", Image) # JPEG-backed memory2 storage +LCMTransport("/color_image", Image) # normal LCM image transport +``` + +### Proposed end-to-end test blueprint + +Add one runnable synthetic blueprint that proves live 
H.264 transmission and H.264 memory2 storage through the normal DimOS surfaces, without robot hardware or a physical camera. + +Proposed location and registry name: + +- `dimos/protocol/video/demo_h264_video_e2e.py` +- Blueprint variable: `demo_h264_video_e2e` +- CLI name after registry generation: `demo-h264-video-e2e` + +Components: + +- `SyntheticVideoSource(Module)` + - Publishes deterministic `color_image: Out[Image]` frames. + - Uses a moving pattern, frame counter overlay/encoded pixels, and fixed metadata: width, height, format, frame_id, timestamp cadence. + - Defaults to a short loop-friendly rate such as 15 or 30 FPS, with configurable width, height, FPS, frame count, and pattern seed. + +- `H264E2ERecorder(Recorder)` + - Declares `color_image: In[Image]`. + - Uses recorder-level `image_storage={"color_image": H264ImageStorageConfig(...)}` so memory2 writes the received image stream as H.264 packets rather than JPEG blobs. + - Defaults `db_path` to an explicit temporary/demo path such as `h264_video_e2e.db` so manual QA can inspect it. + +- `H264VideoProbe(Module)` + - Subscribes to `color_image: In[Image]` after live H.264 transport decode. + - Tracks received frame count, monotonic timestamps, dimensions, frame_id, and approximate pixel/checksum expectations for the deterministic pattern. + - Exposes a simple RPC/status method for manual QA, e.g. `summary() -> str`, reporting frames received, drops detected, first/last seq-equivalent frame marker, and validation errors. 
+ +Blueprint sketch: + +```python +from dimos.core.coordination.blueprints import autoconnect +from dimos.core.transport import H264LcmTransport +from dimos.memory2.video.h264 import H264ImageStorageConfig +from dimos.msgs.sensor_msgs import Image +from dimos.protocol.video.h264 import H264Config + +h264_config = H264Config( + bitrate=2_000_000, + target_fps=30, + keyframe_interval=30, + profile="baseline", + tune="zerolatency", +) + +demo_h264_video_e2e = autoconnect( + SyntheticVideoSource.blueprint(width=640, height=360, fps=30), + H264E2ERecorder.blueprint( + db_path="h264_video_e2e.db", + image_storage={ + "color_image": H264ImageStorageConfig(codec=h264_config), + }, + ), + H264VideoProbe.blueprint(expected_width=640, expected_height=360), +).transports( + { + ("color_image", Image): H264LcmTransport( + "/demo/h264_video_e2e/color_image", + Image, + config=h264_config, + ) + } +) +``` + +This blueprint intentionally exercises two independent H.264 paths: + +1. **Live transmission:** `SyntheticVideoSource` publishes normal `Image`; `H264LcmTransport` encodes to `VideoPacket`, transmits over LCM, decodes back to `Image`, and delivers to normal `In[Image]` subscribers. +2. **Storage:** `H264E2ERecorder` receives normal `Image` and writes memory2 observations using H.264 image storage, including GOP index rows and one Annex B packet blob per observation. + +Manual QA contract: + +- Run `dimos run demo-h264-video-e2e --daemon`. +- Confirm logs show H.264 encoder initialization, periodic keyframes, probe frame counts, and recorder append counts. +- Open the produced memory2 store and query `color_image` observations without touching `obs.data`; metadata should be available without decode. +- Access `obs.data` on a keyframe and a mid-GOP delta frame; both should return decoded `Image` pixels, with the mid-GOP read decoding from the nearest prior keyframe. +- Replay the stored stream and confirm decoded images arrive on the normal replay schedule. 
+- Run a seq-gap variant, either by a test-only packet drop option in `H264LcmTransport` or a direct `GopBuffer` driver, and verify the probe receives no corrupted images and resumes only after the next keyframe. + +The blueprint should be excluded from normal hardware requirements and should not require a viewer. If it is registered as a runnable blueprint, regenerate `dimos/robot/all_blueprints.py` with `pytest dimos/robot/test_all_blueprints_generation.py`. + +### Storage schema + +Use existing per-stream observation and blob tables for primary data: + +```text +color_image + id, ts, value, pose fields, tags + +color_image_blob + id -> serialized VideoPacket Annex B access unit +``` + +Add a standalone GOP index table for H.264 image streams: + +```text +h264_frames + stream_name + observation_id + seq + keyframe_observation_id + is_keyframe + pts + width + height + format + codec + bitstream +``` + +This table is storage-owned metadata. Generic observation tables remain focused on timeline, pose, tags, and scalar values. + +### DimOS Spec Protocols, skills/MCP, CLI, generated registries + +No new DimOS Python `Spec` Protocol is required for the first version because encode/decode is transport and storage behavior, not cross-module RPC. No skills or MCP tools are exposed. + +No CLI command is required for the core feature. The synthetic `demo-h264-video-e2e` blueprint is the manual QA surface for end-to-end live transmission and storage. If the runnable blueprint is added, regenerate `dimos/robot/all_blueprints.py` with `pytest dimos/robot/test_all_blueprints_generation.py`. + +## Decisions + +1. **Make H.264 a codec layer above carriers, not a carrier itself.** + - Rationale: publisher and subscriber may run on different machines, and LCM/DDS/SHM/WebRTC remain responsible for communication. + - Alternative rejected: a monolithic `H264ImageTransport` that hides the underlying carrier, because it does not generalize cleanly to DDS or WebRTC. + +2. 
**Use one complete Annex B `VideoPacket` per source frame.** + - Rationale: this preserves frame timestamps, sequence numbers, GOP state, and memory2 one-observation-per-frame semantics while matching Foxglove `CompressedVideo` expectations. Each packet contains all NAL units emitted for one encoder input frame. + - Key detail: "complete packet for a frame" is not the same as "standalone-decodable frame." IDR/keyframe packets can bootstrap decode when they include SPS/PPS; P-frame packets still require prior decoded GOP state. + - Alternative rejected: MP4 segment files as the primary model, because live transports and per-frame memory2 replay become harder to align. + +3. **Keep `codec_for(Image)` as JPEG.** + - Rationale: H.264 writes need stateful encoder ownership and GOP indexing; the stateless memory2 `Codec` contract should remain simple and backward compatible. + +4. **Decode only from valid GOP state.** + - Rationale: missing H.264 packets can corrupt decoded pixels. After a seq gap, subscribers and storage loaders should suppress or fail decode until a keyframe restores a self-contained GOP. + - Key detail: complete per-frame access units remove RTP-fragment handling from DimOS storage, but they do not remove inter-frame dependencies; P-frames still require prior decoded reference frames. + +5. **Use aiortc's H.264 codec classes through a DimOS adapter.** + - Rationale: aiortc already depends on PyAV, sets up `libx264`, handles keyframe forcing, produces RTP-sized H.264 payloads, and provides depayload/decode logic. Reusing it reduces new codec code and aligns with the WebRTC transport dependency. + - Boundary: DimOS stores and transports one logical Annex B `VideoPacket` per source frame; aiortc's multiple RTP payloads are an implementation detail converted before leaving the codec adapter. + - Alternative rejected: copy aiortc's H.264 implementation into DimOS immediately, because direct wrapping is simpler while aiortc is already a dependency. + +6. 
**Configure WebRTC as a future carrier, not the core codec abstraction.** + - Rationale: WebRTC already solves live RTP packetization, jitter, packet loss, keyframe requests, NAT traversal, and adaptive bitrate, but memory2 still needs deterministic per-observation packet storage and replay. + - aiortc should be the preferred Python implementation path for this carrier because it already supports sending and receiving H.264 video and RTCP recovery feedback. + +7. **Store one complete access unit per observation, not one observation per RTP fragment.** + - Rationale: aiortc's encoder returns multiple RTP-sized payloads for one source frame. memory2 should depayload and assemble them into one Annex B `VideoPacket` for the frame, so queries, replay, pose, tags, and GOP indexes remain frame-oriented and Foxglove-compatible. + - Alternative rejected: storing each RTP fragment as its own observation, because replay and random access would inherit network packetization complexity and break frame-level memory2 semantics. + +8. **Repeat decoder parameter sets on every IDR keyframe.** + - Rationale: late join, random access, and memory2 partial reads require keyframes to bootstrap decoding without relying on stream-start state. + - Alternative rejected: sending SPS/PPS only at stream startup, because late subscribers and mid-recording reads may never decode. + +## Safety / Simulation / Replay + +This change affects image transport and recording only. It does not command robot hardware, alter control loops, or expose new skills. Existing hardware safety assumptions remain unchanged. + +Simulation and hardware cameras use the same `Image` semantics. Unsupported image formats such as depth or 16-bit images should fail at H.264 configuration/append/publish time with a clear error, not silently convert or corrupt data. + +Replay must emit normal decoded `Image` objects on the existing memory2 replay schedule. 
Sequential replay should share decoder state so normal playback decodes each packet once. Seek or random access may decode from the nearest prior keyframe through the requested frame. + +Manual QA should use the synthetic `demo-h264-video-e2e` blueprint so no robot or physical camera is required. The demo should verify live LCM round-trip, memory2 append/query without decode, lazy `obs.data` decode, replay, and seq-gap behavior. + +## Risks / Trade-offs + +- **Stateful codec complexity:** H.264 has encoder and decoder state. Mitigation: keep state in explicit `H264Encoder`, `H264Decoder`, and `GopBuffer` classes rather than hiding it in `Codec`. +- **Lazy `Image.data` compatibility:** Existing `Image` assumes eager numpy data. Mitigation: add lazy pixel support carefully so metadata properties do not force decode and eager construction remains unchanged. +- **Packet loss:** LCM has no built-in reliable delivery or late-join keyframe durability. Mitigation: periodic IDR frames and seq-gap suppression; later add keyframe request or durable carriers where available. +- **Dependency variability:** aiortc/PyAV/FFmpeg support varies by platform. Mitigation: keep H.264 optional under the extra that already provides aiortc/WebRTC support, preserve JPEG defaults, and fail clearly when video mode is selected without dependencies. +- **aiortc codec API stability:** aiortc codec classes are importable and useful, but the most stable aiortc surface is WebRTC itself. Mitigation: isolate all direct codec imports in `AiortcH264Codec`, pin/verify aiortc versions, and add focused tests around encode/depayload/decode behavior. +- **Double encode on record:** A recorder consuming decoded H.264 transport images may re-encode for memory2 storage. Mitigation: accept this in the first version; consider packet pass-through as a later optimization. +- **Random access latency:** Mid-GOP access requires decoding from a prior keyframe. 
Mitigation: short GOP defaults and decoder reuse during sequential replay. + +## Migration / Rollout + +1. Reuse the existing aiortc/WebRTC dependency path for H.264 support; add a lightweight `video` extra only if users need H.264 storage without the broader WebRTC extra. +2. Add `VideoPacket`, H.264 config, `AiortcH264Codec`, DimOS-facing encoder/decoder wrappers, GOP buffer, Annex B access-unit assembly, and explicit errors. +3. Add lazy pixel support to `Image` while preserving eager API behavior. +4. Add `H264LCM` and `H264LcmTransport` as the first live carrier adapter. +5. Add memory2 H.264 storage config, backend/payload strategy, GOP index table, and lazy loader. +6. Add registry serialization so reopened SQLite stores know which streams use H.264 storage. +7. Add `demo_h264_video_e2e` for synthetic end-to-end live transport plus memory2 storage QA. +8. Add tests and synthetic manual QA for live transport, storage, lazy decode, replay, unsupported formats, and seq gaps. +9. Update memory2 and transport docs with opt-in examples and dependency notes. + +Rollback is straightforward because all behavior is opt-in. Removing H.264 configuration returns live streams and new recordings to existing transport/JPEG behavior. Existing H.264-backed recordings still require the video dependency to decode pixels, but metadata should remain queryable. + +No generated blueprint registry update is needed unless a runnable demo blueprint is added. + +## Open Questions + +- Should the packet message be named `VideoPacket`, `EncodedImagePacket`, or `CompressedVideoFrame`? +- Should LCM H.264 publish raw packet bytes under an `Image` channel name or use a distinct LCM message type/channel suffix internally? +- What default bitrate, keyframe interval, and target FPS should be used for common DimOS camera streams? +- Should first-version memory2 storage store packet blobs in the existing `{stream}_blob` table or introduce a dedicated packet blob table? 
+- Should transport subscribers receive lazy `Image` objects by default, or should eager decode remain the default for maximum compatibility? +- Should WebRTC integration reuse this `VideoPacket` abstraction, or map directly between `Image` and WebRTC media tracks with optional packet export for memory2? +- Does aiortc expose a stable encoded-frame hook that can avoid decode/re-encode when recording a WebRTC H.264 stream into memory2? +- Should `AiortcH264Codec` pin to aiortc minor versions or include compatibility tests against the minimum supported aiortc version? diff --git a/openspec/changes/add-h264-codec-mem2-storage/docs.md b/openspec/changes/add-h264-codec-mem2-storage/docs.md new file mode 100644 index 0000000000..fa5ef9e7b9 --- /dev/null +++ b/openspec/changes/add-h264-codec-mem2-storage/docs.md @@ -0,0 +1,57 @@ +## User-Facing Docs + +- Update `docs/usage/transports/index.md` or the image-transport-specific transport docs to describe opt-in H.264 image transport behavior: + - Public module streams remain `Out[Image]` and `In[Image]`. + - `H264LcmTransport` compresses image payloads internally as H.264 and delivers decoded `Image` objects to subscribers. + - H.264 packets contain complete Annex B access units for one source frame, matching Foxglove-style `CompressedVideo` expectations. + - Delta frames require prior GOP state; after packet loss or late join, subscribers resume on the next keyframe. + - Unsupported image formats fail clearly rather than silently converting. +- Update `docs/usage/blueprints.md` with an opt-in blueprint transport mapping example for `H264LcmTransport` and `H264Config`. +- Update memory2 user docs, likely under `docs/usage/` or the memory2 capability docs, to describe opt-in H.264-backed image storage: + - Default image storage remains JPEG-backed. + - Users opt in per stream with `H264ImageStorageConfig`. + - memory2 still stores one observation per source frame. + - metadata queries do not require pixel decode. 
+ - accessing `obs.data` lazily reconstructs an `Image`, decoding from the nearest prior keyframe when needed. + - replay emits decoded `Image` frames on the normal replay schedule. +- Add a short manual QA section for `demo-h264-video-e2e` after the demo blueprint exists: + - run `dimos run demo-h264-video-e2e --daemon` + - inspect probe/recorder logs + - query the generated memory2 store + - validate lazy decode, replay, and seq-gap recovery. +- Mention optional video dependencies in the installation or feature docs. Users should know that H.264 mode requires the aiortc/PyAV/FFmpeg dependency path while JPEG defaults remain available without selecting H.264. + +## Contributor Docs + +- Update `docs/development/testing.md` or a nearby development testing guide with H.264-specific test commands once tests exist: + - unit tests for `VideoPacket`, H.264 access-unit assembly, GOP buffering, unsupported formats, and lazy `Image.data` behavior + - memory2 storage tests for append/query/lazy decode/reopen/replay + - synthetic end-to-end demo/blueprint smoke test for live LCM transmission and memory2 recording. +- Document dependency expectations for contributors who run video tests locally, including how to install the relevant `uv` extras and how tests should skip clearly when video dependencies are unavailable. +- If `demo_h264_video_e2e` is registered as a runnable blueprint, contributor docs should remind maintainers to regenerate `dimos/robot/all_blueprints.py` with `pytest dimos/robot/test_all_blueprints_generation.py`. + +## Coding-Agent Docs + +- Update `docs/coding-agents/index.md` or a focused coding-agent guide if agents are expected to modify image transports or memory2 storage: + - H.264 is opt-in and must not replace JPEG defaults. + - Keep public module contracts as `Image` streams. + - Store complete Annex B access units per source frame, not RTP fragments. + - Preserve one memory2 observation per source frame. 
+ - Avoid negative-only OpenSpec requirements when adding or editing specs; include positive `MUST`/`SHALL` statements. +- No `AGENTS.md` update is required unless maintainers want the H.264/Foxglove packet-shape rule to become a repo-wide coding-agent constraint. + +## Doc Validation + +- Run documentation link validation for changed docs if available: + - `uv run doclinks` +- Run markdown code-block validation for docs that contain executable Python snippets, for example: + - `uv run md-babel-py run docs/usage/blueprints.md` + - `uv run md-babel-py run <changed-doc.md>` +- If diagrams are added or regenerated, run: + - `bin/gen-diagrams` +- Validate generated blueprint registry freshness if the demo blueprint is registered: + - `uv run pytest dimos/robot/test_all_blueprints_generation.py` + +## No Docs Needed + +Documentation is needed. This change adds user-visible opt-in transport and memory2 storage configuration, dependency requirements, replay/lazy-decode behavior, and a runnable synthetic QA blueprint. diff --git a/openspec/changes/add-h264-codec-mem2-storage/proposal.md b/openspec/changes/add-h264-codec-mem2-storage/proposal.md new file mode 100644 index 0000000000..b808cc4828 --- /dev/null +++ b/openspec/changes/add-h264-codec-mem2-storage/proposal.md @@ -0,0 +1,40 @@ +## Why + +DimOS image streams currently use full `Image` objects over typed transports and memory2 stores images as independent JPEG payloads. That is simple and compatible, but inefficient for long-running camera streams and remote subscribers because each frame is compressed independently and no shared video codec state is reused. + +DimOS needs an opt-in H.264 image-stream path that preserves the public `Image` stream contract while allowing live transports and memory2 storage to carry compact video frame packets. The design should make H.264 reusable across carriers such as LCM first, and DDS/WebRTC later, while keeping memory2 queries, pose/tag alignment, and replay frame semantics intact. 
+ +## What Changes + +- Add a carrier-neutral H.264 image packet behavior for RGB/BGR-style `Image` streams, with one encoded video access unit per source frame. +- Add stateful H.264 encode/decode behavior that produces periodic self-contained keyframes, rejects unsupported image formats clearly, detects sequence gaps, and resumes delivery only after a valid keyframe. +- Add an opt-in live transport path for H.264 image streams, starting with LCM, that exposes decoded `Image` objects to subscribers rather than video packets. +- Add memory2 H.264 image storage that preserves one observation row per frame, stores per-frame video packet payloads, indexes GOP/keyframe relationships, and lazily reconstructs `obs.data` as an `Image` on demand. +- Preserve the existing JPEG image codec and JPEG-backed memory2 storage as the default behavior. +- No hardware-safety behavior changes are intended. +- No public robot-control, skill, or MCP breaking changes are intended. + +## Affected DimOS Surfaces + +- Modules/streams: typed `Image` streams, image-specific transport adapters, memory2 Recorder ingestion, memory2 Stream/Observation lazy payload access, and replay output of decoded images. +- Blueprints/CLI: blueprints may opt image streams into H.264-capable transports or memory2 H.264 storage; existing blueprint behavior remains unchanged unless configured. +- Skills/MCP: no direct skill or MCP behavior changes expected. +- Hardware/simulation/replay: camera-heavy hardware and simulation streams may benefit from reduced bandwidth/storage; replay must continue to emit normal decoded `Image` frames on the same schedule. +- Docs/generated registries: memory2 and transport docs need updates; generated blueprint registries are not expected to change unless new demo blueprints are added. 
+ +## Capabilities + +### New Capabilities + +- `h264-image-streams`: Covers carrier-neutral H.264 image packets, live image-stream encode/decode behavior, keyframe/GOP handling, sequence-gap behavior, and transport compatibility expectations. +- `memory2-h264-storage`: Covers opt-in H.264-backed memory2 image observation storage, per-frame packet persistence, GOP indexing, lazy `Image` reconstruction, and replay compatibility. + +### Modified Capabilities + +- None. + +## Impact + +Users and developers gain a more bandwidth- and storage-efficient option for camera streams while keeping existing `Image` stream consumers and memory2 query/replay behavior familiar. Existing JPEG-backed recordings, default transports, and non-image streams remain compatible. + +Compatibility risk centers on adding optional video codec dependencies, preserving lazy-load lifetimes, making GOP recovery deterministic after packet loss or missing storage rows, and avoiding silent corruption when frames cannot be decoded. Documentation and QA should cover opt-in configuration, supported image formats, dependency installation, LCM live-stream behavior, memory2 append/query/lazy-decode/replay behavior, packet-loss recovery, and a small synthetic image-stream demo. diff --git a/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md b/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md new file mode 100644 index 0000000000..1cb004ca3f --- /dev/null +++ b/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md @@ -0,0 +1,85 @@ +## ADDED Requirements + +### Requirement: Opt-in H.264 image streams preserve the Image contract +DimOS SHALL allow an image stream to opt into H.264 encoding while preserving `Image` as the public stream payload type for publishers and subscribers. 
+ +#### Scenario: Publisher and subscriber use normal Image objects +- **GIVEN** a blueprint configures an image stream for H.264 live transmission +- **AND** the source module publishes `Image` values on an `Out[Image]` stream +- **WHEN** a downstream module subscribes through an `In[Image]` stream +- **THEN** the downstream callback receives decoded `Image` values +- **AND** the module author does not need to publish or subscribe to encoded video packet values. + +#### Scenario: Existing image streams remain unchanged by default +- **GIVEN** a blueprint does not opt an image stream into H.264 transmission +- **WHEN** the blueprint runs with its existing image transport configuration +- **THEN** DimOS MUST preserve the existing image transport behavior +- **AND** H.264 dependencies or settings are not required for that stream. + +### Requirement: H.264 packets are complete per-frame Annex B access units +DimOS SHALL represent each H.264-transmitted source image frame as one complete encoded-frame packet containing the Annex B NAL units emitted for that encoder input frame. + +#### Scenario: One encoded packet corresponds to one source frame +- **GIVEN** an H.264-enabled image stream publishes one source `Image` frame +- **WHEN** DimOS encodes that frame for a non-WebRTC carrier or for packet inspection +- **THEN** the encoded packet data MUST contain all NAL units emitted for that source frame in Annex B form +- **AND** the encoded packet represents exactly one source frame. + +#### Scenario: Delta-frame packets require GOP state +- **GIVEN** an encoded packet contains a delta frame +- **WHEN** a decoder processes that packet without the prior GOP state required by H.264 +- **THEN** DimOS MUST treat the packet as requiring recovery from a keyframe +- **AND** DimOS MUST avoid presenting corrupted image pixels as a valid decoded `Image`. 
+ +### Requirement: Keyframes bootstrap late join and recovery +DimOS SHALL provide periodic keyframes for H.264 image streams so subscribers can start or recover decoding at bounded intervals. + +#### Scenario: Late subscriber waits for a keyframe +- **GIVEN** an H.264 image stream is already publishing +- **WHEN** a subscriber joins after the stream has started +- **THEN** DimOS MUST begin delivering decoded images only after the subscriber has valid keyframe-based decoder state +- **AND** the subscriber must not receive corrupted decoded images from incomplete GOP state. + +#### Scenario: Keyframes include decoder parameter data +- **GIVEN** an H.264 image stream emits an IDR keyframe +- **WHEN** the keyframe packet is used to bootstrap a new decoder +- **THEN** the keyframe packet MUST include the decoder parameter information needed for that bootstrap, such as SPS/PPS for H.264 Annex B streams +- **AND** later delta frames in the same GOP may depend on that decoded keyframe state. + +### Requirement: Sequence gaps recover safely +DimOS SHALL detect missing or out-of-order H.264 live-stream packets and resume decoded image delivery from a valid keyframe state. + +#### Scenario: Packet loss occurs mid-GOP +- **GIVEN** a subscriber is decoding an H.264 image stream +- **WHEN** DimOS detects a sequence gap before the next keyframe +- **THEN** DimOS MUST stop delivering decoded images from the invalid GOP state +- **AND** DimOS SHALL resume delivery after a subsequent keyframe establishes valid decoder state. + +### Requirement: Unsupported image formats fail explicitly +DimOS SHALL accept only image formats supported by the configured H.264 image-stream mode and provide a clear failure for unsupported formats. 
+ +#### Scenario: Supported color image is transmitted +- **GIVEN** an H.264-enabled image stream receives a supported 8-bit color `Image` format +- **WHEN** DimOS encodes and transmits the image +- **THEN** subscribers MUST receive a decoded `Image` with the expected dimensions, timestamp, frame identifier, and color format semantics. + +#### Scenario: Unsupported image format is rejected +- **GIVEN** an H.264-enabled image stream receives an unsupported image format such as depth, 16-bit, or alpha data +- **WHEN** DimOS attempts to encode or publish the image through the H.264 stream mode +- **THEN** DimOS MUST fail with a clear unsupported-format error +- **AND** DimOS MUST preserve safety by avoiding silent lossy conversion or corrupted output. + +### Requirement: H.264 stream configuration is observable and bounded +DimOS SHALL expose user-configurable H.264 stream settings for bitrate, keyframe cadence, frame-rate assumptions, and low-latency profile behavior. + +#### Scenario: Blueprint opts into H.264 settings +- **GIVEN** a blueprint configures an image stream for H.264 live transmission with bitrate and keyframe cadence settings +- **WHEN** the blueprint runs +- **THEN** DimOS MUST apply those settings to the H.264 stream behavior +- **AND** subscribers must continue to observe normal `Image` payloads rather than codec-specific internals. + +#### Scenario: H.264 dependencies are unavailable +- **GIVEN** a user selects H.264 image-stream mode in an environment without the required video codec dependencies +- **WHEN** DimOS starts or initializes the H.264 stream +- **THEN** DimOS MUST fail with an actionable dependency error +- **AND** DimOS MUST preserve non-H.264 image-stream behavior for configurations that do not select H.264. 
diff --git a/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md b/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md new file mode 100644 index 0000000000..a73dfa1af4 --- /dev/null +++ b/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md @@ -0,0 +1,85 @@ +## ADDED Requirements + +### Requirement: H.264 image storage is opt-in per memory2 stream +memory2 SHALL allow image streams to opt into H.264-backed storage while preserving the default image-storage behavior for streams that do not opt in. + +#### Scenario: Stream opts into H.264 storage +- **GIVEN** a memory2 image stream is configured for H.264-backed storage +- **WHEN** the stream appends `Image` values +- **THEN** memory2 MUST store those image observations using H.264-backed payloads +- **AND** queries for the stream must continue to return image observations associated with the original frame timestamps. + +#### Scenario: Stream uses default image storage +- **GIVEN** a memory2 image stream is created without H.264 image-storage configuration +- **WHEN** the stream appends `Image` values +- **THEN** memory2 MUST preserve the existing default image-storage behavior +- **AND** existing JPEG-backed recordings remain readable through the normal memory2 APIs. + +### Requirement: H.264 storage preserves one observation per source frame +memory2 SHALL store H.264-backed image streams with one observation corresponding to each source image frame. + +#### Scenario: Recording a sequence of image frames +- **GIVEN** a recorder receives a sequence of `Image` frames on an H.264-backed memory2 stream +- **WHEN** memory2 stores the sequence +- **THEN** memory2 MUST create one queryable observation per source frame +- **AND** each observation must retain its timestamp, frame identifier, pose metadata when available, and tags independently of pixel decode. 
+ +### Requirement: Stored H.264 packets are complete frame access units +memory2 SHALL store each H.264-backed image observation with an encoded payload that contains the complete Annex B access unit for that source frame. + +#### Scenario: Stored packet is inspected or exported +- **GIVEN** an H.264-backed image observation has an encoded payload +- **WHEN** the payload is inspected by storage tooling or exported to a compatible video-message format +- **THEN** the payload MUST represent all NAL units emitted for that source frame in Annex B form +- **AND** memory2 MUST avoid exposing individual RTP fragments as the stored observation payload. + +### Requirement: GOP metadata supports random access and replay +memory2 SHALL persist enough GOP and keyframe metadata for H.264-backed image streams to decode requested observations and replay streams deterministically. + +#### Scenario: Query decodes a mid-GOP observation +- **GIVEN** a user queries an H.264-backed image observation whose encoded payload is a delta frame +- **WHEN** the user accesses the observation pixel data +- **THEN** memory2 MUST decode from the nearest prior usable keyframe through the requested observation +- **AND** the returned value must be a decoded `Image` for the requested observation. + +#### Scenario: Required GOP data is missing +- **GIVEN** an H.264-backed image observation requires prior GOP data to decode +- **WHEN** memory2 cannot load a usable keyframe or required delta-frame sequence +- **THEN** memory2 MUST fail the pixel decode with a clear storage/decode error +- **AND** memory2 MUST avoid returning corrupted pixels as a valid `Image`. + +### Requirement: Metadata queries do not force pixel decode +memory2 SHALL allow metadata access for H.264-backed image observations without decoding image pixels. 
+ +#### Scenario: Query reads observation metadata only +- **GIVEN** a memory2 store contains H.264-backed image observations +- **WHEN** a user queries observations and reads timestamps, frame identifiers, pose metadata, tags, width, height, or image format metadata +- **THEN** memory2 MUST provide that metadata without requiring H.264 pixel decode +- **AND** pixel decode should occur only when the user accesses image data. + +### Requirement: Lazy pixel access reconstructs Image values +memory2 SHALL lazily reconstruct `Image` values for H.264-backed observations when pixel data is requested. + +#### Scenario: User accesses observation data +- **GIVEN** a queried H.264-backed image observation has not decoded its pixels yet +- **WHEN** the user accesses `obs.data` +- **THEN** memory2 MUST return a decoded `Image` value for that observation +- **AND** subsequent compatible accesses may reuse decoded state without changing observable image contents. + +### Requirement: H.264-backed replay emits normal Image frames +memory2 SHALL replay H.264-backed image streams as normal decoded `Image` frames on the existing replay schedule. + +#### Scenario: Replaying a stored H.264 image stream +- **GIVEN** a memory2 store contains an H.264-backed image stream +- **WHEN** replay is started for that stream +- **THEN** replay MUST emit decoded `Image` values in observation timestamp order +- **AND** consumers of replayed streams must not need to consume encoded video packet values. + +### Requirement: H.264 storage survives store reopen +memory2 SHALL persist H.264 storage configuration and frame-index metadata so a reopened store can query, decode, and replay H.264-backed image streams. 
+ +#### Scenario: Reopen and decode +- **GIVEN** a memory2 store was written with an H.264-backed image stream +- **WHEN** the process closes and a later process reopens the store +- **THEN** memory2 MUST recognize the stream as H.264-backed +- **AND** the reopened store must support metadata query, lazy pixel decode, and replay for the stored observations. diff --git a/openspec/changes/add-h264-codec-mem2-storage/tasks.md b/openspec/changes/add-h264-codec-mem2-storage/tasks.md new file mode 100644 index 0000000000..4842b7a728 --- /dev/null +++ b/openspec/changes/add-h264-codec-mem2-storage/tasks.md @@ -0,0 +1,61 @@ +## 1. Core video packet and codec behavior + +- [x] 1.1 Add the carrier-neutral encoded video frame message for one complete H.264 Annex B access unit per source `Image` frame, including sequence, timestamp, frame identifier, dimensions, format, codec, bitstream, keyframe, keyframe-reference, presentation timestamp, and payload fields. +- [x] 1.2 Add H.264 configuration covering bitrate, target FPS, keyframe interval, profile, preset/tune, maximum GOP length, and supported pixel format settings. +- [x] 1.3 Add the aiortc-backed H.264 adapter that converts `Image` frames to H.264 output and converts H.264 input back to `Image` while keeping aiortc/RTP internals out of public DimOS APIs. +- [x] 1.4 Add access-unit assembly so all NAL units emitted for one encoder input frame are stored or transmitted as one Annex B packet, not as individual RTP fragments. +- [x] 1.5 Add GOP/keyframe state tracking that detects sequence gaps, marks decoder state invalid, suppresses corrupted output, and resumes only after a usable keyframe. +- [x] 1.6 Add explicit errors for unsupported image formats, missing video dependencies, and unusable GOP/decode state. +- [x] 1.7 Add focused codec tests for per-frame Annex B packet shape, keyframe metadata, SPS/PPS bootstrap behavior, sequence-gap handling, dependency errors, and unsupported image formats. + +## 2. 
Image lazy data support + +- [x] 2.1 Extend `Image` construction/access so metadata fields remain available without forcing pixel decode when an image is backed by a lazy pixel loader. +- [x] 2.2 Preserve existing eager `Image` behavior and compatibility for current JPEG, LCM, SHM, memory2, and visualization consumers. +- [x] 2.3 Add tests proving eager images still work, lazy images expose metadata without decode, and `Image.data` materializes pixels exactly when requested. + +## 3. Live H.264 image transport + +- [x] 3.1 Add the H.264 LCM pubsub adapter that publishes encoded video frame packets on the wire and delivers decoded `Image` values to subscribers. +- [x] 3.2 Add `H264LcmTransport` to the transport layer with worker-safe serialization behavior matching existing transport patterns. +- [x] 3.3 Keep normal image transport behavior unchanged unless a blueprint explicitly opts a stream into H.264 transport. +- [x] 3.4 Add live transport tests for `Out[Image]` to `In[Image]` delivery, keyframe bootstrap, late subscriber behavior, sequence-gap recovery, and default transport compatibility. + +## 4. memory2 H.264 image storage + +- [x] 4.1 Add per-stream H.264 image storage configuration for direct store creation and recorder configuration. +- [x] 4.2 Route configured memory2 `Image` streams to H.264-backed storage while leaving unconfigured `Image` streams on the existing default storage path. +- [x] 4.3 Store one observation row per source frame and one encoded Annex B frame packet payload per observation. +- [x] 4.4 Add persistent GOP/keyframe index metadata for H.264-backed image streams. +- [x] 4.5 Persist and reload per-stream storage configuration so reopened stores recognize H.264-backed image streams. +- [x] 4.6 Add lazy observation loading that returns metadata without decode and reconstructs `Image` pixels from the nearest prior usable keyframe through the requested observation when `obs.data` is accessed. 
+- [x] 4.7 Add replay support that emits decoded `Image` values in observation timestamp order for H.264-backed streams. +- [x] 4.8 Add memory2 tests for append/query, metadata access without decode, keyframe and mid-GOP lazy decode, missing-GOP failure, store reopen, replay, default JPEG compatibility, and unsupported formats. + +## 5. Synthetic end-to-end blueprint and manual QA surface + +- [x] 5.1 Add `dimos/protocol/video/demo_h264_video_e2e.py` with a deterministic synthetic `Image` source, H.264 memory2 recorder, and decoded-frame probe. +- [x] 5.2 Configure the blueprint to exercise both live H.264 LCM transmission and H.264 memory2 storage through normal `Image` stream surfaces. +- [x] 5.3 Add probe status or logs that report received frame counts, dimensions, timestamp monotonicity, validation failures, and drop/recovery observations. +- [x] 5.4 Register the runnable blueprint as `demo-h264-video-e2e` if it is intended to be exposed through `dimos run`. +- [x] 5.5 Regenerate and verify `dimos/robot/all_blueprints.py` if the demo blueprint is registered. + +## 6. Documentation + +- [x] 6.1 Update user-facing transport docs with H.264 opt-in behavior, `Image` stream preservation, Annex B per-frame packets, keyframe/GOP recovery, unsupported formats, and dependency notes. +- [x] 6.2 Update blueprint docs with an H.264 image transport mapping example. +- [x] 6.3 Update memory2 docs with H.264 image storage configuration, one-observation-per-frame behavior, metadata query without decode, lazy `obs.data` decode, random access from keyframes, and replay behavior. +- [x] 6.4 Add docs for running and inspecting the `demo-h264-video-e2e` synthetic QA blueprint. +- [x] 6.5 Update contributor testing docs with video dependency setup, focused test targets, skip behavior when dependencies are unavailable, and blueprint-registry regeneration guidance. 
+- [x] 6.6 Update coding-agent docs if maintainers want the H.264/Foxglove packet-shape rule documented for future agent edits. + +## 7. Verification + +- [x] 7.1 Run `openspec validate add-h264-codec-mem2-storage --strict`. +- [x] 7.2 Run focused unit tests for H.264 codec/access-unit/GOP behavior. +- [x] 7.3 Run focused unit tests for lazy `Image` behavior. +- [x] 7.4 Run focused memory2 storage tests for H.264 append/query/lazy decode/reopen/replay/default compatibility. +- [x] 7.5 Run focused live transport tests for H.264 LCM round-trip and sequence-gap recovery. +- [x] 7.6 Run `uv run pytest dimos/robot/test_all_blueprints_generation.py` if the demo blueprint is registered. +- [x] 7.7 Run relevant docs validation, including `uv run doclinks` if available and `uv run md-babel-py run <changed-doc.md>` for executable markdown snippets. +- [x] 7.8 Manually run `dimos run demo-h264-video-e2e --daemon`, inspect logs/probe status, query the generated memory2 store without pixel decode, access `obs.data` for keyframe and mid-GOP observations, replay the stream, and verify sequence-gap recovery behavior. 
From 4e01f9a1e04261002a11b29ac299add0f2869333 Mon Sep 17 00:00:00 2001 From: cc Date: Thu, 11 Jun 2026 19:22:12 -0700 Subject: [PATCH 03/14] refactor: use payload strategy for h264 storage --- dimos/memory2/backend.py | 50 ++++- dimos/memory2/module.py | 6 +- dimos/memory2/replay.py | 34 ++- dimos/memory2/store/base.py | 45 +--- dimos/memory2/store/sqlite.py | 101 +++++---- dimos/memory2/test_payload_strategy.py | 164 ++++++++++++++ dimos/memory2/video/h264.py | 200 +++++++++--------- dimos/memory2/video/test_h264_storage.py | 156 +++++++++++--- dimos/msgs/sensor_msgs/Image.py | 120 ++--------- dimos/msgs/sensor_msgs/test_image.py | 47 +--- dimos/protocol/video/demo_h264_video_e2e.py | 14 +- docs/capabilities/memory/index.md | 31 +-- docs/coding-agents/style.md | 4 +- docs/development/testing.md | 2 +- docs/usage/transports/index.md | 4 + .../add-h264-codec-mem2-storage/design.md | 88 ++++---- .../add-h264-codec-mem2-storage/docs.md | 4 +- .../add-h264-codec-mem2-storage/proposal.md | 2 +- .../specs/h264-image-streams/spec.md | 15 ++ .../specs/memory2-h264-storage/spec.md | 39 ++-- .../specs/memory2-payload-strategies/spec.md | 56 +++++ .../add-h264-codec-mem2-storage/tasks.md | 23 +- 22 files changed, 757 insertions(+), 448 deletions(-) create mode 100644 dimos/memory2/test_payload_strategy.py create mode 100644 openspec/changes/add-h264-codec-mem2-storage/specs/memory2-payload-strategies/spec.md diff --git a/dimos/memory2/backend.py b/dimos/memory2/backend.py index 7b95bd6335..53ce318aaa 100644 --- a/dimos/memory2/backend.py +++ b/dimos/memory2/backend.py @@ -17,7 +17,7 @@ from __future__ import annotations from dataclasses import replace -from typing import TYPE_CHECKING, Any, Generic, TypeVar +from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeVar from dimos.core.resource import CompositeResource from dimos.memory2.codecs.base import Codec, codec_id @@ -40,6 +40,30 @@ T = TypeVar("T") +class PayloadStrategy(Protocol[T]): + """Stateful payload 
encoding hook for a memory2 stream. + + Stateless storage uses ``Codec`` directly. Stateful encodings such as video + can provide a strategy without requiring a special Backend subclass. + """ + + codec_id: str + + def start(self) -> None: ... + def stop(self) -> None: ... + def encode(self, value: T) -> bytes: ... + def after_blob_put(self, stream_name: str, row_id: int, encoded: bytes) -> None: ... + def make_loader(self, stream_name: str, row_id: int, blob_store: BlobStore) -> Any: ... + def attach_loaders( + self, + stream_name: str, + observations: Iterator[Observation[T]], + blob_store: BlobStore, + ) -> Iterator[Observation[T]]: ... + def should_suppress_decode_error(self, error: BaseException) -> bool: ... + def serialize(self) -> dict[str, Any]: ... + + class Backend(CompositeResource, Generic[T]): """Orchestrates metadata, blob, vector, and live stores for one stream. (encode → insert → store blob → index vector → notify) lives here, @@ -55,6 +79,7 @@ def __init__( vector_store: VectorStore | None = None, notifier: Notifier[T] | None = None, eager_blobs: bool = False, + payload_strategy: PayloadStrategy[T] | None = None, ) -> None: super().__init__() self.metadata_store = self.register_disposable(metadata_store) @@ -64,6 +89,7 @@ def __init__( self.vector_store = self.register_disposable(vector_store) if vector_store else None self.notifier: Notifier[T] = self.register_disposable(notifier or SubjectNotifier()) self.eager_blobs = eager_blobs + self.payload_strategy = payload_strategy def start(self) -> None: self.metadata_store.start() @@ -71,6 +97,13 @@ def start(self) -> None: self.blob_store.start() if self.vector_store is not None: self.vector_store.start() + if self.payload_strategy is not None: + self.payload_strategy.start() + + def stop(self) -> None: + if self.payload_strategy is not None: + self.payload_strategy.stop() + super().stop() @property def name(self) -> str: @@ -81,6 +114,8 @@ def _make_loader(self, row_id: int) -> Any: if bs is None: 
raise RuntimeError("BlobStore required but not configured") name, codec = self.name, self.codec + if self.payload_strategy is not None: + return self.payload_strategy.make_loader(name, row_id, bs) def loader() -> Any: raw = bs.get(name, row_id) @@ -107,7 +142,10 @@ def append(self, obs: Observation[T]) -> Observation[T]: # Encode payload before any locking (avoids holding locks during IO) encoded: bytes | None = None if self.blob_store is not None and not is_scalar: - encoded = self.codec.encode(payload) + if self.payload_strategy is not None: + encoded = self.payload_strategy.encode(payload) + else: + encoded = self.codec.encode(payload) try: # Insert metadata, get assigned id @@ -118,6 +156,8 @@ def append(self, obs: Observation[T]) -> Observation[T]: if encoded is not None: assert self.blob_store is not None self.blob_store.put(self.name, row_id, encoded) + if self.payload_strategy is not None: + self.payload_strategy.after_blob_put(self.name, row_id, encoded) # Replace inline data with lazy loader obs._data = _UNLOADED obs._loader = self._make_loader(row_id) @@ -155,6 +195,9 @@ def _attach_loaders(self, it: Iterator[Observation[T]]) -> Iterator[Observation[ obs.data_type = self.data_type yield obs return + if self.payload_strategy is not None: + yield from self.payload_strategy.attach_loaders(self.name, it, self.blob_store) + return for obs in it: obs.data_type = self.data_type if obs._loader is None and isinstance(obs._data, type(_UNLOADED)): @@ -263,4 +306,7 @@ def serialize(self) -> dict[str, Any]: "blob_store": self.blob_store.serialize() if self.blob_store else None, "vector_store": self.vector_store.serialize() if self.vector_store else None, "notifier": self.notifier.serialize(), + "payload_strategy": self.payload_strategy.serialize() + if self.payload_strategy is not None + else None, } diff --git a/dimos/memory2/module.py b/dimos/memory2/module.py index d88ebc75ca..c3264d5116 100644 --- a/dimos/memory2/module.py +++ b/dimos/memory2/module.py @@ -254,7 
+254,7 @@ class RecorderConfig(MemoryModuleConfig): default_frame_id: str = "base_link" tf_tolerance: float = 0.5 db_path: str | Path = "recording.db" - image_storage: dict[str, Any] = Field(default_factory=dict) + payload_strategies: dict[str, Any] = Field(default_factory=dict) class Recorder(MemoryModule): @@ -305,8 +305,8 @@ def start(self) -> None: for name, port in self.inputs.items(): stream_overrides: dict[str, Any] = {} - if name in self.config.image_storage: - stream_overrides["image_storage"] = self.config.image_storage[name] + if name in self.config.payload_strategies: + stream_overrides["payload_strategy"] = self.config.payload_strategies[name] stream: Stream[Any] = self.store.stream(name, port.type, **stream_overrides) self._port_to_stream(name, port, stream) logger.info("Recording %s (%s)", name, port.type.__name__) diff --git a/dimos/memory2/replay.py b/dimos/memory2/replay.py index 516d39b372..91c977cbbd 100644 --- a/dimos/memory2/replay.py +++ b/dimos/memory2/replay.py @@ -138,6 +138,13 @@ def _decode(self, obs: Any) -> T: data = self._autocast(data) return cast("T", data) + def _should_suppress_decode_error(self, error: BaseException) -> bool: + stream = self._replay.store.stream(self._name) + source = getattr(stream, "_source", None) + strategy = getattr(source, "payload_strategy", None) + should_suppress = getattr(strategy, "should_suppress_decode_error", None) + return bool(should_suppress is not None and should_suppress(error)) + def _base_stream(self) -> Stream[Any]: """Memory2 Stream bounded by the replay window, ordered by ts.""" cfg = self._replay.config @@ -189,8 +196,14 @@ def iterate_ts(self) -> Iterator[tuple[float, T]]: emitted = False obs: Any for obs in self._base_stream(): + try: + decoded = self._decode(obs) + except BaseException as exc: + if self._should_suppress_decode_error(exc): + continue + raise emitted = True - yield (obs.ts, self._decode(obs)) + yield (obs.ts, decoded) if not self._replay.config.loop or not emitted: break 
@@ -203,6 +216,10 @@ def first(self) -> T | None: return self._decode(self._base_stream().first()) except LookupError: return None + except BaseException as exc: + if self._should_suppress_decode_error(exc): + return None + raise def find_closest(self, timestamp: float, tolerance: float = 1.0) -> T | None: s: Stream[Any] = self._replay.store.stream(self._name) @@ -210,7 +227,12 @@ def find_closest(self, timestamp: float, tolerance: float = 1.0) -> T | None: obs: Any = s.at(timestamp, tolerance).first() except LookupError: return None - return self._decode(obs) + try: + return self._decode(obs) + except BaseException as exc: + if self._should_suppress_decode_error(exc): + return None + raise def observable(self) -> Observable[T]: """Timed Observable scheduled against the Replay's shared anchor. @@ -239,8 +261,14 @@ def make_iterator() -> Iterator[tuple[float, T]]: emitted = False obs: Any for obs in base(): + try: + decoded = decode(obs) + except BaseException as exc: + if self._should_suppress_decode_error(exc): + continue + raise emitted = True - yield (obs.ts, decode(obs)) + yield (obs.ts, decoded) if not loop or not emitted: break diff --git a/dimos/memory2/store/base.py b/dimos/memory2/store/base.py index 35e698ad82..62c5bf53fe 100644 --- a/dimos/memory2/store/base.py +++ b/dimos/memory2/store/base.py @@ -157,50 +157,8 @@ def _create_backend( self, name: str, payload_type: type[Any] | None = None, **config: Any ) -> Backend[Any]: """Create a Backend for the named stream. 
Called once per stream name.""" - image_storage = config.pop("image_storage", None) - if image_storage is not None: - from dimos.memory2.video.h264 import ( - H264FrameIndexStore, - H264ImageBackend, - storage_config_from_any, - ) - from dimos.msgs.sensor_msgs.Image import Image - - storage_config = storage_config_from_any(image_storage) - if ( - storage_config is not None - and payload_type is not None - and issubclass(payload_type, Image) - ): - bs = config.pop("blob_store", self.config.blob_store) - if bs is None: - raise TypeError("H.264 image storage requires a blob_store") - if isinstance(bs, type): - bs = bs() - obs = config.pop("observation_store", self.config.observation_store) - if obs is None or isinstance(obs, type): - obs = (obs or ListObservationStore)(name=name) - vs = config.pop("vector_store", self.config.vector_store) - if isinstance(vs, type): - vs = vs() - notifier = config.pop("notifier", self.config.notifier) - if notifier is None or isinstance(notifier, type): - notifier = (notifier or SubjectNotifier)() - frame_index = config.pop("frame_index", None) - if frame_index is None: - raise TypeError("H.264 image storage requires a frame_index") - if not isinstance(frame_index, H264FrameIndexStore): - raise TypeError("H.264 image storage frame_index must be H264FrameIndexStore") - return H264ImageBackend( - metadata_store=obs, - blob_store=bs, - frame_index=frame_index, - storage_config=storage_config, - vector_store=vs, - notifier=notifier, - eager_blobs=config.get("eager_blobs", False), - ) codec = self._resolve_codec(payload_type, config.pop("codec", None)) + payload_strategy = config.pop("payload_strategy", None) # Instantiate or use provided instances obs = config.pop("observation_store", self.config.observation_store) @@ -227,6 +185,7 @@ def _create_backend( vector_store=vs, notifier=notifier, eager_blobs=config.get("eager_blobs", False), + payload_strategy=payload_strategy, ) def stream(self, name: str, payload_type: type[T] | None = None, 
**overrides: Any) -> Stream[T]: diff --git a/dimos/memory2/store/sqlite.py b/dimos/memory2/store/sqlite.py index 7ecd3c04ab..5558bc0c55 100644 --- a/dimos/memory2/store/sqlite.py +++ b/dimos/memory2/store/sqlite.py @@ -24,6 +24,7 @@ from dimos.memory2.blobstore.base import BlobStore from dimos.memory2.blobstore.sqlite import SqliteBlobStore from dimos.memory2.codecs.base import codec_id +from dimos.memory2.codecs.pickle import PickleCodec from dimos.memory2.observationstore.sqlite import SqliteObservationStore from dimos.memory2.registry import RegistryStore, deserialize_component, qual from dimos.memory2.store.base import Store, StoreConfig @@ -66,25 +67,20 @@ def _open_connection(self) -> sqlite3.Connection: def _assemble_backend(self, name: str, stored: dict[str, Any]) -> Backend[Any]: """Reconstruct a Backend from a stored config dict.""" from dimos.memory2.codecs.base import _resolve_payload_type, codec_from_id - from dimos.memory2.codecs.pickle import PickleCodec - from dimos.memory2.video.h264 import ( - H264FrameIndexStore, - H264ImageBackend, - storage_config_from_any, - ) payload_module = stored["payload_module"] data_type = _resolve_payload_type(payload_module) eager_blobs = stored.get("eager_blobs", False) page_size = stored.get("page_size", self.config.page_size) - image_storage = storage_config_from_any(stored.get("image_storage")) + payload_strategy_data = stored.get("payload_strategy") codec = ( PickleCodec() - if image_storage is not None + if payload_strategy_data is not None else codec_from_id(stored["codec_id"], payload_module) ) backend_conn = self._open_connection() + payload_strategy = self._deserialize_payload_strategy(payload_strategy_data, backend_conn) # Reconstruct components from serialized config bs_data = stored.get("blob_store") @@ -124,27 +120,16 @@ def _assemble_backend(self, name: str, stored: dict[str, Any]) -> Backend[Any]: blob_store_conn_match=blob_store_conn_match and eager_blobs, page_size=page_size, ) - if image_storage is 
not None: - backend = H264ImageBackend( - metadata_store=metadata_store, - blob_store=bs, - frame_index=H264FrameIndexStore(backend_conn), - storage_config=image_storage, - vector_store=vs, - notifier=notifier, - eager_blobs=eager_blobs, - ) - else: - backend = Backend( - metadata_store=metadata_store, - codec=codec, - data_type=data_type, - blob_store=bs, - vector_store=vs, - notifier=notifier, - eager_blobs=eager_blobs, - ) - return backend + return Backend( + metadata_store=metadata_store, + codec=codec, + data_type=data_type, + blob_store=bs, + vector_store=vs, + notifier=notifier, + eager_blobs=eager_blobs, + payload_strategy=payload_strategy, + ) @staticmethod def _serialize_backend( @@ -153,13 +138,14 @@ def _serialize_backend( """Serialize a backend's config for registry storage.""" cfg: dict[str, Any] = { "payload_module": payload_module, - "codec_id": codec_id(backend.codec), + "codec_id": backend.payload_strategy.codec_id + if backend.payload_strategy is not None + else codec_id(backend.codec), "eager_blobs": backend.eager_blobs, "page_size": page_size, } - if hasattr(backend, "storage_config"): - cfg["codec_id"] = "h264" - cfg["image_storage"] = backend.storage_config.serialize() + if backend.payload_strategy is not None: + cfg["payload_strategy"] = backend.payload_strategy.serialize() if backend.blob_store is not None: cfg["blob_store"] = backend.blob_store.serialize() if backend.vector_store is not None: @@ -191,7 +177,7 @@ def _create_backend( backend_conn = self._open_connection() - image_storage = config.get("image_storage") + payload_strategy = self._payload_strategy_from_config(config, backend_conn) # Inject conn-shared instances unless user provided overrides if not isinstance(config.get("blob_store"), BlobStore): @@ -199,16 +185,13 @@ def _create_backend( if not isinstance(config.get("vector_store"), VectorStore): config["vector_store"] = SqliteVectorStore(conn=backend_conn) - # Resolve codec early — needed for SqliteObservationStore. 
H.264 image - # streams own blob decoding in H264ImageBackend, so keep sqlite eager + # Resolve codec early — needed for SqliteObservationStore. Stateful + # payload strategies own blob encoding/decoding, so keep sqlite eager # joins disabled and use a harmless metadata-store codec. - if image_storage is not None: - from dimos.memory2.codecs.pickle import PickleCodec - from dimos.memory2.video.h264 import H264FrameIndexStore - + if payload_strategy is not None: codec = PickleCodec() - config["frame_index"] = H264FrameIndexStore(backend_conn) config["eager_blobs"] = False + config["payload_strategy"] = payload_strategy else: codec = self._resolve_codec(payload_type, config.get("codec")) config["codec"] = codec @@ -239,12 +222,48 @@ def _create_backend( return backend + @staticmethod + def _deserialize_payload_strategy( + data: dict[str, Any] | None, + conn: sqlite3.Connection, + ) -> Any | None: + if data is None: + return None + strategy = deserialize_component(data) + SqliteStore._bind_payload_strategy(strategy, conn) + return strategy + + @staticmethod + def _payload_strategy_from_config( + config: dict[str, Any], conn: sqlite3.Connection + ) -> Any | None: + strategy = config.pop("payload_strategy", None) + if strategy is None: + return None + SqliteStore._bind_payload_strategy(strategy, conn) + return strategy + + @staticmethod + def _bind_payload_strategy(strategy: Any, conn: sqlite3.Connection) -> None: + bind = getattr(strategy, "bind_sqlite", None) + if bind is not None: + bind(conn) + def list_streams(self) -> list[str]: db_names = set(self._registry.list_streams()) return sorted(db_names | set(self._streams.keys())) def delete_stream(self, name: str) -> None: + stored = self._registry.get(name) + payload_strategy = None + if stored is not None: + payload_strategy = self._deserialize_payload_strategy( + stored.get("payload_strategy"), + self._registry_conn, + ) super().delete_stream(name) + if payload_strategy is not None and hasattr(payload_strategy, 
"delete_stream"): + payload_strategy.delete_stream(name) self._registry_conn.execute(f'DROP TABLE IF EXISTS "{name}"') self._registry_conn.execute(f'DROP TABLE IF EXISTS "{name}_blob"') self._registry_conn.execute(f'DROP TABLE IF EXISTS "{name}_vec"') diff --git a/dimos/memory2/test_payload_strategy.py b/dimos/memory2/test_payload_strategy.py new file mode 100644 index 0000000000..5bc0a92d00 --- /dev/null +++ b/dimos/memory2/test_payload_strategy.py @@ -0,0 +1,164 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import pytest + +from dimos.memory2.store.sqlite import SqliteStore +from dimos.memory2.type.observation import _UNLOADED + +if TYPE_CHECKING: + from collections.abc import Iterator + + from dimos.memory2.blobstore.base import BlobStore + from dimos.memory2.type.observation import Observation + + +class SuppressMeError(RuntimeError): + pass + + +class DoNotSuppressError(RuntimeError): + pass + + +class PrefixPayloadStrategy: + codec_id = "prefix" + + def __init__(self, prefix: str = "encoded:") -> None: + self.prefix = prefix + self.started = False + self.stopped = False + self.bound_sqlite = False + self.encoded_values: list[str] = [] + self.blob_rows: list[tuple[str, int, bytes]] = [] + + def start(self) -> None: + self.started = True + + def stop(self) -> None: + self.stopped = True + + def bind_sqlite(self, _conn: Any) -> None: + self.bound_sqlite = True + + def encode(self, value: str) -> bytes: + self.encoded_values.append(value) + return f"{self.prefix}{value}".encode() + + def after_blob_put(self, stream_name: str, row_id: int, encoded: bytes) -> None: + self.blob_rows.append((stream_name, row_id, encoded)) + + def _decode(self, raw: bytes) -> str: + value = raw.decode() + if not value.startswith(self.prefix): + raise ValueError("payload strategy prefix missing") + decoded = value.removeprefix(self.prefix) + if decoded == "skip": + raise SuppressMeError("skip this payload") + if decoded == "boom": + raise DoNotSuppressError("do not suppress this payload") + return decoded + + def make_loader(self, stream_name: str, row_id: int, blob_store: BlobStore) -> Any: + def loader() -> str: + return self._decode(blob_store.get(stream_name, row_id)) + + return loader + + def attach_loaders( + self, + stream_name: str, + observations: Iterator[Observation[str]], + blob_store: BlobStore, + ) -> Iterator[Observation[str]]: + for obs in observations: + obs.data_type = str + if obs._loader is 
None and isinstance(obs._data, type(_UNLOADED)): + row_id = obs.id + obs._loader = self.make_loader(stream_name, row_id, blob_store) + yield obs + + def should_suppress_decode_error(self, error: BaseException) -> bool: + return isinstance(error, SuppressMeError) + + def serialize(self) -> dict[str, Any]: + return { + "class": f"{type(self).__module__}.{type(self).__qualname__}", + "config": {"prefix": self.prefix}, + } + + +def test_payload_strategy_encodes_loads_and_stops(tmp_path) -> None: + strategy = PrefixPayloadStrategy(prefix="p:") + store = SqliteStore(path=str(tmp_path / "strategy.db")) + stream = store.stream("events", str, payload_strategy=strategy) + + appended = stream.append("hello", ts=1.0) + + assert strategy.bound_sqlite + assert strategy.started + assert strategy.encoded_values == ["hello"] + assert strategy.blob_rows == [("events", appended.id, b"p:hello")] + queried = stream.first() + assert queried._data is _UNLOADED + assert queried.data == "hello" + + store.stop() + assert strategy.stopped + + +def test_payload_strategy_persists_and_binds_on_reopen(tmp_path) -> None: + db = tmp_path / "strategy-reopen.db" + with SqliteStore(path=str(db)) as store: + stream = store.stream( + "events", + str, + payload_strategy=PrefixPayloadStrategy(prefix="stored:"), + ) + stream.append("hello", ts=1.0) + + with SqliteStore(path=str(db), must_exist=True) as reopened: + stream = reopened.stream("events", str) + assert stream._source is not None + strategy = stream._source.payload_strategy + assert isinstance(strategy, PrefixPayloadStrategy) + assert strategy.prefix == "stored:" + assert strategy.bound_sqlite + assert stream.first().data == "hello" + + +def test_replay_skips_strategy_suppressed_decode_errors(tmp_path) -> None: + store = SqliteStore(path=str(tmp_path / "strategy-replay.db")) + stream = store.stream("events", str, payload_strategy=PrefixPayloadStrategy()) + stream.append("first", ts=1.0) + stream.append("skip", ts=2.0) + stream.append("third", 
ts=3.0) + + assert list(store.replay().streams.events.iterate()) == ["first", "third"] + + +def test_replay_surfaces_non_suppressed_strategy_errors(tmp_path) -> None: + store = SqliteStore(path=str(tmp_path / "strategy-replay-error.db")) + stream = store.stream("events", str, payload_strategy=PrefixPayloadStrategy()) + stream.append("first", ts=1.0) + stream.append("boom", ts=2.0) + + replay_iter = store.replay().streams.events.iterate() + assert next(replay_iter) == "first" + with pytest.raises(DoNotSuppressError): + next(replay_iter) diff --git a/dimos/memory2/video/h264.py b/dimos/memory2/video/h264.py index 9ba73d0f7a..c517c1c67d 100644 --- a/dimos/memory2/video/h264.py +++ b/dimos/memory2/video/h264.py @@ -16,11 +16,8 @@ from dataclasses import asdict, dataclass, replace import sqlite3 -from typing import TYPE_CHECKING, Any, Generic, TypeVar +from typing import TYPE_CHECKING, Any -from dimos.memory2.backend import Backend -from dimos.memory2.codecs.pickle import PickleCodec -from dimos.memory2.notifier.subject import SubjectNotifier from dimos.memory2.type.observation import _UNLOADED from dimos.msgs.sensor_msgs.Image import Image, ImageFormat from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket @@ -36,13 +33,7 @@ from collections.abc import Iterator from dimos.memory2.blobstore.base import BlobStore - from dimos.memory2.notifier.base import Notifier - from dimos.memory2.observationstore.base import ObservationStore - from dimos.memory2.type.filter import StreamQuery from dimos.memory2.type.observation import Observation - from dimos.memory2.vectorstore.base import VectorStore - -T = TypeVar("T") @dataclass(frozen=True) @@ -127,11 +118,18 @@ def start(self) -> None: def stop(self) -> None: pass + def delete_stream(self, stream_name: str) -> None: + self._conn.execute("DELETE FROM h264_frames WHERE stream_name = ?", (stream_name,)) + def insert(self, stream_name: str, observation_id: int, packet: VideoPacket) -> None: keyframe_observation_id = ( 
observation_id if packet.is_keyframe - else self._keyframe_observation_id(stream_name, packet.keyframe_seq) + else self._keyframe_observation_id( + stream_name, + packet.keyframe_seq, + current_observation_id=observation_id, + ) ) self._conn.execute( """ @@ -207,123 +205,133 @@ def rows(self, stream_name: str) -> list[H264FrameIndexRow]: for row in rows ] - def _keyframe_observation_id(self, stream_name: str, keyframe_seq: int) -> int: + def _keyframe_observation_id( + self, + stream_name: str, + keyframe_seq: int, + *, + current_observation_id: int, + ) -> int: row = self._conn.execute( """ SELECT observation_id FROM h264_frames - WHERE stream_name = ? AND seq = ? AND is_keyframe = 1 + WHERE stream_name = ? AND seq = ? AND is_keyframe = 1 AND observation_id <= ? + ORDER BY observation_id DESC + LIMIT 1 """, - (stream_name, keyframe_seq), + (stream_name, keyframe_seq, current_observation_id), ).fetchone() if row is None: raise VideoDecodeGapError(f"No H.264 keyframe index for seq {keyframe_seq}") return int(row[0]) -class H264ImageBackend(Backend[Image], Generic[T]): - """memory2 backend that stores one H.264 packet blob per Image observation.""" +class H264ImagePayloadStrategy: + """Stateful H.264 payload strategy for logical ``Stream[Image]`` storage.""" + + codec_id = "h264" def __init__( self, *, - metadata_store: ObservationStore[Image], - blob_store: BlobStore, - frame_index: H264FrameIndexStore, - storage_config: H264ImageStorageConfig | None = None, - vector_store: VectorStore | None = None, - notifier: Notifier[Image] | None = None, - eager_blobs: bool = False, + storage_config: H264ImageStorageConfig | dict[str, Any] | None = None, + frame_index: H264FrameIndexStore | None = None, ) -> None: - self.storage_config = storage_config or H264ImageStorageConfig() - self.frame_index = frame_index - self._encoder = H264Encoder( - self.storage_config.codec, - codec=self.storage_config.codec_adapter, - ) - super().__init__( - metadata_store=metadata_store, - 
codec=PickleCodec(), - data_type=Image, - blob_store=blob_store, - vector_store=vector_store, - notifier=notifier or SubjectNotifier(), - eager_blobs=eager_blobs, + self.storage_config = ( + H264ImageStorageConfig.parse(storage_config) + if storage_config is not None + else H264ImageStorageConfig() ) + self.frame_index = frame_index + self._encoder: H264Encoder | None = None + + def bind_frame_index(self, frame_index: H264FrameIndexStore) -> None: + self.frame_index = frame_index + + def bind_sqlite(self, conn: sqlite3.Connection) -> None: + self.bind_frame_index(H264FrameIndexStore(conn)) def start(self) -> None: - super().start() + if self.frame_index is None: + raise RuntimeError("H.264 image payload strategy requires a frame index store") self.frame_index.start() - def _make_loader(self, row_id: int) -> Any: - bs = self.blob_store - if bs is None: - raise RuntimeError("BlobStore required for H.264 image storage") - name = self.name + def stop(self) -> None: + pass + + def encode(self, value: Image) -> bytes: + if not isinstance(value, Image): + raise TypeError( + f"H.264 image payload strategy expects Image, got {type(value).__name__}" + ) + if self._encoder is None: + self._encoder = H264Encoder( + self.storage_config.codec, + codec=self.storage_config.codec_adapter, + ) + return self._encoder.encode(value).lcm_encode() + + def after_blob_put(self, stream_name: str, row_id: int, encoded: bytes) -> None: frame_index = self.frame_index + if frame_index is None: + raise RuntimeError("H.264 image payload strategy requires a frame index store") + frame_index.insert(stream_name, row_id, VideoPacket.lcm_decode(encoded)) + + def make_loader(self, stream_name: str, row_id: int, blob_store: BlobStore) -> Any: storage_config = self.storage_config def loader() -> Image: - packet_ids = frame_index.packet_ids_for_decode(name, row_id) decoder = H264Decoder(storage_config.codec, codec=storage_config.codec_adapter) - decoded: Image | None = None - for packet_id in packet_ids: 
- packet = VideoPacket.lcm_decode(bs.get(name, packet_id)) - decoded = decoder.decode(packet) - if decoded is None: - raise VideoDecodeGapError(f"No H.264 packet available for observation {row_id}") - return decoded + packet = VideoPacket.lcm_decode(blob_store.get(stream_name, row_id)) + return decoder.decode(packet) return loader - def append(self, obs: Observation[Image]) -> Observation[Image]: - payload = obs.data - if not isinstance(payload, Image): - raise TypeError(f"Stream expects Image, got {type(payload).__qualname__}") - obs.data_type = Image - packet = self._encoder.encode(payload) - encoded = packet.lcm_encode() - try: - row_id = self.metadata_store.insert(obs) - obs.id = row_id - assert self.blob_store is not None - self.blob_store.put(self.name, row_id, encoded) - self.frame_index.insert(self.name, row_id, packet) - obs._data = _UNLOADED - obs._loader = self._make_loader(row_id) - if self.vector_store is not None: - emb = getattr(obs, "embedding", None) - if emb is not None: - self.vector_store.put(self.name, row_id, emb) - if hasattr(self.metadata_store, "commit"): - self.metadata_store.commit() - except BaseException: - if hasattr(self.metadata_store, "rollback"): - self.metadata_store.rollback() - raise - self.notifier.notify(obs) - return obs - - def _attach_loaders(self, it: Iterator[Observation[Image]]) -> Iterator[Observation[Image]]: - for obs in it: + def attach_loaders( + self, + stream_name: str, + observations: Iterator[Observation[Image]], + blob_store: BlobStore, + ) -> Iterator[Observation[Image]]: + decoder = H264Decoder(self.storage_config.codec, codec=self.storage_config.codec_adapter) + + for obs in observations: obs.data_type = Image if obs._loader is None and isinstance(obs._data, type(_UNLOADED)): - obs._loader = self._make_loader(obs.id) + row_id = obs.id + + def loader(row_id: int = row_id) -> Image: + packet = VideoPacket.lcm_decode(blob_store.get(stream_name, row_id)) + return decoder.decode(packet) + + obs._loader = loader 
yield obs - def _iterate_snapshot(self, query: StreamQuery) -> Iterator[Observation[Image]]: - it = self._attach_loaders(self.metadata_store.query(query)) - if self.eager_blobs: - for obs in it: - _ = obs.data - yield obs - else: - yield from it + def should_suppress_decode_error(self, error: BaseException) -> bool: + return isinstance(error, VideoDecodeGapError) + + def delete_stream(self, stream_name: str) -> None: + if self.frame_index is not None: + self.frame_index.delete_stream(stream_name) def serialize(self) -> dict[str, Any]: - cfg = super().serialize() - cfg["codec_id"] = "h264" - cfg["image_storage"] = self.storage_config.serialize() - return cfg + return { + "class": f"{type(self).__module__}.{type(self).__qualname__}", + "config": {"storage_config": self.storage_config.serialize()}, + } + + +def h264_image_payload_strategy_from_any(raw: Any) -> H264ImagePayloadStrategy | None: + storage_config = storage_config_from_any(raw) + if storage_config is None: + return None + return H264ImagePayloadStrategy(storage_config=storage_config) + + +def bind_sqlite_frame_index(strategy: Any, conn: sqlite3.Connection) -> Any: + if isinstance(strategy, H264ImagePayloadStrategy): + strategy.bind_frame_index(H264FrameIndexStore(conn)) + return strategy def storage_config_from_any(raw: Any) -> H264ImageStorageConfig | None: @@ -345,8 +353,10 @@ def storage_config_with_adapter( __all__ = [ "H264FrameIndexRow", "H264FrameIndexStore", - "H264ImageBackend", + "H264ImagePayloadStrategy", "H264ImageStorageConfig", + "bind_sqlite_frame_index", + "h264_image_payload_strategy_from_any", "storage_config_from_any", "storage_config_with_adapter", ] diff --git a/dimos/memory2/video/test_h264_storage.py b/dimos/memory2/video/test_h264_storage.py index cad6c0803f..10297039c3 100644 --- a/dimos/memory2/video/test_h264_storage.py +++ b/dimos/memory2/video/test_h264_storage.py @@ -19,6 +19,7 @@ import numpy as np import pytest +from dimos.memory2.backend import Backend from 
dimos.memory2.blobstore.sqlite import SqliteBlobStore from dimos.memory2.codecs.pickle import PickleCodec from dimos.memory2.observationstore.sqlite import SqliteObservationStore @@ -26,8 +27,9 @@ from dimos.memory2.type.observation import _UNLOADED from dimos.memory2.video.h264 import ( H264FrameIndexStore, - H264ImageBackend, + H264ImagePayloadStrategy, H264ImageStorageConfig, + h264_image_payload_strategy_from_any, storage_config_from_any, ) from dimos.msgs.sensor_msgs.Image import Image, ImageFormat @@ -61,17 +63,22 @@ def _image(seq: int, fmt: ImageFormat = ImageFormat.RGB) -> Image: def _make_backend( conn: sqlite3.Connection, *, config: H264ImageStorageConfig | None = None -) -> H264ImageBackend: +) -> Backend[Image]: frame_index = H264FrameIndexStore(conn) + strategy = H264ImagePayloadStrategy( + storage_config=config or H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()), + frame_index=frame_index, + ) blob_store = SqliteBlobStore(conn=conn) obs_store = SqliteObservationStore( conn=conn, name="cam", codec=PickleCodec(), blob_store_conn_match=False, page_size=256 ) - backend = H264ImageBackend( + backend = Backend( metadata_store=obs_store, + codec=PickleCodec(), + data_type=Image, blob_store=blob_store, - frame_index=frame_index, - storage_config=config or H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()), + payload_strategy=strategy, ) backend.start() return backend @@ -84,6 +91,7 @@ def test_storage_config_parse_and_serialize() -> None: assert parsed.mode == "h264" assert parsed.codec == config.codec assert storage_config_from_any(raw) == H264ImageStorageConfig(codec=config.codec) + assert isinstance(h264_image_payload_strategy_from_any(raw), H264ImagePayloadStrategy) assert storage_config_from_any({"mode": "jpeg", "codec": raw["codec"]}) is None @@ -92,11 +100,14 @@ def test_store_creates_h264_backend_from_config(tmp_path) -> None: backend = store._create_backend( "cam", Image, - 
image_storage=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()), + payload_strategy=H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + ), ) - assert isinstance(backend, H264ImageBackend) - assert backend.storage_config.mode == "h264" - assert isinstance(backend.storage_config.codec_adapter, FakeH264CodecAdapter) + assert isinstance(backend, Backend) + assert isinstance(backend.payload_strategy, H264ImagePayloadStrategy) + assert backend.payload_strategy.storage_config.mode == "h264" + assert isinstance(backend.payload_strategy.storage_config.codec_adapter, FakeH264CodecAdapter) def test_h264_image_stream_keeps_default_jpeg_compatibility(tmp_path) -> None: @@ -116,60 +127,133 @@ def test_h264_one_observation_and_one_blob_per_frame(tmp_path) -> None: assert stored.id == 1 assert backend.blob_store is not None assert backend.blob_store.get("cam", 1) - assert len(backend.frame_index.rows("cam")) == 1 + assert isinstance(backend.payload_strategy, H264ImagePayloadStrategy) + assert backend.payload_strategy.frame_index is not None + assert len(backend.payload_strategy.frame_index.rows("cam")) == 1 def test_h264_persistent_gop_index_and_lazy_decode(tmp_path) -> None: db = tmp_path / "gop.db" with SqliteStore(path=str(db)) as store: stream = store.stream( - "cam", Image, image_storage=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + "cam", + Image, + payload_strategy=H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + ), ) stream.append(_image(1), ts=1.0) stream.append(_image(2), ts=2.0) - obs = list(stream)[1] + observations = list(stream) + obs = observations[1] assert obs._loader is not None assert obs._data is _UNLOADED assert obs.id == 2 assert obs.ts == 2.0 + assert observations[0].data.data.shape == (2, 2, 3) assert obs.data.data.shape == (2, 2, 3) backend = stream._source - assert isinstance(backend, H264ImageBackend) - assert 
len(backend.frame_index.rows("cam")) == 2 + assert isinstance(backend.payload_strategy, H264ImagePayloadStrategy) + assert backend.payload_strategy.frame_index is not None + assert len(backend.payload_strategy.frame_index.rows("cam")) == 2 with SqliteStore(path=str(db), must_exist=True) as reopened: stream = reopened.stream("cam", Image) assert stream.count() == 2 backend = stream._source - assert isinstance(backend, H264ImageBackend) - assert backend.storage_config.mode == "h264" - backend.storage_config = H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + assert isinstance(backend.payload_strategy, H264ImagePayloadStrategy) + assert backend.payload_strategy.storage_config.mode == "h264" + backend.payload_strategy.storage_config = H264ImageStorageConfig( + codec_adapter=FakeH264CodecAdapter() + ) assert reopened.streams.cam.first().data.data.shape == (2, 2, 3) +def test_h264_reopen_append_uses_nearest_reset_sequence_keyframe(tmp_path) -> None: + db = tmp_path / "reopen_append.db" + conn = sqlite3.connect(str(db)) + backend = _make_backend(conn) + from dimos.memory2.type.observation import Observation + + backend.append(Observation(ts=1.0, data_type=Image, _data=_image(1))) + backend.append(Observation(ts=2.0, data_type=Image, _data=_image(2))) + backend.stop() + conn.close() + + reopened_conn = sqlite3.connect(str(db)) + reopened_backend = _make_backend(reopened_conn) + reopened_backend.append(Observation(ts=3.0, data_type=Image, _data=_image(3))) + reopened_backend.append(Observation(ts=4.0, data_type=Image, _data=_image(4))) + + assert isinstance(reopened_backend.payload_strategy, H264ImagePayloadStrategy) + assert reopened_backend.payload_strategy.frame_index is not None + rows = reopened_backend.payload_strategy.frame_index.rows("cam") + assert [(row.observation_id, row.seq, row.keyframe_observation_id) for row in rows] == [ + (1, 0, 1), + (2, 1, 1), + (3, 0, 3), + (4, 1, 3), + ] + + def test_h264_mid_gop_decode_and_missing_gop_failure(tmp_path) -> 
None: store = SqliteStore(path=str(tmp_path / "gap.db")) stream = store.stream( - "cam", Image, image_storage=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + "cam", + Image, + payload_strategy=H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + ), ) stream.append(_image(1)) stream.append(_image(2)) stream.append(_image(3)) - obs = list(stream)[2] - assert obs.data.data[0, 0, 0] == 3 + observations = list(stream) + assert [obs.data.data[0, 0, 0] for obs in observations] == [1, 2, 3] backend = stream._source - assert isinstance(backend, H264ImageBackend) - backend.frame_index._conn.execute("DELETE FROM h264_frames WHERE observation_id = 2") - gap_obs = list(stream)[1] + assert isinstance(backend.payload_strategy, H264ImagePayloadStrategy) + assert backend.payload_strategy.frame_index is not None + assert backend.blob_store is not None + backend.blob_store.delete("cam", 2) + gap_observations = list(stream) + assert gap_observations[0].data.data[0, 0, 0] == 1 + with pytest.raises(KeyError): + _ = gap_observations[1].data + gap_obs = gap_observations[2] with pytest.raises(VideoDecodeGapError): _ = gap_obs.data +def test_h264_replay_seek_suppresses_delta_until_next_keyframe(tmp_path) -> None: + config = H264ImageStorageConfig( + codec_adapter=FakeH264CodecAdapter(), + codec=H264ImageStorageConfig().codec, + ) + store = SqliteStore(path=str(tmp_path / "seek.db")) + stream = store.stream( + "cam", + Image, + payload_strategy=H264ImagePayloadStrategy(storage_config=config), + ) + for seq in range(1, 34): + stream.append(_image(seq), ts=float(seq)) + + replay = store.replay(from_timestamp=2.0) + images = list(replay.streams.cam.iterate()) + + assert images[0].ts == 31.0 + assert [img.data[0, 0, 0] for img in images[:3]] == [31, 32, 33] + + def test_replay_iterate_returns_decoded_images(tmp_path) -> None: store = SqliteStore(path=str(tmp_path / "replay.db")) stream = store.stream( - "cam", Image, 
image_storage=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + "cam", + Image, + payload_strategy=H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + ), ) stream.append(_image(1), ts=1.0) stream.append(_image(2), ts=2.0) @@ -183,8 +267,30 @@ def test_replay_iterate_returns_decoded_images(tmp_path) -> None: def test_h264_rejects_unsupported_formats(tmp_path) -> None: store = SqliteStore(path=str(tmp_path / "bad.db")) stream = store.stream( - "cam", Image, image_storage=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + "cam", + Image, + payload_strategy=H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + ), ) rgba = np.zeros((2, 2, 4), dtype=np.uint8) with pytest.raises(UnsupportedVideoImageError): stream.append(Image.from_numpy(rgba, format=ImageFormat.RGBA)) + + +def test_sqlite_delete_stream_removes_h264_frame_index_rows(tmp_path) -> None: + db = tmp_path / "delete.db" + store = SqliteStore(path=str(db)) + stream = store.stream( + "cam", + Image, + payload_strategy=H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) + ), + ) + stream.append(_image(1)) + store.delete_stream("cam") + + conn = sqlite3.connect(str(db)) + count = conn.execute("SELECT COUNT(*) FROM h264_frames WHERE stream_name = 'cam'").fetchone()[0] + assert count == 0 diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py index b55060625c..5eaca03886 100644 --- a/dimos/msgs/sensor_msgs/Image.py +++ b/dimos/msgs/sensor_msgs/Image.py @@ -15,7 +15,7 @@ from __future__ import annotations import base64 -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum import time from typing import TYPE_CHECKING, Any, Literal, TypedDict @@ -39,16 +39,6 @@ from reactivex.observable import Observable -_DEFAULT_IMAGE_DATA = object() - - -class _LazyPixelData: - pass - - 
-_UNLOADED_PIXELS = _LazyPixelData() - - class ImageFormat(Enum): BGR = "BGR" RGB = "RGB" @@ -92,92 +82,24 @@ class AgentImageMessage(TypedDict): data: str # Base64 encoded image data -@dataclass(init=False) +@dataclass class Image(Timestamped): """Simple NumPy-based image container.""" msg_name = "sensor_msgs.Image" - def __init__( - self, - data: Any = _DEFAULT_IMAGE_DATA, - format: ImageFormat = ImageFormat.BGR, - frame_id: str = "", - ts: float | None = None, - *, - pixel_loader: Callable[[], np.ndarray[Any, np.dtype[Any]]] | None = None, - height: int | None = None, - width: int | None = None, - channels: int | None = None, - dtype: np.dtype[Any] | type[Any] | None = None, - ) -> None: - self.format = format - self.frame_id = frame_id - self.ts = ts if ts is not None else time.time() - self._pixel_loader = pixel_loader - - if pixel_loader is None: - if data is _DEFAULT_IMAGE_DATA: - data = np.zeros((1, 1, 3), dtype=np.uint8) - self.data = data - return - - if height is None or width is None or dtype is None: - raise ValueError("Lazy Image construction requires height, width, and dtype metadata") - if height <= 0 or width <= 0: - raise ValueError("Lazy Image height and width must be positive") - self._data: np.ndarray[Any, np.dtype[Any]] | _LazyPixelData = _UNLOADED_PIXELS - self._height = int(height) - self._width = int(width) - self._channels = int(channels or 1) - self._dtype = np.dtype(dtype) - - @classmethod - def lazy( - cls, - *, - pixel_loader: Callable[[], np.ndarray[Any, np.dtype[Any]]], - height: int, - width: int, - format: ImageFormat = ImageFormat.BGR, - frame_id: str = "", - ts: float | None = None, - channels: int | None = None, - dtype: np.dtype[Any] | type[Any] = np.uint8, - ) -> Image: - """Construct an image whose pixels are materialized on first data access.""" - - return cls( - format=format, - frame_id=frame_id, - ts=ts, - pixel_loader=pixel_loader, - height=height, - width=width, - channels=channels, - dtype=dtype, - ) - - @property - 
def data(self) -> np.ndarray[Any, np.dtype[Any]]: - if isinstance(self._data, _LazyPixelData): - if self._pixel_loader is None: - raise ValueError("Lazy Image has no pixel loader") - self.data = self._pixel_loader() - self._pixel_loader = None - return self._data - - @data.setter - def data(self, value: Any) -> None: - arr = value if isinstance(value, np.ndarray) else np.asarray(value) - if arr.ndim < 2: + data: np.ndarray[Any, np.dtype[Any]] = field( + default_factory=lambda: np.zeros((1, 1, 3), dtype=np.uint8) + ) + format: ImageFormat = field(default=ImageFormat.BGR) + frame_id: str = field(default="") + ts: float = field(default_factory=time.time) + + def __post_init__(self) -> None: + if not isinstance(self.data, np.ndarray): + self.data = np.asarray(self.data) + if self.data.ndim < 2: raise ValueError("Image requires a 2D/3D NumPy array") - self._data = arr - self._height = int(arr.shape[0]) - self._width = int(arr.shape[1]) - self._channels = 1 if arr.ndim == 2 else int(arr.shape[2]) - self._dtype = arr.dtype - self._pixel_loader = None def __str__(self) -> str: return ( @@ -212,25 +134,27 @@ def __setstate__(self, state: dict[str, Any]) -> None: @property def height(self) -> int: - return self._height + return int(self.data.shape[0]) @property def width(self) -> int: - return self._width + return int(self.data.shape[1]) @property def channels(self) -> int: - return self._channels + if self.data.ndim == 2: + return 1 + if self.data.ndim == 3: + return int(self.data.shape[2]) + raise ValueError("Invalid image dimensions") @property def shape(self) -> tuple[int, ...]: - if self.channels == 1: - return (self.height, self.width) - return (self.height, self.width, self.channels) + return tuple(self.data.shape) @property def dtype(self) -> np.dtype[Any]: - return self._dtype + return self.data.dtype def copy(self) -> Image: return Image(data=self.data.copy(), format=self.format, frame_id=self.frame_id, ts=self.ts) diff --git a/dimos/msgs/sensor_msgs/test_image.py 
b/dimos/msgs/sensor_msgs/test_image.py index 1214400e3a..354a52b78d 100644 --- a/dimos/msgs/sensor_msgs/test_image.py +++ b/dimos/msgs/sensor_msgs/test_image.py @@ -69,52 +69,7 @@ def test_opencv_conversion(img: Image) -> None: assert decoded_img == img -def test_lazy_image_metadata_does_not_materialize_pixels() -> None: - calls = 0 - - def load() -> np.ndarray: - nonlocal calls - calls += 1 - return np.ones((3, 4, 3), dtype=np.uint8) - - img = Image.lazy( - pixel_loader=load, - height=3, - width=4, - channels=3, - dtype=np.uint8, - format=ImageFormat.RGB, - frame_id="cam", - ts=10.0, - ) - - assert img.height == 3 - assert img.width == 4 - assert img.channels == 3 - assert img.shape == (3, 4, 3) - assert img.dtype == np.dtype(np.uint8) - assert img.format == ImageFormat.RGB - assert img.frame_id == "cam" - assert img.ts == 10.0 - assert calls == 0 - - -def test_lazy_image_data_materializes_once() -> None: - calls = 0 - - def load() -> np.ndarray: - nonlocal calls - calls += 1 - return np.ones((3, 4, 3), dtype=np.uint8) - - img = Image.lazy(pixel_loader=load, height=3, width=4, channels=3, dtype=np.uint8) - - assert img.data.sum() == 36 - assert img.data.sum() == 36 - assert calls == 1 - - -def test_eager_image_compatibility_after_lazy_support() -> None: +def test_eager_image_compatibility() -> None: data = np.ones((2, 3, 3), dtype=np.uint8) img = Image(data=data, format=ImageFormat.BGR, frame_id="cam", ts=11.0) diff --git a/dimos/protocol/video/demo_h264_video_e2e.py b/dimos/protocol/video/demo_h264_video_e2e.py index 168d7339d2..a848bd1eda 100644 --- a/dimos/protocol/video/demo_h264_video_e2e.py +++ b/dimos/protocol/video/demo_h264_video_e2e.py @@ -30,7 +30,7 @@ from dimos.hardware.sensors.camera.webcam import Webcam from dimos.memory2.module import OnExisting, Recorder from dimos.memory2.store.sqlite import SqliteStore -from dimos.memory2.video.h264 import H264ImageStorageConfig +from dimos.memory2.video.h264 import H264ImagePayloadStrategy, 
H264ImageStorageConfig from dimos.msgs.sensor_msgs.Image import Image, ImageFormat from dimos.protocol.pubsub.impl.h264_lcm import H264LCM from dimos.protocol.video.h264 import H264Config @@ -224,8 +224,10 @@ def _webcam() -> Webcam: H264E2ERecorder.blueprint( db_path="h264_video_e2e.db", on_existing=OnExisting.OVERWRITE, - image_storage={ - "color_image": H264ImageStorageConfig(codec=_h264_config), + payload_strategies={ + "color_image": H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig(codec=_h264_config) + ), }, ), H264VideoProbe.blueprint(), @@ -245,8 +247,10 @@ def _webcam() -> Webcam: H264WebcamRecorder.blueprint( db_path="webcam_h264.db", on_existing=OnExisting.OVERWRITE, - image_storage={ - "color_image": H264ImageStorageConfig(codec=_webcam_h264_config), + payload_strategies={ + "color_image": H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig(codec=_webcam_h264_config) + ), }, ), ).transports( diff --git a/docs/capabilities/memory/index.md b/docs/capabilities/memory/index.md index 739ab81f28..b95b024f8b 100644 --- a/docs/capabilities/memory/index.md +++ b/docs/capabilities/memory/index.md @@ -215,7 +215,7 @@ matters and frame-to-frame compression is worth the dependency cost. 
```python skip from dimos.memory2.store.sqlite import SqliteStore -from dimos.memory2.video.h264 import H264ImageStorageConfig +from dimos.memory2.video.h264 import H264ImagePayloadStrategy, H264ImageStorageConfig from dimos.msgs.sensor_msgs.Image import Image from dimos.protocol.video.h264 import H264Config @@ -223,8 +223,10 @@ store = SqliteStore(path="robot_video.db") color = store.stream( "color_image", Image, - image_storage=H264ImageStorageConfig( - codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + payload_strategy=H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig( + codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + ), ), ) ``` @@ -233,14 +235,16 @@ Recorders can configure the same setting per input stream: ```python skip from dimos.memory2.module import Recorder -from dimos.memory2.video.h264 import H264ImageStorageConfig +from dimos.memory2.video.h264 import H264ImagePayloadStrategy, H264ImageStorageConfig from dimos.protocol.video.h264 import H264Config recorder = Recorder.blueprint( db_path="robot_video.db", - image_storage={ - "color_image": H264ImageStorageConfig( - codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + payload_strategies={ + "color_image": H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig( + codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + ), ) }, ) @@ -249,13 +253,14 @@ recorder = Recorder.blueprint( H.264 storage keeps the normal memory2 shape: one observation row per source frame. The blob for that observation stores one serialized video packet whose payload is a complete H.264 Annex B access unit, not individual RTP fragments. -The store also writes GOP metadata so lazy decode and replay can start at the -nearest prior keyframe. +The store also writes H.264 frame metadata for cleanup, diagnostics, and future +indexed decode work. Metadata queries do not decode pixels. 
You can inspect timestamps, poses, tags, -frame ids, and dimensions without paying decode cost. Accessing `obs.data` -decodes lazily from the nearest usable keyframe through the requested frame and -returns a normal `Image`. Replay emits decoded `Image` values in timestamp order. +and frame ids without paying decode cost. Accessing `obs.data` decodes lazily +when the H.264 decode session has valid GOP state and returns a normal `Image`. +Replay emits decoded `Image` values in timestamp order and suppresses deltas +until the first keyframe at or after the replay start point. H.264 storage currently supports uint8 RGB, BGR, and grayscale images. It raises an explicit error for depth images, 16-bit images, alpha formats, and other @@ -279,6 +284,6 @@ codec or storage changes to inspect: - logs from the source, recorder, and probe; - memory2 metadata queries that do not touch `obs.data`; -- lazy `obs.data` decode for both keyframe and mid-GOP observations; +- lazy `obs.data` decode after a valid keyframe, with best-effort suppression of undecodable deltas; - replay of the recorded stream; and - sequence-gap behavior, if you inject packet loss in the transport tests. diff --git a/docs/coding-agents/style.md b/docs/coding-agents/style.md index 86ef7058e1..7e903b408b 100644 --- a/docs/coding-agents/style.md +++ b/docs/coding-agents/style.md @@ -59,5 +59,5 @@ authors or memory2 observations. For LCM, DDS, and memory2 storage, each encoded packet must contain all H.264 NAL units for exactly one source frame as one Annex B access unit. Store one memory2 observation per source frame. P-frames still depend on earlier GOP state, so -decode from a valid keyframe and suppress output after sequence gaps until the -next keyframe. +decode from a valid keyframe and suppress output after sequence gaps, late join, +or replay seek until the next keyframe. 
diff --git a/docs/development/testing.md b/docs/development/testing.md index fd7631635b..b402fd8114 100644 --- a/docs/development/testing.md +++ b/docs/development/testing.md @@ -67,7 +67,7 @@ pytest -m self_hosted dimos/path/to/test_something.py The H.264 unit tests use fake codec adapters where possible, so they run in the default suite without requiring FFmpeg/libx264. Run the focused tests after -changing video packet shape, lazy `Image` behavior, H.264 transport, memory2 +changing video packet shape, eager `Image` compatibility, H.264 transport, memory2 storage, or the demo blueprint: ```bash diff --git a/docs/usage/transports/index.md b/docs/usage/transports/index.md index 74dd14674b..7d5a30113d 100644 --- a/docs/usage/transports/index.md +++ b/docs/usage/transports/index.md @@ -152,6 +152,10 @@ suppresses decoded output until the next keyframe. Keyframes include decoder parameter data, such as SPS/PPS, so a new subscriber can start decoding at a keyframe. +LCM H.264 is best-effort in v1. DimOS does not yet provide transport QoS, +durable keyframe cache, keyframe request, or PLI behavior for LCM. Those belong +in a later QoS/video-session design. + The first H.264 image path supports uint8 RGB, BGR, and grayscale images. It raises an explicit error for depth, 16-bit, alpha, or other unsupported image formats instead of silently converting pixels. Selecting H.264 requires the video diff --git a/openspec/changes/add-h264-codec-mem2-storage/design.md b/openspec/changes/add-h264-codec-mem2-storage/design.md index 7919a641db..fa9dd99fbb 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/design.md +++ b/openspec/changes/add-h264-codec-mem2-storage/design.md @@ -65,7 +65,7 @@ Core packet and codec classes: - Fields: `seq`, `ts`, `frame_id`, `width`, `height`, `format`, `codec`, `bitstream`, `is_keyframe`, `keyframe_seq`, `pts`, `data`. - First supported `codec`: `h264`. 
- First supported `bitstream`: Annex B complete access unit for exactly one source frame, aligned with Foxglove `CompressedVideo` expectations: for every full-frame encoder input call, DimOS creates one `VideoPacket` containing all NAL units emitted for that input frame. - - A `VideoPacket` is a complete encoded-frame packet, not necessarily an independently decodable image. Keyframe packets must contain enough decoder bootstrap data for late join and random access, including SPS/PPS on every IDR; delta-frame packets require prior decoded GOP state. + - A `VideoPacket` is a complete encoded-frame packet, not necessarily an independently decodable image. Keyframe packets must contain enough decoder bootstrap data for late join and recovery, including SPS/PPS on every IDR; delta-frame packets require prior decoded GOP state. - `dimos/protocol/video/h264.py` - `H264Config`: bitrate, target fps, keyframe interval, profile, preset/tune, max GOP frames, pixel format. @@ -82,13 +82,12 @@ Implementation dependency: - DimOS should assemble the aiortc payloads for one encoded source frame into a single Annex B `VideoPacket.data` value before publication/storage. This packet carries every NAL unit emitted for that encoder input frame, but only IDR/keyframe packets are expected to be independently bootstrappable. WebRTC carriers may keep aiortc RTP packetization internally, but LCM/DDS/memory2 should exchange complete access units. - The adapter should avoid leaking aiortc classes such as `JitterFrame` and RTP payload descriptors into DimOS public APIs. If future aiortc versions change these codec internals, only `AiortcH264Codec` should need adjustment. -Image lazy data support: +Image payload semantics: - `dimos/msgs/sensor_msgs/Image.py` - - Add an explicit lazy pixel path mirroring `Observation`: metadata fields remain available, while `data` materializes pixels on access. - - `height`, `width`, `format`, `frame_id`, and `ts` must be available without forcing decode. 
- - Existing eager construction remains valid. - - This is needed for transport subscribers that inspect metadata or keep only the latest frame without always decoding pixels. + - Keep `Image` as the eager numpy-backed payload used by existing modules, transports, visualization, and JPEG storage. + - H.264 laziness belongs at memory2's `Observation.data` boundary, not inside `Image`. + - When H.264 decode succeeds, `obs.data` returns a normal eager `Image`. LCM carrier classes: @@ -112,22 +111,21 @@ WebRTC carrier classes, later: memory2 storage classes: - `dimos/memory2/video/h264.py` - - `H264ImageStorageConfig`: mode/config object for opt-in memory2 H.264 image storage. - - `H264ImageBackend`: image-specific backend or payload strategy that owns encoder state and writes one observation row plus one `VideoPacket` blob per frame. - - `H264FrameIndexStore`: creates and queries a standalone GOP index table. - - `H264ObservationLoader`: reconstructs a requested frame by loading the nearest keyframe packet and ordered delta packets through the requested observation. - - `H264ReplayDecodeSession`: shares decoder state during sequential replay so adjacent frames decode once. + - `H264ImagePayloadStrategy`: generic memory2 payload strategy for logical `Stream[Image]` storage. + - `H264ImageStorageConfig`: config object consumed by the payload strategy. + - `H264FrameIndexStore`: stores H.264 frame metadata for cleanup, diagnostics, and future indexed decode work. + - The strategy owns encoder state on append and writes one observation row plus one serialized `VideoPacket` blob per source frame. + - Observation loaders and replay use the same H.264 decode-session policy as live transport: deltas are suppressed until a valid keyframe establishes decoder state. Store/recorder integration: - `dimos/memory2/store/sqlite.py` - - Recognize image storage config when creating a stream. - - Route `Image` streams with `mode="h264"` to the H.264 image backend. 
- - Persist storage config in `_streams` so reopening the database selects the right loader. + - Persist generic `payload_strategy` config in `_streams` so reopening the database restores the selected payload strategy. + - Bind SQLite-backed auxiliary stores to strategies through generic strategy hooks rather than H.264-specific `Store` branches. - `dimos/memory2/module.py` - - Add recorder-level per-stream image storage configuration. - - Recorder still subscribes to `In[Image]`; storage mode controls how incoming images are persisted. + - Add recorder-level per-stream `payload_strategies` configuration. + - Recorder still subscribes to `In[Image]`; the payload strategy controls how incoming images are persisted. ### Where components run @@ -147,7 +145,7 @@ Subscriber machine / worker process H264LcmTransport.subscribe() └─ LCM receives packet bytes └─ GopBuffer validates seq/keyframe state - └─ H264Decoder produces Image or lazy Image + └─ H264Decoder produces eager Image └─ module In[Image] callback ``` @@ -157,10 +155,10 @@ memory2 recording path: Recorder module process In[Image] receives normal Image └─ stream.append(Image) - └─ H264ImageBackend owns encoder state + └─ generic Backend delegates payload bytes to H264ImagePayloadStrategy ├─ observation table row: ts / pose / tags ├─ blob row: serialized VideoPacket with complete Annex B access unit - └─ h264 frame index: seq / keyframe row / pts / format + └─ h264 frame metadata: seq / keyframe / pts / format ``` memory2 replay/decode path: @@ -169,8 +167,9 @@ memory2 replay/decode path: Replay or query process stream query returns Observation[Image] metadata └─ obs.data - └─ H264ObservationLoader loads keyframe + delta packet chain - └─ H264Decoder reconstructs Image + └─ H264 payload strategy decodes through H264Decoder session state + ├─ delta before valid keyframe: suppress/fail clearly + └─ keyframe and following deltas: return eager Image ``` The first implementation may re-encode images when recording a 
decoded `Image` stream that originally arrived over H.264 transport. Preserving incoming packet bytes end-to-end can be a later optimization via a packet side-channel; it is not required to make the public behavior correct. @@ -221,14 +220,16 @@ blueprint = autoconnect(camera(), consumer()).transports( memory2 direct store activation: ```python -from dimos.memory2.video.h264 import H264ImageStorageConfig +from dimos.memory2.video.h264 import H264ImagePayloadStrategy, H264ImageStorageConfig from dimos.protocol.video.h264 import H264Config stream = store.stream( "color_image", Image, - image_storage=H264ImageStorageConfig( - codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + payload_strategy=H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig( + codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + ), ), ) ``` @@ -237,9 +238,11 @@ Recorder activation: ```python MyRecorder.blueprint( - image_storage={ - "color_image": H264ImageStorageConfig( - codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + payload_strategies={ + "color_image": H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig( + codec=H264Config(bitrate=2_000_000, keyframe_interval=30), + ), ) } ) @@ -271,7 +274,7 @@ Components: - `H264E2ERecorder(Recorder)` - Declares `color_image: In[Image]`. - - Uses recorder-level `image_storage={"color_image": H264ImageStorageConfig(...)}` so memory2 writes the received image stream as H.264 packets rather than JPEG blobs. + - Uses recorder-level `payload_strategies={"color_image": H264ImagePayloadStrategy(...)}` so memory2 writes the received image stream as H.264 packets rather than JPEG blobs. - Defaults `db_path` to an explicit temporary/demo path such as `h264_video_e2e.db` so manual QA can inspect it. 
- `H264VideoProbe(Module)` @@ -284,7 +287,7 @@ Blueprint sketch: ```python from dimos.core.coordination.blueprints import autoconnect from dimos.core.transport import H264LcmTransport -from dimos.memory2.video.h264 import H264ImageStorageConfig +from dimos.memory2.video.h264 import H264ImagePayloadStrategy, H264ImageStorageConfig from dimos.msgs.sensor_msgs import Image from dimos.protocol.video.h264 import H264Config @@ -300,8 +303,10 @@ demo_h264_video_e2e = autoconnect( SyntheticVideoSource.blueprint(width=640, height=360, fps=30), H264E2ERecorder.blueprint( db_path="h264_video_e2e.db", - image_storage={ - "color_image": H264ImageStorageConfig(codec=h264_config), + payload_strategies={ + "color_image": H264ImagePayloadStrategy( + storage_config=H264ImageStorageConfig(codec=h264_config), + ), }, ), H264VideoProbe.blueprint(expected_width=640, expected_height=360), @@ -326,7 +331,7 @@ Manual QA contract: - Run `dimos run demo-h264-video-e2e --daemon`. - Confirm logs show H.264 encoder initialization, periodic keyframes, probe frame counts, and recorder append counts. - Open the produced memory2 store and query `color_image` observations without touching `obs.data`; metadata should be available without decode. -- Access `obs.data` on a keyframe and a mid-GOP delta frame; both should return decoded `Image` pixels, with the mid-GOP read decoding from the nearest prior keyframe. +- Access `obs.data` during ordered replay/query. Delta frames before the first valid keyframe after the start point may be suppressed or fail clearly; the first keyframe at or after the start point and later deltas should return decoded `Image` pixels. - Replay the stored stream and confirm decoded images arrive on the normal replay schedule. - Run a seq-gap variant, either by a test-only packet drop option in `H264LcmTransport` or a direct `GopBuffer` driver, and verify the probe receives no corrupted images and resumes only after the next keyframe. 
@@ -381,10 +386,10 @@ No CLI command is required for the core feature. The synthetic `demo-h264-video- - Alternative rejected: MP4 segment files as the primary model, because live transports and per-frame memory2 replay become harder to align. 3. **Keep `codec_for(Image)` as JPEG.** - - Rationale: H.264 writes need stateful encoder ownership and GOP indexing; the stateless memory2 `Codec` contract should remain simple and backward compatible. + - Rationale: H.264 writes need stateful encoder ownership; the stateless memory2 `Codec` contract should remain simple and backward compatible. H.264 storage uses a generic payload strategy instead of changing the default codec. 4. **Decode only from valid GOP state.** - - Rationale: missing H.264 packets can corrupt decoded pixels. After a seq gap, subscribers and storage loaders should suppress or fail decode until a keyframe restores a self-contained GOP. + - Rationale: missing H.264 packets can corrupt decoded pixels. After a seq gap, late join, or replay seek into a GOP, subscribers and memory2 replay should suppress or fail decode until a keyframe restores a self-contained GOP. - Key detail: complete per-frame access units remove RTP-fragment handling from DimOS storage, but they do not remove inter-frame dependencies; P-frames still require prior decoded reference frames. 5. **Use aiortc's H.264 codec classes through a DimOS adapter.** @@ -410,28 +415,30 @@ This change affects image transport and recording only. It does not command robo Simulation and hardware cameras use the same `Image` semantics. Unsupported image formats such as depth or 16-bit images should fail at H.264 configuration/append/publish time with a clear error, not silently convert or corrupt data. -Replay must emit normal decoded `Image` objects on the existing memory2 replay schedule. Sequential replay should share decoder state so normal playback decodes each packet once. 
Seek or random access may decode from the nearest prior keyframe through the requested frame. +Replay must emit normal decoded `Image` objects on the existing memory2 replay schedule. Sequential replay should share decoder state so normal playback decodes each packet once. + +V1 H.264 decode is best-effort. Late subscribers and memory2 replay/query starting at timestamp `T` start without prior GOP state; delta frames are suppressed until the first keyframe at or after `T`, then that keyframe and following decodable deltas are available. Full QoS, durable keyframe cache, keyframe request/PLI, and indexed random decode are follow-up design work. Manual QA should use the synthetic `demo-h264-video-e2e` blueprint so no robot or physical camera is required. The demo should verify live LCM round-trip, memory2 append/query without decode, lazy `obs.data` decode, replay, and seq-gap behavior. ## Risks / Trade-offs - **Stateful codec complexity:** H.264 has encoder and decoder state. Mitigation: keep state in explicit `H264Encoder`, `H264Decoder`, and `GopBuffer` classes rather than hiding it in `Codec`. -- **Lazy `Image.data` compatibility:** Existing `Image` assumes eager numpy data. Mitigation: add lazy pixel support carefully so metadata properties do not force decode and eager construction remains unchanged. +- **Observation-level lazy decode:** Existing `Image` remains eager. Mitigation: keep H.264 laziness at `Observation.data` so generic image consumers remain unchanged. - **Packet loss:** LCM has no built-in reliable delivery or late-join keyframe durability. Mitigation: periodic IDR frames and seq-gap suppression; later add keyframe request or durable carriers where available. - **Dependency variability:** aiortc/PyAV/FFmpeg support varies by platform. Mitigation: keep H.264 optional under the extra that already provides aiortc/WebRTC support, preserve JPEG defaults, and fail clearly when video mode is selected without dependencies. 
- **aiortc codec API stability:** aiortc codec classes are importable and useful, but the most stable aiortc surface is WebRTC itself. Mitigation: isolate all direct codec imports in `AiortcH264Codec`, pin/verify aiortc versions, and add focused tests around encode/depayload/decode behavior. - **Double encode on record:** A recorder consuming decoded H.264 transport images may re-encode for memory2 storage. Mitigation: accept this in the first version; consider packet pass-through as a later optimization. -- **Random access latency:** Mid-GOP access requires decoding from a prior keyframe. Mitigation: short GOP defaults and decoder reuse during sequential replay. +- **Best-effort random access:** Mid-GOP access without prior decoder state may be unavailable in v1. Mitigation: short GOP defaults, decoder reuse during sequential replay, and suppression until the first keyframe after the start point. ## Migration / Rollout 1. Reuse the existing aiortc/WebRTC dependency path for H.264 support; add a lightweight `video` extra only if users need H.264 storage without the broader WebRTC extra. 2. Add `VideoPacket`, H.264 config, `AiortcH264Codec`, DimOS-facing encoder/decoder wrappers, GOP buffer, Annex B access-unit assembly, and explicit errors. -3. Add lazy pixel support to `Image` while preserving eager API behavior. +3. Preserve eager `Image` behavior; keep lazy decode at `Observation.data`. 4. Add `H264LCM` and `H264LcmTransport` as the first live carrier adapter. -5. Add memory2 H.264 storage config, backend/payload strategy, GOP index table, and lazy loader. -6. Add registry serialization so reopened SQLite stores know which streams use H.264 storage. +5. Add memory2 generic payload-strategy support and H.264 image payload strategy. +6. Add registry serialization so reopened SQLite stores know which streams use H.264 payload strategy. 7. Add `demo_h264_video_e2e` for synthetic end-to-end live transport plus memory2 storage QA. 8. 
Add tests and synthetic manual QA for live transport, storage, lazy decode, replay, unsupported formats, and seq gaps. 9. Update memory2 and transport docs with opt-in examples and dependency notes. @@ -446,7 +453,6 @@ No generated blueprint registry update is needed unless a runnable demo blueprin - Should LCM H.264 publish raw packet bytes under an `Image` channel name or use a distinct LCM message type/channel suffix internally? - What default bitrate, keyframe interval, and target FPS should be used for common DimOS camera streams? - Should first-version memory2 storage store packet blobs in the existing `{stream}_blob` table or introduce a dedicated packet blob table? -- Should transport subscribers receive lazy `Image` objects by default, or should eager decode remain the default for maximum compatibility? - Should WebRTC integration reuse this `VideoPacket` abstraction, or map directly between `Image` and WebRTC media tracks with optional packet export for memory2? - Does aiortc expose a stable encoded-frame hook that can avoid decode/re-encode when recording a WebRTC H.264 stream into memory2? - Should `AiortcH264Codec` pin to aiortc minor versions or include compatibility tests against the minimum supported aiortc version? diff --git a/openspec/changes/add-h264-codec-mem2-storage/docs.md b/openspec/changes/add-h264-codec-mem2-storage/docs.md index fa5ef9e7b9..1fc6f44188 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/docs.md +++ b/openspec/changes/add-h264-codec-mem2-storage/docs.md @@ -9,10 +9,10 @@ - Update `docs/usage/blueprints.md` with an opt-in blueprint transport mapping example for `H264LcmTransport` and `H264Config`. - Update memory2 user docs, likely under `docs/usage/` or the memory2 capability docs, to describe opt-in H.264-backed image storage: - Default image storage remains JPEG-backed. - - Users opt in per stream with `H264ImageStorageConfig`. + - Users opt in per stream with `H264ImagePayloadStrategy`. 
- memory2 still stores one observation per source frame. - metadata queries do not require pixel decode. - - accessing `obs.data` lazily reconstructs an `Image`, decoding from the nearest prior keyframe when needed. + - accessing `obs.data` lazily reconstructs an `Image` when the H.264 decode session has valid GOP state; replay/decoded views suppress deltas until the first keyframe at or after the start point. - replay emits decoded `Image` frames on the normal replay schedule. - Add a short manual QA section for `demo-h264-video-e2e` after the demo blueprint exists: - run `dimos run demo-h264-video-e2e --daemon` diff --git a/openspec/changes/add-h264-codec-mem2-storage/proposal.md b/openspec/changes/add-h264-codec-mem2-storage/proposal.md index b808cc4828..e428eba421 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/proposal.md +++ b/openspec/changes/add-h264-codec-mem2-storage/proposal.md @@ -27,7 +27,7 @@ DimOS needs an opt-in H.264 image-stream path that preserves the public `Image` ### New Capabilities - `h264-image-streams`: Covers carrier-neutral H.264 image packets, live image-stream encode/decode behavior, keyframe/GOP handling, sequence-gap behavior, and transport compatibility expectations. -- `memory2-h264-storage`: Covers opt-in H.264-backed memory2 image observation storage, per-frame packet persistence, GOP indexing, lazy `Image` reconstruction, and replay compatibility. +- `memory2-h264-storage`: Covers opt-in H.264-backed memory2 image observation storage, per-frame packet persistence, best-effort GOP decode, lazy `Observation.data` reconstruction, and replay compatibility. 
### Modified Capabilities diff --git a/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md b/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md index 1cb004ca3f..0c97d46d19 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md +++ b/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md @@ -46,6 +46,21 @@ DimOS SHALL provide periodic keyframes for H.264 image streams so subscribers ca - **THEN** the keyframe packet MUST include the decoder parameter information needed for that bootstrap, such as SPS/PPS for H.264 Annex B streams - **AND** later delta frames in the same GOP may depend on that decoded keyframe state. +### Requirement: H.264 live decode is best-effort without QoS guarantees +DimOS SHALL apply a best-effort H.264 decode policy for live carriers that do not provide video QoS, keyframe requests, or durable keyframe caching. + +#### Scenario: Subscriber starts without GOP state +- **GIVEN** an H.264 live subscriber starts receiving packets at a point whose first packet is a delta frame +- **WHEN** the subscriber's decoder has no valid prior GOP state +- **THEN** DimOS MUST suppress decoded output for undecodable delta frames +- **AND** DimOS MUST begin delivering decoded `Image` values after the first keyframe at or after the subscriber start point establishes valid decoder state. + +#### Scenario: QoS policy is deferred +- **GIVEN** an H.264 image stream uses an LCM-style best-effort carrier +- **WHEN** packets are lost or a subscriber joins late +- **THEN** DimOS MUST rely on periodic keyframes and decode suppression for v1 recovery +- **AND** DimOS documentation must describe keyframe request, durable keyframe cache, retransmission, and transport QoS as follow-up design work rather than v1 guarantees. 
+ ### Requirement: Sequence gaps recover safely DimOS SHALL detect missing or out-of-order H.264 live-stream packets and resume decoded image delivery from a valid keyframe state. diff --git a/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md b/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md index a73dfa1af4..4161080eb6 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md +++ b/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md @@ -1,7 +1,7 @@ ## ADDED Requirements ### Requirement: H.264 image storage is opt-in per memory2 stream -memory2 SHALL allow image streams to opt into H.264-backed storage while preserving the default image-storage behavior for streams that do not opt in. +memory2 SHALL allow image streams to opt into H.264-backed storage through a generic payload strategy while preserving the default image-storage behavior for streams that do not opt in. #### Scenario: Stream opts into H.264 storage - **GIVEN** a memory2 image stream is configured for H.264-backed storage @@ -9,6 +9,12 @@ memory2 SHALL allow image streams to opt into H.264-backed storage while preserv - **THEN** memory2 MUST store those image observations using H.264-backed payloads - **AND** queries for the stream must continue to return image observations associated with the original frame timestamps. +#### Scenario: H.264 storage uses payload strategy extension point +- **GIVEN** a store creates an `Image` stream with an H.264 payload strategy +- **WHEN** memory2 creates the stream backend +- **THEN** memory2 MUST route payload encode, blob loader attachment, and decode-error suppression through the generic payload strategy interface +- **AND** the generic store base must not contain H.264-specific branches or imports. 
+ #### Scenario: Stream uses default image storage - **GIVEN** a memory2 image stream is created without H.264 image-storage configuration - **WHEN** the stream appends `Image` values @@ -33,16 +39,16 @@ memory2 SHALL store each H.264-backed image observation with an encoded payload - **THEN** the payload MUST represent all NAL units emitted for that source frame in Annex B form - **AND** memory2 MUST avoid exposing individual RTP fragments as the stored observation payload. -### Requirement: GOP metadata supports random access and replay -memory2 SHALL persist enough GOP and keyframe metadata for H.264-backed image streams to decode requested observations and replay streams deterministically. +### Requirement: Decode starts from valid keyframe state +memory2 SHALL use the same best-effort H.264 decode policy as live subscribers: decode starts without GOP state and suppresses delta frames until a keyframe at or after the start point establishes valid decoder state. -#### Scenario: Query decodes a mid-GOP observation -- **GIVEN** a user queries an H.264-backed image observation whose encoded payload is a delta frame -- **WHEN** the user accesses the observation pixel data -- **THEN** memory2 MUST decode from the nearest prior usable keyframe through the requested observation -- **AND** the returned value must be a decoded `Image` for the requested observation. +#### Scenario: Replay seeks into the middle of a GOP +- **GIVEN** a user starts replay or a decoded view at a timestamp whose first stored H.264 packet is a delta frame +- **WHEN** memory2 decodes the stream from that start point +- **THEN** memory2 MUST suppress undecodable delta frames until the first keyframe at or after the start point +- **AND** memory2 MUST emit decoded `Image` values for that keyframe and following decodable delta frames. 
-#### Scenario: Required GOP data is missing +#### Scenario: Required GOP state is missing - **GIVEN** an H.264-backed image observation requires prior GOP data to decode - **WHEN** memory2 cannot load a usable keyframe or required delta-frame sequence - **THEN** memory2 MUST fail the pixel decode with a clear storage/decode error @@ -53,18 +59,18 @@ memory2 SHALL allow metadata access for H.264-backed image observations without #### Scenario: Query reads observation metadata only - **GIVEN** a memory2 store contains H.264-backed image observations -- **WHEN** a user queries observations and reads timestamps, frame identifiers, pose metadata, tags, width, height, or image format metadata +- **WHEN** a user queries observations and reads timestamps, frame identifiers, pose metadata, or tags - **THEN** memory2 MUST provide that metadata without requiring H.264 pixel decode - **AND** pixel decode should occur only when the user accesses image data. -### Requirement: Lazy pixel access reconstructs Image values -memory2 SHALL lazily reconstruct `Image` values for H.264-backed observations when pixel data is requested. +### Requirement: Lazy pixel access reconstructs Image values on best-effort decode +memory2 SHALL lazily reconstruct `Image` values for H.264-backed observations when pixel data is requested and valid decoder state is available. #### Scenario: User accesses observation data - **GIVEN** a queried H.264-backed image observation has not decoded its pixels yet - **WHEN** the user accesses `obs.data` -- **THEN** memory2 MUST return a decoded `Image` value for that observation -- **AND** subsequent compatible accesses may reuse decoded state without changing observable image contents. +- **THEN** memory2 MUST return a decoded `Image` value if the H.264 decode session has valid GOP state for that observation +- **AND** memory2 MUST suppress or fail clearly for undecodable deltas rather than returning corrupted pixels. 
### Requirement: H.264-backed replay emits normal Image frames memory2 SHALL replay H.264-backed image streams as normal decoded `Image` frames on the existing replay schedule. @@ -73,13 +79,14 @@ memory2 SHALL replay H.264-backed image streams as normal decoded `Image` frames - **GIVEN** a memory2 store contains an H.264-backed image stream - **WHEN** replay is started for that stream - **THEN** replay MUST emit decoded `Image` values in observation timestamp order +- **AND** replay MUST skip undecodable deltas before the first valid keyframe at or after the replay start point - **AND** consumers of replayed streams must not need to consume encoded video packet values. ### Requirement: H.264 storage survives store reopen -memory2 SHALL persist H.264 storage configuration and frame-index metadata so a reopened store can query, decode, and replay H.264-backed image streams. +memory2 SHALL persist H.264 payload-strategy configuration and frame metadata so a reopened store can query, decode, and replay H.264-backed image streams. #### Scenario: Reopen and decode - **GIVEN** a memory2 store was written with an H.264-backed image stream - **WHEN** the process closes and a later process reopens the store - **THEN** memory2 MUST recognize the stream as H.264-backed -- **AND** the reopened store must support metadata query, lazy pixel decode, and replay for the stored observations. +- **AND** the reopened store must support metadata query, lazy pixel decode, and best-effort replay for the stored observations. 
diff --git a/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-payload-strategies/spec.md b/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-payload-strategies/spec.md new file mode 100644 index 0000000000..a0a5f7493d --- /dev/null +++ b/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-payload-strategies/spec.md @@ -0,0 +1,56 @@ +## ADDED Requirements + +### Requirement: memory2 streams support generic payload strategies +memory2 SHALL allow a stream backend to delegate payload encoding, lazy loader attachment, and decode-error policy to an optional payload strategy without changing the logical stream payload type. + +#### Scenario: Stream appends through a payload strategy +- **GIVEN** a memory2 stream is created with a payload strategy for its payload type +- **WHEN** the stream appends a value +- **THEN** the backend MUST preserve normal observation metadata insertion semantics +- **AND** the backend MUST delegate payload byte encoding to the configured payload strategy before writing the blob. + +#### Scenario: Stream queries attach strategy loaders +- **GIVEN** a memory2 stream has stored blobs written by a payload strategy +- **WHEN** observations are queried or replayed +- **THEN** the backend MUST attach lazy data loaders through the payload strategy +- **AND** observation metadata must remain readable without materializing the payload. + +### Requirement: Payload strategies remain storage-generic +memory2 SHALL keep payload strategy integration generic so the base store and backend abstractions do not depend on H.264-specific classes. + +#### Scenario: Base store creates a strategy-backed backend +- **GIVEN** a stream configuration includes a payload strategy +- **WHEN** the generic store creates the backend +- **THEN** the store MUST pass the strategy through the generic backend construction path +- **AND** the store MUST avoid payload-specific imports, type checks, or backend subclasses for H.264. 
+ +#### Scenario: Storage backend binds optional local resources +- **GIVEN** a concrete store implementation reopens a stream with a serialized payload strategy +- **WHEN** the strategy needs store-local resources such as a SQLite connection for auxiliary metadata +- **THEN** the concrete store MAY bind those resources through a strategy hook +- **AND** the binding hook must remain generic so other strategies can use the same extension point. + +### Requirement: Payload strategy configuration survives store reopen +memory2 SHALL persist payload strategy identity and configuration in stream registry metadata so reopened stores can reconstruct strategy-backed streams. + +#### Scenario: Reopen a strategy-backed stream +- **GIVEN** a stream was created with a payload strategy +- **WHEN** a later process reopens the store +- **THEN** memory2 MUST deserialize the configured payload strategy +- **AND** the reopened stream must use that strategy for lazy payload access and replay behavior. + +### Requirement: Replay honors strategy decode suppression +memory2 SHALL allow payload strategies to classify decode errors that replay should suppress while preserving normal failure behavior for unrelated errors. + +#### Scenario: Strategy suppresses an undecodable payload +- **GIVEN** a replay iterator encounters a payload decode error +- **AND** the stream's payload strategy classifies that error as suppressible +- **WHEN** replay advances through the stream +- **THEN** memory2 MUST skip that undecodable observation +- **AND** replay MUST continue with later observations. + +#### Scenario: Strategy does not suppress an error +- **GIVEN** a replay iterator encounters a payload decode error +- **AND** the stream's payload strategy does not classify that error as suppressible +- **WHEN** replay advances through the stream +- **THEN** memory2 MUST surface the error to the caller. 
diff --git a/openspec/changes/add-h264-codec-mem2-storage/tasks.md b/openspec/changes/add-h264-codec-mem2-storage/tasks.md index 4842b7a728..35c714e5e2 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/tasks.md +++ b/openspec/changes/add-h264-codec-mem2-storage/tasks.md @@ -8,11 +8,11 @@ - [x] 1.6 Add explicit errors for unsupported image formats, missing video dependencies, and unusable GOP/decode state. - [x] 1.7 Add focused codec tests for per-frame Annex B packet shape, keyframe metadata, SPS/PPS bootstrap behavior, sequence-gap handling, dependency errors, and unsupported image formats. -## 2. Image lazy data support +## 2. Image compatibility and observation lazy decode -- [x] 2.1 Extend `Image` construction/access so metadata fields remain available without forcing pixel decode when an image is backed by a lazy pixel loader. +- [x] 2.1 Keep `Image` eager and numpy-backed; use memory2 `Observation.data` as the lazy H.264 decode boundary. - [x] 2.2 Preserve existing eager `Image` behavior and compatibility for current JPEG, LCM, SHM, memory2, and visualization consumers. -- [x] 2.3 Add tests proving eager images still work, lazy images expose metadata without decode, and `Image.data` materializes pixels exactly when requested. +- [x] 2.3 Add tests proving eager images still work after the H.264 storage changes. ## 3. Live H.264 image transport @@ -23,14 +23,15 @@ ## 4. memory2 H.264 image storage -- [x] 4.1 Add per-stream H.264 image storage configuration for direct store creation and recorder configuration. -- [x] 4.2 Route configured memory2 `Image` streams to H.264-backed storage while leaving unconfigured `Image` streams on the existing default storage path. +- [x] 4.1 Add per-stream H.264 image payload strategy configuration for direct store creation and recorder configuration. +- [x] 4.2 Route configured memory2 `Image` streams through a generic payload strategy while leaving unconfigured `Image` streams on the existing default storage path. 
- [x] 4.3 Store one observation row per source frame and one encoded Annex B frame packet payload per observation. -- [x] 4.4 Add persistent GOP/keyframe index metadata for H.264-backed image streams. +- [x] 4.4 Add persistent H.264 frame/keyframe metadata for H.264-backed image streams. - [x] 4.5 Persist and reload per-stream storage configuration so reopened stores recognize H.264-backed image streams. -- [x] 4.6 Add lazy observation loading that returns metadata without decode and reconstructs `Image` pixels from the nearest prior usable keyframe through the requested observation when `obs.data` is accessed. -- [x] 4.7 Add replay support that emits decoded `Image` values in observation timestamp order for H.264-backed streams. -- [x] 4.8 Add memory2 tests for append/query, metadata access without decode, keyframe and mid-GOP lazy decode, missing-GOP failure, store reopen, replay, default JPEG compatibility, and unsupported formats. +- [x] 4.6 Add lazy observation loading that returns metadata without decode and reconstructs `Image` pixels on best-effort H.264 decode when `obs.data` is accessed. +- [x] 4.7 Add replay support that emits decoded `Image` values in observation timestamp order and suppresses undecodable deltas until the first valid keyframe at or after the replay start point. +- [x] 4.8 Add memory2 tests for append/query, metadata access without decode, keyframe and sequential lazy decode, missing-GOP failure, store reopen, replay seek suppression, default JPEG compatibility, and unsupported formats. +- [x] 4.9 Add generic payload strategy tests for lifecycle, payload encoding/loading, registry persistence, SQLite binding, and replay decode-error suppression. ## 5. Synthetic end-to-end blueprint and manual QA surface @@ -44,7 +45,7 @@ - [x] 6.1 Update user-facing transport docs with H.264 opt-in behavior, `Image` stream preservation, Annex B per-frame packets, keyframe/GOP recovery, unsupported formats, and dependency notes. 
- [x] 6.2 Update blueprint docs with an H.264 image transport mapping example. -- [x] 6.3 Update memory2 docs with H.264 image storage configuration, one-observation-per-frame behavior, metadata query without decode, lazy `obs.data` decode, random access from keyframes, and replay behavior. +- [x] 6.3 Update memory2 docs with H.264 image payload strategy configuration, one-observation-per-frame behavior, metadata query without decode, lazy `obs.data` decode, best-effort keyframe startup, and replay behavior. - [x] 6.4 Add docs for running and inspecting the `demo-h264-video-e2e` synthetic QA blueprint. - [x] 6.5 Update contributor testing docs with video dependency setup, focused test targets, skip behavior when dependencies are unavailable, and blueprint-registry regeneration guidance. - [x] 6.6 Update coding-agent docs if maintainers want the H.264/Foxglove packet-shape rule documented for future agent edits. @@ -53,7 +54,7 @@ - [x] 7.1 Run `openspec validate add-h264-codec-mem2-storage --strict`. - [x] 7.2 Run focused unit tests for H.264 codec/access-unit/GOP behavior. -- [x] 7.3 Run focused unit tests for lazy `Image` behavior. +- [x] 7.3 Run focused unit tests for eager `Image` compatibility. - [x] 7.4 Run focused memory2 storage tests for H.264 append/query/lazy decode/reopen/replay/default compatibility. - [x] 7.5 Run focused live transport tests for H.264 LCM round-trip and sequence-gap recovery. - [x] 7.6 Run `uv run pytest dimos/robot/test_all_blueprints_generation.py` if the demo blueprint is registered. 
From 683e6a99799ca07eb900a16c9d5cf37aa7890885 Mon Sep 17 00:00:00 2001 From: cc Date: Thu, 11 Jun 2026 20:33:50 -0700 Subject: [PATCH 04/14] refactor: store h264 as encoded images --- dimos/core/transport.py | 17 +- dimos/memory2/backend.py | 50 +- dimos/memory2/codecs/base.py | 6 + dimos/memory2/module.py | 6 +- dimos/memory2/replay.py | 31 +- dimos/memory2/store/base.py | 2 - dimos/memory2/store/sqlite.py | 65 +-- dimos/memory2/test_payload_strategy.py | 164 ------ dimos/memory2/video/h264.py | 357 +------------ dimos/memory2/video/test_h264_storage.py | 314 +++--------- dimos/msgs/sensor_msgs/Image.py | 254 +++++++-- dimos/msgs/sensor_msgs/VideoPacket.py | 98 ---- dimos/protocol/pubsub/impl/h264_lcm.py | 20 +- dimos/protocol/pubsub/impl/test_h264_lcm.py | 112 ++-- dimos/protocol/video/demo_h264_video_e2e.py | 36 +- dimos/protocol/video/h264.py | 125 +++-- dimos/protocol/video/test_h264.py | 103 ++-- docs/capabilities/memory/index.md | 34 +- docs/coding-agents/style.md | 13 +- docs/development/testing.md | 8 +- docs/usage/transports/index.md | 8 +- .../add-h264-codec-mem2-storage/design.md | 484 ++++-------------- .../add-h264-codec-mem2-storage/docs.md | 19 +- .../add-h264-codec-mem2-storage/proposal.md | 18 +- .../specs/h264-image-streams/spec.md | 39 +- .../specs/memory2-h264-storage/spec.md | 78 +-- .../specs/memory2-payload-strategies/spec.md | 56 -- .../add-h264-codec-mem2-storage/tasks.md | 43 +- 28 files changed, 822 insertions(+), 1738 deletions(-) delete mode 100644 dimos/memory2/test_payload_strategy.py delete mode 100644 dimos/msgs/sensor_msgs/VideoPacket.py delete mode 100644 openspec/changes/add-h264-codec-mem2-storage/specs/memory2-payload-strategies/spec.md diff --git a/dimos/core/transport.py b/dimos/core/transport.py index 0e5d23ad4d..2a57f7e030 100644 --- a/dimos/core/transport.py +++ b/dimos/core/transport.py @@ -163,16 +163,27 @@ def stop(self) -> None: class H264LcmTransport(LCMTransport): # type: ignore[type-arg] - def __init__(self, topic: 
str, type: type, config: Any | None = None, **kwargs) -> None: # type: ignore[no-untyped-def] + def __init__( + self, + topic: str, + type: type, + config: Any | None = None, + decode_images: bool = True, + **kwargs, + ) -> None: # type: ignore[no-untyped-def] from dimos.protocol.pubsub.impl.h264_lcm import H264LCM from dimos.protocol.video.h264 import H264Config self.config = config or H264Config() - self.lcm = H264LCM(config=self.config, **kwargs) # type: ignore[assignment] + self.decode_images = decode_images + self.lcm = H264LCM(config=self.config, decode_images=decode_images, **kwargs) # type: ignore[assignment] super().__init__(topic, type) def __reduce__(self): # type: ignore[no-untyped-def] - return (H264LcmTransport, (self.topic.topic, self.topic.lcm_type, self.config)) + return ( + H264LcmTransport, + (self.topic.topic, self.topic.lcm_type, self.config, self.decode_images), + ) def start(self) -> None: self.lcm.start() diff --git a/dimos/memory2/backend.py b/dimos/memory2/backend.py index 53ce318aaa..7b95bd6335 100644 --- a/dimos/memory2/backend.py +++ b/dimos/memory2/backend.py @@ -17,7 +17,7 @@ from __future__ import annotations from dataclasses import replace -from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeVar +from typing import TYPE_CHECKING, Any, Generic, TypeVar from dimos.core.resource import CompositeResource from dimos.memory2.codecs.base import Codec, codec_id @@ -40,30 +40,6 @@ T = TypeVar("T") -class PayloadStrategy(Protocol[T]): - """Stateful payload encoding hook for a memory2 stream. - - Stateless storage uses ``Codec`` directly. Stateful encodings such as video - can provide a strategy without requiring a special Backend subclass. - """ - - codec_id: str - - def start(self) -> None: ... - def stop(self) -> None: ... - def encode(self, value: T) -> bytes: ... - def after_blob_put(self, stream_name: str, row_id: int, encoded: bytes) -> None: ... 
- def make_loader(self, stream_name: str, row_id: int, blob_store: BlobStore) -> Any: ... - def attach_loaders( - self, - stream_name: str, - observations: Iterator[Observation[T]], - blob_store: BlobStore, - ) -> Iterator[Observation[T]]: ... - def should_suppress_decode_error(self, error: BaseException) -> bool: ... - def serialize(self) -> dict[str, Any]: ... - - class Backend(CompositeResource, Generic[T]): """Orchestrates metadata, blob, vector, and live stores for one stream. (encode → insert → store blob → index vector → notify) lives here, @@ -79,7 +55,6 @@ def __init__( vector_store: VectorStore | None = None, notifier: Notifier[T] | None = None, eager_blobs: bool = False, - payload_strategy: PayloadStrategy[T] | None = None, ) -> None: super().__init__() self.metadata_store = self.register_disposable(metadata_store) @@ -89,7 +64,6 @@ def __init__( self.vector_store = self.register_disposable(vector_store) if vector_store else None self.notifier: Notifier[T] = self.register_disposable(notifier or SubjectNotifier()) self.eager_blobs = eager_blobs - self.payload_strategy = payload_strategy def start(self) -> None: self.metadata_store.start() @@ -97,13 +71,6 @@ def start(self) -> None: self.blob_store.start() if self.vector_store is not None: self.vector_store.start() - if self.payload_strategy is not None: - self.payload_strategy.start() - - def stop(self) -> None: - if self.payload_strategy is not None: - self.payload_strategy.stop() - super().stop() @property def name(self) -> str: @@ -114,8 +81,6 @@ def _make_loader(self, row_id: int) -> Any: if bs is None: raise RuntimeError("BlobStore required but not configured") name, codec = self.name, self.codec - if self.payload_strategy is not None: - return self.payload_strategy.make_loader(name, row_id, bs) def loader() -> Any: raw = bs.get(name, row_id) @@ -142,10 +107,7 @@ def append(self, obs: Observation[T]) -> Observation[T]: # Encode payload before any locking (avoids holding locks during IO) encoded: 
bytes | None = None if self.blob_store is not None and not is_scalar: - if self.payload_strategy is not None: - encoded = self.payload_strategy.encode(payload) - else: - encoded = self.codec.encode(payload) + encoded = self.codec.encode(payload) try: # Insert metadata, get assigned id @@ -156,8 +118,6 @@ def append(self, obs: Observation[T]) -> Observation[T]: if encoded is not None: assert self.blob_store is not None self.blob_store.put(self.name, row_id, encoded) - if self.payload_strategy is not None: - self.payload_strategy.after_blob_put(self.name, row_id, encoded) # Replace inline data with lazy loader obs._data = _UNLOADED obs._loader = self._make_loader(row_id) @@ -195,9 +155,6 @@ def _attach_loaders(self, it: Iterator[Observation[T]]) -> Iterator[Observation[ obs.data_type = self.data_type yield obs return - if self.payload_strategy is not None: - yield from self.payload_strategy.attach_loaders(self.name, it, self.blob_store) - return for obs in it: obs.data_type = self.data_type if obs._loader is None and isinstance(obs._data, type(_UNLOADED)): @@ -306,7 +263,4 @@ def serialize(self) -> dict[str, Any]: "blob_store": self.blob_store.serialize() if self.blob_store else None, "vector_store": self.vector_store.serialize() if self.vector_store else None, "notifier": self.notifier.serialize(), - "payload_strategy": self.payload_strategy.serialize() - if self.payload_strategy is not None - else None, } diff --git a/dimos/memory2/codecs/base.py b/dimos/memory2/codecs/base.py index 821b36b60f..def8ef41fc 100644 --- a/dimos/memory2/codecs/base.py +++ b/dimos/memory2/codecs/base.py @@ -76,6 +76,8 @@ def codec_from_id(codec_id_str: str, payload_module: str) -> Codec[Any]: def _class_to_id(codec: Any) -> str: name = type(codec).__name__ + if name == "H264ImageCodec": + return "h264" if name.endswith("Codec"): return name[:-5].lower() return name.lower() @@ -101,6 +103,10 @@ def _make_one(name: str, payload_module: str, inner: Codec[Any] | None = None) - from 
dimos.memory2.codecs.jpeg import JpegCodec return JpegCodec() + if name == "h264": + from dimos.memory2.video.h264 import H264ImageCodec + + return H264ImageCodec() if name == "lcm": from dimos.memory2.codecs.lcm import LcmCodec diff --git a/dimos/memory2/module.py b/dimos/memory2/module.py index c3264d5116..ad2982ad48 100644 --- a/dimos/memory2/module.py +++ b/dimos/memory2/module.py @@ -254,7 +254,7 @@ class RecorderConfig(MemoryModuleConfig): default_frame_id: str = "base_link" tf_tolerance: float = 0.5 db_path: str | Path = "recording.db" - payload_strategies: dict[str, Any] = Field(default_factory=dict) + codecs: dict[str, Any] = Field(default_factory=dict) class Recorder(MemoryModule): @@ -305,8 +305,8 @@ def start(self) -> None: for name, port in self.inputs.items(): stream_overrides: dict[str, Any] = {} - if name in self.config.payload_strategies: - stream_overrides["payload_strategy"] = self.config.payload_strategies[name] + if name in self.config.codecs: + stream_overrides["codec"] = self.config.codecs[name] stream: Stream[Any] = self.store.stream(name, port.type, **stream_overrides) self._port_to_stream(name, port, stream) logger.info("Recording %s (%s)", name, port.type.__name__) diff --git a/dimos/memory2/replay.py b/dimos/memory2/replay.py index 91c977cbbd..3062a44e98 100644 --- a/dimos/memory2/replay.py +++ b/dimos/memory2/replay.py @@ -138,13 +138,6 @@ def _decode(self, obs: Any) -> T: data = self._autocast(data) return cast("T", data) - def _should_suppress_decode_error(self, error: BaseException) -> bool: - stream = self._replay.store.stream(self._name) - source = getattr(stream, "_source", None) - strategy = getattr(source, "payload_strategy", None) - should_suppress = getattr(strategy, "should_suppress_decode_error", None) - return bool(should_suppress is not None and should_suppress(error)) - def _base_stream(self) -> Stream[Any]: """Memory2 Stream bounded by the replay window, ordered by ts.""" cfg = self._replay.config @@ -196,12 +189,7 @@ 
def iterate_ts(self) -> Iterator[tuple[float, T]]: emitted = False obs: Any for obs in self._base_stream(): - try: - decoded = self._decode(obs) - except BaseException as exc: - if self._should_suppress_decode_error(exc): - continue - raise + decoded = self._decode(obs) emitted = True yield (obs.ts, decoded) if not self._replay.config.loop or not emitted: @@ -216,10 +204,6 @@ def first(self) -> T | None: return self._decode(self._base_stream().first()) except LookupError: return None - except BaseException as exc: - if self._should_suppress_decode_error(exc): - return None - raise def find_closest(self, timestamp: float, tolerance: float = 1.0) -> T | None: s: Stream[Any] = self._replay.store.stream(self._name) @@ -229,10 +213,8 @@ def find_closest(self, timestamp: float, tolerance: float = 1.0) -> T | None: return None try: return self._decode(obs) - except BaseException as exc: - if self._should_suppress_decode_error(exc): - return None - raise + except LookupError: + return None def observable(self) -> Observable[T]: """Timed Observable scheduled against the Replay's shared anchor. @@ -261,12 +243,7 @@ def make_iterator() -> Iterator[tuple[float, T]]: emitted = False obs: Any for obs in base(): - try: - decoded = decode(obs) - except BaseException as exc: - if self._should_suppress_decode_error(exc): - continue - raise + decoded = decode(obs) emitted = True yield (obs.ts, decoded) if not loop or not emitted: diff --git a/dimos/memory2/store/base.py b/dimos/memory2/store/base.py index 62c5bf53fe..7a7162a6d1 100644 --- a/dimos/memory2/store/base.py +++ b/dimos/memory2/store/base.py @@ -158,7 +158,6 @@ def _create_backend( ) -> Backend[Any]: """Create a Backend for the named stream. 
Called once per stream name.""" codec = self._resolve_codec(payload_type, config.pop("codec", None)) - payload_strategy = config.pop("payload_strategy", None) # Instantiate or use provided instances obs = config.pop("observation_store", self.config.observation_store) @@ -185,7 +184,6 @@ def _create_backend( vector_store=vs, notifier=notifier, eager_blobs=config.get("eager_blobs", False), - payload_strategy=payload_strategy, ) def stream(self, name: str, payload_type: type[T] | None = None, **overrides: Any) -> Stream[T]: diff --git a/dimos/memory2/store/sqlite.py b/dimos/memory2/store/sqlite.py index 5558bc0c55..e1c5ba5071 100644 --- a/dimos/memory2/store/sqlite.py +++ b/dimos/memory2/store/sqlite.py @@ -24,7 +24,6 @@ from dimos.memory2.blobstore.base import BlobStore from dimos.memory2.blobstore.sqlite import SqliteBlobStore from dimos.memory2.codecs.base import codec_id -from dimos.memory2.codecs.pickle import PickleCodec from dimos.memory2.observationstore.sqlite import SqliteObservationStore from dimos.memory2.registry import RegistryStore, deserialize_component, qual from dimos.memory2.store.base import Store, StoreConfig @@ -72,15 +71,9 @@ def _assemble_backend(self, name: str, stored: dict[str, Any]) -> Backend[Any]: data_type = _resolve_payload_type(payload_module) eager_blobs = stored.get("eager_blobs", False) page_size = stored.get("page_size", self.config.page_size) - payload_strategy_data = stored.get("payload_strategy") - codec = ( - PickleCodec() - if payload_strategy_data is not None - else codec_from_id(stored["codec_id"], payload_module) - ) + codec = codec_from_id(stored["codec_id"], payload_module) backend_conn = self._open_connection() - payload_strategy = self._deserialize_payload_strategy(payload_strategy_data, backend_conn) # Reconstruct components from serialized config bs_data = stored.get("blob_store") @@ -128,7 +121,6 @@ def _assemble_backend(self, name: str, stored: dict[str, Any]) -> Backend[Any]: vector_store=vs, notifier=notifier, 
eager_blobs=eager_blobs, - payload_strategy=payload_strategy, ) @staticmethod @@ -138,14 +130,10 @@ def _serialize_backend( """Serialize a backend's config for registry storage.""" cfg: dict[str, Any] = { "payload_module": payload_module, - "codec_id": backend.payload_strategy.codec_id - if backend.payload_strategy is not None - else codec_id(backend.codec), + "codec_id": codec_id(backend.codec), "eager_blobs": backend.eager_blobs, "page_size": page_size, } - if backend.payload_strategy is not None: - cfg["payload_strategy"] = backend.payload_strategy.serialize() if backend.blob_store is not None: cfg["blob_store"] = backend.blob_store.serialize() if backend.vector_store is not None: @@ -177,23 +165,14 @@ def _create_backend( backend_conn = self._open_connection() - payload_strategy = self._payload_strategy_from_config(config, backend_conn) - # Inject conn-shared instances unless user provided overrides if not isinstance(config.get("blob_store"), BlobStore): config["blob_store"] = SqliteBlobStore(conn=backend_conn) if not isinstance(config.get("vector_store"), VectorStore): config["vector_store"] = SqliteVectorStore(conn=backend_conn) - # Resolve codec early — needed for SqliteObservationStore. Stateful - # payload strategies own blob encoding/decoding, so keep sqlite eager - # joins disabled and use a harmless metadata-store codec. - if payload_strategy is not None: - codec = PickleCodec() - config["eager_blobs"] = False - config["payload_strategy"] = payload_strategy - else: - codec = self._resolve_codec(payload_type, config.get("codec")) + # Resolve codec early — needed for SqliteObservationStore. 
+ codec = self._resolve_codec(payload_type, config.get("codec")) config["codec"] = codec # Create SqliteObservationStore with conn-sharing @@ -222,48 +201,12 @@ def _create_backend( return backend - @staticmethod - def _deserialize_payload_strategy( - data: dict[str, Any] | None, - conn: sqlite3.Connection, - ) -> Any | None: - if data is None: - return None - strategy = deserialize_component(data) - SqliteStore._bind_payload_strategy(strategy, conn) - return strategy - - @staticmethod - def _payload_strategy_from_config( - config: dict[str, Any], conn: sqlite3.Connection - ) -> Any | None: - strategy = config.pop("payload_strategy", None) - if strategy is None: - return None - SqliteStore._bind_payload_strategy(strategy, conn) - return strategy - - @staticmethod - def _bind_payload_strategy(strategy: Any, conn: sqlite3.Connection) -> None: - bind = getattr(strategy, "bind_sqlite", None) - if bind is not None: - bind(conn) - def list_streams(self) -> list[str]: db_names = set(self._registry.list_streams()) return sorted(db_names | set(self._streams.keys())) def delete_stream(self, name: str) -> None: - stored = self._registry.get(name) - payload_strategy = None - if stored is not None: - payload_strategy = self._deserialize_payload_strategy( - stored.get("payload_strategy"), - self._registry_conn, - ) super().delete_stream(name) - if payload_strategy is not None and hasattr(payload_strategy, "delete_stream"): - payload_strategy.delete_stream(name) self._registry_conn.execute(f'DROP TABLE IF EXISTS "{name}"') self._registry_conn.execute(f'DROP TABLE IF EXISTS "{name}_blob"') self._registry_conn.execute(f'DROP TABLE IF EXISTS "{name}_vec"') diff --git a/dimos/memory2/test_payload_strategy.py b/dimos/memory2/test_payload_strategy.py deleted file mode 100644 index 5bc0a92d00..0000000000 --- a/dimos/memory2/test_payload_strategy.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2026 Dimensional Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -import pytest - -from dimos.memory2.store.sqlite import SqliteStore -from dimos.memory2.type.observation import _UNLOADED - -if TYPE_CHECKING: - from collections.abc import Iterator - - from dimos.memory2.blobstore.base import BlobStore - from dimos.memory2.type.observation import Observation - - -class SuppressMeError(RuntimeError): - pass - - -class DoNotSuppressError(RuntimeError): - pass - - -class PrefixPayloadStrategy: - codec_id = "prefix" - - def __init__(self, prefix: str = "encoded:") -> None: - self.prefix = prefix - self.started = False - self.stopped = False - self.bound_sqlite = False - self.encoded_values: list[str] = [] - self.blob_rows: list[tuple[str, int, bytes]] = [] - - def start(self) -> None: - self.started = True - - def stop(self) -> None: - self.stopped = True - - def bind_sqlite(self, _conn: Any) -> None: - self.bound_sqlite = True - - def encode(self, value: str) -> bytes: - self.encoded_values.append(value) - return f"{self.prefix}{value}".encode() - - def after_blob_put(self, stream_name: str, row_id: int, encoded: bytes) -> None: - self.blob_rows.append((stream_name, row_id, encoded)) - - def _decode(self, raw: bytes) -> str: - value = raw.decode() - if not value.startswith(self.prefix): - raise ValueError("payload strategy prefix missing") - decoded = value.removeprefix(self.prefix) - if decoded == "skip": - 
raise SuppressMeError("skip this payload") - if decoded == "boom": - raise DoNotSuppressError("do not suppress this payload") - return decoded - - def make_loader(self, stream_name: str, row_id: int, blob_store: BlobStore) -> Any: - def loader() -> str: - return self._decode(blob_store.get(stream_name, row_id)) - - return loader - - def attach_loaders( - self, - stream_name: str, - observations: Iterator[Observation[str]], - blob_store: BlobStore, - ) -> Iterator[Observation[str]]: - for obs in observations: - obs.data_type = str - if obs._loader is None and isinstance(obs._data, type(_UNLOADED)): - row_id = obs.id - obs._loader = self.make_loader(stream_name, row_id, blob_store) - yield obs - - def should_suppress_decode_error(self, error: BaseException) -> bool: - return isinstance(error, SuppressMeError) - - def serialize(self) -> dict[str, Any]: - return { - "class": f"{type(self).__module__}.{type(self).__qualname__}", - "config": {"prefix": self.prefix}, - } - - -def test_payload_strategy_encodes_loads_and_stops(tmp_path) -> None: - strategy = PrefixPayloadStrategy(prefix="p:") - store = SqliteStore(path=str(tmp_path / "strategy.db")) - stream = store.stream("events", str, payload_strategy=strategy) - - appended = stream.append("hello", ts=1.0) - - assert strategy.bound_sqlite - assert strategy.started - assert strategy.encoded_values == ["hello"] - assert strategy.blob_rows == [("events", appended.id, b"p:hello")] - queried = stream.first() - assert queried._data is _UNLOADED - assert queried.data == "hello" - - store.stop() - assert strategy.stopped - - -def test_payload_strategy_persists_and_binds_on_reopen(tmp_path) -> None: - db = tmp_path / "strategy-reopen.db" - with SqliteStore(path=str(db)) as store: - stream = store.stream( - "events", - str, - payload_strategy=PrefixPayloadStrategy(prefix="stored:"), - ) - stream.append("hello", ts=1.0) - - with SqliteStore(path=str(db), must_exist=True) as reopened: - stream = reopened.stream("events", str) - 
assert stream._source is not None - strategy = stream._source.payload_strategy - assert isinstance(strategy, PrefixPayloadStrategy) - assert strategy.prefix == "stored:" - assert strategy.bound_sqlite - assert stream.first().data == "hello" - - -def test_replay_skips_strategy_suppressed_decode_errors(tmp_path) -> None: - store = SqliteStore(path=str(tmp_path / "strategy-replay.db")) - stream = store.stream("events", str, payload_strategy=PrefixPayloadStrategy()) - stream.append("first", ts=1.0) - stream.append("skip", ts=2.0) - stream.append("third", ts=3.0) - - assert list(store.replay().streams.events.iterate()) == ["first", "third"] - - -def test_replay_surfaces_non_suppressed_strategy_errors(tmp_path) -> None: - store = SqliteStore(path=str(tmp_path / "strategy-replay-error.db")) - stream = store.stream("events", str, payload_strategy=PrefixPayloadStrategy()) - stream.append("first", ts=1.0) - stream.append("boom", ts=2.0) - - replay_iter = store.replay().streams.events.iterate() - assert next(replay_iter) == "first" - with pytest.raises(DoNotSuppressError): - next(replay_iter) diff --git a/dimos/memory2/video/h264.py b/dimos/memory2/video/h264.py index c517c1c67d..e3b484ba38 100644 --- a/dimos/memory2/video/h264.py +++ b/dimos/memory2/video/h264.py @@ -14,349 +14,36 @@ from __future__ import annotations -from dataclasses import asdict, dataclass, replace -import sqlite3 -from typing import TYPE_CHECKING, Any +from dimos.msgs.sensor_msgs.Image import H264_IMAGE_ENCODING, Image -from dimos.memory2.type.observation import _UNLOADED -from dimos.msgs.sensor_msgs.Image import Image, ImageFormat -from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket -from dimos.protocol.video.h264 import ( - H264CodecAdapter, - H264Config, - H264Decoder, - H264Encoder, - VideoDecodeGapError, -) -if TYPE_CHECKING: - from collections.abc import Iterator +class H264ImageCodec: + """memory2 codec for already-H.264 encoded Image payloads. 
- from dimos.memory2.blobstore.base import BlobStore - from dimos.memory2.type.observation import Observation - - -@dataclass(frozen=True) -class H264ImageStorageConfig: - """Per-stream memory2 image storage mode for H.264-backed observations.""" - - codec: H264Config = H264Config() - mode: str = "h264" - codec_adapter: H264CodecAdapter | None = None - - def serialize(self) -> dict[str, Any]: - cfg = asdict(self.codec) - cfg["supported_formats"] = [fmt.value for fmt in self.codec.supported_formats] - return {"mode": self.mode, "codec": cfg} - - @classmethod - def parse(cls, raw: H264ImageStorageConfig | dict[str, Any]) -> H264ImageStorageConfig: - if isinstance(raw, cls): - return raw - if not isinstance(raw, dict): - raise TypeError(f"Cannot parse H.264 image storage config from {type(raw).__name__}") - mode = raw.get("mode", "h264") - codec_raw = raw.get("codec", {}) - if isinstance(codec_raw, H264Config): - codec = codec_raw - else: - codec_dict = dict(codec_raw) - formats = codec_dict.get("supported_formats") - if formats is not None: - codec_dict["supported_formats"] = tuple(ImageFormat(fmt) for fmt in formats) - codec = H264Config(**codec_dict) - return cls(codec=codec, mode=mode) - - -@dataclass(frozen=True) -class H264FrameIndexRow: - stream_name: str - observation_id: int - seq: int - keyframe_observation_id: int - is_keyframe: bool - pts: int - width: int - height: int - format: str - codec: str - bitstream: str - - -class H264FrameIndexStore: - """Persistent GOP/keyframe index for H.264-backed image streams.""" - - def __init__(self, conn: sqlite3.Connection) -> None: - self._conn = conn - - def start(self) -> None: - self._conn.execute( - """ - CREATE TABLE IF NOT EXISTS h264_frames ( - stream_name TEXT NOT NULL, - observation_id INTEGER NOT NULL, - seq INTEGER NOT NULL, - keyframe_observation_id INTEGER NOT NULL, - is_keyframe INTEGER NOT NULL, - pts INTEGER NOT NULL, - width INTEGER NOT NULL, - height INTEGER NOT NULL, - format TEXT NOT NULL, - codec 
TEXT NOT NULL, - bitstream TEXT NOT NULL, - PRIMARY KEY (stream_name, observation_id) - ) - """ - ) - self._conn.execute( - """ - CREATE INDEX IF NOT EXISTS idx_h264_frames_stream_keyframe - ON h264_frames(stream_name, is_keyframe, observation_id) - """ - ) - - def stop(self) -> None: - pass - - def delete_stream(self, stream_name: str) -> None: - self._conn.execute("DELETE FROM h264_frames WHERE stream_name = ?", (stream_name,)) - - def insert(self, stream_name: str, observation_id: int, packet: VideoPacket) -> None: - keyframe_observation_id = ( - observation_id - if packet.is_keyframe - else self._keyframe_observation_id( - stream_name, - packet.keyframe_seq, - current_observation_id=observation_id, - ) - ) - self._conn.execute( - """ - INSERT INTO h264_frames ( - stream_name, observation_id, seq, keyframe_observation_id, is_keyframe, - pts, width, height, format, codec, bitstream - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - """, - ( - stream_name, - observation_id, - packet.seq, - keyframe_observation_id, - int(packet.is_keyframe), - packet.pts, - packet.width, - packet.height, - packet.format, - packet.codec, - packet.bitstream, - ), - ) - - def packet_ids_for_decode(self, stream_name: str, observation_id: int) -> list[int]: - row = self._conn.execute( - """ - SELECT keyframe_observation_id FROM h264_frames - WHERE stream_name = ? AND observation_id = ? - """, - (stream_name, observation_id), - ).fetchone() - if row is None: - raise VideoDecodeGapError(f"No H.264 GOP index for observation {observation_id}") - keyframe_id = int(row[0]) - rows = self._conn.execute( - """ - SELECT observation_id FROM h264_frames - WHERE stream_name = ? AND observation_id BETWEEN ? AND ? 
- ORDER BY observation_id ASC - """, - (stream_name, keyframe_id, observation_id), - ).fetchall() - ids = [int(item[0]) for item in rows] - if not ids or ids[0] != keyframe_id or ids[-1] != observation_id: - raise VideoDecodeGapError( - f"Incomplete H.264 GOP index for observation {observation_id}" - ) - return ids - - def rows(self, stream_name: str) -> list[H264FrameIndexRow]: - rows = self._conn.execute( - """ - SELECT stream_name, observation_id, seq, keyframe_observation_id, is_keyframe, - pts, width, height, format, codec, bitstream - FROM h264_frames WHERE stream_name = ? ORDER BY observation_id ASC - """, - (stream_name,), - ).fetchall() - return [ - H264FrameIndexRow( - stream_name=row[0], - observation_id=int(row[1]), - seq=int(row[2]), - keyframe_observation_id=int(row[3]), - is_keyframe=bool(row[4]), - pts=int(row[5]), - width=int(row[6]), - height=int(row[7]), - format=row[8], - codec=row[9], - bitstream=row[10], - ) - for row in rows - ] - - def _keyframe_observation_id( - self, - stream_name: str, - keyframe_seq: int, - *, - current_observation_id: int, - ) -> int: - row = self._conn.execute( - """ - SELECT observation_id FROM h264_frames - WHERE stream_name = ? AND seq = ? AND is_keyframe = 1 AND observation_id <= ? 
- ORDER BY observation_id DESC - LIMIT 1 - """, - (stream_name, keyframe_seq, current_observation_id), - ).fetchone() - if row is None: - raise VideoDecodeGapError(f"No H.264 keyframe index for seq {keyframe_seq}") - return int(row[0]) - - -class H264ImagePayloadStrategy: - """Stateful H.264 payload strategy for logical ``Stream[Image]`` storage.""" - - codec_id = "h264" - - def __init__( - self, - *, - storage_config: H264ImageStorageConfig | dict[str, Any] | None = None, - frame_index: H264FrameIndexStore | None = None, - ) -> None: - self.storage_config = ( - H264ImageStorageConfig.parse(storage_config) - if storage_config is not None - else H264ImageStorageConfig() - ) - self.frame_index = frame_index - self._encoder: H264Encoder | None = None - - def bind_frame_index(self, frame_index: H264FrameIndexStore) -> None: - self.frame_index = frame_index - - def bind_sqlite(self, conn: sqlite3.Connection) -> None: - self.bind_frame_index(H264FrameIndexStore(conn)) - - def start(self) -> None: - if self.frame_index is None: - raise RuntimeError("H.264 image payload strategy requires a frame index store") - self.frame_index.start() - - def stop(self) -> None: - pass + This codec deliberately does not decode pixels. It persists an ``Image`` whose + ``encoding`` is ``"h264"`` and restores the same encoded image on read. A + separate H.264 decode session turns the encoded stream back into raw Images + for visualization or module consumption. 
+ """ def encode(self, value: Image) -> bytes: - if not isinstance(value, Image): - raise TypeError( - f"H.264 image payload strategy expects Image, got {type(value).__name__}" - ) - if self._encoder is None: - self._encoder = H264Encoder( - self.storage_config.codec, - codec=self.storage_config.codec_adapter, + if value.encoding != H264_IMAGE_ENCODING: + raise ValueError( + f"H264ImageCodec stores encoded Images; got encoding={value.encoding!r}" ) - return self._encoder.encode(value).lcm_encode() - - def after_blob_put(self, stream_name: str, row_id: int, encoded: bytes) -> None: - frame_index = self.frame_index - if frame_index is None: - raise RuntimeError("H.264 image payload strategy requires a frame index store") - frame_index.insert(stream_name, row_id, VideoPacket.lcm_decode(encoded)) - - def make_loader(self, stream_name: str, row_id: int, blob_store: BlobStore) -> Any: - storage_config = self.storage_config - - def loader() -> Image: - decoder = H264Decoder(storage_config.codec, codec=storage_config.codec_adapter) - packet = VideoPacket.lcm_decode(blob_store.get(stream_name, row_id)) - return decoder.decode(packet) - - return loader - - def attach_loaders( - self, - stream_name: str, - observations: Iterator[Observation[Image]], - blob_store: BlobStore, - ) -> Iterator[Observation[Image]]: - decoder = H264Decoder(self.storage_config.codec, codec=self.storage_config.codec_adapter) - - for obs in observations: - obs.data_type = Image - if obs._loader is None and isinstance(obs._data, type(_UNLOADED)): - row_id = obs.id + return value.lcm_encode() - def loader(row_id: int = row_id) -> Image: - packet = VideoPacket.lcm_decode(blob_store.get(stream_name, row_id)) - return decoder.decode(packet) - - obs._loader = loader - yield obs - - def should_suppress_decode_error(self, error: BaseException) -> bool: - return isinstance(error, VideoDecodeGapError) - - def delete_stream(self, stream_name: str) -> None: - if self.frame_index is not None: - 
self.frame_index.delete_stream(stream_name) - - def serialize(self) -> dict[str, Any]: - return { - "class": f"{type(self).__module__}.{type(self).__qualname__}", - "config": {"storage_config": self.storage_config.serialize()}, - } - - -def h264_image_payload_strategy_from_any(raw: Any) -> H264ImagePayloadStrategy | None: - storage_config = storage_config_from_any(raw) - if storage_config is None: - return None - return H264ImagePayloadStrategy(storage_config=storage_config) - - -def bind_sqlite_frame_index(strategy: Any, conn: sqlite3.Connection) -> Any: - if isinstance(strategy, H264ImagePayloadStrategy): - strategy.bind_frame_index(H264FrameIndexStore(conn)) - return strategy - - -def storage_config_from_any(raw: Any) -> H264ImageStorageConfig | None: - if raw is None: - return None - config = H264ImageStorageConfig.parse(raw) - if config.mode != "h264": - return None - return config + def decode(self, data: bytes) -> Image: + image = Image.lcm_decode(data) + if image.encoding != H264_IMAGE_ENCODING: + raise ValueError( + f"H264ImageCodec expected encoded Image; got encoding={image.encoding!r}" + ) + return image -def storage_config_with_adapter( - config: H264ImageStorageConfig, - adapter: H264CodecAdapter | None, -) -> H264ImageStorageConfig: - return replace(config, codec_adapter=adapter) +def is_h264_image(image: Image) -> bool: + return image.encoding == H264_IMAGE_ENCODING -__all__ = [ - "H264FrameIndexRow", - "H264FrameIndexStore", - "H264ImagePayloadStrategy", - "H264ImageStorageConfig", - "bind_sqlite_frame_index", - "h264_image_payload_strategy_from_any", - "storage_config_from_any", - "storage_config_with_adapter", -] +__all__ = ["H264ImageCodec", "is_h264_image"] diff --git a/dimos/memory2/video/test_h264_storage.py b/dimos/memory2/video/test_h264_storage.py index 10297039c3..044f8bb2d0 100644 --- a/dimos/memory2/video/test_h264_storage.py +++ b/dimos/memory2/video/test_h264_storage.py @@ -14,283 +14,113 @@ from __future__ import annotations -import 
sqlite3 - import numpy as np import pytest -from dimos.memory2.backend import Backend -from dimos.memory2.blobstore.sqlite import SqliteBlobStore -from dimos.memory2.codecs.pickle import PickleCodec -from dimos.memory2.observationstore.sqlite import SqliteObservationStore +from dimos.memory2.codecs.base import codec_from_id, codec_id +from dimos.memory2.codecs.jpeg import JpegCodec from dimos.memory2.store.sqlite import SqliteStore -from dimos.memory2.type.observation import _UNLOADED -from dimos.memory2.video.h264 import ( - H264FrameIndexStore, - H264ImagePayloadStrategy, - H264ImageStorageConfig, - h264_image_payload_strategy_from_any, - storage_config_from_any, -) -from dimos.msgs.sensor_msgs.Image import Image, ImageFormat -from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket -from dimos.protocol.video.h264 import UnsupportedVideoImageError, VideoDecodeGapError - - -class FakeH264CodecAdapter: - def encode_image(self, image: Image, *, force_keyframe: bool) -> tuple[bytes, int]: - return image.data.tobytes(), int(image.ts * 1000) - - def decode_packet(self, packet: VideoPacket) -> Image: - channels = 1 if packet.format == ImageFormat.GRAY.value else 3 - shape = ( - (packet.height, packet.width) - if channels == 1 - else (packet.height, packet.width, channels) - ) - arr = np.frombuffer(packet.data, dtype=np.uint8).copy().reshape(shape) - return Image.from_numpy( - arr, format=ImageFormat(packet.format), frame_id=packet.frame_id, ts=packet.ts - ) +from dimos.memory2.video.h264 import H264ImageCodec +from dimos.msgs.sensor_msgs.Image import H264_IMAGE_ENCODING, Image, ImageFormat -def _image(seq: int, fmt: ImageFormat = ImageFormat.RGB) -> Image: +def _raw_image(seq: int, fmt: ImageFormat = ImageFormat.RGB) -> Image: data = np.full((2, 2, 3), seq, dtype=np.uint8) if fmt == ImageFormat.GRAY: data = np.full((2, 2), seq, dtype=np.uint8) return Image.from_numpy(data, format=fmt, frame_id="cam", ts=float(seq)) -def _make_backend( - conn: sqlite3.Connection, *, 
config: H264ImageStorageConfig | None = None -) -> Backend[Image]: - frame_index = H264FrameIndexStore(conn) - strategy = H264ImagePayloadStrategy( - storage_config=config or H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()), - frame_index=frame_index, +def _encoded_image(seq: int, *, key: bool = True) -> Image: + return Image.encoded( + data=b"\x00\x00\x00\x01\x65" + bytes([seq]), + encoding=H264_IMAGE_ENCODING, + format=ImageFormat.RGB, + frame_id="cam", + ts=float(seq), + codec_metadata={ + "seq": seq, + "codec": "h264", + "bitstream": "annex_b", + "is_keyframe": key, + "keyframe_seq": seq if key else 0, + "pts": seq * 90, + "width": 2, + "height": 2, + "channels": 3, + "dtype": "uint8", + }, ) - blob_store = SqliteBlobStore(conn=conn) - obs_store = SqliteObservationStore( - conn=conn, name="cam", codec=PickleCodec(), blob_store_conn_match=False, page_size=256 - ) - backend = Backend( - metadata_store=obs_store, - codec=PickleCodec(), - data_type=Image, - blob_store=blob_store, - payload_strategy=strategy, - ) - backend.start() - return backend -def test_storage_config_parse_and_serialize() -> None: - config = H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) - raw = config.serialize() - parsed = H264ImageStorageConfig.parse(raw) - assert parsed.mode == "h264" - assert parsed.codec == config.codec - assert storage_config_from_any(raw) == H264ImageStorageConfig(codec=config.codec) - assert isinstance(h264_image_payload_strategy_from_any(raw), H264ImagePayloadStrategy) - assert storage_config_from_any({"mode": "jpeg", "codec": raw["codec"]}) is None +def test_h264_image_codec_roundtrips_encoded_image() -> None: + codec = H264ImageCodec() + image = _encoded_image(1) + decoded = codec.decode(codec.encode(image)) -def test_store_creates_h264_backend_from_config(tmp_path) -> None: - store = SqliteStore(path=str(tmp_path / "h264.db")) - backend = store._create_backend( - "cam", - Image, - payload_strategy=H264ImagePayloadStrategy( - 
storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) - ), - ) - assert isinstance(backend, Backend) - assert isinstance(backend.payload_strategy, H264ImagePayloadStrategy) - assert backend.payload_strategy.storage_config.mode == "h264" - assert isinstance(backend.payload_strategy.storage_config.codec_adapter, FakeH264CodecAdapter) + assert decoded == image + assert decoded.encoding == H264_IMAGE_ENCODING + assert decoded.codec_metadata["seq"] == 1 + assert decoded.width == 2 + assert decoded.height == 2 -def test_h264_image_stream_keeps_default_jpeg_compatibility(tmp_path) -> None: - store = SqliteStore(path=str(tmp_path / "jpeg.db")) - stream = store.stream("rgb", Image) - obs = stream.append(_image(1)) - assert obs.data.format == ImageFormat.RGB - assert store.stream("rgb").count() == 1 +def test_h264_image_codec_rejects_raw_images() -> None: + codec = H264ImageCodec() + + with pytest.raises(ValueError, match="encoded Images"): + codec.encode(_raw_image(1)) -def test_h264_one_observation_and_one_blob_per_frame(tmp_path) -> None: - conn = sqlite3.connect(str(tmp_path / "frames.db")) - backend = _make_backend(conn) - from dimos.memory2.type.observation import Observation +def test_codec_id_and_factory_support_h264_for_image() -> None: + codec = H264ImageCodec() - stored = backend.append(Observation(data_type=Image, _data=_image(1))) - assert stored.id == 1 - assert backend.blob_store is not None - assert backend.blob_store.get("cam", 1) - assert isinstance(backend.payload_strategy, H264ImagePayloadStrategy) - assert backend.payload_strategy.frame_index is not None - assert len(backend.payload_strategy.frame_index.rows("cam")) == 1 + assert codec_id(codec) == "h264" + assert isinstance(codec_from_id("h264", "dimos.msgs.sensor_msgs.Image.Image"), H264ImageCodec) -def test_h264_persistent_gop_index_and_lazy_decode(tmp_path) -> None: - db = tmp_path / "gop.db" +def test_h264_stream_stores_encoded_images_with_normal_backend(tmp_path) -> None: + db 
= tmp_path / "h264.db" with SqliteStore(path=str(db)) as store: - stream = store.stream( - "cam", - Image, - payload_strategy=H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) - ), - ) - stream.append(_image(1), ts=1.0) - stream.append(_image(2), ts=2.0) - observations = list(stream) - obs = observations[1] - assert obs._loader is not None - assert obs._data is _UNLOADED - assert obs.id == 2 - assert obs.ts == 2.0 - assert observations[0].data.data.shape == (2, 2, 3) - assert obs.data.data.shape == (2, 2, 3) - backend = stream._source - assert isinstance(backend.payload_strategy, H264ImagePayloadStrategy) - assert backend.payload_strategy.frame_index is not None - assert len(backend.payload_strategy.frame_index.rows("cam")) == 2 + stream = store.stream("cam", Image, codec="h264") + stored = stream.append(_encoded_image(1), ts=1.0) + assert stored.data.encoding == H264_IMAGE_ENCODING + assert stored.data.codec_metadata["seq"] == 1 with SqliteStore(path=str(db), must_exist=True) as reopened: stream = reopened.stream("cam", Image) - assert stream.count() == 2 - backend = stream._source - assert isinstance(backend.payload_strategy, H264ImagePayloadStrategy) - assert backend.payload_strategy.storage_config.mode == "h264" - backend.payload_strategy.storage_config = H264ImageStorageConfig( - codec_adapter=FakeH264CodecAdapter() - ) - assert reopened.streams.cam.first().data.data.shape == (2, 2, 3) - - -def test_h264_reopen_append_uses_nearest_reset_sequence_keyframe(tmp_path) -> None: - db = tmp_path / "reopen_append.db" - conn = sqlite3.connect(str(db)) - backend = _make_backend(conn) - from dimos.memory2.type.observation import Observation - - backend.append(Observation(ts=1.0, data_type=Image, _data=_image(1))) - backend.append(Observation(ts=2.0, data_type=Image, _data=_image(2))) - backend.stop() - conn.close() - - reopened_conn = sqlite3.connect(str(db)) - reopened_backend = _make_backend(reopened_conn) - 
reopened_backend.append(Observation(ts=3.0, data_type=Image, _data=_image(3))) - reopened_backend.append(Observation(ts=4.0, data_type=Image, _data=_image(4))) - - assert isinstance(reopened_backend.payload_strategy, H264ImagePayloadStrategy) - assert reopened_backend.payload_strategy.frame_index is not None - rows = reopened_backend.payload_strategy.frame_index.rows("cam") - assert [(row.observation_id, row.seq, row.keyframe_observation_id) for row in rows] == [ - (1, 0, 1), - (2, 1, 1), - (3, 0, 3), - (4, 1, 3), - ] + obs = stream.first() + assert obs.data.encoding == H264_IMAGE_ENCODING + assert obs.data.codec_metadata["seq"] == 1 + assert obs.data.width == 2 -def test_h264_mid_gop_decode_and_missing_gop_failure(tmp_path) -> None: - store = SqliteStore(path=str(tmp_path / "gap.db")) - stream = store.stream( - "cam", - Image, - payload_strategy=H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) - ), - ) - stream.append(_image(1)) - stream.append(_image(2)) - stream.append(_image(3)) - observations = list(stream) - assert [obs.data.data[0, 0, 0] for obs in observations] == [1, 2, 3] - - backend = stream._source - assert isinstance(backend.payload_strategy, H264ImagePayloadStrategy) - assert backend.payload_strategy.frame_index is not None - assert backend.blob_store is not None - backend.blob_store.delete("cam", 2) - gap_observations = list(stream) - assert gap_observations[0].data.data[0, 0, 0] == 1 - with pytest.raises(KeyError): - _ = gap_observations[1].data - gap_obs = gap_observations[2] - with pytest.raises(VideoDecodeGapError): - _ = gap_obs.data - - -def test_h264_replay_seek_suppresses_delta_until_next_keyframe(tmp_path) -> None: - config = H264ImageStorageConfig( - codec_adapter=FakeH264CodecAdapter(), - codec=H264ImageStorageConfig().codec, - ) - store = SqliteStore(path=str(tmp_path / "seek.db")) - stream = store.stream( - "cam", - Image, - 
payload_strategy=H264ImagePayloadStrategy(storage_config=config), - ) - for seq in range(1, 34): - stream.append(_image(seq), ts=float(seq)) - - replay = store.replay(from_timestamp=2.0) - images = list(replay.streams.cam.iterate()) - - assert images[0].ts == 31.0 - assert [img.data[0, 0, 0] for img in images[:3]] == [31, 32, 33] +def test_h264_replay_emits_encoded_images(tmp_path) -> None: + store = SqliteStore(path=str(tmp_path / "replay.db")) + stream = store.stream("cam", Image, codec="h264") + stream.append(_encoded_image(1), ts=1.0) + stream.append(_encoded_image(2, key=False), ts=2.0) + replayed = list(store.replay().streams.cam.iterate()) -def test_replay_iterate_returns_decoded_images(tmp_path) -> None: - store = SqliteStore(path=str(tmp_path / "replay.db")) - stream = store.stream( - "cam", - Image, - payload_strategy=H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) - ), - ) - stream.append(_image(1), ts=1.0) - stream.append(_image(2), ts=2.0) + assert [image.encoding for image in replayed] == [H264_IMAGE_ENCODING, H264_IMAGE_ENCODING] + assert [image.codec_metadata["seq"] for image in replayed] == [1, 2] - replay = store.replay() - images = list(replay.streams.cam.iterate()) - assert [img.ts for img in images] == [1.0, 2.0] - assert [img.data[0, 0, 0] for img in images] == [1, 2] +def test_default_image_stream_still_uses_jpeg_codec(tmp_path) -> None: + store = SqliteStore(path=str(tmp_path / "jpeg.db")) + stream = store.stream("rgb", Image) + stream.append(_raw_image(1)) -def test_h264_rejects_unsupported_formats(tmp_path) -> None: - store = SqliteStore(path=str(tmp_path / "bad.db")) - stream = store.stream( - "cam", - Image, - payload_strategy=H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) - ), - ) - rgba = np.zeros((2, 2, 4), dtype=np.uint8) - with pytest.raises(UnsupportedVideoImageError): - stream.append(Image.from_numpy(rgba, 
format=ImageFormat.RGBA)) + assert isinstance(stream._source.codec, JpegCodec) + assert store.stream("rgb").first().data.encoding == "raw" -def test_sqlite_delete_stream_removes_h264_frame_index_rows(tmp_path) -> None: - db = tmp_path / "delete.db" - store = SqliteStore(path=str(db)) - stream = store.stream( - "cam", - Image, - payload_strategy=H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig(codec_adapter=FakeH264CodecAdapter()) - ), - ) - stream.append(_image(1)) - store.delete_stream("cam") +def test_encoded_images_reject_pixel_operations() -> None: + image = _encoded_image(1) - conn = sqlite3.connect(str(db)) - count = conn.execute("SELECT COUNT(*) FROM h264_frames WHERE stream_name = 'cam'").fetchone()[0] - assert count == 0 + with pytest.raises(ValueError, match="requires raw Image data"): + image.to_rgb() + with pytest.raises(ValueError, match="requires raw Image data"): + image.as_numpy() diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py index 5eaca03886..e5d4ba1b02 100644 --- a/dimos/msgs/sensor_msgs/Image.py +++ b/dimos/msgs/sensor_msgs/Image.py @@ -17,6 +17,8 @@ import base64 from dataclasses import dataclass, field from enum import Enum +import json +import struct import time from typing import TYPE_CHECKING, Any, Literal, TypedDict import warnings @@ -82,41 +84,85 @@ class AgentImageMessage(TypedDict): data: str # Base64 encoded image data +RAW_IMAGE_ENCODING = "raw" +H264_IMAGE_ENCODING = "h264" +_ENCODED_IMAGE_MAGIC = b"DIMI1" + + +def _data_equal(left: np.ndarray[Any, np.dtype[Any]] | bytes, right: object) -> bool: + if isinstance(left, np.ndarray) and isinstance(right, np.ndarray): + return bool(np.array_equal(left, right)) + if isinstance(left, bytes) and isinstance(right, bytes): + return left == right + return False + + +def _pack_encoded_image_payload(metadata: dict[str, Any], payload: bytes) -> bytes: + header = json.dumps(metadata, sort_keys=True, separators=(",", ":")).encode("utf-8") + return 
_ENCODED_IMAGE_MAGIC + struct.pack(">I", len(header)) + header + payload + + +def _unpack_encoded_image_payload(payload: bytes) -> tuple[dict[str, Any], bytes]: + if not payload.startswith(_ENCODED_IMAGE_MAGIC): + return {}, payload + offset = len(_ENCODED_IMAGE_MAGIC) + header_len = struct.unpack(">I", payload[offset : offset + 4])[0] + header_start = offset + 4 + header_end = header_start + header_len + metadata = json.loads(payload[header_start:header_end].decode("utf-8")) + return metadata, payload[header_end:] + + @dataclass class Image(Timestamped): - """Simple NumPy-based image container.""" + """Image container for raw pixels or explicitly encoded image payloads.""" msg_name = "sensor_msgs.Image" - data: np.ndarray[Any, np.dtype[Any]] = field( + data: np.ndarray[Any, np.dtype[Any]] | bytes = field( default_factory=lambda: np.zeros((1, 1, 3), dtype=np.uint8) ) format: ImageFormat = field(default=ImageFormat.BGR) frame_id: str = field(default="") ts: float = field(default_factory=time.time) + encoding: str = field(default=RAW_IMAGE_ENCODING) + codec_metadata: dict[str, Any] = field(default_factory=dict) def __post_init__(self) -> None: - if not isinstance(self.data, np.ndarray): - self.data = np.asarray(self.data) - if self.data.ndim < 2: - raise ValueError("Image requires a 2D/3D NumPy array") + if self.encoding == RAW_IMAGE_ENCODING: + if not isinstance(self.data, np.ndarray): + self.data = np.asarray(self.data) + if not isinstance(self.data, np.ndarray): + raise TypeError("Raw Image payload must be a NumPy array") + arr: np.ndarray[Any, np.dtype[Any]] = self.data + if arr.ndim < 2: + raise ValueError("Image requires a 2D/3D NumPy array") + return + if isinstance(self.data, bytearray): + self.data = bytes(self.data) + elif not isinstance(self.data, bytes): + self.data = memoryview(np.ascontiguousarray(self.data)).tobytes() + if not self.data: + raise ValueError("Encoded Image payload cannot be empty") def __str__(self) -> str: return ( 
f"Image(shape={self.shape}, format={self.format.value}, dtype={self.dtype}, " - f"ts={to_human_readable(self.ts)})" + f"encoding={self.encoding}, ts={to_human_readable(self.ts)})" ) def __repr__(self) -> str: - return f"Image(shape={self.shape}, format={self.format.value}, dtype={self.dtype}, frame_id='{self.frame_id}', ts={self.ts})" + return f"Image(shape={self.shape}, format={self.format.value}, dtype={self.dtype}, encoding='{self.encoding}', frame_id='{self.frame_id}', ts={self.ts})" def __eq__(self, other: object) -> bool: if not isinstance(other, Image): return False return ( - np.array_equal(self.data, other.data) + _data_equal(self.data, other.data) and self.format == other.format and self.frame_id == other.frame_id + and self.encoding == other.encoding + and self.codec_metadata == other.codec_metadata and abs(self.ts - other.ts) < 1e-6 ) @@ -124,40 +170,101 @@ def __len__(self) -> int: return int(self.height * self.width) def __getstate__(self) -> dict[str, Any]: - return {"data": self.data, "format": self.format, "frame_id": self.frame_id, "ts": self.ts} + return { + "data": self.data, + "format": self.format, + "frame_id": self.frame_id, + "ts": self.ts, + "encoding": self.encoding, + "codec_metadata": self.codec_metadata, + } def __setstate__(self, state: dict[str, Any]) -> None: self.data = state.get("data", np.zeros((1, 1, 3), dtype=np.uint8)) self.format = state.get("format", ImageFormat.BGR) self.frame_id = state.get("frame_id", "") self.ts = state.get("ts", time.time()) + self.encoding = state.get("encoding", RAW_IMAGE_ENCODING) + self.codec_metadata = state.get("codec_metadata", {}) + + @property + def is_raw(self) -> bool: + return self.encoding == RAW_IMAGE_ENCODING + + @property + def is_encoded(self) -> bool: + return not self.is_raw + + def require_raw(self, operation: str = "operation") -> np.ndarray[Any, np.dtype[Any]]: + if self.encoding != RAW_IMAGE_ENCODING: + raise ValueError(f"{operation} requires raw Image data; got 
encoding={self.encoding!r}") + assert isinstance(self.data, np.ndarray) + return self.data @property def height(self) -> int: - return int(self.data.shape[0]) + if self.is_encoded: + return int(self.codec_metadata.get("height", 0)) + arr = self.require_raw("height") + return int(arr.shape[0]) @property def width(self) -> int: - return int(self.data.shape[1]) + if self.is_encoded: + return int(self.codec_metadata.get("width", 0)) + arr = self.require_raw("width") + return int(arr.shape[1]) @property def channels(self) -> int: - if self.data.ndim == 2: + if self.is_encoded: + if "channels" in self.codec_metadata: + return int(self.codec_metadata["channels"]) + if self.format in ( + ImageFormat.GRAY, + ImageFormat.GRAY16, + ImageFormat.DEPTH, + ImageFormat.DEPTH16, + ): + return 1 + if self.format in (ImageFormat.RGBA, ImageFormat.BGRA): + return 4 + return 3 + arr = self.require_raw("channels") + if arr.ndim == 2: return 1 - if self.data.ndim == 3: - return int(self.data.shape[2]) + if arr.ndim == 3: + return int(arr.shape[2]) raise ValueError("Invalid image dimensions") @property def shape(self) -> tuple[int, ...]: - return tuple(self.data.shape) + if self.is_encoded: + if self.channels == 1: + return (self.height, self.width) + return (self.height, self.width, self.channels) + return tuple(self.require_raw("shape").shape) @property def dtype(self) -> np.dtype[Any]: - return self.data.dtype + if self.is_encoded: + return np.dtype(self.codec_metadata.get("dtype", "uint8")) + return self.require_raw("dtype").dtype def copy(self) -> Image: - return Image(data=self.data.copy(), format=self.format, frame_id=self.frame_id, ts=self.ts) + data: np.ndarray[Any, np.dtype[Any]] | bytes + if self.is_encoded: + data = bytes(self.data) + else: + data = self.require_raw("copy").copy() + return Image( + data=data, + format=self.format, + frame_id=self.frame_id, + ts=self.ts, + encoding=self.encoding, + codec_metadata=dict(self.codec_metadata), + ) @classmethod def from_numpy( @@ 
-174,6 +281,27 @@ def from_numpy( ts=ts if ts is not None else time.time(), ) + @classmethod + def encoded( + cls, + *, + data: bytes, + encoding: str, + format: ImageFormat, + frame_id: str = "", + ts: float | None = None, + codec_metadata: dict[str, Any] | None = None, + ) -> Image: + metadata = dict(codec_metadata or {}) + return cls( + data=data, + format=format, + frame_id=frame_id, + ts=ts if ts is not None else time.time(), + encoding=encoding, + codec_metadata=metadata, + ) + @classmethod def from_file( cls, @@ -211,7 +339,7 @@ def from_opencv( def to_opencv(self) -> np.ndarray: """Convert to OpenCV BGR format.""" - arr = self.data + arr = self.require_raw("to_opencv") if self.format == ImageFormat.BGR: return arr if self.format == ImageFormat.RGB: @@ -231,12 +359,12 @@ def to_opencv(self) -> np.ndarray: def as_numpy(self) -> np.ndarray: """Get image data as numpy array.""" - return self.data + return self.require_raw("as_numpy") def to_rgb(self) -> Image: + arr = self.require_raw("to_rgb") if self.format == ImageFormat.RGB: return self.copy() - arr = self.data if self.format == ImageFormat.BGR: return Image( data=cv2.cvtColor(arr, cv2.COLOR_BGR2RGB), @@ -267,9 +395,9 @@ def to_rgb(self) -> Image: return self.copy() def to_bgr(self) -> Image: + arr = self.require_raw("to_bgr") if self.format == ImageFormat.BGR: return self.copy() - arr = self.data if self.format == ImageFormat.RGB: return Image( data=cv2.cvtColor(arr, cv2.COLOR_RGB2BGR), @@ -304,18 +432,19 @@ def to_bgr(self) -> Image: return self.copy() def to_grayscale(self) -> Image: + arr = self.require_raw("to_grayscale") if self.format in (ImageFormat.GRAY, ImageFormat.GRAY16, ImageFormat.DEPTH): return self.copy() if self.format == ImageFormat.BGR: return Image( - data=cv2.cvtColor(self.data, cv2.COLOR_BGR2GRAY), + data=cv2.cvtColor(arr, cv2.COLOR_BGR2GRAY), format=ImageFormat.GRAY, frame_id=self.frame_id, ts=self.ts, ) if self.format == ImageFormat.RGB: return Image( - data=cv2.cvtColor(self.data, 
cv2.COLOR_RGB2GRAY), + data=cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY), format=ImageFormat.GRAY, frame_id=self.frame_id, ts=self.ts, @@ -323,7 +452,7 @@ def to_grayscale(self) -> Image: if self.format in (ImageFormat.RGBA, ImageFormat.BGRA): code = cv2.COLOR_RGBA2GRAY if self.format == ImageFormat.RGBA else cv2.COLOR_BGRA2GRAY return Image( - data=cv2.cvtColor(self.data, code), + data=cv2.cvtColor(arr, code), format=ImageFormat.GRAY, frame_id=self.frame_id, ts=self.ts, @@ -332,11 +461,13 @@ def to_grayscale(self) -> Image: def to_rerun(self) -> Any: """Convert to rerun Image format.""" - return _format_to_rerun(self.data, self.format) + return _format_to_rerun(self.require_raw("to_rerun"), self.format) def resize(self, width: int, height: int, interpolation: int = cv2.INTER_LINEAR) -> Image: return Image( - data=cv2.resize(self.data, (width, height), interpolation=interpolation), + data=cv2.resize( + self.require_raw("resize"), (width, height), interpolation=interpolation + ), format=self.format, frame_id=self.frame_id, ts=self.ts, @@ -372,7 +503,8 @@ def crop(self, x: int, y: int, width: int, height: int) -> Image: Returns: A new Image containing the cropped region """ - img_height, img_width = self.data.shape[:2] + arr = self.require_raw("crop") + img_height, img_width = arr.shape[:2] # Clamp the crop region to image bounds x = max(0, min(x, img_width)) @@ -381,10 +513,10 @@ def crop(self, x: int, y: int, width: int, height: int) -> Image: y_end = min(y + height, img_height) # Perform the crop using array slicing - if self.data.ndim == 2: - cropped_data = self.data[y:y_end, x:x_end] + if arr.ndim == 2: + cropped_data = arr[y:y_end, x:x_end] else: - cropped_data = self.data[y:y_end, x:x_end, :] + cropped_data = arr[y:y_end, x:x_end, :] return Image(data=cropped_data, format=self.format, frame_id=self.frame_id, ts=self.ts) @@ -396,8 +528,9 @@ def brightness(self) -> float: reading every pixel, and the mean converges quickly (CLT). 
""" max_val = 65535.0 if self.format in (ImageFormat.GRAY16, ImageFormat.DEPTH16) else 255.0 - step = max(1, max(self.data.shape[:2]) // 256) - return float(self.data[::step, ::step].mean() / max_val) + arr = self.require_raw("brightness") + step = max(1, max(arr.shape[:2]) // 256) + return float(arr[::step, ::step].mean() / max_val) @property def sharpness(self) -> float: @@ -406,7 +539,7 @@ def sharpness(self) -> float: Downsamples to ~160px wide before computing Laplacian variance for fast evaluation (~10-20x cheaper than full-res Sobel). """ - gray = self.to_grayscale().data + gray = self.to_grayscale().require_raw("sharpness") # Downsample to ~160px wide for cheap evaluation h, w = gray.shape[:2] if w > 160: @@ -486,6 +619,31 @@ def lcm_encode(self, frame_id: str | None = None) -> bytes: msg.header.stamp.sec = int(now) msg.header.stamp.nsec = int((now - int(now)) * 1e9) + if self.is_encoded: + if not isinstance(self.data, bytes): + raise ValueError("Encoded Image payload must be bytes") + codec_metadata = dict(self.codec_metadata) + codec_metadata.setdefault("width", self.width) + codec_metadata.setdefault("height", self.height) + codec_metadata.setdefault("channels", self.channels) + codec_metadata.setdefault("dtype", str(self.dtype)) + metadata = { + "format": self.format.value, + "encoding": self.encoding, + "codec_metadata": codec_metadata, + } + packed = _pack_encoded_image_payload(metadata, self.data) + msg.height = self.height + msg.width = self.width + msg.encoding = self.encoding + msg.is_bigendian = False + msg.step = 0 + msg.data_length = len(packed) + msg.data = packed + return msg.lcm_encode() # type: ignore[no-any-return] + + arr = self.require_raw("lcm_encode") + # Image properties msg.height = self.height msg.width = self.width @@ -493,10 +651,10 @@ def lcm_encode(self, frame_id: str | None = None) -> bytes: msg.is_bigendian = False # Calculate step (bytes per row) - channels = 1 if self.data.ndim == 2 else self.data.shape[2] + channels = 1 if 
arr.ndim == 2 else arr.shape[2] msg.step = self.width * self.dtype.itemsize * channels - view = memoryview(np.ascontiguousarray(self.data)).cast("B") # type: ignore[arg-type] + view = memoryview(np.ascontiguousarray(arr)).cast("B") # type: ignore[arg-type] msg.data_length = len(view) msg.data = view @@ -509,6 +667,26 @@ def lcm_decode(cls, data: bytes, **kwargs: Any) -> Image: # JPEG-compressed images use a different decode path. if msg.encoding == "jpeg": return cls.lcm_jpeg_decode(data, **kwargs) + if msg.encoding == H264_IMAGE_ENCODING: + metadata, payload = _unpack_encoded_image_payload(bytes(msg.data)) + codec_metadata = dict(metadata.get("codec_metadata", {})) + codec_metadata.setdefault("width", msg.width) + codec_metadata.setdefault("height", msg.height) + image_format = ImageFormat(metadata.get("format", ImageFormat.RGB.value)) + return cls.encoded( + data=payload, + encoding=H264_IMAGE_ENCODING, + format=image_format, + frame_id=msg.header.frame_id if hasattr(msg, "header") else "", + ts=( + msg.header.stamp.sec + msg.header.stamp.nsec / 1e9 + if hasattr(msg, "header") + and hasattr(msg.header, "stamp") + and msg.header.stamp.sec > 0 + else time.time() + ), + codec_metadata=codec_metadata, + ) fmt, dtype, channels = _parse_lcm_encoding(msg.encoding) arr: np.ndarray[Any, Any] = np.frombuffer(msg.data, dtype=dtype) @@ -542,7 +720,7 @@ def to_jpeg_bytes(self, quality: int = 75) -> bytes: jpeg = TurboJPEG() # Canonicalize to RGB so JPEG bytes are deterministic regardless of input format. 
- rgb_array = self.to_rgb().data + rgb_array = self.to_rgb().require_raw("to_jpeg_bytes") return jpeg.encode(rgb_array, quality=quality, pixel_format=TJPF_RGB) # type: ignore[no-any-return] def lcm_jpeg_encode(self, quality: int = 75, frame_id: str | None = None) -> bytes: @@ -620,6 +798,8 @@ def lcm_jpeg_decode(cls, data: bytes, **kwargs: Any) -> Image: __all__ = [ + "H264_IMAGE_ENCODING", + "RAW_IMAGE_ENCODING", "Image", "ImageFormat", "sharpness_barrier", diff --git a/dimos/msgs/sensor_msgs/VideoPacket.py b/dimos/msgs/sensor_msgs/VideoPacket.py deleted file mode 100644 index 0ed619a20d..0000000000 --- a/dimos/msgs/sensor_msgs/VideoPacket.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright 2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -from dataclasses import dataclass -import json -import struct -from typing import Any, ClassVar - -_MAGIC = b"DVP1" - - -@dataclass(frozen=True) -class VideoPacket: - """One encoded video frame/access unit. - - The first supported shape is a complete H.264 Annex B access unit for - exactly one source image frame. Delta frames are complete encoded-frame - packets, but they are not necessarily independently decodable without the - preceding GOP state. 
- """ - - msg_name: ClassVar[str] = "sensor_msgs.VideoPacket" - - seq: int - ts: float - frame_id: str - width: int - height: int - format: str - codec: str - bitstream: str - is_keyframe: bool - keyframe_seq: int - pts: int - data: bytes - - def __post_init__(self) -> None: - if self.seq < 0: - raise ValueError("seq must be non-negative") - if self.width <= 0 or self.height <= 0: - raise ValueError("width and height must be positive") - if self.codec != "h264": - raise ValueError(f"Unsupported video codec: {self.codec!r}") - if self.bitstream != "annex_b": - raise ValueError(f"Unsupported video bitstream: {self.bitstream!r}") - if not isinstance(self.data, bytes): - object.__setattr__(self, "data", bytes(self.data)) - if len(self.data) == 0: - raise ValueError("VideoPacket data must not be empty") - - def lcm_encode(self) -> bytes: - """Encode into a compact self-describing binary envelope.""" - - header = { - "seq": self.seq, - "ts": self.ts, - "frame_id": self.frame_id, - "width": self.width, - "height": self.height, - "format": self.format, - "codec": self.codec, - "bitstream": self.bitstream, - "is_keyframe": self.is_keyframe, - "keyframe_seq": self.keyframe_seq, - "pts": self.pts, - } - header_bytes = json.dumps(header, separators=(",", ":")).encode("utf-8") - return _MAGIC + struct.pack("!I", len(header_bytes)) + header_bytes + self.data - - @classmethod - def lcm_decode(cls, payload: bytes) -> VideoPacket: - """Decode a packet produced by :meth:`lcm_encode`.""" - - if len(payload) < 8 or payload[:4] != _MAGIC: - raise ValueError("Invalid VideoPacket payload") - header_len = struct.unpack("!I", payload[4:8])[0] - header_start = 8 - header_end = header_start + header_len - if header_end > len(payload): - raise ValueError("Truncated VideoPacket header") - header: dict[str, Any] = json.loads(payload[header_start:header_end].decode("utf-8")) - return cls(data=payload[header_end:], **header) - - -__all__ = ["VideoPacket"] diff --git 
a/dimos/protocol/pubsub/impl/h264_lcm.py b/dimos/protocol/pubsub/impl/h264_lcm.py index 28e6f75ac7..69f8107930 100644 --- a/dimos/protocol/pubsub/impl/h264_lcm.py +++ b/dimos/protocol/pubsub/impl/h264_lcm.py @@ -17,7 +17,6 @@ from __future__ import annotations from dimos.msgs.sensor_msgs.Image import Image -from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket from dimos.protocol.pubsub.encoders import DecodingError, LCMTopicProto, PubSubEncoderMixin from dimos.protocol.pubsub.impl.lcmpubsub import LCMPubSubBase from dimos.protocol.video.h264 import H264Config, H264Decoder, H264Encoder, VideoDecodeGapError @@ -26,9 +25,16 @@ class H264EncoderMixin(PubSubEncoderMixin[LCMTopicProto, Image, bytes]): """Encoder mixin for Image streams using H.264 packets on the wire.""" - def __init__(self, *, config: H264Config | None = None, **kwargs: object) -> None: + def __init__( + self, + *, + config: H264Config | None = None, + decode_images: bool = True, + **kwargs: object, + ) -> None: super().__init__(**kwargs) # type: ignore[misc] self.h264_config = config or H264Config() + self.decode_images = decode_images self._encoder: H264Encoder | None = None self._decoder: H264Decoder | None = None @@ -42,14 +48,16 @@ def decode(self, msg: bytes, topic: LCMTopicProto) -> Image: raise DecodingError("Ignoring LCM_SELF_TEST topic") if topic.lcm_type is not None and not issubclass(topic.lcm_type, Image): raise DecodingError(f"H.264 LCM topic {topic.topic!r} is not typed as Image") - if self._decoder is None: - self._decoder = H264Decoder(self.h264_config) try: - packet = VideoPacket.lcm_decode(msg) + image = Image.lcm_decode(msg) except ValueError as exc: raise DecodingError(str(exc)) from exc + if not self.decode_images: + return image + if self._decoder is None: + self._decoder = H264Decoder(self.h264_config) try: - return self._decoder.decode(packet) + return self._decoder.decode(image) except VideoDecodeGapError as exc: raise DecodingError(str(exc)) from exc diff --git 
a/dimos/protocol/pubsub/impl/test_h264_lcm.py b/dimos/protocol/pubsub/impl/test_h264_lcm.py index 03ee35b88e..9db7b42b4a 100644 --- a/dimos/protocol/pubsub/impl/test_h264_lcm.py +++ b/dimos/protocol/pubsub/impl/test_h264_lcm.py @@ -21,8 +21,7 @@ import pytest from dimos.msgs.protocol import DimosMsg -from dimos.msgs.sensor_msgs.Image import Image, ImageFormat -from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket +from dimos.msgs.sensor_msgs.Image import H264_IMAGE_ENCODING, Image, ImageFormat from dimos.protocol.pubsub.encoders import DecodingError, LCMTopicProto from dimos.protocol.pubsub.impl.h264_lcm import H264LCM, H264EncoderMixin from dimos.protocol.video.h264 import VideoDecodeGapError @@ -34,36 +33,45 @@ class StubTopic: lcm_type: type[DimosMsg] | None = None +def _encoded(image: Image, *, seq: int = 0, key: bool = True) -> Image: + return Image.encoded( + data=b"\x00\x00\x00\x01\x65" if key else b"\x00\x00\x00\x01\x41", + encoding=H264_IMAGE_ENCODING, + format=image.format, + frame_id=image.frame_id, + ts=image.ts, + codec_metadata={ + "seq": seq, + "codec": "h264", + "bitstream": "annex_b", + "is_keyframe": key, + "keyframe_seq": seq if key else 0, + "pts": seq * 90, + "width": image.width, + "height": image.height, + "channels": image.channels, + "dtype": str(image.dtype), + }, + ) + + class FakeEncoder: - def encode(self, image: Image) -> VideoPacket: - return VideoPacket( - seq=0, - ts=image.ts, - frame_id=image.frame_id, - width=image.width, - height=image.height, - format=image.format.value, - codec="h264", - bitstream="annex_b", - is_keyframe=True, - keyframe_seq=0, - pts=90, - data=b"\x00\x00\x00\x01\x65", - ) + def encode(self, image: Image) -> Image: + return _encoded(image) class FakeDecoder: def __init__(self, *, fail: bool = False) -> None: self.fail = fail - def decode(self, packet: VideoPacket) -> Image: + def decode(self, image: Image) -> Image: if self.fail: raise VideoDecodeGapError("waiting for keyframe") return Image( - 
data=np.zeros((packet.height, packet.width, 3), dtype=np.uint8), - format=ImageFormat(packet.format), - frame_id=packet.frame_id, - ts=packet.ts, + data=np.zeros((image.height, image.width, 3), dtype=np.uint8), + format=image.format, + frame_id=image.frame_id, + ts=image.ts, ) @@ -92,50 +100,64 @@ class InMemoryH264PubSub(H264EncoderMixin, InMemoryPubSubBase): # type: ignore[ pass -def test_h264_lcm_encodes_image_as_video_packet_bytes() -> None: +def test_h264_lcm_encodes_image_as_h264_encoded_image_bytes() -> None: transport = H264LCM() transport._encoder = FakeEncoder() # type: ignore[assignment] image = Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") payload = transport.encode(image, StubTopic("/color", Image)) - packet = VideoPacket.lcm_decode(payload) + encoded = Image.lcm_decode(payload) - assert packet.codec == "h264" - assert packet.bitstream == "annex_b" - assert packet.width == 3 - assert packet.height == 2 - assert packet.is_keyframe is True + assert encoded.encoding == H264_IMAGE_ENCODING + assert encoded.codec_metadata["codec"] == "h264" + assert encoded.codec_metadata["bitstream"] == "annex_b" + assert encoded.width == 3 + assert encoded.height == 2 + assert encoded.codec_metadata["is_keyframe"] is True -def test_h264_lcm_decodes_video_packet_bytes_to_image() -> None: +def test_h264_lcm_decodes_h264_image_bytes_to_raw_image() -> None: transport = H264LCM() transport._decoder = FakeDecoder() # type: ignore[assignment] - packet = FakeEncoder().encode( + encoded = FakeEncoder().encode( Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") ) - image = transport.decode(packet.lcm_encode(), StubTopic("/color", Image)) + image = transport.decode(encoded.lcm_encode(), StubTopic("/color", Image)) + assert image.encoding == "raw" assert image.frame_id == "cam" assert image.shape == (2, 3, 3) +def test_h264_lcm_decode_false_returns_encoded_image() -> None: + transport = 
H264LCM(decode_images=False) + encoded = FakeEncoder().encode( + Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") + ) + + image = transport.decode(encoded.lcm_encode(), StubTopic("/color", Image)) + + assert image.encoding == H264_IMAGE_ENCODING + assert image.frame_id == "cam" + + def test_h264_lcm_suppresses_decode_gap() -> None: transport = H264LCM() transport._decoder = FakeDecoder(fail=True) # type: ignore[assignment] - packet = FakeEncoder().encode( + encoded = FakeEncoder().encode( Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") ) with pytest.raises(DecodingError, match="waiting for keyframe"): - transport.decode(packet.lcm_encode(), StubTopic("/color", Image)) + transport.decode(encoded.lcm_encode(), StubTopic("/color", Image)) -def test_h264_lcm_suppresses_non_video_packet_payload() -> None: +def test_h264_lcm_suppresses_non_image_payload() -> None: transport = H264LCM() - with pytest.raises(DecodingError, match="Invalid VideoPacket payload"): - transport.decode(b"not-a-video-packet", StubTopic("/color", Image)) + with pytest.raises(DecodingError): + transport.decode(b"not-an-image", StubTopic("/color", Image)) def test_h264_lcm_publish_subscribe_delivers_decoded_image() -> None: @@ -164,16 +186,20 @@ def test_h264_lcm_late_subscriber_waits_for_keyframe() -> None: transport._decoder = decoder # type: ignore[assignment] transport.subscribe(topic, lambda image, _topic: received.append(image)) - delta_packet = FakeEncoder().encode( - Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") + delta = _encoded( + Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam"), + seq=1, + key=False, ) - InMemoryPubSubBase.publish(transport, topic, delta_packet.lcm_encode()) + InMemoryPubSubBase.publish(transport, topic, delta.lcm_encode()) decoder.fail = False - keyframe_packet = FakeEncoder().encode( - Image(data=np.zeros((2, 3, 
3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") + keyframe = _encoded( + Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam"), + seq=2, + key=True, ) - InMemoryPubSubBase.publish(transport, topic, keyframe_packet.lcm_encode()) + InMemoryPubSubBase.publish(transport, topic, keyframe.lcm_encode()) assert len(received) == 1 assert received[0].frame_id == "cam" diff --git a/dimos/protocol/video/demo_h264_video_e2e.py b/dimos/protocol/video/demo_h264_video_e2e.py index a848bd1eda..b06acdbe76 100644 --- a/dimos/protocol/video/demo_h264_video_e2e.py +++ b/dimos/protocol/video/demo_h264_video_e2e.py @@ -30,10 +30,9 @@ from dimos.hardware.sensors.camera.webcam import Webcam from dimos.memory2.module import OnExisting, Recorder from dimos.memory2.store.sqlite import SqliteStore -from dimos.memory2.video.h264 import H264ImagePayloadStrategy, H264ImageStorageConfig from dimos.msgs.sensor_msgs.Image import Image, ImageFormat from dimos.protocol.pubsub.impl.h264_lcm import H264LCM -from dimos.protocol.video.h264 import H264Config +from dimos.protocol.video.h264 import H264Config, H264Decoder, VideoDecodeGapError from dimos.utils.logging_config import setup_logger from dimos.visualization.vis_module import vis_module @@ -130,7 +129,7 @@ class H264MemoryReplayConfig(ModuleConfig): class H264MemoryReplay(Module): - """Replay a memory2 H.264 image stream as normal `Image` frames.""" + """Replay a memory2 H.264 image stream as decoded `Image` frames.""" config: H264MemoryReplayConfig color_image: Out[Image] @@ -145,13 +144,22 @@ def start(self) -> None: duration=self.config.duration, loop=self.config.loop, ) + decoder = H264Decoder(_webcam_h264_config) + + def publish_decoded(image: Image) -> None: + try: + self.color_image.publish(decoder.decode(image)) + except VideoDecodeGapError: + # V1 best effort: seek/replay can begin mid-GOP. Suppress deltas + # until the next keyframe restores decoder state. 
+ return def on_error(error: Exception) -> None: logger.error("H.264 replay pipeline error: %s", error, exc_info=True) self.register_disposable( replay.streams.color_image.observable().subscribe( - on_next=self.color_image.publish, + on_next=publish_decoded, on_error=on_error, ) ) @@ -197,7 +205,11 @@ def _on_image(self, image: Image) -> None: self._received += 1 if self._received % 10 == 0: - logger.info("H.264 video probe received %s decoded frames", self._received) + logger.info( + "H.264 video probe received %s %s frames", + self._received, + image.encoding, + ) @rpc def summary(self) -> str: @@ -224,11 +236,7 @@ def _webcam() -> Webcam: H264E2ERecorder.blueprint( db_path="h264_video_e2e.db", on_existing=OnExisting.OVERWRITE, - payload_strategies={ - "color_image": H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig(codec=_h264_config) - ), - }, + codecs={"color_image": "h264"}, ), H264VideoProbe.blueprint(), ).transports( @@ -237,6 +245,7 @@ def _webcam() -> Webcam: "/demo_h264_video_e2e/color_image", Image, config=_h264_config, + decode_images=False, ) } ) @@ -247,11 +256,7 @@ def _webcam() -> Webcam: H264WebcamRecorder.blueprint( db_path="webcam_h264.db", on_existing=OnExisting.OVERWRITE, - payload_strategies={ - "color_image": H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig(codec=_webcam_h264_config) - ), - }, + codecs={"color_image": "h264"}, ), ).transports( { @@ -259,6 +264,7 @@ def _webcam() -> Webcam: "/demo_h264_webcam_record/color_image", Image, config=_webcam_h264_config, + decode_images=False, ) } ) diff --git a/dimos/protocol/video/h264.py b/dimos/protocol/video/h264.py index 52303d45c6..187c1c1772 100644 --- a/dimos/protocol/video/h264.py +++ b/dimos/protocol/video/h264.py @@ -17,17 +17,20 @@ from collections.abc import Callable, Sequence from dataclasses import dataclass, field from fractions import Fraction -from typing import TYPE_CHECKING, Protocol +from typing import TYPE_CHECKING, Any, Protocol import numpy 
as np -from dimos.msgs.sensor_msgs.Image import Image, ImageFormat -from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket +from dimos.msgs.sensor_msgs.Image import H264_IMAGE_ENCODING, Image, ImageFormat if TYPE_CHECKING: import av +H264_CODEC = "h264" +H264_BITSTREAM = "annex_b" + + class MissingVideoDependencyError(ImportError): """Raised when H.264 support is selected without required video packages.""" @@ -72,7 +75,7 @@ class H264CodecAdapter(Protocol): def encode_image(self, image: Image, *, force_keyframe: bool) -> tuple[bytes, int]: ... - def decode_packet(self, packet: VideoPacket) -> Image: ... + def decode_image(self, image: Image) -> Image: ... @dataclass(frozen=True) @@ -100,6 +103,10 @@ def from_rtp_payloads( def ensure_supported_image(image: Image, config: H264Config) -> None: """Validate the first-version H.264 image input contract.""" + if image.encoding != "raw": + raise UnsupportedVideoImageError( + f"H.264 encoding expects raw Image data; got encoding={image.encoding!r}" + ) if image.format not in config.supported_formats: supported = ", ".join(fmt.value for fmt in config.supported_formats) raise UnsupportedVideoImageError( @@ -115,6 +122,26 @@ def ensure_supported_image(image: Image, config: H264Config) -> None: ) +def h264_metadata(image: Image) -> dict[str, Any]: + """Return validated H.264 metadata from an encoded Image.""" + + if image.encoding != H264_IMAGE_ENCODING: + raise ValueError(f"Expected H.264 encoded Image, got encoding={image.encoding!r}") + metadata = image.codec_metadata + if metadata.get("codec", H264_CODEC) != H264_CODEC: + raise ValueError(f"Expected codec={H264_CODEC!r}, got {metadata.get('codec')!r}") + if metadata.get("bitstream", H264_BITSTREAM) != H264_BITSTREAM: + raise ValueError( + f"Expected bitstream={H264_BITSTREAM!r}, got {metadata.get('bitstream')!r}" + ) + for key in ("seq", "is_keyframe", "keyframe_seq", "pts", "width", "height"): + if key not in metadata: + raise ValueError(f"H.264 encoded Image missing 
metadata field {key!r}") + if not isinstance(image.data, bytes): + raise ValueError("H.264 encoded Image payload must be bytes") + return metadata + + class AiortcH264Codec: """Small adapter around aiortc's H.264 encoder/decoder internals.""" @@ -150,30 +177,34 @@ def encode_image(self, image: Image, *, force_keyframe: bool) -> tuple[bytes, in access_unit = H264AccessUnit.from_rtp_payloads(payloads, self._depayload) return access_unit.data, int(pts) - def decode_packet(self, packet: VideoPacket) -> Image: - frame = self._jitter_frame_type(data=packet.data, timestamp=packet.pts) + def decode_image(self, image: Image) -> Image: + metadata = h264_metadata(image) + assert isinstance(image.data, bytes) + frame = self._jitter_frame_type(data=image.data, timestamp=int(metadata["pts"])) decoded_frames = self._decoder.decode(frame) if not decoded_frames: raise VideoDecodeGapError("H.264 decoder produced no frame") - return self._from_video_frame(decoded_frames[0], packet) + return self._from_video_frame(decoded_frames[0], image) def _to_video_frame(self, image: Image) -> av.VideoFrame: fmt = _av_input_format(image.format) - frame = self._av.VideoFrame.from_ndarray(np.ascontiguousarray(image.data), format=fmt) + frame = self._av.VideoFrame.from_ndarray( + np.ascontiguousarray(image.require_raw("h264 encode")), format=fmt + ) frame.pts = self._frame_index frame.time_base = self._time_base self._frame_index += 1 return frame @staticmethod - def _from_video_frame(frame: av.VideoFrame, packet: VideoPacket) -> Image: - image_format = ImageFormat(packet.format) + def _from_video_frame(frame: av.VideoFrame, image: Image) -> Image: + image_format = image.format arr = frame.to_ndarray(format=_av_input_format(image_format)) - return Image(data=arr, format=image_format, frame_id=packet.frame_id, ts=packet.ts) + return Image(data=arr, format=image_format, frame_id=image.frame_id, ts=image.ts) class H264Encoder: - """Encode a normal DimOS Image stream into per-frame H.264 packets.""" + 
"""Encode a normal DimOS Image stream into per-frame H.264 Images.""" def __init__( self, @@ -186,28 +217,33 @@ def __init__( self._seq = 0 self._keyframe_seq = -1 - def encode(self, image: Image, *, force_keyframe: bool = False) -> VideoPacket: + def encode(self, image: Image, *, force_keyframe: bool = False) -> Image: ensure_supported_image(image, self.config) is_keyframe = self._should_force_keyframe(force_keyframe) access_unit, pts = self._codec.encode_image(image, force_keyframe=is_keyframe) if is_keyframe: self._keyframe_seq = self._seq - packet = VideoPacket( - seq=self._seq, - ts=image.ts, - frame_id=image.frame_id, - width=image.width, - height=image.height, - format=image.format.value, - codec="h264", - bitstream="annex_b", - is_keyframe=is_keyframe, - keyframe_seq=self._keyframe_seq, - pts=pts, + metadata: dict[str, Any] = { + "seq": self._seq, + "codec": H264_CODEC, + "bitstream": H264_BITSTREAM, + "is_keyframe": is_keyframe, + "keyframe_seq": self._keyframe_seq, + "pts": pts, + "width": image.width, + "height": image.height, + "channels": image.channels, + "dtype": str(image.dtype), + } + self._seq += 1 + return Image.encoded( data=access_unit, + encoding=H264_IMAGE_ENCODING, + format=image.format, + frame_id=image.frame_id, + ts=image.ts, + codec_metadata=metadata, ) - self._seq += 1 - return packet def _should_force_keyframe(self, requested: bool) -> bool: if requested or self._seq == 0 or self._keyframe_seq < 0: @@ -217,35 +253,40 @@ def _should_force_keyframe(self, requested: bool) -> bool: class GopBuffer: - """Track H.264 GOP validity across a packet stream.""" + """Track H.264 GOP validity across an encoded Image stream.""" def __init__(self) -> None: self.expected_seq: int | None = None self.keyframe_seq: int | None = None self.valid = False - def accept(self, packet: VideoPacket) -> bool: - """Return True when the packet can be safely decoded.""" + def accept(self, image: Image) -> bool: + """Return True when the encoded Image can be safely 
decoded.""" - if self.expected_seq is not None and packet.seq != self.expected_seq: + metadata = h264_metadata(image) + seq = int(metadata["seq"]) + keyframe_seq = int(metadata["keyframe_seq"]) + is_keyframe = bool(metadata["is_keyframe"]) + + if self.expected_seq is not None and seq != self.expected_seq: self.valid = False - self.expected_seq = packet.seq + 1 + self.expected_seq = seq + 1 - if packet.is_keyframe: - self.keyframe_seq = packet.seq + if is_keyframe: + self.keyframe_seq = seq self.valid = True return True if not self.valid: return False - if self.keyframe_seq is None or packet.keyframe_seq != self.keyframe_seq: + if self.keyframe_seq is None or keyframe_seq != self.keyframe_seq: self.valid = False return False return True class H264Decoder: - """Decode per-frame H.264 packets into normal DimOS Images.""" + """Decode H.264 encoded Images into normal raw DimOS Images.""" def __init__( self, @@ -258,12 +299,13 @@ def __init__( self._codec = codec or AiortcH264Codec(self.config) self._gop_buffer = gop_buffer or GopBuffer() - def decode(self, packet: VideoPacket) -> Image: - if not self._gop_buffer.accept(packet): + def decode(self, image: Image) -> Image: + metadata = h264_metadata(image) + if not self._gop_buffer.accept(image): raise VideoDecodeGapError( - f"Cannot decode H.264 packet seq={packet.seq}; waiting for next keyframe" + f"Cannot decode H.264 image seq={metadata['seq']}; waiting for next keyframe" ) - return self._codec.decode_packet(packet) + return self._codec.decode_image(image) def _av_input_format(format: ImageFormat) -> str: @@ -279,6 +321,8 @@ def _av_input_format(format: ImageFormat) -> str: __all__ = [ + "H264_BITSTREAM", + "H264_CODEC", "AiortcH264Codec", "GopBuffer", "H264AccessUnit", @@ -290,4 +334,5 @@ def _av_input_format(format: ImageFormat) -> str: "UnsupportedVideoImageError", "VideoDecodeGapError", "ensure_supported_image", + "h264_metadata", ] diff --git a/dimos/protocol/video/test_h264.py b/dimos/protocol/video/test_h264.py 
index 6f3825db7f..71aba194bf 100644 --- a/dimos/protocol/video/test_h264.py +++ b/dimos/protocol/video/test_h264.py @@ -20,8 +20,7 @@ import numpy as np import pytest -from dimos.msgs.sensor_msgs.Image import Image, ImageFormat -from dimos.msgs.sensor_msgs.VideoPacket import VideoPacket +from dimos.msgs.sensor_msgs.Image import H264_IMAGE_ENCODING, Image, ImageFormat from dimos.protocol.video.h264 import ( AiortcH264Codec, GopBuffer, @@ -32,13 +31,14 @@ MissingVideoDependencyError, UnsupportedVideoImageError, VideoDecodeGapError, + h264_metadata, ) @dataclass class FakeCodec: encoded_force_keyframes: list[bool] - decoded_packets: list[int] + decoded_sequences: list[int] def encode_image(self, image: Image, *, force_keyframe: bool) -> tuple[bytes, int]: self.encoded_force_keyframes.append(force_keyframe) @@ -46,13 +46,14 @@ def encode_image(self, image: Image, *, force_keyframe: bool) -> tuple[bytes, in return b"\x00\x00\x00\x01\x67sps\x00\x00\x00\x01\x68pps\x00\x00\x00\x01\x65idr", 90 return b"\x00\x00\x00\x01\x41delta", 180 - def decode_packet(self, packet: VideoPacket) -> Image: - self.decoded_packets.append(packet.seq) + def decode_image(self, image: Image) -> Image: + metadata = h264_metadata(image) + self.decoded_sequences.append(int(metadata["seq"])) return Image( - data=np.zeros((packet.height, packet.width, 3), dtype=np.uint8), - format=ImageFormat(packet.format), - frame_id=packet.frame_id, - ts=packet.ts, + data=np.zeros((image.height, image.width, 3), dtype=np.uint8), + format=image.format, + frame_id=image.frame_id, + ts=image.ts, ) @@ -65,31 +66,38 @@ def _image(format: ImageFormat = ImageFormat.RGB, dtype: np.dtype = np.dtype(np. 
) -def _packet(seq: int, *, key: bool, keyframe_seq: int | None = None) -> VideoPacket: - return VideoPacket( - seq=seq, - ts=123.0 + seq, - frame_id="cam", - width=6, - height=4, - format=ImageFormat.RGB.value, - codec="h264", - bitstream="annex_b", - is_keyframe=key, - keyframe_seq=seq if key else (0 if keyframe_seq is None else keyframe_seq), - pts=seq * 90, +def _encoded(seq: int, *, key: bool, keyframe_seq: int | None = None) -> Image: + return Image.encoded( data=b"\x00\x00\x00\x01\x65" if key else b"\x00\x00\x00\x01\x41", + encoding=H264_IMAGE_ENCODING, + format=ImageFormat.RGB, + frame_id="cam", + ts=123.0 + seq, + codec_metadata={ + "seq": seq, + "codec": "h264", + "bitstream": "annex_b", + "is_keyframe": key, + "keyframe_seq": seq if key else (0 if keyframe_seq is None else keyframe_seq), + "pts": seq * 90, + "width": 6, + "height": 4, + "channels": 3, + "dtype": "uint8", + }, ) -def test_video_packet_serializes_complete_access_unit() -> None: - packet = _packet(0, key=True) +def test_encoded_h264_image_lcm_roundtrips_metadata_and_access_unit() -> None: + image = _encoded(0, key=True) - decoded = VideoPacket.lcm_decode(packet.lcm_encode()) + decoded = Image.lcm_decode(image.lcm_encode()) - assert decoded == packet - assert decoded.codec == "h264" - assert decoded.bitstream == "annex_b" + assert decoded == image + assert decoded.encoding == H264_IMAGE_ENCODING + assert decoded.codec_metadata["codec"] == "h264" + assert decoded.codec_metadata["bitstream"] == "annex_b" + assert isinstance(decoded.data, bytes) assert decoded.data.startswith(b"\x00\x00\x00\x01") @@ -102,39 +110,52 @@ def test_access_unit_assembles_depayloaded_annex_b_fragments() -> None: assert unit.data == b"\x00\x00\x00\x01payload-a\x00\x00\x00\x01payload-b" -def test_encoder_emits_keyframe_metadata_and_periodic_keyframes() -> None: - codec = FakeCodec(encoded_force_keyframes=[], decoded_packets=[]) +def test_encoder_emits_encoded_image_metadata_and_periodic_keyframes() -> None: + codec = 
FakeCodec(encoded_force_keyframes=[], decoded_sequences=[]) encoder = H264Encoder(H264Config(keyframe_interval=2, max_gop_frames=2), codec=codec) p0 = encoder.encode(_image()) p1 = encoder.encode(_image()) p2 = encoder.encode(_image()) - assert [p0.seq, p1.seq, p2.seq] == [0, 1, 2] - assert [p0.is_keyframe, p1.is_keyframe, p2.is_keyframe] == [True, False, True] - assert [p0.keyframe_seq, p1.keyframe_seq, p2.keyframe_seq] == [0, 0, 2] + assert [p0.codec_metadata["seq"], p1.codec_metadata["seq"], p2.codec_metadata["seq"]] == [ + 0, + 1, + 2, + ] + assert [ + p0.codec_metadata["is_keyframe"], + p1.codec_metadata["is_keyframe"], + p2.codec_metadata["is_keyframe"], + ] == [True, False, True] + assert [ + p0.codec_metadata["keyframe_seq"], + p1.codec_metadata["keyframe_seq"], + p2.codec_metadata["keyframe_seq"], + ] == [0, 0, 2] assert codec.encoded_force_keyframes == [True, False, True] + assert isinstance(p0.data, bytes) assert b"\x67" in p0.data and b"\x68" in p0.data def test_gop_buffer_suppresses_delta_after_sequence_gap_until_keyframe() -> None: - codec = FakeCodec(encoded_force_keyframes=[], decoded_packets=[]) + codec = FakeCodec(encoded_force_keyframes=[], decoded_sequences=[]) decoder = H264Decoder(codec=codec, gop_buffer=GopBuffer()) - assert decoder.decode(_packet(0, key=True)).frame_id == "cam" - assert decoder.decode(_packet(1, key=False, keyframe_seq=0)).frame_id == "cam" + assert decoder.decode(_encoded(0, key=True)).frame_id == "cam" + assert decoder.decode(_encoded(1, key=False, keyframe_seq=0)).frame_id == "cam" with pytest.raises(VideoDecodeGapError): - decoder.decode(_packet(3, key=False, keyframe_seq=0)) + decoder.decode(_encoded(3, key=False, keyframe_seq=0)) with pytest.raises(VideoDecodeGapError): - decoder.decode(_packet(4, key=False, keyframe_seq=0)) + decoder.decode(_encoded(4, key=False, keyframe_seq=0)) - assert decoder.decode(_packet(5, key=True)).frame_id == "cam" - assert codec.decoded_packets == [0, 1, 5] + assert 
decoder.decode(_encoded(5, key=True)).frame_id == "cam" + assert codec.decoded_sequences == [0, 1, 5] def test_unsupported_image_format_and_dtype_fail_explicitly() -> None: - codec = FakeCodec(encoded_force_keyframes=[], decoded_packets=[]) + codec = FakeCodec(encoded_force_keyframes=[], decoded_sequences=[]) encoder = H264Encoder(codec=codec) with pytest.raises(UnsupportedVideoImageError, match="RGBA"): diff --git a/docs/capabilities/memory/index.md b/docs/capabilities/memory/index.md index b95b024f8b..537fdc4283 100644 --- a/docs/capabilities/memory/index.md +++ b/docs/capabilities/memory/index.md @@ -215,19 +215,13 @@ matters and frame-to-frame compression is worth the dependency cost. ```python skip from dimos.memory2.store.sqlite import SqliteStore -from dimos.memory2.video.h264 import H264ImagePayloadStrategy, H264ImageStorageConfig from dimos.msgs.sensor_msgs.Image import Image -from dimos.protocol.video.h264 import H264Config store = SqliteStore(path="robot_video.db") color = store.stream( "color_image", Image, - payload_strategy=H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig( - codec=H264Config(bitrate=2_000_000, keyframe_interval=30), - ), - ), + codec="h264", ) ``` @@ -235,32 +229,24 @@ Recorders can configure the same setting per input stream: ```python skip from dimos.memory2.module import Recorder -from dimos.memory2.video.h264 import H264ImagePayloadStrategy, H264ImageStorageConfig -from dimos.protocol.video.h264 import H264Config recorder = Recorder.blueprint( db_path="robot_video.db", - payload_strategies={ - "color_image": H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig( - codec=H264Config(bitrate=2_000_000, keyframe_interval=30), - ), - ) - }, + codecs={"color_image": "h264"}, ) ``` H.264 storage keeps the normal memory2 shape: one observation row per source -frame. 
The blob for that observation stores one serialized video packet whose -payload is a complete H.264 Annex B access unit, not individual RTP fragments. -The store also writes H.264 frame metadata for cleanup, diagnostics, and future -indexed decode work. +frame. The blob for that observation stores one encoded `Image` whose data is a +complete H.264 Annex B access unit, not individual RTP fragments. H.264 frame +metadata lives in `Image.codec_metadata`. Metadata queries do not decode pixels. You can inspect timestamps, poses, tags, -and frame ids without paying decode cost. Accessing `obs.data` decodes lazily -when the H.264 decode session has valid GOP state and returns a normal `Image`. -Replay emits decoded `Image` values in timestamp order and suppresses deltas -until the first keyframe at or after the replay start point. +frame ids, `Image.encoding`, and H.264 codec metadata without paying decode +cost. Accessing `obs.data` returns an encoded `Image` for H.264 streams. Use an +explicit H.264 decode session to convert replayed encoded images to raw pixel +images; that decoder suppresses deltas until the first keyframe at or after the +replay start point. H.264 storage currently supports uint8 RGB, BGR, and grayscale images. It raises an explicit error for depth images, 16-bit images, alpha formats, and other diff --git a/docs/coding-agents/style.md b/docs/coding-agents/style.md index 7e903b408b..657399b40c 100644 --- a/docs/coding-agents/style.md +++ b/docs/coding-agents/style.md @@ -50,14 +50,15 @@ from dimos.memory2.store.base import Store from dimos.memory2.stream import Stream ``` -## H.264 image packet shape +## H.264 encoded Image shape When editing H.264 image transport or memory2 storage, keep the public module contract as `Out[Image]` and `In[Image]`. Do not expose RTP fragments to module authors or memory2 observations. 
-For LCM, DDS, and memory2 storage, each encoded packet must contain all H.264 NAL -units for exactly one source frame as one Annex B access unit. Store one memory2 -observation per source frame. P-frames still depend on earlier GOP state, so -decode from a valid keyframe and suppress output after sequence gaps, late join, -or replay seek until the next keyframe. +For LCM, DDS, and memory2 storage, each encoded `Image` must contain all H.264 +NAL units for exactly one source frame as one Annex B access unit, with H.264 +frame state in `Image.codec_metadata`. Store one memory2 observation per source +frame. P-frames still depend on earlier GOP state, so decode from a valid +keyframe and suppress output after sequence gaps, late join, or replay seek until +the next keyframe. diff --git a/docs/development/testing.md b/docs/development/testing.md index b402fd8114..306a6403b7 100644 --- a/docs/development/testing.md +++ b/docs/development/testing.md @@ -67,8 +67,8 @@ pytest -m self_hosted dimos/path/to/test_something.py The H.264 unit tests use fake codec adapters where possible, so they run in the default suite without requiring FFmpeg/libx264. Run the focused tests after -changing video packet shape, eager `Image` compatibility, H.264 transport, memory2 -storage, or the demo blueprint: +changing encoded `Image` shape, eager/raw `Image` compatibility, H.264 transport, +memory2 storage, or the demo blueprint: ```bash uv run pytest dimos/protocol/video/test_h264.py dimos/msgs/sensor_msgs/test_image.py -q @@ -80,8 +80,8 @@ CI=1 uv run pytest dimos/robot/test_all_blueprints_generation.py -q The runtime H.264 path uses `aiortc`, PyAV, FFmpeg, and libx264. If a test or manual run instantiates the real codec and those dependencies are missing, H.264 should fail with an actionable dependency error. Keep fake-adapter unit tests in -place so the default suite still covers packet semantics, GOP handling, and -memory2 behavior. 
+place so the default suite still covers encoded-image semantics, GOP handling, +and memory2 behavior. When you add or rename a runnable demo blueprint, regenerate `dimos/robot/all_blueprints.py` with: diff --git a/docs/usage/transports/index.md b/docs/usage/transports/index.md index 7d5a30113d..062096716e 100644 --- a/docs/usage/transports/index.md +++ b/docs/usage/transports/index.md @@ -120,7 +120,8 @@ Use `H264LcmTransport` when a high-rate `Image` stream needs video compression over LCM. The module API stays the same: publishers still call `Out[Image].publish(image)`, and subscribers still receive `Image` values. The transport encodes each source frame as one H.264 Annex B access unit on the wire -and decodes it at the subscriber. +and decodes it at the subscriber by default. Set `decode_images=False` when a +subscriber, such as a recorder, should receive encoded `Image` values instead. ```python skip from dimos.core.transport import H264LcmTransport @@ -142,6 +143,11 @@ blueprint = blueprint.transports( ) ``` +Encoded delivery uses the same public `Image` type but sets +`image.encoding == "h264"`, stores the Annex B payload in `image.data`, and stores +sequence/keyframe metadata in `image.codec_metadata`. Raw-pixel methods raise for +encoded images; decode them through an H.264 decode session first. + H.264 transport is opt-in. The default image paths remain unchanged: normal LCM uses the `Image` LCM encoding, and memory2 still stores images with the default JPEG codec unless configured otherwise. diff --git a/openspec/changes/add-h264-codec-mem2-storage/design.md b/openspec/changes/add-h264-codec-mem2-storage/design.md index fa9dd99fbb..f18645459a 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/design.md +++ b/openspec/changes/add-h264-codec-mem2-storage/design.md @@ -1,458 +1,158 @@ ## Context -DimOS transports currently move typed stream payloads and are mostly stateless per message.
Image-specific compression already exists as JPEG transport adapters: `JpegLcmTransport` and `JpegShmTransport` wrap a carrier with an image encoder/decoder while subscribers still receive `Image` objects. This is the right precedent for H.264, but H.264 differs because decoding depends on GOP state rather than one independent compressed frame. +DimOS modules exchange typed `Image` streams. Existing JPEG compression keeps that public type stable: JPEG is a storage/transport codec detail, and callers usually see decoded raw pixels. H.264 needs a similar opt-in path, but it differs from JPEG because many frames are delta frames that require prior GOP state to decode. -memory2 currently stores image observations through the normal `Backend` path. `codec_for(Image)` selects `JpegCodec`, which is stateless per row. `Observation` already supports lazy payloads through `_UNLOADED` and `_loader`, which is the correct surface for memory2 H.264 decode. The current `Codec.encode(value) -> bytes` contract is not expressive enough for H.264 writes because the encoder is stateful, keyframes are periodic, and later packets depend on earlier packets. +This design keeps the PR minimal and coherent with existing abstractions: -The design therefore introduces H.264 as an image codec layer that can sit above multiple carriers, not as a replacement for LCM, DDS, ROS, SHM, or WebRTC. Carrier adapters move compressed video packets between machines; endpoint adapters decode those packets back into `Image` objects for normal modules. +- `Image` remains the public payload type. +- Default image storage remains JPEG. +- memory2 continues to use the normal `Backend` + `Codec` path. +- H.264 live transport owns live encode/decode state. +- H.264 storage stores encoded `Image` values through a normal `H264ImageCodec`, not through a special backend. 
-The aiortc project is the preferred implementation source for the video codec layer now that DimOS already depends on it for WebRTC-related functionality. It implements Python WebRTC/ORTC video send/receive paths, including H.264 encode/decode, H.264 RTP packetization/depacketization, PyAV-backed `libx264` encoding, Baseline/zerolatency-style settings, and WebRTC loss-recovery mechanisms such as NACK/PLI. DimOS should directly wrap aiortc's H.264 encoder/decoder where practical, while converting aiortc RTP payload details into a Foxglove-style complete Annex B access unit before exposing packets to non-WebRTC carriers or memory2. - -Foxglove's `CompressedVideo` design is the right compatibility target for DimOS packet shape: one message contains the compressed video data needed for exactly one source frame, H.264 data is Annex B, B-frames are not supported, and every IDR keyframe includes parameter sets such as SPS/PPS. This does not remove the need for keyframes: P-frames still depend on prior decoded reference frames. It does remove the need for DimOS memory2/LCM/DDS consumers to reason about individual RTP fragments. +Foxglove's H.264 guidance remains the packet-shape target: each encoded message contains all Annex B NAL units emitted for one encoder input frame. A complete encoded frame packet is not necessarily independently decodable; P-frames still require earlier GOP state. ## Goals / Non-Goals **Goals:** -- Preserve `Out[Image]` and `In[Image]` as the public module stream contract. -- Add a carrier-neutral per-frame `VideoPacket` representation for complete H.264 Annex B access units, matching Foxglove's one compressed-video message per encoder input frame model. -- Add stateful H.264 encoder/decoder components with deterministic GOP/keyframe behavior. -- Add an LCM carrier adapter first, modeled after `JpegLcmTransport`. 
-- Add memory2 H.264 storage that keeps one observation per frame, stores one packet blob per frame, and lazily reconstructs `obs.data` as `Image`. -- Provide top-level opt-in configuration for live transports and recorder/store storage. -- Use aiortc directly for H.264 encode/decode through a DimOS adapter, and use aiortc public WebRTC APIs for the future WebRTC carrier. -- Keep JPEG transport and JPEG memory2 storage as defaults. +- Preserve `Out[Image]` and `In[Image]` as the user-facing stream contract. +- Extend `Image` so it can explicitly carry either raw pixels (`encoding="raw"`) or encoded H.264 access-unit bytes (`encoding="h264"`). +- Add H.264 encode/decode sessions with GOP/keyframe tracking, sequence-gap suppression, and explicit unsupported-format/dependency errors. +- Add `H264LcmTransport` with a decode mode: + - `decode_images=True`: subscribers receive decoded raw `Image` values. + - `decode_images=False`: subscribers receive encoded `Image` values for storage or inspection. +- Add `H264ImageCodec` so memory2 can store encoded H.264 `Image` values through the existing codec path. +- Keep `codec_for(Image)` as JPEG and require explicit `codec="h264"` for H.264 storage. +- Document v1 best-effort behavior: no transport QoS, durable keyframe cache, keyframe request, or guaranteed arbitrary random pixel decode. **Non-Goals:** -- Replacing underlying carriers such as LCM, DDS, ROS, SHM, or WebRTC. -- Making every transport support H.264 in the first implementation; DDS/SHM/WebRTC carriers are follow-ups. -- Exposing aiortc RTP payload fragments or WebRTC session state as public DimOS module, transport, or memory2 storage APIs. -- Supporting depth images, 16-bit images, alpha formats, or arbitrary pixel formats in the first implementation. -- Making `codec_for(Image)` return H.264 by default. -- Guaranteeing random access without decoding from a prior keyframe. -- Exposing video packets to normal module authors as the default stream type. 
- -## DimOS Architecture - -### Layering - -The design has three layers: - -```text -Module API layer - Out[Image] / In[Image] - │ - ▼ -Codec layer - H264Encoder / H264Decoder / GopBuffer - Image ⇄ VideoPacket - │ - ▼ -Carrier layer - H264LcmTransport first - DDS / SHM / WebRTC later -``` - -The carrier still performs inter-process or inter-machine communication. The H.264 layer only changes how image payloads are encoded before carrier publish and decoded after carrier receive. - -### Proposed classes and locations - -Core packet and codec classes: - -- `dimos/msgs/sensor_msgs/VideoPacket.py` - - Carrier-neutral message for one encoded video frame/access unit. - - Fields: `seq`, `ts`, `frame_id`, `width`, `height`, `format`, `codec`, `bitstream`, `is_keyframe`, `keyframe_seq`, `pts`, `data`. - - First supported `codec`: `h264`. - - First supported `bitstream`: Annex B complete access unit for exactly one source frame, aligned with Foxglove `CompressedVideo` expectations: for every full-frame encoder input call, DimOS creates one `VideoPacket` containing all NAL units emitted for that input frame. - - A `VideoPacket` is a complete encoded-frame packet, not necessarily an independently decodable image. Keyframe packets must contain enough decoder bootstrap data for late join and recovery, including SPS/PPS on every IDR; delta-frame packets require prior decoded GOP state. - -- `dimos/protocol/video/h264.py` - - `H264Config`: bitrate, target fps, keyframe interval, profile, preset/tune, max GOP frames, pixel format. - - `AiortcH264Codec`: small DimOS adapter around `aiortc.codecs.h264.H264Encoder`, `aiortc.codecs.h264.H264Decoder`, and `aiortc.codecs.h264.h264_depayload`. - - `H264Encoder`: DimOS-facing wrapper that runs at the publishing or recording endpoint and converts `Image` to ordered `VideoPacket` values using aiortc. 
- - `H264Decoder`: DimOS-facing wrapper that runs at the subscribing or replay/decode endpoint and converts ordered `VideoPacket` values to decoded `Image` values using aiortc. - - `GopBuffer`: tracks the latest keyframe and following delta packets, detects sequence gaps, and suppresses output until the next keyframe after a gap. - - `H264AccessUnit`: helper that converts aiortc RTP payload batches into a complete Annex B access unit before building a `VideoPacket`. - - `UnsupportedVideoImageError` / `VideoDecodeGapError`: explicit errors for unsupported image formats and unusable GOP state. - -Implementation dependency: - -- aiortc's `src/aiortc/codecs/h264.py` provides the mechanics DimOS should call rather than reimplement initially: `H264Encoder.encode()` uses PyAV `libx264`, forces keyframes by setting frame picture type, emits Baseline/zerolatency H.264, and returns RTP-sized H.264 payloads plus timestamp; `h264_depayload()` converts RTP H.264 payloads back to Annex B bytes; `H264Decoder.decode()` decodes a depayloaded `JitterFrame` through PyAV. -- DimOS should assemble the aiortc payloads for one encoded source frame into a single Annex B `VideoPacket.data` value before publication/storage. This packet carries every NAL unit emitted for that encoder input frame, but only IDR/keyframe packets are expected to be independently bootstrappable. WebRTC carriers may keep aiortc RTP packetization internally, but LCM/DDS/memory2 should exchange complete access units. -- The adapter should avoid leaking aiortc classes such as `JitterFrame` and RTP payload descriptors into DimOS public APIs. If future aiortc versions change these codec internals, only `AiortcH264Codec` should need adjustment. - -Image payload semantics: - -- `dimos/msgs/sensor_msgs/Image.py` - - Keep `Image` as the eager numpy-backed payload used by existing modules, transports, visualization, and JPEG storage. - - H.264 laziness belongs at memory2's `Observation.data` boundary, not inside `Image`. 
- - When H.264 decode succeeds, `obs.data` returns a normal eager `Image`. - -LCM carrier classes: - -- `dimos/protocol/pubsub/impl/h264_lcm.py` - - `H264LCM`: LCM pubsub encoder/decoder that publishes serialized `VideoPacket` values on the wire and returns `Image` objects to subscribers. - - Holds one encoder per publisher instance and one `GopBuffer`/decoder per subscriber instance. - -- `dimos/core/transport.py` - - `H264LcmTransport`: mirrors `JpegLcmTransport` and instantiates `H264LCM` lazily to avoid importing video dependencies at normal startup. - - Reduces to `(H264LcmTransport, (topic, type, config))` for worker serialization. - -WebRTC carrier classes, later: +- Adding a special memory2 backend for H.264. +- Adding a generic payload-strategy framework for this PR. +- Adding lazy pixels to `Image`; `Image.data` remains eager and is either `np.ndarray` for raw images or `bytes` for encoded images. +- Exposing a separate public encoded-video stream type. +- Supporting depth, 16-bit, alpha, or arbitrary pixel formats in the first implementation. +- Making H.264 the default image storage codec. -- `dimos/protocol/pubsub/impl/webrtc_video.py` - - Uses aiortc public APIs such as `RTCPeerConnection`, media tracks, RTP senders/receivers, and RTCP feedback. - - Lets WebRTC own packetization, jitter buffering, retransmission/NACK, PLI keyframe requests, bitrate adaptation, and NAT traversal. - - Bridges between DimOS `Image` and WebRTC `VideoFrame` at the module boundary. - - Optionally exports encoded packets into the DimOS `VideoPacket` format for memory2 recording when aiortc exposes a clean encoded-frame hook; otherwise the first WebRTC integration may decode to `Image` and let memory2 re-encode. - - If exporting, convert WebRTC RTP payloads into complete Annex B access units first; do not persist raw RTP fragments. 
+## Architecture -memory2 storage classes: +### Image payload shape -- `dimos/memory2/video/h264.py` - - `H264ImagePayloadStrategy`: generic memory2 payload strategy for logical `Stream[Image]` storage. - - `H264ImageStorageConfig`: config object consumed by the payload strategy. - - `H264FrameIndexStore`: stores H.264 frame metadata for cleanup, diagnostics, and future indexed decode work. - - The strategy owns encoder state on append and writes one observation row plus one serialized `VideoPacket` blob per source frame. - - Observation loaders and replay use the same H.264 decode-session policy as live transport: deltas are suppressed until a valid keyframe establishes decoder state. +`Image` gains two explicit codec fields: -Store/recorder integration: - -- `dimos/memory2/store/sqlite.py` - - Persist generic `payload_strategy` config in `_streams` so reopening the database restores the selected payload strategy. - - Bind SQLite-backed auxiliary stores to strategies through generic strategy hooks rather than H.264-specific `Store` branches. - -- `dimos/memory2/module.py` - - Add recorder-level per-stream `payload_strategies` configuration. - - Recorder still subscribes to `In[Image]`; the payload strategy controls how incoming images are persisted. 
- -### Where components run - -Live LCM path across machines: - -```text -Source machine / worker process - module Out[Image] - └─ H264LcmTransport.broadcast() - └─ H264Encoder encodes Image -> VideoPacket - └─ LCM publishes packet bytes - -Network / LCM multicast - carries VideoPacket bytes, not numpy pixels - -Subscriber machine / worker process - H264LcmTransport.subscribe() - └─ LCM receives packet bytes - └─ GopBuffer validates seq/keyframe state - └─ H264Decoder produces eager Image - └─ module In[Image] callback +```python +encoding: str = "raw" +codec_metadata: dict[str, Any] = field(default_factory=dict) ``` -memory2 recording path: - -```text -Recorder module process - In[Image] receives normal Image - └─ stream.append(Image) - └─ generic Backend delegates payload bytes to H264ImagePayloadStrategy - ├─ observation table row: ts / pose / tags - ├─ blob row: serialized VideoPacket with complete Annex B access unit - └─ h264 frame metadata: seq / keyframe / pts / format -``` +For raw images, `data` is a NumPy array and existing pixel operations work. -memory2 replay/decode path: +For H.264 images, `data` is bytes containing one complete Annex B access unit for one source frame. `format` still describes the decoded pixel layout (for example, RGB or BGR), while `codec_metadata` carries video metadata such as: -```text -Replay or query process - stream query returns Observation[Image] metadata - └─ obs.data - └─ H264 payload strategy decodes through H264Decoder session state - ├─ delta before valid keyframe: suppress/fail clearly - └─ keyframe and following deltas: return eager Image +```python +{ + "codec": "h264", + "bitstream": "annex_b", + "seq": 42, + "is_keyframe": False, + "keyframe_seq": 30, + "pts": 3780, + "width": 640, + "height": 480, + "channels": 3, + "dtype": "uint8", +} ``` -The first implementation may re-encode images when recording a decoded `Image` stream that originally arrived over H.264 transport. 
Preserving incoming packet bytes end-to-end can be a later optimization via a packet side-channel; it is not required to make the public behavior correct. +Pixel operations such as `to_rgb()`, `to_bgr()`, `to_opencv()`, `as_numpy()`, `brightness`, and Rerun conversion require `encoding="raw"` and fail clearly for encoded images. -WebRTC/aiortc path, later: +### H.264 codec/session layer -```text -Source machine / async WebRTC worker - Image source track - └─ aiortc encodes VideoFrame -> RTP/H.264 - └─ WebRTC handles packetization, jitter, NACK/PLI, bandwidth +`dimos/protocol/video/h264.py` provides the shared stateful video logic: -Network / WebRTC session - carries RTP media packets +- `H264Config`: bitrate, target FPS, keyframe interval, profile/tune/preset, max GOP, supported formats. +- `AiortcH264Codec`: adapter around aiortc/PyAV H.264 encode/decode internals. +- `H264Encoder`: converts raw `Image` to encoded `Image(encoding="h264")`. +- `H264Decoder`: converts encoded H.264 `Image` to raw `Image` when GOP state is valid. +- `GopBuffer`: tracks sequence numbers and keyframe state; suppresses deltas after gaps until a keyframe. +- `H264AccessUnit`: assembles aiortc RTP-sized payloads into one Annex B access unit. -Subscriber machine / async WebRTC worker - aiortc receives/decodes RTP media - └─ adapter converts VideoFrame -> Image - └─ module In[Image] callback -``` - -This path is intentionally different from LCM and memory2 storage. WebRTC is a session protocol with negotiated codecs and RTP packet state; memory2 still needs deterministic per-observation packet rows and GOP lookup independent of any active peer connection. +Transport and replay/view code instantiate separate encoder/decoder sessions. They share implementation, not runtime state. 
-### Top-level activation and configuration +### Live transport -Live transport activation should use existing blueprint transport mapping: +`H264LcmTransport` mirrors the JPEG transport pattern while adding an explicit decode mode. ```python -from dimos.core.transport import H264LcmTransport -from dimos.protocol.video.h264 import H264Config - -blueprint = autoconnect(camera(), consumer()).transports( - { - ("color_image", Image): H264LcmTransport( - "/color_image", - Image, - config=H264Config( - bitrate=2_000_000, - keyframe_interval=30, - profile="baseline", - tune="zerolatency", - ), - ) - } -) +H264LcmTransport("/camera/color", Image, config=H264Config(...)) ``` -memory2 direct store activation: +Default mode decodes on receive, so normal subscribers get raw `Image` values. ```python -from dimos.memory2.video.h264 import H264ImagePayloadStrategy, H264ImageStorageConfig -from dimos.protocol.video.h264 import H264Config - -stream = store.stream( - "color_image", - Image, - payload_strategy=H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig( - codec=H264Config(bitrate=2_000_000, keyframe_interval=30), - ), - ), -) +H264LcmTransport("/camera/color", Image, config=cfg, decode_images=False) ``` -Recorder activation: +Encoded mode still uses the logical `Image` type, but subscribers receive `Image(encoding="h264")`. This is the mode used by recorders that should persist transport-produced H.264 bytes. 
-```python -MyRecorder.blueprint( - payload_strategies={ - "color_image": H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig( - codec=H264Config(bitrate=2_000_000, keyframe_interval=30), - ), - ) - } -) -``` +### memory2 storage -Default behavior stays unchanged: +memory2 stores H.264 through a normal codec: ```python -store.stream("color_image", Image) # JPEG-backed memory2 storage -LCMTransport("/color_image", Image) # normal LCM image transport +store.stream("color_image", Image, codec="h264") ``` -### Proposed end-to-end test blueprint - -Add one runnable synthetic blueprint that proves live H.264 transmission and H.264 memory2 storage through the normal DimOS surfaces, without robot hardware or a physical camera. - -Proposed location and registry name: - -- `dimos/protocol/video/demo_h264_video_e2e.py` -- Blueprint variable: `demo_h264_video_e2e` -- CLI name after registry generation: `demo-h264-video-e2e` - -Components: - -- `SyntheticVideoSource(Module)` - - Publishes deterministic `color_image: Out[Image]` frames. - - Uses a moving pattern, frame counter overlay/encoded pixels, and fixed metadata: width, height, format, frame_id, timestamp cadence. - - Defaults to a short loop-friendly rate such as 15 or 30 FPS, with configurable width, height, FPS, frame count, and pattern seed. - -- `H264E2ERecorder(Recorder)` - - Declares `color_image: In[Image]`. - - Uses recorder-level `payload_strategies={"color_image": H264ImagePayloadStrategy(...)}` so memory2 writes the received image stream as H.264 packets rather than JPEG blobs. - - Defaults `db_path` to an explicit temporary/demo path such as `h264_video_e2e.db` so manual QA can inspect it. - -- `H264VideoProbe(Module)` - - Subscribes to `color_image: In[Image]` after live H.264 transport decode. - - Tracks received frame count, monotonic timestamps, dimensions, frame_id, and approximate pixel/checksum expectations for the deterministic pattern. 
- - Exposes a simple RPC/status method for manual QA, e.g. `summary() -> str`, reporting frames received, drops detected, first/last seq-equivalent frame marker, and validation errors. - -Blueprint sketch: +or recorder config: ```python -from dimos.core.coordination.blueprints import autoconnect -from dimos.core.transport import H264LcmTransport -from dimos.memory2.video.h264 import H264ImagePayloadStrategy, H264ImageStorageConfig -from dimos.msgs.sensor_msgs import Image -from dimos.protocol.video.h264 import H264Config - -h264_config = H264Config( - bitrate=2_000_000, - target_fps=30, - keyframe_interval=30, - profile="baseline", - tune="zerolatency", -) - -demo_h264_video_e2e = autoconnect( - SyntheticVideoSource.blueprint(width=640, height=360, fps=30), - H264E2ERecorder.blueprint( - db_path="h264_video_e2e.db", - payload_strategies={ - "color_image": H264ImagePayloadStrategy( - storage_config=H264ImageStorageConfig(codec=h264_config), - ), - }, - ), - H264VideoProbe.blueprint(expected_width=640, expected_height=360), -).transports( - { - ("color_image", Image): H264LcmTransport( - "/demo/h264_video_e2e/color_image", - Image, - config=h264_config, - ) - } -) -``` - -This blueprint intentionally exercises two independent H.264 paths: - -1. **Live transmission:** `SyntheticVideoSource` publishes normal `Image`; `H264LcmTransport` encodes to `VideoPacket`, transmits over LCM, decodes back to `Image`, and delivers to normal `In[Image]` subscribers. -2. **Storage:** `H264E2ERecorder` receives normal `Image` and writes memory2 observations using H.264 image storage, including GOP index rows and one Annex B packet blob per observation. - -Manual QA contract: - -- Run `dimos run demo-h264-video-e2e --daemon`. -- Confirm logs show H.264 encoder initialization, periodic keyframes, probe frame counts, and recorder append counts. -- Open the produced memory2 store and query `color_image` observations without touching `obs.data`; metadata should be available without decode. 
-- Access `obs.data` during ordered replay/query. Delta frames before the first valid keyframe after the start point may be suppressed or fail clearly; the first keyframe at or after the start point and later deltas should return decoded `Image` pixels. -- Replay the stored stream and confirm decoded images arrive on the normal replay schedule. -- Run a seq-gap variant, either by a test-only packet drop option in `H264LcmTransport` or a direct `GopBuffer` driver, and verify the probe receives no corrupted images and resumes only after the next keyframe. - -The blueprint should be excluded from normal hardware requirements and should not require a viewer. If it is registered as a runnable blueprint, regenerate `dimos/robot/all_blueprints.py` with `pytest dimos/robot/test_all_blueprints_generation.py`. - -### Storage schema - -Use existing per-stream observation and blob tables for primary data: - -```text -color_image - id, ts, value, pose fields, tags - -color_image_blob - id -> serialized VideoPacket Annex B access unit -``` - -Add a standalone GOP index table for H.264 image streams: - -```text -h264_frames - stream_name - observation_id - seq - keyframe_observation_id - is_keyframe - pts - width - height - format - codec - bitstream +Recorder.blueprint(codecs={"color_image": "h264"}) ``` -This table is storage-owned metadata. Generic observation tables remain focused on timeline, pose, tags, and scalar values. +`H264ImageCodec` only stores/restores encoded `Image` values. It does not decode pixels and does not own GOP state. Reopened stores restore the codec through the existing stream registry `codec_id` field. -### DimOS Spec Protocols, skills/MCP, CLI, generated registries +This means H.264 recording expects the recorder input to receive encoded Images, typically by subscribing through `H264LcmTransport(..., decode_images=False)`. If a recorder receives raw Images, either use the default JPEG codec or explicitly encode before appending. 
-No new DimOS Python `Spec` Protocol is required for the first version because encode/decode is transport and storage behavior, not cross-module RPC. No skills or MCP tools are exposed. +### Replay and visualization -No CLI command is required for the core feature. The synthetic `demo-h264-video-e2e` blueprint is the manual QA surface for end-to-end live transmission and storage. If the runnable blueprint is added, regenerate `dimos/robot/all_blueprints.py` with `pytest dimos/robot/test_all_blueprints_generation.py`. +memory2 replay of a stream stored with `codec="h264"` emits encoded Images in timestamp order. A separate H.264 decoder session converts that encoded stream to raw Images for Rerun or consumers. V1 decode policy is best effort: if replay starts mid-GOP, deltas are suppressed until the first keyframe at or after the start point. ## Decisions -1. **Make H.264 a codec layer above carriers, not a carrier itself.** - - Rationale: publisher and subscriber may run on different machines, and LCM/DDS/SHM/WebRTC remain responsible for communication. - - Alternative rejected: a monolithic `H264ImageTransport` that hides the underlying carrier, because it does not generalize cleanly to DDS or WebRTC. +1. **Use encoded `Image`, not a separate public encoded-video type.** + - Rationale: the user-facing type remains `Image` across transport and memory2. H.264 packet metadata lives in `Image.codec_metadata`. -2. **Use one complete Annex B `VideoPacket` per source frame.** - - Rationale: this preserves frame timestamps, sequence numbers, GOP state, and memory2 one-observation-per-frame semantics while matching Foxglove `CompressedVideo` expectations. Each packet contains all NAL units emitted for one encoder input frame. - - Key detail: "complete packet for a frame" is not the same as "standalone-decodable frame." IDR/keyframe packets can bootstrap decode when they include SPS/PPS; P-frame packets still require prior decoded GOP state. 
- - Alternative rejected: MP4 segment files as the primary model, because live transports and per-frame memory2 replay become harder to align. +2. **Use normal memory2 codecs, not a special backend.** + - Rationale: memory2 already persists blob payloads through `Codec`. H.264 encoded images can be stored as encoded data without changing `Store` or `Backend` semantics. 3. **Keep `codec_for(Image)` as JPEG.** - - Rationale: H.264 writes need stateful encoder ownership; the stateless memory2 `Codec` contract should remain simple and backward compatible. H.264 storage uses a generic payload strategy instead of changing the default codec. + - Rationale: default behavior must remain stateless, compatible, and independent of H.264 dependencies. -4. **Decode only from valid GOP state.** - - Rationale: missing H.264 packets can corrupt decoded pixels. After a seq gap, late join, or replay seek into a GOP, subscribers and memory2 replay should suppress or fail decode until a keyframe restores a self-contained GOP. - - Key detail: complete per-frame access units remove RTP-fragment handling from DimOS storage, but they do not remove inter-frame dependencies; P-frames still require prior decoded reference frames. +4. **Let transport choose decoded vs encoded subscriber payloads.** + - Rationale: normal modules want raw Images, while recorders may want the H.264 bytes produced by transport. The choice is explicit on `H264LcmTransport`. -5. **Use aiortc's H.264 codec classes through a DimOS adapter.** - - Rationale: aiortc already depends on PyAV, sets up `libx264`, handles keyframe forcing, produces RTP-sized H.264 payloads, and provides depayload/decode logic. Reusing it reduces new codec code and aligns with the WebRTC transport dependency. - - Boundary: DimOS stores and transports one logical Annex B `VideoPacket` per source frame; aiortc's multiple RTP payloads are an implementation detail converted before leaving the codec adapter. 
- - Alternative rejected: copy aiortc's H.264 implementation into DimOS immediately, because direct wrapping is simpler while aiortc is already a dependency. +5. **Decode only from valid GOP state.** + - Rationale: complete per-frame access units remove RTP-fragment handling but not inter-frame dependency. P-frames still require prior decoded state. -6. **Configure WebRTC as a future carrier, not the core codec abstraction.** - - Rationale: WebRTC already solves live RTP packetization, jitter, packet loss, keyframe requests, NAT traversal, and adaptive bitrate, but memory2 still needs deterministic per-observation packet storage and replay. - - aiortc should be the preferred Python implementation path for this carrier because it already supports sending and receiving H.264 video and RTCP recovery feedback. +6. **Defer QoS.** + - Rationale: LCM is best effort. Keyframe request, durable keyframe cache, retransmission, PLI, and transport QoS belong in a later video-session/QoS design. -7. **Store one complete access unit per observation, not one observation per RTP fragment.** - - Rationale: aiortc's encoder returns multiple RTP-sized payloads for one source frame. memory2 should depayload and assemble them into one Annex B `VideoPacket` for the frame, so queries, replay, pose, tags, and GOP indexes remain frame-oriented and Foxglove-compatible. - - Alternative rejected: storing each RTP fragment as its own observation, because replay and random access would inherit network packetization complexity and break frame-level memory2 semantics. +## Safety / Replay -8. **Repeat decoder parameter sets on every IDR keyframe.** - - Rationale: late join, random access, and memory2 partial reads require keyframes to bootstrap decoding without relying on stream-start state. - - Alternative rejected: sending SPS/PPS only at stream startup, because late subscribers and mid-recording reads may never decode. +This change affects image transport and recording only. 
It does not command robot hardware or change control loops. -## Safety / Simulation / Replay +Unsupported image formats must fail explicitly when H.264 encoding is selected. Encoded images must not silently pass through raw-pixel methods. -This change affects image transport and recording only. It does not command robot hardware, alter control loops, or expose new skills. Existing hardware safety assumptions remain unchanged. - -Simulation and hardware cameras use the same `Image` semantics. Unsupported image formats such as depth or 16-bit images should fail at H.264 configuration/append/publish time with a clear error, not silently convert or corrupt data. - -Replay must emit normal decoded `Image` objects on the existing memory2 replay schedule. Sequential replay should share decoder state so normal playback decodes each packet once. - -V1 H.264 decode is best-effort. Late subscribers and memory2 replay/query starting at timestamp `T` start without prior GOP state; delta frames are suppressed until the first keyframe at or after `T`, then that keyframe and following decodable deltas are available. Full QoS, durable keyframe cache, keyframe request/PLI, and indexed random decode are follow-up design work. - -Manual QA should use the synthetic `demo-h264-video-e2e` blueprint so no robot or physical camera is required. The demo should verify live LCM round-trip, memory2 append/query without decode, lazy `obs.data` decode, replay, and seq-gap behavior. - -## Risks / Trade-offs - -- **Stateful codec complexity:** H.264 has encoder and decoder state. Mitigation: keep state in explicit `H264Encoder`, `H264Decoder`, and `GopBuffer` classes rather than hiding it in `Codec`. -- **Observation-level lazy decode:** Existing `Image` remains eager. Mitigation: keep H.264 laziness at `Observation.data` so generic image consumers remain unchanged. -- **Packet loss:** LCM has no built-in reliable delivery or late-join keyframe durability. 
Mitigation: periodic IDR frames and seq-gap suppression; later add keyframe request or durable carriers where available. -- **Dependency variability:** aiortc/PyAV/FFmpeg support varies by platform. Mitigation: keep H.264 optional under the extra that already provides aiortc/WebRTC support, preserve JPEG defaults, and fail clearly when video mode is selected without dependencies. -- **aiortc codec API stability:** aiortc codec classes are importable and useful, but the most stable aiortc surface is WebRTC itself. Mitigation: isolate all direct codec imports in `AiortcH264Codec`, pin/verify aiortc versions, and add focused tests around encode/depayload/decode behavior. -- **Double encode on record:** A recorder consuming decoded H.264 transport images may re-encode for memory2 storage. Mitigation: accept this in the first version; consider packet pass-through as a later optimization. -- **Best-effort random access:** Mid-GOP access without prior decoder state may be unavailable in v1. Mitigation: short GOP defaults, decoder reuse during sequential replay, and suppression until the first keyframe after the start point. +Replay after arbitrary seek is best effort. A decoder session starts without GOP state, suppresses deltas until the first keyframe at or after the start point, then emits decoded raw Images for that keyframe and following decodable deltas. Full random pixel access to any arbitrary P-frame is not a v1 guarantee. ## Migration / Rollout -1. Reuse the existing aiortc/WebRTC dependency path for H.264 support; add a lightweight `video` extra only if users need H.264 storage without the broader WebRTC extra. -2. Add `VideoPacket`, H.264 config, `AiortcH264Codec`, DimOS-facing encoder/decoder wrappers, GOP buffer, Annex B access-unit assembly, and explicit errors. -3. Preserve eager `Image` behavior; keep lazy decode at `Observation.data`. -4. Add `H264LCM` and `H264LcmTransport` as the first live carrier adapter. -5. 
Add memory2 generic payload-strategy support and H.264 image payload strategy. -6. Add registry serialization so reopened SQLite stores know which streams use H.264 payload strategy. -7. Add `demo_h264_video_e2e` for synthetic end-to-end live transport plus memory2 storage QA. -8. Add tests and synthetic manual QA for live transport, storage, lazy decode, replay, unsupported formats, and seq gaps. -9. Update memory2 and transport docs with opt-in examples and dependency notes. - -Rollback is straightforward because all behavior is opt-in. Removing H.264 configuration returns live streams and new recordings to existing transport/JPEG behavior. Existing H.264-backed recordings still require the video dependency to decode pixels, but metadata should remain queryable. - -No generated blueprint registry update is needed unless a runnable demo blueprint is added. - -## Open Questions - -- Should the packet message be named `VideoPacket`, `EncodedImagePacket`, or `CompressedVideoFrame`? -- Should LCM H.264 publish raw packet bytes under an `Image` channel name or use a distinct LCM message type/channel suffix internally? -- What default bitrate, keyframe interval, and target FPS should be used for common DimOS camera streams? -- Should first-version memory2 storage store packet blobs in the existing `{stream}_blob` table or introduce a dedicated packet blob table? -- Should WebRTC integration reuse this `VideoPacket` abstraction, or map directly between `Image` and WebRTC media tracks with optional packet export for memory2? -- Does aiortc expose a stable encoded-frame hook that can avoid decode/re-encode when recording a WebRTC H.264 stream into memory2? -- Should `AiortcH264Codec` pin to aiortc minor versions or include compatibility tests against the minimum supported aiortc version? +1. Extend `Image` with `encoding` and `codec_metadata` while preserving raw eager defaults. +2. Add H.264 encoder/decoder/session classes that produce and consume encoded Images. +3. 
Add `H264LcmTransport` decode mode. +4. Add `H264ImageCodec` and explicit `codec="h264"` storage. +5. Update demos so recording uses encoded transport mode and replay decodes through an H.264 session before visualization. +6. Update docs/tests/specs to remove obsolete storage-strategy and packet-type language. + +Rollback is straightforward for new runs: remove H.264 transport/storage configuration and streams return to normal raw/JPEG behavior. Existing H.264-backed recordings require the H.264 codec path to read encoded Images and a decoder session to view pixels. diff --git a/openspec/changes/add-h264-codec-mem2-storage/docs.md b/openspec/changes/add-h264-codec-mem2-storage/docs.md index 1fc6f44188..bbbab8f0e9 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/docs.md +++ b/openspec/changes/add-h264-codec-mem2-storage/docs.md @@ -2,30 +2,31 @@ - Update `docs/usage/transports/index.md` or the image-transport-specific transport docs to describe opt-in H.264 image transport behavior: - Public module streams remain `Out[Image]` and `In[Image]`. - - `H264LcmTransport` compresses image payloads internally as H.264 and delivers decoded `Image` objects to subscribers. - - H.264 packets contain complete Annex B access units for one source frame, matching Foxglove-style `CompressedVideo` expectations. + - `H264LcmTransport` compresses image payloads internally as H.264 and delivers decoded `Image` objects by default. + - `decode_images=False` delivers encoded `Image` objects for storage or explicit decode sessions. + - H.264 encoded images contain complete Annex B access units for one source frame, matching Foxglove-style `CompressedVideo` expectations. - Delta frames require prior GOP state; after packet loss or late join, subscribers resume on the next keyframe. - Unsupported image formats fail clearly rather than silently converting. - Update `docs/usage/blueprints.md` with an opt-in blueprint transport mapping example for `H264LcmTransport` and `H264Config`. 
- Update memory2 user docs, likely under `docs/usage/` or the memory2 capability docs, to describe opt-in H.264-backed image storage: - Default image storage remains JPEG-backed. - - Users opt in per stream with `H264ImagePayloadStrategy`. + - Users opt in per stream with `codec="h264"` or recorder `codecs={"stream": "h264"}`. - memory2 still stores one observation per source frame. - - metadata queries do not require pixel decode. - - accessing `obs.data` lazily reconstructs an `Image` when the H.264 decode session has valid GOP state; replay/decoded views suppress deltas until the first keyframe at or after the start point. - - replay emits decoded `Image` frames on the normal replay schedule. + - metadata queries do not require pixel decode, and `obs.data` returns the encoded `Image` without decoding pixels. + - explicit H.264 decode sessions convert encoded replay streams to raw decoded `Image` values and suppress deltas until the first keyframe at or after the start point. + - replay emits encoded `Image` values on the normal replay schedule. - Add a short manual QA section for `demo-h264-video-e2e` after the demo blueprint exists: - run `dimos run demo-h264-video-e2e --daemon` - inspect probe/recorder logs - query the generated memory2 store - - validate lazy decode, replay, and seq-gap recovery. + - validate encoded storage, replay decode, and seq-gap recovery. - Mention optional video dependencies in the installation or feature docs. Users should know that H.264 mode requires the aiortc/PyAV/FFmpeg dependency path while JPEG defaults remain available without selecting H.264. 
## Contributor Docs - Update `docs/development/testing.md` or a nearby development testing guide with H.264-specific test commands once tests exist: - - unit tests for `VideoPacket`, H.264 access-unit assembly, GOP buffering, unsupported formats, and lazy `Image.data` behavior - - memory2 storage tests for append/query/lazy decode/reopen/replay + - unit tests for encoded `Image` metadata, H.264 access-unit assembly, GOP buffering, unsupported formats, and raw-pixel guards + - memory2 storage tests for `H264ImageCodec`, append/query/reopen/replay, and default JPEG compatibility - synthetic end-to-end demo/blueprint smoke test for live LCM transmission and memory2 recording. - Document dependency expectations for contributors who run video tests locally, including how to install the relevant `uv` extras and how tests should skip clearly when video dependencies are unavailable. - If `demo_h264_video_e2e` is registered as a runnable blueprint, contributor docs should remind maintainers to regenerate `dimos/robot/all_blueprints.py` with `pytest dimos/robot/test_all_blueprints_generation.py`. diff --git a/openspec/changes/add-h264-codec-mem2-storage/proposal.md b/openspec/changes/add-h264-codec-mem2-storage/proposal.md index e428eba421..cff06956f4 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/proposal.md +++ b/openspec/changes/add-h264-codec-mem2-storage/proposal.md @@ -2,32 +2,32 @@ DimOS image streams currently use full `Image` objects over typed transports and memory2 stores images as independent JPEG payloads. That is simple and compatible, but inefficient for long-running camera streams and remote subscribers because each frame is compressed independently and no shared video codec state is reused. -DimOS needs an opt-in H.264 image-stream path that preserves the public `Image` stream contract while allowing live transports and memory2 storage to carry compact video frame packets. 
The design should make H.264 reusable across carriers such as LCM first, and DDS/WebRTC later, while keeping memory2 queries, pose/tag alignment, and replay frame semantics intact. +DimOS needs an opt-in H.264 image-stream path that preserves the public `Image` stream contract while allowing live transports and memory2 storage to carry compact encoded image payloads. The design should make H.264 reusable across carriers such as LCM first, and DDS/WebRTC later, while keeping memory2 queries, pose/tag alignment, and replay frame timing intact. ## What Changes -- Add a carrier-neutral H.264 image packet behavior for RGB/BGR-style `Image` streams, with one encoded video access unit per source frame. +- Add encoded `Image` support for RGB/BGR-style image streams, with one H.264 Annex B access unit per source frame and codec metadata on the `Image`. - Add stateful H.264 encode/decode behavior that produces periodic self-contained keyframes, rejects unsupported image formats clearly, detects sequence gaps, and resumes delivery only after a valid keyframe. -- Add an opt-in live transport path for H.264 image streams, starting with LCM, that exposes decoded `Image` objects to subscribers rather than video packets. -- Add memory2 H.264 image storage that preserves one observation row per frame, stores per-frame video packet payloads, indexes GOP/keyframe relationships, and lazily reconstructs `obs.data` as an `Image` on demand. +- Add an opt-in live transport path for H.264 image streams, starting with LCM, that can expose decoded raw `Image` values or encoded H.264 `Image` values depending on subscriber configuration. +- Add memory2 H.264 image storage through a normal `H264ImageCodec` so streams can store one encoded `Image` observation per frame without a special storage backend. - Preserve the existing JPEG image codec and JPEG-backed memory2 storage as the default behavior. - No hardware-safety behavior changes are intended. 
- No public robot-control, skill, or MCP breaking changes are intended. ## Affected DimOS Surfaces -- Modules/streams: typed `Image` streams, image-specific transport adapters, memory2 Recorder ingestion, memory2 Stream/Observation lazy payload access, and replay output of decoded images. +- Modules/streams: typed `Image` streams, image-specific transport adapters, memory2 Recorder ingestion, memory2 Stream/Observation payload access, and replay output of encoded images for H.264-backed streams. - Blueprints/CLI: blueprints may opt image streams into H.264-capable transports or memory2 H.264 storage; existing blueprint behavior remains unchanged unless configured. - Skills/MCP: no direct skill or MCP behavior changes expected. -- Hardware/simulation/replay: camera-heavy hardware and simulation streams may benefit from reduced bandwidth/storage; replay must continue to emit normal decoded `Image` frames on the same schedule. +- Hardware/simulation/replay: camera-heavy hardware and simulation streams may benefit from reduced bandwidth/storage; H.264 replay emits encoded `Image` values on the same schedule and explicit decode sessions convert them to decoded frames for consumers. - Docs/generated registries: memory2 and transport docs need updates; generated blueprint registries are not expected to change unless new demo blueprints are added. ## Capabilities ### New Capabilities -- `h264-image-streams`: Covers carrier-neutral H.264 image packets, live image-stream encode/decode behavior, keyframe/GOP handling, sequence-gap behavior, and transport compatibility expectations. -- `memory2-h264-storage`: Covers opt-in H.264-backed memory2 image observation storage, per-frame packet persistence, best-effort GOP decode, lazy `Observation.data` reconstruction, and replay compatibility. 
+- `h264-image-streams`: Covers encoded H.264 `Image` payloads, live image-stream encode/decode behavior, keyframe/GOP handling, sequence-gap behavior, and transport compatibility expectations. +- `memory2-h264-storage`: Covers opt-in H.264-backed memory2 image observation storage through `H264ImageCodec`, per-frame encoded image persistence, replay of encoded images, and explicit best-effort decode sessions. ### Modified Capabilities @@ -37,4 +37,4 @@ DimOS needs an opt-in H.264 image-stream path that preserves the public `Image` Users and developers gain a more bandwidth- and storage-efficient option for camera streams while keeping existing `Image` stream consumers and memory2 query/replay behavior familiar. Existing JPEG-backed recordings, default transports, and non-image streams remain compatible. -Compatibility risk centers on adding optional video codec dependencies, preserving lazy-load lifetimes, making GOP recovery deterministic after packet loss or missing storage rows, and avoiding silent corruption when frames cannot be decoded. Documentation and QA should cover opt-in configuration, supported image formats, dependency installation, LCM live-stream behavior, memory2 append/query/lazy-decode/replay behavior, packet-loss recovery, and a small synthetic image-stream demo. +Compatibility risk centers on adding optional video codec dependencies, keeping encoded images from accidentally flowing into raw-pixel operations, making GOP recovery deterministic after packet loss or replay seek, and avoiding silent corruption when frames cannot be decoded. Documentation and QA should cover opt-in configuration, supported image formats, dependency installation, LCM live-stream behavior, memory2 append/query/replay behavior, packet-loss recovery, and a small synthetic image-stream demo. 
diff --git a/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md b/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md index 0c97d46d19..b5ca467d27 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md +++ b/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md @@ -8,7 +8,7 @@ DimOS SHALL allow an image stream to opt into H.264 encoding while preserving `I - **AND** the source module publishes `Image` values on an `Out[Image]` stream - **WHEN** a downstream module subscribes through an `In[Image]` stream - **THEN** the downstream callback receives decoded `Image` values -- **AND** the module author does not need to publish or subscribe to encoded video packet values. +- **AND** the module author does not need to publish or subscribe to a separate encoded video type. #### Scenario: Existing image streams remain unchanged by default - **GIVEN** a blueprint does not opt an image stream into H.264 transmission @@ -16,19 +16,19 @@ DimOS SHALL allow an image stream to opt into H.264 encoding while preserving `I - **THEN** DimOS MUST preserve the existing image transport behavior - **AND** H.264 dependencies or settings are not required for that stream. -### Requirement: H.264 packets are complete per-frame Annex B access units -DimOS SHALL represent each H.264-transmitted source image frame as one complete encoded-frame packet containing the Annex B NAL units emitted for that encoder input frame. +### Requirement: H.264 encoded Images are complete per-frame Annex B access units +DimOS SHALL represent each H.264-transmitted source image frame as one encoded `Image` whose `data` contains the complete Annex B access unit emitted for that encoder input frame. 
-#### Scenario: One encoded packet corresponds to one source frame +#### Scenario: One encoded Image corresponds to one source frame - **GIVEN** an H.264-enabled image stream publishes one source `Image` frame - **WHEN** DimOS encodes that frame for a non-WebRTC carrier or for packet inspection -- **THEN** the encoded packet data MUST contain all NAL units emitted for that source frame in Annex B form -- **AND** the encoded packet represents exactly one source frame. +- **THEN** the encoded `Image.data` MUST contain all NAL units emitted for that source frame in Annex B form +- **AND** the encoded `Image.codec_metadata` MUST identify the payload as H.264 Annex B for exactly one source frame. -#### Scenario: Delta-frame packets require GOP state -- **GIVEN** an encoded packet contains a delta frame -- **WHEN** a decoder processes that packet without the prior GOP state required by H.264 -- **THEN** DimOS MUST treat the packet as requiring recovery from a keyframe +#### Scenario: Delta-frame encoded Images require GOP state +- **GIVEN** an encoded `Image` contains a delta frame +- **WHEN** a decoder processes that encoded image without the prior GOP state required by H.264 +- **THEN** DimOS MUST treat the encoded image as requiring recovery from a keyframe - **AND** DimOS MUST avoid presenting corrupted image pixels as a valid decoded `Image`. 
### Requirement: Keyframes bootstrap late join and recovery @@ -42,10 +42,25 @@ DimOS SHALL provide periodic keyframes for H.264 image streams so subscribers ca #### Scenario: Keyframes include decoder parameter data - **GIVEN** an H.264 image stream emits an IDR keyframe -- **WHEN** the keyframe packet is used to bootstrap a new decoder -- **THEN** the keyframe packet MUST include the decoder parameter information needed for that bootstrap, such as SPS/PPS for H.264 Annex B streams +- **WHEN** the keyframe encoded `Image` is used to bootstrap a new decoder +- **THEN** the encoded `Image.data` MUST include the decoder parameter information needed for that bootstrap, such as SPS/PPS for H.264 Annex B streams - **AND** later delta frames in the same GOP may depend on that decoded keyframe state. +### Requirement: H.264 transport can deliver decoded or encoded Images +DimOS SHALL allow H.264 live transport subscribers to receive decoded raw `Image` values by default or encoded H.264 `Image` values when explicitly requested. + +#### Scenario: Default subscriber receives decoded Images +- **GIVEN** a blueprint configures `H264LcmTransport` without changing its decode mode +- **WHEN** a source publishes raw `Image` values +- **THEN** the subscriber MUST receive raw decoded `Image` values +- **AND** pixel operations on those images remain valid. + +#### Scenario: Encoded subscriber receives H.264 Images +- **GIVEN** a blueprint configures `H264LcmTransport` with encoded delivery enabled +- **WHEN** a source publishes raw `Image` values +- **THEN** the subscriber MUST receive `Image` values with `encoding="h264"` +- **AND** those images MUST preserve H.264 frame metadata needed by downstream storage or decode sessions. + ### Requirement: H.264 live decode is best-effort without QoS guarantees DimOS SHALL apply a best-effort H.264 decode policy for live carriers that do not provide video QoS, keyframe requests, or durable keyframe caching. 
diff --git a/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md b/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md index 4161080eb6..f4ebc05a6e 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md +++ b/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md @@ -1,19 +1,19 @@ ## ADDED Requirements ### Requirement: H.264 image storage is opt-in per memory2 stream -memory2 SHALL allow image streams to opt into H.264-backed storage through a generic payload strategy while preserving the default image-storage behavior for streams that do not opt in. +memory2 SHALL allow image streams to opt into H.264 storage through the normal codec configuration path while preserving the default image-storage behavior for streams that do not opt in. #### Scenario: Stream opts into H.264 storage -- **GIVEN** a memory2 image stream is configured for H.264-backed storage -- **WHEN** the stream appends `Image` values -- **THEN** memory2 MUST store those image observations using H.264-backed payloads +- **GIVEN** a memory2 image stream is configured with the H.264 image codec +- **WHEN** the stream appends encoded `Image` values with `encoding="h264"` +- **THEN** memory2 MUST store those image observations using H.264 encoded payloads through the existing backend/blob path - **AND** queries for the stream must continue to return image observations associated with the original frame timestamps. 
-#### Scenario: H.264 storage uses payload strategy extension point -- **GIVEN** a store creates an `Image` stream with an H.264 payload strategy +#### Scenario: H.264 storage uses the normal codec extension point +- **GIVEN** a store creates an `Image` stream with `codec="h264"` - **WHEN** memory2 creates the stream backend -- **THEN** memory2 MUST route payload encode, blob loader attachment, and decode-error suppression through the generic payload strategy interface -- **AND** the generic store base must not contain H.264-specific branches or imports. +- **THEN** memory2 MUST use the normal codec resolution and blob persistence flow +- **AND** the generic store and backend paths must not contain H.264-specific branches or imports. #### Scenario: Stream uses default image storage - **GIVEN** a memory2 image stream is created without H.264 image-storage configuration @@ -30,63 +30,65 @@ memory2 SHALL store H.264-backed image streams with one observation correspondin - **THEN** memory2 MUST create one queryable observation per source frame - **AND** each observation must retain its timestamp, frame identifier, pose metadata when available, and tags independently of pixel decode. -### Requirement: Stored H.264 packets are complete frame access units -memory2 SHALL store each H.264-backed image observation with an encoded payload that contains the complete Annex B access unit for that source frame. +### Requirement: Stored H.264 Images are complete frame access units +memory2 SHALL store each H.264 image observation with an encoded `Image` payload that contains the complete Annex B access unit for that source frame. 
-#### Scenario: Stored packet is inspected or exported -- **GIVEN** an H.264-backed image observation has an encoded payload +#### Scenario: Stored encoded Image is inspected or exported +- **GIVEN** an H.264-backed image observation has an encoded `Image` payload - **WHEN** the payload is inspected by storage tooling or exported to a compatible video-message format -- **THEN** the payload MUST represent all NAL units emitted for that source frame in Annex B form +- **THEN** the `Image.data` payload MUST represent all NAL units emitted for that source frame in Annex B form +- **AND** the `Image.codec_metadata` MUST include H.264 frame metadata such as sequence, keyframe state, keyframe reference, presentation timestamp, dimensions, codec, and bitstream - **AND** memory2 MUST avoid exposing individual RTP fragments as the stored observation payload. ### Requirement: Decode starts from valid keyframe state -memory2 SHALL use the same best-effort H.264 decode policy as live subscribers: decode starts without GOP state and suppresses delta frames until a keyframe at or after the start point establishes valid decoder state. +H.264 decoded views over memory2 replay SHALL use the same best-effort H.264 decode policy as live subscribers: decode starts without GOP state and suppresses delta frames until a keyframe at or after the start point establishes valid decoder state. #### Scenario: Replay seeks into the middle of a GOP - **GIVEN** a user starts replay or a decoded view at a timestamp whose first stored H.264 packet is a delta frame -- **WHEN** memory2 decodes the stream from that start point -- **THEN** memory2 MUST suppress undecodable delta frames until the first keyframe at or after the start point -- **AND** memory2 MUST emit decoded `Image` values for that keyframe and following decodable delta frames. 
+- **WHEN** an H.264 decode session decodes the replayed encoded images from that start point +- **THEN** the decode session MUST suppress undecodable delta frames until the first keyframe at or after the start point +- **AND** the decoded view MUST emit decoded `Image` values for that keyframe and following decodable delta frames. #### Scenario: Required GOP state is missing -- **GIVEN** an H.264-backed image observation requires prior GOP data to decode -- **WHEN** memory2 cannot load a usable keyframe or required delta-frame sequence -- **THEN** memory2 MUST fail the pixel decode with a clear storage/decode error -- **AND** memory2 MUST avoid returning corrupted pixels as a valid `Image`. +- **GIVEN** an H.264 encoded image requires prior GOP state to decode +- **WHEN** a decode session has no usable keyframe state +- **THEN** the decode session MUST fail or suppress the decode with a clear decode error +- **AND** DimOS MUST avoid returning corrupted pixels as a valid decoded `Image`. ### Requirement: Metadata queries do not force pixel decode -memory2 SHALL allow metadata access for H.264-backed image observations without decoding image pixels. +memory2 SHALL allow metadata and encoded-payload access for H.264-backed image observations without decoding image pixels. #### Scenario: Query reads observation metadata only - **GIVEN** a memory2 store contains H.264-backed image observations -- **WHEN** a user queries observations and reads timestamps, frame identifiers, pose metadata, or tags -- **THEN** memory2 MUST provide that metadata without requiring H.264 pixel decode -- **AND** pixel decode should occur only when the user accesses image data.
+- **WHEN** a user queries observations and reads timestamps, frame identifiers, pose metadata, tags, `Image.encoding`, or H.264 codec metadata +- **THEN** memory2 MUST provide that information without requiring H.264 pixel decode +- **AND** the stored `obs.data` value for an H.264 stream MUST be an encoded `Image`, not a decoded pixel image. -### Requirement: Lazy pixel access reconstructs Image values on best-effort decode -memory2 SHALL lazily reconstruct `Image` values for H.264-backed observations when pixel data is requested and valid decoder state is available. +### Requirement: H.264 codec stores and restores encoded Images +memory2 SHALL store and restore H.264 observations as encoded `Image` values through the H.264 image codec. -#### Scenario: User accesses observation data -- **GIVEN** a queried H.264-backed image observation has not decoded its pixels yet +#### Scenario: User accesses observation data from an H.264 stream +- **GIVEN** a queried H.264-backed image observation was stored with the H.264 image codec - **WHEN** the user accesses `obs.data` -- **THEN** memory2 MUST return a decoded `Image` value if the H.264 decode session has valid GOP state for that observation -- **AND** memory2 MUST suppress or fail clearly for undecodable deltas rather than returning corrupted pixels. +- **THEN** memory2 MUST return an `Image` value with `encoding="h264"` +- **AND** pixel decoding MUST require an explicit H.264 decode session outside the generic memory2 backend. -### Requirement: H.264-backed replay emits normal Image frames -memory2 SHALL replay H.264-backed image streams as normal decoded `Image` frames on the existing replay schedule. +### Requirement: H.264-backed replay emits encoded Images +memory2 SHALL replay H.264-backed image streams as encoded `Image` values on the existing replay schedule.
#### Scenario: Replaying a stored H.264 image stream - **GIVEN** a memory2 store contains an H.264-backed image stream - **WHEN** replay is started for that stream -- **THEN** replay MUST emit decoded `Image` values in observation timestamp order -- **AND** replay MUST skip undecodable deltas before the first valid keyframe at or after the replay start point -- **AND** consumers of replayed streams must not need to consume encoded video packet values. +- **THEN** replay MUST emit encoded `Image` values in observation timestamp order +- **AND** an explicit H.264 decode session MAY convert those encoded images to raw decoded `Image` values for visualization or consumers +- **AND** that decode session MUST skip undecodable deltas before the first valid keyframe at or after the replay start point. ### Requirement: H.264 storage survives store reopen -memory2 SHALL persist H.264 payload-strategy configuration and frame metadata so a reopened store can query, decode, and replay H.264-backed image streams. +memory2 SHALL persist H.264 codec configuration and encoded image metadata so a reopened store can query and replay H.264-backed image streams. #### Scenario: Reopen and decode - **GIVEN** a memory2 store was written with an H.264-backed image stream - **WHEN** the process closes and a later process reopens the store - **THEN** memory2 MUST recognize the stream as H.264-backed -- **AND** the reopened store must support metadata query, lazy pixel decode, and best-effort replay for the stored observations. +- **AND** the reopened store must return encoded `Image` values from query and replay +- **AND** explicit decode sessions must retain the same best-effort keyframe-start behavior after reopen. 
diff --git a/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-payload-strategies/spec.md b/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-payload-strategies/spec.md deleted file mode 100644 index a0a5f7493d..0000000000 --- a/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-payload-strategies/spec.md +++ /dev/null @@ -1,56 +0,0 @@ -## ADDED Requirements - -### Requirement: memory2 streams support generic payload strategies -memory2 SHALL allow a stream backend to delegate payload encoding, lazy loader attachment, and decode-error policy to an optional payload strategy without changing the logical stream payload type. - -#### Scenario: Stream appends through a payload strategy -- **GIVEN** a memory2 stream is created with a payload strategy for its payload type -- **WHEN** the stream appends a value -- **THEN** the backend MUST preserve normal observation metadata insertion semantics -- **AND** the backend MUST delegate payload byte encoding to the configured payload strategy before writing the blob. - -#### Scenario: Stream queries attach strategy loaders -- **GIVEN** a memory2 stream has stored blobs written by a payload strategy -- **WHEN** observations are queried or replayed -- **THEN** the backend MUST attach lazy data loaders through the payload strategy -- **AND** observation metadata must remain readable without materializing the payload. - -### Requirement: Payload strategies remain storage-generic -memory2 SHALL keep payload strategy integration generic so the base store and backend abstractions do not depend on H.264-specific classes. - -#### Scenario: Base store creates a strategy-backed backend -- **GIVEN** a stream configuration includes a payload strategy -- **WHEN** the generic store creates the backend -- **THEN** the store MUST pass the strategy through the generic backend construction path -- **AND** the store MUST avoid payload-specific imports, type checks, or backend subclasses for H.264. 
- -#### Scenario: Storage backend binds optional local resources -- **GIVEN** a concrete store implementation reopens a stream with a serialized payload strategy -- **WHEN** the strategy needs store-local resources such as a SQLite connection for auxiliary metadata -- **THEN** the concrete store MAY bind those resources through a strategy hook -- **AND** the binding hook must remain generic so other strategies can use the same extension point. - -### Requirement: Payload strategy configuration survives store reopen -memory2 SHALL persist payload strategy identity and configuration in stream registry metadata so reopened stores can reconstruct strategy-backed streams. - -#### Scenario: Reopen a strategy-backed stream -- **GIVEN** a stream was created with a payload strategy -- **WHEN** a later process reopens the store -- **THEN** memory2 MUST deserialize the configured payload strategy -- **AND** the reopened stream must use that strategy for lazy payload access and replay behavior. - -### Requirement: Replay honors strategy decode suppression -memory2 SHALL allow payload strategies to classify decode errors that replay should suppress while preserving normal failure behavior for unrelated errors. - -#### Scenario: Strategy suppresses an undecodable payload -- **GIVEN** a replay iterator encounters a payload decode error -- **AND** the stream's payload strategy classifies that error as suppressible -- **WHEN** replay advances through the stream -- **THEN** memory2 MUST skip that undecodable observation -- **AND** replay MUST continue with later observations. - -#### Scenario: Strategy does not suppress an error -- **GIVEN** a replay iterator encounters a payload decode error -- **AND** the stream's payload strategy does not classify that error as suppressible -- **WHEN** replay advances through the stream -- **THEN** memory2 MUST surface the error to the caller. 
diff --git a/openspec/changes/add-h264-codec-mem2-storage/tasks.md b/openspec/changes/add-h264-codec-mem2-storage/tasks.md index 35c714e5e2..f8878ee316 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/tasks.md +++ b/openspec/changes/add-h264-codec-mem2-storage/tasks.md @@ -1,42 +1,41 @@ -## 1. Core video packet and codec behavior +## 1. Encoded Image and H.264 codec behavior -- [x] 1.1 Add the carrier-neutral encoded video frame message for one complete H.264 Annex B access unit per source `Image` frame, including sequence, timestamp, frame identifier, dimensions, format, codec, bitstream, keyframe, keyframe-reference, presentation timestamp, and payload fields. +- [x] 1.1 Add encoded `Image` support for one complete H.264 Annex B access unit per source frame, including sequence, timestamp, frame identifier, dimensions, format, codec, bitstream, keyframe, keyframe-reference, presentation timestamp, and payload metadata. - [x] 1.2 Add H.264 configuration covering bitrate, target FPS, keyframe interval, profile, preset/tune, maximum GOP length, and supported pixel format settings. - [x] 1.3 Add the aiortc-backed H.264 adapter that converts `Image` frames to H.264 output and converts H.264 input back to `Image` while keeping aiortc/RTP internals out of public DimOS APIs. - [x] 1.4 Add access-unit assembly so all NAL units emitted for one encoder input frame are stored or transmitted as one Annex B packet, not as individual RTP fragments. - [x] 1.5 Add GOP/keyframe state tracking that detects sequence gaps, marks decoder state invalid, suppresses corrupted output, and resumes only after a usable keyframe. - [x] 1.6 Add explicit errors for unsupported image formats, missing video dependencies, and unusable GOP/decode state. -- [x] 1.7 Add focused codec tests for per-frame Annex B packet shape, keyframe metadata, SPS/PPS bootstrap behavior, sequence-gap handling, dependency errors, and unsupported image formats. 
+- [x] 1.7 Add focused codec tests for per-frame Annex B encoded image shape, keyframe metadata, SPS/PPS bootstrap behavior, sequence-gap handling, dependency errors, and unsupported image formats. -## 2. Image compatibility and observation lazy decode +## 2. Image compatibility and encoded-payload guards -- [x] 2.1 Keep `Image` eager and numpy-backed; use memory2 `Observation.data` as the lazy H.264 decode boundary. +- [x] 2.1 Keep raw `Image` eager and numpy-backed while allowing encoded `Image` payloads to carry H.264 bytes and codec metadata. - [x] 2.2 Preserve existing eager `Image` behavior and compatibility for current JPEG, LCM, SHM, memory2, and visualization consumers. -- [x] 2.3 Add tests proving eager images still work after the H.264 storage changes. +- [x] 2.3 Add tests proving eager images still work and encoded images reject raw-pixel operations. ## 3. Live H.264 image transport - [x] 3.1 Add the H.264 LCM pubsub adapter that publishes encoded video frame packets on the wire and delivers decoded `Image` values to subscribers. -- [x] 3.2 Add `H264LcmTransport` to the transport layer with worker-safe serialization behavior matching existing transport patterns. +- [x] 3.2 Add `H264LcmTransport` to the transport layer with worker-safe serialization behavior matching existing transport patterns, plus a `decode_images` mode. - [x] 3.3 Keep normal image transport behavior unchanged unless a blueprint explicitly opts a stream into H.264 transport. - [x] 3.4 Add live transport tests for `Out[Image]` to `In[Image]` delivery, keyframe bootstrap, late subscriber behavior, sequence-gap recovery, and default transport compatibility. -## 4. memory2 H.264 image storage +## 4. memory2 H.264 image codec storage -- [x] 4.1 Add per-stream H.264 image payload strategy configuration for direct store creation and recorder configuration.
-- [x] 4.2 Route configured memory2 `Image` streams through a generic payload strategy while leaving unconfigured `Image` streams on the existing default storage path. -- [x] 4.3 Store one observation row per source frame and one encoded Annex B frame packet payload per observation. -- [x] 4.4 Add persistent H.264 frame/keyframe metadata for H.264-backed image streams. -- [x] 4.5 Persist and reload per-stream storage configuration so reopened stores recognize H.264-backed image streams. -- [x] 4.6 Add lazy observation loading that returns metadata without decode and reconstructs `Image` pixels on best-effort H.264 decode when `obs.data` is accessed. -- [x] 4.7 Add replay support that emits decoded `Image` values in observation timestamp order and suppresses undecodable deltas until the first valid keyframe after the replay start point. -- [x] 4.8 Add memory2 tests for append/query, metadata access without decode, keyframe and sequential lazy decode, missing-GOP failure, store reopen, replay seek suppression, default JPEG compatibility, and unsupported formats. -- [x] 4.9 Add generic payload strategy tests for lifecycle, payload encoding/loading, registry persistence, SQLite binding, and replay decode-error suppression. +- [x] 4.1 Add an `H264ImageCodec` that stores and restores encoded `Image` values through the normal memory2 codec path. +- [x] 4.2 Route configured memory2 `Image` streams through `codec="h264"` while leaving unconfigured `Image` streams on the existing default JPEG storage path. +- [x] 4.3 Store one observation row per source frame and one encoded `Image` payload containing a complete Annex B access unit per observation. +- [x] 4.4 Persist H.264 codec metadata in encoded `Image` payloads instead of a memory2-specific frame index table. +- [x] 4.5 Persist and reload per-stream codec configuration so reopened stores recognize H.264-backed image streams. 
+- [x] 4.6 Keep generic memory2 query and `obs.data` access encoded for H.264 streams; require explicit H.264 decode sessions for raw pixels. +- [x] 4.7 Add replay support that emits encoded `Image` values in observation timestamp order so an explicit decode session can apply best-effort keyframe startup. +- [x] 4.8 Add memory2 tests for `H264ImageCodec`, append/query, encoded payload access, store reopen, replay, default JPEG compatibility, and encoded-image raw-pixel guards. ## 5. Synthetic end-to-end blueprint and manual QA surface -- [x] 5.1 Add `dimos/protocol/video/demo_h264_video_e2e.py` with a deterministic synthetic `Image` source, H.264 memory2 recorder, and decoded-frame probe. -- [x] 5.2 Configure the blueprint to exercise both live H.264 LCM transmission and H.264 memory2 storage through normal `Image` stream surfaces. +- [x] 5.1 Add `dimos/protocol/video/demo_h264_video_e2e.py` with a deterministic synthetic `Image` source, H.264 memory2 recorder, and image probe. +- [x] 5.2 Configure the blueprint to exercise live H.264 LCM transmission and H.264 memory2 storage through encoded `Image` storage surfaces. - [x] 5.3 Add probe status or logs that report received frame counts, dimensions, timestamp monotonicity, validation failures, and drop/recovery observations. - [x] 5.4 Register the runnable blueprint as `demo-h264-video-e2e` if it is intended to be exposed through `dimos run`. - [x] 5.5 Regenerate and verify `dimos/robot/all_blueprints.py` if the demo blueprint is registered. @@ -45,7 +44,7 @@ - [x] 6.1 Update user-facing transport docs with H.264 opt-in behavior, `Image` stream preservation, Annex B per-frame packets, keyframe/GOP recovery, unsupported formats, and dependency notes. - [x] 6.2 Update blueprint docs with an H.264 image transport mapping example. 
-- [x] 6.3 Update memory2 docs with H.264 image payload strategy configuration, one-observation-per-frame behavior, metadata query without decode, lazy `obs.data` decode, best-effort keyframe startup, and replay behavior. +- [x] 6.3 Update memory2 docs with H.264 image codec configuration, one-observation-per-frame behavior, encoded `obs.data` access, explicit decode sessions, best-effort keyframe startup, and replay behavior. - [x] 6.4 Add docs for running and inspecting the `demo-h264-video-e2e` synthetic QA blueprint. - [x] 6.5 Update contributor testing docs with video dependency setup, focused test targets, skip behavior when dependencies are unavailable, and blueprint-registry regeneration guidance. - [x] 6.6 Update coding-agent docs if maintainers want the H.264/Foxglove packet-shape rule documented for future agent edits. @@ -55,8 +54,8 @@ - [x] 7.1 Run `openspec validate add-h264-codec-mem2-storage --strict`. - [x] 7.2 Run focused unit tests for H.264 codec/access-unit/GOP behavior. - [x] 7.3 Run focused unit tests for eager `Image` compatibility. -- [x] 7.4 Run focused memory2 storage tests for H.264 append/query/lazy decode/reopen/replay/default compatibility. +- [x] 7.4 Run focused memory2 storage tests for H.264 codec append/query/reopen/replay/default compatibility. - [x] 7.5 Run focused live transport tests for H.264 LCM round-trip and sequence-gap recovery. - [x] 7.6 Run `uv run pytest dimos/robot/test_all_blueprints_generation.py` if the demo blueprint is registered. - [x] 7.7 Run relevant docs validation, including `uv run doclinks` if available and `uv run md-babel-py run ` for executable markdown snippets. -- [x] 7.8 Manually run `dimos run demo-h264-video-e2e --daemon`, inspect logs/probe status, query the generated memory2 store without pixel decode, access `obs.data` for keyframe and mid-GOP observations, replay the stream, and verify sequence-gap recovery behavior. 
+- [x] 7.8 Manually run `dimos run demo-h264-video-e2e --daemon`, inspect logs/probe status, query the generated memory2 store for encoded observations, replay the stream through an explicit decoder, and verify sequence-gap recovery behavior. From adaf92952b22f94a5ad279047f713cae71ea01cf Mon Sep 17 00:00:00 2001 From: cc Date: Thu, 11 Jun 2026 21:32:39 -0700 Subject: [PATCH 05/14] test: add h264 storage benchmark blueprint --- dimos/protocol/video/demo_h264_video_e2e.py | 300 ++++++++++++++++++ dimos/robot/all_blueprints.py | 5 + .../memory/h264_storage_benchmark_report.md | 93 ++++++ 3 files changed, 398 insertions(+) create mode 100644 docs/capabilities/memory/h264_storage_benchmark_report.md diff --git a/dimos/protocol/video/demo_h264_video_e2e.py b/dimos/protocol/video/demo_h264_video_e2e.py index b06acdbe76..9762629435 100644 --- a/dimos/protocol/video/demo_h264_video_e2e.py +++ b/dimos/protocol/video/demo_h264_video_e2e.py @@ -16,9 +16,14 @@ from __future__ import annotations +import os +from pathlib import Path +import sqlite3 +import tempfile import threading import time +import cv2 import numpy as np from dimos.core.coordination.blueprints import autoconnect @@ -120,6 +125,273 @@ class H264WebcamRecorder(Recorder): color_image: In[Image] +class JpegBenchmarkRecorder(Recorder): + """Recorder for the JPEG side of the storage-size benchmark.""" + + jpeg_image: In[Image] + + +class H264BenchmarkRecorder(Recorder): + """Recorder for the H.264 side of the storage-size benchmark.""" + + h264_image: In[Image] + + +class H264StorageBenchmarkSourceConfig(SyntheticVideoSourceConfig): + video_path: str = "" + width: int = 320 + height: int = 240 + fps: float = 15.0 + frame_count: int = 150 + output_frame_id: str = "h264_storage_benchmark_camera" + + +class H264StorageBenchmarkSource(Module): + """Publish identical raw frames to JPEG and H.264 recording paths.""" + + config: H264StorageBenchmarkSourceConfig + jpeg_image: Out[Image] + h264_image: Out[Image] + + _thread: 
threading.Thread | None = None + _stop_event: threading.Event | None = None + + @rpc + def start(self) -> None: + super().start() + self._stop_event = threading.Event() + self._thread = threading.Thread(target=self._publish_loop, daemon=True) + self._thread.start() + video_path = self._configured_video_path() + source = str(video_path) if video_path is not None else "synthetic pattern" + logger.info( + "Started H.264/JPEG storage benchmark source: %s, %sx%s @ %.2f FPS for up to %s frames", + source, + self.config.width, + self.config.height, + self.config.fps, + self.config.frame_count, + ) + + @rpc + def stop(self) -> None: + if self._stop_event is not None: + self._stop_event.set() + if self._thread is not None: + self._thread.join(timeout=2.0) + self._thread = None + super().stop() + + def _publish_loop(self) -> None: + assert self._stop_event is not None + video_path = self._configured_video_path() + if video_path is not None: + self._publish_video_file(video_path) + return + + period = 1.0 / max(self.config.fps, 0.1) + next_publish = time.monotonic() + for seq in range(self.config.frame_count): + if self._stop_event.is_set(): + break + frame = self._make_frame(seq) + self.jpeg_image.publish(frame) + self.h264_image.publish(frame.copy()) + next_publish += period + time.sleep(max(0.0, next_publish - time.monotonic())) + logger.info("H.264/JPEG storage benchmark source finished publishing frames") + + def _configured_video_path(self) -> Path | None: + value = self.config.video_path or os.environ.get("DIMOS_H264_BENCHMARK_VIDEO", "") + return Path(value).expanduser() if value else None + + def _publish_video_file(self, video_path: Path) -> None: + assert self._stop_event is not None + if not video_path.exists(): + logger.error("Benchmark video file does not exist: %s", video_path) + return + + capture = cv2.VideoCapture(str(video_path)) + try: + if not capture.isOpened(): + logger.error("Failed to open benchmark video file: %s", video_path) + return + + period = 
1.0 / max(self.config.fps, 0.1) + next_publish = time.monotonic() + published = 0 + for seq in range(self.config.frame_count): + if self._stop_event.is_set(): + break + ok, frame_bgr = capture.read() + if not ok: + break + frame = self._image_from_video_frame(frame_bgr) + self.jpeg_image.publish(frame) + self.h264_image.publish(frame.copy()) + published = seq + 1 + next_publish += period + time.sleep(max(0.0, next_publish - time.monotonic())) + logger.info( + "H.264/JPEG storage benchmark video source published %s frames from %s", + published, + video_path, + ) + finally: + capture.release() + + def _image_from_video_frame(self, frame_bgr: np.ndarray) -> Image: + if self.config.width > 0 and self.config.height > 0: + frame_bgr = cv2.resize( + frame_bgr, + (self.config.width, self.config.height), + interpolation=cv2.INTER_AREA, + ) + frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB) + return Image( + data=frame_rgb, + format=ImageFormat.RGB, + frame_id=self.config.output_frame_id, + ts=time.time(), + ) + + def _make_frame(self, seq: int) -> Image: + yy, xx = np.indices((self.config.height, self.config.width), dtype=np.uint16) + base = (xx + (yy * 2) + (seq * 4) + self.config.seed) % 256 + marker = ((xx // 20 + yy // 20 + seq) % 2) * 35 + data = np.stack( + (base, (base + 70 + marker) % 256, (base + 145) % 256), + axis=2, + ).astype(np.uint8) + return Image( + data=data, + format=ImageFormat.RGB, + frame_id=self.config.output_frame_id, + ts=time.time(), + ) + + +class H264StorageBenchmarkReporterConfig(ModuleConfig): + jpeg_db_path: str = "benchmark_jpeg.db" + h264_db_path: str = "benchmark_h264.db" + min_wait_seconds: float = 12.0 + wait_seconds: float = 18.0 + stable_seconds: float = 2.0 + poll_seconds: float = 0.5 + + +class H264StorageBenchmarkReporter(Module): + """Log the JPEG vs H.264 SQLite DB size comparison.""" + + config: H264StorageBenchmarkReporterConfig + + _thread: threading.Thread | None = None + _stop_event: threading.Event | None = None + 
_last_summary: str | None = None + + @rpc + def start(self) -> None: + super().start() + self._stop_event = threading.Event() + self._thread = threading.Thread(target=self._report_loop, daemon=True) + self._thread.start() + + @rpc + def stop(self) -> None: + if self._stop_event is not None: + self._stop_event.set() + if self._thread is not None: + self._thread.join(timeout=2.0) + self._thread = None + super().stop() + + @rpc + def summary(self) -> str: + """Return the most recent JPEG-vs-H.264 storage benchmark summary.""" + return self._last_summary or "benchmark summary not available yet" + + def _report_loop(self) -> None: + assert self._stop_event is not None + started_at = time.monotonic() + deadline = time.monotonic() + self.config.wait_seconds + stable_since: float | None = None + last_sizes: tuple[int, int] | None = None + jpeg_path = Path(self.config.jpeg_db_path) + h264_path = Path(self.config.h264_db_path) + + while time.monotonic() < deadline and not self._stop_event.is_set(): + if jpeg_path.exists() and h264_path.exists(): + sizes = ( + _sqlite_snapshot_size(jpeg_path), + _sqlite_snapshot_size(h264_path), + ) + if sizes == last_sizes: + stable_since = stable_since or time.monotonic() + recording_window_elapsed = ( + time.monotonic() - started_at >= self.config.min_wait_seconds + ) + if ( + recording_window_elapsed + and time.monotonic() - stable_since >= self.config.stable_seconds + ): + self._log_sizes(sizes[0], sizes[1]) + return + else: + last_sizes = sizes + stable_since = None + time.sleep(self.config.poll_seconds) + + if jpeg_path.exists() and h264_path.exists(): + self._log_sizes( + _sqlite_snapshot_size(jpeg_path), + _sqlite_snapshot_size(h264_path), + ) + else: + missing = [str(path) for path in (jpeg_path, h264_path) if not path.exists()] + self._last_summary = f"benchmark DB size unavailable; missing={missing}" + logger.warning(self._last_summary) + + def _log_sizes(self, jpeg_bytes: int, h264_bytes: int) -> None: + ratio = h264_bytes / 
jpeg_bytes if jpeg_bytes else float("inf") + saved = jpeg_bytes - h264_bytes + saved_pct = (saved / jpeg_bytes * 100.0) if jpeg_bytes else 0.0 + self._last_summary = ( + "H.264/JPEG storage benchmark: " + f"jpeg={jpeg_bytes} bytes ({jpeg_bytes / 1024 / 1024:.2f} MiB), " + f"h264={h264_bytes} bytes ({h264_bytes / 1024 / 1024:.2f} MiB), " + f"h264/jpeg={ratio:.3f}, saved={saved} bytes ({saved_pct:.1f}%)" + ) + logger.info(self._last_summary) + print(self._last_summary, flush=True) + + +def _sqlite_snapshot_size(path: Path) -> int: + """Return compact SQLite DB size, even while WAL sidecars are active.""" + if not path.exists(): + return 0 + try: + with tempfile.NamedTemporaryFile(prefix=f"{path.stem}-", suffix=".db") as tmp: + source = sqlite3.connect(f"file:{path}?mode=ro", uri=True) + target = sqlite3.connect(tmp.name) + try: + source.backup(target) + finally: + target.close() + source.close() + return Path(tmp.name).stat().st_size + except sqlite3.Error: + return _sqlite_live_file_size(path) + + +def _sqlite_live_file_size(path: Path) -> int: + total = path.stat().st_size if path.exists() else 0 + for suffix in ("-wal", "-shm"): + sidecar = Path(f"{path}{suffix}") + if sidecar.exists(): + total += sidecar.stat().st_size + return total + + class H264MemoryReplayConfig(ModuleConfig): db_path: str = "webcam_h264.db" speed: float = 1.0 @@ -225,6 +497,7 @@ def summary(self) -> str: _h264_config = H264Config(bitrate=1_000_000, target_fps=10, keyframe_interval=15) _webcam_h264_config = H264Config(bitrate=2_000_000, target_fps=15, keyframe_interval=30) +_benchmark_h264_config = H264Config(bitrate=1_500_000, target_fps=15, keyframe_interval=30) def _webcam() -> Webcam: @@ -251,6 +524,33 @@ def _webcam() -> Webcam: ) +demo_h264_storage_benchmark = autoconnect( + H264StorageBenchmarkSource.blueprint(), + JpegBenchmarkRecorder.blueprint( + db_path="benchmark_jpeg.db", + on_existing=OnExisting.OVERWRITE, + ), + H264BenchmarkRecorder.blueprint( + db_path="benchmark_h264.db", + 
on_existing=OnExisting.OVERWRITE, + codecs={"h264_image": "h264"}, + ), + H264StorageBenchmarkReporter.blueprint( + jpeg_db_path="benchmark_jpeg.db", + h264_db_path="benchmark_h264.db", + ), +).transports( + { + ("h264_image", Image): H264LcmTransport( + "/demo_h264_storage_benchmark/h264_image", + Image, + config=_benchmark_h264_config, + decode_images=False, + ) + } +) + + demo_h264_webcam_record = autoconnect( CameraModule.blueprint(hardware=_webcam, transform=None, frequency=15.0), H264WebcamRecorder.blueprint( diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 58649b6ed8..e6f1f93409 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -50,6 +50,7 @@ "demo-error-on-name-conflicts": "dimos.robot.unitree.demo_error_on_name_conflicts:demo_error_on_name_conflicts", "demo-google-maps-skill": "dimos.agents.skills.demo_google_maps_skill:demo_google_maps_skill", "demo-gps-nav": "dimos.agents.skills.demo_gps_nav:demo_gps_nav", + "demo-h264-storage-benchmark": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_storage_benchmark", "demo-h264-video-e2e": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_video_e2e", "demo-h264-webcam-record": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_webcam_record", "demo-h264-webcam-replay": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_webcam_replay", @@ -168,12 +169,16 @@ "gps-nav-skill-container": "dimos.agents.skills.gps_nav_skill.GpsNavSkillContainer", "grasping-module": "dimos.manipulation.grasping.grasping.GraspingModule", "gstreamer-camera-module": "dimos.hardware.sensors.camera.gstreamer.gstreamer_camera.GstreamerCameraModule", + "h264-benchmark-recorder": "dimos.protocol.video.demo_h264_video_e2e.H264BenchmarkRecorder", "h264-e2-e-recorder": "dimos.protocol.video.demo_h264_video_e2e.H264E2ERecorder", "h264-memory-replay": "dimos.protocol.video.demo_h264_video_e2e.H264MemoryReplay", + "h264-storage-benchmark-reporter": 
"dimos.protocol.video.demo_h264_video_e2e.H264StorageBenchmarkReporter", + "h264-storage-benchmark-source": "dimos.protocol.video.demo_h264_video_e2e.H264StorageBenchmarkSource", "h264-video-probe": "dimos.protocol.video.demo_h264_video_e2e.H264VideoProbe", "h264-webcam-recorder": "dimos.protocol.video.demo_h264_video_e2e.H264WebcamRecorder", "joint-trajectory-controller": "dimos.manipulation.control.trajectory_controller.joint_trajectory_controller.JointTrajectoryController", "joystick-module": "dimos.robot.unitree.b1.joystick_module.JoystickModule", + "jpeg-benchmark-recorder": "dimos.protocol.video.demo_h264_video_e2e.JpegBenchmarkRecorder", "keyboard-teleop": "dimos.robot.unitree.keyboard_teleop.KeyboardTeleop", "keyboard-teleop-module": "dimos.teleop.keyboard.keyboard_teleop_module.KeyboardTeleopModule", "local-planner": "dimos.navigation.nav_stack.modules.local_planner.local_planner.LocalPlanner", diff --git a/docs/capabilities/memory/h264_storage_benchmark_report.md b/docs/capabilities/memory/h264_storage_benchmark_report.md new file mode 100644 index 0000000000..39e9c888ca --- /dev/null +++ b/docs/capabilities/memory/h264_storage_benchmark_report.md @@ -0,0 +1,93 @@ +# H.264 memory2 storage benchmark + +This report compares memory2 image storage size for the same frames stored with the default JPEG codec and the opt-in H.264 codec. + +## Method + +Blueprint: `demo-h264-storage-benchmark` + +The benchmark source publishes identical `Image` frames to two recorder streams: + +- `jpeg_image` uses the default memory2 `Image` codec (`JpegCodec`). +- `h264_image` uses `codec="h264"` and receives encoded H.264 images through `H264LcmTransport(decode_images=False)`. + +The reporter measures compact SQLite snapshot sizes with SQLite backup, so active WAL/SHM sidecars do not skew the comparison. 
+ +## Public video sample + +Source video: + +- URL: `https://raw.githubusercontent.com/opencv/opencv/master/samples/data/vtest.avi` +- Local path used for the run: `/tmp/opencode/dimos-h264-benchmark-vtest.avi` +- File size: 8,131,690 bytes +- Source dimensions: 768 x 576 +- Source FPS: 10 +- Source frame count: 795 + +Benchmark settings: + +- Frames recorded: 150 +- Recorded dimensions: 320 x 240 +- Publish rate: 15 FPS +- H.264 bitrate: 1,500,000 bps +- H.264 keyframe interval: 30 frames +- H.264 profile/preset/tune: baseline / veryfast / zerolatency +- B-frames: disabled + +Command: + +```bash +rm -f benchmark_jpeg.db benchmark_jpeg.db-wal benchmark_jpeg.db-shm \ + benchmark_h264.db benchmark_h264.db-wal benchmark_h264.db-shm + +DIMOS_H264_BENCHMARK_VIDEO=/tmp/opencode/dimos-h264-benchmark-vtest.avi \ + uv run dimos run demo-h264-storage-benchmark --daemon + +sleep 22 +uv run dimos log -n 80 +uv run dimos stop +``` + +## Result + +| Codec | DB path | Rows | Blob rows | Blob bytes | DB size | +|---|---:|---:|---:|---:|---:| +| JPEG | `benchmark_jpeg.db` | 150 | 150 | 1,586,940 | 1,884,160 bytes (1.80 MiB) | +| H.264 | `benchmark_h264.db` | 150 | 150 | 1,008,355 | 1,126,400 bytes (1.07 MiB) | + +H.264 used 59.8% of the JPEG storage size and saved 757,760 bytes, a 40.2% reduction for this sample. 
+ +## Direct ffmpeg H.264 comparison + +To estimate the cost of per-frame Foxglove-style storage versus a continuous H.264 stream, the same 150 frames were encoded directly with ffmpeg using similar H.264 settings: + +```bash +ffmpeg -y -v error \ + -i /tmp/opencode/dimos-h264-benchmark-vtest.avi \ + -vf "scale=320:240,fps=15" \ + -frames:v 150 \ + -c:v libx264 \ + -b:v 1500k -maxrate 1500k -bufsize 3000k \ + -profile:v baseline -preset veryfast -tune zerolatency \ + -g 30 -keyint_min 30 -sc_threshold 0 -bf 0 \ + -pix_fmt yuv420p \ + -f h264 /tmp/opencode/dimos-h264-benchmark-direct.h264 +``` + +| Output | Size | +|---|---:| +| Direct ffmpeg Annex B H.264 elementary stream | 1,603,706 bytes (1.53 MiB) | +| Direct ffmpeg MP4 container | 1,606,038 bytes (1.53 MiB) | +| memory2 H.264 SQLite DB | 1,126,400 bytes (1.07 MiB) | +| memory2 H.264 blob payloads only | 1,008,355 bytes (0.96 MiB) | + +In this run, memory2 H.264 storage was smaller than the direct ffmpeg elementary stream. That means this benchmark does not show a storage-efficiency penalty from the per-frame Annex B access-unit layout. It mostly shows that the current aiortc/libx264 path and the direct ffmpeg command did not produce identical rate-control output, even with similar nominal settings. + +The storage overhead within memory2 was measurable: the H.264 DB was 118,045 bytes larger than its stored blob payloads, or 11.7% over the blob bytes. That overhead includes observation metadata, SQLite page overhead, and one encoded-image envelope per frame. + +## Notes + +- The benchmark measures SQLite DB size, not raw compressed frame bytes alone. Observation metadata and blob table overhead are included for both codecs. +- The direct ffmpeg comparison is not a quality-matched encoder benchmark. It uses similar nominal settings to the DimOS H.264 config, but aiortc/PyAV and ffmpeg rate control can still choose different actual bit allocation. +- The sample video already contains temporal structure. 
Synthetic frames from the same benchmark blueprint produced a larger reduction in one local run: JPEG 2,109,440 bytes, H.264 983,040 bytes, a 53.4% reduction. +- H.264 results depend on bitrate, keyframe interval, resolution, motion, and scene texture. From c540b2119d41dd846ceadb3be4c131fec79dc2cd Mon Sep 17 00:00:00 2001 From: cc Date: Thu, 11 Jun 2026 21:53:06 -0700 Subject: [PATCH 06/14] refactor: clean up h264 memory2 spillover --- dimos/memory2/module.py | 6 +- dimos/memory2/observationstore/sqlite.py | 2 +- dimos/memory2/replay.py | 11 +--- dimos/memory2/store/sqlite.py | 7 ++- dimos/protocol/video/demo_h264_video_e2e.py | 61 +++++++++++++++++-- docs/capabilities/memory/index.md | 15 +++-- .../add-h264-codec-mem2-storage/design.md | 5 +- .../add-h264-codec-mem2-storage/docs.md | 2 +- 8 files changed, 78 insertions(+), 31 deletions(-) diff --git a/dimos/memory2/module.py b/dimos/memory2/module.py index ad2982ad48..9a3d90e164 100644 --- a/dimos/memory2/module.py +++ b/dimos/memory2/module.py @@ -254,7 +254,6 @@ class RecorderConfig(MemoryModuleConfig): default_frame_id: str = "base_link" tf_tolerance: float = 0.5 db_path: str | Path = "recording.db" - codecs: dict[str, Any] = Field(default_factory=dict) class Recorder(MemoryModule): @@ -304,10 +303,7 @@ def start(self) -> None: return for name, port in self.inputs.items(): - stream_overrides: dict[str, Any] = {} - if name in self.config.codecs: - stream_overrides["codec"] = self.config.codecs[name] - stream: Stream[Any] = self.store.stream(name, port.type, **stream_overrides) + stream: Stream[Any] = self.store.stream(name, port.type) self._port_to_stream(name, port, stream) logger.info("Recording %s (%s)", name, port.type.__name__) diff --git a/dimos/memory2/observationstore/sqlite.py b/dimos/memory2/observationstore/sqlite.py index c74a481cfe..31c6a25ea0 100644 --- a/dimos/memory2/observationstore/sqlite.py +++ b/dimos/memory2/observationstore/sqlite.py @@ -258,7 +258,7 @@ def _ensure_tables(self) -> None: 
self._conn.execute( f'CREATE TABLE IF NOT EXISTS "{self._name}" (' " id INTEGER PRIMARY KEY AUTOINCREMENT," - " ts REAL NOT NULL UNIQUE," + " ts REAL NOT NULL," " value NUMERIC," " pose_x REAL, pose_y REAL, pose_z REAL," " pose_qx REAL, pose_qy REAL, pose_qz REAL, pose_qw REAL," diff --git a/dimos/memory2/replay.py b/dimos/memory2/replay.py index 3062a44e98..516d39b372 100644 --- a/dimos/memory2/replay.py +++ b/dimos/memory2/replay.py @@ -189,9 +189,8 @@ def iterate_ts(self) -> Iterator[tuple[float, T]]: emitted = False obs: Any for obs in self._base_stream(): - decoded = self._decode(obs) emitted = True - yield (obs.ts, decoded) + yield (obs.ts, self._decode(obs)) if not self._replay.config.loop or not emitted: break @@ -211,10 +210,7 @@ def find_closest(self, timestamp: float, tolerance: float = 1.0) -> T | None: obs: Any = s.at(timestamp, tolerance).first() except LookupError: return None - try: - return self._decode(obs) - except LookupError: - return None + return self._decode(obs) def observable(self) -> Observable[T]: """Timed Observable scheduled against the Replay's shared anchor. 
@@ -243,9 +239,8 @@ def make_iterator() -> Iterator[tuple[float, T]]: emitted = False obs: Any for obs in base(): - decoded = decode(obs) emitted = True - yield (obs.ts, decoded) + yield (obs.ts, decode(obs)) if not loop or not emitted: break diff --git a/dimos/memory2/store/sqlite.py b/dimos/memory2/store/sqlite.py index e1c5ba5071..bb2b735c1c 100644 --- a/dimos/memory2/store/sqlite.py +++ b/dimos/memory2/store/sqlite.py @@ -68,10 +68,10 @@ def _assemble_backend(self, name: str, stored: dict[str, Any]) -> Backend[Any]: from dimos.memory2.codecs.base import _resolve_payload_type, codec_from_id payload_module = stored["payload_module"] + codec = codec_from_id(stored["codec_id"], payload_module) data_type = _resolve_payload_type(payload_module) eager_blobs = stored.get("eager_blobs", False) page_size = stored.get("page_size", self.config.page_size) - codec = codec_from_id(stored["codec_id"], payload_module) backend_conn = self._open_connection() @@ -113,7 +113,7 @@ def _assemble_backend(self, name: str, stored: dict[str, Any]) -> Backend[Any]: blob_store_conn_match=blob_store_conn_match and eager_blobs, page_size=page_size, ) - return Backend( + backend: Backend[Any] = Backend( metadata_store=metadata_store, codec=codec, data_type=data_type, @@ -122,6 +122,7 @@ def _assemble_backend(self, name: str, stored: dict[str, Any]) -> Backend[Any]: notifier=notifier, eager_blobs=eager_blobs, ) + return backend @staticmethod def _serialize_backend( @@ -171,7 +172,7 @@ def _create_backend( if not isinstance(config.get("vector_store"), VectorStore): config["vector_store"] = SqliteVectorStore(conn=backend_conn) - # Resolve codec early — needed for SqliteObservationStore. 
+ # Resolve codec early — needed for SqliteObservationStore codec = self._resolve_codec(payload_type, config.get("codec")) config["codec"] = codec diff --git a/dimos/protocol/video/demo_h264_video_e2e.py b/dimos/protocol/video/demo_h264_video_e2e.py index 9762629435..82d735f25b 100644 --- a/dimos/protocol/video/demo_h264_video_e2e.py +++ b/dimos/protocol/video/demo_h264_video_e2e.py @@ -22,6 +22,7 @@ import tempfile import threading import time +from typing import ClassVar, cast import cv2 import numpy as np @@ -35,9 +36,11 @@ from dimos.hardware.sensors.camera.webcam import Webcam from dimos.memory2.module import OnExisting, Recorder from dimos.memory2.store.sqlite import SqliteStore +from dimos.memory2.stream import Stream from dimos.msgs.sensor_msgs.Image import Image, ImageFormat from dimos.protocol.pubsub.impl.h264_lcm import H264LCM from dimos.protocol.video.h264 import H264Config, H264Decoder, VideoDecodeGapError +from dimos.utils.data import backup_file from dimos.utils.logging_config import setup_logger from dimos.visualization.vis_module import vis_module @@ -113,15 +116,63 @@ def _make_frame(self, seq: int) -> Image: ) -class H264E2ERecorder(Recorder): +class _H264RecorderMixin: + """Mixin that stores selected Image inputs with the H.264 codec.""" + + h264_streams: ClassVar[frozenset[str]] = frozenset() + + @rpc + def start(self) -> None: + recorder = cast("Recorder", self) + Module.start(recorder) + + if recorder.config.g.replay: + logger.info( + "Replay mode active — Recorder disabled, leaving %s untouched", + recorder.config.db_path, + ) + return + + db_path = Path(recorder.config.db_path) + if db_path.exists(): + if recorder.config.on_existing is OnExisting.OVERWRITE: + db_path.unlink() + logger.info("Deleted existing recording %s", db_path) + elif recorder.config.on_existing is OnExisting.BACKUP: + backup = backup_file(db_path, keep_last=recorder.config.backup_keep_last) + if backup is None: + logger.info("Removed existing recording %s 
(backup_keep_last=0)", db_path) + else: + logger.info("Backed up existing recording %s -> %s", db_path, backup) + else: + raise FileExistsError(f"Recording already exists: {db_path}") + + if not recorder.inputs: + logger.warning("Recorder has no In ports — nothing to record, subclass the Recorder") + return + + for name, port in recorder.inputs.items(): + stream: Stream[Image] + h264_streams = getattr(self, "h264_streams", frozenset()) + if name in h264_streams: + stream = recorder.store.stream(name, port.type, codec="h264") + else: + stream = recorder.store.stream(name, port.type) + recorder._port_to_stream(name, port, stream) + logger.info("Recording %s (%s)", name, port.type.__name__) + + +class H264E2ERecorder(_H264RecorderMixin, Recorder): """Recorder with a typed image input for the synthetic H.264 demo.""" + h264_streams: ClassVar[frozenset[str]] = frozenset({"color_image"}) color_image: In[Image] -class H264WebcamRecorder(Recorder): +class H264WebcamRecorder(_H264RecorderMixin, Recorder): """Recorder with a typed image input for webcam H.264 QA.""" + h264_streams: ClassVar[frozenset[str]] = frozenset({"color_image"}) color_image: In[Image] @@ -131,9 +182,10 @@ class JpegBenchmarkRecorder(Recorder): jpeg_image: In[Image] -class H264BenchmarkRecorder(Recorder): +class H264BenchmarkRecorder(_H264RecorderMixin, Recorder): """Recorder for the H.264 side of the storage-size benchmark.""" + h264_streams: ClassVar[frozenset[str]] = frozenset({"h264_image"}) h264_image: In[Image] @@ -509,7 +561,6 @@ def _webcam() -> Webcam: H264E2ERecorder.blueprint( db_path="h264_video_e2e.db", on_existing=OnExisting.OVERWRITE, - codecs={"color_image": "h264"}, ), H264VideoProbe.blueprint(), ).transports( @@ -533,7 +584,6 @@ def _webcam() -> Webcam: H264BenchmarkRecorder.blueprint( db_path="benchmark_h264.db", on_existing=OnExisting.OVERWRITE, - codecs={"h264_image": "h264"}, ), H264StorageBenchmarkReporter.blueprint( jpeg_db_path="benchmark_jpeg.db", @@ -556,7 +606,6 @@ def 
_webcam() -> Webcam: H264WebcamRecorder.blueprint( db_path="webcam_h264.db", on_existing=OnExisting.OVERWRITE, - codecs={"color_image": "h264"}, ), ).transports( { diff --git a/docs/capabilities/memory/index.md b/docs/capabilities/memory/index.md index 537fdc4283..3641b45c33 100644 --- a/docs/capabilities/memory/index.md +++ b/docs/capabilities/memory/index.md @@ -225,15 +225,20 @@ color = store.stream( ) ``` -Recorders can configure the same setting per input stream: +Recorder modules that need H.264 storage should create their target stream with +the same codec override: ```python skip +from dimos.core.stream import In from dimos.memory2.module import Recorder +from dimos.msgs.sensor_msgs.Image import Image -recorder = Recorder.blueprint( - db_path="robot_video.db", - codecs={"color_image": "h264"}, -) +class H264Recorder(Recorder): + color_image: In[Image] + + def start(self) -> None: + stream = self.store.stream("color_image", Image, codec="h264") + self._port_to_stream("color_image", self.color_image, stream) ``` H.264 storage keeps the normal memory2 shape: one observation row per source diff --git a/openspec/changes/add-h264-codec-mem2-storage/design.md b/openspec/changes/add-h264-codec-mem2-storage/design.md index f18645459a..591df2bc96 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/design.md +++ b/openspec/changes/add-h264-codec-mem2-storage/design.md @@ -104,10 +104,11 @@ memory2 stores H.264 through a normal codec: store.stream("color_image", Image, codec="h264") ``` -or recorder config: +or an H.264-specific recorder subclass: ```python -Recorder.blueprint(codecs={"color_image": "h264"}) +stream = self.store.stream("color_image", Image, codec="h264") +self._port_to_stream("color_image", self.color_image, stream) ``` `H264ImageCodec` only stores/restores encoded `Image` values. It does not decode pixels and does not own GOP state. Reopened stores restore the codec through the existing stream registry `codec_id` field. 
diff --git a/openspec/changes/add-h264-codec-mem2-storage/docs.md b/openspec/changes/add-h264-codec-mem2-storage/docs.md index bbbab8f0e9..b55ec0873c 100644 --- a/openspec/changes/add-h264-codec-mem2-storage/docs.md +++ b/openspec/changes/add-h264-codec-mem2-storage/docs.md @@ -10,7 +10,7 @@ - Update `docs/usage/blueprints.md` with an opt-in blueprint transport mapping example for `H264LcmTransport` and `H264Config`. - Update memory2 user docs, likely under `docs/usage/` or the memory2 capability docs, to describe opt-in H.264-backed image storage: - Default image storage remains JPEG-backed. - - Users opt in per stream with `codec="h264"` or recorder `codecs={"stream": "h264"}`. + - Users opt in per stream with `codec="h264"`; recorder subclasses that need H.264 storage create their target stream with that codec override. - memory2 still stores one observation per source frame. - metadata queries and `obs.data` access return encoded `Image` values without pixel decode. - explicit H.264 decode sessions convert encoded replay streams to raw decoded `Image` values and suppress deltas until the first keyframe at or after the start point. 
From a19705c1844111528ea283cb6a21680c5ae16844 Mon Sep 17 00:00:00 2001 From: cc Date: Fri, 12 Jun 2026 10:43:32 -0700 Subject: [PATCH 07/14] feat: add distributed test --- dimos/protocol/video/demo_h264_video_e2e.py | 49 +++++++++++++++++++++ dimos/robot/all_blueprints.py | 3 ++ 2 files changed, 52 insertions(+) diff --git a/dimos/protocol/video/demo_h264_video_e2e.py b/dimos/protocol/video/demo_h264_video_e2e.py index 82d735f25b..6b5f73003c 100644 --- a/dimos/protocol/video/demo_h264_video_e2e.py +++ b/dimos/protocol/video/demo_h264_video_e2e.py @@ -550,6 +550,7 @@ def summary(self) -> str: _h264_config = H264Config(bitrate=1_000_000, target_fps=10, keyframe_interval=15) _webcam_h264_config = H264Config(bitrate=2_000_000, target_fps=15, keyframe_interval=30) _benchmark_h264_config = H264Config(bitrate=1_500_000, target_fps=15, keyframe_interval=30) +_inter_machine_h264_topic = "/demo_h264_inter_machine/color_image" def _webcam() -> Webcam: @@ -619,6 +620,54 @@ def _webcam() -> Webcam: ) +demo_h264_webcam_rerun = autoconnect( + CameraModule.blueprint(hardware=_webcam, transform=None, frequency=15.0), + H264VideoProbe.blueprint(), + vis_module( + "rerun", + rerun_config={"pubsubs": [H264LCM(config=_webcam_h264_config)]}, + ), +).transports( + { + ("color_image", Image): H264LcmTransport( + "/demo_h264_webcam_rerun/color_image", + Image, + config=_webcam_h264_config, + ) + } +) + + +demo_h264_webcam_publish = autoconnect( + CameraModule.blueprint(hardware=_webcam, transform=None, frequency=15.0), +).transports( + { + ("color_image", Image): H264LcmTransport( + _inter_machine_h264_topic, + Image, + config=_webcam_h264_config, + ) + } +) + + +demo_h264_rerun_subscribe = autoconnect( + H264VideoProbe.blueprint(), + vis_module( + "rerun", + rerun_config={"pubsubs": [H264LCM(config=_webcam_h264_config)]}, + ), +).transports( + { + ("color_image", Image): H264LcmTransport( + _inter_machine_h264_topic, + Image, + config=_webcam_h264_config, + ) + } +) + + 
demo_h264_webcam_replay = autoconnect( H264MemoryReplay.blueprint(db_path="webcam_h264.db"), H264VideoProbe.blueprint(), diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index e6f1f93409..7539c4caa0 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -50,10 +50,13 @@ "demo-error-on-name-conflicts": "dimos.robot.unitree.demo_error_on_name_conflicts:demo_error_on_name_conflicts", "demo-google-maps-skill": "dimos.agents.skills.demo_google_maps_skill:demo_google_maps_skill", "demo-gps-nav": "dimos.agents.skills.demo_gps_nav:demo_gps_nav", + "demo-h264-rerun-subscribe": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_rerun_subscribe", "demo-h264-storage-benchmark": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_storage_benchmark", "demo-h264-video-e2e": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_video_e2e", + "demo-h264-webcam-publish": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_webcam_publish", "demo-h264-webcam-record": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_webcam_record", "demo-h264-webcam-replay": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_webcam_replay", + "demo-h264-webcam-rerun": "dimos.protocol.video.demo_h264_video_e2e:demo_h264_webcam_rerun", "demo-mcp-stress-test": "dimos.core.tests.stress_test_blueprint:demo_mcp_stress_test", "demo-object-scene-registration": "dimos.perception.demo_object_scene_registration:demo_object_scene_registration", "demo-osm": "dimos.mapping.osm.demo_osm:demo_osm", From dc578e3c3991e1a58415b1d85b4290a0cf9ee4a4 Mon Sep 17 00:00:00 2001 From: cc Date: Fri, 12 Jun 2026 10:43:50 -0700 Subject: [PATCH 08/14] spec: remove --- .../.openspec.yaml | 2 - .../add-h264-codec-mem2-storage/design.md | 159 ------------------ .../add-h264-codec-mem2-storage/docs.md | 58 ------- .../add-h264-codec-mem2-storage/proposal.md | 40 ----- .../specs/h264-image-streams/spec.md | 115 ------------- .../specs/memory2-h264-storage/spec.md | 94 ----------- 
.../add-h264-codec-mem2-storage/tasks.md | 61 ------- openspec/config.yaml | 45 ----- openspec/schemas/dimos-capability/schema.yaml | 128 -------------- .../dimos-capability/templates/design.md | 35 ---- .../dimos-capability/templates/docs.md | 19 --- .../dimos-capability/templates/proposal.md | 32 ---- .../dimos-capability/templates/spec.md | 16 -- .../dimos-capability/templates/tasks.md | 15 -- 14 files changed, 819 deletions(-) delete mode 100644 openspec/changes/add-h264-codec-mem2-storage/.openspec.yaml delete mode 100644 openspec/changes/add-h264-codec-mem2-storage/design.md delete mode 100644 openspec/changes/add-h264-codec-mem2-storage/docs.md delete mode 100644 openspec/changes/add-h264-codec-mem2-storage/proposal.md delete mode 100644 openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md delete mode 100644 openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md delete mode 100644 openspec/changes/add-h264-codec-mem2-storage/tasks.md delete mode 100644 openspec/config.yaml delete mode 100644 openspec/schemas/dimos-capability/schema.yaml delete mode 100644 openspec/schemas/dimos-capability/templates/design.md delete mode 100644 openspec/schemas/dimos-capability/templates/docs.md delete mode 100644 openspec/schemas/dimos-capability/templates/proposal.md delete mode 100644 openspec/schemas/dimos-capability/templates/spec.md delete mode 100644 openspec/schemas/dimos-capability/templates/tasks.md diff --git a/openspec/changes/add-h264-codec-mem2-storage/.openspec.yaml b/openspec/changes/add-h264-codec-mem2-storage/.openspec.yaml deleted file mode 100644 index fb1ec77bfd..0000000000 --- a/openspec/changes/add-h264-codec-mem2-storage/.openspec.yaml +++ /dev/null @@ -1,2 +0,0 @@ -schema: dimos-capability -created: 2026-06-10 diff --git a/openspec/changes/add-h264-codec-mem2-storage/design.md b/openspec/changes/add-h264-codec-mem2-storage/design.md deleted file mode 100644 index 591df2bc96..0000000000 --- 
a/openspec/changes/add-h264-codec-mem2-storage/design.md +++ /dev/null @@ -1,159 +0,0 @@ -## Context - -DimOS modules exchange typed `Image` streams. Existing JPEG compression keeps that public type stable: JPEG is a storage/transport codec detail, and callers usually see decoded raw pixels. H.264 needs a similar opt-in path, but it differs from JPEG because many frames are delta frames that require prior GOP state to decode. - -This design keeps the PR minimal and coherent with existing abstractions: - -- `Image` remains the public payload type. -- Default image storage remains JPEG. -- memory2 continues to use the normal `Backend` + `Codec` path. -- H.264 live transport owns live encode/decode state. -- H.264 storage stores encoded `Image` values through a normal `H264ImageCodec`, not through a special backend. - -Foxglove's H.264 guidance remains the packet-shape target: each encoded message contains all Annex B NAL units emitted for one encoder input frame. A complete encoded frame packet is not necessarily independently decodable; P-frames still require earlier GOP state. - -## Goals / Non-Goals - -**Goals:** - -- Preserve `Out[Image]` and `In[Image]` as the user-facing stream contract. -- Extend `Image` so it can explicitly carry either raw pixels (`encoding="raw"`) or encoded H.264 access-unit bytes (`encoding="h264"`). -- Add H.264 encode/decode sessions with GOP/keyframe tracking, sequence-gap suppression, and explicit unsupported-format/dependency errors. -- Add `H264LcmTransport` with a decode mode: - - `decode_images=True`: subscribers receive decoded raw `Image` values. - - `decode_images=False`: subscribers receive encoded `Image` values for storage or inspection. -- Add `H264ImageCodec` so memory2 can store encoded H.264 `Image` values through the existing codec path. -- Keep `codec_for(Image)` as JPEG and require explicit `codec="h264"` for H.264 storage. 
-- Document v1 best-effort behavior: no transport QoS, durable keyframe cache, keyframe request, or guaranteed arbitrary random pixel decode. - -**Non-Goals:** - -- Adding a special memory2 backend for H.264. -- Adding a generic payload-strategy framework for this PR. -- Adding lazy pixels to `Image`; `Image.data` remains eager and is either `np.ndarray` for raw images or `bytes` for encoded images. -- Exposing a separate public encoded-video stream type. -- Supporting depth, 16-bit, alpha, or arbitrary pixel formats in the first implementation. -- Making H.264 the default image storage codec. - -## Architecture - -### Image payload shape - -`Image` gains two explicit codec fields: - -```python -encoding: str = "raw" -codec_metadata: dict[str, Any] = {} -``` - -For raw images, `data` is a NumPy array and existing pixel operations work. - -For H.264 images, `data` is bytes containing one complete Annex B access unit for one source frame. `format` still describes the decoded pixel layout (for example, RGB or BGR), while `codec_metadata` carries video metadata such as: - -```python -{ - "codec": "h264", - "bitstream": "annex_b", - "seq": 42, - "is_keyframe": False, - "keyframe_seq": 30, - "pts": 3780, - "width": 640, - "height": 480, - "channels": 3, - "dtype": "uint8", -} -``` - -Pixel operations such as `to_rgb()`, `to_bgr()`, `to_opencv()`, `as_numpy()`, `brightness`, and Rerun conversion require `encoding="raw"` and fail clearly for encoded images. - -### H.264 codec/session layer - -`dimos/protocol/video/h264.py` provides the shared stateful video logic: - -- `H264Config`: bitrate, target FPS, keyframe interval, profile/tune/preset, max GOP, supported formats. -- `AiortcH264Codec`: adapter around aiortc/PyAV H.264 encode/decode internals. -- `H264Encoder`: converts raw `Image` to encoded `Image(encoding="h264")`. -- `H264Decoder`: converts encoded H.264 `Image` to raw `Image` when GOP state is valid. 
-- `GopBuffer`: tracks sequence numbers and keyframe state; suppresses deltas after gaps until a keyframe. -- `H264AccessUnit`: assembles aiortc RTP-sized payloads into one Annex B access unit. - -Transport and replay/view code instantiate separate encoder/decoder sessions. They share implementation, not runtime state. - -### Live transport - -`H264LcmTransport` mirrors the JPEG transport pattern while adding an explicit decode mode. - -```python -H264LcmTransport("/camera/color", Image, config=H264Config(...)) -``` - -Default mode decodes on receive, so normal subscribers get raw `Image` values. - -```python -H264LcmTransport("/camera/color", Image, config=cfg, decode_images=False) -``` - -Encoded mode still uses the logical `Image` type, but subscribers receive `Image(encoding="h264")`. This is the mode used by recorders that should persist transport-produced H.264 bytes. - -### memory2 storage - -memory2 stores H.264 through a normal codec: - -```python -store.stream("color_image", Image, codec="h264") -``` - -or an H.264-specific recorder subclass: - -```python -stream = self.store.stream("color_image", Image, codec="h264") -self._port_to_stream("color_image", self.color_image, stream) -``` - -`H264ImageCodec` only stores/restores encoded `Image` values. It does not decode pixels and does not own GOP state. Reopened stores restore the codec through the existing stream registry `codec_id` field. - -This means H.264 recording expects the recorder input to receive encoded Images, typically by subscribing through `H264LcmTransport(..., decode_images=False)`. If a recorder receives raw Images, either use the default JPEG codec or explicitly encode before appending. - -### Replay and visualization - -memory2 replay of a stream stored with `codec="h264"` emits encoded Images in timestamp order. A separate H.264 decoder session converts that encoded stream to raw Images for Rerun or consumers. 
V1 decode policy is best effort: if replay starts mid-GOP, deltas are suppressed until the first keyframe at or after the start point. - -## Decisions - -1. **Use encoded `Image`, not a separate public encoded-video type.** - - Rationale: the user-facing type remains `Image` across transport and memory2. H.264 packet metadata lives in `Image.codec_metadata`. - -2. **Use normal memory2 codecs, not a special backend.** - - Rationale: memory2 already persists blob payloads through `Codec`. H.264 encoded images can be stored as encoded data without changing `Store` or `Backend` semantics. - -3. **Keep `codec_for(Image)` as JPEG.** - - Rationale: default behavior must remain stateless, compatible, and independent of H.264 dependencies. - -4. **Let transport choose decoded vs encoded subscriber payloads.** - - Rationale: normal modules want raw Images, while recorders may want the H.264 bytes produced by transport. The choice is explicit on `H264LcmTransport`. - -5. **Decode only from valid GOP state.** - - Rationale: complete per-frame access units remove RTP-fragment handling but not inter-frame dependency. P-frames still require prior decoded state. - -6. **Defer QoS.** - - Rationale: LCM is best effort. Keyframe request, durable keyframe cache, retransmission, PLI, and transport QoS belong in a later video-session/QoS design. - -## Safety / Replay - -This change affects image transport and recording only. It does not command robot hardware or change control loops. - -Unsupported image formats must fail explicitly when H.264 encoding is selected. Encoded images must not silently pass through raw-pixel methods. - -Replay after arbitrary seek is best effort. A decoder session starts without GOP state, suppresses deltas until the first keyframe at or after the start point, then emits decoded raw Images for that keyframe and following decodable deltas. Full random pixel access to any arbitrary P-frame is not a v1 guarantee. - -## Migration / Rollout - -1. 
Extend `Image` with `encoding` and `codec_metadata` while preserving raw eager defaults. -2. Add H.264 encoder/decoder/session classes that produce and consume encoded Images. -3. Add `H264LcmTransport` decode mode. -4. Add `H264ImageCodec` and explicit `codec="h264"` storage. -5. Update demos so recording uses encoded transport mode and replay decodes through an H.264 session before visualization. -6. Update docs/tests/specs to remove obsolete storage-strategy and packet-type language. - -Rollback is straightforward for new runs: remove H.264 transport/storage configuration and streams return to normal raw/JPEG behavior. Existing H.264-backed recordings require the H.264 codec path to read encoded Images and a decoder session to view pixels. diff --git a/openspec/changes/add-h264-codec-mem2-storage/docs.md b/openspec/changes/add-h264-codec-mem2-storage/docs.md deleted file mode 100644 index b55ec0873c..0000000000 --- a/openspec/changes/add-h264-codec-mem2-storage/docs.md +++ /dev/null @@ -1,58 +0,0 @@ -## User-Facing Docs - -- Update `docs/usage/transports/index.md` or the image-transport-specific transport docs to describe opt-in H.264 image transport behavior: - - Public module streams remain `Out[Image]` and `In[Image]`. - - `H264LcmTransport` compresses image payloads internally as H.264 and delivers decoded `Image` objects by default. - - `decode_images=False` delivers encoded `Image` objects for storage or explicit decode sessions. - - H.264 encoded images contain complete Annex B access units for one source frame, matching Foxglove-style `CompressedVideo` expectations. - - Delta frames require prior GOP state; after packet loss or late join, subscribers resume on the next keyframe. - - Unsupported image formats fail clearly rather than silently converting. -- Update `docs/usage/blueprints.md` with an opt-in blueprint transport mapping example for `H264LcmTransport` and `H264Config`. 
-- Update memory2 user docs, likely under `docs/usage/` or the memory2 capability docs, to describe opt-in H.264-backed image storage: - - Default image storage remains JPEG-backed. - - Users opt in per stream with `codec="h264"`; recorder subclasses that need H.264 storage create their target stream with that codec override. - - memory2 still stores one observation per source frame. - - metadata queries and `obs.data` access return encoded `Image` values without pixel decode. - - explicit H.264 decode sessions convert encoded replay streams to raw decoded `Image` values and suppress deltas until the first keyframe at or after the start point. - - replay emits encoded `Image` values on the normal replay schedule. -- Add a short manual QA section for `demo-h264-video-e2e` after the demo blueprint exists: - - run `dimos run demo-h264-video-e2e --daemon` - - inspect probe/recorder logs - - query the generated memory2 store - - validate encoded storage, replay decode, and seq-gap recovery. -- Mention optional video dependencies in the installation or feature docs. Users should know that H.264 mode requires the aiortc/PyAV/FFmpeg dependency path while JPEG defaults remain available without selecting H.264. - -## Contributor Docs - -- Update `docs/development/testing.md` or a nearby development testing guide with H.264-specific test commands once tests exist: - - unit tests for encoded `Image` metadata, H.264 access-unit assembly, GOP buffering, unsupported formats, and raw-pixel guards - - memory2 storage tests for `H264ImageCodec`, append/query/reopen/replay, and default JPEG compatibility - - synthetic end-to-end demo/blueprint smoke test for live LCM transmission and memory2 recording. -- Document dependency expectations for contributors who run video tests locally, including how to install the relevant `uv` extras and how tests should skip clearly when video dependencies are unavailable. 
-- If `demo_h264_video_e2e` is registered as a runnable blueprint, contributor docs should remind maintainers to regenerate `dimos/robot/all_blueprints.py` with `pytest dimos/robot/test_all_blueprints_generation.py`. - -## Coding-Agent Docs - -- Update `docs/coding-agents/index.md` or a focused coding-agent guide if agents are expected to modify image transports or memory2 storage: - - H.264 is opt-in and must not replace JPEG defaults. - - Keep public module contracts as `Image` streams. - - Store complete Annex B access units per source frame, not RTP fragments. - - Preserve one memory2 observation per source frame. - - Avoid negative-only OpenSpec requirements when adding or editing specs; include positive `MUST`/`SHALL` statements. -- No `AGENTS.md` update is required unless maintainers want the H.264/Foxglove packet-shape rule to become a repo-wide coding-agent constraint. - -## Doc Validation - -- Run documentation link validation for changed docs if available: - - `uv run doclinks` -- Run markdown code-block validation for docs that contain executable Python snippets, for example: - - `uv run md-babel-py run docs/usage/blueprints.md` - - `uv run md-babel-py run ` -- If diagrams are added or regenerated, run: - - `bin/gen-diagrams` -- Validate generated blueprint registry freshness if the demo blueprint is registered: - - `uv run pytest dimos/robot/test_all_blueprints_generation.py` - -## No Docs Needed - -Documentation is needed. This change adds user-visible opt-in transport and memory2 storage configuration, dependency requirements, replay/lazy-decode behavior, and a runnable synthetic QA blueprint. 
diff --git a/openspec/changes/add-h264-codec-mem2-storage/proposal.md b/openspec/changes/add-h264-codec-mem2-storage/proposal.md deleted file mode 100644 index cff06956f4..0000000000 --- a/openspec/changes/add-h264-codec-mem2-storage/proposal.md +++ /dev/null @@ -1,40 +0,0 @@ -## Why - -DimOS image streams currently use full `Image` objects over typed transports and memory2 stores images as independent JPEG payloads. That is simple and compatible, but inefficient for long-running camera streams and remote subscribers because each frame is compressed independently and no shared video codec state is reused. - -DimOS needs an opt-in H.264 image-stream path that preserves the public `Image` stream contract while allowing live transports and memory2 storage to carry compact encoded image payloads. The design should make H.264 reusable across carriers such as LCM first, and DDS/WebRTC later, while keeping memory2 queries, pose/tag alignment, and replay frame timing intact. - -## What Changes - -- Add encoded `Image` support for RGB/BGR-style image streams, with one H.264 Annex B access unit per source frame and codec metadata on the `Image`. -- Add stateful H.264 encode/decode behavior that produces periodic self-contained keyframes, rejects unsupported image formats clearly, detects sequence gaps, and resumes delivery only after a valid keyframe. -- Add an opt-in live transport path for H.264 image streams, starting with LCM, that can expose decoded raw `Image` values or encoded H.264 `Image` values depending on subscriber configuration. -- Add memory2 H.264 image storage through a normal `H264ImageCodec` so streams can store one encoded `Image` observation per frame without a special storage backend. -- Preserve the existing JPEG image codec and JPEG-backed memory2 storage as the default behavior. -- No hardware-safety behavior changes are intended. -- No public robot-control, skill, or MCP breaking changes are intended. 
- -## Affected DimOS Surfaces - -- Modules/streams: typed `Image` streams, image-specific transport adapters, memory2 Recorder ingestion, memory2 Stream/Observation payload access, and replay output of encoded images for H.264-backed streams. -- Blueprints/CLI: blueprints may opt image streams into H.264-capable transports or memory2 H.264 storage; existing blueprint behavior remains unchanged unless configured. -- Skills/MCP: no direct skill or MCP behavior changes expected. -- Hardware/simulation/replay: camera-heavy hardware and simulation streams may benefit from reduced bandwidth/storage; H.264 replay emits encoded `Image` values on the same schedule and explicit decode sessions convert them to decoded frames for consumers. -- Docs/generated registries: memory2 and transport docs need updates; generated blueprint registries are not expected to change unless new demo blueprints are added. - -## Capabilities - -### New Capabilities - -- `h264-image-streams`: Covers encoded H.264 `Image` payloads, live image-stream encode/decode behavior, keyframe/GOP handling, sequence-gap behavior, and transport compatibility expectations. -- `memory2-h264-storage`: Covers opt-in H.264-backed memory2 image observation storage through `H264ImageCodec`, per-frame encoded image persistence, replay of encoded images, and explicit best-effort decode sessions. - -### Modified Capabilities - -- None. - -## Impact - -Users and developers gain a more bandwidth- and storage-efficient option for camera streams while keeping existing `Image` stream consumers and memory2 query/replay behavior familiar. Existing JPEG-backed recordings, default transports, and non-image streams remain compatible. - -Compatibility risk centers on adding optional video codec dependencies, keeping encoded images from accidentally flowing into raw-pixel operations, making GOP recovery deterministic after packet loss or replay seek, and avoiding silent corruption when frames cannot be decoded. 
Documentation and QA should cover opt-in configuration, supported image formats, dependency installation, LCM live-stream behavior, memory2 append/query/replay behavior, packet-loss recovery, and a small synthetic image-stream demo. diff --git a/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md b/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md deleted file mode 100644 index b5ca467d27..0000000000 --- a/openspec/changes/add-h264-codec-mem2-storage/specs/h264-image-streams/spec.md +++ /dev/null @@ -1,115 +0,0 @@ -## ADDED Requirements - -### Requirement: Opt-in H.264 image streams preserve the Image contract -DimOS SHALL allow an image stream to opt into H.264 encoding while preserving `Image` as the public stream payload type for publishers and subscribers. - -#### Scenario: Publisher and subscriber use normal Image objects -- **GIVEN** a blueprint configures an image stream for H.264 live transmission -- **AND** the source module publishes `Image` values on an `Out[Image]` stream -- **WHEN** a downstream module subscribes through an `In[Image]` stream -- **THEN** the downstream callback receives decoded `Image` values -- **AND** the module author does not need to publish or subscribe to a separate encoded video type. - -#### Scenario: Existing image streams remain unchanged by default -- **GIVEN** a blueprint does not opt an image stream into H.264 transmission -- **WHEN** the blueprint runs with its existing image transport configuration -- **THEN** DimOS MUST preserve the existing image transport behavior -- **AND** H.264 dependencies or settings are not required for that stream. - -### Requirement: H.264 encoded Images are complete per-frame Annex B access units -DimOS SHALL represent each H.264-transmitted source image frame as one encoded `Image` whose `data` contains the complete Annex B access unit emitted for that encoder input frame. 
- -#### Scenario: One encoded Image corresponds to one source frame -- **GIVEN** an H.264-enabled image stream publishes one source `Image` frame -- **WHEN** DimOS encodes that frame for a non-WebRTC carrier or for packet inspection -- **THEN** the encoded `Image.data` MUST contain all NAL units emitted for that source frame in Annex B form -- **AND** the encoded `Image.codec_metadata` MUST identify the payload as H.264 Annex B for exactly one source frame. - -#### Scenario: Delta-frame encoded Images require GOP state -- **GIVEN** an encoded `Image` contains a delta frame -- **WHEN** a decoder processes that encoded image without the prior GOP state required by H.264 -- **THEN** DimOS MUST treat the encoded image as requiring recovery from a keyframe -- **AND** DimOS MUST avoid presenting corrupted image pixels as a valid decoded `Image`. - -### Requirement: Keyframes bootstrap late join and recovery -DimOS SHALL provide periodic keyframes for H.264 image streams so subscribers can start or recover decoding at bounded intervals. - -#### Scenario: Late subscriber waits for a keyframe -- **GIVEN** an H.264 image stream is already publishing -- **WHEN** a subscriber joins after the stream has started -- **THEN** DimOS MUST begin delivering decoded images only after the subscriber has valid keyframe-based decoder state -- **AND** the subscriber must not receive corrupted decoded images from incomplete GOP state. - -#### Scenario: Keyframes include decoder parameter data -- **GIVEN** an H.264 image stream emits an IDR keyframe -- **WHEN** the keyframe encoded `Image` is used to bootstrap a new decoder -- **THEN** the encoded `Image.data` MUST include the decoder parameter information needed for that bootstrap, such as SPS/PPS for H.264 Annex B streams -- **AND** later delta frames in the same GOP may depend on that decoded keyframe state. 
- -### Requirement: H.264 transport can deliver decoded or encoded Images -DimOS SHALL allow H.264 live transport subscribers to receive decoded raw `Image` values by default or encoded H.264 `Image` values when explicitly requested. - -#### Scenario: Default subscriber receives decoded Images -- **GIVEN** a blueprint configures `H264LcmTransport` without changing its decode mode -- **WHEN** a source publishes raw `Image` values -- **THEN** the subscriber MUST receive raw decoded `Image` values -- **AND** pixel operations on those images remain valid. - -#### Scenario: Encoded subscriber receives H.264 Images -- **GIVEN** a blueprint configures `H264LcmTransport` with encoded delivery enabled -- **WHEN** a source publishes raw `Image` values -- **THEN** the subscriber MUST receive `Image` values with `encoding="h264"` -- **AND** those images MUST preserve H.264 frame metadata needed by downstream storage or decode sessions. - -### Requirement: H.264 live decode is best-effort without QoS guarantees -DimOS SHALL apply a best-effort H.264 decode policy for live carriers that do not provide video QoS, keyframe requests, or durable keyframe caching. - -#### Scenario: Subscriber starts without GOP state -- **GIVEN** an H.264 live subscriber starts receiving packets at a point whose first packet is a delta frame -- **WHEN** the subscriber's decoder has no valid prior GOP state -- **THEN** DimOS MUST suppress decoded output for undecodable delta frames -- **AND** DimOS MUST begin delivering decoded `Image` values after the first keyframe at or after the subscriber start point establishes valid decoder state. 
- -#### Scenario: QoS policy is deferred -- **GIVEN** an H.264 image stream uses an LCM-style best-effort carrier -- **WHEN** packets are lost or a subscriber joins late -- **THEN** DimOS MUST rely on periodic keyframes and decode suppression for v1 recovery -- **AND** DimOS documentation must describe keyframe request, durable keyframe cache, retransmission, and transport QoS as follow-up design work rather than v1 guarantees. - -### Requirement: Sequence gaps recover safely -DimOS SHALL detect missing or out-of-order H.264 live-stream packets and resume decoded image delivery from a valid keyframe state. - -#### Scenario: Packet loss occurs mid-GOP -- **GIVEN** a subscriber is decoding an H.264 image stream -- **WHEN** DimOS detects a sequence gap before the next keyframe -- **THEN** DimOS MUST stop delivering decoded images from the invalid GOP state -- **AND** DimOS SHALL resume delivery after a subsequent keyframe establishes valid decoder state. - -### Requirement: Unsupported image formats fail explicitly -DimOS SHALL accept only image formats supported by the configured H.264 image-stream mode and provide a clear failure for unsupported formats. - -#### Scenario: Supported color image is transmitted -- **GIVEN** an H.264-enabled image stream receives a supported 8-bit color `Image` format -- **WHEN** DimOS encodes and transmits the image -- **THEN** subscribers MUST receive a decoded `Image` with the expected dimensions, timestamp, frame identifier, and color format semantics. - -#### Scenario: Unsupported image format is rejected -- **GIVEN** an H.264-enabled image stream receives an unsupported image format such as depth, 16-bit, or alpha data -- **WHEN** DimOS attempts to encode or publish the image through the H.264 stream mode -- **THEN** DimOS MUST fail with a clear unsupported-format error -- **AND** DimOS MUST preserve safety by avoiding silent lossy conversion or corrupted output. 
- -### Requirement: H.264 stream configuration is observable and bounded -DimOS SHALL expose user-configurable H.264 stream settings for bitrate, keyframe cadence, frame-rate assumptions, and low-latency profile behavior. - -#### Scenario: Blueprint opts into H.264 settings -- **GIVEN** a blueprint configures an image stream for H.264 live transmission with bitrate and keyframe cadence settings -- **WHEN** the blueprint runs -- **THEN** DimOS MUST apply those settings to the H.264 stream behavior -- **AND** subscribers must continue to observe normal `Image` payloads rather than codec-specific internals. - -#### Scenario: H.264 dependencies are unavailable -- **GIVEN** a user selects H.264 image-stream mode in an environment without the required video codec dependencies -- **WHEN** DimOS starts or initializes the H.264 stream -- **THEN** DimOS MUST fail with an actionable dependency error -- **AND** DimOS MUST preserve non-H.264 image-stream behavior for configurations that do not select H.264. diff --git a/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md b/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md deleted file mode 100644 index f4ebc05a6e..0000000000 --- a/openspec/changes/add-h264-codec-mem2-storage/specs/memory2-h264-storage/spec.md +++ /dev/null @@ -1,94 +0,0 @@ -## ADDED Requirements - -### Requirement: H.264 image storage is opt-in per memory2 stream -memory2 SHALL allow image streams to opt into H.264 storage through the normal codec configuration path while preserving the default image-storage behavior for streams that do not opt in. 
- -#### Scenario: Stream opts into H.264 storage -- **GIVEN** a memory2 image stream is configured with the H.264 image codec -- **WHEN** the stream appends encoded `Image` values with `encoding="h264"` -- **THEN** memory2 MUST store those image observations using H.264 encoded payloads through the existing backend/blob path -- **AND** queries for the stream must continue to return image observations associated with the original frame timestamps. - -#### Scenario: H.264 storage uses the normal codec extension point -- **GIVEN** a store creates an `Image` stream with `codec="h264"` -- **WHEN** memory2 creates the stream backend -- **THEN** memory2 MUST use the normal codec resolution and blob persistence flow -- **AND** the generic store and backend paths must not contain H.264-specific branches or imports. - -#### Scenario: Stream uses default image storage -- **GIVEN** a memory2 image stream is created without H.264 image-storage configuration -- **WHEN** the stream appends `Image` values -- **THEN** memory2 MUST preserve the existing default image-storage behavior -- **AND** existing JPEG-backed recordings remain readable through the normal memory2 APIs. - -### Requirement: H.264 storage preserves one observation per source frame -memory2 SHALL store H.264-backed image streams with one observation corresponding to each source image frame. - -#### Scenario: Recording a sequence of image frames -- **GIVEN** a recorder receives a sequence of `Image` frames on an H.264-backed memory2 stream -- **WHEN** memory2 stores the sequence -- **THEN** memory2 MUST create one queryable observation per source frame -- **AND** each observation must retain its timestamp, frame identifier, pose metadata when available, and tags independently of pixel decode. - -### Requirement: Stored H.264 Images are complete frame access units -memory2 SHALL store each H.264 image observation with an encoded `Image` payload that contains the complete Annex B access unit for that source frame. 
- -#### Scenario: Stored encoded Image is inspected or exported -- **GIVEN** an H.264-backed image observation has an encoded `Image` payload -- **WHEN** the payload is inspected by storage tooling or exported to a compatible video-message format -- **THEN** the `Image.data` payload MUST represent all NAL units emitted for that source frame in Annex B form -- **AND** the `Image.codec_metadata` MUST include H.264 frame metadata such as sequence, keyframe state, keyframe reference, presentation timestamp, dimensions, codec, and bitstream -- **AND** memory2 MUST avoid exposing individual RTP fragments as the stored observation payload. - -### Requirement: Decode starts from valid keyframe state -H.264 decoded views over memory2 replay SHALL use the same best-effort H.264 decode policy as live subscribers: decode starts without GOP state and suppresses delta frames until a keyframe at or after the start point establishes valid decoder state. - -#### Scenario: Replay seeks into the middle of a GOP -- **GIVEN** a user starts replay or a decoded view at a timestamp whose first stored H.264 packet is a delta frame -- **WHEN** a H.264 decode session decodes the replayed encoded images from that start point -- **THEN** the decode session MUST suppress undecodable delta frames until the first keyframe at or after the start point -- **AND** the decoded view MUST emit decoded `Image` values for that keyframe and following decodable delta frames. - -#### Scenario: Required GOP state is missing -- **GIVEN** an H.264 encoded image requires prior GOP state to decode -- **WHEN** a decode session has no usable keyframe state -- **THEN** the decode session MUST fail or suppress the decode with a clear decode error -- **AND** DimOS MUST avoid returning corrupted pixels as a valid decoded `Image`. - -### Requirement: Metadata queries do not force pixel decode -memory2 SHALL allow metadata and encoded-payload access for H.264-backed image observations without decoding image pixels. 
- -#### Scenario: Query reads observation metadata only -- **GIVEN** a memory2 store contains H.264-backed image observations -- **WHEN** a user queries observations and reads timestamps, frame identifiers, pose metadata, tags, `Image.encoding`, or H.264 codec metadata -- **THEN** memory2 MUST provide that information without requiring H.264 pixel decode -- **AND** the stored `obs.data` value for a H.264 stream MUST be an encoded `Image`, not a decoded pixel image. - -### Requirement: H.264 codec stores and restores encoded Images -memory2 SHALL store and restore H.264 observations as encoded `Image` values through the H.264 image codec. - -#### Scenario: User accesses observation data from an H.264 stream -- **GIVEN** a queried H.264-backed image observation was stored with the H.264 image codec -- **WHEN** the user accesses `obs.data` -- **THEN** memory2 MUST return an `Image` value with `encoding="h264"` -- **AND** pixel decoding MUST require an explicit H.264 decode session outside the generic memory2 backend. - -### Requirement: H.264-backed replay emits encoded Images -memory2 SHALL replay H.264-backed image streams as encoded `Image` values on the existing replay schedule. - -#### Scenario: Replaying a stored H.264 image stream -- **GIVEN** a memory2 store contains an H.264-backed image stream -- **WHEN** replay is started for that stream -- **THEN** replay MUST emit encoded `Image` values in observation timestamp order -- **AND** an explicit H.264 decode session MAY convert those encoded images to raw decoded `Image` values for visualization or consumers -- **AND** that decode session MUST skip undecodable deltas before the first valid keyframe at or after the replay start point. - -### Requirement: H.264 storage survives store reopen -memory2 SHALL persist H.264 codec configuration and encoded image metadata so a reopened store can query and replay H.264-backed image streams. 
- -#### Scenario: Reopen and decode -- **GIVEN** a memory2 store was written with an H.264-backed image stream -- **WHEN** the process closes and a later process reopens the store -- **THEN** memory2 MUST recognize the stream as H.264-backed -- **AND** the reopened store must return encoded `Image` values from query and replay -- **AND** explicit decode sessions must retain the same best-effort keyframe-start behavior after reopen. diff --git a/openspec/changes/add-h264-codec-mem2-storage/tasks.md b/openspec/changes/add-h264-codec-mem2-storage/tasks.md deleted file mode 100644 index f8878ee316..0000000000 --- a/openspec/changes/add-h264-codec-mem2-storage/tasks.md +++ /dev/null @@ -1,61 +0,0 @@ -## 1. Encoded Image and H.264 codec behavior - -- [x] 1.1 Add encoded `Image` support for one complete H.264 Annex B access unit per source frame, including sequence, timestamp, frame identifier, dimensions, format, codec, bitstream, keyframe, keyframe-reference, presentation timestamp, and payload metadata. -- [x] 1.2 Add H.264 configuration covering bitrate, target FPS, keyframe interval, profile, preset/tune, maximum GOP length, and supported pixel format settings. -- [x] 1.3 Add the aiortc-backed H.264 adapter that converts `Image` frames to H.264 output and converts H.264 input back to `Image` while keeping aiortc/RTP internals out of public DimOS APIs. -- [x] 1.4 Add access-unit assembly so all NAL units emitted for one encoder input frame are stored or transmitted as one Annex B packet, not as individual RTP fragments. -- [x] 1.5 Add GOP/keyframe state tracking that detects sequence gaps, marks decoder state invalid, suppresses corrupted output, and resumes only after a usable keyframe. -- [x] 1.6 Add explicit errors for unsupported image formats, missing video dependencies, and unusable GOP/decode state. 
-- [x] 1.7 Add focused codec tests for per-frame Annex B encoded image shape, keyframe metadata, SPS/PPS bootstrap behavior, sequence-gap handling, dependency errors, and unsupported image formats. - -## 2. Image compatibility and encoded-payload guards - -- [x] 2.1 Keep raw `Image` eager and numpy-backed while allowing encoded `Image` payloads to carry H.264 bytes and codec metadata. -- [x] 2.2 Preserve existing eager `Image` behavior and compatibility for current JPEG, LCM, SHM, memory2, and visualization consumers. -- [x] 2.3 Add tests proving eager images still work and encoded images reject raw-pixel operations. - -## 3. Live H.264 image transport - -- [x] 3.1 Add the H.264 LCM pubsub adapter that publishes encoded video frame packets on the wire and delivers decoded `Image` values to subscribers. -- [x] 3.2 Add `H264LcmTransport` to the transport layer with worker-safe serialization behavior and a `decode_images` mode matching existing transport patterns. -- [x] 3.3 Keep normal image transport behavior unchanged unless a blueprint explicitly opts a stream into H.264 transport. -- [x] 3.4 Add live transport tests for `Out[Image]` to `In[Image]` delivery, keyframe bootstrap, late subscriber behavior, sequence-gap recovery, and default transport compatibility. - -## 4. memory2 H.264 image codec storage - -- [x] 4.1 Add an `H264ImageCodec` that stores and restores encoded `Image` values through the normal memory2 codec path. -- [x] 4.2 Route configured memory2 `Image` streams through `codec="h264"` while leaving unconfigured `Image` streams on the existing default JPEG storage path. -- [x] 4.3 Store one observation row per source frame and one encoded `Image` payload containing a complete Annex B access unit per observation. -- [x] 4.4 Persist H.264 codec metadata in encoded `Image` payloads instead of a memory2-specific frame index table. -- [x] 4.5 Persist and reload per-stream codec configuration so reopened stores recognize H.264-backed image streams. 
-- [x] 4.6 Keep generic memory2 query and `obs.data` access encoded for H.264 streams; require explicit H.264 decode sessions for raw pixels. -- [x] 4.7 Add replay support that emits encoded `Image` values in observation timestamp order so an explicit decode session can apply best-effort keyframe startup. -- [x] 4.8 Add memory2 tests for `H264ImageCodec`, append/query, encoded payload access, store reopen, replay, default JPEG compatibility, and encoded-image raw-pixel guards. - -## 5. Synthetic end-to-end blueprint and manual QA surface - -- [x] 5.1 Add `dimos/protocol/video/demo_h264_video_e2e.py` with a deterministic synthetic `Image` source, H.264 memory2 recorder, and image probe. -- [x] 5.2 Configure the blueprint to exercise live H.264 LCM transmission and H.264 memory2 storage through encoded `Image` storage surfaces. -- [x] 5.3 Add probe status or logs that report received frame counts, dimensions, timestamp monotonicity, validation failures, and drop/recovery observations. -- [x] 5.4 Register the runnable blueprint as `demo-h264-video-e2e` if it is intended to be exposed through `dimos run`. -- [x] 5.5 Regenerate and verify `dimos/robot/all_blueprints.py` if the demo blueprint is registered. - -## 6. Documentation - -- [x] 6.1 Update user-facing transport docs with H.264 opt-in behavior, `Image` stream preservation, Annex B per-frame packets, keyframe/GOP recovery, unsupported formats, and dependency notes. -- [x] 6.2 Update blueprint docs with an H.264 image transport mapping example. -- [x] 6.3 Update memory2 docs with H.264 image codec configuration, one-observation-per-frame behavior, encoded `obs.data` access, explicit decode sessions, best-effort keyframe startup, and replay behavior. -- [x] 6.4 Add docs for running and inspecting the `demo-h264-video-e2e` synthetic QA blueprint. 
-- [x] 6.5 Update contributor testing docs with video dependency setup, focused test targets, skip behavior when dependencies are unavailable, and blueprint-registry regeneration guidance. -- [x] 6.6 Update coding-agent docs if maintainers want the H.264/Foxglove packet-shape rule documented for future agent edits. - -## 7. Verification - -- [x] 7.1 Run `openspec validate add-h264-codec-mem2-storage --strict`. -- [x] 7.2 Run focused unit tests for H.264 codec/access-unit/GOP behavior. -- [x] 7.3 Run focused unit tests for eager `Image` compatibility. -- [x] 7.4 Run focused memory2 storage tests for H.264 codec append/query/reopen/replay/default compatibility. -- [x] 7.5 Run focused live transport tests for H.264 LCM round-trip and sequence-gap recovery. -- [x] 7.6 Run `uv run pytest dimos/robot/test_all_blueprints_generation.py` if the demo blueprint is registered. -- [x] 7.7 Run relevant docs validation, including `uv run doclinks` if available and `uv run md-babel-py run ` for executable markdown snippets. -- [x] 7.8 Manually run `dimos run demo-h264-video-e2e --daemon`, inspect logs/probe status, query the generated memory2 store for encoded observations, replay the stream through an explicit decoder, and verify sequence-gap recovery behavior. diff --git a/openspec/config.yaml b/openspec/config.yaml deleted file mode 100644 index 62a72bba63..0000000000 --- a/openspec/config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -schema: dimos-capability - -context: | - DimOS is a robotics operating system for generalist robots. Modules communicate - through typed streams (`In[T]`, `Out[T]`) over LCM, SHM, ROS, DDS, or other - transports. Blueprints compose modules into runnable robot stacks. Skills are - `@skill`-annotated RPC methods exposed to agents and MCP clients. - - Terminology boundary: - - "OpenSpec spec" means a behavior specification under `openspec/specs/`. 
- - "DimOS Spec" means a Python Protocol/RPC contract in `*_spec.py` files, - usually inheriting `dimos.spec.utils.Spec` and `typing.Protocol`. - Keep these separate. OpenSpec specs describe observable behavior; DimOS Specs - describe code-level module interfaces. - - OpenSpec specs should capture current behavior, user/developer-visible - outcomes, public CLI/API/tool surfaces, robot safety constraints, and testable - scenarios. Put implementation choices, class names, module wiring, generated - registry updates, and rollout details in `design.md` or `tasks.md`. - - Documentation lives in: - - `docs/usage/` for user-facing concepts and APIs. - - `docs/capabilities/` for capability and platform guides. - - `docs/development/` for contributor process. - - `docs/coding-agents/` and `AGENTS.md` for coding-agent guidance. - -rules: - proposal: - - "Identify affected DimOS surfaces: modules, streams, blueprints, CLI, skills/MCP, docs, hardware, simulation, replay, or generated registries." - - Use capability names that match behavior domains, not Python class names. - - Mark hardware safety or public API/CLI changes explicitly. - specs: - - Write behavior-first requirements; avoid implementation detail unless it is externally observable. - - Every requirement must include at least one `#### Scenario:` block with concrete observable outcomes. - - Use "OpenSpec capability spec" when prose might otherwise be confused with DimOS Python `Spec` Protocols. - design: - - Call out DimOS `Spec` Protocols, adapter Protocols, blueprint composition, stream names/types, and skill/MCP exposure when relevant. - - Mention generated files and required regeneration commands, especially `pytest dimos/robot/test_all_blueprints_generation.py` for blueprint registry changes. - - Include hardware/simulation/replay assumptions and safety constraints for robot-facing work. - docs: - - List user-facing docs, contributor docs, coding-agent docs, and AGENTS.md updates required by the change. 
- - Include documentation validation commands for changed docs, such as `doclinks` and `md-babel-py run ` where applicable. - tasks: - - Include verification tasks for OpenSpec validation, relevant pytest targets, type checks when needed, and manual QA through the user-facing surface. - - Add registry generation tasks when blueprint names, module classes, or generated registry inputs change. diff --git a/openspec/schemas/dimos-capability/schema.yaml b/openspec/schemas/dimos-capability/schema.yaml deleted file mode 100644 index fedb7964ee..0000000000 --- a/openspec/schemas/dimos-capability/schema.yaml +++ /dev/null @@ -1,128 +0,0 @@ -name: dimos-capability -version: 1 -description: DimOS capability workflow - proposal → specs/design/docs → tasks -artifacts: - - id: proposal - generates: proposal.md - description: DimOS change proposal covering intent, scope, capability impact, and affected robot/software surfaces - template: proposal.md - instruction: | - Create the proposal document that establishes WHY this change is needed and what DimOS behavior it affects. - - Sections: - - **Why**: 1-2 concise paragraphs on the problem or opportunity. Explain why the change matters now. - - **What Changes**: Bullet list of added, modified, or removed behavior. Mark public API/CLI or hardware-safety breaking changes with **BREAKING**. - - **Affected DimOS Surfaces**: Identify modules, streams, blueprints, CLI commands, skills/MCP tools, docs, hardware, simulation, replay, generated registries, or external protocols touched by the change. - - **Capabilities**: Identify which OpenSpec capability specs will be created or modified: - - **New Capabilities**: List behavior domains introduced by the change. Each becomes `specs//spec.md`. Use kebab-case names (for example, `agent-skills-mcp`, `blueprint-composition`, `manipulation-stack`). - - **Modified Capabilities**: List existing `openspec/specs//` entries whose requirements change. 
Only include spec-level behavior changes, not implementation-only refactors. - - **Impact**: Summarize user/developer impact, compatibility risks, dependency changes, documentation updates, and test/QA scope. - - Keep proposals concise. Do not include line-by-line implementation details; put architecture and rollout decisions in `design.md`. - requires: [] - - id: specs - generates: specs/**/*.md - description: Behavior-first OpenSpec capability delta specifications - template: spec.md - instruction: | - Create OpenSpec capability specs that define WHAT DimOS should do, not how it is implemented. - - Create one delta spec file per capability listed in proposal.md: - - New capabilities: use `specs//spec.md` with the exact kebab-case name from the proposal. - - Modified capabilities: use the existing folder from `openspec/specs//`. - - Use these delta sections as `##` headers: - - **ADDED Requirements**: New externally observable behavior. - - **MODIFIED Requirements**: Changed behavior. Include the full updated requirement block, not a partial patch. - - **REMOVED Requirements**: Deprecated behavior. Include **Reason** and **Migration**. - - **RENAMED Requirements**: Name-only changes. Use FROM:/TO: format. - - Requirement format: - - Use `### Requirement: `. - - Use SHALL/MUST for normative requirements. - - Include at least one `#### Scenario: ` per requirement. Scenario headings MUST use exactly four `#` characters. - - Prefer `- **GIVEN**`, `- **WHEN**`, `- **THEN**`, and `- **AND**` bullets. - - Cover happy path plus meaningful edge/error/safety cases. - - DimOS-specific guidance: - - Specify user/developer-visible behavior, robot outcomes, CLI behavior, skill/MCP tool behavior, stream contracts, safety constraints, and compatibility expectations. - - Avoid Python class names, private module internals, transport implementation choices, and generated-file details unless those details are observable API contracts. 
- - Use "OpenSpec capability spec" in prose when needed to avoid confusion with DimOS Python `Spec` Protocols. - - If the behavior only changes implementation and not observable requirements, do not create a spec delta. - requires: - - proposal - - id: design - generates: design.md - description: DimOS technical design and architecture decisions - template: design.md - instruction: | - Create the design document that explains HOW the change should be implemented in DimOS. - - Include design.md for cross-module changes, new robot/hardware integration, new public interfaces, new dependencies, safety-sensitive behavior, generated registry changes, or unclear architecture. - - Sections: - - **Context**: Current state, relevant modules/blueprints/docs, and constraints. - - **Goals / Non-Goals**: What the design achieves and explicitly excludes. - - **DimOS Architecture**: Modules, streams, transports, blueprints, RPC/module refs, DimOS `Spec` Protocols, adapter Protocols, skills/MCP exposure, CLI entry points, and generated registries involved. - - **Decisions**: Key choices with rationale and alternatives considered. - - **Safety / Simulation / Replay**: Hardware assumptions, sim/replay behavior, safety constraints, and manual QA surface. - - **Risks / Trade-offs**: Known risks and mitigations. - - **Migration / Rollout**: Compatibility, generated files, docs, and deployment steps. - - **Open Questions**: Outstanding decisions or unknowns. - - Reference proposal.md for intent and specs for behavior. Keep line-by-line work in tasks.md. - requires: - - proposal - - id: docs - generates: docs.md - description: Documentation impact plan for user, contributor, and coding-agent docs - template: docs.md - instruction: | - Create the documentation impact plan for the change. - - Sections: - - **User-Facing Docs**: Updates under `docs/usage/`, `docs/capabilities/`, `docs/platforms/`, or README files. - - **Contributor Docs**: Updates under `docs/development/`. 
- - **Coding-Agent Docs**: Updates under `docs/coding-agents/` or `AGENTS.md`. - - **Doc Validation**: Commands needed for changed docs, such as `doclinks`, `md-babel-py run `, and `bin/gen-diagrams`. - - **No Docs Needed**: If no docs are needed, explain why. - - Match `docs/development/writing_docs.md`: contributor-only docs belong in `docs/development`; user-facing behavior belongs in `docs/usage` or `docs/capabilities`. - requires: - - proposal - - id: tasks - generates: tasks.md - description: Implementation, validation, docs, and manual-QA checklist - template: tasks.md - instruction: | - Create the implementation checklist. The apply phase parses checkbox format, so every actionable task MUST use `- [ ]`. - - Guidelines: - - Group tasks under numbered `##` headings. - - Each task must be `- [ ] X.Y Task description`. - - Keep tasks small enough to complete in one focused session. - - Order tasks by dependency. - - Include docs and validation tasks from docs.md. - - Include generated registry tasks when blueprints or module registry inputs change. - - Include manual QA through the actual user surface: CLI, TUI, HTTP API, MCP tool, simulation/replay blueprint, hardware procedure, or library driver. - - Typical DimOS validation tasks: - - Run `openspec validate `. - - Run focused pytest targets for changed modules. - - Run `pytest dimos/robot/test_all_blueprints_generation.py` when blueprint registry output may change. - - Run docs validation commands for changed docs. - - Run lints/types when the touched area requires them. - - Reference specs for WHAT, design for HOW, and docs.md for documentation work. - requires: - - specs - - design - - docs -apply: - requires: - - tasks - tracks: tasks.md - instruction: | - Read proposal.md, specs, design.md, docs.md, and tasks.md before editing code. - Work through pending tasks, mark checkboxes complete as they finish, and keep artifacts current when implementation changes the plan. 
- Verify with OpenSpec validation, focused tests, docs checks, and manual QA through the relevant DimOS surface. diff --git a/openspec/schemas/dimos-capability/templates/design.md b/openspec/schemas/dimos-capability/templates/design.md deleted file mode 100644 index 25031ceb8b..0000000000 --- a/openspec/schemas/dimos-capability/templates/design.md +++ /dev/null @@ -1,35 +0,0 @@ -## Context - - - -## Goals / Non-Goals - -**Goals:** - - -**Non-Goals:** - - -## DimOS Architecture - - - -## Decisions - - - -## Safety / Simulation / Replay - - - -## Risks / Trade-offs - - - -## Migration / Rollout - - - -## Open Questions - - diff --git a/openspec/schemas/dimos-capability/templates/docs.md b/openspec/schemas/dimos-capability/templates/docs.md deleted file mode 100644 index d274aed653..0000000000 --- a/openspec/schemas/dimos-capability/templates/docs.md +++ /dev/null @@ -1,19 +0,0 @@ -## User-Facing Docs - - - -## Contributor Docs - - - -## Coding-Agent Docs - - - -## Doc Validation - - - -## No Docs Needed - - diff --git a/openspec/schemas/dimos-capability/templates/proposal.md b/openspec/schemas/dimos-capability/templates/proposal.md deleted file mode 100644 index 98d409e8de..0000000000 --- a/openspec/schemas/dimos-capability/templates/proposal.md +++ /dev/null @@ -1,32 +0,0 @@ -## Why - - - -## What Changes - - - -## Affected DimOS Surfaces - - -- Modules/streams: -- Blueprints/CLI: -- Skills/MCP: -- Hardware/simulation/replay: -- Docs/generated registries: - -## Capabilities - -### New Capabilities - -- ``: - -### Modified Capabilities - -- ``: - -## Impact - - diff --git a/openspec/schemas/dimos-capability/templates/spec.md b/openspec/schemas/dimos-capability/templates/spec.md deleted file mode 100644 index afc0c1ff58..0000000000 --- a/openspec/schemas/dimos-capability/templates/spec.md +++ /dev/null @@ -1,16 +0,0 @@ -## ADDED Requirements - -### Requirement: - - -#### Scenario: -- **GIVEN** -- **WHEN** -- **THEN** -- **AND** - - diff --git 
a/openspec/schemas/dimos-capability/templates/tasks.md b/openspec/schemas/dimos-capability/templates/tasks.md deleted file mode 100644 index b38fcdfabb..0000000000 --- a/openspec/schemas/dimos-capability/templates/tasks.md +++ /dev/null @@ -1,15 +0,0 @@ -## 1. Implementation - -- [ ] 1.1 -- [ ] 1.2 - -## 2. Documentation - -- [ ] 2.1 - -## 3. Verification - -- [ ] 3.1 Run `openspec validate ` -- [ ] 3.2 Run focused tests for changed code -- [ ] 3.3 Run docs validation commands for changed docs -- [ ] 3.4 Manually QA through the relevant DimOS surface (CLI, MCP, simulation/replay, hardware procedure, HTTP API, or library driver) From 91ebd3b84ee93328126664a3cb21f5e92f6fa7fb Mon Sep 17 00:00:00 2001 From: cc Date: Fri, 12 Jun 2026 13:27:08 -0700 Subject: [PATCH 09/14] doc: revert --- docs/coding-agents/index.md | 1 - docs/development/openspec.md | 102 ----------------------------------- docs/docs.json | 1 - 3 files changed, 104 deletions(-) delete mode 100644 docs/development/openspec.md diff --git a/docs/coding-agents/index.md b/docs/coding-agents/index.md index b50e0bb0cd..d888e67e52 100644 --- a/docs/coding-agents/index.md +++ b/docs/coding-agents/index.md @@ -4,7 +4,6 @@ ├── style.md (code style guidelines for dimos) ├── code-quality-rules.md (code-quality rules agents scan/fix against) ├── testing.md (docs about writing tests) -├── ../development/openspec.md (OpenSpec behavior-spec workflow) ├── docs (these are docs about writing docs) │   ├── codeblocks.md │   ├── doclinks.md diff --git a/docs/development/openspec.md b/docs/development/openspec.md deleted file mode 100644 index 280eb0f57e..0000000000 --- a/docs/development/openspec.md +++ /dev/null @@ -1,102 +0,0 @@ -# OpenSpec Workflow - -DimOS uses OpenSpec as the checked-in planning layer for behavior changes. OpenSpec artifacts live under `openspec/` and should describe what the system is supposed to do, why it is changing, and how contributors or agents should validate the work. 
- -## Terminology - -Keep these two meanings separate: - -- **OpenSpec capability spec**: Markdown requirements under `openspec/specs//spec.md`. These describe observable behavior and acceptance scenarios. -- **DimOS Spec**: Python Protocol/RPC contracts in files like `dimos/navigation/navigation_spec.py` or `dimos/manipulation/control/arm_driver_spec.py`. These describe module interfaces for code wiring. - -Use "OpenSpec capability spec" in prose when there is any chance of confusion. - -## Schema - -The project uses the `dimos-capability` schema configured in `openspec/config.yaml`. - -The artifact flow is: - -```text -proposal - ├── specs - ├── design - └── docs - └── tasks -``` - -| Artifact | Purpose | -|---|---| -| `proposal.md` | Intent, scope, affected DimOS surfaces, and capability impact. | -| `specs//spec.md` | Behavior-first requirements and scenarios. | -| `design.md` | Module, stream, blueprint, skill/MCP, safety, and rollout decisions. | -| `docs.md` | Documentation impact and doc validation plan. | -| `tasks.md` | Implementation, docs, verification, and manual QA checklist. | - -## When to create a change - -Create an OpenSpec change when work changes observable behavior, public CLI/API/MCP behavior, robot behavior, hardware/simulation/replay workflows, docs that users rely on, or cross-module architecture. - -Do not create a change for a purely mechanical refactor, typo fix, or internal cleanup unless it changes behavior or needs cross-session planning context. - -## Writing specs - -OpenSpec capability specs are behavior contracts, not implementation plans. - -Good spec content: - -- User- or developer-visible behavior. -- Public CLI/API/MCP tool behavior. -- Stream or message behavior that downstream modules rely on. -- Robot safety constraints and hardware/simulation/replay expectations. -- Scenarios that can be tested or manually verified. - -Avoid in specs: - -- Private class/function names. -- Generated-file mechanics. 
-- Library choices and wiring details. -- Step-by-step implementation tasks. - -Put those details in `design.md` or `tasks.md`. - -## Capability names - -Prefer behavior-domain names over code names. Useful starting points: - -- `module-system` -- `blueprint-composition` -- `cli-lifecycle` -- `agent-skills-mcp` -- `configuration` -- `navigation-stack` -- `manipulation-stack` -- `hardware-adapters` -- `simulation-replay` -- `documentation-system` - -Add specs progressively as changes need them. Do not try to backfill the whole project at once. - -## Validation - -Use OpenSpec validation before implementation and before archiving: - -```bash skip -openspec schema validate dimos-capability -openspec validate -openspec templates --json -``` - -For documentation changes, also run the relevant doc checks from [Writing Docs](/docs/development/writing_docs.md): - -```bash skip -md-babel-py run -``` - -When a change touches blueprint names, module-level blueprint variables, or module registry inputs, run: - -```bash skip -pytest dimos/robot/test_all_blueprints_generation.py -``` - -Then run focused tests for the changed code and manually QA through the actual surface: CLI command, MCP tool, HTTP API, simulation/replay blueprint, hardware procedure, or library driver. 
diff --git a/docs/docs.json b/docs/docs.json index 164fdd43c1..d18faa7bcf 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -151,7 +151,6 @@ "group": "Development", "pages": [ "development/conventions", - "development/openspec", "development/testing", "development/docker", "development/grid_testing", From f02984a9faa2e2c50b753b585bdbbe0455ef04d8 Mon Sep 17 00:00:00 2001 From: cc Date: Fri, 12 Jun 2026 14:00:20 -0700 Subject: [PATCH 10/14] fix: narrow raw image access for mypy --- dimos/core/transport.py | 2 +- dimos/experimental/security_demo/depth_estimator.py | 2 +- dimos/experimental/security_demo/security_module.py | 4 ++-- dimos/mapping/occupancy/visualize_path.py | 2 +- dimos/mapping/osm/current_location_map.py | 5 +++-- dimos/memory2/vis/utils.py | 2 +- dimos/models/vl/florence.py | 7 +++++-- dimos/models/vl/moondream.py | 2 +- dimos/models/vl/moondream_hosted.py | 2 +- dimos/msgs/sensor_msgs/Image.py | 4 +++- dimos/msgs/sensor_msgs/PointCloud2.py | 4 ++-- dimos/perception/common/utils.py | 2 +- .../perception/experimental/temporal_memory/clip_filter.py | 2 +- .../experimental/temporal_memory/temporal_utils/helpers.py | 4 +++- dimos/perception/object_tracker_2d.py | 2 +- dimos/perception/spatial_perception.py | 2 +- dimos/protocol/video/demo_h264_video_e2e.py | 2 +- dimos/protocol/video/h264.py | 4 ++-- dimos/robot/drone/camera_module.py | 2 +- dimos/robot/drone/drone_tracking_module.py | 2 +- dimos/teleop/quest_hosted/video_track.py | 4 +++- 21 files changed, 36 insertions(+), 26 deletions(-) diff --git a/dimos/core/transport.py b/dimos/core/transport.py index 2a57f7e030..19aedc6827 100644 --- a/dimos/core/transport.py +++ b/dimos/core/transport.py @@ -169,7 +169,7 @@ def __init__( type: type, config: Any | None = None, decode_images: bool = True, - **kwargs, + **kwargs: Any, ) -> None: # type: ignore[no-untyped-def] from dimos.protocol.pubsub.impl.h264_lcm import H264LCM from dimos.protocol.video.h264 import H264Config diff --git 
a/dimos/experimental/security_demo/depth_estimator.py b/dimos/experimental/security_demo/depth_estimator.py index 5737d3f006..55abb4b6a8 100644 --- a/dimos/experimental/security_demo/depth_estimator.py +++ b/dimos/experimental/security_demo/depth_estimator.py @@ -84,7 +84,7 @@ def _loop(self) -> None: def _process(self, image: Image) -> None: rgb = image.to_rgb() - pil_image = PILImage.fromarray(rgb.data) + pil_image = PILImage.fromarray(rgb.require_raw("DepthEstimator._process")) if pil_image.width > _DEPTH_MAX_WIDTH: scale = _DEPTH_MAX_WIDTH / pil_image.width new_h = int(pil_image.height * scale) diff --git a/dimos/experimental/security_demo/security_module.py b/dimos/experimental/security_demo/security_module.py index 9569227805..dbc9c2dc5c 100644 --- a/dimos/experimental/security_demo/security_module.py +++ b/dimos/experimental/security_demo/security_module.py @@ -299,7 +299,7 @@ def _patrol_step(self) -> None: ) annotated = draw_bounding_box( - image.data.copy(), + image.require_raw("SecurityModule._detection_step").copy(), list(best.bbox), label=best.name, confidence=best.confidence, @@ -340,7 +340,7 @@ def _follow_step(self) -> None: twist = self._visual_servo.compute_twist(best.bbox, latest_image.width) self.cmd_vel.publish(twist) - overlay = latest_image.data.copy() + overlay = latest_image.require_raw("SecurityModule._follow_step").copy() if hasattr(best, "mask") and best.mask is not None: mask_bool = best.mask > 0 green = np.zeros_like(overlay) diff --git a/dimos/mapping/occupancy/visualize_path.py b/dimos/mapping/occupancy/visualize_path.py index 89dcf83067..41b19e5686 100644 --- a/dimos/mapping/occupancy/visualize_path.py +++ b/dimos/mapping/occupancy/visualize_path.py @@ -30,7 +30,7 @@ def visualize_path( scale: int = 8, ) -> Image: image = visualize_occupancy_grid(occupancy_grid, "rainbow") - bgr = image.data + bgr = image.require_raw("visualize_path") bgr = cv2.resize( bgr, diff --git a/dimos/mapping/osm/current_location_map.py 
b/dimos/mapping/osm/current_location_map.py index d573370a06..d723e1e2ff 100644 --- a/dimos/mapping/osm/current_location_map.py +++ b/dimos/mapping/osm/current_location_map.py @@ -74,7 +74,8 @@ def _fetch_new_map(self) -> None: assert self._map_image is not None assert self._position is not None - pil_image = PILImage.fromarray(self._map_image.image.data) + map_data = self._map_image.image.require_raw("CurrentLocationMap._fetch_new_map") + pil_image = PILImage.fromarray(map_data) draw = ImageDraw.Draw(pil_image) x, y = self._map_image.latlon_to_pixel(self._position) radius = 20 @@ -85,7 +86,7 @@ def _fetch_new_map(self) -> None: width=3, ) - self._map_image.image.data[:] = np.array(pil_image) + map_data[:] = np.array(pil_image) def _position_is_too_far_off_center(self) -> bool: x, y = self._map_image.latlon_to_pixel(self._position) # type: ignore[arg-type, union-attr] diff --git a/dimos/memory2/vis/utils.py b/dimos/memory2/vis/utils.py index fee6f66057..ff8f90fbed 100644 --- a/dimos/memory2/vis/utils.py +++ b/dimos/memory2/vis/utils.py @@ -65,7 +65,7 @@ def mosaic( canvas = np.zeros((rows * cell_height, cols * cell_w, 3), dtype=np.uint8) for i, img in enumerate(images): r, c = divmod(i, cols) - tile = cv2.resize(img.to_bgr().data, (cell_w, cell_height)) + tile = cv2.resize(img.to_bgr().require_raw("mosaic_observations"), (cell_w, cell_height)) canvas[r * cell_height : (r + 1) * cell_height, c * cell_w : (c + 1) * cell_w] = tile result = Image(data=canvas, format=ImageFormat.BGR) diff --git a/dimos/models/vl/florence.py b/dimos/models/vl/florence.py index 8e964bb85f..b1aa56bc16 100644 --- a/dimos/models/vl/florence.py +++ b/dimos/models/vl/florence.py @@ -98,7 +98,7 @@ def caption(self, image: Image, detail: str | CaptionDetail | None = None) -> st task_prompt = CaptionDetail.from_str(detail).value # Convert to PIL - pil_image = PILImage.fromarray(image.to_rgb().data) + pil_image = PILImage.fromarray(image.to_rgb().require_raw("Florence2Model.caption")) # Process 
inputs inputs = self._processor(text=task_prompt, images=pil_image, return_tensors="pt") @@ -137,7 +137,10 @@ def caption_batch(self, *images: Image) -> list[str]: task_prompt = self._task_prompt # Convert all to PIL - pil_images = [PILImage.fromarray(img.to_rgb().data) for img in images] + pil_images = [ + PILImage.fromarray(img.to_rgb().require_raw("Florence2Model.caption_batch")) + for img in images + ] # Process batch inputs = self._processor( diff --git a/dimos/models/vl/moondream.py b/dimos/models/vl/moondream.py index e3cfe744ce..b7c1a0cc25 100644 --- a/dimos/models/vl/moondream.py +++ b/dimos/models/vl/moondream.py @@ -67,7 +67,7 @@ def _to_pil(self, image: Image | np.ndarray[Any, Any]) -> PILImage.Image: image, _ = self._prepare_image(image) rgb_image = image.to_rgb() - return PILImage.fromarray(rgb_image.data) + return PILImage.fromarray(rgb_image.require_raw("MoondreamVlModel._to_pil")) def query(self, image: Image | np.ndarray, query: str, **kwargs) -> str: # type: ignore[no-untyped-def] pil_image = self._to_pil(image) diff --git a/dimos/models/vl/moondream_hosted.py b/dimos/models/vl/moondream_hosted.py index 76e55451a1..2a6f81977d 100644 --- a/dimos/models/vl/moondream_hosted.py +++ b/dimos/models/vl/moondream_hosted.py @@ -54,7 +54,7 @@ def _to_pil_image(self, image: Image | np.ndarray) -> PILImage.Image: image = Image.from_numpy(image) rgb_image = image.to_rgb() - return PILImage.fromarray(rgb_image.data) + return PILImage.fromarray(rgb_image.require_raw("MoondreamHostedVlModel._to_pil_image")) def query(self, image: Image | np.ndarray, query: str, **kwargs) -> str: # type: ignore[no-untyped-def] pil_image = self._to_pil_image(image) diff --git a/dimos/msgs/sensor_msgs/Image.py b/dimos/msgs/sensor_msgs/Image.py index e5d4ba1b02..30c7aeb158 100644 --- a/dimos/msgs/sensor_msgs/Image.py +++ b/dimos/msgs/sensor_msgs/Image.py @@ -248,12 +248,14 @@ def shape(self) -> tuple[int, ...]: @property def dtype(self) -> np.dtype[Any]: if self.is_encoded: - return 
np.dtype(self.codec_metadata.get("dtype", "uint8")) + return np.dtype(str(self.codec_metadata.get("dtype", "uint8"))) return self.require_raw("dtype").dtype def copy(self) -> Image: data: np.ndarray[Any, np.dtype[Any]] | bytes if self.is_encoded: + if not isinstance(self.data, bytes): + raise ValueError("Encoded Image payload must be bytes") data = bytes(self.data) else: data = self.require_raw("copy").copy() diff --git a/dimos/msgs/sensor_msgs/PointCloud2.py b/dimos/msgs/sensor_msgs/PointCloud2.py index ae30c41711..46396ac15d 100644 --- a/dimos/msgs/sensor_msgs/PointCloud2.py +++ b/dimos/msgs/sensor_msgs/PointCloud2.py @@ -223,13 +223,13 @@ def from_rgbd( PointCloud2 instance with colored points """ # Get color as RGB numpy array - color_data = color_image.to_rgb().data + color_data = color_image.to_rgb().require_raw("PointCloud2.from_rgbd color") if hasattr(color_data, "get"): # CuPy array color_data = color_data.get() color_data = np.ascontiguousarray(color_data) # Get depth numpy array - depth_data = depth_image.data + depth_data = depth_image.require_raw("PointCloud2.from_rgbd depth") if hasattr(depth_data, "get"): # CuPy array depth_data = depth_data.get() diff --git a/dimos/perception/common/utils.py b/dimos/perception/common/utils.py index 20ffc2a254..f8e1a0a824 100644 --- a/dimos/perception/common/utils.py +++ b/dimos/perception/common/utils.py @@ -195,7 +195,7 @@ def rectify_image(image: Image, camera_matrix: np.ndarray, dist_coeffs: np.ndarr Returns an Image with numpy or cupy data depending on caller choice. 
""" - rect = cv2.undistort(image.data, camera_matrix, dist_coeffs) + rect = cv2.undistort(image.require_raw("rectify_image"), camera_matrix, dist_coeffs) return Image(data=rect, format=image.format, frame_id=image.frame_id, ts=image.ts) diff --git a/dimos/perception/experimental/temporal_memory/clip_filter.py b/dimos/perception/experimental/temporal_memory/clip_filter.py index 6ef7859e17..63cf9802ae 100644 --- a/dimos/perception/experimental/temporal_memory/clip_filter.py +++ b/dimos/perception/experimental/temporal_memory/clip_filter.py @@ -38,7 +38,7 @@ def _get_image_data(image: Image) -> np.ndarray[Any, Any]: """Extract numpy array from Image.""" if not hasattr(image, "data"): raise AttributeError(f"Image missing .data attribute: {type(image)}") - return image.data + return image.require_raw("_get_image_data") if CLIP_AVAILABLE: diff --git a/dimos/perception/experimental/temporal_memory/temporal_utils/helpers.py b/dimos/perception/experimental/temporal_memory/temporal_utils/helpers.py index 88ddee1157..d6e95e20d8 100644 --- a/dimos/perception/experimental/temporal_memory/temporal_utils/helpers.py +++ b/dimos/perception/experimental/temporal_memory/temporal_utils/helpers.py @@ -70,5 +70,7 @@ def is_scene_stale(frames: list["Frame"], stale_threshold: float = 5.0) -> bool: return False if not hasattr(first_img, "data") or not hasattr(last_img, "data"): return False - diff = np.abs(first_img.data.astype(float) - last_img.data.astype(float)) + first_data = first_img.require_raw("is_scene_stale first frame") + last_data = last_img.require_raw("is_scene_stale last frame") + diff = np.abs(first_data.astype(float) - last_data.astype(float)) return bool(diff.mean() < stale_threshold) diff --git a/dimos/perception/object_tracker_2d.py b/dimos/perception/object_tracker_2d.py index 653c519054..d527e025ce 100644 --- a/dimos/perception/object_tracker_2d.py +++ b/dimos/perception/object_tracker_2d.py @@ -93,7 +93,7 @@ def start(self) -> None: def on_frame(frame_msg: Image) -> 
None: arrival_time = time.perf_counter() with self._frame_lock: - self._latest_rgb_frame = frame_msg.data + self._latest_rgb_frame = frame_msg.require_raw("ObjectTracker2D.on_frame") self._frame_arrival_time = arrival_time unsub = self.color_image.subscribe(on_frame) diff --git a/dimos/perception/spatial_perception.py b/dimos/perception/spatial_perception.py index 4d1f1377f3..4b3f601a1c 100644 --- a/dimos/perception/spatial_perception.py +++ b/dimos/perception/spatial_perception.py @@ -190,7 +190,7 @@ def start(self) -> None: def set_video(image_msg: Image) -> None: # Convert Image message to numpy array if hasattr(image_msg, "data"): - frame = image_msg.data + frame = image_msg.require_raw("SpatialMemory.set_video") frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) self._latest_video_frame = frame else: diff --git a/dimos/protocol/video/demo_h264_video_e2e.py b/dimos/protocol/video/demo_h264_video_e2e.py index 6b5f73003c..14f0285024 100644 --- a/dimos/protocol/video/demo_h264_video_e2e.py +++ b/dimos/protocol/video/demo_h264_video_e2e.py @@ -153,7 +153,7 @@ def start(self) -> None: for name, port in recorder.inputs.items(): stream: Stream[Image] - h264_streams = getattr(self, "h264_streams", frozenset()) + h264_streams: frozenset[str] = getattr(self, "h264_streams", frozenset()) if name in h264_streams: stream = recorder.store.stream(name, port.type, codec="h264") else: diff --git a/dimos/protocol/video/h264.py b/dimos/protocol/video/h264.py index 187c1c1772..9688f125c4 100644 --- a/dimos/protocol/video/h264.py +++ b/dimos/protocol/video/h264.py @@ -17,7 +17,7 @@ from collections.abc import Callable, Sequence from dataclasses import dataclass, field from fractions import Fraction -from typing import TYPE_CHECKING, Any, Protocol +from typing import TYPE_CHECKING, Any, Protocol, cast import numpy as np @@ -184,7 +184,7 @@ def decode_image(self, image: Image) -> Image: decoded_frames = self._decoder.decode(frame) if not decoded_frames: raise VideoDecodeGapError("H.264 
decoder produced no frame") - return self._from_video_frame(decoded_frames[0], image) + return self._from_video_frame(cast("av.VideoFrame", decoded_frames[0]), image) def _to_video_frame(self, image: Image) -> av.VideoFrame: fmt = _av_input_format(image.format) diff --git a/dimos/robot/drone/camera_module.py b/dimos/robot/drone/camera_module.py index b77c597980..72c77fe2ee 100644 --- a/dimos/robot/drone/camera_module.py +++ b/dimos/robot/drone/camera_module.py @@ -138,7 +138,7 @@ def _processing_loop(self) -> None: self._latest_frame = None # Get numpy array from Image - img_array = frame.data + img_array = frame.require_raw("DroneCameraModule._process_frames") # Create header header = Header(self.camera_frame_id) diff --git a/dimos/robot/drone/drone_tracking_module.py b/dimos/robot/drone/drone_tracking_module.py index 277ecc509e..846faf26fe 100644 --- a/dimos/robot/drone/drone_tracking_module.py +++ b/dimos/robot/drone/drone_tracking_module.py @@ -115,7 +115,7 @@ def _get_latest_frame(self) -> np.ndarray[Any, np.dtype[Any]] | None: if self._latest_frame is None: return None # Convert Image to numpy array - data: np.ndarray[Any, np.dtype[Any]] = self._latest_frame.data + data = self._latest_frame.require_raw("DroneTrackingModule._get_latest_frame") return data @rpc diff --git a/dimos/teleop/quest_hosted/video_track.py b/dimos/teleop/quest_hosted/video_track.py index 2a17c3c39a..07ad3a2846 100644 --- a/dimos/teleop/quest_hosted/video_track.py +++ b/dimos/teleop/quest_hosted/video_track.py @@ -104,7 +104,9 @@ async def recv(self) -> av.VideoFrame: self._first_mono = now pts = int((now - self._first_mono) * VIDEO_CLOCK_RATE) - frame = av.VideoFrame.from_ndarray(img.data, format=_AV_FORMAT_MAP.get(img.format, "bgr24")) + frame = av.VideoFrame.from_ndarray( + img.require_raw("CameraVideoTrack.recv"), format=_AV_FORMAT_MAP.get(img.format, "bgr24") + ) frame.pts = pts frame.time_base = VIDEO_TIME_BASE return frame From 0588e346501b4516ee5fda69078cf7e50165a709 Mon Sep 17 
00:00:00 2001 From: cc Date: Fri, 12 Jun 2026 14:13:06 -0700 Subject: [PATCH 11/14] fix: align h264 tests with ci constraints --- dimos/memory2/video/__init__.py | 15 --------------- dimos/memory2/video/test_h264_storage.py | 22 ++++++++++++++++++---- dimos/protocol/video/__init__.py | 15 --------------- 3 files changed, 18 insertions(+), 34 deletions(-) delete mode 100644 dimos/memory2/video/__init__.py delete mode 100644 dimos/protocol/video/__init__.py diff --git a/dimos/memory2/video/__init__.py b/dimos/memory2/video/__init__.py deleted file mode 100644 index 86e17cecb4..0000000000 --- a/dimos/memory2/video/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright 2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Video storage helpers for memory2.""" diff --git a/dimos/memory2/video/test_h264_storage.py b/dimos/memory2/video/test_h264_storage.py index 044f8bb2d0..ab335dc260 100644 --- a/dimos/memory2/video/test_h264_storage.py +++ b/dimos/memory2/video/test_h264_storage.py @@ -14,15 +14,21 @@ from __future__ import annotations +from pathlib import Path +import platform + import numpy as np import pytest +from dimos.memory2.backend import Backend from dimos.memory2.codecs.base import codec_from_id, codec_id from dimos.memory2.codecs.jpeg import JpegCodec from dimos.memory2.store.sqlite import SqliteStore from dimos.memory2.video.h264 import H264ImageCodec from dimos.msgs.sensor_msgs.Image import H264_IMAGE_ENCODING, Image, ImageFormat +_SKIP_SQLITE_VEC = platform.machine() == "aarch64" or platform.system() == "Darwin" + def _raw_image(seq: int, fmt: ImageFormat = ImageFormat.RGB) -> Image: data = np.full((2, 2, 3), seq, dtype=np.uint8) @@ -80,7 +86,9 @@ def test_codec_id_and_factory_support_h264_for_image() -> None: assert isinstance(codec_from_id("h264", "dimos.msgs.sensor_msgs.Image.Image"), H264ImageCodec) -def test_h264_stream_stores_encoded_images_with_normal_backend(tmp_path) -> None: +def test_h264_stream_stores_encoded_images_with_normal_backend(tmp_path: Path) -> None: + if _SKIP_SQLITE_VEC: + pytest.skip("sqlite-vec extension not loadable here") db = tmp_path / "h264.db" with SqliteStore(path=str(db)) as store: stream = store.stream("cam", Image, codec="h264") @@ -96,7 +104,9 @@ def test_h264_stream_stores_encoded_images_with_normal_backend(tmp_path) -> None assert obs.data.width == 2 -def test_h264_replay_emits_encoded_images(tmp_path) -> None: +def test_h264_replay_emits_encoded_images(tmp_path: Path) -> None: + if _SKIP_SQLITE_VEC: + pytest.skip("sqlite-vec extension not loadable here") store = SqliteStore(path=str(tmp_path / "replay.db")) stream = store.stream("cam", Image, codec="h264") stream.append(_encoded_image(1), ts=1.0) @@ -108,12 +118,16 @@ def 
test_h264_replay_emits_encoded_images(tmp_path) -> None: assert [image.codec_metadata["seq"] for image in replayed] == [1, 2] -def test_default_image_stream_still_uses_jpeg_codec(tmp_path) -> None: +def test_default_image_stream_still_uses_jpeg_codec(tmp_path: Path) -> None: + if _SKIP_SQLITE_VEC: + pytest.skip("sqlite-vec extension not loadable here") store = SqliteStore(path=str(tmp_path / "jpeg.db")) stream = store.stream("rgb", Image) stream.append(_raw_image(1)) - assert isinstance(stream._source.codec, JpegCodec) + source = stream._source + assert isinstance(source, Backend) + assert isinstance(source.codec, JpegCodec) assert store.stream("rgb").first().data.encoding == "raw" diff --git a/dimos/protocol/video/__init__.py b/dimos/protocol/video/__init__.py deleted file mode 100644 index 4452bdd191..0000000000 --- a/dimos/protocol/video/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright 2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Video codec helpers.""" From 12a423941307062dc6c595a4cfb3b7349b3d9db3 Mon Sep 17 00:00:00 2001 From: cc Date: Fri, 12 Jun 2026 14:23:53 -0700 Subject: [PATCH 12/14] test: avoid turbojpeg dependency in codec assertion --- dimos/memory2/video/test_h264_storage.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dimos/memory2/video/test_h264_storage.py b/dimos/memory2/video/test_h264_storage.py index ab335dc260..74eafb2b7a 100644 --- a/dimos/memory2/video/test_h264_storage.py +++ b/dimos/memory2/video/test_h264_storage.py @@ -123,12 +123,10 @@ def test_default_image_stream_still_uses_jpeg_codec(tmp_path: Path) -> None: pytest.skip("sqlite-vec extension not loadable here") store = SqliteStore(path=str(tmp_path / "jpeg.db")) stream = store.stream("rgb", Image) - stream.append(_raw_image(1)) source = stream._source assert isinstance(source, Backend) assert isinstance(source.codec, JpegCodec) - assert store.stream("rgb").first().data.encoding == "raw" def test_encoded_images_reject_pixel_operations() -> None: From e98edac82584fe4d8f36925e806c65d259d1cfc7 Mon Sep 17 00:00:00 2001 From: cc Date: Fri, 12 Jun 2026 14:42:26 -0700 Subject: [PATCH 13/14] fix: address h264 review comments --- dimos/memory2/codecs/base.py | 7 ++- dimos/memory2/video/h264.py | 2 + dimos/protocol/pubsub/impl/h264_lcm.py | 2 +- dimos/protocol/pubsub/impl/test_h264_lcm.py | 16 +++++- dimos/protocol/video/h264.py | 62 ++++++++++++++++++++- 5 files changed, 83 insertions(+), 6 deletions(-) diff --git a/dimos/memory2/codecs/base.py b/dimos/memory2/codecs/base.py index def8ef41fc..ed9c76f6d2 100644 --- a/dimos/memory2/codecs/base.py +++ b/dimos/memory2/codecs/base.py @@ -75,9 +75,12 @@ def codec_from_id(codec_id_str: str, payload_module: str) -> Codec[Any]: def _class_to_id(codec: Any) -> str: + explicit_id = getattr(codec, "CODEC_ID", None) + if explicit_id is not None: + if not isinstance(explicit_id, str): + raise TypeError(f"Codec CODEC_ID must be str, got {type(explicit_id).__name__}") + 
return explicit_id name = type(codec).__name__ - if name == "H264ImageCodec": - return "h264" if name.endswith("Codec"): return name[:-5].lower() return name.lower() diff --git a/dimos/memory2/video/h264.py b/dimos/memory2/video/h264.py index e3b484ba38..d833cebc2c 100644 --- a/dimos/memory2/video/h264.py +++ b/dimos/memory2/video/h264.py @@ -26,6 +26,8 @@ class H264ImageCodec: for visualization or module consumption. """ + CODEC_ID = "h264" + def encode(self, value: Image) -> bytes: if value.encoding != H264_IMAGE_ENCODING: raise ValueError( diff --git a/dimos/protocol/pubsub/impl/h264_lcm.py b/dimos/protocol/pubsub/impl/h264_lcm.py index 69f8107930..8784e1d93b 100644 --- a/dimos/protocol/pubsub/impl/h264_lcm.py +++ b/dimos/protocol/pubsub/impl/h264_lcm.py @@ -58,7 +58,7 @@ def decode(self, msg: bytes, topic: LCMTopicProto) -> Image: self._decoder = H264Decoder(self.h264_config) try: return self._decoder.decode(image) - except VideoDecodeGapError as exc: + except (VideoDecodeGapError, ValueError) as exc: raise DecodingError(str(exc)) from exc diff --git a/dimos/protocol/pubsub/impl/test_h264_lcm.py b/dimos/protocol/pubsub/impl/test_h264_lcm.py index 9db7b42b4a..3b7987e0e4 100644 --- a/dimos/protocol/pubsub/impl/test_h264_lcm.py +++ b/dimos/protocol/pubsub/impl/test_h264_lcm.py @@ -61,12 +61,15 @@ def encode(self, image: Image) -> Image: class FakeDecoder: - def __init__(self, *, fail: bool = False) -> None: + def __init__(self, *, fail: bool = False, invalid: bool = False) -> None: self.fail = fail + self.invalid = invalid def decode(self, image: Image) -> Image: if self.fail: raise VideoDecodeGapError("waiting for keyframe") + if self.invalid: + raise ValueError("Expected H.264 encoded Image") return Image( data=np.zeros((image.height, image.width, 3), dtype=np.uint8), format=image.format, @@ -153,6 +156,17 @@ def test_h264_lcm_suppresses_decode_gap() -> None: transport.decode(encoded.lcm_encode(), StubTopic("/color", Image)) +def 
test_h264_lcm_suppresses_invalid_h264_image() -> None: + transport = H264LCM() + transport._decoder = FakeDecoder(invalid=True) # type: ignore[assignment] + encoded = FakeEncoder().encode( + Image(data=np.zeros((2, 3, 3), dtype=np.uint8), format=ImageFormat.RGB, frame_id="cam") + ) + + with pytest.raises(DecodingError, match="Expected H.264 encoded Image"): + transport.decode(encoded.lcm_encode(), StubTopic("/color", Image)) + + def test_h264_lcm_suppresses_non_image_payload() -> None: transport = H264LCM() diff --git a/dimos/protocol/video/h264.py b/dimos/protocol/video/h264.py index 9688f125c4..69548f8c57 100644 --- a/dimos/protocol/video/h264.py +++ b/dimos/protocol/video/h264.py @@ -14,7 +14,7 @@ from __future__ import annotations -from collections.abc import Callable, Sequence +from collections.abc import Callable, Iterator, Sequence from dataclasses import dataclass, field from fractions import Fraction from typing import TYPE_CHECKING, Any, Protocol, cast @@ -149,6 +149,7 @@ def __init__(self, config: H264Config | None = None) -> None: self.config = config or H264Config() try: from aiortc.codecs.h264 import ( + MAX_FRAME_RATE, H264Decoder as AiortcDecoder, H264Encoder as AiortcEncoder, h264_depayload, @@ -163,7 +164,52 @@ def __init__(self, config: H264Config | None = None) -> None: self._av = av self._jitter_frame_type = JitterFrame self._depayload = h264_depayload - self._encoder = AiortcEncoder() + + class ConfiguredAiortcEncoder(AiortcEncoder): + def __init__(self, h264_config: H264Config) -> None: + super().__init__() + self._dimos_config = h264_config + + def _encode_frame(self, frame: av.VideoFrame, force_keyframe: bool) -> Iterator[bytes]: + configured_bitrate = self.codec.bit_rate if self.codec else None + if self.codec and ( + frame.width != self.codec.width + or frame.height != self.codec.height + or configured_bitrate is None + or abs(self.target_bitrate - configured_bitrate) / configured_bitrate > 0.1 + ): + self.buffer_data = b"" + 
self.buffer_pts = None + self.codec = None + + if force_keyframe: + frame.pict_type = av.video.frame.PictureType.I + else: + frame.pict_type = av.video.frame.PictureType.NONE + + if self.codec is None: + self.codec = av.CodecContext.create("libx264", "w") + self.codec.width = frame.width + self.codec.height = frame.height + self.codec.bit_rate = self.target_bitrate + self.codec.pix_fmt = self._dimos_config.pixel_format + self.codec.framerate = Fraction(MAX_FRAME_RATE, 1) + self.codec.time_base = Fraction(1, MAX_FRAME_RATE) + self.codec.options = { + "level": "31", + "preset": self._dimos_config.preset, + "tune": self._dimos_config.tune, + } + self.codec.profile = _av_h264_profile(self._dimos_config.profile) + + data_to_send = b"" + for package in self.codec.encode(frame): + data_to_send += bytes(package) + + if data_to_send: + yield from self._split_bitstream(data_to_send) + + self._encoder = ConfiguredAiortcEncoder(self.config) self._decoder = AiortcDecoder() self._frame_index = 0 self._time_base = Fraction(1, self.config.target_fps) @@ -320,6 +366,18 @@ def _av_input_format(format: ImageFormat) -> str: raise UnsupportedVideoImageError(f"Unsupported H.264 image format: {format.value}") +def _av_h264_profile(profile: str) -> str: + match profile.lower(): + case "baseline": + return "Baseline" + case "main": + return "Main" + case "high": + return "High" + case _: + return profile + + __all__ = [ "H264_BITSTREAM", "H264_CODEC", From ee9a851b7965d0864fa6a21e58adae658e57f345 Mon Sep 17 00:00:00 2001 From: cc Date: Fri, 12 Jun 2026 17:49:26 -0700 Subject: [PATCH 14/14] feat: some more examples --- dimos/robot/all_blueprints.py | 3 + .../agentic/unitree_go2_agentic_h264_video.py | 35 +++++++ .../smart/unitree_go2_h264_detection.py | 62 ++++++++++++ .../smart/unitree_go2_h264_video.py | 96 +++++++++++++++++++ 4 files changed, 196 insertions(+) create mode 100644 dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_h264_video.py create mode 100644 
dimos/robot/unitree/go2/blueprints/smart/unitree_go2_h264_detection.py create mode 100644 dimos/robot/unitree/go2/blueprints/smart/unitree_go2_h264_video.py diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index db086b5722..7653c33b9d 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -109,12 +109,15 @@ "unitree-g1-sim": "dimos.robot.unitree.g1.blueprints.perceptive.unitree_g1_sim:unitree_g1_sim", "unitree-go2": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2:unitree_go2", "unitree-go2-agentic": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic:unitree_go2_agentic", + "unitree-go2-agentic-h264-video": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic_h264_video:unitree_go2_agentic_h264_video", "unitree-go2-agentic-huggingface": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic_huggingface:unitree_go2_agentic_huggingface", "unitree-go2-agentic-ollama": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic_ollama:unitree_go2_agentic_ollama", "unitree-go2-basic": "dimos.robot.unitree.go2.blueprints.basic.unitree_go2_basic:unitree_go2_basic", "unitree-go2-coordinator": "dimos.robot.unitree.go2.blueprints.basic.unitree_go2_coordinator:unitree_go2_coordinator", "unitree-go2-detection": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_detection:unitree_go2_detection", "unitree-go2-fleet": "dimos.robot.unitree.go2.blueprints.basic.unitree_go2_fleet:unitree_go2_fleet", + "unitree-go2-h264-detection": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_h264_detection:unitree_go2_h264_detection", + "unitree-go2-h264-video": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2_h264_video:unitree_go2_h264_video", "unitree-go2-keyboard-teleop": "dimos.robot.unitree.go2.blueprints.basic.unitree_go2_keyboard_teleop:unitree_go2_keyboard_teleop", "unitree-go2-markers": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2:unitree_go2_markers", 
"unitree-go2-memory": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2:unitree_go2_memory", diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_h264_video.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_h264_video.py new file mode 100644 index 0000000000..35e4176765 --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_h264_video.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Agentic Go2 stack with H.264 transport enabled for the color image stream.""" + +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.core.coordination.blueprints import autoconnect +from dimos.perception.perceive_loop_skill import PerceiveLoopSkill +from dimos.perception.spatial_perception import SpatialMemory +from dimos.robot.unitree.go2.blueprints.agentic._common_agentic import _common_agentic +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2_h264_video import unitree_go2_h264_video + +unitree_go2_agentic_h264_video = autoconnect( + unitree_go2_h264_video, + SpatialMemory.blueprint(), + PerceiveLoopSkill.blueprint(), + McpServer.blueprint(), + McpClient.blueprint(), + _common_agentic, +).global_config(n_workers=12, robot_model="unitree_go2") + +__all__ = ["unitree_go2_agentic_h264_video"] diff --git a/dimos/robot/unitree/go2/blueprints/smart/unitree_go2_h264_detection.py b/dimos/robot/unitree/go2/blueprints/smart/unitree_go2_h264_detection.py new file mode 100644 index 0000000000..1d54f4979d --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/smart/unitree_go2_h264_detection.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Go2 replay stack for validating H.264 video transport with 3D detection.""" + +from dimos.core.coordination.blueprints import autoconnect +from dimos.core.transport import LCMTransport +from dimos.msgs.sensor_msgs.Image import Image +from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 +from dimos.msgs.vision_msgs.Detection2DArray import Detection2DArray +from dimos.perception.detection.module3D import Detection3DModule +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2_h264_video import ( + unitree_go2_h264_video, +) +from dimos.robot.unitree.go2.connection import GO2Connection + +unitree_go2_h264_detection = ( + autoconnect( + unitree_go2_h264_video, + Detection3DModule.blueprint( + camera_info=GO2Connection.camera_info_static, + ), + ) + .remappings( + [ + (Detection3DModule, "pointcloud", "global_map"), + ] + ) + .transports( + { + ("detections", Detection3DModule): LCMTransport( + "/detector3d/detections", Detection2DArray + ), + ("detected_pointcloud_0", Detection3DModule): LCMTransport( + "/detector3d/pointcloud/0", PointCloud2 + ), + ("detected_pointcloud_1", Detection3DModule): LCMTransport( + "/detector3d/pointcloud/1", PointCloud2 + ), + ("detected_pointcloud_2", Detection3DModule): LCMTransport( + "/detector3d/pointcloud/2", PointCloud2 + ), + ("detected_image_0", Detection3DModule): LCMTransport("/detector3d/image/0", Image), + ("detected_image_1", Detection3DModule): LCMTransport("/detector3d/image/1", Image), + ("detected_image_2", Detection3DModule): LCMTransport("/detector3d/image/2", Image), + } + ) +) + +__all__ = ["unitree_go2_h264_detection"] diff --git a/dimos/robot/unitree/go2/blueprints/smart/unitree_go2_h264_video.py b/dimos/robot/unitree/go2/blueprints/smart/unitree_go2_h264_video.py new file mode 100644 index 0000000000..d304d3d7cc --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/smart/unitree_go2_h264_video.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# Copyright 2026 Dimensional Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Go2 navigation stack with H.264 transport enabled for the color image stream.""" + +from typing import Any, cast + +from dimos.core.coordination.blueprints import autoconnect +from dimos.core.global_config import global_config +from dimos.core.transport import H264LcmTransport +from dimos.mapping.costmapper import CostMapper +from dimos.mapping.voxels import VoxelGridMapper +from dimos.msgs.sensor_msgs.Image import Image +from dimos.navigation.frontier_exploration.wavefront_frontier_goal_selector import ( + WavefrontFrontierExplorer, +) +from dimos.navigation.movement_manager.movement_manager import MovementManager +from dimos.navigation.patrolling.module import PatrollingModule +from dimos.navigation.replanning_a_star.module import ReplanningAStarPlanner +from dimos.protocol.video.demo_h264_video_e2e import H264VideoProbe +from dimos.protocol.video.h264 import H264Config, H264Decoder, VideoDecodeGapError +from dimos.robot.unitree.go2.blueprints.basic.unitree_go2_basic import rerun_config +from dimos.robot.unitree.go2.connection import GO2Connection +from dimos.visualization.vis_module import vis_module + +_go2_h264_config = H264Config( + bitrate=2_000_000, + target_fps=15, + keyframe_interval=30, +) +_go2_rerun_decoder: H264Decoder | None = None + + +def _convert_h264_color_image(image: Image) -> Any: + """Decode H.264 color frames before logging them in Rerun.""" + global _go2_rerun_decoder + + if image.encoding 
== "h264": + if _go2_rerun_decoder is None: + _go2_rerun_decoder = H264Decoder(_go2_h264_config) + try: + image = _go2_rerun_decoder.decode(image) + except (VideoDecodeGapError, ValueError): + # Replay/subscription can start mid-GOP. Suppress deltas until the + # next keyframe restores decoder state. + return None + return image.to_rerun() + + +_h264_rerun_config = { + **rerun_config, + "visual_override": { + **cast("dict[str, Any]", rerun_config["visual_override"]), + "world/color_image": _convert_h264_color_image, + }, +} + +unitree_go2_h264_video = ( + autoconnect( + vis_module( + viewer_backend=global_config.viewer, + rerun_config=_h264_rerun_config, + ), + GO2Connection.blueprint(), + VoxelGridMapper.blueprint(emit_every=5), + CostMapper.blueprint(), + ReplanningAStarPlanner.blueprint(), + WavefrontFrontierExplorer.blueprint(), + PatrollingModule.blueprint(), + MovementManager.blueprint(), + H264VideoProbe.blueprint(), + ) + .transports( + { + ("color_image", Image): H264LcmTransport( + "/color_image", + Image, + config=_go2_h264_config, + decode_images=True, + ), + } + ) + .global_config(n_workers=11, robot_model="unitree_go2") +)