From 917c1fc939ed52fa0b1ff9ac16f0c715158cedfe Mon Sep 17 00:00:00 2001 From: Janick Martinez Esturo Date: Sat, 13 Jun 2026 13:42:01 +0200 Subject: [PATCH] feat(argoverse2): add Argoverse 2 Sensor Dataset to NCore V4 converter Add a converter for the Argoverse 2 (AV2) Sensor Dataset, closing the Argoverse 2 half of #123 (the nuScenes half landed in #128). The converter reads the AV2 on-disk Apache Feather files directly with pyarrow (Arrow tables, no pandas) and deliberately avoids the heavy av2 devkit (torch, kornia, numba, polars, PyAV). The only added dependency is pyquaternion, already used in this package. Sensors: - Cameras: all 9 global-shutter cameras (7 ring + 2 stereo). AV2 imagery is shipped undistorted, so the stored model is pinhole with zero distortion and ShutterType.GLOBAL. - Lidar: the two stacked Velodyne VLP-32C units (up_lidar / down_lidar) are stored separately, each with its own static extrinsic. AV2 sweeps are egomotion-compensated to the sweep reference timestamp (the sweep start) and expressed in the egovehicle frame, with real per-point timestamps (offset_ns). Points are mapped into each unit's sensor frame and decompensated -- referenced to the sweep start -- using the shared MotionCompensator reference_timestamp_us, so NCore stores raw per-point-time directions. - Radar: AV2 has none. - Cuboids: native to the egovehicle frame at the sweep time; stored in the rig frame at that timestamp with no ego pose baked in, so egomotion stays swappable downstream (a V4 feature). Lidar unit split: AV2 shares one laser_number range [0,63] across both units with no documented up/down mapping. The two halves (<32 / >=32) are the two units; the up/down label is recovered from extrinsic geometry by per-beam elevation flatness (a ring is a constant-elevation cone only in its own sensor frame), stable per log with a wide margin. Structured VLP-32C model: AV2 provides no native firing-column index, but offset_ns + laser_number reconstruct it (offset_ns -> firing column at 10 Hz, laser_number -> beam/row). A structured model is derived per unit and stored as intrinsics with per-point model_element. Two things are essential for native-sensor accuracy: (1) the geometry is derived from the decompensated reference sweep (the compensated cloud is azimuth-smeared by ego motion ~0.5 deg); (2) per-row azimuth offsets are fit empirically (the 32 beams of a column span ~8.5 deg, so they are not co-azimuthal). The two units fire in opposite phase and thus spin oppositely in their own frames (one cw, one ccw), detected from the data. Result: ~0.0 deg median / ~0.02 deg p95 far-range reconstruction. --lidar-model-source {nominal,none} gates emission. Coordinate frames: the first ego pose's city_SE3_egovehicle is stored as the static world -> world_global anchor, so world_global is the AV2 city frame and absolute coordinates stay recoverable for later HD-map alignment. Includes an AV2_DIR-gated integration test (validated against a real val log), docs, and dependency wiring. --- MODULE.bazel.lock | 2 +- deps/pip/BUILD.bazel | 1 + deps/pip/requirements_3_11.in | 1 + deps/pip/requirements_3_11.txt | 4 +- deps/pip/requirements_argoverse2.in | 20 + docs/conversions/argoverse2/argoverse2.rst | 135 +++ docs/conversions/index.rst | 3 +- tools/data_converter/argoverse2/BUILD.bazel | 97 +++ tools/data_converter/argoverse2/NOTICE | 12 + tools/data_converter/argoverse2/README.md | 117 +++ tools/data_converter/argoverse2/converter.py | 776 ++++++++++++++++++ .../argoverse2/converter_test.py | 379 +++++++++ tools/data_converter/argoverse2/main.py | 23 + tools/data_converter/argoverse2/utils.py | 661 +++++++++++++++ tools/data_converter/argoverse2/utils_test.py | 272 ++++++ 15 files changed, 2500 insertions(+), 3 deletions(-) create mode 100644 deps/pip/requirements_argoverse2.in create mode 100644 docs/conversions/argoverse2/argoverse2.rst create mode 100644 tools/data_converter/argoverse2/BUILD.bazel create mode 100644 tools/data_converter/argoverse2/NOTICE create mode 100644 tools/data_converter/argoverse2/README.md create mode 100644 tools/data_converter/argoverse2/converter.py create mode 100644 tools/data_converter/argoverse2/converter_test.py create mode 100644 tools/data_converter/argoverse2/main.py create mode 100644 tools/data_converter/argoverse2/utils.py create mode 100644 tools/data_converter/argoverse2/utils_test.py diff --git a/MODULE.bazel.lock b/MODULE.bazel.lock index bf06b782..badf4c28 100644 --- a/MODULE.bazel.lock +++ b/MODULE.bazel.lock @@ -1502,7 +1502,7 @@ "bzlTransitiveDigest": "WyF2tLBpR8cyete55VmU+AwLC/mhDbnIlEdbVymbm04=", "usagesDigest": "DXk0VBJUK7HdLXJE59BVPG4lCTTPnHGNQg/SH9dTuVE=", "recordedFileInputs": { - "@@//deps/pip/requirements_3_11.txt": "01687dd0690641cecfe0cfa3a5a8ba2186f65112c34b4ab7b64caa55a416ae56", + "@@//deps/pip/requirements_3_11.txt": "845ddefb3a79c099a05f39880752241d3c30cefb3774c0611f253d5bff3f774f", "@@//deps/pip/requirements_3_8.txt": "94c254d645dc1d0555f48e66fa499aa2efecfc5e547500cd4ed409aee28e8880", "@@rules_fuzzing+//fuzzing/requirements.txt": "ab04664be026b632a0d2a2446c4f65982b7654f5b6851d2f9d399a19b7242a5b", "@@rules_python+//tools/publish/requirements_darwin.txt": "095d4a4f3d639dce831cd493367631cd51b53665292ab20194bac2c0c6458fa8", diff --git a/deps/pip/BUILD.bazel b/deps/pip/BUILD.bazel index 1e9c3e3d..becbd300 100644 --- a/deps/pip/BUILD.bazel +++ b/deps/pip/BUILD.bazel @@ -34,6 +34,7 @@ pip_compile( size = "medium", args = PIP_COMPILE_ARGS, data = [ + ":requirements_argoverse2.in", ":requirements_colmap.in", ":requirements_docs.in", ":requirements_ncore.in", diff --git a/deps/pip/requirements_3_11.in b/deps/pip/requirements_3_11.in index e7c68a14..b31bf737 100644 --- a/deps/pip/requirements_3_11.in +++ b/deps/pip/requirements_3_11.in @@ -21,6 +21,7 @@ -r requirements_waymo.in -r requirements_colmap.in -r requirements_nuscenes.in +-r requirements_argoverse2.in -r requirements_pai.in # Public API restrictions for 3.11 diff --git a/deps/pip/requirements_3_11.txt b/deps/pip/requirements_3_11.txt index 37fe4126..d620e757 100644 --- a/deps/pip/requirements_3_11.txt +++ b/deps/pip/requirements_3_11.txt @@ -1853,7 +1853,9 @@ pyarrow==23.0.1 \ --hash=sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f \ --hash=sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1 \ --hash=sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd - # via -r deps/pip/requirements_pai.in + # via + # -r deps/pip/requirements_argoverse2.in + # -r deps/pip/requirements_pai.in pycocotools==2.0.11 \ --hash=sha256:04480330df5013f6edd94891a0ee8294274185f1b5093d1b0f23d51778f0c0e9 \ --hash=sha256:08c79789fd79e801ae4ecfcfeec32b31e36254e7a2b4019af28c104975d5e730 \ diff --git a/deps/pip/requirements_argoverse2.in b/deps/pip/requirements_argoverse2.in new file mode 100644 index 00000000..da09d455 --- /dev/null +++ b/deps/pip/requirements_argoverse2.in @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Argoverse 2 converter dependencies. +# The converter reads the AV2 Feather files directly and intentionally avoids the +# heavy `av2` devkit (torch, kornia, numba, polars, PyAV). Quaternion handling uses +# scipy (already an ncore dependency), so no extra dependency is needed here. +pyarrow diff --git a/docs/conversions/argoverse2/argoverse2.rst b/docs/conversions/argoverse2/argoverse2.rst new file mode 100644 index 00000000..0900181a --- /dev/null +++ b/docs/conversions/argoverse2/argoverse2.rst @@ -0,0 +1,135 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +Argoverse 2 Dataset +=================== + +The NCore Argoverse 2 tool converts data from the +`Argoverse 2 `_ Sensor Dataset into NCore +V4 format. The converter reads the Argoverse 2 on-disk Apache Feather files +directly with ``pyarrow`` and deliberately avoids the heavy ``av2`` devkit +(which pulls in torch, kornia, numba, polars and PyAV). Quaternion handling uses +``scipy`` (already an ncore dependency), so no extra dependency is introduced. + +.. _argoverse2_data_conventions: + +Conventions +----------- + +Argoverse 2 provides data from 9 cameras and 2 lidars; it has no radar. The +converter handles all sensor modalities and 3D cuboid annotations. + +Camera Sensors +^^^^^^^^^^^^^^ + 1. **ring_front_center** -- 2048x1550 (portrait) + 2. **ring_front_left** -- 1550x2048 + 3. **ring_front_right** -- 1550x2048 + 4. **ring_side_left** -- 1550x2048 + 5. **ring_side_right** -- 1550x2048 + 6. **ring_rear_left** -- 1550x2048 + 7. **ring_rear_right** -- 1550x2048 + 8. **stereo_front_left** -- 1550x2048 + 9. **stereo_front_right** -- 1550x2048 + +The released imagery for all nine cameras is already undistorted -- the official +av2 devkit projects with the intrinsic matrix ``K`` only and does not load the +distortion columns -- so camera intrinsics are stored using +:class:`~ncore.data.IdealPinholeCameraModelParameters`. Because the imagery is +already undistorted, **global shutter is assumed** (``ShutterType.GLOBAL``). The +``k1, k2, k3`` coefficients present in ``intrinsics.feather`` describe the +original lens (for re-distorting into the raw frame) and are intentionally not +applied to the released images; they are preserved per camera in the camera +component ``generic_meta_data`` under ``av2_original_distortion`` so the original +calibration is not lost. + +LiDAR Sensors +^^^^^^^^^^^^^ + 1. **up_lidar** -- Velodyne VLP-32C, 32 beams, 10 Hz + 2. **down_lidar** -- Velodyne VLP-32C, 32 beams, 10 Hz + +Argoverse 2 sweeps are egomotion-compensated to the sweep reference timestamp +and provided in the egovehicle frame, with real per-point timestamps +(``offset_ns``). The two stacked VLP-32C units are stored separately, each with +its own static extrinsic. Points are split per unit by ``laser_number``, +mapped into the unit's own sensor frame, and decompensated using the real +per-point timestamps so that NCore stores raw per-point-time ray directions. +Because the sensor extrinsic is static, this decompensation is independent of +whether the source data applied ego-motion before or after the sensor +transform. + +A structured VLP-32C model is stored per unit as lidar intrinsics, with per-point +``model_element`` (row, column). Argoverse 2 provides no native firing-column +index, so the firing pattern is reconstructed from ``offset_ns`` (firing columns -- +one VLP-32C revolution at 10 Hz) and ``laser_number`` (the beam, mapped to an +elevation-sorted row). The geometry is derived per log from the *decompensated* +reference sweep: elevations, the laser-to-row map, column timing, per-column +azimuths, and per-row azimuth offsets (the 32 beams of a firing column span several +degrees of azimuth, so the per-row offset is fit empirically). The two stacked +units fire in opposite phase, so they spin oppositely in their own frames (one +``cw``, one ``ccw``), which is detected from the data. The column grid is upsampled +4x so per-frame alignment is not column-quantized, and each sweep is re-aligned to +the model by a per-frame affine column remap -- a constant phase (the spin phase at +a given ``offset_ns`` drifts ~1 deg between sweeps) plus a linear term (the spin +rate drifts slightly within a sweep on some scenes). Steep downward beams that only +return at near range (no far data) have their azimuth offset fit from near-range +returns. Deriving from the decompensated cloud (not the ego-motion-smeared +compensated one) plus these steps gives ~0.03 deg median far-range reconstruction +across scenes (validated on 38 val logs / 76 units, all sub-0.08 deg median with no +systematic azimuth or elevation bias), on par with native-column sensors. Pass +``--lidar-model-source none`` to store raw ray bundles only. + +The ``laser_number`` to up/down unit split is not documented by Argoverse 2. The +two units occupy the two laser-number halves (``< 32`` and ``>= 32``); the unit +*label* is recovered from extrinsic geometry by per-beam elevation flatness -- a +laser ring traces a constant-elevation cone only in its own sensor frame, so the +wrong extrinsic tilts the cone and inflates the per-ring elevation spread. The +decision is made once per log and is stable with a wide (~2-10x) margin. + +Annotations +^^^^^^^^^^^ + +3D cuboid annotations are native to the egovehicle frame at the sweep reference +time. They are stored in the ``rig`` frame at that timestamp with no ego pose +baked in, so the egovehicle motion stays out of the stored coordinates and +remains swappable downstream (a V4 feature); the pose graph places the cuboids +using the active ego trajectory. The full 3-DOF box orientation is preserved (the +AV2 quaternion is converted to the ``BBox3`` ``xyz``-Euler convention, not reduced +to yaw). The ``track_uuid`` is used as the track ID. + +Coordinate Frames +^^^^^^^^^^^^^^^^^ + +The first ego pose's ``city_SE3_egovehicle`` is stored as the static +``world -> world_global`` pose, so ``world_global`` is the Argoverse 2 city +frame. All absolute city coordinates remain recoverable for later alignment +with the Argoverse 2 HD map (which the converter does not export). + +Usage +----- + +.. code-block:: bash + + bazel run //tools/data_converter/argoverse2 -- \ + --root-dir /path/to/argoverse2/sensor \ + --output-dir /path/to/output \ + argoverse2-v4 \ + --split val + +Convert a single log: + +.. code-block:: bash + + bazel run //tools/data_converter/argoverse2 -- \ + --root-dir /path/to/argoverse2/sensor \ + --output-dir /path/to/output \ + argoverse2-v4 \ + --split val \ + --log-id 02678d04-cc9f-3148-9f95-1ba66347dff9 + +Testing +------- + +.. code-block:: bash + + AV2_DIR=/path/to/argoverse2/sensor AV2_SPLIT=val \ + bazel test //tools/data_converter/argoverse2:pytest_converter diff --git a/docs/conversions/index.rst b/docs/conversions/index.rst index be50bee1..9bc84d1d 100644 --- a/docs/conversions/index.rst +++ b/docs/conversions/index.rst @@ -6,13 +6,14 @@ Data Conversions NCore provides conversion tools for importing 3rd-party dataset formats into the NCore V4 component-based format. Supported formats include KITTI, nuScenes, -Waymo, COLMAP (including ScanNet++), and PAI. +Argoverse 2, Waymo, COLMAP (including ScanNet++), and PAI. .. toctree:: :maxdepth: 1 kitti/kitti nuscenes/nuscenes + argoverse2/argoverse2 waymo/waymo colmap/colmap pai/pai diff --git a/tools/data_converter/argoverse2/BUILD.bazel b/tools/data_converter/argoverse2/BUILD.bazel new file mode 100644 index 00000000..a645eb51 --- /dev/null +++ b/tools/data_converter/argoverse2/BUILD.bazel @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@ncore_pip_deps//:requirements.bzl", "requirement") +load("@rules_python//python:defs.bzl", "py_binary", "py_library") +load("//bazel/pytest:defs.bzl", "pytest_test") + +# Argoverse 2 specific utilities: feather readers, sensor maps, lidar unit split +py_library( + name = "pylib_utils", + srcs = ["utils.py"], + deps = [ + requirement("numpy"), + requirement("pyarrow"), + requirement("scipy"), + requirement("universal_pathlib"), + "//ncore:pylib", + "//tools/data_converter:pylib_structured_lidar_model", + ], +) + +# Converter library (config, converter class, CLI registration) +py_library( + name = "pylib", + srcs = [ + "converter.py", + ], + deps = [ + ":pylib_utils", + requirement("click"), + requirement("numpy"), + requirement("pyarrow"), + requirement("scipy"), + requirement("tqdm"), + requirement("universal_pathlib"), + "//ncore:pylib", + "//tools/data_converter:pylib_cli", + "//tools/data_converter:pylib_structured_lidar_model", + ], +) + +# Standalone CLI binary +py_binary( + name = "convert", + srcs = ["main.py"], + main = "main.py", + deps = [":pylib"], +) + +alias( + name = "argoverse2", + actual = ":convert", +) + +# Integration test for the Argoverse 2 converter (requires AV2_DIR env var) +pytest_test( + name = "pytest_converter", + srcs = ["converter_test.py"], + python_versions = ["3.11"], + tags = ["manual"], # Only run when explicitly requested (needs external data) + deps = [ + ":pylib", + requirement("numpy"), + requirement("parameterized"), + requirement("pyarrow"), + requirement("torch"), + requirement("universal_pathlib"), + "//ncore:pylib", + ], +) + +# Data-free unit test for the VLP-32C lidar-model derivation (runs in CI). +pytest_test( + name = "pytest_utils", + srcs = ["utils_test.py"], + python_versions = ["3.11"], + deps = [ + ":pylib_utils", + requirement("numpy"), + requirement("pyarrow"), + requirement("torch"), + requirement("universal_pathlib"), + "//ncore:pylib", + ], +) diff --git a/tools/data_converter/argoverse2/NOTICE b/tools/data_converter/argoverse2/NOTICE new file mode 100644 index 00000000..e16f1b37 --- /dev/null +++ b/tools/data_converter/argoverse2/NOTICE @@ -0,0 +1,12 @@ +Argoverse 2 Dataset +Copyright (c) 2021 Argo AI, LLC + +This converter processes data from the Argoverse 2 (AV2) Sensor Dataset. +The Argoverse 2 dataset is released under the Creative Commons +Attribution-NonCommercial-ShareAlike 4.0 International License (CC BY-NC-SA 4.0). + +Users must agree to the Argoverse Terms of Use before downloading or using the dataset: +https://www.argoverse.org/about.html#terms-of-use + +This converter reads the Argoverse 2 on-disk Feather files directly and does not +depend on the av2 devkit. The av2 devkit is released under the MIT License. diff --git a/tools/data_converter/argoverse2/README.md b/tools/data_converter/argoverse2/README.md new file mode 100644 index 00000000..033d7952 --- /dev/null +++ b/tools/data_converter/argoverse2/README.md @@ -0,0 +1,117 @@ +# Argoverse 2 to NCore V4 Converter + +Converts [Argoverse 2](https://www.argoverse.org/av2.html) Sensor Dataset logs to +NCore V4 format. + +The converter reads the Argoverse 2 on-disk Apache Feather files directly with +`pyarrow`, deliberately avoiding the heavy `av2` devkit (which pulls in torch, +kornia, numba, polars and PyAV). Quaternion handling uses `scipy` (already an +ncore dependency), so no extra dependency is introduced. + +## Requirements + +- Argoverse 2 Sensor Dataset downloaded locally, organised as + `{root}/{split}/{log_id}/...` +- Python packages: `pyarrow` (plus `scipy`, already an ncore dependency) + +## Usage + +```bash +bazel run //tools/data_converter/argoverse2 -- \ + --root-dir /path/to/argoverse2/sensor \ + --output-dir /path/to/output \ + argoverse2-v4 \ + --split val +``` + +### Convert a single log + +```bash +bazel run //tools/data_converter/argoverse2 -- \ + --root-dir /path/to/argoverse2/sensor \ + --output-dir /path/to/output \ + argoverse2-v4 \ + --split val \ + --log-id 02678d04-cc9f-3148-9f95-1ba66347dff9 +``` + +## Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--split` | val | Split directory under `--root-dir` (train, val, test) | +| `--log-id` | None | Filter to a single log by ID | +| `--store-type` | itar | Output store format (itar or directory) | +| `--profile` | separate-sensors | Component group assignment profile | +| `--sequence-meta/--no-sequence-meta` | enabled | Generate sequence meta JSON | + +## Sensor Assumptions + +- **Cameras**: 9 cameras (7 ring + 2 stereo). AV2 imagery is shipped already + undistorted -- the official av2 devkit projects with the intrinsic matrix `K` + only and does not load the `k1, k2, k3` columns -- so the stored model is an + ideal (distortion-free) pinhole (`IdealPinholeCameraModelParameters`). Because + the imagery is already undistorted, global shutter is assumed + (`ShutterType.GLOBAL`). The `k1, k2, k3` coefficients in `intrinsics.feather` + describe the original lens (for re-distorting into the raw frame) and must not be + applied to the released images, so they are not used for projection -- but they + are preserved per camera in the camera component `generic_meta_data` under + `av2_original_distortion` so the original calibration is not lost. +- **Lidar**: two stacked Velodyne VLP-32C units (`up_lidar` / `down_lidar`, 10 Hz). + The source sweep is egomotion-compensated to the sweep reference timestamp and + expressed in the egovehicle frame. Real per-point timestamps are available via + `offset_ns`. Each unit is stored separately with its own extrinsic. Points are + mapped into each unit's sensor frame and decompensated using the real per-point + timestamps so the stored directions are raw per-point-time measurements. Because + the extrinsic is static, this is independent of whether AV2 applied ego-motion + before or after the sensor transform. + - A structured VLP-32C model is stored per unit as lidar intrinsics, with + per-point `model_element` (row, column). AV2 provides no native firing-column + index, so the firing pattern is reconstructed from `offset_ns` (firing columns -- + one VLP-32C revolution at 10 Hz) and `laser_number` (the beam, mapped to an + elevation-sorted row). The geometry is derived per log from the *decompensated* + reference sweep: elevations, the laser->row map, column timing, per-column + azimuths, and per-row azimuth offsets (the 32 beams of a firing column span + several degrees of azimuth, so the per-row offset is fit empirically rather than + assumed). The two stacked units fire in opposite phase, so they spin oppositely + in their own frames (one `cw`, one `ccw`); this is detected from the data. The + column grid is upsampled 4x so the per-frame alignment is not column-quantized. + Each sweep is re-aligned to the model by a per-frame affine column remap (a + constant phase plus a linear term): the spin phase at a given `offset_ns` drifts + ~1 deg between sweeps (the constant), and the spin rate drifts slightly within a + sweep on some scenes (the linear term). A fixed mapping, or a phase-only shift, + would leave some frames off by up to ~1 deg / ~0.25 deg respectively. + Steep downward beams that only return at near range (no far data, e.g. the + lowest laser at ~-25 deg) have their azimuth offset fit from near-range returns. + Deriving from the decompensated cloud (not the ego-motion-smeared compensated + one) plus these steps gives ~0.03-0.05 deg median far-range reconstruction across + scenes, on par with native-column sensors. Pass `--lidar-model-source none` to + store raw ray bundles only. + - The `laser_number` to up/down unit split is not documented by AV2. The two + units occupy the two laser-number halves (`< 32` and `>= 32`); the unit *label* + is recovered from extrinsic geometry by per-beam elevation flatness (a laser + ring traces a constant-elevation cone only in its own sensor frame, so the + wrong extrinsic tilts the cone and inflates the per-ring elevation spread). The + decision is made once per log and is stable with a wide (~2-10x) margin. +- **Radar**: AV2 has no radar. +- **Cuboid annotations**: native to the egovehicle frame at the sweep reference + time, stored in the `rig` frame at that timestamp with no ego pose baked in. This + keeps the egovehicle motion out of the stored coordinates so it stays swappable + downstream (a V4 feature); the pose graph places the cuboids using the active ego + trajectory. The full 3-DOF box orientation is preserved (the AV2 quaternion is + converted to the BBox3 `xyz`-Euler convention, not reduced to yaw). `track_uuid` + is used as track ID. + +## Coordinate frames + +The first ego pose's `city_SE3_egovehicle` is stored as the static +`world -> world_global` pose, so `world_global` is the AV2 city frame. All absolute +city coordinates remain recoverable for later alignment with the AV2 HD map (which +the converter does not export). + +## Testing + +```bash +AV2_DIR=/path/to/argoverse2/sensor AV2_SPLIT=val \ + bazel test //tools/data_converter/argoverse2:pytest_converter +``` diff --git a/tools/data_converter/argoverse2/converter.py b/tools/data_converter/argoverse2/converter.py new file mode 100644 index 00000000..7c999e1a --- /dev/null +++ b/tools/data_converter/argoverse2/converter.py @@ -0,0 +1,776 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Argoverse 2 Sensor Dataset to NCore V4 converter.""" + +from __future__ import annotations + +import json +import logging + +from dataclasses import dataclass +from typing import Dict, List, Literal, Optional + +import click +import numpy as np +import tqdm + +from scipy.spatial.transform import Rotation as R +from upath import UPath + +from ncore.impl.common.transformations import ( + HalfClosedInterval, + MotionCompensator, + se3_inverse, +) +from ncore.impl.data.types import ( + BBox3, + CuboidTrackObservation, + LabelSource, + RowOffsetStructuredSpinningLidarModelParameters, +) +from ncore.impl.data.v4.components import ( + CameraSensorComponent, + CuboidsComponent, + IntrinsicsComponent, + LidarSensorComponent, + MasksComponent, + PosesComponent, + RadarSensorComponent, # noqa: F401 -- imported for parity; AV2 has no radar + SequenceComponentGroupsReader, + SequenceComponentGroupsWriter, +) +from ncore.impl.data.v4.types import ComponentGroupAssignments +from ncore.impl.data_converter.base import FileBasedDataConverter, FileBasedDataConverterConfig +from tools.data_converter.argoverse2.utils import ( + AV2_CATEGORY_MAP, + CAMERA_NAMES, + LIDAR_NAMES, + VLP32C_N_BEAMS, + Vlp32cGeometry, + assign_lidar_units, + build_vlp32c_model, + derive_vlp32c_geometry, + list_log_ids, + list_sensor_timestamps, + read_annotations, + read_city_se3_ego, + read_ego_se3_sensor, + read_intrinsics, + read_lidar_sweep, + reconstruct_model_elements, +) +from tools.data_converter.cli import cli + + +# Argoverse 2 timestamps are nanoseconds; NCore V4 uses microseconds. +NS_PER_US = 1000 + + +def _ns_to_us(value_ns: int) -> int: + return int(value_ns) // NS_PER_US + + +# ----------------------------------------------------------------------------- +# Config +# ----------------------------------------------------------------------------- + + +@dataclass(kw_only=True, slots=True) +class Argoverse2Converter4Config(FileBasedDataConverterConfig): + """Configuration for Argoverse 2 to NCore V4 conversion.""" + + split: str = "val" + log_id: Optional[str] = None + store_type: Literal["itar", "directory"] = "itar" + component_group_profile: Literal["default", "separate-sensors", "separate-all"] = "separate-sensors" + store_sequence_meta: bool = True + # Lidar model: "empirical" derives a VLP-32C structured model (per unit) and stores + # per-point model elements + intrinsics; "none" stores raw ray bundles only. + lidar_model_source: Literal["empirical", "none"] = "empirical" + + +# ----------------------------------------------------------------------------- +# Converter +# ----------------------------------------------------------------------------- + + +class Argoverse2Converter4(FileBasedDataConverter): + """Dataset preprocessing class for converting Argoverse 2 data to NCore V4. + + Sensor assumptions (sourced from the AV2 devkit and User Guide): + + - Cameras: 9 cameras (7 ring + 2 stereo). Imagery is shipped already + undistorted, so a pinhole model with zero distortion is exact. Because the + released imagery is already undistorted (a single capture timestamp per + image, no rolling-shutter metadata), global shutter is assumed -> + ``ShutterType.GLOBAL``. The original lens radial-distortion coefficients + ``(k1, k2, k3)`` are not applied but are preserved per camera in the camera + component ``generic_meta_data`` under ``av2_original_distortion``. + - Lidar: two stacked Velodyne VLP-32C units (``up_lidar`` / ``down_lidar``, + 10 Hz). The source sweep is egomotion-compensated to the sweep reference + timestamp and stored in the egovehicle frame. Real per-point timestamps are + provided (``offset_ns``). We split points per unit by ``laser_number``, + transform into each unit's sensor frame, and decompensate using the real + per-point timestamps so NCore stores raw per-point-time directions. A + structured VLP-32C model is stored per unit (with per-point ``model_element``) + by reconstructing the firing pattern from ``offset_ns`` + ``laser_number``; + the two units spin oppositely in their own frames (detected from data). Pass + ``--lidar-model-source none`` to skip the model and store raw ray bundles only. + - Radar: AV2 has no radar. + - Cuboid annotations: native to the egovehicle frame at the sweep reference + time. Stored in the ``rig`` frame at that timestamp with no ego pose baked + in, so the egovehicle motion stays swappable downstream (a V4 feature); the + pose graph places the cuboids using the active ego trajectory. + + The first ego pose's ``city_SE3_egovehicle`` is stored as the static + ``world -> world_global`` anchor, so ``world_global`` is the AV2 city frame. + This keeps absolute coordinates recoverable for later HD-map alignment. + """ + + def __init__(self, config: Argoverse2Converter4Config) -> None: + super().__init__(config) + + self.component_group_profile = config.component_group_profile + self.store_type = config.store_type + self.store_sequence_meta = config.store_sequence_meta + + self._split = config.split + self._log_id = config.log_id + self._lidar_model_source = config.lidar_model_source + + self.logger = logging.getLogger(__name__) + + @property + def _split_dir(self) -> UPath: + return self.root_dir / self._split + + @staticmethod + def get_sequence_ids(config: Argoverse2Converter4Config) -> List[str]: + """Discover log IDs to convert.""" + if config.log_id is not None: + return [config.log_id] + split_dir = UPath(config.root_dir) / config.split + return list_log_ids(split_dir) + + @staticmethod + def from_config(config: Argoverse2Converter4Config) -> Argoverse2Converter4: + return Argoverse2Converter4(config) + + def convert_sequence(self, sequence_id: str) -> None: + """Convert a single Argoverse 2 log to NCore V4 format.""" + log_id = sequence_id + log_dir = self._split_dir / log_id + + self.logger.info(f"Converting log {log_id} (split={self._split})") + + # --- Ego poses (egovehicle -> city) ----------------------------------- + pose_timestamps_ns, T_ego_city_all = read_city_se3_ego(log_dir) + n_poses = len(pose_timestamps_ns) + assert n_poses >= 2, f"Log has fewer than 2 ego poses: {n_poses}" + + pose_timestamps_us = np.array([_ns_to_us(t) for t in pose_timestamps_ns], dtype=np.uint64) + + # AV2 ego poses are dense (some only nanoseconds apart), so the ns -> us + # truncation can produce duplicate microsecond timestamps. The pose writer + # requires strictly increasing timestamps, so keep the first pose for each + # unique microsecond timestamp. + pose_timestamps_us, unique_idx = np.unique(pose_timestamps_us, return_index=True) + T_ego_city_all = T_ego_city_all[unique_idx] + n_poses = len(pose_timestamps_us) + assert n_poses >= 2, f"Log has fewer than 2 unique-microsecond ego poses: {n_poses}" + + # Anchor the first pose as world_global (the AV2 city frame); store all + # poses relative to it so the relative poses are float32-safe. + T_world_world_global = T_ego_city_all[0].copy() # float64 for global accuracy + T_world_global_inv = se3_inverse(T_world_world_global) + T_rig_world_relative = (T_world_global_inv @ T_ego_city_all).astype(np.float32) + + # --- Static sensor extrinsics (sensor -> ego) ------------------------- + ego_se3_sensor = read_ego_se3_sensor(log_dir) + + # --- Determine active sensors ----------------------------------------- + camera_ids = self.get_active_camera_ids(list(CAMERA_NAMES)) + lidar_ids = self.get_active_lidar_ids(list(LIDAR_NAMES)) + radar_ids = self.get_active_radar_ids([]) # AV2 has no radar + + # --- Sequence time interval ------------------------------------------- + all_ts_us: List[int] = [int(pose_timestamps_us[0]), int(pose_timestamps_us[-1])] + lidar_ts_ns = list_sensor_timestamps(log_dir, "lidar") + if lidar_ts_ns: + all_ts_us += [_ns_to_us(lidar_ts_ns[0]), _ns_to_us(lidar_ts_ns[-1])] + for cam_id in camera_ids: + cam_ts_ns = list_sensor_timestamps(log_dir, "cameras", cam_id) + if cam_ts_ns: + all_ts_us += [_ns_to_us(cam_ts_ns[0]), _ns_to_us(cam_ts_ns[-1])] + + seq_start_us = min(all_ts_us) + seq_end_us = max(all_ts_us) + + # Extend pose timeline to cover the sequence interval, extrapolating with a + # constant-velocity assumption so lidar decompensation near the boundaries + # has real motion to invert (mirrors the nuScenes converter). + T_rig_world_relative, pose_timestamps_us = self._extend_pose_timeline( + T_rig_world_relative, pose_timestamps_us, seq_start_us, seq_end_us + ) + + sequence_timestamp_interval_us = HalfClosedInterval.from_start_end(seq_start_us, seq_end_us) + + # --- Component group assignments -------------------------------------- + component_groups = ComponentGroupAssignments.create( + camera_ids=camera_ids, + lidar_ids=lidar_ids, + radar_ids=radar_ids, + point_clouds_ids=[], + camera_labels_ids=[], + profile=self.component_group_profile, + ) + + # --- Create writer ---------------------------------------------------- + store_writer = SequenceComponentGroupsWriter( + output_dir_path=self.output_dir / log_id, + store_base_name=log_id, + sequence_id=log_id, + sequence_timestamp_interval_us=sequence_timestamp_interval_us, + store_type=self.store_type, + generic_meta_data={ + "source_dataset": "argoverse2", + "argoverse2_split": self._split, + "argoverse2_log_id": log_id, + }, + ) + + poses_writer = store_writer.register_component_writer( + PosesComponent.Writer, + component_instance_name="default", + group_name=component_groups.poses_component_group, + generic_meta_data={ + "calibration_type": "argoverse2:egovehicle_SE3_sensor", + "egomotion_type": "argoverse2:city_SE3_egovehicle", + }, + ) + intrinsics_writer = store_writer.register_component_writer( + IntrinsicsComponent.Writer, + component_instance_name="default", + group_name=component_groups.intrinsics_component_group, + ) + masks_writer = store_writer.register_component_writer( + MasksComponent.Writer, + component_instance_name="default", + group_name=component_groups.masks_component_group, + ) + + # --- Store ego poses -------------------------------------------------- + poses_writer.store_dynamic_pose( + source_frame_id="rig", + target_frame_id="world", + poses=T_rig_world_relative, + timestamps_us=pose_timestamps_us, + ) + poses_writer.store_static_pose( + source_frame_id="world", + target_frame_id="world_global", + pose=T_world_world_global, + ) + + # --- Decode sensors --------------------------------------------------- + if lidar_ids: + self._decode_lidars( + log_dir=log_dir, + store_writer=store_writer, + poses_writer=poses_writer, + intrinsics_writer=intrinsics_writer, + component_groups=component_groups, + lidar_ids=lidar_ids, + ego_se3_sensor=ego_se3_sensor, + T_rig_world_relative=T_rig_world_relative, + pose_timestamps_us=pose_timestamps_us, + ) + + self._decode_cameras( + log_dir=log_dir, + store_writer=store_writer, + poses_writer=poses_writer, + intrinsics_writer=intrinsics_writer, + masks_writer=masks_writer, + component_groups=component_groups, + camera_ids=camera_ids, + ego_se3_sensor=ego_se3_sensor, + ) + + self._decode_cuboids( + log_dir=log_dir, + store_writer=store_writer, + component_groups=component_groups, + ) + + # --- Finalize --------------------------------------------------------- + ncore_4_paths = store_writer.finalize() + + if self.store_sequence_meta: + sequence_component_reader = SequenceComponentGroupsReader(ncore_4_paths) + sequence_meta_path = self.output_dir / log_id / f"{sequence_component_reader.sequence_id}.json" + with sequence_meta_path.open("w") as f: + json.dump(sequence_component_reader.get_sequence_meta().to_dict(), f, indent=2) + self.logger.info(f"Wrote sequence meta data {str(sequence_meta_path)}") + + # ------------------------------------------------------------------------- + # Pose timeline + # ------------------------------------------------------------------------- + + @staticmethod + def _extend_pose_timeline( + T_rig_world_relative: np.ndarray, + pose_timestamps_us: np.ndarray, + seq_start_us: int, + seq_end_us: int, + ) -> tuple[np.ndarray, np.ndarray]: + """Extend the relative pose timeline to cover [seq_start, seq_end]. + + Boundaries are extrapolated with a constant-velocity assumption so the + first/last lidar sweeps have real ego motion to decompensate against. + """ + if seq_start_us < int(pose_timestamps_us[0]): + T_0 = T_rig_world_relative[0] + T_1 = T_rig_world_relative[1] + T_delta_inv = se3_inverse(T_1) @ T_0 + T_boundary = (T_0 @ T_delta_inv).astype(np.float32) + T_rig_world_relative = np.concatenate([T_boundary[np.newaxis], T_rig_world_relative], axis=0) + pose_timestamps_us = np.concatenate([np.array([seq_start_us], dtype=np.uint64), pose_timestamps_us]) + + if seq_end_us > int(pose_timestamps_us[-1]): + T_n1 = T_rig_world_relative[-2] + T_n = T_rig_world_relative[-1] + T_delta = se3_inverse(T_n1) @ T_n + T_boundary = (T_n @ T_delta).astype(np.float32) + T_rig_world_relative = np.concatenate([T_rig_world_relative, T_boundary[np.newaxis]], axis=0) + pose_timestamps_us = np.concatenate([pose_timestamps_us, np.array([seq_end_us], dtype=np.uint64)]) + + return T_rig_world_relative, pose_timestamps_us + + # ------------------------------------------------------------------------- + # Lidar + # ------------------------------------------------------------------------- + + def _decode_lidars( + self, + log_dir: UPath, + store_writer: SequenceComponentGroupsWriter, + poses_writer: PosesComponent.Writer, + intrinsics_writer: IntrinsicsComponent.Writer, + component_groups: ComponentGroupAssignments, + lidar_ids: List[str], + ego_se3_sensor: Dict[str, np.ndarray], + T_rig_world_relative: np.ndarray, + pose_timestamps_us: np.ndarray, + ) -> None: + """Decode and store the two stacked VLP-32C lidars individually. + + AV2 lidar points are egomotion-compensated to the sweep reference time + (the sweep start) and provided in the egovehicle frame. For each unit we + map points into the unit's own sensor frame (via the static extrinsic) and + decompensate using the real per-point timestamps -- referenced to the sweep + start -- to recover raw per-point-time directions. + + Because the sensor extrinsic is static, the decompensation commutes with + the ego->sensor transform, so the result is independent of whether AV2 + applied ego-motion before or after the sensor transform. + """ + sweep_ts_ns = list_sensor_timestamps(log_dir, "lidar") + if not sweep_ts_ns: + self.logger.warning("No lidar sweeps found") + return + + # Static extrinsics + per-unit writers + per-unit motion compensators. + T_unit_rig: Dict[str, np.ndarray] = {} + lidar_writers: Dict[str, LidarSensorComponent.Writer] = {} + compensators: Dict[str, MotionCompensator] = {} + + for unit_id in lidar_ids: + T_unit_rig[unit_id] = ego_se3_sensor[unit_id].astype(np.float32) + poses_writer.store_static_pose(source_frame_id=unit_id, target_frame_id="rig", pose=T_unit_rig[unit_id]) + lidar_writers[unit_id] = store_writer.register_component_writer( + LidarSensorComponent.Writer, + component_instance_name=unit_id, + group_name=component_groups.lidar_component_groups.get(unit_id), + generic_meta_data={"sensor_model": "Velodyne VLP-32C"}, + ) + compensators[unit_id] = MotionCompensator.from_sensor_rig( + sensor_id=unit_id, + T_sensor_rig=T_unit_rig[unit_id], + T_rig_worlds=T_rig_world_relative, + T_rig_worlds_timestamps_us=pose_timestamps_us, + ) + + # Determine the laser_number -> unit labelling once from the first sweep. + # The split is a fixed physical property of the two stacked sensors, so we + # resolve it once (robustly, from extrinsic geometry) and reuse it for every + # sweep rather than re-deciding per frame. + T_up = ego_se3_sensor["up_lidar"] if "up_lidar" in ego_se3_sensor else np.eye(4) + T_down = ego_se3_sensor["down_lidar"] if "down_lidar" in ego_se3_sensor else np.eye(4) + first_sweep = read_lidar_sweep(log_dir / "sensors" / "lidar" / f"{sweep_ts_ns[0]}.feather") + first_masks = assign_lidar_units(first_sweep.laser_number, first_sweep.xyz, T_up, T_down) + + # Map the decision to a laser_number threshold test (lo half == laser_number < 32). + lo_is_up = bool(first_masks["up_lidar"][first_sweep.laser_number < 4].all()) + unit_for_lo = "up_lidar" if lo_is_up else "down_lidar" + unit_for_hi = "down_lidar" if lo_is_up else "up_lidar" + self.logger.info(f"Lidar unit split: laser_number<32 -> {unit_for_lo}, >=32 -> {unit_for_hi}") + + # Build a structured VLP-32C model per unit from the first sweep. AV2 has no + # native firing-column index, but offset_ns + laser_number reconstruct it (one + # firing column per VLP-32C revolution; laser_number selects the beam/row). The + # firing geometry (elevations, laser->row map, per-column azimuths, per-row + # azimuth offsets, spin direction) is derived empirically per log from the + # DECOMPENSATED reference sweep -- decompensation is essential, since the raw + # motion-compensated azimuths are smeared by ego motion (~0.5 deg) which would + # otherwise dominate the model error. + unit_geometry: Dict[str, Vlp32cGeometry] = {} + unit_model: Dict[str, RowOffsetStructuredSpinningLidarModelParameters] = {} + if self._lidar_model_source == "empirical": + first_ref_ts_us = _ns_to_us(int(first_sweep.timestamp_ns)) + first_pt_ts_us = ((first_sweep.timestamp_ns + first_sweep.offset_ns) // NS_PER_US).astype(np.uint64) + first_start_us = min(first_ref_ts_us, int(first_pt_ts_us.min())) + first_end_us = max(int(first_pt_ts_us.max()), first_start_us + 1) + for unit_id, mask_first in ( + (unit_for_lo, first_sweep.laser_number < 32), + (unit_for_hi, first_sweep.laser_number >= 32), + ): + if unit_id not in lidar_ids or not mask_first.any(): + continue + laser_in_unit = (first_sweep.laser_number[mask_first] % VLP32C_N_BEAMS).astype(np.int64) + offset_in_unit = first_sweep.offset_ns[mask_first] + + # rig -> unit sensor frame, then decompensate to per-point time so the + # firing geometry is clean (un-smeared by ego motion). + T_rig_unit = se3_inverse(T_unit_rig[unit_id]) + xyz_sensor_first = (T_rig_unit[:3, :3] @ first_sweep.xyz[mask_first].T).T + T_rig_unit[:3, 3] + xyz_decomp_first = compensators[unit_id].motion_decompensate_points( + sensor_id=unit_id, + xyz_reftime=xyz_sensor_first, + timestamp_us=first_pt_ts_us[mask_first], + frame_start_timestamp_us=first_start_us, + frame_end_timestamp_us=first_end_us, + reference_timestamp_us=first_ref_ts_us, + ) + + geometry = derive_vlp32c_geometry( + xyz_decompensated=xyz_decomp_first.astype(np.float64), + laser_number_in_unit=laser_in_unit, + offset_ns=offset_in_unit, + ) + model = build_vlp32c_model(geometry) + unit_geometry[unit_id] = geometry + unit_model[unit_id] = model + self.logger.info( + f"Derived VLP-32C model for {unit_id}: {geometry.n_columns} columns, " + f"spin {geometry.spinning_direction}, " + f"column period {geometry.column_period_ns / 1000.0:.2f} us" + ) + + for ts_ns in tqdm.tqdm(sweep_ts_ns, desc="Process lidar"): + sweep = read_lidar_sweep(log_dir / "sensors" / "lidar" / f"{ts_ns}.feather") + + # Per-point absolute time = sweep_ts + offset; convert to microseconds. + # The sweep reference timestamp (the filename) is the frame start and + # the egomotion-compensation reference for all points in the sweep. + reference_ts_us = _ns_to_us(int(sweep.timestamp_ns)) + point_ts_us = ((sweep.timestamp_ns + sweep.offset_ns) // NS_PER_US).astype(np.uint64) + + frame_start_us = min(reference_ts_us, int(point_ts_us.min())) + frame_end_us = int(point_ts_us.max()) + if frame_end_us <= frame_start_us: + frame_end_us = frame_start_us + 1 + + lo = sweep.laser_number < 32 + unit_masks = {unit_for_lo: lo, unit_for_hi: ~lo} + + for unit_id in lidar_ids: + mask = unit_masks[unit_id] + if not mask.any(): + continue + + xyz_rig = sweep.xyz[mask] # AV2 egovehicle frame == NCore rig frame + ts_unit = point_ts_us[mask] + intensity_unit = sweep.intensity[mask] + + # rig -> unit sensor frame (points are compensated to the sweep + # reference time, so this is the sensor reference-time frame). + T_rig_unit = se3_inverse(T_unit_rig[unit_id]) + xyz_sensor = (T_rig_unit[:3, :3] @ xyz_rig.T).T + T_rig_unit[:3, 3] + + # Decompensate from the reference-time sensor frame (the sweep start, + # AV2's compensation reference) to each point's own measurement time. + xyz_raw = compensators[unit_id].motion_decompensate_points( + sensor_id=unit_id, + xyz_reftime=xyz_sensor, + timestamp_us=ts_unit, + frame_start_timestamp_us=frame_start_us, + frame_end_timestamp_us=frame_end_us, + reference_timestamp_us=reference_ts_us, + ) + + distance_m = np.linalg.norm(xyz_raw, axis=1) + direction = np.zeros_like(xyz_raw) + nonzero = distance_m > 0 + direction[nonzero] = xyz_raw[nonzero] / distance_m[nonzero, np.newaxis] + + # Per-point structured-model element (row, column) from beam + firing + # time, re-aligned to the model by this frame's azimuth phase. + model_element = None + if unit_id in unit_geometry: + model_element = reconstruct_model_elements( + laser_number_in_unit=(sweep.laser_number[mask] % VLP32C_N_BEAMS).astype(np.int64), + offset_ns=sweep.offset_ns[mask], + geometry=unit_geometry[unit_id], + xyz_decompensated=xyz_raw, + ) + + lidar_writers[unit_id].store_frame( + direction=direction.astype(np.float32), + timestamp_us=ts_unit, + model_element=model_element, + distance_m=distance_m.reshape(1, -1), + intensity=intensity_unit.reshape(1, -1), + frame_timestamps_us=np.array([frame_start_us, frame_end_us], dtype=np.uint64), + generic_data={}, + generic_meta_data={}, + ) + + # Store the structured lidar model as intrinsics for each unit. + for unit_id, model in unit_model.items(): + intrinsics_writer.store_lidar_intrinsics(lidar_id=unit_id, lidar_model_parameters=model) + + # ------------------------------------------------------------------------- + # Cameras + # ------------------------------------------------------------------------- + + def _decode_cameras( + self, + log_dir: UPath, + store_writer: SequenceComponentGroupsWriter, + poses_writer: PosesComponent.Writer, + intrinsics_writer: IntrinsicsComponent.Writer, + masks_writer: MasksComponent.Writer, + component_groups: ComponentGroupAssignments, + camera_ids: List[str], + ego_se3_sensor: Dict[str, np.ndarray], + ) -> None: + """Decode and store all camera frames. + + AV2 imagery is shipped already undistorted, so global shutter is assumed and + the stored model is a pinhole with zero distortion coefficients. The original + lens radial-distortion coefficients ``(k1, k2, k3)`` are preserved per camera + in the camera component ``generic_meta_data`` (``av2_original_distortion``). + """ + intrinsics = read_intrinsics(log_dir) + + for camera_id in camera_ids: + cam_ts_ns = list_sensor_timestamps(log_dir, "cameras", camera_id) + if not cam_ts_ns: + self.logger.warning(f"No data for camera {camera_id}") + continue + + self.logger.info(f"Processing camera {camera_id}") + + # Extrinsic: camera -> rig (ego) + T_cam_rig = ego_se3_sensor[camera_id].astype(np.float32) + poses_writer.store_static_pose(source_frame_id=camera_id, target_frame_id="rig", pose=T_cam_rig) + + camera_intrinsics = intrinsics[camera_id] + intrinsics_writer.store_camera_intrinsics( + camera_id=camera_id, + camera_model_parameters=camera_intrinsics.model, + ) + + masks_writer.store_camera_masks(camera_id=camera_id, mask_images={}) + + # Preserve the original AV2 lens radial-distortion coefficients as + # provenance. The released imagery is already undistorted (so the stored + # model is a distortion-free ideal pinhole), but the raw (k1, k2, k3) + # describe the original lens and would otherwise be lost. + k1, k2, k3 = camera_intrinsics.original_distortion_k1k2k3 + camera_writer = store_writer.register_component_writer( + CameraSensorComponent.Writer, + component_instance_name=camera_id, + group_name=component_groups.camera_component_groups.get(camera_id), + generic_meta_data={ + "av2_original_distortion": {"k1": k1, "k2": k2, "k3": k3}, + }, + ) + + camera_dir = log_dir / "sensors" / "cameras" / camera_id + for ts_ns in tqdm.tqdm(cam_ts_ns, desc=f"Process {camera_id}"): + image_path = camera_dir / f"{ts_ns}.jpg" + with image_path.open("rb") as f: + image_binary = f.read() + + frame_ts = _ns_to_us(ts_ns) + camera_writer.store_frame( + image_binary_data=image_binary, + image_format="jpeg", + frame_timestamps_us=np.array([frame_ts, frame_ts], dtype=np.uint64), + generic_data={}, + generic_meta_data={}, + ) + + self.logger.info(f"Processed {len(camera_ids)} cameras") + + # ------------------------------------------------------------------------- + # Cuboid annotations + # ------------------------------------------------------------------------- + + def _decode_cuboids( + self, + log_dir: UPath, + store_writer: SequenceComponentGroupsWriter, + component_groups: ComponentGroupAssignments, + ) -> None: + """Decode AV2 3D annotations and store as cuboid track observations. + + AV2 cuboids are native to the egovehicle frame at the sweep reference + timestamp. We store them in that native frame -- ``rig`` at the sweep + timestamp -- without baking in any ego pose. This is lossless and, unlike + baking the cuboids into a static world frame, keeps the egovehicle motion + out of the stored coordinates so it remains swappable downstream (a V4 + feature): the pose graph places the cuboids using whatever ego trajectory + is active. + + Lidar points are decompensated to their own per-point time, but a cuboid is + a single object pose at the sweep reference time, so referencing it to + ``rig`` at that timestamp is exactly correct -- the pose graph evaluates the + rig pose at the cuboid timestamp when transforming. + """ + annotations_path = log_dir / "annotations.feather" + if not annotations_path.exists(): + self.logger.info("No annotations.feather found (test split)") + return + + cols = read_annotations(log_dir) + n = len(cols["category"]) + + cuboid_observations: List[CuboidTrackObservation] = [] + for i in tqdm.tqdm(range(n), total=n, desc="Process cuboids"): + category = str(cols["category"][i]) + if category not in AV2_CATEGORY_MAP: + continue + + timestamp_us = _ns_to_us(int(cols["timestamp_ns"][i])) + + # Convert the full AV2 orientation quaternion (scalar-first wxyz) to the + # BBox3 "xyz" intrinsic-Euler convention. Keep the full 3-DOF rotation -- + # AV2 cuboids can carry roll/pitch (e.g. objects on slopes/banked roads), + # so extracting yaw only would silently drop that. + quat_wxyz = (cols["qw"][i], cols["qx"][i], cols["qy"][i], cols["qz"][i]) + rx, ry, rz = R.from_quat(quat_wxyz, scalar_first=True).as_euler("xyz", degrees=False) + + # AV2 cuboids are in the egovehicle (rig) frame at the sweep reference + # time. length_m -> x extent, width_m -> y extent, height_m -> z extent. + bbox3 = BBox3.from_array( + np.array( + [ + cols["tx_m"][i], + cols["ty_m"][i], + cols["tz_m"][i], + cols["length_m"][i], + cols["width_m"][i], + cols["height_m"][i], + rx, + ry, + rz, + ], + dtype=np.float32, + ) + ) + + cuboid_observations.append( + CuboidTrackObservation( + track_id=str(cols["track_uuid"][i]), + class_id=AV2_CATEGORY_MAP[category], + timestamp_us=timestamp_us, + reference_frame_id="rig", + reference_frame_timestamp_us=timestamp_us, + bbox3=bbox3, + source=LabelSource.EXTERNAL, + ) + ) + + if cuboid_observations: + store_writer.register_component_writer( + CuboidsComponent.Writer, + "default", + component_groups.cuboid_track_observations_component_group, + ).store_observations(cuboid_observations) + self.logger.info(f"Stored {len(cuboid_observations)} cuboid observations") + else: + self.logger.info("No mapped cuboid annotations found") + + +# ----------------------------------------------------------------------------- +# CLI +# ----------------------------------------------------------------------------- + + +@cli.command(name="argoverse2-v4") +@click.option( + "--split", + type=str, + default="val", + show_default=True, + help="Argoverse 2 split directory under --root-dir (e.g. train, val, test)", +) +@click.option( + "--log-id", + type=str, + default=None, + help="Convert only the log with this ID (defaults to all logs in the split)", +) +@click.option( + "--store-type", + type=click.Choice(["itar", "directory"], case_sensitive=False), + default="itar", + show_default=True, + help="Output store type", +) +@click.option( + "component_group_profile", + "--profile", + type=click.Choice(["default", "separate-sensors", "separate-all"], case_sensitive=False), + default="separate-sensors", + show_default=True, + help="Output profile for component group assignment", +) +@click.option( + "store_sequence_meta", + "--sequence-meta/--no-sequence-meta", + default=True, + help="Generate sequence meta-data JSON?", +) +@click.option( + "lidar_model_source", + "--lidar-model-source", + type=click.Choice(["empirical", "none"], case_sensitive=False), + default="empirical", + show_default=True, + help="Lidar model: 'empirical' derives a VLP-32C structured model per unit (model " + "elements + intrinsics) from the data; 'none' stores raw ray bundles only.", +) +@click.pass_context +def argoverse2_v4(ctx, split, log_id, **kwargs): + """Argoverse 2 Sensor Dataset conversion (V4 format)""" + + config = Argoverse2Converter4Config(**{**vars(ctx.obj), "split": split, "log_id": log_id, **kwargs}) + + Argoverse2Converter4.convert(config) diff --git a/tools/data_converter/argoverse2/converter_test.py b/tools/data_converter/argoverse2/converter_test.py new file mode 100644 index 00000000..c70bc25e --- /dev/null +++ b/tools/data_converter/argoverse2/converter_test.py @@ -0,0 +1,379 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Integration tests for the Argoverse 2 data converter (V4 format). + +Requires the AV2_DIR environment variable pointing to an Argoverse 2 Sensor +Dataset root directory organised as ``{AV2_DIR}/{split}/{log_id}/...``. + +Set AV2_SPLIT to override the default split (``val``). The first log in the split +is used for testing. +""" + +import os +import tempfile +import unittest + +from typing import Literal, cast + +import numpy as np + +from parameterized import parameterized_class +from upath import UPath + +from ncore.impl.data.types import ( + IdealPinholeCameraModelParameters, + RowOffsetStructuredSpinningLidarModelParameters, + ShutterType, +) +from ncore.impl.data.v4.components import ( + CameraSensorComponent, + CuboidsComponent, + IntrinsicsComponent, + LidarSensorComponent, + PosesComponent, + RadarSensorComponent, + SequenceComponentGroupsReader, +) +from tools.data_converter.argoverse2.converter import Argoverse2Converter4, Argoverse2Converter4Config +from tools.data_converter.argoverse2.utils import CAMERA_NAMES, LIDAR_NAMES, list_log_ids + + +@parameterized_class( + ("store_type",), + [ + ("itar",), + ("directory",), + ], +) +class TestArgoverse2Converter(unittest.TestCase): + """Integration tests for the Argoverse 2 data converter. + + Requires AV2_DIR environment variable pointing to an Argoverse 2 Sensor Dataset + root. Uses the first log in the split for testing. + """ + + store_type: Literal["itar", "directory"] + + @classmethod + def setUpClass(cls): + cls.av2_dir = os.environ.get("AV2_DIR") + if cls.av2_dir is None: + raise unittest.SkipTest("AV2_DIR not set -- skipping Argoverse 2 integration tests") + + cls.split = os.environ.get("AV2_SPLIT", "val") + + log_ids = list_log_ids(UPath(cls.av2_dir) / cls.split) + assert log_ids, f"No logs found under {cls.av2_dir}/{cls.split}" + cls.log_id = log_ids[0] + + cls._tempdir = tempfile.TemporaryDirectory(prefix="argoverse2_test_") + cls.output_dir = cls._tempdir.name + + config = Argoverse2Converter4Config( + root_dir=cls.av2_dir, + output_dir=cls.output_dir, + no_cameras=False, + camera_ids=None, + no_lidars=False, + lidar_ids=None, + no_radars=False, + radar_ids=None, + verbose=False, + debug=False, + debug_port=5678, + split=cls.split, + log_id=cls.log_id, + store_type=cls.store_type, + component_group_profile="separate-sensors", + store_sequence_meta=True, + ) + Argoverse2Converter4.convert(config) + + seq_dirs = [d for d in UPath(cls.output_dir).iterdir() if d.is_dir()] + assert len(seq_dirs) == 1, f"Expected 1 sequence dir, found {len(seq_dirs)}: {seq_dirs}" + cls.seq_dir = seq_dirs[0] + + meta_files = list(cls.seq_dir.glob("*.json")) + assert len(meta_files) == 1, f"Expected 1 meta JSON, found {len(meta_files)}" + cls.reader = SequenceComponentGroupsReader([meta_files[0]]) + + @classmethod + def tearDownClass(cls): + cls._tempdir.cleanup() + + # --- Poses ---------------------------------------------------------------- + + def test_sequence_has_dynamic_rig_to_world_pose(self): + poses_readers = self.reader.open_component_readers(PosesComponent.Reader) + self.assertEqual(len(poses_readers), 1) + poses_reader = list(poses_readers.values())[0] + + poses, timestamps = poses_reader.get_dynamic_pose("rig", "world") + self.assertEqual(poses.shape[1:], (4, 4)) + self.assertGreater(poses.shape[0], 0) + self.assertEqual(timestamps.shape[0], poses.shape[0]) + + def test_sequence_has_static_world_to_world_global(self): + """world_global is the AV2 city frame; verify the static anchor exists.""" + poses_readers = self.reader.open_component_readers(PosesComponent.Reader) + poses_reader = list(poses_readers.values())[0] + + static_poses = dict(poses_reader.get_static_poses()) + self.assertIn(("world", "world_global"), static_poses) + self.assertEqual(static_poses[("world", "world_global")].shape, (4, 4)) + + def test_first_real_pose_near_identity(self): + """The anchored ego pose is stored as relative identity in the trajectory. + + The first pose's city_SE3_egovehicle is the world_global anchor, so its + relative rig -> world pose must be (near) identity. Boundary extrapolation + may prepend an extra pose, so we locate the identity pose rather than + assuming a fixed index. + """ + poses_readers = self.reader.open_component_readers(PosesComponent.Reader) + poses_reader = list(poses_readers.values())[0] + + poses, _ = poses_reader.get_dynamic_pose("rig", "world") + deviations = np.linalg.norm(poses - np.eye(4, dtype=np.float32), axis=(1, 2)) + np.testing.assert_array_almost_equal(poses[int(np.argmin(deviations))], np.eye(4, dtype=np.float32), decimal=3) + + # --- Cameras -------------------------------------------------------------- + + def test_nine_cameras_exist(self): + camera_readers = self.reader.open_component_readers(CameraSensorComponent.Reader) + self.assertEqual(set(camera_readers.keys()), set(CAMERA_NAMES)) + for cam_id, cam_reader in camera_readers.items(): + self.assertGreater(cam_reader.frames_count, 0, f"{cam_id} should have frames") + + def test_original_distortion_coefficients_preserved_in_metadata(self): + # The released imagery is undistorted (so the stored model is distortion-free), + # but the original lens k1/k2/k3 are preserved per camera as provenance. + camera_readers = self.reader.open_component_readers(CameraSensorComponent.Reader) + for cam_id, cam_reader in camera_readers.items(): + meta = cam_reader.generic_meta_data + self.assertIn("av2_original_distortion", meta, f"{cam_id} missing distortion provenance") + distortion = meta["av2_original_distortion"] + self.assertEqual(set(distortion), {"k1", "k2", "k3"}, f"{cam_id} distortion keys") + for key, value in distortion.items(): + self.assertIsInstance(value, float, f"{cam_id} {key} should be a float") + + def test_camera_intrinsics_ideal_pinhole_global_shutter(self): + intrinsics_readers = self.reader.open_component_readers(IntrinsicsComponent.Reader) + self.assertEqual(len(intrinsics_readers), 1) + intrinsics_reader = list(intrinsics_readers.values())[0] + + for cam_id in CAMERA_NAMES: + params = intrinsics_reader.get_camera_model_parameters(cam_id) + # AV2 imagery is shipped undistorted, so an ideal (distortion-free) + # pinhole is the exact model. + self.assertIsInstance(params, IdealPinholeCameraModelParameters) + params = cast(IdealPinholeCameraModelParameters, params) + self.assertEqual(params.shutter_type, ShutterType.GLOBAL) + self.assertTrue(np.all(params.focal_length > 0)) + + def test_camera_extrinsics_stored_as_static_poses(self): + poses_readers = self.reader.open_component_readers(PosesComponent.Reader) + poses_reader = list(poses_readers.values())[0] + + static_poses = dict(poses_reader.get_static_poses()) + for cam_id in CAMERA_NAMES: + self.assertIn((cam_id, "rig"), static_poses) + + # --- Lidar ---------------------------------------------------------------- + + def test_two_lidar_units_exist(self): + lidar_readers = self.reader.open_component_readers(LidarSensorComponent.Reader) + self.assertEqual(set(lidar_readers.keys()), set(LIDAR_NAMES)) + for lidar_id, lidar_reader in lidar_readers.items(): + self.assertGreater(lidar_reader.frames_count, 0, f"{lidar_id} should have frames") + + def test_lidar_extrinsics_stored_as_static_poses(self): + poses_readers = self.reader.open_component_readers(PosesComponent.Reader) + poses_reader = list(poses_readers.values())[0] + + static_poses = dict(poses_reader.get_static_poses()) + for lidar_id in LIDAR_NAMES: + self.assertIn((lidar_id, "rig"), static_poses) + + def test_lidar_directions_unit_norm(self): + lidar_readers = self.reader.open_component_readers(LidarSensorComponent.Reader) + lidar_reader = lidar_readers["up_lidar"] + ts = int(lidar_reader.frames_timestamps_us[0, 1]) # end-of-frame timestamp key + direction = lidar_reader.get_frame_ray_bundle_data(ts, "direction") + norms = np.linalg.norm(direction, axis=1) + # Zero-distance rays may have zero direction; check the populated ones. + nonzero = norms > 0 + np.testing.assert_allclose(norms[nonzero], 1.0, atol=1e-4) + + def test_lidar_unit_split_recovered_from_geometry(self): + """The two units carry comparable point counts (~half the sweep each). + + Each VLP-32C contributes 32 of the 64 beams, so a correct split yields + roughly balanced point counts per unit (allowing for differing FOV + occupancy). + """ + lidar_readers = self.reader.open_component_readers(LidarSensorComponent.Reader) + counts = {} + for unit in ("up_lidar", "down_lidar"): + reader = lidar_readers[unit] + ts = int(reader.frames_timestamps_us[0, 1]) + counts[unit] = len(reader.get_frame_ray_bundle_data(ts, "direction")) + ratio = min(counts.values()) / max(counts.values()) + self.assertGreater(ratio, 0.5, f"Lidar unit point counts unbalanced: {counts}") + + # --- Lidar structured model ----------------------------------------------- + + def test_lidar_intrinsics_vlp32c_model_per_unit(self): + """Each unit stores a VLP-32C structured model (32 rows) as intrinsics.""" + intrinsics_reader = list(self.reader.open_component_readers(IntrinsicsComponent.Reader).values())[0] + for unit in LIDAR_NAMES: + model = intrinsics_reader.get_lidar_model_parameters(unit) + self.assertIsInstance(model, RowOffsetStructuredSpinningLidarModelParameters) + model = cast(RowOffsetStructuredSpinningLidarModelParameters, model) + self.assertEqual(model.n_rows, 32) + self.assertGreater(model.n_columns, 100) + self.assertIn(model.spinning_direction, ("cw", "ccw")) + + def test_lidar_model_elements_in_bounds(self): + """Stored per-point model elements index valid (row, column) cells.""" + intrinsics_reader = list(self.reader.open_component_readers(IntrinsicsComponent.Reader).values())[0] + lidar_readers = self.reader.open_component_readers(LidarSensorComponent.Reader) + for unit in LIDAR_NAMES: + model = cast( + RowOffsetStructuredSpinningLidarModelParameters, + intrinsics_reader.get_lidar_model_parameters(unit), + ) + reader = lidar_readers[unit] + ts = int(reader.frames_timestamps_us[0, 1]) + elem = reader.get_frame_ray_bundle_data(ts, "model_element") + self.assertEqual(elem.shape[1], 2) + self.assertEqual(elem.dtype, np.uint16) + self.assertTrue(np.all(elem[:, 0] < model.n_rows), "row index out of bounds") + self.assertTrue(np.all(elem[:, 1] < model.n_columns), "column index out of bounds") + + def test_lidar_model_reconstructs_directions(self): + """Model-predicted directions match stored native directions (far-range). + + Validates the firing-pattern reconstruction: the structured model, indexed + by the stored per-point (row, column), should reproduce the stored ray + directions to within a small angular error for far-range returns. + """ + from ncore.impl.sensors.lidar import StructuredLidarModel + + intrinsics_reader = list(self.reader.open_component_readers(IntrinsicsComponent.Reader).values())[0] + lidar_readers = self.reader.open_component_readers(LidarSensorComponent.Reader) + for unit in LIDAR_NAMES: + model_params = cast( + RowOffsetStructuredSpinningLidarModelParameters, + intrinsics_reader.get_lidar_model_parameters(unit), + ) + reader = lidar_readers[unit] + ts = int(reader.frames_timestamps_us[0, 1]) + direction = reader.get_frame_ray_bundle_data(ts, "direction") + elem = reader.get_frame_ray_bundle_data(ts, "model_element") + distance = np.asarray(reader._get_ray_bundle_returns_group(ts)["distance_m"])[0] + + far = np.isfinite(distance) & (distance > 20.0) & (np.linalg.norm(direction, axis=1) > 0) + self.assertGreater(int(far.sum()), 100, f"{unit}: too few far returns to validate") + + model = StructuredLidarModel.maybe_from_parameters(model_params, device="cpu") + assert model is not None + predicted = model.elements_to_sensor_points(elem[far], np.ones(int(far.sum()), dtype=np.float32)) + predicted = predicted.cpu().numpy() + predicted /= np.linalg.norm(predicted, axis=1, keepdims=True) + cos = np.clip(np.sum(predicted * direction[far], axis=1), -1.0, 1.0) + median_err_deg = float(np.degrees(np.median(np.arccos(cos)))) + # The structured model is reconstructed from offset_ns + laser_number on the + # decompensated cloud, with empirical per-row azimuth offsets. This yields + # sub-0.1 deg far-range reconstruction (on par with native-column sensors). + self.assertLess(median_err_deg, 0.2, f"{unit}: model direction error {median_err_deg:.3f} deg too high") + + # --- No radar ------------------------------------------------------------- + + def test_no_radar(self): + radar_readers = self.reader.open_component_readers(RadarSensorComponent.Reader) + self.assertEqual(len(radar_readers), 0) + + # --- Cuboids -------------------------------------------------------------- + + def test_cuboids_in_rig_frame(self): + """Cuboids are stored in the native ``rig`` frame (no ego pose baked in).""" + cuboid_readers = self.reader.open_component_readers(CuboidsComponent.Reader) + if not cuboid_readers: + self.skipTest("No cuboids (test split)") + cuboid_reader = list(cuboid_readers.values())[0] + observations = list(cuboid_reader.get_observations()) + self.assertGreater(len(observations), 0) + for obs in observations[:50]: + self.assertEqual(obs.reference_frame_id, "rig") + + def test_cuboids_align_with_lidar(self): + """A reasonable fraction of lidar points fall inside annotated cuboids. + + This is the regression guard for the lidar decompensation reference bug: if + the points were decompensated against the wrong reference (or the cuboids + mis-referenced), almost no points would land inside the boxes. We transform + the stored (decompensated) first-frame lidar points to ``world`` via their + own per-point rig pose, transform the active cuboids from ``rig`` at the + sweep timestamp to ``world``, and count points inside. + """ + from ncore.impl.common.transformations import is_within_3d_bboxes, transform_bbox + + cuboid_readers = self.reader.open_component_readers(CuboidsComponent.Reader) + if not cuboid_readers: + self.skipTest("No cuboids (test split)") + + poses_reader = list(self.reader.open_component_readers(PosesComponent.Reader).values())[0] + lidar_reader = self.reader.open_component_readers(LidarSensorComponent.Reader)["up_lidar"] + frame_start_us, frame_end_us = (int(v) for v in lidar_reader.frames_timestamps_us[0]) + ts = frame_end_us # reader frame key is the end-of-frame timestamp + + # The cuboid reference timestamp is the AV2 sweep reference time, which is + # the start of the point window (offset_ns runs forward from it). + cuboid_ts = frame_start_us + + # Each lidar point is in its own per-point-time sensor frame; transform via + # sensor -> rig (static) and rig -> world at the point's own timestamp. + static = dict(poses_reader.get_static_poses()) + T_up_rig = static[("up_lidar", "rig")] + rig_poses, pose_ts = poses_reader.get_dynamic_pose("rig", "world") + + direction = lidar_reader.get_frame_ray_bundle_data(ts, "direction") + distance = np.asarray(lidar_reader._get_ray_bundle_returns_group(ts)["distance_m"])[0] + point_ts = lidar_reader.get_frame_ray_bundle_data(ts, "timestamp_us").astype(np.int64) + valid = np.isfinite(distance) & (distance > 0) + pts_sensor = direction[valid] * distance[valid, None] + pts_rig = (T_up_rig[:3, :3] @ pts_sensor.T).T + T_up_rig[:3, 3] + # Per-point rig -> world using the nearest stored pose (poses are dense). + nearest = np.searchsorted(pose_ts.astype(np.int64), point_ts[valid]).clip(0, len(rig_poses) - 1) + T_pts = rig_poses[nearest] + pts_world = np.einsum("nij,nj->ni", T_pts[:, :3, :3], pts_rig) + T_pts[:, :3, 3] + + observations = list(list(cuboid_readers.values())[0].get_observations()) + # cuboids active at this sweep (cuboid ref ts == sweep start) + active = [o for o in observations if abs(o.reference_frame_timestamp_us - cuboid_ts) < 2000] + self.assertGreater(len(active), 0, "no cuboids active at first lidar frame") + + # Cuboids are in rig at the sweep timestamp; bring them to world via the + # rig pose at that timestamp. + cuboid_pose_idx = int(np.argmin(np.abs(pose_ts.astype(np.int64) - cuboid_ts))) + T_rig_world_cuboid = rig_poses[cuboid_pose_idx] + boxes = np.stack([transform_bbox(o.bbox3.to_array().astype(np.float64), T_rig_world_cuboid) for o in active]) + inside = is_within_3d_bboxes(pts_world.astype(np.float64), boxes.astype(np.float64)) + n_inside = int(inside.any(axis=1).sum()) + self.assertGreater( + n_inside, 50, f"only {n_inside} lidar points inside any cuboid -- likely a frame/timestamp shift" + ) diff --git a/tools/data_converter/argoverse2/main.py b/tools/data_converter/argoverse2/main.py new file mode 100644 index 00000000..aa60f7c4 --- /dev/null +++ b/tools/data_converter/argoverse2/main.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Argoverse 2 converter CLI entry point.""" + +from tools.data_converter.argoverse2.converter import argoverse2_v4 # noqa: F401 -- registers CLI command +from tools.data_converter.cli import cli + + +if __name__ == "__main__": + cli(show_default=True) diff --git a/tools/data_converter/argoverse2/utils.py b/tools/data_converter/argoverse2/utils.py new file mode 100644 index 00000000..c8b7f93d --- /dev/null +++ b/tools/data_converter/argoverse2/utils.py @@ -0,0 +1,661 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Argoverse 2 specific utilities for the NCore V4 converter. + +This module reads the Argoverse 2 Sensor Dataset directly from its on-disk +Apache Feather files using ``pyarrow`` only, deliberately avoiding the heavy +``av2`` devkit (which pulls in torch, kornia, numba, polars and PyAV). Quaternions +are converted with ``scipy.spatial.transform.Rotation`` (already an ncore +dependency), so no extra runtime dependency is introduced. + +Reference (sourced from github.com/argoverse/av2-api and the AV2 User Guide): + +- Lidar sweeps are *egomotion-compensated* to the sweep reference timestamp and + stored in the **egovehicle** frame (not the individual sensor frame). The + feather columns are ``x, y, z, intensity, laser_number, offset_ns``. + Per-point absolute time is ``sweep_timestamp_ns + offset_ns``. +- The released imagery is **already undistorted**, so a pinhole model with zero + distortion is exact and **global shutter is assumed** on that basis. The + original lens radial-distortion coefficients ``(k1, k2, k3)`` are returned by + :func:`read_intrinsics` for provenance but are never applied. +- All quaternions are scalar-first ``(qw, qx, qy, qz)``. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Dict, List, Literal, Optional + +import numpy as np +import pyarrow.feather as feather + +from scipy.spatial.transform import Rotation as R +from upath import UPath + +from ncore.impl.common.transformations import se3_inverse +from ncore.impl.data.types import ( + IdealPinholeCameraModelParameters, + RowOffsetStructuredSpinningLidarModelParameters, + ShutterType, +) +from tools.data_converter.structured_lidar_model import enforce_spinning_monotonic + + +# --- Feather reading (no pandas) ----------------------------------------------- +# We read Arrow tables directly and pull columns out as numpy arrays. This avoids +# pulling pandas into the dependency closure (pyarrow.read_feather would default to +# a pandas DataFrame). + + +def _read_columns(path: UPath) -> Dict[str, np.ndarray]: + """Read a feather file into a ``column_name -> numpy array`` mapping.""" + table = feather.read_table(str(path)) + return {name: table.column(name).to_numpy(zero_copy_only=False) for name in table.column_names} + + +# --- Sensor ID mappings -------------------------------------------------------- +# Argoverse 2 sensor names are already descriptive; we keep them verbatim as the +# NCore sensor IDs so that any downstream alignment with AV2 map / metadata stays +# unambiguous. + +# All nine global-shutter cameras (7 ring + 2 stereo). +CAMERA_NAMES: List[str] = [ + "ring_front_center", + "ring_front_left", + "ring_front_right", + "ring_side_left", + "ring_side_right", + "ring_rear_left", + "ring_rear_right", + "stereo_front_left", + "stereo_front_right", +] + +# The two stacked Velodyne VLP-32C units. +LIDAR_NAMES: List[str] = ["up_lidar", "down_lidar"] + +# Number of beams per VLP-32C unit. laser_number spans [0, 63] across both units. +VLP32C_N_BEAMS: int = 32 + +# VLP-32C spins at 10 Hz (~100 ms per revolution). Both stacked units share this. +VLP32C_SCAN_DURATION_US: int = 100_000 +VLP32C_SPINNING_FREQUENCY_HZ: float = 10.0 +# Note: the apparent spin direction in a unit's own frame is detected per unit +# (the two stacked units fire in opposite phase), not assumed. + +# AV2 ships no radar. + +# --- Annotation taxonomy ------------------------------------------------------- +# Argoverse 2 3D cuboid categories (the 30-class `AnnotationCategories` taxonomy) +# mapped to NCore class IDs. AV2 category strings are upper snake-case. +AV2_CATEGORY_MAP: Dict[str, str] = { + "REGULAR_VEHICLE": "car", + "LARGE_VEHICLE": "truck", + "BOX_TRUCK": "truck", + "TRUCK": "truck", + "TRUCK_CAB": "truck", + "VEHICULAR_TRAILER": "trailer", + "SCHOOL_BUS": "bus", + "ARTICULATED_BUS": "bus", + "BUS": "bus", + "MESSAGE_BOARD_TRAILER": "trailer", + "RAILED_VEHICLE": "vehicle", + "MOTORCYCLE": "motorcycle", + "MOTORCYCLIST": "motorcyclist", + "BICYCLE": "bicycle", + "BICYCLIST": "bicyclist", + "WHEELED_DEVICE": "wheeled_device", + "WHEELED_RIDER": "wheeled_rider", + "PEDESTRIAN": "pedestrian", + "OFFICIAL_SIGNALER": "pedestrian", + "STROLLER": "stroller", + "WHEELCHAIR": "wheelchair", + "DOG": "animal", + "ANIMAL": "animal", + "CONSTRUCTION_CONE": "traffic_cone", + "CONSTRUCTION_BARREL": "barrier", + "STOP_SIGN": "stop_sign", + "BOLLARD": "bollard", + "SIGN": "sign", + "MOBILE_PEDESTRIAN_CROSSING_SIGN": "sign", + "TRAFFIC_LIGHT_TRAILER": "trailer", +} + + +# --- Pose / quaternion helpers ------------------------------------------------- + + +def se3_from_qwxyz_t(qw: float, qx: float, qy: float, qz: float, tx: float, ty: float, tz: float) -> np.ndarray: + """Build a 4x4 SE(3) matrix from a scalar-first quaternion and translation. + + Argoverse 2 stores all rotations as ``(qw, qx, qy, qz)``. + """ + T = np.eye(4, dtype=np.float64) + T[:3, :3] = R.from_quat((qw, qx, qy, qz), scalar_first=True).as_matrix() + T[:3, 3] = (tx, ty, tz) + return T + + +# --- Dataset layout / feather readers ------------------------------------------ + + +def list_log_ids(split_dir: UPath) -> List[str]: + """Return the sorted log IDs (sub-directory names) under a split directory.""" + return sorted(p.name for p in split_dir.iterdir() if p.is_dir()) + + +def read_city_se3_ego(log_dir: UPath) -> tuple[np.ndarray, np.ndarray]: + """Read ``city_SE3_egovehicle.feather`` (at the log root). + + Returns: + timestamps_ns: [N] uint64 sweep/pose timestamps (sorted ascending). + T_ego_city: [N, 4, 4] float64 poses (egovehicle -> city/global frame). + """ + cols = _read_columns(log_dir / "city_SE3_egovehicle.feather") + order = np.argsort(cols["timestamp_ns"]) + timestamps_ns = cols["timestamp_ns"][order].astype(np.uint64) + poses = np.stack( + [ + se3_from_qwxyz_t( + cols["qw"][i], + cols["qx"][i], + cols["qy"][i], + cols["qz"][i], + cols["tx_m"][i], + cols["ty_m"][i], + cols["tz_m"][i], + ) + for i in order + ] + ) + return timestamps_ns, poses + + +def read_ego_se3_sensor(log_dir: UPath) -> Dict[str, np.ndarray]: + """Read ``calibration/egovehicle_SE3_sensor.feather``. + + Returns a mapping ``sensor_name -> T_sensor_ego`` (4x4, sensor-frame point -> + egovehicle frame). + """ + cols = _read_columns(log_dir / "calibration" / "egovehicle_SE3_sensor.feather") + result: Dict[str, np.ndarray] = {} + for i, name in enumerate(cols["sensor_name"]): + result[str(name)] = se3_from_qwxyz_t( + cols["qw"][i], + cols["qx"][i], + cols["qy"][i], + cols["qz"][i], + cols["tx_m"][i], + cols["ty_m"][i], + cols["tz_m"][i], + ) + return result + + +@dataclass(frozen=True) +class CameraIntrinsics: + """An AV2 camera's ideal-pinhole model plus its original distortion provenance. + + AV2 ships already-undistorted imagery, so the converted ``model`` is a + distortion-free ideal pinhole. The original lens radial-distortion coefficients + ``(k1, k2, k3)`` from ``intrinsics.feather`` are kept here purely as provenance + (they describe the *raw* lens and are never applied to the released images). + """ + + model: IdealPinholeCameraModelParameters + original_distortion_k1k2k3: tuple[float, float, float] + + +def read_intrinsics( + log_dir: UPath, +) -> Dict[str, CameraIntrinsics]: + """Read ``calibration/intrinsics.feather`` into ideal-pinhole camera models. + + AV2 imagery is shipped already undistorted: the official av2 devkit projects + with the intrinsic matrix K only (``PinholeCamera.project_ego_to_img``) and its + ``Intrinsics`` dataclass does not even load the ``k1, k2, k3`` columns present + in the file. Those coefficients describe the *original* lens (for re-distorting + into the raw frame) and must not be applied to the released images, so an ideal + (distortion-free) pinhole is the exact model. Because the released imagery is + already undistorted, the cameras are modelled as global shutter. + + The raw ``k1, k2, k3`` coefficients are returned alongside the pinhole model so + callers can preserve them as provenance metadata (they are *not* applied to the + undistorted images). + """ + cols = _read_columns(log_dir / "calibration" / "intrinsics.feather") + result: Dict[str, CameraIntrinsics] = {} + for i, name in enumerate(cols["sensor_name"]): + model = IdealPinholeCameraModelParameters( + resolution=np.array([int(cols["width_px"][i]), int(cols["height_px"][i])], dtype=np.uint64), + shutter_type=ShutterType.GLOBAL, + external_distortion_parameters=None, + principal_point=np.array([cols["cx_px"][i], cols["cy_px"][i]], dtype=np.float32), + focal_length=np.array([cols["fx_px"][i], cols["fy_px"][i]], dtype=np.float32), + ) + result[str(name)] = CameraIntrinsics( + model=model, + original_distortion_k1k2k3=( + float(cols["k1"][i]), + float(cols["k2"][i]), + float(cols["k3"][i]), + ), + ) + return result + + +@dataclass(frozen=True) +class LidarSweep: + """A single AV2 lidar sweep, in the egovehicle frame. + + xyz are egomotion-compensated to ``timestamp_ns``; per-point absolute time is + ``timestamp_ns + offset_ns``. + """ + + xyz: np.ndarray # [N, 3] float32, egovehicle frame + intensity: np.ndarray # [N] float32 in [0, 1] + laser_number: np.ndarray # [N] uint8 in [0, 63] + offset_ns: np.ndarray # [N] int64, offset from sweep start + timestamp_ns: int # sweep reference timestamp (filename) + + +def read_lidar_sweep(path: UPath) -> LidarSweep: + """Read a single lidar sweep feather file (filename is the sweep timestamp).""" + cols = _read_columns(path) + timestamp_ns = int(UPath(path).stem) + return LidarSweep( + xyz=np.stack( + [ + cols["x"].astype(np.float32), + cols["y"].astype(np.float32), + cols["z"].astype(np.float32), + ], + axis=1, + ), + intensity=(cols["intensity"].astype(np.float32) / 255.0), + laser_number=cols["laser_number"].astype(np.uint8), + offset_ns=cols["offset_ns"].astype(np.int64), + timestamp_ns=timestamp_ns, + ) + + +def read_annotations(log_dir: UPath) -> Dict[str, np.ndarray]: + """Read ``annotations.feather`` into a column -> numpy array mapping.""" + return _read_columns(log_dir / "annotations.feather") + + +def list_sensor_timestamps(log_dir: UPath, sensor_kind: str, sensor_name: Optional[str] = None) -> List[int]: + """List the sorted nanosecond timestamps available for a sensor stream. + Args: + sensor_kind: ``"lidar"`` or ``"cameras"``. + sensor_name: camera name (required for ``"cameras"``; ignored for lidar). + """ + if sensor_kind == "lidar": + sensor_dir = log_dir / "sensors" / "lidar" + suffix = ".feather" + elif sensor_kind == "cameras": + assert sensor_name is not None, "sensor_name required for cameras" + sensor_dir = log_dir / "sensors" / "cameras" / sensor_name + suffix = ".jpg" + else: + raise ValueError(f"Unknown sensor_kind: {sensor_kind}") + + if not sensor_dir.exists(): + return [] + + return sorted(int(p.stem) for p in sensor_dir.iterdir() if p.name.endswith(suffix)) + + +def assign_lidar_units( + laser_number: np.ndarray, + xyz_ego: np.ndarray, + T_up_ego: np.ndarray, + T_down_ego: np.ndarray, +) -> Dict[str, np.ndarray]: + """Assign each point to ``up_lidar`` or ``down_lidar``. + + Argoverse 2 distributes a single aggregated sweep from two stacked Velodyne + VLP-32C units whose 64 beams share one ``laser_number`` range ``[0, 63]``. The + boundary that separates the two units is not documented in the AV2 devkit, so + we recover it from the geometry of the calibrated extrinsics. + + The two units are split into the two laser-number halves (``< 32`` and + ``>= 32``); empirically these are the two physical sensors (at any shared + ``offset_ns`` they point ~180 deg apart in the ego frame). To decide *which* + half is ``up_lidar`` vs ``down_lidar`` we use per-beam elevation flatness: a + single laser ring traces a cone of (nearly) constant elevation only in its own + sensor frame. Mapping a half into the wrong unit's extrinsic tilts that cone + (the two units differ in pitch/roll), inflating the per-ring elevation spread. + We pick the labelling that minimises the summed per-ring elevation spread, + which separates the two assignments by a wide, stable margin (~2-10x). + + Returns a mapping ``unit_name -> boolean point mask``. + """ + lo_mask = laser_number < VLP32C_N_BEAMS + hi_mask = ~lo_mask + + # Cost of assigning lo->up_unit and hi->down_unit (assignment A) vs swapped (B). + cost_a = _ring_elevation_spread( + laser_number, xyz_ego, np.arange(VLP32C_N_BEAMS), T_up_ego + ) + _ring_elevation_spread(laser_number, xyz_ego, np.arange(VLP32C_N_BEAMS, 2 * VLP32C_N_BEAMS), T_down_ego) + cost_b = _ring_elevation_spread( + laser_number, xyz_ego, np.arange(VLP32C_N_BEAMS), T_down_ego + ) + _ring_elevation_spread(laser_number, xyz_ego, np.arange(VLP32C_N_BEAMS, 2 * VLP32C_N_BEAMS), T_up_ego) + + if cost_b < cost_a: + return {"up_lidar": hi_mask, "down_lidar": lo_mask} + return {"up_lidar": lo_mask, "down_lidar": hi_mask} + + +def _ring_elevation_spread( + laser_number: np.ndarray, + xyz_ego: np.ndarray, + beams: np.ndarray, + T_unit_ego: np.ndarray, + min_valid_distance_m: float = 2.0, + min_ring_points: int = 10, +) -> float: + """Mean per-beam elevation standard deviation when ``beams`` are mapped to a unit. + + In the correct sensor frame each laser ring has near-constant elevation across + azimuth, so a tight per-ring elevation distribution indicates the correct + extrinsic. Returns the mean per-ring elevation std in degrees. + + Args: + min_valid_distance_m: ignore near-range returns when estimating elevation. + min_ring_points: minimum returns for a ring to contribute to the estimate. + """ + T_ego_unit = se3_inverse(T_unit_ego) + pts = (T_ego_unit[:3, :3] @ xyz_ego.T).T + T_ego_unit[:3, 3] + dist = np.linalg.norm(pts, axis=1) + + spreads: List[float] = [] + for beam in beams: + ring = (laser_number == beam) & (dist > min_valid_distance_m) + if int(ring.sum()) < min_ring_points: + continue + elev = np.degrees(np.arcsin(np.clip(pts[ring, 2] / dist[ring], -1.0, 1.0))) + spreads.append(float(np.std(elev))) + + return float(np.mean(spreads)) if spreads else float("inf") + + +# --- Structured VLP-32C lidar model -------------------------------------------- +# AV2 provides no native firing-column index, only per-point ``laser_number`` +# (0..31 within a unit) and ``offset_ns``. The two are a faithful proxy for the +# firing pattern: ``offset_ns`` quantizes into firing columns (one VLP-32C +# revolution at 10 Hz), and ``laser_number`` selects the beam (row). This lets us +# reconstruct the rows x columns structure required for a structured spinning +# lidar model and reuse the generic ``structured_lidar_model`` library. + + +@dataclass(frozen=True) +class Vlp32cGeometry: + """Per-unit VLP-32C geometry recovered from a reference sweep. + + Derived empirically (and stably) from the data rather than a hard-coded spec + table, so it self-corrects to the dataset's actual calibration. + """ + + elevations_rad: np.ndarray # [32] float32, model row order (elevation high -> low) + laser_to_row: np.ndarray # [32] int, maps laser_number (0..31) -> model row index + column_period_ns: float # firing-column period (one beam refire interval) + n_columns: int # upsampled columns per revolution (native_columns * resolution_factor) + spinning_direction: Literal["cw", "ccw"] # apparent spin in this unit's own frame + column_azimuths_rad: np.ndarray # [n_columns] float32, per-(upsampled-)column azimuth + row_azimuth_offsets_rad: np.ndarray # [32] float32, per-row azimuth offset from the column + resolution_factor: int # column upsampling factor (1 = native firing resolution) + + +def derive_vlp32c_geometry( + xyz_decompensated: np.ndarray, + laser_number_in_unit: np.ndarray, + offset_ns: np.ndarray, + min_valid_distance_m: float = 2.0, + far_range_m: float = 5.0, + n_refine_iterations: int = 5, + resolution_factor: int = 4, +) -> Vlp32cGeometry: + """Recover the VLP-32C firing geometry from one *decompensated* sweep. + + The input must be the decompensated point cloud (each point in the sensor frame + at its own measurement time). On the raw motion-compensated cloud the azimuths + are smeared by ego motion (~0.5 deg), which would dominate the model error; on + the decompensated cloud the firing geometry is clean (~0.05 deg). + + The model is ``azimuth(point) = column_azimuth[col] + row_azimuth_offset[row]``. + The 32 beams of a firing column are *not* co-azimuthal -- VLP-32C fires them + across a wide span -- so the per-row offset is essential (it is the dominant + structure, several degrees) and is fit empirically rather than assumed. + + Args: + xyz_decompensated: decompensated points in the unit's sensor frame, [N, 3]. + laser_number_in_unit: per-point beam index within the unit (0..31), [N]. + offset_ns: per-point nanosecond offset from the sweep start, [N]. + min_valid_distance_m: ignore near-range returns when estimating elevation. + far_range_m: distance threshold for azimuth fitting (near returns are noisy). + n_refine_iterations: alternating column/row-offset refinement passes. + """ + dist = np.linalg.norm(xyz_decompensated, axis=1) + valid = dist > min_valid_distance_m + elev = np.arcsin(np.clip(xyz_decompensated[valid, 2] / dist[valid], -1.0, 1.0)) + lasers = laser_number_in_unit[valid] + + # Median elevation per laser, then sort lasers by elevation (high -> low). + median_elev = np.full(VLP32C_N_BEAMS, np.nan, dtype=np.float64) + for laser in range(VLP32C_N_BEAMS): + sel = lasers == laser + if sel.any(): + median_elev[laser] = float(np.median(elev[sel])) + if np.isnan(median_elev).any(): + good = ~np.isnan(median_elev) + median_elev[~good] = np.interp(np.flatnonzero(~good), np.flatnonzero(good), median_elev[good]) + + laser_order_high_to_low = np.argsort(-median_elev) # laser indices, highest elevation first + elevations_rad = median_elev[laser_order_high_to_low].astype(np.float32) + laser_to_row = np.empty(VLP32C_N_BEAMS, dtype=np.int64) + laser_to_row[laser_order_high_to_low] = np.arange(VLP32C_N_BEAMS) + row = laser_to_row[laser_number_in_unit] + + column_period_ns, n_columns = _estimate_column_timing(laser_number_in_unit, offset_ns) + + o = offset_ns.astype(np.int64) + # Wrap modulo one revolution: the sweep slightly exceeds one revolution, and the + # overlap folds back onto early columns (same physical azimuth). + col = np.round((o - o.min()) / column_period_ns).astype(np.int64) % n_columns + az = np.arctan2(xyz_decompensated[:, 1], xyz_decompensated[:, 0]) + far = dist > far_range_m + + # Detect spin direction from the sign of azimuth-vs-column (the two stacked units + # fire in opposite phase, so they spin oppositely in their own frames). + az_unwrapped = np.unwrap(az[far][np.argsort(col[far])]) + slope = float(np.polyfit(np.arange(len(az_unwrapped)), az_unwrapped, 1)[0]) if len(az_unwrapped) > 1 else -1.0 + spinning_direction: Literal["cw", "ccw"] = "cw" if slope < 0 else "ccw" + + # Jointly fit per-column azimuths and per-row azimuth offsets by alternating + # circular medians. The 32 beams of a column span several degrees of azimuth, + # captured by the per-row offset; averaging it into the column azimuth (offset=0) + # would leave multi-degree per-row error. + def circmean(a: np.ndarray) -> float: + return float(np.arctan2(np.mean(np.sin(a)), np.mean(np.cos(a)))) + + # Steep downward beams (e.g. the lowest VLP-32C laser at ~-25 deg) only ever hit + # nearby ground, so they have no far-range returns. Fit their azimuth offset from + # whatever valid returns they do have (near-range) rather than leaving it at 0. + valid = dist > min_valid_distance_m + row_has_far = np.zeros(VLP32C_N_BEAMS, dtype=bool) + for r in range(VLP32C_N_BEAMS): + row_has_far[r] = bool(((row == r) & far).any()) + + row_offsets = np.zeros(VLP32C_N_BEAMS, dtype=np.float64) + col_az = np.zeros(n_columns, dtype=np.float64) + for _ in range(max(n_refine_iterations, 1)): + # column azimuth = circular median of (az - row_offset) over far points in the + # column (far returns give the cleanest, least range-dependent azimuth). + adj = np.angle(np.exp(1j * (az - row_offsets[row]))) + col_az[:] = np.nan + for c in np.unique(col[far]): + sel = (col == c) & far + col_az[c] = np.arctan2(np.median(np.sin(adj[sel])), np.median(np.cos(adj[sel]))) + good = ~np.isnan(col_az) + if not good.any(): + break + col_az = np.interp(np.arange(n_columns), np.flatnonzero(good), np.unwrap(col_az[good])) + # row offset = circular mean of (az - column azimuth). Use far returns where a + # row has them; otherwise fall back to all valid (near-range) returns so every + # row gets a real offset estimate. + for r in range(VLP32C_N_BEAMS): + sel = (row == r) & (far if row_has_far[r] else valid) + if sel.any(): + row_offsets[r] = circmean(np.angle(np.exp(1j * (az[sel] - col_az[col[sel]])))) + + column_azimuths_rad = enforce_spinning_monotonic(col_az, n_columns, spinning_direction) + row_azimuth_offsets_rad = np.angle(np.exp(1j * row_offsets)).astype(np.float32) + + # Upsample the column-azimuth grid. The native column step is ~0.2 deg, so the + # per-frame integer column shift (see reconstruct_model_elements) would quantize + # alignment to ~0.1 deg. Upsampling shrinks the step by resolution_factor, + # removing that quantization (4x -> ~0.025 deg). + factor = max(int(resolution_factor), 1) + if factor > 1: + column_azimuths_rad = _upsample_azimuths(column_azimuths_rad, factor, spinning_direction) + + return Vlp32cGeometry( + elevations_rad=elevations_rad, + laser_to_row=laser_to_row, + column_period_ns=column_period_ns, + n_columns=n_columns * factor, + spinning_direction=spinning_direction, + column_azimuths_rad=column_azimuths_rad, + row_azimuth_offsets_rad=row_azimuth_offsets_rad, + resolution_factor=factor, + ) + + +def _upsample_azimuths( + column_azimuths_rad: np.ndarray, factor: int, spinning_direction: Literal["cw", "ccw"] +) -> np.ndarray: + """Interpolate column azimuths to ``len * factor`` entries, preserving monotonicity.""" + n = len(column_azimuths_rad) + src_idx = np.arange(n) * factor + dst_idx = np.arange(n * factor) + unwrapped = np.unwrap(column_azimuths_rad.astype(np.float64)) + upsampled = np.interp(dst_idx, src_idx, unwrapped) + return enforce_spinning_monotonic(upsampled, n * factor, spinning_direction) + + +def build_vlp32c_model(geometry: Vlp32cGeometry) -> RowOffsetStructuredSpinningLidarModelParameters: + """Build a structured VLP-32C model from recovered geometry. + + Uses the empirically measured per-column azimuths, per-row azimuth offsets and + elevation table. The per-row offsets capture the (several-degree) intra-column + firing spread and are essential for sub-degree reconstruction. The spin + direction is the one detected for this unit (the two stacked units fire in + opposite phase, so they spin oppositely in their own frames). + """ + return RowOffsetStructuredSpinningLidarModelParameters( + spinning_frequency_hz=VLP32C_SPINNING_FREQUENCY_HZ, + spinning_direction=geometry.spinning_direction, + n_rows=VLP32C_N_BEAMS, + n_columns=geometry.n_columns, + row_elevations_rad=geometry.elevations_rad, + column_azimuths_rad=geometry.column_azimuths_rad, + row_azimuth_offsets_rad=geometry.row_azimuth_offsets_rad, + ) + + +def _estimate_column_timing(laser_number_in_unit: np.ndarray, offset_ns: np.ndarray) -> tuple[float, int]: + """Estimate the firing-column period and column count from beam-0 refire timing. + + The column count spans exactly one revolution. An AV2 sweep covers slightly + more than one revolution (~1.02 rev over ~102 ms at 10 Hz), so columns are + wrapped modulo this count (see :func:`reconstruct_model_elements`): the few + degrees of overlap fold back onto the early columns, which represent the same + physical azimuth. Sizing to one revolution keeps the column-azimuth ramp below + 2*pi, as the structured spinning-lidar model requires. + """ + o = offset_ns.astype(np.int64) + beam0_times = np.sort(o[laser_number_in_unit == 0]) + if len(beam0_times) >= 2: + gaps = np.diff(beam0_times) + # Use the median of the small, regular gaps (drop large no-return stretches). + column_period_ns = float(np.median(gaps[gaps <= np.median(gaps) * 1.5])) + else: + # Fallback: one revolution divided by a nominal VLP-32C column count. + column_period_ns = VLP32C_SCAN_DURATION_US * 1000.0 / 1800.0 + revolution_ns = VLP32C_SCAN_DURATION_US * 1000.0 + n_columns = max(int(round(revolution_ns / column_period_ns)), 2) + return column_period_ns, n_columns + + +def reconstruct_model_elements( + laser_number_in_unit: np.ndarray, + offset_ns: np.ndarray, + geometry: Vlp32cGeometry, + xyz_decompensated: np.ndarray, + min_valid_distance_m: float = 5.0, +) -> np.ndarray: + """Build per-point ``model_element`` = (row, column) for one frame. + + Row comes from the laser->row map (elevation order). The column comes from + quantizing ``offset_ns`` by the firing-column period (wrapped modulo one + revolution), then applying a single per-frame column shift. + + The per-frame shift is essential: the static model fixes one mapping from + ``offset_ns`` to azimuth, but the sensor's spin phase at a given ``offset_ns`` + drifts a degree or so between sweeps (and ``offset_ns`` is referenced to a + per-sweep start). Without re-aligning, frames other than the one the model was + derived from are systematically rotated by up to ~1.2 deg. We estimate the + frame's rigid azimuth offset from the model (circular mean of the residual + between measured and model-predicted azimuth over far returns) and convert it + to an integer column shift, which restores sub-0.1 deg accuracy on every frame. + + Args: + xyz_decompensated: per-point decompensated points in the sensor frame [N, 3], + used to measure this frame's azimuth phase relative to the model. + min_valid_distance_m: far-range threshold for the phase estimate. + + Returns a [N, 2] uint16 array (row, column). + """ + row = geometry.laser_to_row[laser_number_in_unit] + o = offset_ns.astype(np.int64) + # Native firing column from the firing timing, scaled onto the (upsampled) grid. + native_col = (o - o.min()) / geometry.column_period_ns + col = (np.round(native_col).astype(np.int64) * geometry.resolution_factor) % geometry.n_columns + + # Re-align this frame to the model. The static model fixes one mapping from + # offset_ns to azimuth, but between sweeps the spin phase drifts (a rigid + # rotation) AND the spin rate varies slightly within a sweep (a drift that grows + # with the column index). We fit the residual azimuth as an affine function of + # the native column, residual ~= a + b * native_col, and fold it back into the + # column index. The constant term handles the phase, the linear term the + # intra-sweep rate drift; without the linear term some scenes retain ~0.25 deg. + dist = np.linalg.norm(xyz_decompensated, axis=1) + far = dist > min_valid_distance_m + if far.any(): + az = np.arctan2(xyz_decompensated[far, 1], xyz_decompensated[far, 0]) + predicted = geometry.column_azimuths_rad[col[far]] + geometry.row_azimuth_offsets_rad[row[far]] + residual = np.angle(np.exp(1j * (az - predicted))) + column_step_rad = 2.0 * np.pi / geometry.n_columns + sign = -1.0 if geometry.spinning_direction == "cw" else 1.0 + # Affine fit residual_columns ~= a + b * native_col (least squares). + residual_cols = sign * residual / column_step_rad + nc_far = native_col[far] + coeffs = np.polyfit(nc_far, residual_cols, 1) + col_shift = np.round(np.polyval(coeffs, native_col)).astype(np.int64) + col = (col + col_shift) % geometry.n_columns + + return np.stack([row, col], axis=1).astype(np.uint16) diff --git a/tools/data_converter/argoverse2/utils_test.py b/tools/data_converter/argoverse2/utils_test.py new file mode 100644 index 00000000..bcc0ea4b --- /dev/null +++ b/tools/data_converter/argoverse2/utils_test.py @@ -0,0 +1,272 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data-free unit tests for the Argoverse 2 VLP-32C lidar-model derivation. + +These run in CI (no external dataset needed): a synthetic spinning-lidar sweep is +generated with a known firing geometry, the model is derived from it, and the +reconstruction accuracy is asserted -- the same property the eval tool measures. +This guards the firing-pattern reconstruction (and in particular the +more-than-one-revolution column wrap) against regressions. +""" + +from __future__ import annotations + +import tempfile +import unittest + +from pathlib import Path + +import numpy as np +import pyarrow as pa +import pyarrow.feather as feather + +from upath import UPath + +from ncore.impl.data.types import IdealPinholeCameraModelParameters, ShutterType +from ncore.impl.sensors.lidar import StructuredLidarModel +from tools.data_converter.argoverse2.utils import ( + VLP32C_N_BEAMS, + VLP32C_SCAN_DURATION_US, + build_vlp32c_model, + derive_vlp32c_geometry, + read_intrinsics, + reconstruct_model_elements, +) + + +# A VLP-32C-like, deliberately non-uniform elevation table (degrees), and a +# laser-number -> physical order that is NOT elevation-sorted, so the test +# exercises the laser->row recovery. +_TRUE_ELEVATIONS_DEG = np.linspace(15.0, -25.0, VLP32C_N_BEAMS) +_LASER_PERMUTATION = np.array( + [(i * 7) % VLP32C_N_BEAMS for i in range(VLP32C_N_BEAMS)], dtype=np.int64 +) # laser_number -> index into the (elevation-sorted) beam list + + +def _synthesize_sweep( + spinning_direction: str, + revolutions: float, + column_period_ns: float = 55296.0, + seed: int = 0, + start_az_rad: float | None = None, + az_rate_factor: float = 1.0, +): + """Build a synthetic decompensated VLP-32C sweep with a known geometry. + + Returns (xyz_decompensated [N,3], laser_number_in_unit [N], offset_ns [N], + true_elevations_rad [32] in laser order, true_row_offsets_rad [32] in laser order). + """ + rng = np.random.default_rng(seed) + sign = -1.0 if spinning_direction == "cw" else 1.0 + + # Per-laser true elevation (in laser-number order) and a per-laser azimuth + # offset spanning several degrees (VLP-32C beams in a column are not co-azimuthal). + elev_sorted_rad = np.radians(_TRUE_ELEVATIONS_DEG) # highest..lowest + true_elev_by_laser = np.empty(VLP32C_N_BEAMS) + true_elev_by_laser[np.arange(VLP32C_N_BEAMS)] = elev_sorted_rad[_LASER_PERMUTATION] + true_offset_by_laser = np.radians(np.linspace(-4.0, 4.0, VLP32C_N_BEAMS))[_LASER_PERMUTATION] + + revolution_ns = VLP32C_SCAN_DURATION_US * 1000.0 + n_cols = int(round(revolution_ns / column_period_ns)) + total_cols = int(round(n_cols * revolutions)) + + laser_list = [] + offset_list = [] + xyz_list = [] + base_az = rng.uniform(-np.pi, np.pi) if start_az_rad is None else start_az_rad + for c in range(total_cols): + # column reference azimuth advances with the spin. az_rate_factor != 1 + # models an intra-sweep spin-rate drift relative to the offset_ns timing + # (the azimuth advances slightly faster/slower than the nominal column rate). + col_az = base_az + sign * 2.0 * np.pi * c * az_rate_factor / n_cols + for laser in range(VLP32C_N_BEAMS): + # drop ~10% of returns to mimic sparsity; keep enough far points + if rng.random() < 0.1: + continue + az = col_az + true_offset_by_laser[laser] + el = true_elev_by_laser[laser] + rng_m = rng.uniform(25.0, 60.0) # far-range so azimuth fitting is clean + x = rng_m * np.cos(el) * np.cos(az) + y = rng_m * np.cos(el) * np.sin(az) + z = rng_m * np.sin(el) + xyz_list.append((x, y, z)) + laser_list.append(laser) + offset_list.append(int(round(c * column_period_ns))) + + xyz = np.array(xyz_list, dtype=np.float64) + laser = np.array(laser_list, dtype=np.int64) + offset = np.array(offset_list, dtype=np.int64) + return xyz, laser, offset, true_elev_by_laser, true_offset_by_laser + + +class TestVlp32cModelDerivation(unittest.TestCase): + def _check(self, spinning_direction: str, revolutions: float) -> None: + xyz, laser, offset, _, _ = _synthesize_sweep(spinning_direction, revolutions) + + geometry = derive_vlp32c_geometry(xyz, laser, offset) + self.assertEqual(geometry.spinning_direction, spinning_direction) + # Columns are upsampled for sub-column alignment precision. + self.assertGreater(geometry.resolution_factor, 1) + self.assertEqual(geometry.n_columns, len(geometry.column_azimuths_rad)) + + model = build_vlp32c_model(geometry) + self.assertEqual(model.n_rows, VLP32C_N_BEAMS) + # The column-azimuth ramp must stay strictly below one revolution; the model + # constructor enforces this, so building it at all proves the >1-rev wrap held. + self.assertLess(abs(float(model.column_azimuths_rad[-1] - model.column_azimuths_rad[0])), 2.0 * np.pi) + + elem = reconstruct_model_elements(laser, offset, geometry, xyz) + self.assertEqual(elem.dtype, np.uint16) + self.assertTrue(np.all(elem[:, 0] < model.n_rows)) + self.assertTrue(np.all(elem[:, 1] < model.n_columns)) + + sm = StructuredLidarModel.maybe_from_parameters(model, device="cpu") + assert sm is not None + predicted = sm.elements_to_sensor_points(elem, np.ones(len(elem), dtype=np.float32)).cpu().numpy() + predicted /= np.linalg.norm(predicted, axis=1, keepdims=True) + direction = xyz / np.linalg.norm(xyz, axis=1, keepdims=True) + cos = np.clip(np.sum(predicted * direction, axis=1), -1.0, 1.0) + err_deg = np.degrees(np.arccos(cos)) + median_err = float(np.median(err_deg)) + self.assertLess( + median_err, + 0.1, + f"{spinning_direction} {revolutions}rev: reconstruction error {median_err:.4f} deg too high", + ) + + def test_cw_single_revolution(self) -> None: + self._check("cw", revolutions=1.0) + + def test_cw_more_than_one_revolution(self) -> None: + # The regression case: an AV2 sweep covers ~1.02 revolutions, so the column + # ramp would exceed 2*pi unless columns are wrapped modulo one revolution. + self._check("cw", revolutions=1.05) + + def test_ccw_more_than_one_revolution(self) -> None: + # The two stacked units spin oppositely in their own frames; cover the ccw case. + self._check("ccw", revolutions=1.05) + + def test_model_generalizes_across_phase_shifted_frames(self) -> None: + """The model derived from one sweep must reconstruct other sweeps accurately. + + The sensor's spin phase at a given ``offset_ns`` drifts between sweeps, so a + sweep other than the one the model was derived from is rigidly rotated in + azimuth. ``reconstruct_model_elements`` must re-align per frame; without it, + these frames are off by ~1 deg (the multi-scene failure this guards against). + """ + # Derive the model from a reference sweep at one phase. + ref_xyz, ref_laser, ref_offset, _, _ = _synthesize_sweep("cw", revolutions=1.05, start_az_rad=0.3) + geometry = derive_vlp32c_geometry(ref_xyz, ref_laser, ref_offset) + model = build_vlp32c_model(geometry) + sm = StructuredLidarModel.maybe_from_parameters(model, device="cpu") + assert sm is not None + + # Reconstruct several sweeps captured at different spin phases. + for shift_deg in (-30.0, -1.0, 1.0, 30.0): + xyz, laser, offset, _, _ = _synthesize_sweep( + "cw", revolutions=1.05, seed=1, start_az_rad=0.3 + np.radians(shift_deg) + ) + elem = reconstruct_model_elements(laser, offset, geometry, xyz) + self.assertTrue(np.all(elem[:, 1] < model.n_columns)) + predicted = sm.elements_to_sensor_points(elem, np.ones(len(elem), dtype=np.float32)).cpu().numpy() + predicted /= np.linalg.norm(predicted, axis=1, keepdims=True) + direction = xyz / np.linalg.norm(xyz, axis=1, keepdims=True) + cos = np.clip(np.sum(predicted * direction, axis=1), -1.0, 1.0) + median_err = float(np.degrees(np.median(np.arccos(cos)))) + self.assertLess( + median_err, + 0.1, + f"phase shift {shift_deg} deg: reconstruction error {median_err:.4f} deg too high", + ) + + def test_model_handles_intra_sweep_rate_drift(self) -> None: + """A frame whose spin rate drifts vs offset_ns is reconstructed accurately. + + On some scenes the azimuth advances slightly faster/slower than the nominal + ``offset_ns`` column rate within a sweep. A single rigid phase shift cannot + correct this (it leaves ~0.25 deg); the affine (phase + linear) per-frame + alignment in ``reconstruct_model_elements`` does. This guards that fix. + """ + ref_xyz, ref_laser, ref_offset, _, _ = _synthesize_sweep("cw", revolutions=1.05, start_az_rad=0.2) + geometry = derive_vlp32c_geometry(ref_xyz, ref_laser, ref_offset) + model = build_vlp32c_model(geometry) + sm = StructuredLidarModel.maybe_from_parameters(model, device="cpu") + assert sm is not None + + # A sweep whose azimuth advances 0.3% faster than the nominal column rate + # (~1 deg of drift accumulated over the revolution) plus a phase offset. + xyz, laser, offset, _, _ = _synthesize_sweep( + "cw", revolutions=1.05, seed=2, start_az_rad=0.2 + np.radians(5.0), az_rate_factor=1.003 + ) + elem = reconstruct_model_elements(laser, offset, geometry, xyz) + predicted = sm.elements_to_sensor_points(elem, np.ones(len(elem), dtype=np.float32)).cpu().numpy() + predicted /= np.linalg.norm(predicted, axis=1, keepdims=True) + direction = xyz / np.linalg.norm(xyz, axis=1, keepdims=True) + cos = np.clip(np.sum(predicted * direction, axis=1), -1.0, 1.0) + median_err = float(np.degrees(np.median(np.arccos(cos)))) + self.assertLess(median_err, 0.1, f"intra-sweep drift: reconstruction error {median_err:.4f} deg too high") + + +class TestReadIntrinsics(unittest.TestCase): + """``read_intrinsics`` builds an ideal pinhole and preserves k1/k2/k3.""" + + def _write_intrinsics(self, log_dir: Path) -> None: + (log_dir / "calibration").mkdir(parents=True, exist_ok=True) + table = pa.table( + { + "sensor_name": ["ring_front_center", "ring_rear_left"], + "fx_px": [1685.0, 1683.0], + "fy_px": [1685.5, 1683.5], + "cx_px": [775.0, 773.0], + "cy_px": [1023.0, 1021.0], + "k1": [-0.27, -0.26], + "k2": [0.11, 0.10], + "k3": [-0.018, -0.017], + "height_px": [2048, 1550], + "width_px": [1550, 2048], + } + ) + feather.write_feather(table, str(log_dir / "calibration" / "intrinsics.feather")) + + def test_model_is_ideal_pinhole_global_shutter(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + log_dir = Path(tmp) + self._write_intrinsics(log_dir) + intrinsics = read_intrinsics(UPath(log_dir)) + + self.assertEqual(set(intrinsics), {"ring_front_center", "ring_rear_left"}) + cam = intrinsics["ring_front_center"] + self.assertIsInstance(cam.model, IdealPinholeCameraModelParameters) + # Global shutter is assumed because the released imagery is already undistorted. + self.assertEqual(cam.model.shutter_type, ShutterType.GLOBAL) + self.assertIsNone(cam.model.external_distortion_parameters) + np.testing.assert_array_equal(cam.model.resolution, np.array([1550, 2048], dtype=np.uint64)) + np.testing.assert_allclose(cam.model.focal_length, np.array([1685.0, 1685.5], dtype=np.float32)) + np.testing.assert_allclose(cam.model.principal_point, np.array([775.0, 1023.0], dtype=np.float32)) + + def test_original_distortion_coefficients_preserved(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + log_dir = Path(tmp) + self._write_intrinsics(log_dir) + intrinsics = read_intrinsics(UPath(log_dir)) + + # The raw lens coefficients are kept verbatim (not applied to the images). + self.assertEqual(intrinsics["ring_front_center"].original_distortion_k1k2k3, (-0.27, 0.11, -0.018)) + self.assertEqual(intrinsics["ring_rear_left"].original_distortion_k1k2k3, (-0.26, 0.10, -0.017)) + + +if __name__ == "__main__": + unittest.main()