Source code for embodichain.lab.sim.sensors.stereo

# ----------------------------------------------------------------------------
# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ----------------------------------------------------------------------------

from __future__ import annotations

import dexsim
import torch
import numpy as np
import dexsim.render as dr

from typing import Dict, Tuple, List, Sequence

from dexsim.utility import inv_transform
from embodichain.lab.sim.sensors import Camera, CameraCfg
from embodichain.utils.math import matrix_from_euler
from embodichain.utils import logger, configclass


@configclass
class StereoCameraCfg(CameraCfg):
    """Configuration for :class:`StereoCamera`.

    The settings inherited from ``CameraCfg`` describe the left camera; the
    fields declared here add the right camera and the left-to-right extrinsics.
    """

    sensor_type: str = "StereoCamera"

    # Intrinsics of the right camera as (fx, fy, cx, cy).
    # The default (inherited) camera is the left camera.
    intrinsics_right: Tuple[float, float, float, float] = (600, 600, 320.0, 240.0)

    # Translation from the left camera to the right camera.
    left_to_right_pos: Tuple[float, float, float] = (0.05, 0.0, 0.0)
    # The rotation from left camera to right camera in degrees.
    # NOTE(review): the values are handed to `matrix_from_euler` unconverted;
    # confirm that helper really expects degrees.
    left_to_right_rot: Tuple[float, float, float] = (0.0, 0.0, 0.0)

    # Whether a disparity image is produced in addition to the per-view outputs.
    enable_disparity: bool = False

    # Per-component shortcuts into the right intrinsics.
    # NOTE(review): these defaults are read from the class-level default of
    # `intrinsics_right` while the class body executes; overriding
    # `intrinsics_right` on an instance presumably does not refresh them -
    # confirm against `configclass` behavior.
    fx_r: float = intrinsics_right[0]
    fy_r: float = intrinsics_right[1]
    cx_r: float = intrinsics_right[2]
    cy_r: float = intrinsics_right[3]

    @property
    def left_to_right(self) -> torch.Tensor:
        """Homogeneous 4x4 float32 transform from the left to the right camera."""
        euler = torch.tensor(self.left_to_right_rot, dtype=torch.float32)
        translation = torch.tensor(self.left_to_right_pos, dtype=torch.float32)

        transform = torch.eye(4, dtype=torch.float32)
        transform[:3, :3] = matrix_from_euler(euler.unsqueeze(0)).squeeze(0)
        transform[:3, 3] = translation
        return transform

    @property
    def right_to_left(self) -> torch.Tensor:
        """Homogeneous 4x4 transform from the right to the left camera."""
        forward = self.left_to_right
        return torch.inverse(forward)

    def get_data_types(self) -> List[str]:
        """List the names of every enabled output.

        Each enabled per-view output contributes its left name followed by the
        ``*_right`` name; ``"disparity"`` is appended last and has no right
        counterpart.
        """
        per_view_flags = (
            ("color", self.enable_color),
            ("depth", self.enable_depth),
            ("mask", self.enable_mask),
            ("normal", self.enable_normal),
            ("position", self.enable_position),
        )

        enabled: List[str] = []
        for name, is_enabled in per_view_flags:
            if is_enabled:
                enabled.extend((name, f"{name}_right"))

        if self.enable_disparity:
            enabled.append("disparity")
        return enabled
class PairCameraView:
    """Handle a left/right pair of camera views as one rig.

    The rig pose is the frame halfway between the two views: relative to it the
    left view is offset by ``-0.5 * t`` and the right view by ``+0.5 * t``,
    where ``t`` is the translation column of ``left_to_right``.
    """

    def __init__(
        self,
        left_view: dr.CameraView,
        right_view: dr.CameraView,
        left_to_right: np.ndarray,
    ) -> None:
        """Store both views and precompute their offsets from the rig centre.

        Args:
            left_view: View used for the left camera.
            right_view: View used for the right camera.
            left_to_right: 4x4 transform from the left to the right camera.
        """
        self._left_view = left_view
        self._right_view = right_view
        self._left_to_right = left_to_right

        # NOTE(review): only the translation of `left_to_right` enters the
        # offsets below; its rotation block is stored but never applied to the
        # right view - confirm this is intended.
        self._left_to_center = np.eye(4, dtype=np.float32)
        self._left_to_center[:3, 3] = left_to_right[:3, 3] * -0.5

        self._right_to_center = np.eye(4, dtype=np.float32)
        self._right_to_center[:3, 3] = left_to_right[:3, 3] * 0.5

    def set_local_pose(self, pose: np.ndarray) -> None:
        """Set the rig's local pose by placing both views around ``pose``."""
        self._left_view.set_local_pose(pose @ self._left_to_center)
        self._right_view.set_local_pose(pose @ self._right_to_center)

    def get_local_pose(self) -> np.ndarray:
        """Return the rig's local pose, recovered from the left view's pose."""
        left_pose = self._left_view.get_local_pose()
        return left_pose @ inv_transform(self._left_to_center)

    def set_world_pose(self, pose: np.ndarray) -> None:
        """Set the rig's world pose by placing both views around ``pose``."""
        self._left_view.set_world_pose(pose @ self._left_to_center)
        self._right_view.set_world_pose(pose @ self._right_to_center)

    def get_world_pose(self) -> np.ndarray:
        """Return the rig's world pose, recovered from the left view's pose."""
        left_pose = self._left_view.get_world_pose()
        return left_pose @ inv_transform(self._left_to_center)

    def get_node(self) -> dexsim.engine.Node:
        """Return the node of the left view, which stands in for the pair."""
        return self._left_view.get_node()

    def attach_node(self, parent: dexsim.engine.Node) -> None:
        """Attach both views to ``parent``."""
        self._left_view.attach_node(parent)
        self._right_view.attach_node(parent)
class StereoCamera(Camera):
    """Stereo camera sensor that renders a left and a right view per instance.

    Every instance is represented by a :class:`PairCameraView`. All views share
    one camera group in which the left views are created first and the right
    views second, so the first ``num_instances`` frames of each fetched buffer
    belong to the left cameras and the remaining frames to the right cameras.
    A disparity image for the left view can additionally be derived from the
    left depth image.
    """

    SUPPORTED_DATA_TYPES = [
        "color",
        "depth",
        "mask",
        "normal",
        "position",
        "color_right",
        "depth_right",
        "mask_right",
        "normal_right",
        "position_right",
        "disparity",
    ]

    def __init__(
        self,
        config: StereoCameraCfg,
        device: torch.device = torch.device("cpu"),
    ) -> None:
        """Create the sensor and validate the stereo-specific options.

        Args:
            config: Stereo camera configuration.
            device: Device on which the output tensors are stored.
        """
        super().__init__(config, device)

        # Disparity is computed from the left depth image in `update`, so it
        # cannot be produced without depth.
        if self.cfg.enable_disparity and not self.cfg.enable_depth:
            logger.log_error("Disparity can only be enabled when depth is enabled.")

    def _build_sensor_from_config(
        self, config: StereoCameraCfg, device: torch.device
    ) -> None:
        """Create the camera views and allocate the output buffers.

        Args:
            config: Stereo camera configuration.
            device: Device on which the output buffers are allocated.
        """
        self._world = dexsim.default_world()
        env = self._world.get_env()
        arenas = env.get_all_arenas()
        if len(arenas) == 0:
            # Without arenas the environment itself hosts the single instance.
            arenas = [env]
        num_instances = len(arenas)

        # One frame per view: `num_instances` left views plus as many right views.
        self._frame_buffer = self._world.create_camera_group(
            [config.width, config.height], num_instances * 2, True
        )
        view_attrib = config.get_view_attrib()

        def create_views(side: str, intrinsics) -> list:
            # Create one view per arena for the given side of the stereo pair.
            views = []
            for i, arena in enumerate(arenas):
                view = arena.create_camera(
                    f"{self.uid}_{side}_view{i + 1}",
                    config.width,
                    config.height,
                    True,
                    view_attrib,
                    self._frame_buffer,
                )
                view.set_intrinsic(intrinsics)
                view.set_near(config.near)
                view.set_far(config.far)
                views.append(view)
            return views

        # Creation order matters: `update` expects all left frames before all
        # right frames in the shared frame buffer.
        left_views = create_views("left", config.intrinsics)
        right_views = create_views("right", config.intrinsics_right)

        for i, (left_view, right_view) in enumerate(zip(left_views, right_views)):
            self._entities[i] = PairCameraView(
                left_view, right_view, config.left_to_right.cpu().numpy()
            )

        # Shape and dtype of a single-view buffer for each base data type.
        frame_shape = (self.num_instances, config.height, config.width)
        buffer_specs = {
            "color": (frame_shape + (4,), torch.uint8),
            "depth": (frame_shape + (1,), torch.float32),
            "mask": (frame_shape + (1,), torch.int32),
            "normal": (frame_shape + (3,), torch.float32),
            "position": (frame_shape + (3,), torch.float32),
            "disparity": (frame_shape + (1,), torch.float32),
        }

        # Stereo buffers hold the left and right images side by side along the
        # width axis; the entries in `_data_buffer` are views into them.
        self._data_buffer_stereo: Dict[str, torch.Tensor] = {}

        for data_type in config.get_data_types():
            # Right-view entries are created together with their left partner.
            if "right" in data_type:
                continue
            if not getattr(config, f"enable_{data_type}", False):
                continue

            shape, dtype = buffer_specs[data_type]

            if data_type == "disparity":
                # Disparity exists for the left view only: allocate a plain
                # single-view buffer and do not register a `disparity_right`
                # entry, which is not a supported data type.
                self._data_buffer[data_type] = torch.empty(
                    shape, dtype=dtype, device=device
                )
                continue

            # Double the width to store both views in one tensor.
            stereo_shape = (shape[0], shape[1], shape[2] * 2, shape[3])
            stereo = torch.empty(stereo_shape, dtype=dtype, device=device)
            self._data_buffer_stereo[data_type] = stereo
            self._data_buffer[data_type] = stereo[:, :, : config.width, :]
            self._data_buffer[f"{data_type}_right"] = stereo[:, :, config.width :, :]

        self.cfg: CameraCfg = config

        if self.cfg.extrinsics.parent is not None:
            self._attach_to_entity()

    def update(self, **kwargs) -> None:
        """Update the sensor data.

        The supported data types are:
            - color: RGB images with shape (B, H, W, 4) and dtype torch.uint8
            - depth: Depth images with shape (B, H, W, 1) and dtype torch.float32
            - mask: Instance segmentation masks with shape (B, H, W, 1) and dtype torch.int32
            - normal: Normal images with shape (B, H, W, 3) and dtype torch.float32
            - position: Position images with shape (B, H, W, 3) and dtype torch.float32
            - disparity: Disparity images with shape (B, H, W, 1) and dtype torch.float32

        Every type except disparity is also stored for the right camera under
        the ``*_right`` key.

        Args:
            **kwargs: Additional keyword arguments for sensor update. If
                ``fetch_only`` is true, the frame buffer is read without calling
                its ``apply`` method first.
        """
        fetch_only = kwargs.get("fetch_only", False)
        if not fetch_only:
            self._frame_buffer.apply()

        self.cfg: StereoCameraCfg
        num_left = self.num_instances

        def store(name: str, data: torch.Tensor, add_channel: bool = False) -> None:
            # The first `num_left` frames are the left views, the rest the right.
            left, right = data[:num_left, ...], data[num_left:, ...]
            if add_channel:
                left, right = left.unsqueeze(-1), right.unsqueeze(-1)
            self._data_buffer[name] = left
            self._data_buffer[f"{name}_right"] = right

        if self.cfg.enable_color:
            store("color", self._frame_buffer.get_rgb_gpu_buffer().to(self.device))

        if self.cfg.enable_depth:
            store(
                "depth",
                self._frame_buffer.get_depth_gpu_buffer().to(self.device),
                add_channel=True,
            )

        if self.cfg.enable_mask:
            store(
                "mask",
                self._frame_buffer.get_visible_mask_gpu_buffer().to(
                    self.device, torch.int32
                ),
                add_channel=True,
            )

        if self.cfg.enable_normal:
            # Keep only the first three channels.
            store(
                "normal",
                self._frame_buffer.get_normal_gpu_buffer().to(self.device)[..., :3],
            )

        if self.cfg.enable_position:
            # Keep only the first three channels.
            store(
                "position",
                self._frame_buffer.get_position_gpu_buffer().to(self.device)[..., :3],
            )

        if self.cfg.enable_disparity:
            disparity = self._data_buffer["disparity"]
            disparity.fill_(0.0)

            # Baseline: Euclidean length of the left-to-right translation.
            distance = torch.sqrt(
                torch.sum(torch.square(self.cfg.left_to_right[:3, 3]))
            )

            # Compute disparity only for non-zero depth values
            depth = self._data_buffer["depth"]
            valid_depth_mask = depth > 0
            disparity[valid_depth_mask] = (
                self.cfg.fx * distance / depth[valid_depth_mask]
            )

    def get_left_right_arena_pose(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Get the poses of the left and right cameras relative to their arenas.

        Each pose is the view's world pose with the first two translation
        components of the arena root node's local pose subtracted.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: The left and the right camera
            poses, each stacked along a new first dimension (one 4x4 pose per
            entity) and moved to the sensor device.
        """
        from embodichain.lab.sim.utility import get_dexsim_arenas

        arenas = get_dexsim_arenas()

        left_poses = []
        right_poses = []
        for i, entity in enumerate(self._entities):
            arena_pose = arenas[i].get_root_node().get_local_pose()

            left_pose = entity._left_view.get_world_pose()
            left_pose[:2, 3] -= arena_pose[:2, 3]
            left_poses.append(torch.as_tensor(left_pose, dtype=torch.float32))

            right_pose = entity._right_view.get_world_pose()
            right_pose[:2, 3] -= arena_pose[:2, 3]
            right_poses.append(torch.as_tensor(right_pose, dtype=torch.float32))

        return (
            torch.stack(left_poses, dim=0).to(self.device),
            torch.stack(right_poses, dim=0).to(self.device),
        )

    def set_intrinsics(
        self,
        intrinsics: torch.Tensor,
        right_intrinsics: torch.Tensor | None = None,
        env_ids: Sequence[int] | None = None,
    ) -> None:
        """
        Set the camera intrinsics for both left and right cameras.

        Args:
            intrinsics (torch.Tensor): The intrinsics for the left camera with shape (4,) / (3, 3) or (B, 4) / (B, 3, 3).
            right_intrinsics (torch.Tensor | None): The intrinsics for the right camera with shape (4,) / (3, 3) or (B, 4) / (B, 3, 3).
                If None, use the same intrinsics as the left camera.
            env_ids (Sequence[int] | None): The environment ids to set the intrinsics.
                If None, set for all environments.
        """
        ids = env_ids if env_ids is not None else range(self.num_instances)
        num_ids = len(ids)

        def broadcast(values: torch.Tensor) -> torch.Tensor:
            # Repeat a single (3, 3) matrix or (4,) vector once per environment.
            if values.dim() == 2 and values.shape[1] == 3:
                values = values.unsqueeze(0).repeat(num_ids, 1, 1)
            if values.dim() == 1:
                values = values.unsqueeze(0).repeat(num_ids, 1)
            return values

        def to_view_arg(values: torch.Tensor):
            # Matrix-style intrinsics are passed as arrays, vector-style
            # intrinsics as plain lists. The choice is made per tensor so the
            # left and right cameras may use different formats.
            if values.shape[1] == 3:
                return [row.cpu().numpy() for row in values]
            return [row.cpu().tolist() for row in values]

        intrinsics = broadcast(intrinsics)
        if num_ids != intrinsics.shape[0]:
            logger.log_error(
                f"Intrinsics shape {intrinsics.shape} does not match env_ids length {len(ids)}"
            )

        if right_intrinsics is None:
            right_intrinsics = intrinsics
        else:
            right_intrinsics = broadcast(right_intrinsics)
            if num_ids != right_intrinsics.shape[0]:
                logger.log_error(
                    f"Right intrinsics shape {right_intrinsics.shape} does not match env_ids length {len(ids)}"
                )

        left_args = to_view_arg(intrinsics)
        right_args = to_view_arg(right_intrinsics)

        for i, env_id in enumerate(ids):
            entity = self._entities[env_id]
            entity._left_view.set_intrinsic(left_args[i])
            entity._right_view.set_intrinsic(right_args[i])

    def get_intrinsics(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Get the camera intrinsics for both left and right cameras.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: The intrinsics for the left and right cameras with shape (B, 3, 3).
        """
        intrinsics_left = [
            torch.as_tensor(entity._left_view.get_intrinsic(), dtype=torch.float32)
            for entity in self._entities
        ]
        intrinsics_right = [
            torch.as_tensor(entity._right_view.get_intrinsic(), dtype=torch.float32)
            for entity in self._entities
        ]

        return (
            torch.stack(intrinsics_left, dim=0).to(self.device),
            torch.stack(intrinsics_right, dim=0).to(self.device),
        )