# ----------------------------------------------------------------------------
# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ----------------------------------------------------------------------------
from __future__ import annotations
import dexsim
import torch
import numpy as np
import dexsim.render as dr
from typing import Dict, Tuple, List, Sequence
from dexsim.utility import inv_transform
from embodichain.lab.sim.sensors import Camera, CameraCfg
from embodichain.utils.math import matrix_from_euler
from embodichain.utils import logger, configclass
@configclass
class StereoCameraCfg(CameraCfg):
    """Configuration class for StereoCamera.

    Extends ``CameraCfg`` (whose fields describe the left camera) with the
    intrinsics of the right camera, the left-to-right offset and an optional
    disparity output.
    """

    sensor_type: str = "StereoCamera"

    # The camera intrinsics of the right camera.
    # The default camera is the left camera.
    # Indexed below as (fx, fy, cx, cy).
    intrinsics_right: Tuple[float, float, float, float] = (600, 600, 320.0, 240.0)

    # Translation placed in the last column of the left-to-right transform.
    left_to_right_pos: Tuple[float, float, float] = (0.05, 0.0, 0.0)

    # The rotation from left camera to right camera in degrees.
    # NOTE(review): the values are handed to `matrix_from_euler` unchanged in
    # `left_to_right`; confirm which angular unit that helper expects.
    left_to_right_rot: Tuple[float, float, float] = (0.0, 0.0, 0.0)

    # When True, "disparity" is added to the list returned by `get_data_types`.
    enable_disparity: bool = False

    # Individual right-camera intrinsic components.
    # NOTE(review): these defaults are evaluated once, while the class body
    # runs, from the default `intrinsics_right` above. Overriding
    # `intrinsics_right` on an instance presumably does not refresh them;
    # confirm against `configclass` semantics.
    fx_r: float = intrinsics_right[0]
    fy_r: float = intrinsics_right[1]
    cx_r: float = intrinsics_right[2]
    cy_r: float = intrinsics_right[3]

    @property
    def left_to_right(self) -> torch.Tensor:
        """Get the transformation matrix from left camera to right camera.

        Returns:
            torch.Tensor: A freshly built (4, 4) float32 matrix whose
            translation is ``left_to_right_pos`` and whose rotation block is
            ``matrix_from_euler(left_to_right_rot)``.
        """
        transform = torch.eye(4, dtype=torch.float32)
        transform[:3, 3] = torch.tensor(self.left_to_right_pos, dtype=torch.float32)
        euler = torch.tensor(self.left_to_right_rot, dtype=torch.float32)
        # matrix_from_euler is called with a leading batch dimension of 1.
        transform[:3, :3] = matrix_from_euler(euler.unsqueeze(0)).squeeze(0)
        return transform

    @property
    def right_to_left(self) -> torch.Tensor:
        """Get the transformation matrix from right camera to left camera.

        Returns:
            torch.Tensor: The matrix inverse of ``left_to_right``.
        """
        return torch.inverse(self.left_to_right)

    def get_data_types(self) -> List[str]:
        """List the data types enabled by this configuration.

        Every enabled per-camera type contributes its own name followed by the
        ``"<name>_right"`` variant; ``"disparity"`` has no right variant and
        is appended last.

        Returns:
            List[str]: Names of the enabled data types.
        """
        data_types: List[str] = []
        for name in ("color", "depth", "mask", "normal", "position"):
            if getattr(self, f"enable_{name}"):
                data_types.append(name)
                data_types.append(f"{name}_right")
        if self.enable_disparity:
            data_types.append("disparity")
        return data_types
class PairCameraView:
    """Drive a left and a right camera view as a single rig.

    Poses passed to the setters (and returned by the getters) refer to a
    "center" frame. The left view is offset from it by minus half of the
    translation in ``left_to_right`` and the right view by plus half.

    NOTE(review): only the translation column of ``left_to_right`` is used to
    place the views; its rotation block is stored but not applied here.
    Confirm that this is intended for rigs with a non-identity rotation.
    """

    def __init__(
        self,
        left_view: dr.CameraView,
        right_view: dr.CameraView,
        left_to_right: np.ndarray,
    ) -> None:
        """Store both views and precompute their offsets from the center.

        Args:
            left_view: View used as the left camera.
            right_view: View used as the right camera.
            left_to_right: 4x4 transform; only ``[:3, 3]`` is read here.
        """
        self._left_view = left_view
        self._right_view = right_view
        self._left_to_right = left_to_right

        half_offset = left_to_right[:3, 3] * 0.5
        # Right-multiplying a center pose by these yields each view's pose.
        self._left_to_center = np.eye(4, dtype=np.float32)
        self._left_to_center[:3, 3] = -half_offset
        self._right_to_center = np.eye(4, dtype=np.float32)
        self._right_to_center[:3, 3] = half_offset

    def _view_poses(self, pose: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Return the (left, right) view poses for a given center pose."""
        return pose @ self._left_to_center, pose @ self._right_to_center

    def set_local_pose(self, pose: np.ndarray) -> None:
        """Set the local pose of both views from the center pose ``pose``."""
        left_pose, right_pose = self._view_poses(pose)
        self._left_view.set_local_pose(left_pose)
        self._right_view.set_local_pose(right_pose)

    def get_local_pose(self) -> np.ndarray:
        """Return the center local pose recovered from the left view."""
        left_pose = self._left_view.get_local_pose()
        return left_pose @ inv_transform(self._left_to_center)

    def set_world_pose(self, pose: np.ndarray) -> None:
        """Set the world pose of both views from the center pose ``pose``."""
        left_pose, right_pose = self._view_poses(pose)
        self._left_view.set_world_pose(left_pose)
        self._right_view.set_world_pose(right_pose)

    def get_world_pose(self) -> np.ndarray:
        """Return the center world pose recovered from the left view."""
        left_pose = self._left_view.get_world_pose()
        return left_pose @ inv_transform(self._left_to_center)

    def get_node(self) -> dexsim.engine.Node:
        """Return the node of the left view (the right view's is not exposed)."""
        return self._left_view.get_node()

    def attach_node(self, parent: dexsim.engine.Node) -> None:
        """Attach both views to ``parent``."""
        self._left_view.attach_node(parent)
        self._right_view.attach_node(parent)
class StereoCamera(Camera):
    """Stereo camera sensor built from a left and a right camera view.

    For every arena one left and one right view are created; all views render
    into a single shared camera group. Each left/right pair is wrapped in a
    ``PairCameraView`` so the rig can be posed as one entity. Besides the
    per-camera outputs (with a ``"_right"`` suffix for the right camera), a
    disparity image derived from the left depth can be produced.
    """

    SUPPORTED_DATA_TYPES = [
        "color",
        "depth",
        "mask",
        "normal",
        "position",
        "color_right",
        "depth_right",
        "mask_right",
        "normal_right",
        "position_right",
        "disparity",
    ]

    def __init__(
        self,
        config: StereoCameraCfg,
        device: torch.device = torch.device("cpu"),
    ) -> None:
        """Create the sensor and validate the disparity/depth combination.

        Args:
            config: Stereo camera configuration.
            device: Device on which the data buffers are kept.
        """
        super().__init__(config, device)

        # check valid config: disparity is computed from the depth buffer in
        # `update`, so it cannot be produced without depth.
        if self.cfg.enable_disparity and not self.cfg.enable_depth:
            logger.log_error("Disparity can only be enabled when depth is enabled.")

    def _create_views(
        self,
        arenas: list,
        config: StereoCameraCfg,
        view_attrib,
        side: str,
        intrinsics,
    ) -> list:
        """Create one camera view per arena for the given side.

        Args:
            arenas: Arenas (or the environment itself) that own the views.
            config: Configuration providing width, height, near and far.
            view_attrib: Value of ``config.get_view_attrib()``.
            side: ``"left"`` or ``"right"``; used in the view name.
            intrinsics: Intrinsics passed to ``set_intrinsic`` of each view.

        Returns:
            list: The created views, in arena order.
        """
        views = []
        for i, arena in enumerate(arenas):
            view = arena.create_camera(
                f"{self.uid}_{side}_view{i + 1}",
                config.width,
                config.height,
                True,
                view_attrib,
                self._frame_buffer,
            )
            view.set_intrinsic(intrinsics)
            view.set_near(config.near)
            view.set_far(config.far)
            views.append(view)
        return views

    def _build_sensor_from_config(
        self, config: StereoCameraCfg, device: torch.device
    ) -> None:
        """Create the camera views, the view pairs and the data buffers.

        Args:
            config: Stereo camera configuration.
            device: Device on which the data buffers are allocated.
        """
        self._world = dexsim.default_world()
        env = self._world.get_env()
        arenas = env.get_all_arenas()
        if len(arenas) == 0:
            # Without arenas the environment itself hosts the single instance.
            arenas = [env]
        num_instances = len(arenas)

        # One camera group holds both cameras of every instance.
        self._frame_buffer = self._world.create_camera_group(
            [config.width, config.height], num_instances * 2, True
        )
        view_attrib = config.get_view_attrib()

        # All left views are created before any right view. `update` reads
        # the first `num_instances` entries of each fetched buffer as left
        # and the rest as right (presumably the group orders views by
        # creation; confirm against the dexsim camera-group API).
        left_views = self._create_views(
            arenas, config, view_attrib, "left", config.intrinsics
        )
        right_views = self._create_views(
            arenas, config, view_attrib, "right", config.intrinsics_right
        )

        # The transform is identical for every instance; build it once and
        # give each pair its own copy. `self._entities` is indexed here, so it
        # is presumably pre-sized by the base class.
        left_to_right = config.left_to_right.cpu().numpy()
        for i in range(num_instances):
            self._entities[i] = PairCameraView(
                left_views[i], right_views[i], left_to_right.copy()
            )

        # Channel count and dtype of each per-camera data type.
        per_pixel_specs = {
            "color": (4, torch.uint8),
            "depth": (1, torch.float32),
            "mask": (1, torch.int32),
            "normal": (3, torch.float32),
            "position": (3, torch.float32),
            "disparity": (1, torch.float32),
        }
        buffer_specs = {
            name: ((self.num_instances, config.height, config.width, channels), dtype)
            for name, (channels, dtype) in per_pixel_specs.items()
        }

        # stereo buffer to store data for left and right cameras
        # the data in `_data_buffer` is shared with the data in `_data_buffer_stereo`.
        self._data_buffer_stereo: Dict[str, torch.Tensor] = {}

        # Iterate through enabled data types and initialize buffers
        for data_type in config.get_data_types():
            if "right" in data_type:
                # Right buffers are created together with their left twin.
                continue
            if not getattr(config, f"enable_{data_type}", False):
                continue
            shape, dtype = buffer_specs[data_type]

            if data_type == "disparity":
                # Disparity has no right-camera counterpart: it gets a plain
                # buffer and must not fall through to the stereo allocation,
                # which would add a "disparity_right" entry that is not a
                # supported data type.
                self._data_buffer[data_type] = torch.empty(
                    shape, dtype=dtype, device=device
                )
                continue

            # create new shape with width * 2 for stereo camera
            stereo_shape = (shape[0], shape[1], shape[2] * 2, shape[3])
            stereo = torch.empty(stereo_shape, dtype=dtype, device=device)
            self._data_buffer_stereo[data_type] = stereo
            # Left and right buffers are views onto the two width halves.
            self._data_buffer[data_type] = stereo[:, :, : config.width, :]
            self._data_buffer[f"{data_type}_right"] = stereo[:, :, config.width :, :]

        self.cfg: StereoCameraCfg = config
        if self.cfg.extrinsics.parent is not None:
            self._attach_to_entity()

    def _store_left_right(
        self, name: str, data: torch.Tensor, add_channel: bool = False
    ) -> None:
        """Split a fetched buffer into its left and right parts and store them.

        Args:
            name: Data type name; the right part is stored as ``"<name>_right"``.
            data: Fetched tensor whose first ``num_instances`` entries along
                dim 0 are taken as left and the remaining ones as right.
            add_channel: If True, append a trailing dimension of size 1.
        """
        left = data[: self.num_instances, ...]
        right = data[self.num_instances :, ...]
        if add_channel:
            left = left.unsqueeze(-1)
            right = right.unsqueeze(-1)
        self._data_buffer[name] = left
        self._data_buffer[f"{name}_right"] = right

    def update(self, **kwargs) -> None:
        """Update the sensor data.

        The supported data types are:
            - color: RGB images with shape (B, H, W, 4) and dtype torch.uint8
            - depth: Depth images with shape (B, H, W, 1) and dtype torch.float32
            - mask: Instance segmentation masks with shape (B, H, W, 1) and dtype torch.int32
            - normal: Normal images with shape (B, H, W, 3) and dtype torch.float32
            - position: Position images with shape (B, H, W, 3) and dtype torch.float32
            - disparity: Disparity images with shape (B, H, W, 1) and dtype torch.float32

        Every type except disparity is also stored for the right camera under
        ``"<name>_right"``.

        NOTE(review): the entries of ``_data_buffer`` are rebound to slices of
        the freshly fetched tensors rather than copied into the buffers
        allocated in ``_build_sensor_from_config``, so ``_data_buffer_stereo``
        is not refreshed here. Confirm that this is intended.

        Args:
            **kwargs: Additional keyword arguments for sensor update.
                ``fetch_only`` (bool, default False): when True,
                ``self._frame_buffer.apply()`` is skipped and the buffers are
                only read back.
        """
        fetch_only = kwargs.get("fetch_only", False)
        if not fetch_only:
            self._frame_buffer.apply()

        self.cfg: StereoCameraCfg

        if self.cfg.enable_color:
            data = self._frame_buffer.get_rgb_gpu_buffer().to(self.device)
            self._store_left_right("color", data)

        if self.cfg.enable_depth:
            data = self._frame_buffer.get_depth_gpu_buffer().to(self.device)
            self._store_left_right("depth", data, add_channel=True)

        if self.cfg.enable_mask:
            data = self._frame_buffer.get_visible_mask_gpu_buffer().to(
                self.device, torch.int32
            )
            self._store_left_right("mask", data, add_channel=True)

        if self.cfg.enable_normal:
            # Only the first three channels are kept.
            data = self._frame_buffer.get_normal_gpu_buffer().to(self.device)[..., :3]
            self._store_left_right("normal", data)

        if self.cfg.enable_position:
            # Only the first three channels are kept.
            data = self._frame_buffer.get_position_gpu_buffer().to(self.device)[..., :3]
            self._store_left_right("position", data)

        if self.cfg.enable_disparity:
            disparity = self._data_buffer["disparity"]
            disparity.fill_(0.0)

            # Baseline length. The translation column of `cfg.left_to_right`
            # is exactly `left_to_right_pos`, so the full 4x4 matrix does not
            # need to be rebuilt on every update.
            baseline = torch.linalg.norm(
                torch.tensor(self.cfg.left_to_right_pos, dtype=torch.float32)
            )

            # Compute disparity only for non-zero depth values
            depth = self._data_buffer["depth"]
            valid_depth_mask = depth > 0
            disparity[valid_depth_mask] = (
                self.cfg.fx * baseline / depth[valid_depth_mask]
            )

    def get_left_right_arena_pose(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Get the poses of the left and right cameras relative to their arenas.

        For each instance the world pose of a view is taken and the x/y
        translation of the arena root node's local pose is subtracted from it;
        the z translation and the rotation are left unchanged.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: The left and the right camera
            poses, each with shape (num_envs, 4, 4), dtype torch.float32, on
            ``self.device``.
        """
        # Imported locally, as in the original (presumably to avoid a
        # circular import at module load time).
        from embodichain.lab.sim.utility import get_dexsim_arenas

        arenas = get_dexsim_arenas()

        def to_arena_frame(world_pose, arena_pose) -> torch.Tensor:
            # Work on a copy so the array handed out by the view is not
            # modified in place.
            pose = np.array(world_pose, copy=True)
            pose[:2, 3] -= arena_pose[:2, 3]
            return torch.as_tensor(pose, dtype=torch.float32)

        left_poses = []
        right_poses = []
        for i, entity in enumerate(self._entities):
            arena_pose = arenas[i].get_root_node().get_local_pose()
            left_poses.append(
                to_arena_frame(entity._left_view.get_world_pose(), arena_pose)
            )
            right_poses.append(
                to_arena_frame(entity._right_view.get_world_pose(), arena_pose)
            )

        return (
            torch.stack(left_poses, dim=0).to(self.device),
            torch.stack(right_poses, dim=0).to(self.device),
        )

    @staticmethod
    def _broadcast_intrinsics(
        intrinsics: torch.Tensor, count: int, label: str
    ) -> torch.Tensor:
        """Expand unbatched intrinsics to ``count`` entries and check the size.

        Args:
            intrinsics: Tensor of shape (4,), (3, 3), (B, 4) or (B, 3, 3).
            count: Number of environments being configured.
            label: Prefix of the error message (``"Intrinsics"`` or
                ``"Right intrinsics"``).

        Returns:
            torch.Tensor: Intrinsics with a leading batch dimension.
        """
        if intrinsics.dim() == 2 and intrinsics.shape[1] == 3:
            # A single (3, 3) matrix.
            intrinsics = intrinsics.unsqueeze(0).repeat(count, 1, 1)
        if intrinsics.dim() == 1:
            # A single (4,) parameter vector.
            intrinsics = intrinsics.unsqueeze(0).repeat(count, 1)
        if count != intrinsics.shape[0]:
            logger.log_error(
                f"{label} shape {intrinsics.shape} does not match env_ids length {count}"
            )
        return intrinsics

    @staticmethod
    def _to_view_intrinsic(entry: torch.Tensor):
        """Convert one intrinsics entry to the form passed to ``set_intrinsic``.

        A (3, 3) matrix becomes a numpy array and a (4,) vector a list.
        """
        if entry.dim() == 2:
            return entry.cpu().numpy()
        return entry.cpu().tolist()

    def set_intrinsics(
        self,
        intrinsics: torch.Tensor,
        right_intrinsics: torch.Tensor | None = None,
        env_ids: Sequence[int] | None = None,
    ) -> None:
        """
        Set the camera intrinsics for both left and right cameras.

        The left and right tensors are converted independently, so they may
        use different layouts (for example a (4,) vector for the left camera
        and a (3, 3) matrix for the right one).

        Args:
            intrinsics (torch.Tensor): The intrinsics for the left camera with shape (4,) / (3, 3) or (B, 4) / (B, 3, 3).
            right_intrinsics (torch.Tensor | None): The intrinsics for the right camera with shape (4,) / (3, 3) or (B, 4) / (B, 3, 3). If None, use the same intrinsics as the left camera.
            env_ids (Sequence[int] | None): The environment ids to set the intrinsics. If None, set for all environments.
        """
        ids = env_ids if env_ids is not None else range(self.num_instances)
        count = len(ids)

        intrinsics = self._broadcast_intrinsics(intrinsics, count, "Intrinsics")
        if right_intrinsics is None:
            right_intrinsics = intrinsics
        else:
            right_intrinsics = self._broadcast_intrinsics(
                right_intrinsics, count, "Right intrinsics"
            )

        for i, env_id in enumerate(ids):
            entity = self._entities[env_id]
            entity._left_view.set_intrinsic(self._to_view_intrinsic(intrinsics[i]))
            entity._right_view.set_intrinsic(
                self._to_view_intrinsic(right_intrinsics[i])
            )

    def get_intrinsics(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Get the camera intrinsics for both left and right cameras.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: The intrinsics for the left and right cameras with shape (B, 3, 3).
        """
        intrinsics_left = [
            torch.as_tensor(entity._left_view.get_intrinsic(), dtype=torch.float32)
            for entity in self._entities
        ]
        intrinsics_right = [
            torch.as_tensor(entity._right_view.get_intrinsic(), dtype=torch.float32)
            for entity in self._entities
        ]
        return (
            torch.stack(intrinsics_left, dim=0).to(self.device),
            torch.stack(intrinsics_right, dim=0).to(self.device),
        )