diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/gen2_base.py b/autoware_ml/configs/detection3d/dataset/t4dataset/gen2_base.py new file mode 100644 index 000000000..4bdf3b06d --- /dev/null +++ b/autoware_ml/configs/detection3d/dataset/t4dataset/gen2_base.py @@ -0,0 +1,221 @@ +custom_imports = dict( + imports=[ + "autoware_ml.detection3d.datasets.t4dataset", + "autoware_ml.detection3d.evaluation.t4metric.t4metric", + "autoware_ml.detection3d.evaluation.t4metric.t4metric_v2", + ] +) + +# dataset type setting +dataset_type = "T4Dataset" +info_train_file_name = "t4dataset_gen2_base_infos_train.pkl" +info_val_file_name = "t4dataset_gen2_base_infos_val.pkl" +info_test_file_name = "t4dataset_gen2_base_infos_test.pkl" + +info_train_statistics_file_name = "t4dataset_gen2_base_statistics_train.parquet" +info_val_statistics_file_name = "t4dataset_gen2_base_statistics_val.parquet" +info_test_statistics_file_name = "t4dataset_gen2_base_statistics_test.parquet" + +# dataset scene setting +dataset_version_list = [ + "db_jpntaxigen2_v1", + "db_jpntaxigen2_v2", + "db_j6gen2_v1", + "db_j6gen2_v2", + "db_j6gen2_v3", + "db_j6gen2_v4", + "db_j6gen2_v5", + "db_j6gen2_v6", + "db_j6gen2_v7", + "db_j6gen2_v8", + "db_j6gen2_v9", + "db_j6gen2_v10", + "db_j6gen2_v11", + "db_j6gen2_v12", + "db_largebus_v1", + "db_largebus_v2", + "db_largebus_v3", +] + +# TODO (KokSeang): This will be removed to avoid repetitive computation +# Dataset set, test info files, and enable/disable evaluation of prefix +dataset_test_groups = { + "j6gen2_base": ("t4dataset_j6gen2_base_infos_test.pkl", False), + "j6gen2": ("t4dataset_j6gen2_infos_test.pkl", False), + "largebus": ("t4dataset_largebus_infos_test.pkl", False), + "jpntaxi_gen2": ("t4dataset_jpntaxi_gen2_infos_test.pkl", False), + "gen2_base": ("t4dataset_gen2_base_infos_test.pkl", True), +} + +# dataset format setting +data_prefix = dict( + pts="", + CAM_FRONT="", + CAM_FRONT_WIDE="", + CAM_FRONT_LEFT="", + CAM_FRONT_LEFT_WIDE="", + 
CAM_FRONT_RIGHT="", + CAM_FRONT_RIGHT_WIDE="", + CAM_BACK="", + CAM_BACK_RIGHT="", + CAM_BACK_RIGHT_WIDE="", + CAM_BACK_LEFT="", + CAM_BACK_LEFT_WIDE="", + sweeps="", +) +camera_types = { + "CAM_FRONT", + "CAM_FRONT_WIDE", + "CAM_FRONT_RIGHT", + "CAM_FRONT_RIGHT_WIDE", + "CAM_FRONT_LEFT", + "CAM_FRONT_LEFT_WIDE", + "CAM_BACK", + "CAM_BACK_LEFT", + "CAM_BACK_LEFT_WIDE", + "CAM_BACK_RIGHT", + "CAM_BACK_RIGHT_WIDE", +} + +# class setting +name_mapping = { + # DBv1.0 + "vehicle.car": "car", + "vehicle.construction": "truck", + "vehicle.emergency (ambulance & police)": "car", + "vehicle.motorcycle": "bicycle", + "vehicle.trailer": "trailer", + "vehicle.truck": "truck", + "vehicle.bicycle": "bicycle", + "vehicle.bus (bendy & rigid)": "bus", + "pedestrian.adult": "pedestrian", + "pedestrian.child": "pedestrian", + "pedestrian.construction_worker": "pedestrian", + "pedestrian.personal_mobility": "pedestrian", + "pedestrian.police_officer": "pedestrian", + "pedestrian.stroller": "pedestrian", + "pedestrian.wheelchair": "pedestrian", + "movable_object.barrier": "barrier", + "movable_object.debris": "barrier", + "movable_object.pushable_pullable": "barrier", + "movable_object.trafficcone": "traffic_cone", + "movable_object.traffic_cone": "traffic_cone", + "animal": "animal", + "static_object.bicycle_rack": "bicycle_rack", + # DBv1.1 and UCv2.0 + "car": "car", + "truck": "truck", + "bus": "bus", + "trailer": "trailer", + "motorcycle": "bicycle", + "bicycle": "bicycle", + "police_car": "car", + "pedestrian": "pedestrian", + "police_officer": "pedestrian", + "forklift": "car", + "construction_worker": "pedestrian", + "stroller": "pedestrian", + # DBv2.0 and DBv3.0 + "animal": "animal", + "movable_object.barrier": "barrier", + "movable_object.pushable_pullable": "barrier", + "movable_object.traffic_cone": "traffic_cone", + "pedestrian.adult": "pedestrian", + "pedestrian.child": "pedestrian", + "pedestrian.construction_worker": "pedestrian", + "pedestrian.personal_mobility": 
"pedestrian", + "pedestrian.police_officer": "pedestrian", + "pedestrian.stroller": "pedestrian", + "pedestrian.wheelchair": "pedestrian", + "static_object.bicycle rack": "bicycle rack", + "static_object.bollard": "bollard", + "vehicle.ambulance": "car", # Define vehicle.ambulance as car since vehicle.emergency (ambulance & police) is defined as car + "vehicle.bicycle": "bicycle", + "vehicle.bus": "bus", + "vehicle.car": "car", + "vehicle.construction": "truck", + "vehicle.fire": "truck", + "vehicle.motorcycle": "bicycle", + "vehicle.police": "car", + "vehicle.trailer": "trailer", + "vehicle.truck": "truck", + # DBv1.3 + "ambulance": "car", + "kart": "car", + "wheelchair": "pedestrian", + "personal_mobility": "pedestrian", + "fire_truck": "truck", + "semi_trailer": "trailer", + "tractor_unit": "truck", + "construction_vehicle": "truck", + "traffic_cone": "traffic_cone", + "trafficcone": "traffic_cone", + "barrier": "barrier", + "other_vehicle": "car", + "other_pedestrian": "pedestrian", +} + +class_names = ["car", "truck", "bus", "bicycle", "pedestrian", "traffic_cone", "barrier"] +num_class = len(class_names) +metainfo = dict(classes=class_names) + +merge_objects = [ + ("truck", ["truck", "trailer"]), +] +merge_type = "extend_longer" # One of ["extend_longer","union", None] + +# visualization +class_colors = { + "car": (30, 144, 255), + "truck": (140, 0, 255), + "construction_vehicle": (255, 255, 0), + "bus": (111, 255, 111), + "trailer": (0, 255, 255), + "barrier": (0, 0, 0), + "motorcycle": (100, 0, 30), + "bicycle": (255, 0, 30), + "pedestrian": (255, 200, 200), + "traffic_cone": (120, 120, 120), +} +camera_panels = [ + "data/CAM_FRONT_LEFT", + "data/CAM_FRONT", + "data/CAM_FRONT_RIGHT", + "data/CAM_BACK_LEFT", + "data/CAM_BACK", + "data/CAM_BACK_RIGHT", +] + +# Add filter attributes +filter_attributes = [ + ("vehicle.bicycle", "vehicle_state.parked"), + ("vehicle.bicycle", "cycle_state.without_rider"), + ("vehicle.bicycle", "motorcycle_state.without_rider"), + 
("vehicle.motorcycle", "vehicle_state.parked"), + ("vehicle.motorcycle", "cycle_state.without_rider"), + ("vehicle.motorcycle", "motorcycle_state.without_rider"), + ("bicycle", "vehicle_state.parked"), + ("bicycle", "cycle_state.without_rider"), + ("bicycle", "motorcycle_state.without_rider"), + ("motorcycle", "vehicle_state.parked"), + ("motorcycle", "cycle_state.without_rider"), + ("motorcycle", "motorcycle_state.without_rider"), +] + +evaluator_metric_configs = dict( + evaluation_task="detection", + target_labels=class_names, + center_distance_bev_thresholds=[0.5, 1.0, 2.0, 4.0], + # plane_distance_thresholds is required for the pass fail evaluation + plane_distance_thresholds=[2.0, 4.0], + iou_2d_thresholds=None, + iou_3d_thresholds=None, + label_prefix="autoware", + # bev minimum distance ranges for each range bucket, must be the same length as max_distance, + # they will form bev distance ranges in [(min_distance[0], max_distance[0]), (min_distance[1], max_distance[1]), ...] when filtering + min_distance=[0.0, 50.0, 90.0, 0.0], + # bev maximum distance ranges for each range bucket, must be the same length as min_distance + max_distance=[50.0, 90.0, 121.0, 121.0], + min_point_numbers=0, + matching_class_agnostic_fps=False, +) diff --git a/autoware_ml/configs/detection3d/default_runtime.py b/autoware_ml/configs/detection3d/default_runtime.py index cc2b896f7..6da761425 100644 --- a/autoware_ml/configs/detection3d/default_runtime.py +++ b/autoware_ml/configs/detection3d/default_runtime.py @@ -2,9 +2,17 @@ default_hooks = dict( timer=dict(type="IterTimerHook"), - logger=dict(type="LoggerHook", interval=50), + logger=dict( + type="LoggerHook", + interval=50, + backend_args=dict(backend="local"), + ), param_scheduler=dict(type="ParamSchedulerHook"), - checkpoint=dict(type="CheckpointHook", interval=-1), + checkpoint=dict( + type="CheckpointHook", + interval=-1, + backend_args=dict(backend="local"), + ), sampler_seed=dict(type="DistSamplerSeedHook"), 
visualization=dict(type="Det3DVisualizationHook"), ) diff --git a/autoware_ml/detection3d/datasets/t4dataset.py b/autoware_ml/detection3d/datasets/t4dataset.py index ce1c78f31..06e063233 100644 --- a/autoware_ml/detection3d/datasets/t4dataset.py +++ b/autoware_ml/detection3d/datasets/t4dataset.py @@ -2,6 +2,7 @@ from typing import List import numpy as np +import tqdm from mmdet3d.datasets import NuScenesDataset from mmengine.logging import print_log from mmengine.registry import DATASETS @@ -51,21 +52,27 @@ def filter_data(self) -> List[dict]: if not self.filter_cfg: return self.data_list - filter_frames_with_camera_order = self.filter_cfg.get("filter_frames_with_camera_order", None) - if filter_frames_with_camera_order is None: + filter_frames_with_camera_orders = self.filter_cfg.get("filter_frames_with_camera_orders", None) + if filter_frames_with_camera_orders is None: return self.data_list filtered_data_list = [] - for entry in self.data_list: + for entry in tqdm.tqdm(self.data_list, desc="Filtering data"): + vehicle_type = entry.get("vehicle_type", None) + if vehicle_type is None: + raise KeyError(f"Missing 'vehicle_type' in entry: {entry}") + + filter_frames_with_camera_order = filter_frames_with_camera_orders.get(vehicle_type, None) + if filter_frames_with_camera_order is None: + raise KeyError(f"Missing camera order for vehicle type '{vehicle_type}' in filter configuration.") + filtered = False for camera_order in filter_frames_with_camera_order: if camera_order not in entry["images"]: filtered = True break - if entry["images"][camera_order]["img_path"] is None or not osp.exists( - entry["images"][camera_order]["img_path"] - ): + if entry["images"][camera_order]["img_path"] is None: filtered = True break @@ -180,6 +187,7 @@ def parse_data_info(self, info: dict) -> dict: cam_prefix, img_info["img_path"], ) + # print_log(f"Camera path: {img_info['img_path']}", logger="current") if self.default_cam_key is not None: info["img_path"] = 
info["images"][self.default_cam_key]["img_path"] @@ -192,4 +200,7 @@ def parse_data_info(self, info: dict) -> dict: else: info["lidar2img"] = info["cam2img"] @ info["lidar2cam"] + # Default difficulty to 0 if not present + if "difficulty" not in info: + info["difficulty"] = 0 return info diff --git a/autoware_ml/detection3d/evaluation/t4metric/t4metric.py b/autoware_ml/detection3d/evaluation/t4metric/t4metric.py index 2df0ac490..e3f68c5e3 100644 --- a/autoware_ml/detection3d/evaluation/t4metric/t4metric.py +++ b/autoware_ml/detection3d/evaluation/t4metric/t4metric.py @@ -262,8 +262,7 @@ def _parse_ground_truth_from_sample(self, data_sample: Dict[str, Any]) -> dict: "num_lidar_pts": num_lidar_pts, } - @staticmethod - def _get_scene_info(data_infos: List[dict]) -> Tuple[List[str], List[str]]: + def _get_scene_info(self, data_infos: List[dict]) -> Tuple[List[str], List[str]]: """Get scene tokens and directory names from data infos. Args: @@ -284,6 +283,7 @@ def _get_scene_info(data_infos: List[dict]) -> Tuple[List[str], List[str]]: if directory not in directories: scene_tokens.append(scene_token) directories.append(directory) + return scene_tokens, directories @staticmethod diff --git a/projects/BEVFusion/bevfusion/__init__.py b/projects/BEVFusion/bevfusion/__init__.py index e84525651..09bb61b9a 100644 --- a/projects/BEVFusion/bevfusion/__init__.py +++ b/projects/BEVFusion/bevfusion/__init__.py @@ -3,10 +3,17 @@ from .bevfusion_necks import GeneralizedLSSFPN from .bevfusion_voxel_encoder import HardSimpleVoxelSinCosEncoder from .depth_lss import DepthLSSTransform, LSSTransform -from .loading import BEVLoadMultiViewImageFromFiles +from .depth_lss_v2 import LSSTransformV2, LSSTransformV2DepthAware +from .loading import BEVLoadMultiViewImageFromFiles, PointsToMultiViewImageDepths from .sparse_encoder import BEVFusionSparseEncoder from .transformer import TransformerDecoderLayer -from .transforms_3d import BEVFusionGlobalRotScaleTrans, BEVFusionRandomFlip3D, GridMask, 
ImageAug3D +from .transforms_3d import ( + BEVFusionGlobalRotScaleTrans, + BEVFusionRandomFlip3D, + BEVFusionRemoveLiDARPoints, + GridMask, + ImageAug3D, +) from .utils import BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D, IoU3DCost, TransFusionBBoxCoder __all__ = [ @@ -27,6 +34,10 @@ "TransformerDecoderLayer", "BEVFusionRandomFlip3D", "BEVFusionGlobalRotScaleTrans", + "BEVFusionRemoveLiDARPoints", "TransFusionBBoxCoder", "HardSimpleVoxelSinCosEncoder", + "LSSTransformV2", + "PointsToMultiViewImageDepths", + "LSSTransformV2DepthAware", ] diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 709d851a9..4ff2519d7 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -1,7 +1,10 @@ +import math from collections import OrderedDict from copy import deepcopy +from pathlib import Path from typing import Dict, List, Optional, Tuple +import matplotlib.pyplot as plt import numpy as np import torch import torch.distributed as dist @@ -9,6 +12,7 @@ from mmdet3d.registry import MODELS from mmdet3d.structures import Det3DDataSample from mmdet3d.utils import OptConfigType, OptMultiConfig, OptSampleList +from mmengine.logging import print_log from mmengine.utils import is_list_of from torch import Tensor from torch.nn import functional as F @@ -34,6 +38,9 @@ def __init__( bbox_head: Optional[dict] = None, init_cfg: OptMultiConfig = None, seg_head: Optional[dict] = None, + loss_depth_weight: float = 3.0, + depth_gt_downsample: int = 1, + visualize_gt_depth_dir: Optional[str] = None, **kwargs, ) -> None: """Initialize BEVFusion model. 
@@ -74,8 +81,12 @@ def __init__( self.pts_neck = MODELS.build(pts_neck) if pts_neck is not None else None self.bbox_head = MODELS.build(bbox_head) - - self.init_weights() + self._weights_initialized = False + self.loss_depth_weight = loss_depth_weight + self.depth_gt_downsample = depth_gt_downsample + self.visualize_gt_depth_dir = Path(visualize_gt_depth_dir) if visualize_gt_depth_dir is not None else None + if self.visualize_gt_depth_dir is not None: + self.visualize_gt_depth_dir.mkdir(parents=True, exist_ok=True) def _forward( self, batch_inputs_dict: Tensor, batch_data_samples: OptSampleList = [], using_image_features=False, **kwargs @@ -131,8 +142,11 @@ def parse_losses(self, losses: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, D return loss, log_vars # type: ignore def init_weights(self) -> None: + if self._weights_initialized: + return if self.img_backbone is not None: self.img_backbone.init_weights() + self._weights_initialized = True @property def with_bbox_head(self): @@ -144,6 +158,53 @@ def with_seg_head(self): """bool: Whether the detector has a segmentation head.""" return hasattr(self, "seg_head") and self.seg_head is not None + def prepare_camera_depth_aware_parameters( + self, + camera_intrinsics: torch.Tensor, + img_aug_matrix: torch.Tensor, + lidar_aug_matrix: torch.Tensor, + camera2lidar: torch.Tensor, + ) -> torch.Tensor: + """ + Args: + camera_intrinsics: torch.Tensor, the camera intrinsics of shape (B, N, 3, 3). + img_aug_matrix: torch.Tensor, the image augmentation matrix of shape (B, N, 4, 4). + lidar_aug_matrix: torch.Tensor, the lidar augmentation matrix of shape (B, 4, 4). + camera2lidar: torch.Tensor, the camera to lidar matrix of shape (B, N, 4, 4). + Returns: + torch.Tensor, the camera depth aware parameters of shape (B, N, N_CAMERA_DEPTH_PARAMETERS). 
+ """ + B, N, _, _ = camera_intrinsics.shape + lidar_aug_matrix = lidar_aug_matrix.view(B, 1, 4, 4).repeat(1, N, 1, 1) + + # (B, N, 15) + mlp_input = torch.stack( + [ + camera_intrinsics[:, :, 0, 0], # fx + camera_intrinsics[:, :, 1, 1], # fy + camera_intrinsics[:, :, 0, 2], # cx + camera_intrinsics[:, :, 1, 2], # cy + img_aug_matrix[:, :, 0, 0], # r11 + img_aug_matrix[:, :, 0, 1], # r12 + img_aug_matrix[:, :, 0, 3], # t1 + img_aug_matrix[:, :, 1, 0], # r21 + img_aug_matrix[:, :, 1, 1], # r22 + img_aug_matrix[:, :, 1, 3], # t2 + lidar_aug_matrix[:, :, 0, 0], # r11 + lidar_aug_matrix[:, :, 0, 1], # r12 + lidar_aug_matrix[:, :, 1, 0], # r21 + lidar_aug_matrix[:, :, 1, 1], # r22 + lidar_aug_matrix[:, :, 2, 2], # r33 + ], + dim=-1, + ) + # (B, N, 4, 4) -> (B, N, 3, 4) -> (B, N, 12) + camera2lidar_flatten = camera2lidar[:, :, :3, :].view(B, N, -1) + + # (B, N, 15+12) + mlp_input = torch.cat([mlp_input, camera2lidar_flatten], dim=-1) + return mlp_input + def get_image_backbone_features(self, x: torch.Tensor) -> torch.Tensor: B, N, C, H, W = x.size() x = x.view(B * N, C, H, W).contiguous() @@ -174,14 +235,15 @@ def extract_img_feat( lidar_aug_matrix_inverse=None, geom_feats=None, using_image_features=False, - ) -> torch.Tensor: + camera_depth_aware_parameters=None, + ) -> Tuple[torch.Tensor, torch.Tensor]: if not using_image_features: x = self.get_image_backbone_features(x) with torch.amp.autocast("cuda", enabled=False): # with torch.autocast(device_type='cuda', dtype=torch.float32): - x = self.view_transform( + x, pred_depths = self.view_transform( x, points, lidar2image, @@ -194,8 +256,9 @@ def extract_img_feat( img_aug_matrix_inverse, lidar_aug_matrix_inverse, geom_feats, + camera_depth_aware_parameters=camera_depth_aware_parameters, ) - return x + return x, pred_depths def extract_pts_feat(self, feats, coords, sizes, points=None) -> torch.Tensor: if points is not None: @@ -271,7 +334,7 @@ def predict( contains a tensor with shape (num_instances, 7). 
""" batch_input_metas = [item.metainfo for item in batch_data_samples] - feats = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features) + feats, _ = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features) if self.with_bbox_head: outputs = self.bbox_head.predict(feats, batch_input_metas) @@ -292,6 +355,7 @@ def extract_feat( features = [] is_onnx_inference = False + pred_depths = None if imgs is not None and "lidar2img" not in batch_inputs_dict: # NOTE(knzo25): normal training and testing imgs = imgs.contiguous() @@ -309,7 +373,13 @@ def extract_feat( camera2lidar = imgs.new_tensor(np.asarray(camera2lidar)) img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix)) lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix)) - img_feature = self.extract_img_feat( + camera_depth_aware_parameters = self.prepare_camera_depth_aware_parameters( + camera_intrinsics=camera_intrinsics, + img_aug_matrix=img_aug_matrix, + lidar_aug_matrix=lidar_aug_matrix, + camera2lidar=camera2lidar, + ) + img_feature, pred_depths = self.extract_img_feat( imgs, deepcopy(points), lidar2image, @@ -319,6 +389,7 @@ def extract_feat( lidar_aug_matrix, batch_input_metas, using_image_features=using_image_features, + camera_depth_aware_parameters=camera_depth_aware_parameters, ) features.append(img_feature) elif imgs is not None: @@ -330,8 +401,10 @@ def extract_feat( img_aug_matrix = batch_inputs_dict["img_aug_matrix"] lidar_aug_matrix = batch_inputs_dict["lidar_aug_matrix"] geom_feats = batch_inputs_dict["geom_feats"] + # Retrieve the parameters from deployment code directly + camera_depth_aware_parameters = batch_inputs_dict["camera_depth_aware_parameters"] - img_feature = self.extract_img_feat( + img_feature, pred_depths = self.extract_img_feat( imgs, points, lidar2image, @@ -342,6 +415,7 @@ def extract_feat( batch_input_metas, geom_feats=geom_feats, using_image_features=using_image_features, + 
camera_depth_aware_parameters=camera_depth_aware_parameters, ) features.append(img_feature) @@ -366,7 +440,7 @@ def extract_feat( if self.pts_neck is not None: x = self.pts_neck(x) - return x + return x, pred_depths def loss( self, @@ -376,12 +450,145 @@ def loss( **kwargs, ) -> List[Det3DDataSample]: batch_input_metas = [item.metainfo for item in batch_data_samples] - feats = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features) + feats, pred_depths = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features) losses = dict() + if self.loss_depth_weight > 0 and pred_depths is not None: + with torch.amp.autocast("cuda", enabled=False): + gt_depths = torch.stack( + [ + ( + meta["gt_depths"] + if isinstance(meta["gt_depths"], torch.Tensor) + else torch.as_tensor(meta["gt_depths"]) + ) + for meta in batch_input_metas + ] + ).to(device=pred_depths.device, dtype=torch.float32) + depth_loss = self.get_depth_loss(gt_depths, pred_depths) + losses["loss_depth"] = depth_loss + if self.with_bbox_head: bbox_loss = self.bbox_head.loss(feats, batch_data_samples) - - losses.update(bbox_loss) + losses.update(bbox_loss) return losses + + def _visualize_one_hot_gt_depth( + self, + gt_depths_one_hot: Tensor, + batch_size: int, + num_cameras: int, + height: int, + width: int, + batch_idx: int = 0, + num_channels: int = 6, + ) -> None: + """Save one-hot depth GT maps for the first batch and first few depth channels. + + Args: + gt_depths_one_hot (Tensor): One-hot depth GT of shape [B*N*H*W, D]. + batch_size (int): Batch size B from the original input. + num_cameras (int): Number of camera views N from the original input. + height (int): Original input height H before downsampling. + width (int): Original input width W before downsampling. + batch_idx (int): Batch index to visualize. + num_channels (int): Number of depth-bin channels to visualize. 
+ """ + if self.visualize_gt_depth_dir is None: + return + + if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0: + return + + if batch_size <= batch_idx or num_cameras == 0: + return + + downsample = self.depth_gt_downsample + height_down = height // downsample + width_down = width // downsample + num_depth_bins = gt_depths_one_hot.shape[1] + + num_channels = min(num_channels, num_depth_bins) + if num_channels == 0 or height_down == 0 or width_down == 0: + return + + with torch.no_grad(): + one_hot = gt_depths_one_hot.view(batch_size, num_cameras, height_down, width_down, num_depth_bins) + depth_channels = one_hot[batch_idx, 0, :, :, :num_channels].detach().float().cpu().numpy() + + ncols = min(3, num_channels) + nrows = math.ceil(num_channels / ncols) + fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows), squeeze=False) + + dbounds = self.view_transform.dbound + for ch_idx in range(num_channels): + ax = axes[ch_idx // ncols, ch_idx % ncols] + channel_map = depth_channels[:, :, ch_idx] + depth_m = dbounds[0] + (ch_idx + 0.5) * dbounds[2] + im = ax.imshow(channel_map, cmap="viridis", vmin=0, vmax=1, interpolation="nearest") + ax.set_title(f"batch {batch_idx}, depth bin {ch_idx} (~{depth_m:.1f}m)") + ax.set_xticks([]) + ax.set_yticks([]) + fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04) + + for ch_idx in range(num_channels, nrows * ncols): + axes[ch_idx // ncols, ch_idx % ncols].axis("off") + + fig.suptitle(f"one-hot gt_depth (batch={batch_idx}, cam=0, bins=0-{num_channels - 1})") + fig.tight_layout() + + if not hasattr(self, "_gt_depth_one_hot_vis_count"): + self._gt_depth_one_hot_vis_count = 0 + self._gt_depth_one_hot_vis_count += 1 + save_path = self.visualize_gt_depth_dir / f"gt_depth_one_hot_{self._gt_depth_one_hot_vis_count:06d}.png" + fig.savefig(save_path, dpi=150, bbox_inches="tight") + plt.close(fig) + print_log(f"Saved one-hot gt_depth visualization to {save_path.resolve()}") + + def get_downsampled_gt_depth(self, 
gt_depths): + """ + Input: + gt_depths: [B, N, H, W] + Output: + gt_depths: [B*N*h*w, d] + """ + B, N, H, W = gt_depths.shape + D = self.view_transform.D + dbounds = self.view_transform.dbound + gt_depths = gt_depths.view( + B * N, + H // self.depth_gt_downsample, + self.depth_gt_downsample, + W // self.depth_gt_downsample, + self.depth_gt_downsample, + 1, + ) + gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous() + gt_depths = gt_depths.view(-1, self.depth_gt_downsample * self.depth_gt_downsample) + gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths) + gt_depths = torch.min(gt_depths_tmp, dim=-1).values + gt_depths = gt_depths.view(B * N, H // self.depth_gt_downsample, W // self.depth_gt_downsample) + + gt_depths = (gt_depths - (dbounds[0] - dbounds[2])) / dbounds[2] + # gt_depths = torch.where(gt_depths >= 0.0, gt_depths, torch.zeros_like(gt_depths)) + # gt_depths = torch.clamp(gt_depths, max=float(D)) + gt_depths = torch.where((gt_depths >= 0.0) & (gt_depths < D + 1), gt_depths, torch.zeros_like(gt_depths)) + # gt_depths = torch.clamp(gt_depths, max=float(D)) + gt_depths = F.one_hot(gt_depths.long(), num_classes=D + 1).view(-1, D + 1)[:, 1:] + self._visualize_one_hot_gt_depth(gt_depths, B, N, H, W) + return gt_depths.float() + + def get_depth_loss(self, depth_labels, depth_preds): + depth_labels = self.get_downsampled_gt_depth(depth_labels) + # (B, N, D, H, W) -> (B*N*H*W, D) + depth_preds = depth_preds.permute(0, 1, 3, 4, 2).contiguous().view(-1, self.view_transform.D) + fg_mask = torch.max(depth_labels, dim=1).values > 0.0 + depth_labels = depth_labels[fg_mask] + depth_preds = depth_preds[fg_mask] + depth_loss = F.binary_cross_entropy( + depth_preds, + depth_labels, + reduction="none", + ).sum() / max(1.0, fg_mask.sum()) + return self.loss_depth_weight * depth_loss diff --git a/projects/BEVFusion/bevfusion/bevfusion_head.py b/projects/BEVFusion/bevfusion/bevfusion_head.py index 2d713b022..a7ddca4ca 100644 --- 
a/projects/BEVFusion/bevfusion/bevfusion_head.py +++ b/projects/BEVFusion/bevfusion/bevfusion_head.py @@ -26,11 +26,18 @@ def clip_sigmoid(x, eps=1e-4): @MODELS.register_module() class ConvFuser(nn.Sequential): - def __init__(self, in_channels: int, out_channels: int, kernel_size: int, padding: int) -> None: + def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, padding: int) -> None: self.in_channels = in_channels self.out_channels = out_channels super().__init__( - nn.Conv2d(sum(in_channels), out_channels, kernel_size, padding, bias=False), + nn.Conv2d( + sum(in_channels), + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=False, + ), nn.BatchNorm2d(out_channels), nn.ReLU(True), ) diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py index ac7c5b503..d0a547258 100644 --- a/projects/BEVFusion/bevfusion/depth_lss.py +++ b/projects/BEVFusion/bevfusion/depth_lss.py @@ -1,8 +1,13 @@ # modify from https://github.com/mit-han-lab/bevfusion +import math +from pathlib import Path from typing import Tuple +import matplotlib.pyplot as plt +import numpy as np import torch from mmdet3d.registry import MODELS +from mmengine.logging import print_log from torch import nn from .ops import bev_pool @@ -164,6 +169,7 @@ def __init__( ybound: Tuple[float, float, float], zbound: Tuple[float, float, float], dbound: Tuple[float, float, float], + visualize_bev_feat: bool = False, ) -> None: super().__init__() self.in_channels = in_channels @@ -183,6 +189,7 @@ def __init__( self.frustum = self.create_frustum() self.D = self.frustum.shape[0] self.fp16_enabled = False + self.visualize_bev_feat = visualize_bev_feat def create_frustum(self): iH, iW = self.image_size @@ -319,8 +326,55 @@ def bev_pool_precomputed(self, x, geom_feats, kept, ranks, indices): # collapse Z final = torch.cat(x.unbind(dim=2), 1) + if self.visualize_bev_feat: + self.plot_bev_feat(final) + 
return final + def plot_bev_feat(self, bev_feat): + """Visualize the BEV feat for the given batch index.""" + try: + import torch.distributed as dist + + if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0: + return + except ImportError: + pass + + batch_idx = 0 + if bev_feat.shape[0] <= batch_idx: + return + + # save first 10 raw channel maps for one batch sample (B, C, Y, X) + num_channels = 10 + with torch.no_grad(): + feat = bev_feat[batch_idx].detach().float().cpu().numpy() + channel_indices = np.arange(min(num_channels, feat.shape[0])) + ncols = min(5, len(channel_indices)) + nrows = math.ceil(len(channel_indices) / ncols) + fig, axes = plt.subplots(nrows, ncols, figsize=(3 * ncols, 3 * nrows), squeeze=False) + for ax, ch_idx in zip(axes.ravel(), channel_indices): + ch_map = feat[ch_idx] + im = ax.imshow(ch_map, cmap="viridis", origin="lower", aspect="equal") + ax.set_title(f"ch {ch_idx}", fontsize=9) + ax.set_xlabel("X") + ax.set_ylabel("Y") + fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04) + for ax in axes.ravel()[len(channel_indices) :]: + ax.axis("off") + fig.suptitle(f"bev_feat channels 0-{len(channel_indices) - 1} (batch={batch_idx})") + fig.tight_layout() + + save_dir = Path("work_dirs/bev_feat_vis_2") + save_dir.mkdir(parents=True, exist_ok=True) + if not hasattr(self, "_bev_feat_vis_count"): + self._bev_feat_vis_count = 0 + self._bev_feat_vis_count += 1 + save_path = save_dir / f"bev_feat_batch{batch_idx}_{self._bev_feat_vis_count:06d}.png" + fig.savefig(save_path, dpi=150, bbox_inches="tight") + plt.close(fig) + print_log(f"Saved BEV feat visualization to {save_path.resolve()}") + def forward( self, img, diff --git a/projects/BEVFusion/bevfusion/depth_lss_v2.py b/projects/BEVFusion/bevfusion/depth_lss_v2.py new file mode 100644 index 000000000..addf038d3 --- /dev/null +++ b/projects/BEVFusion/bevfusion/depth_lss_v2.py @@ -0,0 +1,537 @@ +import math +from pathlib import Path +from typing import Optional, Tuple + +import numpy 
as np +import torch +from mmdet3d.registry import MODELS +from mmengine.logging import print_log +from torch import nn +from torch.utils.checkpoint import checkpoint + +from .depth_lss import BaseViewTransform, DepthLSSNet, DownSampleNet, LidarDepthImageNet +from .ops import bev_pool_v2 + + +class SELayer(nn.Module): + """ + Squeeze-and-Excitation (SE) layer. + This is used to modulate features with camera-depth aware parameters. + The code is taken from BEVDET (https://github.com/hustvl/BEVDET). + """ + + def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): + super().__init__() + # Don't need global pooling because inputs are (B*N, C, 1, 1). + self.sequeeze_net = nn.Sequential( + # Squeeze with 1x1 convolution + nn.Conv2d(channels, channels, 1, bias=True), + # Activation + act_layer(), + # Expand with 1x1 convolution + nn.Conv2d(channels, channels, 1, bias=True), + # Gate with sigmoid activation + gate_layer(), + ) + + def forward(self, x: torch.Tensor, depth_aware_features: torch.Tensor) -> torch.Tensor: + """ + Args: + x: torch.Tensor, the features to gate; they are multiplied element-wise by attentions computed from depth_aware_features. + Returns: + torch.Tensor, x * attentions, i.e. the broadcast shape of x and the attentions. + """ + feature_attentions = self.sequeeze_net(depth_aware_features) + return x * feature_attentions + + +class CameraDepthLinearProjectionMLP(nn.Module): + """ + Linear projection module by MLP. This is used to project image (context) features and camera-depth + aware parameters (for example, intrinsics) to embedding space. + The code is taken from BEVDET (https://github.com/hustvl/BEVDET). + """ + + def __init__(self, in_channels: int, hidden_channels: int, out_channels: int, drop_out: float = 0.0): + """ + Args: + in_channels: int, the number of input channels. + hidden_channels: int, the number of hidden channels. + out_channels: int, the number of output channels. + drop_out: float, the dropout rate. 
+ """ + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.drop_out = drop_out + + self.sequential_mlp = nn.Sequential( + nn.Linear(in_channels, hidden_channels), + nn.ReLU(inplace=True), + nn.Dropout(drop_out), + nn.Linear(hidden_channels, out_channels), + nn.Dropout(drop_out), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Args: + x: torch.Tensor, the input tensor whose last dimension is in_channels. + Returns: + torch.Tensor, the output tensor whose last dimension is out_channels. + """ + return self.sequential_mlp(x) + + +class CameraDepthAwareNet(nn.Module): + """ + Camera-depth aware depth net. This is used to predict the depth of the scene. + The code is taken from BEVDET (https://github.com/hustvl/BEVDET). + """ + + def __init__( + self, + in_channels: int, + hidden_channels: int, + out_channels: int, + mlp_drop_out: float, + depth_channels: int, + with_cp: bool = False, + num_camera_depth_parameters: int = 27, + ) -> None: + """ + Args: + in_channels: int, the number of input channels. + out_channels: int, the number of output channels. + mlp_drop_out: float, the dropout rate of the MLP. + hidden_channels: int, the number of hidden channels shared by the context conv, both MLPs and both SE layers. + depth_channels: int, the number of output channels of the depth branch convolution. 
+ """ + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.mlp_drop_out = mlp_drop_out + self.num_camera_depth_parameters = num_camera_depth_parameters + self.depth_channels = depth_channels + self.with_cp = with_cp + + # Input convolution for context/image features + # Camera depth aware parameters branch + self.camera_depth_aware_parameters_bn = nn.BatchNorm1d(self.num_camera_depth_parameters) + + # Context/image feature branch + self.context_input_conv = nn.Sequential( + nn.Conv2d(in_channels, hidden_channels, kernel_size=1, stride=1, bias=False), + nn.BatchNorm2d(hidden_channels), + nn.ReLU(inplace=True), + ) + self.context_camera_depth_aware_mlp = CameraDepthLinearProjectionMLP( + in_channels=self.num_camera_depth_parameters, + hidden_channels=hidden_channels, + out_channels=hidden_channels, + drop_out=self.mlp_drop_out, + ) + self.context_se = SELayer(channels=hidden_channels) + self.context_conv = nn.Conv2d(hidden_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True) + + # Depth branch + self.depth_camera_depth_aware_mlp = CameraDepthLinearProjectionMLP( + in_channels=self.num_camera_depth_parameters, + hidden_channels=hidden_channels, + out_channels=hidden_channels, + drop_out=self.mlp_drop_out, + ) + self.depth_se = SELayer(channels=hidden_channels) + self.depth_conv = nn.Sequential( + nn.Conv2d(hidden_channels, depth_channels, kernel_size=1, stride=1, padding=0, bias=True) + ) + + def context_forward( + self, context_features: torch.Tensor, camera_depth_aware_features: torch.Tensor + ) -> torch.Tensor: + """ + Args: + x: torch.Tensor, the input tensor of shape (B*N, C, H, W). + camera_depth_aware_parameters: torch.Tensor, the camera-depth aware parameters of shape (B*N, N_CAMERA_DEPTH_PARAMETERS). + Returns: + torch.Tensor, the output tensor of shape (B*N, C, H, W). 
+ """ + context_camera_depth_aware_features = self.context_camera_depth_aware_mlp(camera_depth_aware_features) + # # (B*N, mlp_out_channels) -> (B*N, mlp_out_channels, 1, 1) + context_camera_depth_aware_features = context_camera_depth_aware_features.view(-1, self.hidden_channels, 1, 1) + context_features = self.context_se(context_features, context_camera_depth_aware_features) + context_features = self.context_conv(context_features) + return context_features + + def depth_forward(self, depth_features: torch.Tensor, camera_depth_aware_features: torch.Tensor) -> torch.Tensor: + """ + Args: + depth_features: torch.Tensor, the input tensor of shape (B*N, C, H, W). + camera_depth_aware_parameters: torch.Tensor, the camera-depth aware parameters of shape (B, N, D). + Returns: + torch.Tensor, the output tensor of shape (B*N, C, H, W). + """ + depth_camera_depth_aware_features = self.depth_camera_depth_aware_mlp(camera_depth_aware_features) + # # (B*N, mlp_out_channels) -> (B*N, mlp_out_channels, 1, 1) + depth_camera_depth_aware_features = depth_camera_depth_aware_features.view(-1, self.hidden_channels, 1, 1) + # # (B*N, C, H, W) + depth_features = self.depth_se(depth_features, depth_camera_depth_aware_features) + if self.with_cp: + depth_features = checkpoint(self.depth_conv, depth_features) + else: + depth_features = self.depth_conv(depth_features) + return depth_features + + def forward(self, x: torch.Tensor, camera_depth_aware_parameters: torch.Tensor) -> torch.Tensor: + """ + Args: + x: torch.Tensor, the input tensor of shape (B, N, C, H, W). + camera_depth_aware_parameters: torch.Tensor, the camera-depth aware parameters of shape (B, N, N_CAMERA_DEPTH_PARAMETERS). + Returns: + torch.Tensor, the output tensor of shape (B*N, C, H, W). 
+ """ + # (B, N, N_CAMERA_DEPTH_PARAMETERS) -> (B*N, N_CAMERA_DEPTH_PARAMETERS) + camera_depth_aware_parameters = camera_depth_aware_parameters.view(-1, self.num_camera_depth_parameters) + + # (B*N, N_CAMERA_DEPTH_PARAMETERS) + camera_depth_aware_features = self.camera_depth_aware_parameters_bn(camera_depth_aware_parameters) + context_input_features = self.context_input_conv(x) + context_features = self.context_forward(context_input_features, camera_depth_aware_features) + depth_features = self.depth_forward(context_input_features, camera_depth_aware_features) + return torch.cat([depth_features, context_features], dim=1) + + +class BaseViewTransformV2(BaseViewTransform): + + def __init__( + self, + in_channels: int, + out_channels: int, + image_size: Tuple[int, int], + feature_size: Tuple[int, int], + xbound: Tuple[float, float, float], + ybound: Tuple[float, float, float], + zbound: Tuple[float, float, float], + dbound: Tuple[float, float, float], + collapse_z: bool = True, + expand_batch_axis: bool = False, + visualize_bev_feat: bool = False, + ): + """ + Args: + collapse_z: collapse the Z axis of the BEV grid + expand_batch_axis: expand the batch axis of the inputs to bev pool if this is set to True. 
+ """ + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + image_size=image_size, + feature_size=feature_size, + xbound=xbound, + ybound=ybound, + zbound=zbound, + dbound=dbound, + visualize_bev_feat=visualize_bev_feat, + ) + self.collapse_z = collapse_z + self.expand_batch_axis = expand_batch_axis + + def get_cam_feats( + self, x, camera_depth_aware_parameters: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError + + def forward( + self, + img, + points, + lidar2image, + camera_intrinsics, + camera2lidar, + img_aug_matrix, + lidar_aug_matrix, + metas, + camera_intrinsics_inverse, + img_aug_matrix_inverse, + lidar_aug_matrix_inverse, + geom_feats_precomputed, + camera_depth_aware_parameters: Optional[torch.Tensor] = None, + ): + if geom_feats_precomputed is not None: + ranks_bev, ranks_depth, ranks_feat = geom_feats_precomputed + x, depth_softmax = self.get_cam_feats(img) + x = self.bev_pool_precomputed(x, depth_softmax, ranks_bev, ranks_depth, ranks_feat) + + # No return depth predictions when precomputed geometry features are used + depth_softmax = None + + else: + intrins = camera_intrinsics[..., :3, :3] + post_rots = img_aug_matrix[..., :3, :3] + post_trans = img_aug_matrix[..., :3, 3] + camera2lidar_rots = camera2lidar[..., :3, :3] + camera2lidar_trans = camera2lidar[..., :3, 3] + + extra_rots = lidar_aug_matrix[..., :3, :3] + extra_trans = lidar_aug_matrix[..., :3, 3] + + geom = self.get_geometry( + camera2lidar_rots, + camera2lidar_trans, + torch.inverse(intrins), + torch.inverse(post_rots), + post_trans, + extra_rots=extra_rots, + extra_trans=extra_trans, + ) + + # depth is not connected to the calibration + # on_img is + # is also flattened_indices + ( + view_feats, + depth_softmax, + ) = self.get_cam_feats(img, camera_depth_aware_parameters) + x = self.bev_pool(view_feats, depth_softmax, geom) + + return x, depth_softmax + + def bev_pool_aux(self, geom_feats): + B, N, D, H, W, C = 
geom_feats.shape + Nprime = B * N * D * H * W + assert C == 3 + + # record the index of selected points for acceleration purpose + ranks_depth = torch.arange(0, Nprime, dtype=torch.int, device=geom_feats.device) + ranks_feat = torch.arange(0, Nprime // D, dtype=torch.int, device=geom_feats.device) + ranks_feat = ranks_feat.reshape(B, N, 1, H, W) + ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten() + + # flatten indices + geom_feats = ((geom_feats - (self.bx - self.dx / 2.0)) / self.dx).long() + geom_feats = geom_feats.view(Nprime, 3) + batch_ix = torch.cat( + [torch.full([Nprime // B, 1], ix, device=geom_feats.device, dtype=torch.long) for ix in range(B)] + ) + geom_feats = torch.cat((geom_feats, batch_ix), 1) + + # filter out points that are outside box + kept = ( + (geom_feats[:, 0] >= 0) + & (geom_feats[:, 0] < self.nx[0]) + & (geom_feats[:, 1] >= 0) + & (geom_feats[:, 1] < self.nx[1]) + & (geom_feats[:, 2] >= 0) + & (geom_feats[:, 2] < self.nx[2]) + ) + + if len(kept) == 0: + return None, None, None + + geom_feats, ranks_depth, ranks_feat = geom_feats[kept], ranks_depth[kept], ranks_feat[kept] + + # Switch x and y to match the order of the BEV grid + ranks_bev = ( + geom_feats[:, 3] * (self.nx[2] * self.nx[1] * self.nx[0]) + + geom_feats[:, 2] * (self.nx[1] * self.nx[0]) + + geom_feats[:, 0] * self.nx[1] + + geom_feats[:, 1] + ) + indices = ranks_bev.argsort() + ranks_bev, ranks_depth, ranks_feat = ranks_bev[indices], ranks_depth[indices], ranks_feat[indices] + return ( + ranks_bev.int().contiguous(), + ranks_depth.int().contiguous(), + ranks_feat.int().contiguous(), + ) + + def compute_intervals(self, ranks_bev: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + if ranks_bev is None: + return None, None + + kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) + kept[1:] = ranks_bev[1:] != ranks_bev[:-1] + interval_starts = torch.where(kept)[0].int() + if len(interval_starts) == 0: + return None, None + + 
interval_lengths = torch.zeros_like(interval_starts) + interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] + interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1] + return interval_starts.int().contiguous(), interval_lengths.int().contiguous() + + def bev_pool(self, view_feats, depth_softmax, geom) -> torch.Tensor: + """ """ + ranks_bev, ranks_depth, ranks_feat = self.bev_pool_aux(geom) + interval_starts, interval_lengths = self.compute_intervals(ranks_bev) + bev_feat = self.compute_bev_pool( + view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths + ) + return bev_feat + + def compute_bev_pool( + self, view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths + ): + """Compute the BEV pool for the given view features, depth softmax, ranks, and intervals.""" + if interval_starts is None: + print_log("warning ---> no points within the predefined bev receptive field") + dummy = torch.zeros( + size=[view_feats.shape[0], view_feats.shape[2], self.nx[2], self.nx[1], self.nx[0]], + dtype=view_feats.dtype, + device=view_feats.device, + ) + if self.collapse_z: + dummy = torch.cat(dummy.unbind(dim=2), 1) + return dummy + + if self.expand_batch_axis: + view_feats = view_feats.unsqueeze(0) + depth_softmax = depth_softmax.unsqueeze(0) + + # permute view_feats from (B, N, C, fH, fW) to (B, N, fH, fW, C) + view_feats = view_feats.permute(0, 1, 3, 4, 2) + bev_feat_shape = ( + depth_softmax.shape[0], + int(self.nx[2]), + int(self.nx[1]), + int(self.nx[0]), + view_feats.shape[-1], + ) # (B, Z, Y, X, C) + bev_feat = bev_pool_v2( + depth=depth_softmax, + feat=view_feats, + ranks_depth=ranks_depth, + ranks_feat=ranks_feat, + ranks_bev=ranks_bev, + interval_starts=interval_starts, + interval_lengths=interval_lengths, + bev_feat_shape=bev_feat_shape, + is_training=self.training, + ) + + # collapse Z + if self.collapse_z: + bev_feat = torch.cat(bev_feat.unbind(dim=2), 1) + + if 
self.visualize_bev_feat: + self.plot_bev_feat(bev_feat) + + return bev_feat + + def bev_pool_precomputed(self, view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat): + interval_starts, interval_lengths = self.compute_intervals(ranks_bev) + bev_feat = self.compute_bev_pool( + view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths + ) + return bev_feat + + def get_depth_softmax(self, x: torch.Tensor, B, N, fH, fW) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: torch.Tensor, the input tensor of shape (B*N, D+C, H, W). + Returns: + Tuple[torch.Tensor, torch.Tensor], the tuple containing the view features and depth softmax. + view_feats: torch.Tensor, the view features of shape (B, N, C, H, W). + depth_softmax: torch.Tensor, the depth softmax of shape (B, N, D, H, W). + """ + depth_softmax = x[:, : self.D].softmax(dim=1) + depth_softmax = depth_softmax.view(B, N, self.D, fH, fW) + view_feats = x[:, self.D : (self.D + self.C)] + view_feats = view_feats.view(B, N, self.C, fH, fW) + return view_feats, depth_softmax + + +@MODELS.register_module() +class LSSTransformV2(BaseViewTransformV2): + + def __init__( + self, + in_channels: int, + out_channels: int, + image_size: Tuple[int, int], + feature_size: Tuple[int, int], + xbound: Tuple[float, float, float], + ybound: Tuple[float, float, float], + zbound: Tuple[float, float, float], + dbound: Tuple[float, float, float], + downsample: int = 1, + ): + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + image_size=image_size, + feature_size=feature_size, + xbound=xbound, + ybound=ybound, + zbound=zbound, + dbound=dbound, + ) + self.depthnet = nn.Conv2d(self.in_channels, self.D + self.C, 1) + self.downsample = DownSampleNet(downsample, out_channels, out_channels) + + def get_cam_feats( + self, x: torch.Tensor, camera_depth_aware_parameters: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + B, N, C, fH, fW = x.shape + x = 
x.view(B * N, C, fH, fW) + x = self.depthnet(x) + return self.get_depth_softmax(x, B=B, N=N, fH=fH, fW=fW) + + def forward(self, *args, **kwargs): + x, depth_softmax = super().forward(*args, **kwargs) + x = self.downsample(x) + return x, depth_softmax + + +@MODELS.register_module() +class LSSTransformV2DepthAware(BaseViewTransformV2): + + def __init__( + self, + in_channels: int, + out_channels: int, + image_size: Tuple[int, int], + feature_size: Tuple[int, int], + xbound: Tuple[float, float, float], + ybound: Tuple[float, float, float], + zbound: Tuple[float, float, float], + dbound: Tuple[float, float, float], + camera_depth_aware_configs: dict, + downsample: int = 1, + ): + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + image_size=image_size, + feature_size=feature_size, + xbound=xbound, + ybound=ybound, + zbound=zbound, + dbound=dbound, + ) + if downsample > 1: + self.downsample = DownSampleNet(downsample, out_channels, out_channels) + else: + self.downsample = nn.Identity() + self.camera_depth_aware_net = CameraDepthAwareNet( + in_channels=in_channels, + hidden_channels=in_channels, + mlp_drop_out=camera_depth_aware_configs["mlp_drop_out"], + depth_channels=self.D, + out_channels=self.C, + ) + + def get_cam_feats( + self, x: torch.Tensor, camera_depth_aware_parameters: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + B, N, C, fH, fW = x.shape + x = x.view(B * N, C, fH, fW) + x = self.camera_depth_aware_net(x, camera_depth_aware_parameters) + return self.get_depth_softmax(x, B=B, N=N, fH=fH, fW=fW) + + def forward(self, *args, **kwargs): + x, depth_softmax = super().forward(*args, **kwargs) + x = self.downsample(x) + return x, depth_softmax diff --git a/projects/BEVFusion/bevfusion/loading.py b/projects/BEVFusion/bevfusion/loading.py index 0478d67a3..be9101b64 100644 --- a/projects/BEVFusion/bevfusion/loading.py +++ b/projects/BEVFusion/bevfusion/loading.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. 
All rights reserved. import copy -import os -from typing import List, Optional +from pathlib import Path +from typing import Dict, List, Optional, Tuple +import matplotlib.pyplot as plt import mmcv import numpy as np +from mmcv.transforms import BaseTransform from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles from mmdet3d.registry import TRANSFORMS from mmengine.fileio import get @@ -35,7 +37,7 @@ class BEVLoadMultiViewImageFromFiles(LoadMultiViewImageFromFiles): def __init__( self, - camera_order: List[str], + camera_orders: Dict[str, List[str]], to_float32: bool = False, color_type: str = "unchanged", backend_args: Optional[dict] = None, @@ -44,7 +46,7 @@ def __init__( test_mode: bool = False, set_default_scale: bool = True, ) -> None: - self.camera_order = camera_order + self.camera_orders = camera_orders self.to_float32 = to_float32 self.color_type = color_type self.backend_args = backend_args @@ -56,6 +58,7 @@ def __init__( self.test_mode = test_mode self.set_default_scale = set_default_scale self.before_camera_info = dict() + self.camera_order_types = list(camera_orders.keys()) def transform(self, results: dict) -> Optional[dict]: """Call function to load multi-view image from files. @@ -75,6 +78,12 @@ def transform(self, results: dict) -> Optional[dict]: - scale_factor (float): Scale factor. - img_norm_cfg (dict): Normalization configuration of images. 
""" + vehicle_type = results.get("vehicle_type", None) + if vehicle_type is None: + camera_order = self.camera_orders[self.camera_order_types[0]] + else: + camera_order = self.camera_orders[vehicle_type] + # TODO: consider split the multi-sweep part out of this pipeline # Derive the mask and transform for loading of multi-sweep data if self.num_ref_frames > 0: @@ -138,7 +147,7 @@ def transform(self, results: dict) -> Optional[dict]: # to fill None data # for _ , cam_item in results['images'].items(): - for camera_type in self.camera_order: + for camera_type in camera_order: if camera_type not in results["images"]: continue @@ -217,3 +226,219 @@ def transform(self, results: dict) -> Optional[dict]: results["num_views"] = self.num_views results["num_ref_frames"] = self.num_ref_frames return results + + +@TRANSFORMS.register_module() +class PointsToMultiViewImageDepths(BaseTransform): + """Convert points to multi-view image depths. + + Args: + points (np.ndarray): Points in the world coordinate system. + img_shape (tuple): Shape of the image. + cam2img (np.ndarray): Camera to image transformation matrix. + lidar2cam (np.ndarray): LiDAR to camera transformation matrix. + visualize_dir (str, optional): If set, saves a per-sample subplot + of `gt_depths` (one panel per camera) to this directory. + Useful for debugging the projection. Defaults to None. + max_depth (float): Upper clip for the depth color scale (m). + Defaults to 80. 
+ """ + + def __init__( + self, + img_shape, + num_cameras: int, + depth_bounds: Tuple[float, float], + visualize_dir: Optional[str] = None, + max_depth: float = 80.0, + ): + self.img_shape = img_shape + self.num_cameras = num_cameras + self.visualize_dir = visualize_dir + self.max_depth = max_depth + self.depth_bounds = depth_bounds + self.visualize_dir = Path(visualize_dir) if visualize_dir is not None else None + if self.visualize_dir is not None: + self.visualize_dir.mkdir(parents=True, exist_ok=True) + self._depth_idx = 0 + + def transform(self, results: dict) -> Optional[dict]: + """Call function to load multi-view image from files. + + Args: + results (dict): Result dict containing multi-view image filenames. + + Returns: + dict: The result dict containing the multi-view image data. + Added keys: + - gt_depths (np.ndarray): Ground truth depths in (N, H, W) for (number of cameras, height, width). + """ + lidar2image = np.asarray(results["lidar2img"]) + img_aug_matrix = np.asarray(results["img_aug_matrix"]) if "img_aug_matrix" in results else np.eye(4) + cur_coords = results["points"].numpy()[:, :3] + + # inverse lidar aug + if "lidar_aug_matrix" in results: + lidar_aug_matrix = np.asarray(results["lidar_aug_matrix"]) + lidar_aug_matrix_inverse = np.linalg.inv(lidar_aug_matrix) + cur_coords -= lidar_aug_matrix[:3, 3] + cur_coords = lidar_aug_matrix_inverse[:3, :3] @ cur_coords.transpose(1, 0) + else: + cur_coords = cur_coords.transpose(1, 0) + + # lidar2image + cur_coords = lidar2image[:, :3, :3] @ cur_coords + cur_coords += lidar2image[:, :3, 3].reshape(-1, 3, 1) + + # get 2d coords + dist = cur_coords[:, 2, :] + valid_dist_mask = (dist >= self.depth_bounds[0]) & (dist < self.depth_bounds[1]) + + cur_coords[:, 2, :] = np.clip(cur_coords[:, 2, :], 1e-5, 1e5) + cur_coords[:, :2, :] /= cur_coords[:, 2:3, :] + + # imgaug + cur_coords = img_aug_matrix[:, :3, :3] @ cur_coords + cur_coords += img_aug_matrix[:, :3, 3].reshape(-1, 3, 1) + cur_coords = cur_coords[:, 
:2, :].transpose(0, 2, 1) + + # normalize coords for grid sample + cur_coords = cur_coords[..., [1, 0]] + on_img = ( + (cur_coords[..., 0] < self.img_shape[0]) + & (cur_coords[..., 0] >= 0) + & (cur_coords[..., 1] < self.img_shape[1]) + & (cur_coords[..., 1] >= 0) + & valid_dist_mask + ) + + # Avoid loops since it's slow + indices = np.nonzero(on_img) + camera_indices = indices[0] + point_indices = indices[1] + masked_coords = cur_coords[camera_indices, point_indices].astype(np.int64) + masked_dist = dist[camera_indices, point_indices] + + # Possibly to have duplicates and the last one will be used, however, the chance is small + flatten_indices = ( + camera_indices * self.img_shape[0] * self.img_shape[1] + + masked_coords[:, 0] * self.img_shape[1] + + masked_coords[:, 1] + ) + depth_flat = np.zeros(self.num_cameras * self.img_shape[0] * self.img_shape[1], dtype=np.float32) + depth_flat[flatten_indices] = masked_dist + depth = depth_flat.reshape(self.num_cameras, self.img_shape[0], self.img_shape[1]) + results["gt_depths"] = depth + + if self.visualize_dir is not None: + self._save_depth_subplot(depth, results) + return results + + def _save_depth_subplot(self, depth: np.ndarray, results: dict) -> None: + """Save `gt_depths` as a subplot with one panel per camera. + + The figure contains three row blocks per camera: + - image underlay (if available) + projected LiDAR depth points + - image pixels only + - depth-only heatmap (no image pixel values) + + Args: + depth (np.ndarray): (num_cameras, H, W) ground-truth depth map. + results (dict): The pipeline result dict; used for the underlay + image and to derive a unique filename. + """ + imgs = results.get("img", None) + + # Layout: + # - Top block: image underlay + projected depth points. + # - Middle block: image pixels only. + # - Bottom block: depth-only heatmap (no image pixel values). 
+ if self.num_cameras <= 6: + base_rows, cols = 1, self.num_cameras + else: + cols = int(np.ceil(np.sqrt(self.num_cameras))) + base_rows = int(np.ceil(self.num_cameras / cols)) + rows = base_rows * 3 + + fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 4 * rows), squeeze=False) + + for c in range(self.num_cameras): + d = depth[c] + ys, xs = np.nonzero(d) + vals = d[ys, xs] + + # Row block 1: image + depth scatter. + ax_overlay = axes[c // cols, c % cols] + if imgs is not None and c < len(imgs): + ax_overlay.imshow(imgs[c].astype(np.uint8)) + if vals.size > 0: + ax_overlay.scatter( + xs, + ys, + c=vals, + cmap="turbo", + vmin=0, + vmax=self.max_depth, + s=1, + ) + else: + ax_overlay.imshow( + d, + cmap="turbo", + vmin=0, + vmax=self.max_depth, + interpolation="nearest", + ) + ax_overlay.set_title(f"cam {c} overlay ({vals.size} pts)") + ax_overlay.set_xticks([]) + ax_overlay.set_yticks([]) + + # Row block 2: image-only visualization. + ax_img = axes[base_rows + (c // cols), c % cols] + if imgs is not None and c < len(imgs): + ax_img.imshow(imgs[c].astype(np.uint8)) + else: + ax_img.imshow( + d, + cmap="gray", + vmin=0, + vmax=self.max_depth, + interpolation="nearest", + ) + ax_img.set_title(f"cam {c} image-only") + ax_img.set_xticks([]) + ax_img.set_yticks([]) + + # Row block 3: depth-only visualization. + ax_depth = axes[(base_rows * 2) + (c // cols), c % cols] + ax_depth.imshow( + d, + cmap="turbo", + vmin=0, + vmax=self.max_depth, + interpolation="nearest", + ) + ax_depth.set_title(f"cam {c} depth-only") + ax_depth.set_xticks([]) + ax_depth.set_yticks([]) + + # Hide any unused subplots when n doesn't fill the grid. + for c in range(self.num_cameras, base_rows * cols): + axes[c // cols, c % cols].axis("off") + axes[base_rows + (c // cols), c % cols].axis("off") + axes[(base_rows * 2) + (c // cols), c % cols].axis("off") + + # Shared depth colorbar with numeric values. 
+ depth_mappable = plt.cm.ScalarMappable(cmap="turbo", norm=plt.Normalize(vmin=0, vmax=self.max_depth)) + depth_mappable.set_array([]) + cbar = fig.colorbar(depth_mappable, ax=axes, location="right", fraction=0.02, pad=0.02) + cbar.set_label("Depth (m)") + + fig.suptitle(f"gt_depths — {self._depth_idx}") + fig.tight_layout(rect=[0, 0, 0.96, 0.97]) + + self._depth_idx += 1 + out_path = self.visualize_dir / f"{self._depth_idx:06d}_gt_depths.png" + fig.savefig(out_path, dpi=120, bbox_inches="tight") + plt.close(fig) + print(f"Saved gt_depths visualization to {out_path}") diff --git a/projects/BEVFusion/bevfusion/ops/__init__.py b/projects/BEVFusion/bevfusion/ops/__init__.py index e08abbc6d..f74f0edbb 100644 --- a/projects/BEVFusion/bevfusion/ops/__init__.py +++ b/projects/BEVFusion/bevfusion/ops/__init__.py @@ -1,4 +1,12 @@ from .bev_pool import bev_pool +from .bev_pool_v2 import bev_pool_v2 from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization -__all__ = ["bev_pool", "Voxelization", "voxelization", "dynamic_scatter", "DynamicScatter"] +__all__ = [ + "bev_pool", + "bev_pool_v2", + "Voxelization", + "voxelization", + "dynamic_scatter", + "DynamicScatter", +] diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py new file mode 100644 index 000000000..ff2fdfff7 --- /dev/null +++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py @@ -0,0 +1,3 @@ +from .bev_pool_v2 import bev_pool_v2 + +__all__ = ["bev_pool_v2"] diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py new file mode 100644 index 000000000..af1ba15de --- /dev/null +++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py @@ -0,0 +1,191 @@ +# Copyright (c) Phigent Robotics. All rights reserved. + +import numpy as np +import torch + +from . 
import bev_pool_v2_ext + + +class QuickCumsumV2TrainingCuda(torch.autograd.Function): + r"""BEVPoolv2 implementation for Lift-Splat-Shoot view transformation. + + Please refer to the `paper <https://arxiv.org/abs/2211.17111>`_ + """ + + @staticmethod + def forward( + ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths + ): + ranks_bev = ranks_bev.int() + depth = depth.contiguous().float() + feat = feat.contiguous().float() + ranks_depth = ranks_depth.contiguous().int() + ranks_feat = ranks_feat.contiguous().int() + interval_lengths = interval_lengths.contiguous().int() + interval_starts = interval_starts.contiguous().int() + + out = feat.new_zeros(bev_feat_shape) + + bev_pool_v2_ext.bev_pool_v2_forward( + depth, + feat, + out, + ranks_depth, + ranks_feat, + ranks_bev, + interval_lengths, + interval_starts, + ) + + ctx.save_for_backward(ranks_bev, depth, feat, ranks_feat, ranks_depth) + return out + + @staticmethod + def backward(ctx, out_grad): + ranks_bev, depth, feat, ranks_feat, ranks_depth = ctx.saved_tensors + + order = ranks_feat.argsort() + ranks_feat, ranks_depth, ranks_bev = ranks_feat[order], ranks_depth[order], ranks_bev[order] + kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) + kept[1:] = ranks_feat[1:] != ranks_feat[:-1] + interval_starts_bp = torch.where(kept)[0].int() + interval_lengths_bp = torch.zeros_like(interval_starts_bp) + interval_lengths_bp[:-1] = interval_starts_bp[1:] - interval_starts_bp[:-1] + interval_lengths_bp[-1] = ranks_bev.shape[0] - interval_starts_bp[-1] + + depth = depth.contiguous() + feat = feat.contiguous() + ranks_depth = ranks_depth.contiguous() + ranks_feat = ranks_feat.contiguous() + ranks_bev = ranks_bev.contiguous() + interval_lengths_bp = interval_lengths_bp.contiguous() + interval_starts_bp = interval_starts_bp.contiguous() + + depth_grad = depth.new_zeros(depth.shape) + feat_grad = feat.new_zeros(feat.shape) + out_grad = out_grad.contiguous() + 
bev_pool_v2_ext.bev_pool_v2_backward( + out_grad, + depth_grad, + feat_grad, + depth, + feat, + ranks_depth, + ranks_feat, + ranks_bev, + interval_lengths_bp, + interval_starts_bp, + ) + return depth_grad, feat_grad, None, None, None, None, None, None, None, None + + +class QuickCumsumV2Cuda(torch.autograd.Function): + + @staticmethod + def symbolic( + g, + depth, + feat, + ranks_depth, + ranks_feat, + ranks_bev, + interval_starts, + interval_lengths, + out_height=128, + out_width=128, + ): + """symbolic function for creating onnx op.""" + x = g.op( + "autoware::QuickCumsumV2Cuda", + depth, + feat, + ranks_depth, + ranks_feat, + ranks_bev, + interval_starts, + interval_lengths, + out_height_i=out_height, + out_width_i=out_width, + ) + + # features_shape = _get_tensor_sizes(feat) + # if features_shape is not None and hasattr(x.type(), "with_sizes"): + # output_type = x.type().with_sizes([B, D, H, W, _get_tensor_dim_size(x, -1)]) + # output.setType(output_type) + + @staticmethod + def forward( + ctx, + depth, # B,N,D,H,W + feat, # B,N,H,W,C + ranks_depth, + ranks_feat, + ranks_bev, + interval_starts, + interval_lengths, + out_height=128, + out_width=128, + ): + """run forward.""" + out = feat.new_zeros(depth.shape[0], 1, out_height, out_width, feat.shape[-1]) + bev_feat = bev_pool_v2_ext.bev_pool_v2_forward( + depth, + feat, + out, + ranks_depth, + ranks_feat, + ranks_bev, + interval_lengths, + interval_starts, + ) + return bev_feat + + @staticmethod + def backward(ctx, out_grad): + raise NotImplementedError + + +def bev_pool_v2( + depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, bev_feat_shape, is_training +): + # Always use full (B, Z, H, W, C) buffer; QuickCumsumV2Cuda (Z=1) is ONNX-only. 
+ del is_training + x = QuickCumsumV2TrainingCuda.apply( + depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths + ) + + # Final shape: (B, C, Z, H, W) — matches LSSTransform v1 after permute + x = x.permute(0, 4, 1, 2, 3).contiguous() + return x + + +def test_bev_pool_v2(): + depth = np.array([0.3, 0.4, 0.2, 0.1, 0.7, 0.6, 0.8, 0.9]) + depth = torch.from_numpy(depth).float().cuda() + depth = depth.view(1, 1, 2, 2, 2).requires_grad_() + feat = torch.ones(size=[1, 1, 2, 2, 2], dtype=torch.float, device="cuda").requires_grad_() + ranks_depth = torch.from_numpy(np.array([0, 4, 1, 6])).int().cuda() + ranks_feat = torch.from_numpy(np.array([0, 0, 1, 2])).int().cuda() + ranks_bev = torch.from_numpy(np.array([0, 0, 1, 1])).int().cuda() + + kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) + kept[1:] = ranks_bev[1:] != ranks_bev[:-1] + interval_starts = torch.where(kept)[0].int() + if len(interval_starts) == 0: + return None, None, None, None, None + interval_lengths = torch.zeros_like(interval_starts) + interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] + interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1] + bev_feat = bev_pool_v2( + depth, feat, ranks_depth, ranks_feat, ranks_bev, (1, 1, 2, 2, 2), interval_starts, interval_lengths + ) + loss = torch.sum(bev_feat) + loss.backward() + assert loss == 4.4 + grad_depth = np.array([2.0, 2.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0]) + grad_depth = torch.from_numpy(grad_depth).float() + grad_depth = grad_depth.cuda().view(1, 1, 2, 2, 2) + assert depth.grad.allclose(grad_depth) + grad_feat = np.array([1.0, 1.0, 0.4, 0.4, 0.8, 0.8, 0.0, 0.0]) + grad_feat = torch.from_numpy(grad_feat).float().cuda().view(1, 1, 2, 2, 2) + assert feat.grad.allclose(grad_feat) diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp new file mode 100644 index 
000000000..c7c38f695 --- /dev/null +++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp @@ -0,0 +1,111 @@ +// Copyright (c) Phigent Robotics. All rights reserved. +// Reference https://arxiv.org/abs/2211.17111 +#include +#include + +// CUDA function declarations +void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat, + const int* ranks_depth, const int* ranks_feat, const int* ranks_bev, + const int* interval_starts, const int* interval_lengths, float* out); + +void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad, + const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat, + const int* ranks_bev, const int* interval_starts, const int* interval_lengths, + float* depth_grad, float* feat_grad); + + +/* + Function: pillar pooling (forward, cuda) + Args: + depth : input depth, FloatTensor[n, d, h, w] + feat : input features, FloatTensor[n, h, w, c] + out : output features, FloatTensor[b, c, h_out, w_out] + ranks_depth : depth index of points, IntTensor[n_points] + ranks_feat : feat index of points, IntTensor[n_points] + ranks_bev : output index of points, IntTensor[n_points] + interval_lengths : how many points in each pooled point, IntTensor[n_intervals] + interval_starts : starting position for pooled point, IntTensor[n_intervals] + Return: +*/ +void bev_pool_v2_forward( + const at::Tensor _depth, + const at::Tensor _feat, + at::Tensor _out, + const at::Tensor _ranks_depth, + const at::Tensor _ranks_feat, + const at::Tensor _ranks_bev, + const at::Tensor _interval_lengths, + const at::Tensor _interval_starts +) { + int c = _feat.size(4); + int n_intervals = _interval_lengths.size(0); + const at::cuda::OptionalCUDAGuard device_guard(device_of(_depth)); + const float* depth = _depth.data_ptr(); + const float* feat = _feat.data_ptr(); + const int* ranks_depth = _ranks_depth.data_ptr(); + const int* ranks_feat = _ranks_feat.data_ptr(); + const int* ranks_bev = _ranks_bev.data_ptr(); + + 
const int* interval_lengths = _interval_lengths.data_ptr(); + const int* interval_starts = _interval_starts.data_ptr(); + + float* out = _out.data_ptr(); + bev_pool_v2( + c, n_intervals, depth, feat, ranks_depth, ranks_feat, + ranks_bev, interval_starts, interval_lengths, out + ); +} + + +/* + Function: pillar pooling (backward, cuda) + Args: + out_grad : grad of output bev feature, FloatTensor[b, d, h, w, c] (c is read from size(4)) + depth_grad : grad of input depth, FloatTensor[n, d, h, w] + feat_grad : grad of input feature, FloatTensor[n, h, w, c] + depth : input depth, FloatTensor[n, d, h, w] + feat : input features, FloatTensor[n, h, w, c] + ranks_depth : depth index of points, IntTensor[n_points] + ranks_feat : feat index of points, IntTensor[n_points] + ranks_bev : output index of points, IntTensor[n_points] + interval_lengths : how many points in each pooled point, IntTensor[n_intervals] + interval_starts : starting position for pooled point, IntTensor[n_intervals] +*/ +void bev_pool_v2_backward( + const at::Tensor _out_grad, + at::Tensor _depth_grad, + at::Tensor _feat_grad, + const at::Tensor _depth, + const at::Tensor _feat, + const at::Tensor _ranks_depth, + const at::Tensor _ranks_feat, + const at::Tensor _ranks_bev, + const at::Tensor _interval_lengths, + const at::Tensor _interval_starts +) { + int c = _out_grad.size(4); + int n_intervals = _interval_lengths.size(0); + const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad)); + const float* out_grad = _out_grad.data_ptr(); + float* depth_grad = _depth_grad.data_ptr(); + float* feat_grad = _feat_grad.data_ptr(); + const float* depth = _depth.data_ptr(); + const float* feat = _feat.data_ptr(); + const int* ranks_depth = _ranks_depth.data_ptr(); + const int* ranks_feat = _ranks_feat.data_ptr(); + const int* ranks_bev = _ranks_bev.data_ptr(); + const int* interval_lengths = _interval_lengths.data_ptr(); + const int* interval_starts = _interval_starts.data_ptr(); + + bev_pool_v2_grad( + c, n_intervals, 
out_grad, depth, feat, ranks_depth, ranks_feat, + ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad + ); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("bev_pool_v2_forward", &bev_pool_v2_forward, + "bev_pool_v2_forward"); + m.def("bev_pool_v2_backward", &bev_pool_v2_backward, + "bev_pool_v2_backward"); +} diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu new file mode 100644 index 000000000..7fa3179b7 --- /dev/null +++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu @@ -0,0 +1,140 @@ +// Copyright (c) Phigent Robotics. All rights reserved. +// Reference https://arxiv.org/abs/2211.17111 + +#include +#include + +/* + Function: pillar pooling + Args: + c : number of channels + n_intervals : number of unique points + depth : input depth, FloatTensor[b,n,d,h,w] + feat : input feat, FloatTensor[b,n,h,w,c] + ranks_depth : input index of depth, IntTensor[n] + ranks_feat : input index of feat, IntTensor[n] + ranks_bev : output index, IntTensor[n] + interval_lengths : how many points in each pooled point, IntTensor[n_intervals] + interval_starts : starting position for pooled point, IntTensor[n_intervals] + out : output features, FloatTensor[b, d, h, w, c] +*/ +__global__ void bev_pool_v2_kernel(int c, int n_intervals, + const float *__restrict__ depth, + const float *__restrict__ feat, + const int *__restrict__ ranks_depth, + const int *__restrict__ ranks_feat, + const int *__restrict__ ranks_bev, + const int *__restrict__ interval_starts, + const int *__restrict__ interval_lengths, + float* __restrict__ out) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int index = idx / c; + int cur_c = idx % c; + if (index >= n_intervals) return; + int interval_start = interval_starts[index]; + int interval_length = interval_lengths[index]; + float psum = 0; + const float* cur_depth; + const float* cur_feat; + for(int i = 0; i < 
interval_length; i++){ + cur_depth = depth + ranks_depth[interval_start+i]; + cur_feat = feat + ranks_feat[interval_start+i] * c + cur_c; + psum += *cur_feat * *cur_depth; + } + + const int* cur_rank = ranks_bev + interval_start; + float* cur_out = out + *cur_rank * c + cur_c; + *cur_out = psum; +} + + +/* + Function: pillar pooling backward + Args: + c : number of channels + n_intervals : number of unique points + out_grad : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c] + depth : input depth, FloatTensor[b,n,d,h,w] + feat : input feat, FloatTensor[b,n,h,w,c] + ranks_depth : input index of depth, IntTensor[n] + ranks_feat : input index of feat, IntTensor[n] + ranks_bev : output index, IntTensor[n] + interval_lengths : how many points in each pooled point, IntTensor[n_intervals] + interval_starts : starting position for pooled point, IntTensor[n_intervals] + depth_grad : gradient of the depth fmap, FloatTensor + feat_grad : gradient of the feature fmap, FloatTensor (NOTE(review): written once per interval at ranks_feat[interval_start]; presumably backward intervals group points by feat index - confirm against caller) +*/ +__global__ void bev_pool_grad_kernel(int c, int n_intervals, + const float *__restrict__ out_grad, + const float *__restrict__ depth, + const float *__restrict__ feat, + const int *__restrict__ ranks_depth, + const int *__restrict__ ranks_feat, + const int *__restrict__ ranks_bev, + const int *__restrict__ interval_starts, + const int *__restrict__ interval_lengths, + float* __restrict__ depth_grad, + float* __restrict__ feat_grad) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n_intervals) return; + int interval_start = interval_starts[idx]; + int interval_length = interval_lengths[idx]; + + const int* cur_rank; + const float* cur_out_grad; + const float* cur_out_grad_start; + + const float* cur_feat; + const float* cur_feat_start; + float* cur_depth_grad; + float grad_sum; + for(int i = 0; i < interval_length; i++){ + cur_rank = ranks_bev + interval_start + i; + cur_out_grad_start = out_grad + * cur_rank * c; + cur_feat_start = feat + ranks_feat[interval_start+i] 
* c; + + grad_sum = 0; + for(int cur_c = 0; cur_c < c; cur_c++){ + cur_out_grad = cur_out_grad_start + cur_c; + cur_feat = cur_feat_start + cur_c; + grad_sum += *cur_out_grad * *cur_feat; + } + + cur_depth_grad = depth_grad + ranks_depth[interval_start+i]; + *cur_depth_grad = grad_sum; + } + + float* cur_feat_grad; + const float* cur_depth; + for(int cur_c = 0; cur_c < c; cur_c++){ + grad_sum = 0; + for(int i = 0; i < interval_length; i++){ + cur_rank = ranks_bev + interval_start + i; + cur_out_grad = out_grad + *cur_rank * c + cur_c; + + cur_depth = depth + ranks_depth[interval_start+i]; + grad_sum += *cur_out_grad * *cur_depth; + } + cur_feat_grad = feat_grad + ranks_feat[interval_start] * c + cur_c ; + * cur_feat_grad = grad_sum; + } +} + + + +void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat, const int* ranks_depth, + const int* ranks_feat, const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* out) { + bev_pool_v2_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>( + c, n_intervals, depth, feat, ranks_depth, ranks_feat, + ranks_bev, interval_starts, interval_lengths, out + ); +} + +void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad, + const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat, + const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* depth_grad, float* feat_grad) { + bev_pool_grad_kernel<<<(int)ceil(((double)n_intervals / 256)), 256>>>( + c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat, + ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad + ); +} diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index 7e9faca24..31d0cc417 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -188,6 +188,19 @@ def transform(self, input_dict: dict) -> dict: return input_dict 
+@TRANSFORMS.register_module() +class BEVFusionRemoveLiDARPoints(BaseTransform): + """Remove LiDAR points from the data.""" + + def __init__(self): + super().__init__() + + def transform(self, results: Dict[str, Any]) -> Dict[str, Any]: + if "points" in results: + results["points"] = None + return results + + @TRANSFORMS.register_module() class GridMask(BaseTransform): diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb32_j6gen2_base_50m.py similarity index 75% rename from projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py rename to projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb32_j6gen2_base_50m.py index e65c52ece..0a416c9fd 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb32_j6gen2_base_50m.py @@ -1,9 +1,8 @@ _base_ = [ "../../../../../autoware_ml/configs/detection3d/default_runtime.py", "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py", - "../default/pipelines/default_camera_lidar_intensity_120m.py", - "../default/models/default_camera_swin_fpn_120m.py", - "../default/schedulers/default_30e_8xb8_adamw_linear_cosine.py", + "../default/pipelines/cameras/default_camera_50m.py", + "../default/schedulers/default_30e_8xb32_adamw_linear_cosine.py", "../default/default_misc.py", ] @@ -13,35 +12,7 @@ # user setting data_root = "data/t4dataset/" -info_directory_path = "info/user_name/" - -experiment_group_name = "bevfusion_camera/j6gen2_base/" + _base_.dataset_type -experiment_name = "bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m" -work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name - -# model parameter -model = dict( - 
type="BEVFusion", - view_transform=dict(image_size=_base_.image_size), - bbox_head=dict( - class_names=_base_.class_names, - in_channels=80, - train_cfg=dict( - point_cloud_range=_base_.point_cloud_range, - grid_size=_base_.grid_size, - voxel_size=_base_.voxel_size, - ), - test_cfg=dict( - grid_size=_base_.grid_size, - voxel_size=_base_.voxel_size[0:2], - pc_range=_base_.point_cloud_range[0:2], - ), - bbox_coder=dict( - pc_range=_base_.point_cloud_range[0:2], - voxel_size=_base_.voxel_size[0:2], - ), - ), -) +info_directory_path = "info/kokseang_2_9_0/" # Dataset parameters train_dataloader = dict( @@ -82,6 +53,7 @@ test_mode=True, box_type_3d="LiDAR", backend_args=_base_.backend_args, + filter_cfg=_base_.filter_cfg, ), ) @@ -102,6 +74,7 @@ test_mode=True, box_type_3d="LiDAR", backend_args=_base_.backend_args, + filter_cfg=_base_.filter_cfg, ), ) @@ -135,3 +108,5 @@ checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"), ) log_processor = dict(window_size=50) + +load_from = "work_dirs/bevfusion_camera_2_8_0/gen2_base/T4Dataset/bevfusion_camera_resnet50_fpn_lss_depthaware_50e_8xb32_gen2_base_50m/best_epoch_46.pth" diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb32_gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb32_gen2_base_50m.py new file mode 100644 index 000000000..f9bc419f9 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb32_gen2_base_50m.py @@ -0,0 +1,110 @@ +_base_ = [ + "../../../../../autoware_ml/configs/detection3d/default_runtime.py", + "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/gen2_base.py", + "../default/pipelines/cameras/default_camera_50m.py", + "../default/schedulers/default_50e_8xb32_adamw_linear_cosine.py", + "../default/default_misc.py", +] + +custom_imports = dict(imports=["projects.BEVFusion.bevfusion"], 
allow_failed_imports=False) +custom_imports["imports"] += _base_.custom_imports["imports"] +custom_imports["imports"] += ["autoware_ml.detection3d.datasets.transforms"] + +# user setting +data_root = "data/t4dataset/" +info_directory_path = "info/kokseang_2_9_0/" + +# Dataset parameters +train_dataloader = dict( + batch_size=_base_.train_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=True), + dataset=dict( + type=_base_.dataset_type, + pipeline=_base_.train_pipeline, + modality=_base_.input_modality, + backend_args=_base_.backend_args, + data_root=data_root, + ann_file=info_directory_path + _base_.info_train_file_name, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + test_mode=False, + data_prefix=_base_.data_prefix, + box_type_3d="LiDAR", + filter_cfg=_base_.filter_cfg, + ), +) + +val_dataloader = dict( + batch_size=_base_.test_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + ann_file=info_directory_path + _base_.info_val_file_name, + pipeline=_base_.test_pipeline, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + modality=_base_.input_modality, + data_prefix=_base_.data_prefix, + test_mode=True, + box_type_3d="LiDAR", + backend_args=_base_.backend_args, + filter_cfg=_base_.filter_cfg, + ), +) + +test_dataloader = dict( + batch_size=_base_.test_batch_size, + num_workers=_base_.num_workers, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + ann_file=info_directory_path + _base_.info_test_file_name, + pipeline=_base_.test_pipeline, + metainfo=_base_.metainfo, + class_names=_base_.class_names, + modality=_base_.input_modality, + data_prefix=_base_.data_prefix, + test_mode=True, + box_type_3d="LiDAR", + 
backend_args=_base_.backend_args, + filter_cfg=_base_.filter_cfg, + ), +) + +val_evaluator = dict( + type="T4Metric", + data_root=data_root, + ann_file=data_root + info_directory_path + _base_.info_val_file_name, + metric="bbox", + backend_args=_base_.backend_args, + class_names=_base_.class_names, + name_mapping=_base_.name_mapping, + eval_class_range=_base_.eval_class_range, + filter_attributes=_base_.filter_attributes, +) + +test_evaluator = dict( + type="T4Metric", + data_root=data_root, + ann_file=data_root + info_directory_path + _base_.info_test_file_name, + metric="bbox", + backend_args=_base_.backend_args, + class_names=_base_.class_names, + name_mapping=_base_.name_mapping, + eval_class_range=_base_.eval_class_range, + filter_attributes=_base_.filter_attributes, + save_csv=True, +) + +default_hooks = dict( + logger=dict(type="LoggerHook", interval=50), + checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"), +) +log_processor = dict(window_size=50) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py new file mode 100644 index 000000000..944da6470 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py @@ -0,0 +1,31 @@ +_base_ = [ + "../default_bevfusion_camera_30e_8xb32_j6gen2_base_50m.py", + "../../default/models/resnet50/camera_resnet50_fpn_lss_depthaware_50m.py", +] + +experiment_group_name = "bevfusion_camera/j6gen2_base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_resnet50_fpn_lss_v2_depthaware_30e_8xb32_j6gen2_base_50m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + 
view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m_t4metric_v2.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m_t4metric_v2.py new file mode 100644 index 000000000..cb0e50306 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m_t4metric_v2.py @@ -0,0 +1,94 @@ +_base_ = [ + "./bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py", +] + +experiment_name = "bevfusion_camera_resnet50_fpn_lss_v2_depthaware_30e_8xb32_j6gen2_base_50m_t4metric_v2" +work_dir = "work_dirs/" + _base_.experiment_group_name + "/" + experiment_name + +# Add evaluator configs +evaluator_metric_configs = dict( + evaluation_task="detection", + target_labels=_base_.class_names, + center_distance_bev_thresholds=[0.5, 1.0, 2.0, 4.0], + # plane_distance_thresholds is required for the pass fail evaluation + plane_distance_thresholds=[2.0, 4.0], + iou_2d_thresholds=None, + iou_3d_thresholds=None, + label_prefix="autoware", + # bev minimum distance ranges for each range bucket, must be the same length as max_distance, + # they will form bev distance ranges in [(min_distance[0], max_distance[0]), (min_distance[1], max_distance[1]), ...] 
when filtering + min_distance=[0.0], + # bev maximum distance ranges for each range bucket, must be the same length as min_distance + max_distance=[51.2], + min_point_numbers=0, + matching_class_agnostic_fps=False, +) + +perception_evaluator_configs = dict( + dataset_paths=_base_.data_root, + frame_id="base_link", + evaluation_config_dict=evaluator_metric_configs, + load_raw_data=False, +) + + +frame_pass_fail_config = dict( + target_labels=_base_.class_names, + # Matching thresholds per class (must align with `plane_distance_thresholds` used in evaluation) + matching_threshold_list=[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0], + confidence_threshold_list=None, +) + +training_statistics_parquet_path = ( + _base_.data_root + _base_.info_directory_path + _base_.info_train_statistics_file_name +) +testing_statistics_parquet_path = _base_.data_root + _base_.info_directory_path + _base_.info_test_statistics_file_name +validation_statistics_parquet_path = ( + _base_.data_root + _base_.info_directory_path + _base_.info_val_statistics_file_name +) + +val_evaluator = dict( + _delete_=True, + type="T4MetricV2", + data_root=_base_.data_root, + ann_file=_base_.data_root + _base_.info_directory_path + _base_.info_val_file_name, + training_statistics_parquet_path=training_statistics_parquet_path, + testing_statistics_parquet_path=testing_statistics_parquet_path, + validation_statistics_parquet_path=validation_statistics_parquet_path, + output_dir="validation", + dataset_name="j6gen2_base", + perception_evaluator_configs=perception_evaluator_configs, + critical_object_filter_config=None, + frame_pass_fail_config=frame_pass_fail_config, + num_workers=64, + scene_batch_size=-1, + write_metric_summary=False, + class_names={{_base_.class_names}}, + name_mapping={{_base_.name_mapping}}, + experiment_name=experiment_name, + experiment_group_name=_base_.experiment_group_name, + min_num_points=2, +) + +test_evaluator = dict( + _delete_=True, + type="T4MetricV2", + data_root=_base_.data_root, + 
ann_file=_base_.data_root + _base_.info_directory_path + _base_.info_test_file_name, + training_statistics_parquet_path=training_statistics_parquet_path, + testing_statistics_parquet_path=testing_statistics_parquet_path, + validation_statistics_parquet_path=validation_statistics_parquet_path, + output_dir="testing", + dataset_name="j6gen2_base", + perception_evaluator_configs=perception_evaluator_configs, + critical_object_filter_config=None, + frame_pass_fail_config=frame_pass_fail_config, + num_workers=64, + scene_batch_size=-1, + write_metric_summary=True, + class_names={{_base_.class_names}}, + name_mapping={{_base_.name_mapping}}, + experiment_name=experiment_name, + experiment_group_name=_base_.experiment_group_name, + min_num_points=2, +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py new file mode 100644 index 000000000..769f738cd --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py @@ -0,0 +1,31 @@ +_base_ = [ + "../default_bevfusion_camera_50e_8xb32_gen2_base_50m.py", + "../../default/models/resnet50/camera_resnet50_fpn_lss_depthaware_50m.py", +] + +experiment_group_name = "bevfusion_camera/gen2_base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_resnet50_fpn_lss_depthaware_50e_8xb32_gen2_base_50m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + 
voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py new file mode 100644 index 000000000..650f7b835 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py @@ -0,0 +1,32 @@ +_base_ = [ + "../default_bevfusion_camera_30e_8xb32_j6gen2_base_50m.py", + "../../default/models/swin_transformer/camera_swin_fpn_lss_depthaware_50m.py", +] + +experiment_group_name = "bevfusion_camera/j6gen2_base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_swin_fpn_lss_30e_8xb32_j6gen2_base_50m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + in_channels=80, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py new file mode 100644 index 000000000..fcbd79355 --- /dev/null +++ 
b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py @@ -0,0 +1,32 @@ +_base_ = [ + "../default_bevfusion_camera_50e_8xb32_gen2_base_50m.py", # 50e gen2_base defaults, matching experiment_name below + "../../default/models/swin_transformer/camera_swin_fpn_lss_depthaware_50m.py", +] + +experiment_group_name = "bevfusion_camera/gen2_base/" + _base_.dataset_type +experiment_name = "bevfusion_camera_swin_fpn_lss_50e_8xb32_gen2_base_50m" +work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name + +# model parameter +model = dict( + type="BEVFusion", + view_transform=dict(image_size=_base_.image_size), + bbox_head=dict( + class_names=_base_.class_names, + in_channels=80, + train_cfg=dict( + point_cloud_range=_base_.point_cloud_range, + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size, + ), + test_cfg=dict( + grid_size=_base_.grid_size, + voxel_size=_base_.voxel_size[0:2], + pc_range=_base_.point_cloud_range[0:2], + ), + bbox_coder=dict( + pc_range=_base_.point_cloud_range[0:2], + voxel_size=_base_.voxel_size[0:2], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py index 4f81af760..a93b1d435 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py @@ -2,8 +2,8 @@ "../../../../../autoware_ml/configs/detection3d/default_runtime.py", "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py", "../default/pipelines/default_camera_lidar_intensity_120m.py", - "../default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py", - 
"../default/schedulers/default_20e_8xb8_adamw_linear_cosine.py", + "../default/models/default_camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py", + "../default/schedulers/default_20e_8xb16_adamw_linear_cosine.py", "../default/default_misc.py", ] diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py index 20c85b1d8..b8408956b 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py @@ -2,8 +2,8 @@ "../../../../../autoware_ml/configs/detection3d/default_runtime.py", "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py", "../default/pipelines/default_camera_lidar_intensity_120m.py", - "../default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py", - "../default/schedulers/default_20e_8xb8_adamw_linear_cosine.py", + "../default/models/default_camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py", + "../default/schedulers/default_20e_8xb16_adamw_linear_cosine.py", "../default/default_misc.py", ] diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py index 3edd06c92..c1b0bdaae 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py @@ -13,7 +13,7 @@ # user setting data_root = "data/t4dataset/" -info_directory_path = "info/user_name/" 
+info_directory_path = "info/kokseang_2_8_1/" experiment_group_name = "bevfusion_lidar_intensity/j6gen2_base/" + _base_.dataset_type experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m" diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py index 05947c2fd..040e18c58 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py @@ -13,7 +13,7 @@ # user setting data_root = "data/t4dataset/" -info_directory_path = "info/user_name/" +info_directory_path = "info/kokseang_2_8_1/" experiment_group_name = "bevfusion_lidar_intensity/jpntaxi_base/" + _base_.dataset_type experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m" @@ -165,4 +165,6 @@ ) log_processor = dict(window_size=50) -load_from = None +load_from = ( + "work_dirs/bevfusion_lidar_2_8_0/base/T4Dataset/lidar_voxel_second_secfpn_50e_8xb16_base_120m/best_epoch_47.pth" +) diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py index 7716a1508..2099bacca 100644 --- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py +++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py @@ -13,7 +13,7 @@ # user setting data_root = "data/t4dataset/" -info_directory_path = "info/user_name/" +info_directory_path = "info/kokseang_2_8_1/" experiment_group_name = "bevfusion_lidar/base/" + _base_.dataset_type experiment_name = 
"lidar_voxel_second_secfpn_50e_8xb16_base_120m" @@ -152,3 +152,5 @@ checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"), ) log_processor = dict(window_size=50) + +resume = True diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py new file mode 100644 index 000000000..e90687fe3 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py @@ -0,0 +1,10 @@ +_base_ = [ + "./default_lidar_second_secfpn_120m.py", +] + +model = dict( + bbox_head=dict( + common_heads=dict(center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2], iou=[1, 2]), + loss_iou=dict(type="mmdet.L1Loss", reduction="mean", loss_weight=1.0), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py similarity index 52% rename from projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py rename to projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py index c4097de3d..c807668a3 100644 --- a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py @@ -1,9 +1,15 @@ _base_ = [ - "./default_lidar_second_secfpn_120m.py", + "../default_lidar_second_secfpn_120m.py", ] # Image network model = dict( + # Remove all lidar related configs + voxelize_cfg=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_neck=None, + pts_backbone=None, data_preprocessor=dict( type="Det3DDataPreprocessor", pad_size_divisor=32, @@ -13,34 +19,26 
@@ rgb_to_bgr=False, ), img_backbone=dict( - type="mmdet.SwinTransformer", - pretrain_img_size=(256, 704), - embed_dims=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.2, - patch_norm=True, - out_indices=[1, 2, 3], - with_cp=False, - convert_weights=True, + type="mmdet.ResNet", + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type="BN2d", requires_grad=True), + norm_eval=False, + with_cp=True, + style="pytorch", init_cfg=dict( type="Pretrained", - # https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/swint-nuimages-pretrained.pth - checkpoint="work_dirs/swin_transformer/swint_nuimages_pretrained.pth", # noqa: E251 + checkpoint="work_dirs/resnet50/mmdet_resnet50-19c8e357.pth", # noqa: E251 ), ), img_neck=dict( type="GeneralizedLSSFPN", - in_channels=[192, 384, 768], + in_channels=[512, 1024, 2048], out_channels=256, start_level=0, - num_outs=3, + num_outs=2, norm_cfg=dict(type="BN2d", requires_grad=True), act_cfg=dict(type="ReLU", inplace=True), upsample_cfg=dict(mode="bilinear", align_corners=False), @@ -56,5 +54,7 @@ dbound=[1.0, 130, 1.0], downsample=2, ), - fusion_layer=dict(type="ConvFuser", in_channels=[80, 256], out_channels=256, kernel_size=5, padding=2), + bbox_head=dict( + in_channels=80, + ), ) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depthaware_50m.py b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depthaware_50m.py new file mode 100644 index 000000000..89c35aca9 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depthaware_50m.py @@ -0,0 +1,25 @@ +_base_ = [ + "./camera_resnet50_fpn_depthlss_120m.py", +] +num_proposals = 200 + +# Image network +model = dict( + depth_gt_downsample=8, + loss_depth_weight=1.0, + 
view_transform=dict( + type="LSSTransformV2DepthAware", + xbound=[-54.0, 54.0, 0.3], + ybound=[-54.0, 54.0, 0.3], + zbound=[-10.0, 10.0, 20.0], + dbound=[1.0, 60, 0.5], + downsample=2, + camera_depth_aware_configs=dict(mlp_drop_out=0.0, downsample=8, num_camera_depth_parameters=27), + ), + bbox_head=dict( + num_proposals=num_proposals, + bbox_coder=dict( + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py similarity index 97% rename from projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py rename to projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py index c4b0cd9ab..88e74efc7 100644 --- a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py @@ -1,5 +1,5 @@ _base_ = [ - "./default_lidar_second_secfpn_120m.py", + "../default_lidar_second_secfpn_120m.py", ] # Image network diff --git a/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_depthaware_50m.py b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_depthaware_50m.py new file mode 100644 index 000000000..09b317343 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_depthaware_50m.py @@ -0,0 +1,21 @@ +_base_ = [ + "./camera_swin_fpn_depthlss_120m.py", +] + +# Image network +model = dict( + view_transform=dict( + type="LSSTransformV2DepthAware", + xbound=[-54.0, 54.0, 0.3], + ybound=[-54.0, 54.0, 0.3], + zbound=[-10.0, 10.0, 20.0], + dbound=[1.0, 60, 0.5], + downsample=2, + camera_depth_aware_configs=dict(mlp_drop_out=0.0, downsample=8, 
num_camera_depth_parameters=27), + ), + bbox_head=dict( + bbox_coder=dict( + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + ), + ), +) diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py new file mode 100644 index 000000000..ec41f5012 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py @@ -0,0 +1,175 @@ +## This config is for the camera_base only model, without lidar points + +_base_ = [ + "../default_lidar_120m.py", +] +input_modality = dict(use_lidar=True, use_camera=True) + +# Image parameters +image_size = [384, 768] # Height, Width +camera_orders = { + "J6_erga_Gen2": ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"], + "J6_x2_Gen2": ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"], + "JPNTaxi_xx1_Gen2": [ + "CAM_FRONT_WIDE", + "CAM_FRONT_LEFT_WIDE", + "CAM_BACK_LEFT_WIDE", + "CAM_FRONT_RIGHT_WIDE", + "CAM_BACK_RIGHT_WIDE", + ], + "JPNTaxi_solio_Gen2": [ + "CAM_FRONT_WIDE", + "CAM_FRONT_LEFT_WIDE", + "CAM_BACK_LEFT_WIDE", + "CAM_FRONT_RIGHT_WIDE", + "CAM_BACK_RIGHT_WIDE", + ], +} + +train_pipeline = [ + dict( + type="BEVLoadMultiViewImageFromFiles", + to_float32=True, + color_type="color", + backend_args=_base_.backend_args, + camera_orders=camera_orders, + ), + # We keep loading LiDAR points to make downstream BEV augmentation easier + dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=_base_.point_load_dim, + use_dim=_base_.point_load_dim, + backend_args=_base_.backend_args, + ), + dict( + type="PointsToMultiViewImageDepths", + img_shape=image_size, + num_cameras=5, + depth_bounds=[1.0, 120.0], + # visualize_dir="work_dirs/visualize_depths_6", + ), + dict( + type="LoadPointsFromMultiSweeps", + sweeps_num=_base_.sweeps_num, + load_dim=_base_.point_load_dim, + 
use_dim=_base_.lidar_sweep_dims, + pad_empty_sweeps=True, + remove_close=True, + backend_args=_base_.backend_args, + test_mode=False, + ), + dict(type="PointsRangeFilter", point_cloud_range=_base_.point_cloud_range), + dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict( + type="ImageAug3D", + final_dim=image_size, + resize_lim=[0.28, 0.40], + bot_pct_lim=[0.0, 0.0], + rot_lim=[0.0, 0.0], + rand_flip=True, + is_train=True, + ), + dict( + type="BEVFusionGlobalRotScaleTrans", + scale_ratio_range=[0.95, 1.05], + rot_range=[-0.78539816, 0.78539816], + translation_std=[0.5, 0.5, 0.2], + ), + dict(type="BEVFusionRandomFlip3D"), + dict(type="ObjectRangeFilter", point_cloud_range=_base_.point_cloud_range), + dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=3), + dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=2), + dict( + type="ObjectNameFilter", + classes=[ + "car", + "truck", + "bus", + "bicycle", + "pedestrian", + "traffic_cone", + "barrier", + ], + ), + dict( + type="Pack3DDetInputs", + keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "transformation_3d_flow", + "pcd_rotation", + "pcd_scale_factor", + "pcd_trans", + "img_aug_matrix", + "lidar_aug_matrix", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + "gt_depths", + ], + ), +] + +test_pipeline = [ + dict( + type="BEVLoadMultiViewImageFromFiles", + to_float32=True, + color_type="color", + backend_args=_base_.backend_args, + camera_orders=camera_orders, + ), + dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=_base_.point_load_dim, + use_dim=_base_.point_load_dim, + backend_args=_base_.backend_args, + ), + dict( + type="ImageAug3D", + 
final_dim=image_size, + resize_lim=[0.34, 0.34], + bot_pct_lim=[0.0, 0.0], + rot_lim=[0.0, 0.0], + rand_flip=False, + is_train=False, + ), + dict(type="ObjectRangeFilter", point_cloud_range=_base_.point_cloud_range), + dict( + type="Pack3DDetInputs", + keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "num_pts_feats", + "num_views", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + ], + ), +] + +filter_cfg = dict(filter_frames_with_camera_orders=camera_orders) diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py new file mode 100644 index 000000000..492ad7866 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py @@ -0,0 +1,168 @@ +## This config is for the camera_base only model, without lidar points + +_base_ = [ + "../default_lidar_50m.py", +] +input_modality = dict(use_lidar=True, use_camera=True) + +# Image parameters +image_size = [384, 768] # Height, Width +camera_orders = { + "J6_erga_Gen2": ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"], + "J6_x2_Gen2": ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"], + "JPNTaxi_xx1_Gen2": [ + "CAM_FRONT_WIDE", + "CAM_FRONT_LEFT_WIDE", + "CAM_BACK_LEFT_WIDE", + "CAM_FRONT_RIGHT_WIDE", + "CAM_BACK_RIGHT_WIDE", + ], + "JPNTaxi_solio_Gen2": [ + "CAM_FRONT_WIDE", + "CAM_FRONT_LEFT_WIDE", + "CAM_BACK_LEFT_WIDE", + "CAM_FRONT_RIGHT_WIDE", + "CAM_BACK_RIGHT_WIDE", + ], +} + +train_pipeline = [ + dict( + type="BEVLoadMultiViewImageFromFiles", + to_float32=True, + color_type="color", + backend_args=_base_.backend_args, + camera_orders=camera_orders, + ), 
+ # We keep loading LiDAR points to make downstream BEV augmentation easier + dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=_base_.point_load_dim, + use_dim=_base_.point_load_dim, + backend_args=_base_.backend_args, + ), + dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict( + type="ImageAug3D", + final_dim=image_size, + resize_lim=[0.29, 0.35], + bot_pct_lim=[0.0, 0.0], + rot_lim=[0.0, 0.0], + rand_flip=True, + is_train=True, + ), + dict(type="PointsRangeFilter", point_cloud_range=[-80.0, -80.0, -10.0, 80.0, 80.0, 10.0]), + dict( + type="PointsToMultiViewImageDepths", + img_shape=image_size, + num_cameras=5, + depth_bounds=[1.0, 60.0], + ), + dict( + type="BEVFusionGlobalRotScaleTrans", + scale_ratio_range=[0.95, 1.05], + rot_range=[-0.78539816, 0.78539816], + translation_std=[0.5, 0.5, 0.2], + ), + dict(type="BEVFusionRandomFlip3D"), + dict(type="ObjectRangeFilter", point_cloud_range=_base_.point_cloud_range), + dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=3), + # Remove LiDAR points from the data + dict(type="BEVFusionRemoveLiDARPoints"), + dict( + type="ObjectNameFilter", + classes=[ + "car", + "truck", + "construction_vehicle", + "bus", + "trailer", + "barrier", + "motorcycle", + "bicycle", + "pedestrian", + "traffic_cone", + ], + ), + dict( + type="Pack3DDetInputs", + keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "transformation_3d_flow", + "pcd_rotation", + "pcd_scale_factor", + "pcd_trans", + "img_aug_matrix", + "lidar_aug_matrix", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + "gt_depths", + ], + ), +] + +test_pipeline = [ + dict( + type="BEVLoadMultiViewImageFromFiles", + to_float32=True, + 
color_type="color", + backend_args=_base_.backend_args, + camera_orders=camera_orders, + ), + dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=_base_.point_load_dim, + use_dim=_base_.point_load_dim, + backend_args=_base_.backend_args, + ), + dict( + type="ImageAug3D", + final_dim=image_size, + resize_lim=[0.32, 0.32], + bot_pct_lim=[0.0, 0.0], + rot_lim=[0.0, 0.0], + rand_flip=False, + is_train=False, + ), + dict(type="PointsRangeFilter", point_cloud_range=[-80.0, -80.0, -10.0, 80.0, 80.0, 10.0]), + dict( + type="Pack3DDetInputs", + keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "num_pts_feats", + "num_views", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + ], + ), +] + +filter_cfg = dict(filter_frames_with_camera_orders=camera_orders) diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py index 4d9a5aa12..b468b0f9c 100644 --- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py @@ -58,7 +58,7 @@ final_dim=image_size, resize_lim=[0.29, 0.35], bot_pct_lim=[0.0, 0.0], - rot_lim=[-5.4, 5.4], + rot_lim=[0.0, 0.0], rand_flip=True, is_train=True, ), @@ -71,6 +71,7 @@ dict(type="BEVFusionRandomFlip3D"), dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range), dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range), + dict(type="BEVFusionRemoveLiDARPoints"), dict( type="ObjectNameFilter", classes=[ @@ -138,7 +139,7 @@ pad_empty_sweeps=True, remove_close=True, backend_args=backend_args, - test_mode=True, + 
test_mode=False, ), dict( type="ImageAug3D", diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py index b37108873..64cf8b076 100644 --- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py @@ -148,4 +148,4 @@ # e.g., dict(filter_frames_with_missing_image=True). # - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so # image-based filtering does not apply and `filter_cfg` is intentionally None. -filter_cfg = None +filter_cfg = dict() diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py new file mode 100644 index 000000000..90c5a1dea --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py @@ -0,0 +1,138 @@ +# Dataset parameters +backend_args = None +num_workers = 4 +input_modality = dict(use_lidar=True, use_camera=False) + +# range setting +point_cloud_range = [-54.0, -54.0, -3.0, 54.0, 54.0, 5.0] +voxel_size = [0.075, 0.075, 0.2] +grid_size = [1440, 1440, 41] +eval_class_range = { + "car": 51.2, + "truck": 51.2, + "bus": 51.2, + "bicycle": 51.2, + "pedestrian": 51.2, + "traffic_cone": 51.2, + "barrier": 51.2, +} + +# LiDAR parameters +point_load_dim = 5 # x, y, z, intensity, ring_id +point_use_dim = 4 +lidar_sweep_dims = [0, 1, 2, 4] # x, y, z, time_lag +sweeps_num = 1 + +train_pipeline = [ + dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=point_load_dim, + use_dim=point_load_dim, + backend_args=backend_args, + ), + dict( + type="LoadPointsFromMultiSweeps", + sweeps_num=sweeps_num, + load_dim=point_load_dim, + use_dim=lidar_sweep_dims, + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args, + test_mode=False, + ), + 
dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict( + type="BEVFusionGlobalRotScaleTrans", + scale_ratio_range=[0.95, 1.05], + rot_range=[-0.78539816, 0.78539816], + translation_std=[0.5, 0.5, 0.2], + ), + dict(type="BEVFusionRandomFlip3D"), + dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range), + dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range), + dict( + type="ObjectNameFilter", + classes=[ + "car", + "truck", + "bus", + "bicycle", + "pedestrian", + "traffic_cone", + "barrier", + ], + ), + dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=3), + dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=2), + dict(type="PointShuffle"), + dict( + type="Pack3DDetInputs", + keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "transformation_3d_flow", + "pcd_rotation", + "pcd_scale_factor", + "pcd_trans", + "img_aug_matrix", + "lidar_aug_matrix", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + ], + ), +] + +test_pipeline = [ + dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=point_load_dim, + use_dim=point_load_dim, + backend_args=backend_args, + ), + dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range), + dict( + type="Pack3DDetInputs", + keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"], + meta_keys=[ + "cam2img", + "ori_cam2img", + "lidar2cam", + "lidar2img", + "cam2lidar", + "ori_lidar2img", + "img_aug_matrix", + "box_type_3d", + "sample_idx", + "lidar_path", + "img_path", + "num_pts_feats", + "num_views", + "timestamp", + "vehicle_type", + "city", + "traffic_cone_barrier_status", + ], + ), +] + +# Filtering configuration +# Note: +# - In 
camera–LiDAR configs, `filter_cfg` can enable image-based frame filtering, +# e.g., dict(filter_frames_with_missing_image=True). +# - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so +# image-based filtering does not apply and `filter_cfg` is intentionally left empty. +filter_cfg = dict() diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py index 7d6b8e506..b03a22d36 100644 --- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py +++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py @@ -148,4 +148,4 @@ # e.g., dict(filter_frames_with_missing_image=True). # - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so # image-based filtering does not apply and `filter_cfg` is intentionally None. -filter_cfg = None +filter_cfg = dict() diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py similarity index 98% rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py index 64bc2b717..05740e442 100644 --- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py +++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py @@ -1,6 +1,6 @@ # learning rate -lr = 1e-4 -t_max = 6 +lr = 2e-4 +t_max = 2 max_epochs = 20 val_interval = 1 diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb32_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb32_adamw_linear_cosine.py new file 
mode 100644 index 000000000..34d5b95f6 --- /dev/null +++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb32_adamw_linear_cosine.py @@ -0,0 +1,69 @@ +# learning rate +lr = 2e-4 +t_max = 3 +max_epochs = 30 +val_interval = 1 + +train_gpu_size = 8 +test_batch_size = 2 +train_batch_size = 32 + +param_scheduler = [ + # learning rate scheduler + dict( + type="LinearLR", + start_factor=1.0 / 3, + begin=0, + end=t_max, + by_epoch=True, + convert_to_iter_based=True, + ), + dict( + type="CosineAnnealingLR", + T_max=(max_epochs - t_max), + eta_min=lr * 1e-4, + begin=t_max, + end=max_epochs, + by_epoch=True, + convert_to_iter_based=True, + ), + # momentum scheduler + # During the first t_max epochs, momentum is annealed toward 0.85 / 0.95 + # during the remaining epochs, momentum is annealed from 0.85 / 0.95 toward 1 + dict( + type="CosineAnnealingMomentum", + T_max=t_max, + eta_min=0.85 / 0.95, + begin=0, + end=t_max, + by_epoch=True, + convert_to_iter_based=True, + ), + dict( + type="CosineAnnealingMomentum", + T_max=(max_epochs - t_max), + eta_min=1, + begin=t_max, + end=max_epochs, + by_epoch=True, + convert_to_iter_based=True, + ), +] + +train_cfg = dict( + by_epoch=True, max_epochs=max_epochs, val_interval=val_interval, dynamic_intervals=[(max_epochs - 5, 1)] +) +val_cfg = dict() +test_cfg = dict() + +optim_wrapper = dict( + type="OptimWrapper", + optimizer=dict(type="AdamW", lr=lr, weight_decay=1e-2), + clip_grad=dict(max_norm=5.0, norm_type=2), +) + +auto_scale_lr = dict(enable=False, base_batch_size=train_gpu_size * train_batch_size) + +# Only set when train_gpu_size is greater than 1 +if train_gpu_size > 1: + sync_bn = "torch" diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb32_adamw_linear_cosine.py similarity index 93% rename from 
projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb32_adamw_linear_cosine.py index 23d29acc1..6763b8779 100644 --- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py +++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb32_adamw_linear_cosine.py @@ -1,12 +1,12 @@ # learning rate -lr = 1e-4 -t_max = 8 -max_epochs = 30 +lr = 2e-4 +t_max = 5 +max_epochs = 50 val_interval = 1 train_gpu_size = 8 test_batch_size = 2 -train_batch_size = 8 +train_batch_size = 32 param_scheduler = [ # learning rate scheduler @@ -52,7 +52,7 @@ optim_wrapper = dict( type="OptimWrapper", optimizer=dict(type="AdamW", lr=lr, weight_decay=0.01), - clip_grad=dict(max_norm=0.1, norm_type=2), + clip_grad=dict(max_norm=1.0, norm_type=2), ) auto_scale_lr = dict(enable=False, base_batch_size=train_gpu_size * train_batch_size) diff --git a/projects/BEVFusion/setup.py b/projects/BEVFusion/setup.py index 38f588b20..52d397c12 100644 --- a/projects/BEVFusion/setup.py +++ b/projects/BEVFusion/setup.py @@ -54,6 +54,14 @@ def make_cuda_ext(name, module, sources, sources_cuda=[], extra_args=[], extra_i "src/bev_pool_cuda.cu", ], ), + make_cuda_ext( + name="bev_pool_v2_ext", + module="projects.BEVFusion.bevfusion.ops.bev_pool_v2", + sources=[ + "src/bev_pool.cpp", + "src/bev_pool_cuda.cu", + ], + ), make_cuda_ext( name="voxel_layer", module="projects.BEVFusion.bevfusion.ops.voxel",