diff --git a/autoware_ml/configs/detection3d/dataset/t4dataset/gen2_base.py b/autoware_ml/configs/detection3d/dataset/t4dataset/gen2_base.py
new file mode 100644
index 000000000..4bdf3b06d
--- /dev/null
+++ b/autoware_ml/configs/detection3d/dataset/t4dataset/gen2_base.py
@@ -0,0 +1,221 @@
+custom_imports = dict(
+    imports=[
+        "autoware_ml.detection3d.datasets.t4dataset",
+        "autoware_ml.detection3d.evaluation.t4metric.t4metric",
+        "autoware_ml.detection3d.evaluation.t4metric.t4metric_v2",
+    ]
+)
+
+# dataset type setting
+dataset_type = "T4Dataset"
+info_train_file_name = "t4dataset_gen2_base_infos_train.pkl"
+info_val_file_name = "t4dataset_gen2_base_infos_val.pkl"
+info_test_file_name = "t4dataset_gen2_base_infos_test.pkl"
+
+info_train_statistics_file_name = "t4dataset_gen2_base_statistics_train.parquet"
+info_val_statistics_file_name = "t4dataset_gen2_base_statistics_val.parquet"
+info_test_statistics_file_name = "t4dataset_gen2_base_statistics_test.parquet"
+
+# dataset scene setting
+dataset_version_list = [
+    "db_jpntaxigen2_v1",
+    "db_jpntaxigen2_v2",
+    "db_j6gen2_v1",
+    "db_j6gen2_v2",
+    "db_j6gen2_v3",
+    "db_j6gen2_v4",
+    "db_j6gen2_v5",
+    "db_j6gen2_v6",
+    "db_j6gen2_v7",
+    "db_j6gen2_v8",
+    "db_j6gen2_v9",
+    "db_j6gen2_v10",
+    "db_j6gen2_v11",
+    "db_j6gen2_v12",
+    "db_largebus_v1",
+    "db_largebus_v2",
+    "db_largebus_v3",
+]
+
+# TODO (KokSeang): This will be removed to avoid repetitive computation
+# Dataset group name -> (test info file, flag to enable/disable evaluation of prefix)
+dataset_test_groups = {
+    "j6gen2_base": ("t4dataset_j6gen2_base_infos_test.pkl", False),
+    "j6gen2": ("t4dataset_j6gen2_infos_test.pkl", False),
+    "largebus": ("t4dataset_largebus_infos_test.pkl", False),
+    "jpntaxi_gen2": ("t4dataset_jpntaxi_gen2_infos_test.pkl", False),
+    "gen2_base": ("t4dataset_gen2_base_infos_test.pkl", True),
+}
+
+# dataset format setting
+data_prefix = dict(
+    pts="",
+    CAM_FRONT="",
+    CAM_FRONT_WIDE="",
+    CAM_FRONT_LEFT="",
+    CAM_FRONT_LEFT_WIDE="",
+    CAM_FRONT_RIGHT="",
+    CAM_FRONT_RIGHT_WIDE="",
+    CAM_BACK="",
+    CAM_BACK_RIGHT="",
+    CAM_BACK_RIGHT_WIDE="",
+    CAM_BACK_LEFT="",
+    CAM_BACK_LEFT_WIDE="",
+    sweeps="",
+)
+camera_types = {
+    "CAM_FRONT",
+    "CAM_FRONT_WIDE",
+    "CAM_FRONT_RIGHT",
+    "CAM_FRONT_RIGHT_WIDE",
+    "CAM_FRONT_LEFT",
+    "CAM_FRONT_LEFT_WIDE",
+    "CAM_BACK",
+    "CAM_BACK_LEFT",
+    "CAM_BACK_LEFT_WIDE",
+    "CAM_BACK_RIGHT",
+    "CAM_BACK_RIGHT_WIDE",
+}
+
+# class setting
+name_mapping = {
+    # DBv1.0
+    "vehicle.car": "car",
+    "vehicle.construction": "truck",
+    "vehicle.emergency (ambulance & police)": "car",
+    "vehicle.motorcycle": "bicycle",
+    "vehicle.trailer": "trailer",
+    "vehicle.truck": "truck",
+    "vehicle.bicycle": "bicycle",
+    "vehicle.bus (bendy & rigid)": "bus",
+    "pedestrian.adult": "pedestrian",
+    "pedestrian.child": "pedestrian",
+    "pedestrian.construction_worker": "pedestrian",
+    "pedestrian.personal_mobility": "pedestrian",
+    "pedestrian.police_officer": "pedestrian",
+    "pedestrian.stroller": "pedestrian",
+    "pedestrian.wheelchair": "pedestrian",
+    "movable_object.barrier": "barrier",
+    "movable_object.debris": "barrier",
+    "movable_object.pushable_pullable": "barrier",
+    "movable_object.trafficcone": "traffic_cone",
+    "movable_object.traffic_cone": "traffic_cone",
+    "animal": "animal",
+    "static_object.bicycle_rack": "bicycle_rack",
+    # DBv1.1 and UCv2.0
+    "car": "car",
+    "truck": "truck",
+    "bus": "bus",
+    "trailer": "trailer",
+    "motorcycle": "bicycle",
+    "bicycle": "bicycle",
+    "police_car": "car",
+    "pedestrian": "pedestrian",
+    "police_officer": "pedestrian",
+    "forklift": "car",
+    "construction_worker": "pedestrian",
+    "stroller": "pedestrian",
+    # DBv2.0 and DBv3.0 (several keys below repeat DBv1.0 entries with identical values; the later entry wins)
+    "animal": "animal",
+    "movable_object.barrier": "barrier",
+    "movable_object.pushable_pullable": "barrier",
+    "movable_object.traffic_cone": "traffic_cone",
+    "pedestrian.adult": "pedestrian",
+    "pedestrian.child": "pedestrian",
+    "pedestrian.construction_worker": "pedestrian",
+    "pedestrian.personal_mobility": "pedestrian",
+    "pedestrian.police_officer": "pedestrian",
+    "pedestrian.stroller": "pedestrian",
+    "pedestrian.wheelchair": "pedestrian",
+    "static_object.bicycle rack": "bicycle rack",
+    "static_object.bollard": "bollard",
+    "vehicle.ambulance": "car",  # Define vehicle.ambulance as car since vehicle.emergency (ambulance & police) is defined as car
+    "vehicle.bicycle": "bicycle",
+    "vehicle.bus": "bus",
+    "vehicle.car": "car",
+    "vehicle.construction": "truck",
+    "vehicle.fire": "truck",
+    "vehicle.motorcycle": "bicycle",
+    "vehicle.police": "car",
+    "vehicle.trailer": "trailer",
+    "vehicle.truck": "truck",
+    # DBv1.3
+    "ambulance": "car",
+    "kart": "car",
+    "wheelchair": "pedestrian",
+    "personal_mobility": "pedestrian",
+    "fire_truck": "truck",
+    "semi_trailer": "trailer",
+    "tractor_unit": "truck",
+    "construction_vehicle": "truck",
+    "traffic_cone": "traffic_cone",
+    "trafficcone": "traffic_cone",
+    "barrier": "barrier",
+    "other_vehicle": "car",
+    "other_pedestrian": "pedestrian",
+}
+
+class_names = ["car", "truck", "bus", "bicycle", "pedestrian", "traffic_cone", "barrier"]
+num_class = len(class_names)
+metainfo = dict(classes=class_names)
+
+merge_objects = [
+    ("truck", ["truck", "trailer"]),
+]
+merge_type = "extend_longer"  # One of ["extend_longer","union", None]
+
+# visualization
+class_colors = {
+    "car": (30, 144, 255),
+    "truck": (140, 0, 255),
+    "construction_vehicle": (255, 255, 0),
+    "bus": (111, 255, 111),
+    "trailer": (0, 255, 255),
+    "barrier": (0, 0, 0),
+    "motorcycle": (100, 0, 30),
+    "bicycle": (255, 0, 30),
+    "pedestrian": (255, 200, 200),
+    "traffic_cone": (120, 120, 120),
+}
+camera_panels = [
+    "data/CAM_FRONT_LEFT",
+    "data/CAM_FRONT",
+    "data/CAM_FRONT_RIGHT",
+    "data/CAM_BACK_LEFT",
+    "data/CAM_BACK",
+    "data/CAM_BACK_RIGHT",
+]
+
+# Add filter attributes
+filter_attributes = [
+    ("vehicle.bicycle", "vehicle_state.parked"),
+    ("vehicle.bicycle", "cycle_state.without_rider"),
+    ("vehicle.bicycle", "motorcycle_state.without_rider"),
+    ("vehicle.motorcycle", "vehicle_state.parked"),
+    ("vehicle.motorcycle", "cycle_state.without_rider"),
+    ("vehicle.motorcycle", "motorcycle_state.without_rider"),
+    ("bicycle", "vehicle_state.parked"),
+    ("bicycle", "cycle_state.without_rider"),
+    ("bicycle", "motorcycle_state.without_rider"),
+    ("motorcycle", "vehicle_state.parked"),
+    ("motorcycle", "cycle_state.without_rider"),
+    ("motorcycle", "motorcycle_state.without_rider"),
+]
+
+evaluator_metric_configs = dict(
+    evaluation_task="detection",
+    target_labels=class_names,
+    center_distance_bev_thresholds=[0.5, 1.0, 2.0, 4.0],
+    # plane_distance_thresholds is required for the pass/fail evaluation
+    plane_distance_thresholds=[2.0, 4.0],
+    iou_2d_thresholds=None,
+    iou_3d_thresholds=None,
+    label_prefix="autoware",
+    # BEV minimum distance of each range bucket; must have the same length as max_distance.
+    # Bucket i is filtered with the BEV distance range (min_distance[i], max_distance[i]).
+    min_distance=[0.0, 50.0, 90.0, 0.0],
+    # BEV maximum distance of each range bucket; must have the same length as min_distance
+    max_distance=[50.0, 90.0, 121.0, 121.0],
+    min_point_numbers=0,
+    matching_class_agnostic_fps=False,
+)
diff --git a/autoware_ml/configs/detection3d/default_runtime.py b/autoware_ml/configs/detection3d/default_runtime.py
index cc2b896f7..6da761425 100644
--- a/autoware_ml/configs/detection3d/default_runtime.py
+++ b/autoware_ml/configs/detection3d/default_runtime.py
@@ -2,9 +2,17 @@
 
 default_hooks = dict(
     timer=dict(type="IterTimerHook"),
-    logger=dict(type="LoggerHook", interval=50),
+    logger=dict(
+        type="LoggerHook",
+        interval=50,
+        backend_args=dict(backend="local"),
+    ),
     param_scheduler=dict(type="ParamSchedulerHook"),
-    checkpoint=dict(type="CheckpointHook", interval=-1),
+    checkpoint=dict(
+        type="CheckpointHook",
+        interval=-1,
+        backend_args=dict(backend="local"),
+    ),
     sampler_seed=dict(type="DistSamplerSeedHook"),
     visualization=dict(type="Det3DVisualizationHook"),
 )
diff --git a/autoware_ml/detection3d/datasets/t4dataset.py b/autoware_ml/detection3d/datasets/t4dataset.py
index ce1c78f31..06e063233 100644
--- a/autoware_ml/detection3d/datasets/t4dataset.py
+++ b/autoware_ml/detection3d/datasets/t4dataset.py
@@ -2,6 +2,7 @@
 from typing import List
 
 import numpy as np
+import tqdm
 from mmdet3d.datasets import NuScenesDataset
 from mmengine.logging import print_log
 from mmengine.registry import DATASETS
@@ -51,21 +52,27 @@ def filter_data(self) -> List[dict]:
         if not self.filter_cfg:
             return self.data_list
 
-        filter_frames_with_camera_order = self.filter_cfg.get("filter_frames_with_camera_order", None)
-        if filter_frames_with_camera_order is None:
+        filter_frames_with_camera_orders = self.filter_cfg.get("filter_frames_with_camera_orders", None)
+        if filter_frames_with_camera_orders is None:
             return self.data_list
 
         filtered_data_list = []
-        for entry in self.data_list:
+        for entry in tqdm.tqdm(self.data_list, desc="Filtering data"):
+            vehicle_type = entry.get("vehicle_type", None)
+            if vehicle_type is None:
+                raise KeyError(f"Missing 'vehicle_type' in entry: {entry}")
+
+            filter_frames_with_camera_order = filter_frames_with_camera_orders.get(vehicle_type, None)
+            if filter_frames_with_camera_order is None:
+                raise KeyError(f"Missing camera order for vehicle type '{vehicle_type}' in filter configuration.")
+
             filtered = False
             for camera_order in filter_frames_with_camera_order:
                 if camera_order not in entry["images"]:
                     filtered = True
                     break
 
-                if entry["images"][camera_order]["img_path"] is None or not osp.exists(
-                    entry["images"][camera_order]["img_path"]
-                ):
+                if entry["images"][camera_order]["img_path"] is None:
                     filtered = True
                     break
 
@@ -180,6 +187,7 @@ def parse_data_info(self, info: dict) -> dict:
                             cam_prefix,
                             img_info["img_path"],
                         )
+                    # print_log(f"Camera path: {img_info['img_path']}", logger="current")
 
             if self.default_cam_key is not None:
                 info["img_path"] = info["images"][self.default_cam_key]["img_path"]
@@ -192,4 +200,7 @@ def parse_data_info(self, info: dict) -> dict:
                 else:
                     info["lidar2img"] = info["cam2img"] @ info["lidar2cam"]
 
+        # Default difficulty to 0 if not present
+        if "difficulty" not in info:
+            info["difficulty"] = 0
         return info
diff --git a/autoware_ml/detection3d/evaluation/t4metric/t4metric.py b/autoware_ml/detection3d/evaluation/t4metric/t4metric.py
index 2df0ac490..e3f68c5e3 100644
--- a/autoware_ml/detection3d/evaluation/t4metric/t4metric.py
+++ b/autoware_ml/detection3d/evaluation/t4metric/t4metric.py
@@ -262,8 +262,7 @@ def _parse_ground_truth_from_sample(self, data_sample: Dict[str, Any]) -> dict:
             "num_lidar_pts": num_lidar_pts,
         }
 
-    @staticmethod
-    def _get_scene_info(data_infos: List[dict]) -> Tuple[List[str], List[str]]:
+    def _get_scene_info(self, data_infos: List[dict]) -> Tuple[List[str], List[str]]:
         """Get scene tokens and directory names from data infos.
 
         Args:
@@ -284,6 +283,7 @@ def _get_scene_info(data_infos: List[dict]) -> Tuple[List[str], List[str]]:
             if directory not in directories:
                 scene_tokens.append(scene_token)
                 directories.append(directory)
+
         return scene_tokens, directories
 
     @staticmethod
diff --git a/projects/BEVFusion/bevfusion/__init__.py b/projects/BEVFusion/bevfusion/__init__.py
index e84525651..09bb61b9a 100644
--- a/projects/BEVFusion/bevfusion/__init__.py
+++ b/projects/BEVFusion/bevfusion/__init__.py
@@ -3,10 +3,17 @@
 from .bevfusion_necks import GeneralizedLSSFPN
 from .bevfusion_voxel_encoder import HardSimpleVoxelSinCosEncoder
 from .depth_lss import DepthLSSTransform, LSSTransform
-from .loading import BEVLoadMultiViewImageFromFiles
+from .depth_lss_v2 import LSSTransformV2, LSSTransformV2DepthAware
+from .loading import BEVLoadMultiViewImageFromFiles, PointsToMultiViewImageDepths
 from .sparse_encoder import BEVFusionSparseEncoder
 from .transformer import TransformerDecoderLayer
-from .transforms_3d import BEVFusionGlobalRotScaleTrans, BEVFusionRandomFlip3D, GridMask, ImageAug3D
+from .transforms_3d import (
+    BEVFusionGlobalRotScaleTrans,
+    BEVFusionRandomFlip3D,
+    BEVFusionRemoveLiDARPoints,
+    GridMask,
+    ImageAug3D,
+)
 from .utils import BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D, IoU3DCost, TransFusionBBoxCoder
 
 __all__ = [
@@ -27,6 +34,10 @@
     "TransformerDecoderLayer",
     "BEVFusionRandomFlip3D",
     "BEVFusionGlobalRotScaleTrans",
+    "BEVFusionRemoveLiDARPoints",
     "TransFusionBBoxCoder",
     "HardSimpleVoxelSinCosEncoder",
+    "LSSTransformV2",
+    "PointsToMultiViewImageDepths",
+    "LSSTransformV2DepthAware",
 ]
diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py
index 709d851a9..4ff2519d7 100644
--- a/projects/BEVFusion/bevfusion/bevfusion.py
+++ b/projects/BEVFusion/bevfusion/bevfusion.py
@@ -1,7 +1,10 @@
+import math
 from collections import OrderedDict
 from copy import deepcopy
+from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 
+import matplotlib.pyplot as plt
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -9,6 +12,7 @@
 from mmdet3d.registry import MODELS
 from mmdet3d.structures import Det3DDataSample
 from mmdet3d.utils import OptConfigType, OptMultiConfig, OptSampleList
+from mmengine.logging import print_log
 from mmengine.utils import is_list_of
 from torch import Tensor
 from torch.nn import functional as F
@@ -34,6 +38,9 @@ def __init__(
         bbox_head: Optional[dict] = None,
         init_cfg: OptMultiConfig = None,
         seg_head: Optional[dict] = None,
+        loss_depth_weight: float = 3.0,
+        depth_gt_downsample: int = 1,
+        visualize_gt_depth_dir: Optional[str] = None,
         **kwargs,
     ) -> None:
         """Initialize BEVFusion model.
@@ -74,8 +81,12 @@ def __init__(
         self.pts_neck = MODELS.build(pts_neck) if pts_neck is not None else None
 
         self.bbox_head = MODELS.build(bbox_head)
-
-        self.init_weights()
+        self._weights_initialized = False
+        self.loss_depth_weight = loss_depth_weight
+        self.depth_gt_downsample = depth_gt_downsample
+        self.visualize_gt_depth_dir = Path(visualize_gt_depth_dir) if visualize_gt_depth_dir is not None else None
+        if self.visualize_gt_depth_dir is not None:
+            self.visualize_gt_depth_dir.mkdir(parents=True, exist_ok=True)
 
     def _forward(
         self, batch_inputs_dict: Tensor, batch_data_samples: OptSampleList = [], using_image_features=False, **kwargs
@@ -131,8 +142,11 @@ def parse_losses(self, losses: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, D
         return loss, log_vars  # type: ignore
 
     def init_weights(self) -> None:
+        if self._weights_initialized:  # already initialized by an earlier call; do not re-run
+            return
         if self.img_backbone is not None:
             self.img_backbone.init_weights()
+        self._weights_initialized = True  # set on the first call so later calls return early
 
     @property
     def with_bbox_head(self):
@@ -144,6 +158,53 @@ def with_seg_head(self):
         """bool: Whether the detector has a segmentation head."""
         return hasattr(self, "seg_head") and self.seg_head is not None
 
+    def prepare_camera_depth_aware_parameters(
+        self,
+        camera_intrinsics: torch.Tensor,
+        img_aug_matrix: torch.Tensor,
+        lidar_aug_matrix: torch.Tensor,
+        camera2lidar: torch.Tensor,
+    ) -> torch.Tensor:
+        """Build one parameter vector per camera from intrinsics, augmentation matrices and extrinsics.
+        Args:
+            camera_intrinsics: torch.Tensor, the camera intrinsics of shape (B, N, 3, 3).
+            img_aug_matrix: torch.Tensor, the image augmentation matrix of shape (B, N, 4, 4).
+            lidar_aug_matrix: torch.Tensor, the lidar augmentation matrix of shape (B, 4, 4).
+            camera2lidar: torch.Tensor, the camera to lidar matrix of shape (B, N, 4, 4).
+        Returns:
+            torch.Tensor, the camera depth aware parameters of shape (B, N, 15 + 12).
+        """
+        B, N, _, _ = camera_intrinsics.shape
+        lidar_aug_matrix = lidar_aug_matrix.view(B, 1, 4, 4).repeat(1, N, 1, 1)
+
+        # (B, N, 15): every stacked entry is (B, N) and stacking on the last dim keeps B and N separate
+        mlp_input = torch.stack(
+            [
+                camera_intrinsics[:, :, 0, 0],  # fx
+                camera_intrinsics[:, :, 1, 1],  # fy
+                camera_intrinsics[:, :, 0, 2],  # cx
+                camera_intrinsics[:, :, 1, 2],  # cy
+                img_aug_matrix[:, :, 0, 0],  # r11
+                img_aug_matrix[:, :, 0, 1],  # r12
+                img_aug_matrix[:, :, 0, 3],  # t1
+                img_aug_matrix[:, :, 1, 0],  # r21
+                img_aug_matrix[:, :, 1, 1],  # r22
+                img_aug_matrix[:, :, 1, 3],  # t2
+                lidar_aug_matrix[:, :, 0, 0],  # r11
+                lidar_aug_matrix[:, :, 0, 1],  # r12
+                lidar_aug_matrix[:, :, 1, 0],  # r21
+                lidar_aug_matrix[:, :, 1, 1],  # r22
+                lidar_aug_matrix[:, :, 2, 2],  # r33
+            ],
+            dim=-1,
+        )
+        # (B, N, 4, 4) -> (B, N, 3, 4) -> (B, N, 12)
+        camera2lidar_flatten = camera2lidar[:, :, :3, :].view(B, N, -1)
+
+        # (B, N, 15+12)
+        mlp_input = torch.cat([mlp_input, camera2lidar_flatten], dim=-1)
+        return mlp_input
+
     def get_image_backbone_features(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C, H, W = x.size()
         x = x.view(B * N, C, H, W).contiguous()
@@ -174,14 +235,15 @@ def extract_img_feat(
         lidar_aug_matrix_inverse=None,
         geom_feats=None,
         using_image_features=False,
-    ) -> torch.Tensor:
+        camera_depth_aware_parameters=None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
 
         if not using_image_features:
             x = self.get_image_backbone_features(x)
 
         with torch.amp.autocast("cuda", enabled=False):
             # with torch.autocast(device_type='cuda', dtype=torch.float32):
-            x = self.view_transform(
+            x, pred_depths = self.view_transform(
                 x,
                 points,
                 lidar2image,
@@ -194,8 +256,9 @@ def extract_img_feat(
                 img_aug_matrix_inverse,
                 lidar_aug_matrix_inverse,
                 geom_feats,
+                camera_depth_aware_parameters=camera_depth_aware_parameters,
             )
-        return x
+        return x, pred_depths
 
     def extract_pts_feat(self, feats, coords, sizes, points=None) -> torch.Tensor:
         if points is not None:
@@ -271,7 +334,7 @@ def predict(
                 contains a tensor with shape (num_instances, 7).
         """
         batch_input_metas = [item.metainfo for item in batch_data_samples]
-        feats = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features)
+        feats, _ = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features)
 
         if self.with_bbox_head:
             outputs = self.bbox_head.predict(feats, batch_input_metas)
@@ -292,6 +355,7 @@ def extract_feat(
         features = []
 
         is_onnx_inference = False
+        pred_depths = None
         if imgs is not None and "lidar2img" not in batch_inputs_dict:
             # NOTE(knzo25): normal training and testing
             imgs = imgs.contiguous()
@@ -309,7 +373,13 @@ def extract_feat(
             camera2lidar = imgs.new_tensor(np.asarray(camera2lidar))
             img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix))
             lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix))
-            img_feature = self.extract_img_feat(
+            camera_depth_aware_parameters = self.prepare_camera_depth_aware_parameters(
+                camera_intrinsics=camera_intrinsics,
+                img_aug_matrix=img_aug_matrix,
+                lidar_aug_matrix=lidar_aug_matrix,
+                camera2lidar=camera2lidar,
+            )
+            img_feature, pred_depths = self.extract_img_feat(
                 imgs,
                 deepcopy(points),
                 lidar2image,
@@ -319,6 +389,7 @@ def extract_feat(
                 lidar_aug_matrix,
                 batch_input_metas,
                 using_image_features=using_image_features,
+                camera_depth_aware_parameters=camera_depth_aware_parameters,
             )
             features.append(img_feature)
         elif imgs is not None:
@@ -330,8 +401,10 @@ def extract_feat(
             img_aug_matrix = batch_inputs_dict["img_aug_matrix"]
             lidar_aug_matrix = batch_inputs_dict["lidar_aug_matrix"]
             geom_feats = batch_inputs_dict["geom_feats"]
+            # Retrieve the parameters from deployment code directly
+            camera_depth_aware_parameters = batch_inputs_dict["camera_depth_aware_parameters"]
 
-            img_feature = self.extract_img_feat(
+            img_feature, pred_depths = self.extract_img_feat(
                 imgs,
                 points,
                 lidar2image,
@@ -342,6 +415,7 @@ def extract_feat(
                 batch_input_metas,
                 geom_feats=geom_feats,
                 using_image_features=using_image_features,
+                camera_depth_aware_parameters=camera_depth_aware_parameters,
             )
             features.append(img_feature)
 
@@ -366,7 +440,7 @@ def extract_feat(
         if self.pts_neck is not None:
             x = self.pts_neck(x)
 
-        return x
+        return x, pred_depths
 
     def loss(
         self,
@@ -376,12 +450,145 @@ def loss(
         **kwargs,
     ) -> List[Det3DDataSample]:
         batch_input_metas = [item.metainfo for item in batch_data_samples]
-        feats = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features)
+        feats, pred_depths = self.extract_feat(batch_inputs_dict, batch_input_metas, using_image_features)
 
         losses = dict()
+        if self.loss_depth_weight > 0 and pred_depths is not None:
+            with torch.amp.autocast("cuda", enabled=False):
+                gt_depths = torch.stack(
+                    [
+                        (
+                            meta["gt_depths"]
+                            if isinstance(meta["gt_depths"], torch.Tensor)
+                            else torch.as_tensor(meta["gt_depths"])
+                        )
+                        for meta in batch_input_metas
+                    ]
+                ).to(device=pred_depths.device, dtype=torch.float32)
+                depth_loss = self.get_depth_loss(gt_depths, pred_depths)
+                losses["loss_depth"] = depth_loss
+
         if self.with_bbox_head:
             bbox_loss = self.bbox_head.loss(feats, batch_data_samples)
-
-        losses.update(bbox_loss)
+            losses.update(bbox_loss)
 
         return losses
+
+    def _visualize_one_hot_gt_depth(
+        self,
+        gt_depths_one_hot: Tensor,
+        batch_size: int,
+        num_cameras: int,
+        height: int,
+        width: int,
+        batch_idx: int = 0,
+        num_channels: int = 6,
+    ) -> None:
+        """Save the first few one-hot GT depth bins of camera 0 for one batch sample as a PNG figure.
+
+        Args:
+            gt_depths_one_hot (Tensor): One-hot depth GT of shape [B*N*h*w, D], with h, w the downsampled size.
+            batch_size (int): Batch size B from the original input.
+            num_cameras (int): Number of camera views N from the original input.
+            height (int): Original input height H before downsampling.
+            width (int): Original input width W before downsampling.
+            batch_idx (int): Batch index to visualize.
+            num_channels (int): Number of depth-bin channels to visualize.
+        """
+        if self.visualize_gt_depth_dir is None:
+            return
+        # In distributed runs only rank 0 writes figures.
+        if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0:
+            return
+        # Nothing to draw when the requested sample or the camera axis does not exist.
+        if batch_size <= batch_idx or num_cameras == 0:
+            return
+        # Spatial size of the one-hot maps after GT downsampling.
+        downsample = self.depth_gt_downsample
+        height_down = height // downsample
+        width_down = width // downsample
+        num_depth_bins = gt_depths_one_hot.shape[1]
+
+        num_channels = min(num_channels, num_depth_bins)
+        if num_channels == 0 or height_down == 0 or width_down == 0:
+            return
+
+        with torch.no_grad():
+            one_hot = gt_depths_one_hot.view(batch_size, num_cameras, height_down, width_down, num_depth_bins)
+            depth_channels = one_hot[batch_idx, 0, :, :, :num_channels].detach().float().cpu().numpy()
+
+        ncols = min(3, num_channels)
+        nrows = math.ceil(num_channels / ncols)
+        fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows), squeeze=False)
+        # NOTE(review): assumes dbound is (start, end, step), so depth_m approximates the centre of bin ch_idx.
+        dbounds = self.view_transform.dbound
+        for ch_idx in range(num_channels):
+            ax = axes[ch_idx // ncols, ch_idx % ncols]
+            channel_map = depth_channels[:, :, ch_idx]
+            depth_m = dbounds[0] + (ch_idx + 0.5) * dbounds[2]
+            im = ax.imshow(channel_map, cmap="viridis", vmin=0, vmax=1, interpolation="nearest")
+            ax.set_title(f"batch {batch_idx}, depth bin {ch_idx} (~{depth_m:.1f}m)")
+            ax.set_xticks([])
+            ax.set_yticks([])
+            fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
+        # Hide the unused axes of the last grid row.
+        for ch_idx in range(num_channels, nrows * ncols):
+            axes[ch_idx // ncols, ch_idx % ncols].axis("off")
+
+        fig.suptitle(f"one-hot gt_depth (batch={batch_idx}, cam=0, bins=0-{num_channels - 1})")
+        fig.tight_layout()
+        # Per-instance counter so that successive calls write distinct file names.
+        if not hasattr(self, "_gt_depth_one_hot_vis_count"):
+            self._gt_depth_one_hot_vis_count = 0
+        self._gt_depth_one_hot_vis_count += 1
+        save_path = self.visualize_gt_depth_dir / f"gt_depth_one_hot_{self._gt_depth_one_hot_vis_count:06d}.png"
+        fig.savefig(save_path, dpi=150, bbox_inches="tight")
+        plt.close(fig)
+        print_log(f"Saved one-hot gt_depth visualization to {save_path.resolve()}")
+
+    def get_downsampled_gt_depth(self, gt_depths):
+        """Min-pool the GT depth maps by `depth_gt_downsample` and one-hot encode them into D depth bins.
+        Input:
+            gt_depths: [B, N, H, W]; a value of 0.0 is treated as "no depth" and ignored by the min-pooling
+        Output:
+            gt_depths: float tensor [B*N*h*w, D] with h = H // depth_gt_downsample, w = W // depth_gt_downsample
+        """
+        B, N, H, W = gt_depths.shape
+        D = self.view_transform.D
+        dbounds = self.view_transform.dbound
+        gt_depths = gt_depths.view(
+            B * N,
+            H // self.depth_gt_downsample,
+            self.depth_gt_downsample,
+            W // self.depth_gt_downsample,
+            self.depth_gt_downsample,
+            1,
+        )
+        gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous()
+        gt_depths = gt_depths.view(-1, self.depth_gt_downsample * self.depth_gt_downsample)
+        gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths)
+        gt_depths = torch.min(gt_depths_tmp, dim=-1).values
+        gt_depths = gt_depths.view(B * N, H // self.depth_gt_downsample, W // self.depth_gt_downsample)
+
+        gt_depths = (gt_depths - (dbounds[0] - dbounds[2])) / dbounds[2]
+        # Shifted bin index: a depth of dbounds[0] maps to 1.0, which leaves index 0 free for "no valid depth".
+        # Values outside [0, D + 1) are reset to 0 (presumably this includes the 1e5 placeholder of empty cells).
+        gt_depths = torch.where((gt_depths >= 0.0) & (gt_depths < D + 1), gt_depths, torch.zeros_like(gt_depths))
+        # One-hot over D + 1 classes, then drop class 0 so that such cells become all-zero rows.
+        gt_depths = F.one_hot(gt_depths.long(), num_classes=D + 1).view(-1, D + 1)[:, 1:]
+        self._visualize_one_hot_gt_depth(gt_depths, B, N, H, W)
+        return gt_depths.float()
+
+    def get_depth_loss(self, depth_labels, depth_preds):
+        depth_labels = self.get_downsampled_gt_depth(depth_labels)
+        # (B, N, D, H, W) -> (B*N*H*W, D): one row of D bin scores per pixel, like the one-hot labels
+        depth_preds = depth_preds.permute(0, 1, 3, 4, 2).contiguous().view(-1, self.view_transform.D)
+        fg_mask = torch.max(depth_labels, dim=1).values > 0.0  # rows that have an active depth bin
+        depth_labels = depth_labels[fg_mask]
+        depth_preds = depth_preds[fg_mask]
+        depth_loss = F.binary_cross_entropy(
+            depth_preds,
+            depth_labels,
+            reduction="none",
+        ).sum() / max(1.0, fg_mask.sum())  # summed BCE over the selected-row count; max() avoids dividing by zero
+        return self.loss_depth_weight * depth_loss
diff --git a/projects/BEVFusion/bevfusion/bevfusion_head.py b/projects/BEVFusion/bevfusion/bevfusion_head.py
index 2d713b022..a7ddca4ca 100644
--- a/projects/BEVFusion/bevfusion/bevfusion_head.py
+++ b/projects/BEVFusion/bevfusion/bevfusion_head.py
@@ -26,11 +26,18 @@ def clip_sigmoid(x, eps=1e-4):
 @MODELS.register_module()
 class ConvFuser(nn.Sequential):
 
-    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, padding: int) -> None:
+    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, padding: int) -> None:
         self.in_channels = in_channels
         self.out_channels = out_channels
         super().__init__(
-            nn.Conv2d(sum(in_channels), out_channels, kernel_size, padding, bias=False),
+            nn.Conv2d(
+                sum(in_channels),
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                bias=False,
+            ),
             nn.BatchNorm2d(out_channels),
             nn.ReLU(True),
         )
diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py
index ac7c5b503..d0a547258 100644
--- a/projects/BEVFusion/bevfusion/depth_lss.py
+++ b/projects/BEVFusion/bevfusion/depth_lss.py
@@ -1,8 +1,13 @@
 # modify from https://github.com/mit-han-lab/bevfusion
+import math
+from pathlib import Path
 from typing import Tuple
 
+import matplotlib.pyplot as plt
+import numpy as np
 import torch
 from mmdet3d.registry import MODELS
+from mmengine.logging import print_log
 from torch import nn
 
 from .ops import bev_pool
@@ -164,6 +169,7 @@ def __init__(
         ybound: Tuple[float, float, float],
         zbound: Tuple[float, float, float],
         dbound: Tuple[float, float, float],
+        visualize_bev_feat: bool = False,
     ) -> None:
         super().__init__()
         self.in_channels = in_channels
@@ -183,6 +189,7 @@ def __init__(
         self.frustum = self.create_frustum()
         self.D = self.frustum.shape[0]
         self.fp16_enabled = False
+        self.visualize_bev_feat = visualize_bev_feat
 
     def create_frustum(self):
         iH, iW = self.image_size
@@ -319,8 +326,55 @@ def bev_pool_precomputed(self, x, geom_feats, kept, ranks, indices):
 
         # collapse Z
         final = torch.cat(x.unbind(dim=2), 1)
+        if self.visualize_bev_feat:
+            self.plot_bev_feat(final)
+
         return final
 
+    def plot_bev_feat(self, bev_feat):
+        """Save a PNG grid of up to the first 10 channel maps of batch sample 0 of `bev_feat` (rank 0 only)."""
+        try:
+            import torch.distributed as dist
+
+            if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0:
+                return
+        except ImportError:
+            pass
+
+        batch_idx = 0  # the plotted sample is fixed to the first one in the batch
+        if bev_feat.shape[0] <= batch_idx:
+            return
+
+        # save first 10 raw channel maps for one batch sample; assumes bev_feat is (B, C, Y, X) - TODO confirm
+        num_channels = 10
+        with torch.no_grad():
+            feat = bev_feat[batch_idx].detach().float().cpu().numpy()
+        channel_indices = np.arange(min(num_channels, feat.shape[0]))
+        ncols = min(5, len(channel_indices))  # at most 5 panels per row
+        nrows = math.ceil(len(channel_indices) / ncols)
+        fig, axes = plt.subplots(nrows, ncols, figsize=(3 * ncols, 3 * nrows), squeeze=False)
+        for ax, ch_idx in zip(axes.ravel(), channel_indices):
+            ch_map = feat[ch_idx]
+            im = ax.imshow(ch_map, cmap="viridis", origin="lower", aspect="equal")
+            ax.set_title(f"ch {ch_idx}", fontsize=9)
+            ax.set_xlabel("X")
+            ax.set_ylabel("Y")
+            fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
+        for ax in axes.ravel()[len(channel_indices) :]:  # hide grid cells that have no channel to show
+            ax.axis("off")
+        fig.suptitle(f"bev_feat channels 0-{len(channel_indices) - 1} (batch={batch_idx})")
+        fig.tight_layout()
+
+        save_dir = Path("work_dirs/bev_feat_vis_2")  # relative to the current working directory
+        save_dir.mkdir(parents=True, exist_ok=True)
+        if not hasattr(self, "_bev_feat_vis_count"):  # per-instance call counter, created lazily on first use
+            self._bev_feat_vis_count = 0
+        self._bev_feat_vis_count += 1
+        save_path = save_dir / f"bev_feat_batch{batch_idx}_{self._bev_feat_vis_count:06d}.png"
+        fig.savefig(save_path, dpi=150, bbox_inches="tight")
+        plt.close(fig)  # release the figure so repeated calls do not accumulate open figures
+        print_log(f"Saved BEV feat visualization to {save_path.resolve()}")
+
     def forward(
         self,
         img,
diff --git a/projects/BEVFusion/bevfusion/depth_lss_v2.py b/projects/BEVFusion/bevfusion/depth_lss_v2.py
new file mode 100644
index 000000000..addf038d3
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/depth_lss_v2.py
@@ -0,0 +1,537 @@
+import math
+from pathlib import Path
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+from mmdet3d.registry import MODELS
+from mmengine.logging import print_log
+from torch import nn
+from torch.utils.checkpoint import checkpoint
+
+from .depth_lss import BaseViewTransform, DepthLSSNet, DownSampleNet, LidarDepthImageNet
+from .ops import bev_pool_v2
+
+
+class SELayer(nn.Module):
+    """
+    Squeeze-and-Excitation (SE) layer.
+    This is used to modulate features with camera-depth aware parameters.
+    The code is taken from BEVDET (https://github.com/hustvl/BEVDET).
+    """
+
+    def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid):
+        super().__init__()
+        # No global pooling here; the gating input is expected to be (B*N, C, 1, 1) already - TODO confirm with callers.
+        self.sequeeze_net = nn.Sequential(
+            # First 1x1 convolution (channel count is kept at `channels`)
+            nn.Conv2d(channels, channels, 1, bias=True),
+            # Activation
+            act_layer(),
+            # Second 1x1 convolution (channel count is kept at `channels`)
+            nn.Conv2d(channels, channels, 1, bias=True),
+            # Gate activation (nn.Sigmoid unless `gate_layer` is overridden)
+            gate_layer(),
+        )
+
+    def forward(self, x: torch.Tensor, depth_aware_features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: torch.Tensor, the features to modulate; depth_aware_features: torch.Tensor, the input of the gating network.
+        Returns:
+            torch.Tensor, `x` multiplied (with broadcasting) by the gate computed from `depth_aware_features`.
+        """
+        feature_attentions = self.sequeeze_net(depth_aware_features)
+        return x * feature_attentions
+
+
+class CameraDepthLinearProjectionMLP(nn.Module):
+    """
+    Two-layer MLP (Linear -> ReLU -> Dropout -> Linear -> Dropout). Per the original note it projects
+    camera-depth aware parameters (for example, intrinsics) to an embedding space.
+    The code is taken from BEVDET (https://github.com/hustvl/BEVDET).
+    """
+
+    def __init__(self, in_channels: int, hidden_channels: int, out_channels: int, drop_out: float = 0.0):
+        """
+        Args:
+            in_channels: int, the number of input channels.
+            hidden_channels: int, the number of hidden channels.
+            out_channels: int, the number of output channels.
+            drop_out: float, the dropout rate applied after each of the two linear stages.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.drop_out = drop_out
+
+        self.sequential_mlp = nn.Sequential(
+            nn.Linear(in_channels, hidden_channels),
+            nn.ReLU(inplace=True),
+            nn.Dropout(drop_out),
+            nn.Linear(hidden_channels, out_channels),
+            nn.Dropout(drop_out),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: torch.Tensor, the input tensor whose last dimension is `in_channels`.
+        Returns:
+            torch.Tensor, the output tensor with the same leading dimensions and `out_channels` as last dimension.
+        """
+        return self.sequential_mlp(x)
+
+
+class CameraDepthAwareNet(nn.Module):
+    """
+    Camera-depth aware net: produces depth and context features from image features and camera parameters.
+    The code is taken from BEVDET (https://github.com/hustvl/BEVDET).
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        hidden_channels: int,
+        out_channels: int,
+        mlp_drop_out: float,
+        depth_channels: int,
+        with_cp: bool = False,
+        num_camera_depth_parameters: int = 27,
+    ) -> None:
+        """
+        Args:
+            in_channels: int, channels of the input feature map; hidden_channels: int, width of the trunk and both MLPs.
+            out_channels: int, channels of the context output; depth_channels: int, channels of the depth output.
+            mlp_drop_out: float, the dropout rate of both MLPs.
+            with_cp: bool, run the depth convolution through torch.utils.checkpoint.checkpoint when True.
+            num_camera_depth_parameters: int, length of each camera parameter vector fed to BatchNorm1d and the MLPs.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.mlp_drop_out = mlp_drop_out
+        self.num_camera_depth_parameters = num_camera_depth_parameters
+        self.depth_channels = depth_channels
+        self.with_cp = with_cp
+
+        # Normalization shared by both branches:
+        # batch norm over the raw camera-depth aware parameter vectors
+        self.camera_depth_aware_parameters_bn = nn.BatchNorm1d(self.num_camera_depth_parameters)
+
+        # Context/image feature branch
+        self.context_input_conv = nn.Sequential(
+            nn.Conv2d(in_channels, hidden_channels, kernel_size=1, stride=1, bias=False),
+            nn.BatchNorm2d(hidden_channels),
+            nn.ReLU(inplace=True),
+        )
+        self.context_camera_depth_aware_mlp = CameraDepthLinearProjectionMLP(
+            in_channels=self.num_camera_depth_parameters,
+            hidden_channels=hidden_channels,
+            out_channels=hidden_channels,
+            drop_out=self.mlp_drop_out,
+        )
+        self.context_se = SELayer(channels=hidden_channels)
+        self.context_conv = nn.Conv2d(hidden_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True)
+
+        # Depth branch
+        self.depth_camera_depth_aware_mlp = CameraDepthLinearProjectionMLP(
+            in_channels=self.num_camera_depth_parameters,
+            hidden_channels=hidden_channels,
+            out_channels=hidden_channels,
+            drop_out=self.mlp_drop_out,
+        )
+        self.depth_se = SELayer(channels=hidden_channels)
+        self.depth_conv = nn.Sequential(
+            nn.Conv2d(hidden_channels, depth_channels, kernel_size=1, stride=1, padding=0, bias=True)
+        )
+
+    def context_forward(
+        self, context_features: torch.Tensor, camera_depth_aware_features: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            context_features: torch.Tensor, the input tensor of shape (B*N, hidden_channels, H, W).
+            camera_depth_aware_features: torch.Tensor, normalized camera parameters of shape (B*N, N_CAMERA_DEPTH_PARAMETERS).
+        Returns:
+            torch.Tensor, the output tensor of shape (B*N, out_channels, H, W).
+        """
+        context_camera_depth_aware_features = self.context_camera_depth_aware_mlp(camera_depth_aware_features)
+        # (B*N, hidden_channels) -> (B*N, hidden_channels, 1, 1)
+        context_camera_depth_aware_features = context_camera_depth_aware_features.view(-1, self.hidden_channels, 1, 1)
+        context_features = self.context_se(context_features, context_camera_depth_aware_features)
+        context_features = self.context_conv(context_features)
+        return context_features
+
+    def depth_forward(self, depth_features: torch.Tensor, camera_depth_aware_features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            depth_features: torch.Tensor, the input tensor of shape (B*N, hidden_channels, H, W).
+            camera_depth_aware_features: torch.Tensor, normalized camera parameters of shape (B*N, N_CAMERA_DEPTH_PARAMETERS).
+        Returns:
+            torch.Tensor, the output tensor of shape (B*N, depth_channels, H, W).
+        """
+        depth_camera_depth_aware_features = self.depth_camera_depth_aware_mlp(camera_depth_aware_features)
+        # (B*N, hidden_channels) -> (B*N, hidden_channels, 1, 1)
+        depth_camera_depth_aware_features = depth_camera_depth_aware_features.view(-1, self.hidden_channels, 1, 1)
+        # (B*N, hidden_channels, H, W), gated per channel
+        depth_features = self.depth_se(depth_features, depth_camera_depth_aware_features)
+        if self.with_cp:
+            depth_features = checkpoint(self.depth_conv, depth_features)
+        else:
+            depth_features = self.depth_conv(depth_features)
+        return depth_features
+
+    def forward(self, x: torch.Tensor, camera_depth_aware_parameters: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: torch.Tensor, image features fed straight to a Conv2d + BatchNorm2d, i.e. 4-D (B*N, in_channels, H, W).
+            camera_depth_aware_parameters: torch.Tensor, reshaped to (-1, N_CAMERA_DEPTH_PARAMETERS), e.g. from (B, N, N_CAMERA_DEPTH_PARAMETERS).
+        Returns:
+            torch.Tensor, depth then context features concatenated on dim 1: (B*N, depth_channels + out_channels, H, W).
+        """
+        # (B, N, N_CAMERA_DEPTH_PARAMETERS) -> (B*N, N_CAMERA_DEPTH_PARAMETERS)
+        camera_depth_aware_parameters = camera_depth_aware_parameters.view(-1, self.num_camera_depth_parameters)
+
+        # (B*N, N_CAMERA_DEPTH_PARAMETERS)
+        camera_depth_aware_features = self.camera_depth_aware_parameters_bn(camera_depth_aware_parameters)
+        context_input_features = self.context_input_conv(x)
+        context_features = self.context_forward(context_input_features, camera_depth_aware_features)
+        depth_features = self.depth_forward(context_input_features, camera_depth_aware_features)
+        return torch.cat([depth_features, context_features], dim=1)
+
+
+class BaseViewTransformV2(BaseViewTransform):
+    """View transform that pools camera features into a BEV grid with `bev_pool_v2`; subclasses implement `get_cam_feats`."""
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        image_size: Tuple[int, int],
+        feature_size: Tuple[int, int],
+        xbound: Tuple[float, float, float],
+        ybound: Tuple[float, float, float],
+        zbound: Tuple[float, float, float],
+        dbound: Tuple[float, float, float],
+        collapse_z: bool = True,
+        expand_batch_axis: bool = False,
+        visualize_bev_feat: bool = False,
+    ):
+        """
+        Args:
+            collapse_z: fold the Z axis of the pooled BEV grid into the channel axis; other arguments go to the parent.
+            expand_batch_axis: add a leading axis to the view features and depth before bev pool if this is set to True.
+        """
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            image_size=image_size,
+            feature_size=feature_size,
+            xbound=xbound,
+            ybound=ybound,
+            zbound=zbound,
+            dbound=dbound,
+            visualize_bev_feat=visualize_bev_feat,
+        )
+        self.collapse_z = collapse_z
+        self.expand_batch_axis = expand_batch_axis
+
+    def get_cam_feats(
+        self, x, camera_depth_aware_parameters: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError  # subclasses return (view_feats, depth_softmax)
+
+    def forward(
+        self,
+        img,
+        points,
+        lidar2image,
+        camera_intrinsics,
+        camera2lidar,
+        img_aug_matrix,
+        lidar_aug_matrix,
+        metas,
+        camera_intrinsics_inverse,
+        img_aug_matrix_inverse,
+        lidar_aug_matrix_inverse,
+        geom_feats_precomputed,
+        camera_depth_aware_parameters: Optional[torch.Tensor] = None,
+    ):
+        if geom_feats_precomputed is not None:
+            ranks_bev, ranks_depth, ranks_feat = geom_feats_precomputed
+            x, depth_softmax = self.get_cam_feats(img, camera_depth_aware_parameters)
+            x = self.bev_pool_precomputed(x, depth_softmax, ranks_bev, ranks_depth, ranks_feat)
+
+            # Depth predictions are not returned when precomputed geometry features are used
+            depth_softmax = None
+
+        else:
+            intrins = camera_intrinsics[..., :3, :3]
+            post_rots = img_aug_matrix[..., :3, :3]
+            post_trans = img_aug_matrix[..., :3, 3]
+            camera2lidar_rots = camera2lidar[..., :3, :3]
+            camera2lidar_trans = camera2lidar[..., :3, 3]
+
+            extra_rots = lidar_aug_matrix[..., :3, :3]
+            extra_trans = lidar_aug_matrix[..., :3, 3]
+
+            geom = self.get_geometry(
+                camera2lidar_rots,
+                camera2lidar_trans,
+                torch.inverse(intrins),
+                torch.inverse(post_rots),
+                post_trans,
+                extra_rots=extra_rots,
+                extra_trans=extra_trans,
+            )
+
+            # get_cam_feats yields the per-camera view features and the depth
+            # distribution; camera_depth_aware_parameters is forwarded in both
+            # branches so subclasses that require it never receive None by accident.
+            (
+                view_feats,
+                depth_softmax,
+            ) = self.get_cam_feats(img, camera_depth_aware_parameters)
+            x = self.bev_pool(view_feats, depth_softmax, geom)
+
+        return x, depth_softmax
+
+    def bev_pool_aux(self, geom_feats):
+        B, N, D, H, W, C = geom_feats.shape
+        Nprime = B * N * D * H * W
+        assert C == 3
+
+        # record the index of selected points for acceleration purpose
+        ranks_depth = torch.arange(0, Nprime, dtype=torch.int, device=geom_feats.device)
+        ranks_feat = torch.arange(0, Nprime // D, dtype=torch.int, device=geom_feats.device)
+        ranks_feat = ranks_feat.reshape(B, N, 1, H, W)
+        ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten()
+
+        # convert metric coordinates to integer voxel indices
+        geom_feats = ((geom_feats - (self.bx - self.dx / 2.0)) / self.dx).long()
+        geom_feats = geom_feats.view(Nprime, 3)
+        batch_ix = torch.cat(
+            [torch.full([Nprime // B, 1], ix, device=geom_feats.device, dtype=torch.long) for ix in range(B)]
+        )
+        geom_feats = torch.cat((geom_feats, batch_ix), 1)
+
+        # filter out points that are outside box
+        kept = (
+            (geom_feats[:, 0] >= 0)
+            & (geom_feats[:, 0] < self.nx[0])
+            & (geom_feats[:, 1] >= 0)
+            & (geom_feats[:, 1] < self.nx[1])
+            & (geom_feats[:, 2] >= 0)
+            & (geom_feats[:, 2] < self.nx[2])
+        )
+
+        if not kept.any():  # `kept` is a mask of length Nprime, so its length never tells how many points survive
+            return None, None, None
+
+        geom_feats, ranks_depth, ranks_feat = geom_feats[kept], ranks_depth[kept], ranks_feat[kept]
+
+        # Switch x and y to match the order of the BEV grid
+        ranks_bev = (
+            geom_feats[:, 3] * (self.nx[2] * self.nx[1] * self.nx[0])
+            + geom_feats[:, 2] * (self.nx[1] * self.nx[0])
+            + geom_feats[:, 0] * self.nx[1]
+            + geom_feats[:, 1]
+        )
+        indices = ranks_bev.argsort()
+        ranks_bev, ranks_depth, ranks_feat = ranks_bev[indices], ranks_depth[indices], ranks_feat[indices]
+        return (
+            ranks_bev.int().contiguous(),
+            ranks_depth.int().contiguous(),
+            ranks_feat.int().contiguous(),
+        )
+
+    def compute_intervals(self, ranks_bev: Optional[torch.Tensor]) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+        if ranks_bev is None:
+            return None, None
+
+        kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
+        kept[1:] = ranks_bev[1:] != ranks_bev[:-1]  # True where a new run of equal (sorted) ranks begins
+        interval_starts = torch.where(kept)[0].int()
+        if len(interval_starts) == 0:
+            return None, None
+
+        interval_lengths = torch.zeros_like(interval_starts)
+        interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
+        interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
+        return interval_starts.int().contiguous(), interval_lengths.int().contiguous()
+
+    def bev_pool(self, view_feats, depth_softmax, geom) -> torch.Tensor:
+        """Pool `view_feats` weighted by `depth_softmax` into the BEV grid, using indices derived from `geom`."""
+        ranks_bev, ranks_depth, ranks_feat = self.bev_pool_aux(geom)
+        interval_starts, interval_lengths = self.compute_intervals(ranks_bev)
+        bev_feat = self.compute_bev_pool(
+            view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths
+        )
+        return bev_feat
+
+    def compute_bev_pool(
+        self, view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths
+    ):
+        """Compute the BEV pool for the given view features, depth softmax, ranks, and intervals."""
+        if interval_starts is None:
+            print_log("warning ---> no points within the predefined bev receptive field")
+            dummy = torch.zeros(
+                size=[view_feats.shape[0], view_feats.shape[2], self.nx[2], self.nx[1], self.nx[0]],
+                dtype=view_feats.dtype,
+                device=view_feats.device,
+            )
+            if self.collapse_z:
+                dummy = torch.cat(dummy.unbind(dim=2), 1)
+            return dummy
+
+        if self.expand_batch_axis:
+            view_feats = view_feats.unsqueeze(0)
+            depth_softmax = depth_softmax.unsqueeze(0)
+
+        # permute view_feats from (B, N, C, fH, fW) to (B, N, fH, fW, C)
+        view_feats = view_feats.permute(0, 1, 3, 4, 2)
+        bev_feat_shape = (
+            depth_softmax.shape[0],
+            int(self.nx[2]),
+            int(self.nx[1]),
+            int(self.nx[0]),
+            view_feats.shape[-1],
+        )  # (B, Z, Y, X, C)
+        bev_feat = bev_pool_v2(
+            depth=depth_softmax,
+            feat=view_feats,
+            ranks_depth=ranks_depth,
+            ranks_feat=ranks_feat,
+            ranks_bev=ranks_bev,
+            interval_starts=interval_starts,
+            interval_lengths=interval_lengths,
+            bev_feat_shape=bev_feat_shape,
+            is_training=self.training,
+        )
+
+        # collapse Z
+        if self.collapse_z:
+            bev_feat = torch.cat(bev_feat.unbind(dim=2), 1)
+
+        if self.visualize_bev_feat:
+            self.plot_bev_feat(bev_feat)
+
+        return bev_feat
+
+    def bev_pool_precomputed(self, view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat):
+        interval_starts, interval_lengths = self.compute_intervals(ranks_bev)  # ranks come from the caller here
+        bev_feat = self.compute_bev_pool(
+            view_feats, depth_softmax, ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths
+        )
+        return bev_feat
+
+    def get_depth_softmax(self, x: torch.Tensor, B, N, fH, fW) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            x: torch.Tensor, the input tensor of shape (B*N, D+C, H, W); B, N, fH, fW give the target view shape.
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor], the tuple containing the view features and depth softmax.
+            view_feats: torch.Tensor, the view features of shape (B, N, C, H, W).
+            depth_softmax: torch.Tensor, the depth softmax of shape (B, N, D, H, W).
+        """
+        depth_softmax = x[:, : self.D].softmax(dim=1)
+        depth_softmax = depth_softmax.view(B, N, self.D, fH, fW)
+        view_feats = x[:, self.D : (self.D + self.C)]
+        view_feats = view_feats.view(B, N, self.C, fH, fW)
+        return view_feats, depth_softmax
+
+
+@MODELS.register_module()
+class LSSTransformV2(BaseViewTransformV2):
+    """BaseViewTransformV2 whose depth and context features come from a single 1x1 convolution."""
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        image_size: Tuple[int, int],
+        feature_size: Tuple[int, int],
+        xbound: Tuple[float, float, float],
+        ybound: Tuple[float, float, float],
+        zbound: Tuple[float, float, float],
+        dbound: Tuple[float, float, float],
+        downsample: int = 1,
+    ):
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            image_size=image_size,
+            feature_size=feature_size,
+            xbound=xbound,
+            ybound=ybound,
+            zbound=zbound,
+            dbound=dbound,
+        )
+        self.depthnet = nn.Conv2d(self.in_channels, self.D + self.C, 1)  # D depth logits + C context channels
+        self.downsample = DownSampleNet(downsample, out_channels, out_channels)
+
+    def get_cam_feats(
+        self, x: torch.Tensor, camera_depth_aware_parameters: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        B, N, C, fH, fW = x.shape  # camera_depth_aware_parameters is accepted but not used by this class
+        x = x.view(B * N, C, fH, fW)  # fold the camera axis into the batch axis for the 2D convolution
+        x = self.depthnet(x)
+        return self.get_depth_softmax(x, B=B, N=N, fH=fH, fW=fW)
+
+    def forward(self, *args, **kwargs):
+        x, depth_softmax = super().forward(*args, **kwargs)
+        x = self.downsample(x)  # only the BEV features are downsampled; depth_softmax is passed through
+        return x, depth_softmax
+
+
+@MODELS.register_module()
+class LSSTransformV2DepthAware(BaseViewTransformV2):
+    """BaseViewTransformV2 whose depth and context features come from a CameraDepthAwareNet."""
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        image_size: Tuple[int, int],
+        feature_size: Tuple[int, int],
+        xbound: Tuple[float, float, float],
+        ybound: Tuple[float, float, float],
+        zbound: Tuple[float, float, float],
+        dbound: Tuple[float, float, float],
+        camera_depth_aware_configs: dict,
+        downsample: int = 1,
+    ):
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            image_size=image_size,
+            feature_size=feature_size,
+            xbound=xbound,
+            ybound=ybound,
+            zbound=zbound,
+            dbound=dbound,
+        )
+        if downsample > 1:
+            self.downsample = DownSampleNet(downsample, out_channels, out_channels)
+        else:
+            self.downsample = nn.Identity()  # no downsampling module is built for downsample <= 1
+        self.camera_depth_aware_net = CameraDepthAwareNet(
+            in_channels=in_channels,
+            hidden_channels=in_channels,
+            mlp_drop_out=camera_depth_aware_configs["mlp_drop_out"],  # the only key read from the config dict
+            depth_channels=self.D,
+            out_channels=self.C,
+        )
+
+    def get_cam_feats(
+        self, x: torch.Tensor, camera_depth_aware_parameters: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        B, N, C, fH, fW = x.shape
+        x = x.view(B * N, C, fH, fW)  # fold the camera axis into the batch axis for the 2D convolutions
+        x = self.camera_depth_aware_net(x, camera_depth_aware_parameters)  # the net calls .view on the parameters
+        return self.get_depth_softmax(x, B=B, N=N, fH=fH, fW=fW)
+
+    def forward(self, *args, **kwargs):
+        x, depth_softmax = super().forward(*args, **kwargs)
+        x = self.downsample(x)  # only the BEV features are downsampled; depth_softmax is passed through
+        return x, depth_softmax
diff --git a/projects/BEVFusion/bevfusion/loading.py b/projects/BEVFusion/bevfusion/loading.py
index 0478d67a3..be9101b64 100644
--- a/projects/BEVFusion/bevfusion/loading.py
+++ b/projects/BEVFusion/bevfusion/loading.py
@@ -1,10 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import copy
-import os
-from typing import List, Optional
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
 
+import matplotlib.pyplot as plt
 import mmcv
 import numpy as np
+from mmcv.transforms import BaseTransform
 from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles
 from mmdet3d.registry import TRANSFORMS
 from mmengine.fileio import get
@@ -35,7 +37,7 @@ class BEVLoadMultiViewImageFromFiles(LoadMultiViewImageFromFiles):
 
     def __init__(
         self,
-        camera_order: List[str],
+        camera_orders: Dict[str, List[str]],
         to_float32: bool = False,
         color_type: str = "unchanged",
         backend_args: Optional[dict] = None,
@@ -44,7 +46,7 @@ def __init__(
         test_mode: bool = False,
         set_default_scale: bool = True,
     ) -> None:
-        self.camera_order = camera_order
+        self.camera_orders = camera_orders
         self.to_float32 = to_float32
         self.color_type = color_type
         self.backend_args = backend_args
@@ -56,6 +58,7 @@ def __init__(
         self.test_mode = test_mode
         self.set_default_scale = set_default_scale
         self.before_camera_info = dict()
+        self.camera_order_types = list(camera_orders.keys())
 
     def transform(self, results: dict) -> Optional[dict]:
         """Call function to load multi-view image from files.
@@ -75,6 +78,12 @@ def transform(self, results: dict) -> Optional[dict]:
                 - scale_factor (float): Scale factor.
                 - img_norm_cfg (dict): Normalization configuration of images.
         """
+        vehicle_type = results.get("vehicle_type", None)
+        if vehicle_type is None:
+            camera_order = self.camera_orders[self.camera_order_types[0]]
+        else:
+            camera_order = self.camera_orders[vehicle_type]
+
         # TODO: consider split the multi-sweep part out of this pipeline
         # Derive the mask and transform for loading of multi-sweep data
         if self.num_ref_frames > 0:
@@ -138,7 +147,7 @@ def transform(self, results: dict) -> Optional[dict]:
 
         # to fill None data
         # for _ , cam_item in results['images'].items():
-        for camera_type in self.camera_order:
+        for camera_type in camera_order:
             if camera_type not in results["images"]:
                 continue
 
@@ -217,3 +226,219 @@ def transform(self, results: dict) -> Optional[dict]:
         results["num_views"] = self.num_views
         results["num_ref_frames"] = self.num_ref_frames
         return results
+
+
+@TRANSFORMS.register_module()
+class PointsToMultiViewImageDepths(BaseTransform):
+    """Convert points to multi-view image depths.
+
+    Args:
+        img_shape (tuple): (H, W) of the depth maps to produce.
+        num_cameras (int): Number of camera views (first axis of `gt_depths`).
+        depth_bounds (Tuple[float, float]): Half-open `[min, max)` depth range;
+            projected points outside it are dropped.
+        visualize_dir (str, optional): If set, saves a per-sample subplot
+            of `gt_depths` (one panel per camera) to this directory.
+            Useful for debugging the projection. Defaults to None.
+        max_depth (float): Upper clip for the depth color scale (m).
+            Defaults to 80.
+    """
+
+    def __init__(
+        self,
+        img_shape,
+        num_cameras: int,
+        depth_bounds: Tuple[float, float],
+        visualize_dir: Optional[str] = None,
+        max_depth: float = 80.0,
+    ):
+        self.img_shape = img_shape
+        self.num_cameras = num_cameras
+        # `visualize_dir` is stored exactly once below, normalized to a Path (or None).
+        self.max_depth = max_depth
+        self.depth_bounds = depth_bounds
+        self.visualize_dir = Path(visualize_dir) if visualize_dir is not None else None
+        if self.visualize_dir is not None:
+            self.visualize_dir.mkdir(parents=True, exist_ok=True)  # create the output directory eagerly
+        self._depth_idx = 0
+
+    def transform(self, results: dict) -> Optional[dict]:
+        """Project the LiDAR points into every camera view to build sparse depth maps.
+
+        Args:
+            results (dict): Result dict; reads `lidar2img`, `points` and, if present, `img_aug_matrix` / `lidar_aug_matrix`.
+
+        Returns:
+            dict: The same result dict with the depth maps added; `results["points"]` is left unmodified.
+            Added keys:
+                - gt_depths (np.ndarray): Ground truth depths in (N, H, W) for (number of cameras, height, width).
+        """
+        lidar2image = np.asarray(results["lidar2img"])
+        img_aug_matrix = np.asarray(results["img_aug_matrix"]) if "img_aug_matrix" in results else np.eye(4)[None]
+        cur_coords = results["points"].numpy()[:, :3]  # NOTE: a view that may share memory with the point cloud
+
+        # inverse lidar aug
+        if "lidar_aug_matrix" in results:
+            lidar_aug_matrix = np.asarray(results["lidar_aug_matrix"])
+            lidar_aug_matrix_inverse = np.linalg.inv(lidar_aug_matrix)
+            cur_coords = cur_coords - lidar_aug_matrix[:3, 3]  # out-of-place so the pipeline's points are not altered
+            cur_coords = lidar_aug_matrix_inverse[:3, :3] @ cur_coords.transpose(1, 0)
+        else:
+            cur_coords = cur_coords.transpose(1, 0)
+
+        # lidar2image
+        cur_coords = lidar2image[:, :3, :3] @ cur_coords
+        cur_coords += lidar2image[:, :3, 3].reshape(-1, 3, 1)
+
+        # keep the per-camera depth (a view of cur_coords) and mask points outside the depth bounds
+        dist = cur_coords[:, 2, :]
+        valid_dist_mask = (dist >= self.depth_bounds[0]) & (dist < self.depth_bounds[1])
+
+        cur_coords[:, 2, :] = np.clip(cur_coords[:, 2, :], 1e-5, 1e5)
+        cur_coords[:, :2, :] /= cur_coords[:, 2:3, :]
+
+        # imgaug
+        cur_coords = img_aug_matrix[:, :3, :3] @ cur_coords
+        cur_coords += img_aug_matrix[:, :3, 3].reshape(-1, 3, 1)
+        cur_coords = cur_coords[:, :2, :].transpose(0, 2, 1)
+
+        # reorder (x, y) -> (row, col) so that axis 0 indexes the image height
+        cur_coords = cur_coords[..., [1, 0]]
+        on_img = (
+            (cur_coords[..., 0] < self.img_shape[0])
+            & (cur_coords[..., 0] >= 0)
+            & (cur_coords[..., 1] < self.img_shape[1])
+            & (cur_coords[..., 1] >= 0)
+            & valid_dist_mask
+        )
+
+        # Avoid loops since it's slow
+        indices = np.nonzero(on_img)
+        camera_indices = indices[0]
+        point_indices = indices[1]
+        masked_coords = cur_coords[camera_indices, point_indices].astype(np.int64)
+        masked_dist = dist[camera_indices, point_indices]
+
+        # Several points can land on the same pixel; one of them is kept (collisions are expected to be rare)
+        flatten_indices = (
+            camera_indices * self.img_shape[0] * self.img_shape[1]
+            + masked_coords[:, 0] * self.img_shape[1]
+            + masked_coords[:, 1]
+        )
+        depth_flat = np.zeros(self.num_cameras * self.img_shape[0] * self.img_shape[1], dtype=np.float32)
+        depth_flat[flatten_indices] = masked_dist
+        depth = depth_flat.reshape(self.num_cameras, self.img_shape[0], self.img_shape[1])
+        results["gt_depths"] = depth
+
+        if self.visualize_dir is not None:
+            self._save_depth_subplot(depth, results)
+        return results
+
+    def _save_depth_subplot(self, depth: np.ndarray, results: dict) -> None:
+        """Save `gt_depths` as a subplot with one panel per camera.
+
+        The figure contains three row blocks per camera:
+        - image underlay (if available) + projected LiDAR depth points
+        - image pixels only
+        - depth-only heatmap (no image pixel values)
+
+        Args:
+            depth (np.ndarray): (num_cameras, H, W) ground-truth depth map.
+            results (dict): The pipeline result dict; only `results["img"]` is
+                read, as the optional underlay (file names use `_depth_idx`).
+        """
+        imgs = results.get("img", None)
+
+        # Layout:
+        # - Top block: image underlay + projected depth points.
+        # - Middle block: image pixels only.
+        # - Bottom block: depth-only heatmap (no image pixel values).
+        if self.num_cameras <= 6:
+            base_rows, cols = 1, self.num_cameras
+        else:
+            cols = int(np.ceil(np.sqrt(self.num_cameras)))
+            base_rows = int(np.ceil(self.num_cameras / cols))
+        rows = base_rows * 3
+
+        fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 4 * rows), squeeze=False)
+
+        for c in range(self.num_cameras):
+            d = depth[c]
+            ys, xs = np.nonzero(d)
+            vals = d[ys, xs]
+
+            # Row block 1: image + depth scatter.
+            ax_overlay = axes[c // cols, c % cols]
+            if imgs is not None and c < len(imgs):
+                ax_overlay.imshow(imgs[c].astype(np.uint8))
+                if vals.size > 0:
+                    ax_overlay.scatter(
+                        xs,
+                        ys,
+                        c=vals,
+                        cmap="turbo",
+                        vmin=0,
+                        vmax=self.max_depth,
+                        s=1,
+                    )
+            else:
+                ax_overlay.imshow(
+                    d,
+                    cmap="turbo",
+                    vmin=0,
+                    vmax=self.max_depth,
+                    interpolation="nearest",
+                )
+            ax_overlay.set_title(f"cam {c} overlay  ({vals.size} pts)")
+            ax_overlay.set_xticks([])
+            ax_overlay.set_yticks([])
+
+            # Row block 2: image-only visualization.
+            ax_img = axes[base_rows + (c // cols), c % cols]
+            if imgs is not None and c < len(imgs):
+                ax_img.imshow(imgs[c].astype(np.uint8))
+            else:
+                ax_img.imshow(
+                    d,
+                    cmap="gray",
+                    vmin=0,
+                    vmax=self.max_depth,
+                    interpolation="nearest",
+                )
+            ax_img.set_title(f"cam {c} image-only")
+            ax_img.set_xticks([])
+            ax_img.set_yticks([])
+
+            # Row block 3: depth-only visualization.
+            ax_depth = axes[(base_rows * 2) + (c // cols), c % cols]
+            ax_depth.imshow(
+                d,
+                cmap="turbo",
+                vmin=0,
+                vmax=self.max_depth,
+                interpolation="nearest",
+            )
+            ax_depth.set_title(f"cam {c} depth-only")
+            ax_depth.set_xticks([])
+            ax_depth.set_yticks([])
+
+        # Hide any unused subplots when n doesn't fill the grid.
+        for c in range(self.num_cameras, base_rows * cols):
+            axes[c // cols, c % cols].axis("off")
+            axes[base_rows + (c // cols), c % cols].axis("off")
+            axes[(base_rows * 2) + (c // cols), c % cols].axis("off")
+
+        # Shared depth colorbar with numeric values.
+        depth_mappable = plt.cm.ScalarMappable(cmap="turbo", norm=plt.Normalize(vmin=0, vmax=self.max_depth))
+        depth_mappable.set_array([])
+        cbar = fig.colorbar(depth_mappable, ax=axes, location="right", fraction=0.02, pad=0.02)
+        cbar.set_label("Depth (m)")
+
+        # Advance the counter first so the figure title and the file name carry the same index.
+        self._depth_idx += 1
+        fig.suptitle(f"gt_depths — {self._depth_idx}")
+        fig.tight_layout(rect=[0, 0, 0.96, 0.97])
+        out_path = self.visualize_dir / f"{self._depth_idx:06d}_gt_depths.png"
+        fig.savefig(out_path, dpi=120, bbox_inches="tight")
+        plt.close(fig)
+        print(f"Saved gt_depths visualization to {out_path}")
diff --git a/projects/BEVFusion/bevfusion/ops/__init__.py b/projects/BEVFusion/bevfusion/ops/__init__.py
index e08abbc6d..f74f0edbb 100644
--- a/projects/BEVFusion/bevfusion/ops/__init__.py
+++ b/projects/BEVFusion/bevfusion/ops/__init__.py
@@ -1,4 +1,12 @@
 from .bev_pool import bev_pool
+from .bev_pool_v2 import bev_pool_v2
 from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization
 
-__all__ = ["bev_pool", "Voxelization", "voxelization", "dynamic_scatter", "DynamicScatter"]
+__all__ = [
+    "bev_pool",
+    "bev_pool_v2",
+    "Voxelization",
+    "voxelization",
+    "dynamic_scatter",
+    "DynamicScatter",
+]
diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py
new file mode 100644
index 000000000..ff2fdfff7
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/__init__.py
@@ -0,0 +1,3 @@
+from .bev_pool_v2 import bev_pool_v2
+
+__all__ = ["bev_pool_v2"]
diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py
new file mode 100644
index 000000000..af1ba15de
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/bev_pool_v2.py
@@ -0,0 +1,191 @@
+# Copyright (c) Phigent Robotics. All rights reserved.
+
+import numpy as np
+import torch
+
+from . import bev_pool_v2_ext
+
+
+class QuickCumsumV2TrainingCuda(torch.autograd.Function):
+    r"""Differentiable BEVPoolv2 op for the Lift-Splat-Shoot view transformation; see
+    the `paper <https://arxiv.org/abs/2211.17111>`_. ``forward`` fills a zeroed
+    ``bev_feat_shape`` buffer; ``backward`` regroups the points by ``ranks_feat``.
+    """
+
+    @staticmethod
+    def forward(
+        ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths
+    ):
+        ranks_bev = ranks_bev.contiguous().int()
+        depth = depth.contiguous().float()
+        feat = feat.contiguous().float()
+        ranks_depth = ranks_depth.contiguous().int()
+        ranks_feat = ranks_feat.contiguous().int()
+        interval_lengths = interval_lengths.contiguous().int()
+        interval_starts = interval_starts.contiguous().int()
+
+        out = feat.new_zeros(bev_feat_shape)
+
+        bev_pool_v2_ext.bev_pool_v2_forward(
+            depth,
+            feat,
+            out,
+            ranks_depth,
+            ranks_feat,
+            ranks_bev,
+            interval_lengths,
+            interval_starts,
+        )
+
+        ctx.save_for_backward(ranks_bev, depth, feat, ranks_feat, ranks_depth)
+        return out
+
+    @staticmethod
+    def backward(ctx, out_grad):
+        ranks_bev, depth, feat, ranks_feat, ranks_depth = ctx.saved_tensors
+        # Regroup by ranks_feat: the grad kernel writes one feat_grad row per interval.
+        order = ranks_feat.argsort()
+        ranks_feat, ranks_depth, ranks_bev = ranks_feat[order], ranks_depth[order], ranks_bev[order]
+        kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
+        kept[1:] = ranks_feat[1:] != ranks_feat[:-1]
+        interval_starts_bp = torch.where(kept)[0].int()
+        interval_lengths_bp = torch.zeros_like(interval_starts_bp)
+        interval_lengths_bp[:-1] = interval_starts_bp[1:] - interval_starts_bp[:-1]
+        interval_lengths_bp[-1] = ranks_bev.shape[0] - interval_starts_bp[-1]
+
+        depth = depth.contiguous()
+        feat = feat.contiguous()
+        ranks_depth = ranks_depth.contiguous()
+        ranks_feat = ranks_feat.contiguous()
+        ranks_bev = ranks_bev.contiguous()
+        interval_lengths_bp = interval_lengths_bp.contiguous()
+        interval_starts_bp = interval_starts_bp.contiguous()
+
+        depth_grad = depth.new_zeros(depth.shape)
+        feat_grad = feat.new_zeros(feat.shape)
+        out_grad = out_grad.contiguous()
+        bev_pool_v2_ext.bev_pool_v2_backward(
+            out_grad,
+            depth_grad,
+            feat_grad,
+            depth,
+            feat,
+            ranks_depth,
+            ranks_feat,
+            ranks_bev,
+            interval_lengths_bp,
+            interval_starts_bp,
+        )
+        # One gradient per forward input (8): only depth and feat are differentiable.
+        return depth_grad, feat_grad, None, None, None, None, None, None
+
+
+class QuickCumsumV2Cuda(torch.autograd.Function):
+
+    @staticmethod
+    def symbolic(
+        g,
+        depth,
+        feat,
+        ranks_depth,
+        ranks_feat,
+        ranks_bev,
+        interval_starts,
+        interval_lengths,
+        out_height=128,
+        out_width=128,
+    ):
+        """Emit the ``autoware::QuickCumsumV2Cuda`` ONNX node and return its output value."""
+        x = g.op(
+            "autoware::QuickCumsumV2Cuda",
+            depth,
+            feat,
+            ranks_depth,
+            ranks_feat,
+            ranks_bev,
+            interval_starts,
+            interval_lengths,
+            out_height_i=out_height,
+            out_width_i=out_width,
+        )
+
+        # The exporter connects downstream nodes to the value returned here, so the
+        # node created above must be returned; otherwise the op has no output.
+        # TODO: attach a static output shape to x if the exporter requires one.
+        return x
+
+    @staticmethod
+    def forward(
+        ctx,
+        depth,  # B,N,D,H,W
+        feat,  # B,N,H,W,C
+        ranks_depth,
+        ranks_feat,
+        ranks_bev,
+        interval_starts,
+        interval_lengths,
+        out_height=128,
+        out_width=128,
+    ):
+        """Pool into a zeroed (B, 1, out_height, out_width, C) buffer and return it."""
+        out = feat.new_zeros(depth.shape[0], 1, out_height, out_width, feat.shape[-1])
+        bev_pool_v2_ext.bev_pool_v2_forward(
+            depth,
+            feat,
+            out,
+            ranks_depth,
+            ranks_feat,
+            ranks_bev,
+            interval_lengths,
+            interval_starts,
+        )
+        return out  # the extension writes into `out` in place and returns nothing
+
+    @staticmethod
+    def backward(ctx, out_grad):
+        raise NotImplementedError
+
+
+def bev_pool_v2(
+    depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, bev_feat_shape, is_training
+):
+    """Pool ``feat`` weighted by ``depth`` into a ``bev_feat_shape`` buffer and return it as (B, C, Z, H, W)."""
+    # is_training is ignored: the full (B, Z, H, W, C) buffer is always used; the Z=1 op is ONNX-only.
+    del is_training
+    pooled = QuickCumsumV2TrainingCuda.apply(
+        depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths
+    )
+
+    # Move channels in front of the spatial dims, matching LSSTransform v1 after its permute.
+    return pooled.permute(0, 4, 1, 2, 3).contiguous()
+
+
+def test_bev_pool_v2():
+    """Check the forward sum and both gradients of bev_pool_v2 on a tiny hand-computed case (needs CUDA)."""
+    depth = np.array([0.3, 0.4, 0.2, 0.1, 0.7, 0.6, 0.8, 0.9])
+    depth = torch.from_numpy(depth).float().cuda()
+    depth = depth.view(1, 1, 2, 2, 2).requires_grad_()
+    feat = torch.ones(size=[1, 1, 2, 2, 2], dtype=torch.float, device="cuda").requires_grad_()
+    ranks_depth = torch.from_numpy(np.array([0, 4, 1, 6])).int().cuda()
+    ranks_feat = torch.from_numpy(np.array([0, 0, 1, 2])).int().cuda()
+    ranks_bev = torch.from_numpy(np.array([0, 0, 1, 1])).int().cuda()
+
+    kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
+    kept[1:] = ranks_bev[1:] != ranks_bev[:-1]
+    interval_starts = torch.where(kept)[0].int()
+    interval_lengths = torch.zeros_like(interval_starts)
+    interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
+    interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
+    # Argument order follows bev_pool_v2: intervals first, then the output shape and is_training.
+    bev_feat = bev_pool_v2(
+        depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, (1, 1, 2, 2, 2), True
+    )
+    loss = torch.sum(bev_feat)
+    loss.backward()
+    assert torch.isclose(loss, loss.new_tensor(4.4))
+    grad_depth = np.array([2.0, 2.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0])
+    grad_depth = torch.from_numpy(grad_depth).float()
+    grad_depth = grad_depth.cuda().view(1, 1, 2, 2, 2)
+    assert depth.grad.allclose(grad_depth)
+    grad_feat = np.array([1.0, 1.0, 0.4, 0.4, 0.8, 0.8, 0.0, 0.0])
+    grad_feat = torch.from_numpy(grad_feat).float().cuda().view(1, 1, 2, 2, 2)
+    assert feat.grad.allclose(grad_feat)
diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp
new file mode 100644
index 000000000..c7c38f695
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool.cpp
@@ -0,0 +1,111 @@
+// Copyright (c) Phigent Robotics. All rights reserved.
+// Reference https://arxiv.org/abs/2211.17111
+#include <torch/torch.h>
+#include <c10/cuda/CUDAGuard.h>
+
+// CUDA function declarations
+void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat,
+    const int* ranks_depth, const int* ranks_feat, const int* ranks_bev,
+    const int* interval_starts, const int* interval_lengths, float* out);
+
+void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad,
+  const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat,
+  const int* ranks_bev, const int* interval_starts, const int* interval_lengths,
+  float* depth_grad, float* feat_grad);
+
+
+/*
+  Function: pillar pooling (forward, cuda)
+  Args:
+    depth            : input depth, FloatTensor[b, n, d, h, w]
+    feat             : input features, FloatTensor[b, n, h, w, c]
+    out              : output features, FloatTensor[b, d, h, w, c], written in place
+    ranks_depth      : depth index of points, IntTensor[n_points]
+    ranks_feat       : feat index of points, IntTensor[n_points]
+    ranks_bev        : output index of points, IntTensor[n_points]
+    interval_lengths : how many points in each pooled point, IntTensor[n_intervals]
+    interval_starts  : starting position for pooled point, IntTensor[n_intervals]
+  Return: nothing; raw pointers are used, so inputs are assumed contiguous float/int CUDA tensors
+*/
+void bev_pool_v2_forward(
+  const at::Tensor _depth,
+  const at::Tensor _feat,
+  at::Tensor _out,
+  const at::Tensor _ranks_depth,
+  const at::Tensor _ranks_feat,
+  const at::Tensor _ranks_bev,
+  const at::Tensor _interval_lengths,
+  const at::Tensor _interval_starts
+) {
+  int c = _feat.size(4);  // channel count = last of feat's five dims
+  int n_intervals = _interval_lengths.size(0);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(_depth));
+  const float* depth = _depth.data_ptr<float>();
+  const float* feat = _feat.data_ptr<float>();
+  const int* ranks_depth = _ranks_depth.data_ptr<int>();
+  const int* ranks_feat = _ranks_feat.data_ptr<int>();
+  const int* ranks_bev = _ranks_bev.data_ptr<int>();
+
+  const int* interval_lengths = _interval_lengths.data_ptr<int>();
+  const int* interval_starts = _interval_starts.data_ptr<int>();
+
+  float* out = _out.data_ptr<float>();
+  bev_pool_v2(
+    c, n_intervals, depth, feat, ranks_depth, ranks_feat,
+    ranks_bev, interval_starts, interval_lengths, out
+  );
+}
+
+
+/*
+  Function: pillar pooling (backward, cuda); fills depth_grad and feat_grad in place
+  Args:
+    out_grad         : grad of output bev feature, FloatTensor[b, d, h, w, c]
+    depth_grad       : grad of input depth, FloatTensor[b, n, d, h, w]
+    feat_grad        : grad of input feature, FloatTensor[b, n, h, w, c]
+    depth            : input depth, FloatTensor[b, n, d, h, w]
+    feat             : input features, FloatTensor[b, n, h, w, c]
+    ranks_depth      : depth index of points, IntTensor[n_points]
+    ranks_feat       : feat index of points, IntTensor[n_points]
+    ranks_bev        : output index of points, IntTensor[n_points]
+    interval_lengths : how many points in each pooled point, IntTensor[n_intervals]
+    interval_starts  : starting position for pooled point, IntTensor[n_intervals]
+*/
+void bev_pool_v2_backward(
+  const at::Tensor _out_grad,
+  at::Tensor _depth_grad,
+  at::Tensor _feat_grad,
+  const at::Tensor _depth,
+  const at::Tensor _feat,
+  const at::Tensor _ranks_depth,
+  const at::Tensor _ranks_feat,
+  const at::Tensor _ranks_bev,
+  const at::Tensor _interval_lengths,
+  const at::Tensor _interval_starts
+) {
+  int c = _out_grad.size(4);  // channel count = last of out_grad's five dims
+  int n_intervals = _interval_lengths.size(0);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad));
+  const float* out_grad = _out_grad.data_ptr<float>();
+  float* depth_grad = _depth_grad.data_ptr<float>();
+  float* feat_grad = _feat_grad.data_ptr<float>();
+  const float* depth = _depth.data_ptr<float>();
+  const float* feat = _feat.data_ptr<float>();
+  const int* ranks_depth = _ranks_depth.data_ptr<int>();
+  const int* ranks_feat = _ranks_feat.data_ptr<int>();
+  const int* ranks_bev = _ranks_bev.data_ptr<int>();
+  const int* interval_lengths = _interval_lengths.data_ptr<int>();
+  const int* interval_starts = _interval_starts.data_ptr<int>();
+
+  bev_pool_v2_grad(
+    c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat,
+    ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad
+  );
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("bev_pool_v2_forward", &bev_pool_v2_forward,
+        "bev_pool_v2_forward");
+  m.def("bev_pool_v2_backward", &bev_pool_v2_backward,
+        "bev_pool_v2_backward");
+}
diff --git a/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu
new file mode 100644
index 000000000..7fa3179b7
--- /dev/null
+++ b/projects/BEVFusion/bevfusion/ops/bev_pool_v2/src/bev_pool_cuda.cu
@@ -0,0 +1,140 @@
+// Copyright (c) Phigent Robotics. All rights reserved.
+// Reference https://arxiv.org/abs/2211.17111
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+  Function: pillar pooling; one thread per (interval, channel) pair
+  Args:
+    c                : number of channels
+    n_intervals      : number of intervals (pooled points)
+    depth            : input depth, FloatTensor[b,n,d,h,w]
+    feat             : input feat, FloatTensor[b,n,h,w,c]
+    ranks_depth      : input index of depth, IntTensor[n]
+    ranks_feat       : input index of feat, IntTensor[n]
+    ranks_bev        : output index, IntTensor[n]
+    interval_lengths : how many points in each pooled point, IntTensor[n_intervals]
+    interval_starts  : starting position for pooled point, IntTensor[n_intervals]
+    out              : output features, FloatTensor[b, d, h, w, c]
+*/
+__global__ void bev_pool_v2_kernel(int c, int n_intervals,
+                                  const float *__restrict__ depth,
+                                  const float *__restrict__ feat,
+                                  const int *__restrict__ ranks_depth,
+                                  const int *__restrict__ ranks_feat,
+                                  const int *__restrict__ ranks_bev,
+                                  const int *__restrict__ interval_starts,
+                                  const int *__restrict__ interval_lengths,
+                                  float* __restrict__ out) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int index = idx / c;  // interval handled by this thread
+  int cur_c = idx % c;  // channel handled by this thread
+  if (index >= n_intervals) return;
+  int interval_start = interval_starts[index];
+  int interval_length = interval_lengths[index];
+  float psum = 0;
+  const float* cur_depth;
+  const float* cur_feat;
+  for(int i = 0; i < interval_length; i++){
+    cur_depth = depth + ranks_depth[interval_start+i];
+    cur_feat = feat + ranks_feat[interval_start+i] * c + cur_c;
+    psum += *cur_feat * *cur_depth;
+  }
+
+  const int* cur_rank = ranks_bev + interval_start;  // output cell is taken from the interval's first point
+  float* cur_out = out + *cur_rank * c + cur_c;
+  *cur_out = psum;  // plain store, not an accumulation
+}
+
+
+/*
+  Function: pillar pooling backward; one thread per interval
+  Args:
+    c                : number of channels
+    n_intervals      : number of intervals; each one yields a single feat_grad row
+    out_grad         : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c]
+    depth            : input depth, FloatTensor[b,n,d,h,w]
+    feat             : input feat, FloatTensor[b,n,h,w,c]
+    ranks_depth      : input index of depth, IntTensor[n]
+    ranks_feat       : input index of feat, IntTensor[n]
+    ranks_bev        : output index, IntTensor[n]
+    interval_lengths : how many points in each pooled point, IntTensor[n_intervals]
+    interval_starts  : starting position for pooled point, IntTensor[n_intervals]
+    depth_grad       : gradient of the depth fmap, FloatTensor
+    feat_grad        : gradient of the feature fmap, FloatTensor
+*/
+__global__ void bev_pool_grad_kernel(int c, int n_intervals,
+                                  const float *__restrict__ out_grad,
+                                  const float *__restrict__ depth,
+                                  const float *__restrict__ feat,
+                                  const int *__restrict__ ranks_depth,
+                                  const int *__restrict__ ranks_feat,
+                                  const int *__restrict__ ranks_bev,
+                                  const int *__restrict__ interval_starts,
+                                  const int *__restrict__ interval_lengths,
+                                  float* __restrict__ depth_grad,
+                                  float* __restrict__ feat_grad) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= n_intervals) return;
+  int interval_start = interval_starts[idx];
+  int interval_length = interval_lengths[idx];
+
+  const int* cur_rank;
+  const float* cur_out_grad;
+  const float* cur_out_grad_start;
+
+  const float* cur_feat;
+  const float* cur_feat_start;
+  float* cur_depth_grad;
+  float grad_sum;
+  for(int i = 0; i < interval_length; i++){
+    cur_rank = ranks_bev + interval_start + i;
+    cur_out_grad_start = out_grad +  * cur_rank * c;
+    cur_feat_start = feat + ranks_feat[interval_start+i] * c;
+
+    grad_sum = 0;
+    for(int cur_c = 0; cur_c < c; cur_c++){
+      cur_out_grad = cur_out_grad_start + cur_c;
+      cur_feat = cur_feat_start + cur_c;
+      grad_sum += *cur_out_grad * *cur_feat;
+    }
+
+    cur_depth_grad = depth_grad + ranks_depth[interval_start+i];
+    *cur_depth_grad = grad_sum;  // plain store, not an accumulation
+  }
+
+  float* cur_feat_grad;
+  const float* cur_depth;
+  for(int cur_c = 0; cur_c < c; cur_c++){
+    grad_sum = 0;
+    for(int i = 0; i < interval_length; i++){
+      cur_rank = ranks_bev + interval_start + i;
+      cur_out_grad = out_grad + *cur_rank * c + cur_c;
+
+      cur_depth = depth + ranks_depth[interval_start+i];
+      grad_sum += *cur_out_grad * *cur_depth;
+    }
+    cur_feat_grad = feat_grad + ranks_feat[interval_start] * c + cur_c ;  // row of the interval's first point
+    * cur_feat_grad = grad_sum;
+  }
+}
+
+
+
+void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat, const int* ranks_depth,
+  const int* ranks_feat, const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* out) {
+  if (n_intervals <= 0 || c <= 0) return;  // nothing to pool; a zero-block grid is an invalid launch configuration
+  const int n_blocks = (int)(((long long)n_intervals * c + 255) / 256);  // one thread per (interval, channel)
+  bev_pool_v2_kernel<<<n_blocks, 256>>>(c, n_intervals, depth, feat, ranks_depth, ranks_feat,
+    ranks_bev, interval_starts, interval_lengths, out);
+}
+
+void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad,
+  const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat,
+  const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* depth_grad, float* feat_grad) {
+  if (n_intervals <= 0) return;  // no gradients to write; a zero-block grid is an invalid launch configuration
+  const int n_blocks = (int)(((long long)n_intervals + 255) / 256);  // one thread per interval
+  bev_pool_grad_kernel<<<n_blocks, 256>>>(c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat,
+     ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad);
+}
diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py
index 7e9faca24..31d0cc417 100644
--- a/projects/BEVFusion/bevfusion/transforms_3d.py
+++ b/projects/BEVFusion/bevfusion/transforms_3d.py
@@ -188,6 +188,19 @@ def transform(self, input_dict: dict) -> dict:
         return input_dict
 
 
+@TRANSFORMS.register_module()
+class BEVFusionRemoveLiDARPoints(BaseTransform):
+    """Blank out the LiDAR point cloud by setting ``results["points"]`` to ``None``."""
+
+    def transform(self, results: Dict[str, Any]) -> Dict[str, Any]:
+        """Return ``results`` with an existing ``"points"`` entry replaced by ``None``."""
+        # The key itself is kept; it is only touched when it is already present.
+        lidar_key = "points"
+        if lidar_key in results:
+            results[lidar_key] = None
+        return results
+
+
 @TRANSFORMS.register_module()
 class GridMask(BaseTransform):
 
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb32_j6gen2_base_50m.py
similarity index 75%
rename from projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py
rename to projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb32_j6gen2_base_50m.py
index e65c52ece..0a416c9fd 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_30e_8xb32_j6gen2_base_50m.py
@@ -1,9 +1,8 @@
 _base_ = [
     "../../../../../autoware_ml/configs/detection3d/default_runtime.py",
     "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py",
-    "../default/pipelines/default_camera_lidar_intensity_120m.py",
-    "../default/models/default_camera_swin_fpn_120m.py",
-    "../default/schedulers/default_30e_8xb8_adamw_linear_cosine.py",
+    "../default/pipelines/cameras/default_camera_50m.py",
+    "../default/schedulers/default_30e_8xb32_adamw_linear_cosine.py",
     "../default/default_misc.py",
 ]
 
@@ -13,35 +12,7 @@
 
 # user setting
 data_root = "data/t4dataset/"
-info_directory_path = "info/user_name/"
-
-experiment_group_name = "bevfusion_camera/j6gen2_base/" + _base_.dataset_type
-experiment_name = "bevfusion_camera_swin_fpn_30e_8xb8_j6gen2_base_120m"
-work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
-
-# model parameter
-model = dict(
-    type="BEVFusion",
-    view_transform=dict(image_size=_base_.image_size),
-    bbox_head=dict(
-        class_names=_base_.class_names,
-        in_channels=80,
-        train_cfg=dict(
-            point_cloud_range=_base_.point_cloud_range,
-            grid_size=_base_.grid_size,
-            voxel_size=_base_.voxel_size,
-        ),
-        test_cfg=dict(
-            grid_size=_base_.grid_size,
-            voxel_size=_base_.voxel_size[0:2],
-            pc_range=_base_.point_cloud_range[0:2],
-        ),
-        bbox_coder=dict(
-            pc_range=_base_.point_cloud_range[0:2],
-            voxel_size=_base_.voxel_size[0:2],
-        ),
-    ),
-)
+info_directory_path = "info/kokseang_2_9_0/"
 
 # Dataset parameters
 train_dataloader = dict(
@@ -82,6 +53,7 @@
         test_mode=True,
         box_type_3d="LiDAR",
         backend_args=_base_.backend_args,
+        filter_cfg=_base_.filter_cfg,
     ),
 )
 
@@ -102,6 +74,7 @@
         test_mode=True,
         box_type_3d="LiDAR",
         backend_args=_base_.backend_args,
+        filter_cfg=_base_.filter_cfg,
     ),
 )
 
@@ -135,3 +108,5 @@
     checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"),
 )
 log_processor = dict(window_size=50)
+
+load_from = "work_dirs/bevfusion_camera_2_8_0/gen2_base/T4Dataset/bevfusion_camera_resnet50_fpn_lss_depthaware_50e_8xb32_gen2_base_50m/best_epoch_46.pth"
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb32_gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb32_gen2_base_50m.py
new file mode 100644
index 000000000..f9bc419f9
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/default_bevfusion_camera_50e_8xb32_gen2_base_50m.py
@@ -0,0 +1,110 @@
+_base_ = [
+    "../../../../../autoware_ml/configs/detection3d/default_runtime.py",
+    "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/gen2_base.py",
+    "../default/pipelines/cameras/default_camera_50m.py",
+    "../default/schedulers/default_50e_8xb32_adamw_linear_cosine.py",
+    "../default/default_misc.py",
+]
+
+custom_imports = dict(imports=["projects.BEVFusion.bevfusion"], allow_failed_imports=False)
+custom_imports["imports"] += _base_.custom_imports["imports"]
+custom_imports["imports"] += ["autoware_ml.detection3d.datasets.transforms"]
+
+# user setting
+data_root = "data/t4dataset/"
+info_directory_path = "info/kokseang_2_9_0/"
+
+# Dataset parameters
+train_dataloader = dict(
+    batch_size=_base_.train_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=True),
+    dataset=dict(
+        type=_base_.dataset_type,
+        pipeline=_base_.train_pipeline,
+        modality=_base_.input_modality,
+        backend_args=_base_.backend_args,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_train_file_name,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        test_mode=False,
+        data_prefix=_base_.data_prefix,
+        box_type_3d="LiDAR",
+        filter_cfg=_base_.filter_cfg,
+    ),
+)
+
+val_dataloader = dict(
+    batch_size=_base_.test_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=False),
+    dataset=dict(
+        type=_base_.dataset_type,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_val_file_name,
+        pipeline=_base_.test_pipeline,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        modality=_base_.input_modality,
+        data_prefix=_base_.data_prefix,
+        test_mode=True,
+        box_type_3d="LiDAR",
+        backend_args=_base_.backend_args,
+        filter_cfg=_base_.filter_cfg,
+    ),
+)
+
+test_dataloader = dict(
+    batch_size=_base_.test_batch_size,
+    num_workers=_base_.num_workers,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=False),
+    dataset=dict(
+        type=_base_.dataset_type,
+        data_root=data_root,
+        ann_file=info_directory_path + _base_.info_test_file_name,
+        pipeline=_base_.test_pipeline,
+        metainfo=_base_.metainfo,
+        class_names=_base_.class_names,
+        modality=_base_.input_modality,
+        data_prefix=_base_.data_prefix,
+        test_mode=True,
+        box_type_3d="LiDAR",
+        backend_args=_base_.backend_args,
+        filter_cfg=_base_.filter_cfg,
+    ),
+)
+
+val_evaluator = dict(
+    type="T4Metric",
+    data_root=data_root,
+    ann_file=data_root + info_directory_path + _base_.info_val_file_name,
+    metric="bbox",
+    backend_args=_base_.backend_args,
+    class_names=_base_.class_names,
+    name_mapping=_base_.name_mapping,
+    eval_class_range=_base_.eval_class_range,
+    filter_attributes=_base_.filter_attributes,
+)
+
+test_evaluator = dict(
+    type="T4Metric",
+    data_root=data_root,
+    ann_file=data_root + info_directory_path + _base_.info_test_file_name,
+    metric="bbox",
+    backend_args=_base_.backend_args,
+    class_names=_base_.class_names,
+    name_mapping=_base_.name_mapping,
+    eval_class_range=_base_.eval_class_range,
+    filter_attributes=_base_.filter_attributes,
+    save_csv=True,
+)
+
+default_hooks = dict(
+    logger=dict(type="LoggerHook", interval=50),
+    checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"),
+)
+log_processor = dict(window_size=50)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py
new file mode 100644
index 000000000..944da6470
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py
@@ -0,0 +1,31 @@
+_base_ = [
+    "../default_bevfusion_camera_30e_8xb32_j6gen2_base_50m.py",
+    "../../default/models/resnet50/camera_resnet50_fpn_lss_depthaware_50m.py",
+]
+
+experiment_group_name = "bevfusion_camera/j6gen2_base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_resnet50_fpn_lss_v2_depthaware_30e_8xb32_j6gen2_base_50m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m_t4metric_v2.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m_t4metric_v2.py
new file mode 100644
index 000000000..cb0e50306
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m_t4metric_v2.py
@@ -0,0 +1,94 @@
+_base_ = [
+    "./bevfusion_camera_resnet50_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py",
+]
+
+experiment_name = "bevfusion_camera_resnet50_fpn_lss_v2_depthaware_30e_8xb32_j6gen2_base_50m_t4metric_v2"
+work_dir = "work_dirs/" + _base_.experiment_group_name + "/" + experiment_name
+
+# Add evaluator configs
+evaluator_metric_configs = dict(
+    evaluation_task="detection",
+    target_labels=_base_.class_names,
+    center_distance_bev_thresholds=[0.5, 1.0, 2.0, 4.0],
+    # plane_distance_thresholds is required for the pass fail evaluation
+    plane_distance_thresholds=[2.0, 4.0],
+    iou_2d_thresholds=None,
+    iou_3d_thresholds=None,
+    label_prefix="autoware",
+    # bev minimum distance ranges for each range bucket, must be the same length as max_distance,
+    # they will form bev distance ranges in [(min_distance[0], max_distance[0]), (min_distance[1], max_distance[1]), ...] when filtering
+    min_distance=[0.0],
+    # bev maximum distance ranges for each range bucket, must be the same length as min_distance
+    max_distance=[51.2],
+    min_point_numbers=0,
+    matching_class_agnostic_fps=False,
+)
+
+perception_evaluator_configs = dict(
+    dataset_paths=_base_.data_root,
+    frame_id="base_link",
+    evaluation_config_dict=evaluator_metric_configs,
+    load_raw_data=False,
+)
+
+
+frame_pass_fail_config = dict(
+    target_labels=_base_.class_names,
+    # Matching thresholds per class (must align with `plane_distance_thresholds` used in evaluation)
+    matching_threshold_list=[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
+    confidence_threshold_list=None,
+)
+
+training_statistics_parquet_path = (
+    _base_.data_root + _base_.info_directory_path + _base_.info_train_statistics_file_name
+)
+testing_statistics_parquet_path = _base_.data_root + _base_.info_directory_path + _base_.info_test_statistics_file_name
+validation_statistics_parquet_path = (
+    _base_.data_root + _base_.info_directory_path + _base_.info_val_statistics_file_name
+)
+
+val_evaluator = dict(
+    _delete_=True,
+    type="T4MetricV2",
+    data_root=_base_.data_root,
+    ann_file=_base_.data_root + _base_.info_directory_path + _base_.info_val_file_name,
+    training_statistics_parquet_path=training_statistics_parquet_path,
+    testing_statistics_parquet_path=testing_statistics_parquet_path,
+    validation_statistics_parquet_path=validation_statistics_parquet_path,
+    output_dir="validation",
+    dataset_name="j6gen2_base",
+    perception_evaluator_configs=perception_evaluator_configs,
+    critical_object_filter_config=None,
+    frame_pass_fail_config=frame_pass_fail_config,
+    num_workers=64,
+    scene_batch_size=-1,
+    write_metric_summary=False,
+    class_names={{_base_.class_names}},
+    name_mapping={{_base_.name_mapping}},
+    experiment_name=experiment_name,
+    experiment_group_name=_base_.experiment_group_name,
+    min_num_points=2,
+)
+
+test_evaluator = dict(
+    _delete_=True,
+    type="T4MetricV2",
+    data_root=_base_.data_root,
+    ann_file=_base_.data_root + _base_.info_directory_path + _base_.info_test_file_name,
+    training_statistics_parquet_path=training_statistics_parquet_path,
+    testing_statistics_parquet_path=testing_statistics_parquet_path,
+    validation_statistics_parquet_path=validation_statistics_parquet_path,
+    output_dir="testing",
+    dataset_name="j6gen2_base",
+    perception_evaluator_configs=perception_evaluator_configs,
+    critical_object_filter_config=None,
+    frame_pass_fail_config=frame_pass_fail_config,
+    num_workers=64,
+    scene_batch_size=-1,
+    write_metric_summary=True,
+    class_names={{_base_.class_names}},
+    name_mapping={{_base_.name_mapping}},
+    experiment_name=experiment_name,
+    experiment_group_name=_base_.experiment_group_name,
+    min_num_points=2,
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py
new file mode 100644
index 000000000..769f738cd
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/resnet50/bevfusion_camera_resnet50_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py
@@ -0,0 +1,31 @@
+_base_ = [
+    "../default_bevfusion_camera_50e_8xb32_gen2_base_50m.py",
+    "../../default/models/resnet50/camera_resnet50_fpn_lss_depthaware_50m.py",
+]
+
+experiment_group_name = "bevfusion_camera/gen2_base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_resnet50_fpn_lss_depthaware_50e_8xb32_gen2_base_50m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py
new file mode 100644
index 000000000..650f7b835
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_30e_8xb32_j6gen2_base_50m.py
@@ -0,0 +1,32 @@
+_base_ = [
+    "../default_bevfusion_camera_30e_8xb32_j6gen2_base_50m.py",
+    "../../default/models/swin_transformer/camera_swin_fpn_lss_depthaware_50m.py",
+]
+
+experiment_group_name = "bevfusion_camera/j6gen2_base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_swin_fpn_lss_30e_8xb32_j6gen2_base_50m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        in_channels=80,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py
new file mode 100644
index 000000000..fcbd79355
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-C/swin_transformer/bevfusion_camera_swin_fpn_lss_depthaware_50e_8xb32_gen2_base_50m.py
@@ -0,0 +1,32 @@
+_base_ = [
+    "../default_bevfusion_camera_50e_8xb32_gen2_base_50m.py",  # 50-epoch default, matching this config's 50e name
+    "../../default/models/swin_transformer/camera_swin_fpn_lss_depthaware_50m.py",
+]
+
+experiment_group_name = "bevfusion_camera/gen2_base/" + _base_.dataset_type
+experiment_name = "bevfusion_camera_swin_fpn_lss_50e_8xb32_gen2_base_50m"
+work_dir = "work_dirs/" + experiment_group_name + "/" + experiment_name
+
+# model parameter
+model = dict(
+    type="BEVFusion",
+    view_transform=dict(image_size=_base_.image_size),
+    bbox_head=dict(
+        class_names=_base_.class_names,
+        in_channels=80,
+        train_cfg=dict(
+            point_cloud_range=_base_.point_cloud_range,
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size,
+        ),
+        test_cfg=dict(
+            grid_size=_base_.grid_size,
+            voxel_size=_base_.voxel_size[0:2],
+            pc_range=_base_.point_cloud_range[0:2],
+        ),
+        bbox_coder=dict(
+            pc_range=_base_.point_cloud_range[0:2],
+            voxel_size=_base_.voxel_size[0:2],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py
index 4f81af760..a93b1d435 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_j6gen2_base_120m.py
@@ -2,8 +2,8 @@
     "../../../../../autoware_ml/configs/detection3d/default_runtime.py",
     "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/j6gen2_base.py",
     "../default/pipelines/default_camera_lidar_intensity_120m.py",
-    "../default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py",
-    "../default/schedulers/default_20e_8xb8_adamw_linear_cosine.py",
+    "../default/models/default_camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py",
+    "../default/schedulers/default_20e_8xb16_adamw_linear_cosine.py",
     "../default/default_misc.py",
 ]
 
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py
index 20c85b1d8..b8408956b 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_20e_8xb8_jpntaxi_base_120m.py
@@ -2,8 +2,8 @@
     "../../../../../autoware_ml/configs/detection3d/default_runtime.py",
     "../../../../../autoware_ml/configs/detection3d/dataset/t4dataset/jpntaxi_base.py",
     "../default/pipelines/default_camera_lidar_intensity_120m.py",
-    "../default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py",
-    "../default/schedulers/default_20e_8xb8_adamw_linear_cosine.py",
+    "../default/models/default_camera_swin_fpn_depthlss_lidar_second_secfpn_120m.py",
+    "../default/schedulers/default_20e_8xb16_adamw_linear_cosine.py",
     "../default/default_misc.py",
 ]
 
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py
index 3edd06c92..c1b0bdaae 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m.py
@@ -13,7 +13,7 @@
 
 # user setting
 data_root = "data/t4dataset/"
-info_directory_path = "info/user_name/"
+info_directory_path = "info/user_name/"  # placeholder; point this at your own info directory
 
 experiment_group_name = "bevfusion_lidar_intensity/j6gen2_base/" + _base_.dataset_type
 experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_j6gen2_base_120m"
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py
index 05947c2fd..040e18c58 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m.py
@@ -13,7 +13,7 @@
 
 # user setting
 data_root = "data/t4dataset/"
-info_directory_path = "info/user_name/"
+info_directory_path = "info/user_name/"  # placeholder; point this at your own info directory
 
 experiment_group_name = "bevfusion_lidar_intensity/jpntaxi_base/" + _base_.dataset_type
 experiment_name = "lidar_voxel_second_secfpn_30e_8xb16_jpntaxi_base_120m"
@@ -165,4 +165,6 @@
 )
 log_processor = dict(window_size=50)
 
-load_from = None
+load_from = (
+    "work_dirs/bevfusion_lidar_2_8_0/base/T4Dataset/lidar_voxel_second_secfpn_50e_8xb16_base_120m/best_epoch_47.pth"
+)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py
index 7716a1508..2099bacca 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-L/bevfusion_lidar_voxel_second_secfpn_50e_8xb16_base_120m.py
@@ -13,7 +13,7 @@
 
 # user setting
 data_root = "data/t4dataset/"
-info_directory_path = "info/user_name/"
+info_directory_path = "info/user_name/"  # placeholder; point this at your own info directory
 
 experiment_group_name = "bevfusion_lidar/base/" + _base_.dataset_type
 experiment_name = "lidar_voxel_second_secfpn_50e_8xb16_base_120m"
@@ -152,3 +152,5 @@
     checkpoint=dict(type="CheckpointHook", interval=1, max_keep_ckpts=3, save_best="NuScenes metric/T4Metric/mAP"),
 )
 log_processor = dict(window_size=50)
+
+resume = True
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py
new file mode 100644
index 000000000..e90687fe3
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/models/default_lidar_second_secfpn_120m_iou_loss.py
@@ -0,0 +1,10 @@
+_base_ = [
+    "./default_lidar_second_secfpn_120m.py",
+]
+
+model = dict(
+    bbox_head=dict(
+        common_heads=dict(center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2], iou=[1, 2]),
+        loss_iou=dict(type="mmdet.L1Loss", reduction="mean", loss_weight=1.0),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py
similarity index 52%
rename from projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py
rename to projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py
index c4097de3d..c807668a3 100644
--- a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_lidar_second_secfpn_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_depthlss_120m.py
@@ -1,9 +1,15 @@
 _base_ = [
-    "./default_lidar_second_secfpn_120m.py",
+    "../default_lidar_second_secfpn_120m.py",
 ]
 
 # Image network
 model = dict(
+    # Remove all lidar related configs
+    voxelize_cfg=None,
+    pts_voxel_encoder=None,
+    pts_middle_encoder=None,
+    pts_neck=None,
+    pts_backbone=None,
     data_preprocessor=dict(
         type="Det3DDataPreprocessor",
         pad_size_divisor=32,
@@ -13,34 +19,26 @@
         rgb_to_bgr=False,
     ),
     img_backbone=dict(
-        type="mmdet.SwinTransformer",
-        pretrain_img_size=(256, 704),
-        embed_dims=96,
-        depths=[2, 2, 6, 2],
-        num_heads=[3, 6, 12, 24],
-        window_size=7,
-        mlp_ratio=4,
-        qkv_bias=True,
-        qk_scale=None,
-        drop_rate=0.0,
-        attn_drop_rate=0.0,
-        drop_path_rate=0.2,
-        patch_norm=True,
-        out_indices=[1, 2, 3],
-        with_cp=False,
-        convert_weights=True,
+        type="mmdet.ResNet",
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type="BN2d", requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style="pytorch",
         init_cfg=dict(
             type="Pretrained",
-            # https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/swint-nuimages-pretrained.pth
-            checkpoint="work_dirs/swin_transformer/swint_nuimages_pretrained.pth",  # noqa: E251
+            checkpoint="work_dirs/resnet50/mmdet_resnet50-19c8e357.pth",  # noqa: E251
         ),
     ),
     img_neck=dict(
         type="GeneralizedLSSFPN",
-        in_channels=[192, 384, 768],
+        in_channels=[512, 1024, 2048],
         out_channels=256,
         start_level=0,
-        num_outs=3,
+        num_outs=2,
         norm_cfg=dict(type="BN2d", requires_grad=True),
         act_cfg=dict(type="ReLU", inplace=True),
         upsample_cfg=dict(mode="bilinear", align_corners=False),
@@ -56,5 +54,7 @@
         dbound=[1.0, 130, 1.0],
         downsample=2,
     ),
-    fusion_layer=dict(type="ConvFuser", in_channels=[80, 256], out_channels=256, kernel_size=5, padding=2),
+    bbox_head=dict(
+        in_channels=80,
+    ),
 )
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depthaware_50m.py b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depthaware_50m.py
new file mode 100644
index 000000000..89c35aca9
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/models/resnet50/camera_resnet50_fpn_lss_depthaware_50m.py
@@ -0,0 +1,25 @@
+_base_ = [
+    "./camera_resnet50_fpn_depthlss_120m.py",
+]
+num_proposals = 200
+
+# Image network
+model = dict(
+    depth_gt_downsample=8,
+    loss_depth_weight=1.0,
+    view_transform=dict(
+        type="LSSTransformV2DepthAware",
+        xbound=[-54.0, 54.0, 0.3],
+        ybound=[-54.0, 54.0, 0.3],
+        zbound=[-10.0, 10.0, 20.0],
+        dbound=[1.0, 60, 0.5],
+        downsample=2,
+        camera_depth_aware_configs=dict(mlp_drop_out=0.0, downsample=8, num_camera_depth_parameters=27),
+    ),
+    bbox_head=dict(
+        num_proposals=num_proposals,
+        bbox_coder=dict(
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py
similarity index 97%
rename from projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py
rename to projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py
index c4b0cd9ab..88e74efc7 100644
--- a/projects/BEVFusion/configs/t4dataset/default/models/default_camera_swin_fpn_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_depthlss_120m.py
@@ -1,5 +1,5 @@
 _base_ = [
-    "./default_lidar_second_secfpn_120m.py",
+    "../default_lidar_second_secfpn_120m.py",
 ]
 
 # Image network
diff --git a/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_depthaware_50m.py b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_depthaware_50m.py
new file mode 100644
index 000000000..09b317343
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/models/swin_transformer/camera_swin_fpn_lss_depthaware_50m.py
@@ -0,0 +1,21 @@
+_base_ = [
+    "./camera_swin_fpn_depthlss_120m.py",  # sibling base config in this directory (no "default_" prefix)
+]
+
+# Image network
+model = dict(
+    view_transform=dict(
+        type="LSSTransformV2DepthAware",
+        xbound=[-54.0, 54.0, 0.3],
+        ybound=[-54.0, 54.0, 0.3],
+        zbound=[-10.0, 10.0, 20.0],
+        dbound=[1.0, 60, 0.5],
+        downsample=2,
+        camera_depth_aware_configs=dict(mlp_drop_out=0.0, downsample=8, num_camera_depth_parameters=27),
+    ),
+    bbox_head=dict(
+        bbox_coder=dict(
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+        ),
+    ),
+)
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py
new file mode 100644
index 000000000..ec41f5012
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_120m.py
@@ -0,0 +1,175 @@
+## This config is for the camera-only ("camera_base") model, which does not use LiDAR points; the pipelines below still load them.
+
+_base_ = [
+    "../default_lidar_120m.py",
+]
+input_modality = dict(use_lidar=True, use_camera=True)
+
+# Image parameters
+image_size = [384, 768]  # Height, Width
+camera_orders = {
+    "J6_erga_Gen2": ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"],
+    "J6_x2_Gen2": ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"],
+    "JPNTaxi_xx1_Gen2": [
+        "CAM_FRONT_WIDE",
+        "CAM_FRONT_LEFT_WIDE",
+        "CAM_BACK_LEFT_WIDE",
+        "CAM_FRONT_RIGHT_WIDE",
+        "CAM_BACK_RIGHT_WIDE",
+    ],
+    "JPNTaxi_solio_Gen2": [
+        "CAM_FRONT_WIDE",
+        "CAM_FRONT_LEFT_WIDE",
+        "CAM_BACK_LEFT_WIDE",
+        "CAM_FRONT_RIGHT_WIDE",
+        "CAM_BACK_RIGHT_WIDE",
+    ],
+}
+
+train_pipeline = [
+    dict(
+        type="BEVLoadMultiViewImageFromFiles",
+        to_float32=True,
+        color_type="color",
+        backend_args=_base_.backend_args,
+        camera_orders=camera_orders,  # the camera_orders mapping defined above, same as in test_pipeline
+    ),
+    # We keep loading LiDAR points to make downstream BEV augmentation easier
+    dict(
+        type="LoadPointsFromFile",
+        coord_type="LIDAR",
+        load_dim=_base_.point_load_dim,
+        use_dim=_base_.point_load_dim,
+        backend_args=_base_.backend_args,
+    ),
+    dict(
+        type="PointsToMultiViewImageDepths",
+        img_shape=image_size,
+        num_cameras=5,
+        depth_bounds=[1.0, 120.0],
+        # visualize_dir="work_dirs/visualize_depths_6",
+    ),
+    dict(
+        type="LoadPointsFromMultiSweeps",
+        sweeps_num=_base_.sweeps_num,
+        load_dim=_base_.point_load_dim,
+        use_dim=_base_.lidar_sweep_dims,
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=_base_.backend_args,
+        test_mode=False,
+    ),
+    dict(type="PointsRangeFilter", point_cloud_range=_base_.point_cloud_range),
+    dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(
+        type="ImageAug3D",
+        final_dim=image_size,
+        resize_lim=[0.28, 0.40],
+        bot_pct_lim=[0.0, 0.0],
+        rot_lim=[0.0, 0.0],
+        rand_flip=True,
+        is_train=True,
+    ),
+    dict(
+        type="BEVFusionGlobalRotScaleTrans",
+        scale_ratio_range=[0.95, 1.05],
+        rot_range=[-0.78539816, 0.78539816],
+        translation_std=[0.5, 0.5, 0.2],
+    ),
+    dict(type="BEVFusionRandomFlip3D"),
+    dict(type="ObjectRangeFilter", point_cloud_range=_base_.point_cloud_range),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=3),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=2),
+    dict(
+        type="ObjectNameFilter",
+        classes=[
+            "car",
+            "truck",
+            "bus",
+            "bicycle",
+            "pedestrian",
+            "traffic_cone",
+            "barrier",
+        ],
+    ),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "transformation_3d_flow",
+            "pcd_rotation",
+            "pcd_scale_factor",
+            "pcd_trans",
+            "img_aug_matrix",
+            "lidar_aug_matrix",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+            "gt_depths",
+        ],
+    ),
+]
+
+test_pipeline = [
+    dict(
+        type="BEVLoadMultiViewImageFromFiles",
+        to_float32=True,
+        color_type="color",
+        backend_args=_base_.backend_args,
+        camera_orders=camera_orders,
+    ),
+    dict(
+        type="LoadPointsFromFile",
+        coord_type="LIDAR",
+        load_dim=_base_.point_load_dim,
+        use_dim=_base_.point_load_dim,
+        backend_args=_base_.backend_args,
+    ),
+    dict(
+        type="ImageAug3D",
+        final_dim=image_size,
+        resize_lim=[0.34, 0.34],
+        bot_pct_lim=[0.0, 0.0],
+        rot_lim=[0.0, 0.0],
+        rand_flip=False,
+        is_train=False,
+    ),
+    dict(type="ObjectRangeFilter", point_cloud_range=_base_.point_cloud_range),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "num_pts_feats",
+            "num_views",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+        ],
+    ),
+]
+
+filter_cfg = dict(filter_frames_with_camera_orders=camera_orders)
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py
new file mode 100644
index 000000000..492ad7866
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/cameras/default_camera_50m.py
@@ -0,0 +1,168 @@
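+## This config is for the camera-only ("camera_base") model, which does not use LiDAR points; the pipelines below still load them, and the train pipeline drops them via BEVFusionRemoveLiDARPoints.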
+
+_base_ = [
+    "../default_lidar_50m.py",
+]
+input_modality = dict(use_lidar=True, use_camera=True)
+
+# Image parameters
+image_size = [384, 768]  # Height, Width
+camera_orders = {
+    "J6_erga_Gen2": ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"],
+    "J6_x2_Gen2": ["CAM_FRONT", "CAM_FRONT_LEFT", "CAM_BACK_LEFT", "CAM_FRONT_RIGHT", "CAM_BACK_RIGHT"],
+    "JPNTaxi_xx1_Gen2": [
+        "CAM_FRONT_WIDE",
+        "CAM_FRONT_LEFT_WIDE",
+        "CAM_BACK_LEFT_WIDE",
+        "CAM_FRONT_RIGHT_WIDE",
+        "CAM_BACK_RIGHT_WIDE",
+    ],
+    "JPNTaxi_solio_Gen2": [
+        "CAM_FRONT_WIDE",
+        "CAM_FRONT_LEFT_WIDE",
+        "CAM_BACK_LEFT_WIDE",
+        "CAM_FRONT_RIGHT_WIDE",
+        "CAM_BACK_RIGHT_WIDE",
+    ],
+}
+
+train_pipeline = [
+    dict(
+        type="BEVLoadMultiViewImageFromFiles",
+        to_float32=True,
+        color_type="color",
+        backend_args=_base_.backend_args,
+        camera_orders=camera_orders,
+    ),
+    # We keep loading LiDAR points to make downstream BEV augmentation easier
+    dict(
+        type="LoadPointsFromFile",
+        coord_type="LIDAR",
+        load_dim=_base_.point_load_dim,
+        use_dim=_base_.point_load_dim,
+        backend_args=_base_.backend_args,
+    ),
+    dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(
+        type="ImageAug3D",
+        final_dim=image_size,
+        resize_lim=[0.29, 0.35],
+        bot_pct_lim=[0.0, 0.0],
+        rot_lim=[0.0, 0.0],
+        rand_flip=True,
+        is_train=True,
+    ),
+    dict(type="PointsRangeFilter", point_cloud_range=[-80.0, -80.0, -10.0, 80.0, 80.0, 10.0]),
+    dict(
+        type="PointsToMultiViewImageDepths",
+        img_shape=image_size,
+        num_cameras=5,
+        depth_bounds=[1.0, 60.0],
+    ),
+    dict(
+        type="BEVFusionGlobalRotScaleTrans",
+        scale_ratio_range=[0.95, 1.05],
+        rot_range=[-0.78539816, 0.78539816],
+        translation_std=[0.5, 0.5, 0.2],
+    ),
+    dict(type="BEVFusionRandomFlip3D"),
+    dict(type="ObjectRangeFilter", point_cloud_range=_base_.point_cloud_range),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=3),
+    # Remove LiDAR points from the data
+    dict(type="BEVFusionRemoveLiDARPoints"),
+    dict(
+        type="ObjectNameFilter",
+        classes=[
+            "car",
+            "truck",
+            "construction_vehicle",
+            "bus",
+            "trailer",
+            "barrier",
+            "motorcycle",
+            "bicycle",
+            "pedestrian",
+            "traffic_cone",
+        ],
+    ),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "transformation_3d_flow",
+            "pcd_rotation",
+            "pcd_scale_factor",
+            "pcd_trans",
+            "img_aug_matrix",
+            "lidar_aug_matrix",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+            "gt_depths",
+        ],
+    ),
+]
+
+test_pipeline = [
+    dict(
+        type="BEVLoadMultiViewImageFromFiles",
+        to_float32=True,
+        color_type="color",
+        backend_args=_base_.backend_args,
+        camera_orders=camera_orders,
+    ),
+    dict(
+        type="LoadPointsFromFile",
+        coord_type="LIDAR",
+        load_dim=_base_.point_load_dim,
+        use_dim=_base_.point_load_dim,
+        backend_args=_base_.backend_args,
+    ),
+    dict(
+        type="ImageAug3D",
+        final_dim=image_size,
+        resize_lim=[0.32, 0.32],
+        bot_pct_lim=[0.0, 0.0],
+        rot_lim=[0.0, 0.0],
+        rand_flip=False,
+        is_train=False,
+    ),
+    dict(type="PointsRangeFilter", point_cloud_range=[-80.0, -80.0, -10.0, 80.0, 80.0, 10.0]),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "num_pts_feats",
+            "num_views",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+        ],
+    ),
+]
+
+filter_cfg = dict(filter_frames_with_camera_orders=camera_orders)
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py
index 4d9a5aa12..b468b0f9c 100644
--- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_camera_lidar_intensity_120m.py
@@ -58,7 +58,7 @@
         final_dim=image_size,
         resize_lim=[0.29, 0.35],
         bot_pct_lim=[0.0, 0.0],
-        rot_lim=[-5.4, 5.4],
+        rot_lim=[0.0, 0.0],
         rand_flip=True,
         is_train=True,
     ),
@@ -71,6 +71,7 @@
     dict(type="BEVFusionRandomFlip3D"),
     dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range),
     dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range),
+    dict(type="BEVFusionRemoveLiDARPoints"),
     dict(
         type="ObjectNameFilter",
         classes=[
@@ -138,7 +139,7 @@
         pad_empty_sweeps=True,
         remove_close=True,
         backend_args=backend_args,
-        test_mode=True,
+        test_mode=False,
     ),
     dict(
         type="ImageAug3D",
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py
index b37108873..64cf8b076 100644
--- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_120m.py
@@ -148,4 +148,4 @@
 #   e.g., dict(filter_frames_with_missing_image=True).
 # - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so
 #   image-based filtering does not apply and `filter_cfg` is intentionally None.
-filter_cfg = None
+filter_cfg = dict()
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py
new file mode 100644
index 000000000..90c5a1dea
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_50m.py
@@ -0,0 +1,138 @@
+# Dataset parameters
+backend_args = None
+num_workers = 4
+input_modality = dict(use_lidar=True, use_camera=False)
+
+# range setting
+point_cloud_range = [-54.0, -54.0, -3.0, 54.0, 54.0, 5.0]
+voxel_size = [0.075, 0.075, 0.2]
+grid_size = [1440, 1440, 41]
+eval_class_range = {
+    "car": 51.2,
+    "truck": 51.2,
+    "bus": 51.2,
+    "bicycle": 51.2,
+    "pedestrian": 51.2,
+    "traffic_cone": 51.2,
+    "barrier": 51.2,
+}
+
+# LiDAR parameters
+point_load_dim = 5  # x, y, z, intensity, ring_id
+point_use_dim = 4
+lidar_sweep_dims = [0, 1, 2, 4]  # x, y, z, time_lag
+sweeps_num = 1
+
+train_pipeline = [
+    dict(
+        type="LoadPointsFromFile",
+        coord_type="LIDAR",
+        load_dim=point_load_dim,
+        use_dim=point_load_dim,
+        backend_args=backend_args,
+    ),
+    dict(
+        type="LoadPointsFromMultiSweeps",
+        sweeps_num=sweeps_num,
+        load_dim=point_load_dim,
+        use_dim=lidar_sweep_dims,
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args,
+        test_mode=False,
+    ),
+    dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(
+        type="BEVFusionGlobalRotScaleTrans",
+        scale_ratio_range=[0.95, 1.05],
+        rot_range=[-0.78539816, 0.78539816],
+        translation_std=[0.5, 0.5, 0.2],
+    ),
+    dict(type="BEVFusionRandomFlip3D"),
+    dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range),
+    dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range),
+    dict(
+        type="ObjectNameFilter",
+        classes=[
+            "car",
+            "truck",
+            "bus",
+            "bicycle",
+            "pedestrian",
+            "traffic_cone",
+            "barrier",
+        ],
+    ),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[0, 60], min_num_points=3),
+    dict(type="ObjectRangeMinPointsFilter", range_radius=[60, 130], min_num_points=2),
+    dict(type="PointShuffle"),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["points", "img", "gt_bboxes_3d", "gt_labels_3d", "gt_bboxes", "gt_labels"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "transformation_3d_flow",
+            "pcd_rotation",
+            "pcd_scale_factor",
+            "pcd_trans",
+            "img_aug_matrix",
+            "lidar_aug_matrix",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+        ],
+    ),
+]
+
+test_pipeline = [
+    dict(
+        type="LoadPointsFromFile",
+        coord_type="LIDAR",
+        load_dim=point_load_dim,
+        use_dim=point_load_dim,
+        backend_args=backend_args,
+    ),
+    dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range),
+    dict(
+        type="Pack3DDetInputs",
+        keys=["img", "points", "gt_bboxes_3d", "gt_labels_3d"],
+        meta_keys=[
+            "cam2img",
+            "ori_cam2img",
+            "lidar2cam",
+            "lidar2img",
+            "cam2lidar",
+            "ori_lidar2img",
+            "img_aug_matrix",
+            "box_type_3d",
+            "sample_idx",
+            "lidar_path",
+            "img_path",
+            "num_pts_feats",
+            "num_views",
+            "timestamp",
+            "vehicle_type",
+            "city",
+            "traffic_cone_barrier_status",
+        ],
+    ),
+]
+
+# Filtering configuration
+# Note:
+# - In camera–LiDAR configs, `filter_cfg` can enable image-based frame filtering,
+#   e.g., dict(filter_frames_with_missing_image=True).
+# - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so
+#   image-based filtering does not apply and `filter_cfg` is intentionally an empty dict.
+filter_cfg = dict()
diff --git a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py
index 7d6b8e506..b03a22d36 100644
--- a/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py
+++ b/projects/BEVFusion/configs/t4dataset/default/pipelines/default_lidar_intensity_120m.py
@@ -148,4 +148,4 @@
 #   e.g., dict(filter_frames_with_missing_image=True).
 # - This is a LiDAR-only config (`input_modality['use_camera'] = False`), so
 #   image-based filtering does not apply and `filter_cfg` is intentionally None.
-filter_cfg = None
+filter_cfg = dict()
diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py
similarity index 98%
rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py
rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py
index 64bc2b717..05740e442 100644
--- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb8_adamw_linear_cosine.py
+++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_20e_8xb16_adamw_linear_cosine.py
@@ -1,6 +1,6 @@
 # learning rate
-lr = 1e-4
-t_max = 6
+lr = 2e-4
+t_max = 2
 max_epochs = 20
 val_interval = 1
 
diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb32_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb32_adamw_linear_cosine.py
new file mode 100644
index 000000000..34d5b95f6
--- /dev/null
+++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb32_adamw_linear_cosine.py
@@ -0,0 +1,69 @@
+# learning rate
+lr = 2e-4
+t_max = 3
+max_epochs = 30
+val_interval = 1
+
+train_gpu_size = 8
+test_batch_size = 2
+train_batch_size = 32
+
+param_scheduler = [
+    # learning rate scheduler
+    dict(
+        type="LinearLR",
+        start_factor=1.0 / 3,
+        begin=0,
+        end=t_max,
+        by_epoch=True,
+        convert_to_iter_based=True,
+    ),
+    dict(
+        type="CosineAnnealingLR",
+        T_max=(max_epochs - t_max),
+        eta_min=lr * 1e-4,
+        begin=t_max,
+        end=max_epochs,
+        by_epoch=True,
+        convert_to_iter_based=True,
+    ),
+    # momentum scheduler
+    # During the first t_max epochs, momentum is cosine-annealed towards 0.85 / 0.95;
+    # during the remaining (max_epochs - t_max) epochs, it is cosine-annealed towards 1.
+    dict(
+        type="CosineAnnealingMomentum",
+        T_max=t_max,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=t_max,
+        by_epoch=True,
+        convert_to_iter_based=True,
+    ),
+    dict(
+        type="CosineAnnealingMomentum",
+        T_max=(max_epochs - t_max),
+        eta_min=1,
+        begin=t_max,
+        end=max_epochs,
+        by_epoch=True,
+        convert_to_iter_based=True,
+    ),
+]
+
+train_cfg = dict(
+    by_epoch=True, max_epochs=max_epochs, val_interval=val_interval, dynamic_intervals=[(max_epochs - 5, 1)]
+)
+val_cfg = dict()
+test_cfg = dict()
+
+optim_wrapper = dict(
+    type="OptimWrapper",
+    optimizer=dict(type="AdamW", lr=lr, weight_decay=1e-2),
+    clip_grad=dict(max_norm=5.0, norm_type=2),
+)
+
+auto_scale_lr = dict(enable=False, base_batch_size=train_gpu_size * train_batch_size)
+
+# Only set when train_gpu_size is greater than 1
+if train_gpu_size > 1:
+    sync_bn = "torch"
diff --git a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb32_adamw_linear_cosine.py
similarity index 93%
rename from projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py
rename to projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb32_adamw_linear_cosine.py
index 23d29acc1..6763b8779 100644
--- a/projects/BEVFusion/configs/t4dataset/default/schedulers/default_30e_8xb8_adamw_linear_cosine.py
+++ b/projects/BEVFusion/configs/t4dataset/default/schedulers/default_50e_8xb32_adamw_linear_cosine.py
@@ -1,12 +1,12 @@
 # learning rate
-lr = 1e-4
-t_max = 8
-max_epochs = 30
+lr = 2e-4
+t_max = 5
+max_epochs = 50
 val_interval = 1
 
 train_gpu_size = 8
 test_batch_size = 2
-train_batch_size = 8
+train_batch_size = 32
 
 param_scheduler = [
     # learning rate scheduler
@@ -52,7 +52,7 @@
 optim_wrapper = dict(
     type="OptimWrapper",
     optimizer=dict(type="AdamW", lr=lr, weight_decay=0.01),
-    clip_grad=dict(max_norm=0.1, norm_type=2),
+    clip_grad=dict(max_norm=1.0, norm_type=2),
 )
 
 auto_scale_lr = dict(enable=False, base_batch_size=train_gpu_size * train_batch_size)
diff --git a/projects/BEVFusion/setup.py b/projects/BEVFusion/setup.py
index 38f588b20..52d397c12 100644
--- a/projects/BEVFusion/setup.py
+++ b/projects/BEVFusion/setup.py
@@ -54,6 +54,14 @@ def make_cuda_ext(name, module, sources, sources_cuda=[], extra_args=[], extra_i
                     "src/bev_pool_cuda.cu",
                 ],
             ),
+            make_cuda_ext(
+                name="bev_pool_v2_ext",
+                module="projects.BEVFusion.bevfusion.ops.bev_pool_v2",
+                sources=[
+                    "src/bev_pool.cpp",
+                    "src/bev_pool_cuda.cu",
+                ],
+            ),
             make_cuda_ext(
                 name="voxel_layer",
                 module="projects.BEVFusion.bevfusion.ops.voxel",