204 changes: 195 additions & 9 deletions biapy/config/config.py
@@ -1412,6 +1412,178 @@ def __init__(self, job_dir: str, job_identifier: str):
#
_C.MODEL.TORCHVISION_MODEL_NAME = ""

#
# BIAPY BACKEND MODELS
#
# Architecture of the network. Possible values are:
# * Semantic segmentation: 'unet', 'resunet', 'resunet++', 'attention_unet', 'multiresunet', 'seunet', 'resunet_se', 'unetr', 'unext_v1', 'unext_v2'
# * Instance segmentation: 'unet', 'resunet', 'resunet++', 'attention_unet', 'multiresunet', 'seunet', 'resunet_se', 'unetr', 'unext_v1', 'unext_v2'
# * Detection: 'unet', 'resunet', 'resunet++', 'attention_unet', 'multiresunet', 'seunet', 'resunet_se', 'unetr', 'unext_v1', 'unext_v2'
# * Denoising: 'unet', 'resunet', 'resunet++', 'attention_unet', 'seunet', 'resunet_se', 'unext_v1', 'unext_v2', 'nafnet'
# * Super-resolution: 'edsr', 'rcan', 'dfcan', 'wdsr', 'unet', 'resunet', 'resunet++', 'seunet', 'resunet_se', 'attention_unet', 'multiresunet', 'unext_v1', 'unext_v2'
# * Self-supervision: 'unet', 'resunet', 'resunet++', 'attention_unet', 'multiresunet', 'seunet', 'resunet_se', 'unetr', 'edsr', 'rcan', 'dfcan', 'wdsr', 'vit', 'mae', 'unext_v1', 'unext_v2'
# * Classification: 'simple_cnn', 'vit', 'efficientnet_b[0-7]' (only 2D)
# * Image to image: 'edsr', 'rcan', 'dfcan', 'wdsr', 'unet', 'resunet', 'resunet++', 'seunet', 'resunet_se', 'attention_unet', 'unetr', 'multiresunet', 'unext_v1', 'unext_v2'
_C.MODEL.ARCHITECTURE = "unet"
# Number of feature maps on each level of the network.
_C.MODEL.FEATURE_MAPS = [16, 32, 64, 128, 256]
# Dropout values to use. Set to 0 to disable dropout. When using it with 'ViT' or 'unetr',
# a list with just one number must be provided
_C.MODEL.DROPOUT_VALUES = [0.0, 0.0, 0.0, 0.0, 0.0]
# Normalization layer (one of 'bn', 'sync_bn', 'in', 'gn' or 'none').
_C.MODEL.NORMALIZATION = "bn"
# Kernel size
_C.MODEL.KERNEL_SIZE = 3
# Upsampling layer to use in the model. Options: ["upsampling", "convtranspose"]
_C.MODEL.UPSAMPLE_LAYER = "convtranspose"
# Activation function to use throughout the model
_C.MODEL.ACTIVATION = "ELU"
# Number of classes including the background class (which should use label 0)
_C.DATA.N_CLASSES = 2
# Downsampling to be made in Z. This value will be the third integer of the MaxPooling operation. When facing
# anisotropic datasets, set it to get better performance
_C.MODEL.Z_DOWN = [0, 0, 0, 0]
# For each level of the model (U-Net levels), set to true or false if the dimensions of the feature maps are isotropic.
_C.MODEL.ISOTROPY = [True, True, True, True, True]
# Include extra convolutional layers with a larger kernel at the beginning and end of the U-Net-like model.
_C.MODEL.LARGER_IO = False
# Checkpoint: set to True to load previous training weights (needed for inference or for fine-tuning)
_C.MODEL.LOAD_CHECKPOINT = False
# Whether to load only the model's weights from the checkpoint, or also the optimizer, epochs and loss_scaler.
_C.MODEL.LOAD_CHECKPOINT_ONLY_WEIGHTS = True
# Decide which checkpoint to load from the job's dir if PATHS.CHECKPOINT_FILE is ''.
# Options: 'best_on_val' or 'last_on_train'
_C.MODEL.LOAD_CHECKPOINT_EPOCH = "best_on_val"
# Whether to load the model from the checkpoint instead of building it following 'MODEL.ARCHITECTURE' when 'MODEL.SOURCE' is "biapy"
_C.MODEL.LOAD_MODEL_FROM_CHECKPOINT = True
# Format of the output checkpoint. Options are 'pth' (native PyTorch format) or 'safetensors' (https://github.com/huggingface/safetensors)
_C.MODEL.OUT_CHECKPOINT_FORMAT = "pth"
# Skip loading those layers whose shape does not match the given checkpoint. If this is set to False, a regular load
# will be done, which will fail if a layer mismatch is found. Only works when 'MODEL.LOAD_MODEL_FROM_CHECKPOINT' is True
_C.MODEL.SKIP_UNMATCHED_LAYERS = False
# Frequency (in epochs) at which to save an extra checkpoint of the model, apart from the ones saved with
# LOAD_CHECKPOINT_ONLY_WEIGHTS. Set it to -1 to disable it.
_C.MODEL.SAVE_CKPT_FREQ = -1
# Number of ConvNeXtBlocks in each level.
_C.MODEL.CONVNEXT_LAYERS = [2, 2, 2, 2, 2]
# Maximum Stochastic Depth probability for the U-NeXt model.
_C.MODEL.CONVNEXT_SD_PROB = 0.1
# Layer Scale parameter for the U-NeXt model.
_C.MODEL.CONVNEXT_LAYER_SCALE = 1e-6
# Size of the stem kernel in the U-NeXt model.
_C.MODEL.CONVNEXT_STEM_K_SIZE = 2
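# --- Illustrative sketch, not part of the diff: one way the backend-model options above could be combined
# --- for an anisotropic 3D dataset. Values are hypothetical and `example` is only a local yacs CfgNode,
# --- not a real BiaPy configuration (users normally override these defaults through a YAML file).
from yacs.config import CfgNode as CN

example = CN()
example.MODEL = CN()
example.MODEL.ARCHITECTURE = "resunet"
example.MODEL.FEATURE_MAPS = [16, 32, 64, 128, 256]        # one entry per U-Net level
example.MODEL.Z_DOWN = [1, 1, 2, 2]                        # one entry per level transition; pool Z only in the deeper levels
example.MODEL.ISOTROPY = [False, False, True, True, True]  # shallow levels treated as anisotropic
example.MODEL.DROPOUT_VALUES = [0.0, 0.0, 0.1, 0.1, 0.2]
example.MODEL.NORMALIZATION = "bn"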

# TRANSFORMERS MODELS
# Type of model. Options are "custom", "vit_base_patch16", "vit_large_patch16" and "vit_huge_patch16". With the "custom" setting
# the rest of the ViT parameters can be modified; the other options set them automatically.
_C.MODEL.VIT_MODEL = "custom"
# Size of the patches that are extracted from the input image.
_C.MODEL.VIT_TOKEN_SIZE = 16
# Dimension of the embedding space
_C.MODEL.VIT_EMBED_DIM = 768
# Number of transformer encoder layers
_C.MODEL.VIT_NUM_LAYERS = 12
# Number of heads in the multi-head attention layer.
_C.MODEL.VIT_NUM_HEADS = 12
# Size of the dense layers of the final classifier. This value will multiply 'VIT_EMBED_DIM'
_C.MODEL.VIT_MLP_RATIO = 4.0
# Normalization layer epsilon
_C.MODEL.VIT_NORM_EPS = 1e-6

# Dimension of the embedding space for the MAE decoder
_C.MODEL.MAE_DEC_HIDDEN_SIZE = 512
# Number of transformer decoder layers
_C.MODEL.MAE_DEC_NUM_LAYERS = 8
# Number of heads in the multi-head attention layer.
_C.MODEL.MAE_DEC_NUM_HEADS = 16
# Size of the dense layers of the final classifier
_C.MODEL.MAE_DEC_MLP_DIMS = 2048
# Type of the masking strategy. Options: ["grid", "random"]
_C.MODEL.MAE_MASK_TYPE = "grid"
# Fraction of the input image to mask (applied only when MODEL.MAE_MASK_TYPE == "random"). Value between 0 and 1.
_C.MODEL.MAE_MASK_RATIO = 0.5
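# --- Illustrative sketch, not part of the diff: the defaults above match a ViT-Base-style encoder
# --- (embedding dim 768, 12 layers, 12 heads). With VIT_MODEL = "custom" these fields can be changed by
# --- hand; the smaller encoder and random MAE masking below are hypothetical values, not recommendations.
from yacs.config import CfgNode as CN

vit_example = CN()
vit_example.MODEL = CN()
vit_example.MODEL.VIT_MODEL = "custom"
vit_example.MODEL.VIT_TOKEN_SIZE = 16
vit_example.MODEL.VIT_EMBED_DIM = 384     # smaller embedding than the ViT-Base default
vit_example.MODEL.VIT_NUM_LAYERS = 6
vit_example.MODEL.VIT_NUM_HEADS = 6
vit_example.MODEL.MAE_MASK_TYPE = "random"
vit_example.MODEL.MAE_MASK_RATIO = 0.75   # mask 75% of the input; only used with the "random" strategy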

# UNETR
# Multiple of the transformer encoder layers from which the skip connection signal is extracted
_C.MODEL.UNETR_VIT_HIDD_MULT = 3
# Number of filters in the first layer of the UNETR decoder. In each layer the previous number of filters is doubled.
_C.MODEL.UNETR_VIT_NUM_FILTERS = 16
# Decoder activation
_C.MODEL.UNETR_DEC_ACTIVATION = "relu"
# Decoder convolutions' kernel size
_C.MODEL.UNETR_DEC_KERNEL_SIZE = 3

# Specific for SR models based on U-Net architectures. Options are ["pre", "post"]
_C.MODEL.UNET_SR_UPSAMPLE_POSITION = "pre"

# RCAN
# Number of RG modules
_C.MODEL.RCAN_RG_BLOCK_NUM = 10
# Number of RCAB modules in each RG block
_C.MODEL.RCAN_RCAB_BLOCK_NUM = 20
# Filters in the convolutions
_C.MODEL.RCAN_CONV_FILTERS = 16
# Channel reduction ratio for channel attention
_C.MODEL.RCAN_REDUCTION_RATIO = 16
# Whether to keep the upscaling layer or not.
_C.MODEL.RCAN_UPSCALING_LAYER = True

# These parameters can be used as a template for building custom HRNet versions
_C.MODEL.HRNET = CN()
# Whether to downsample the input in Z or not
_C.MODEL.HRNET.Z_DOWN = True
# Type of block to use in HRNet. Options: 'BASIC', 'BOTTLENECK', 'CONVNEXT_V1' and 'CONVNEXT_V2'
_C.MODEL.HRNET.BLOCK_TYPE = 'BASIC'
# Head type to use in HRNet. Options: "OCR", "ASPP", "PSP", "FCN"
_C.MODEL.HRNET.HEAD_TYPE = "FCN"
# Indicate whether to use a custom configuration for HRNet or a predefined one. If set to True,
# MODEL.HRNET.STAGE2, MODEL.HRNET.STAGE3 and MODEL.HRNET.STAGE4 will be used. If False, the configuration
# will be set depending on the selected architecture (see MODEL.ARCHITECTURE)
_C.MODEL.HRNET.CUSTOM = False

# These stages are used for HRNet18, HRNet32, HRNet48 and HRNet64
_C.MODEL.HRNET.STAGE2 = CN()
_C.MODEL.HRNET.STAGE2.NUM_MODULES = 1
_C.MODEL.HRNET.STAGE2.NUM_BRANCHES = 2
_C.MODEL.HRNET.STAGE2.NUM_BLOCKS = [4, 4]
_C.MODEL.HRNET.STAGE2.NUM_CHANNELS = [18, 36]
_C.MODEL.HRNET.STAGE3 = CN()
_C.MODEL.HRNET.STAGE3.NUM_MODULES = 4
_C.MODEL.HRNET.STAGE3.NUM_BRANCHES = 3
_C.MODEL.HRNET.STAGE3.NUM_BLOCKS = [4, 4, 4]
_C.MODEL.HRNET.STAGE3.NUM_CHANNELS = [18, 36, 72]
_C.MODEL.HRNET.STAGE4 = CN()
_C.MODEL.HRNET.STAGE4.NUM_MODULES = 3
_C.MODEL.HRNET.STAGE4.NUM_BRANCHES = 4
_C.MODEL.HRNET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
_C.MODEL.HRNET.STAGE4.NUM_CHANNELS = [18, 36, 72, 144]
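# --- Illustrative sketch, not part of the diff: with MODEL.HRNET.CUSTOM = True the stage options above are
# --- read directly. The values below are hypothetical and follow the usual HRNet widening pattern, where an
# --- HRNet-W48-style network doubles the channel count at every extra branch (48, 96, 192, 384).
from yacs.config import CfgNode as CN

hrnet_example = CN()
hrnet_example.MODEL = CN()
hrnet_example.MODEL.HRNET = CN()
hrnet_example.MODEL.HRNET.CUSTOM = True
hrnet_example.MODEL.HRNET.STAGE2 = CN({"NUM_MODULES": 1, "NUM_BRANCHES": 2, "NUM_BLOCKS": [4, 4], "NUM_CHANNELS": [48, 96]})
hrnet_example.MODEL.HRNET.STAGE3 = CN({"NUM_MODULES": 4, "NUM_BRANCHES": 3, "NUM_BLOCKS": [4, 4, 4], "NUM_CHANNELS": [48, 96, 192]})
hrnet_example.MODEL.HRNET.STAGE4 = CN({"NUM_MODULES": 3, "NUM_BRANCHES": 4, "NUM_BLOCKS": [4, 4, 4, 4], "NUM_CHANNELS": [48, 96, 192, 384]})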

_C.MODEL.STUNET = CN()
# Variant of the STUNet model. Options are: 'small', 'base', 'large'
_C.MODEL.STUNET.VARIANT = 'base'
# Whether to use a version of STUNet pretrained on ImageNet
_C.MODEL.STUNET.PRETRAINED = False

# NAFNet
_C.MODEL.NAFNET = CN()
# Base number of channels (width) used in the first layer and base levels.
_C.MODEL.NAFNET.WIDTH = 16
# Number of NAFBlocks stacked at the bottleneck (deepest level).
_C.MODEL.NAFNET.MIDDLE_BLK_NUM = 12
# Number of NAFBlocks assigned to each downsampling level of the encoder.
_C.MODEL.NAFNET.ENC_BLK_NUMS = [2, 2, 4, 8]
# Number of NAFBlocks assigned to each upsampling level of the decoder.
_C.MODEL.NAFNET.DEC_BLK_NUMS = [2, 2, 2, 2]
# Channel expansion factor for the depthwise convolution within the gating unit.
_C.MODEL.NAFNET.DW_EXPAND = 2
# Expansion factor for the hidden layer within the feed-forward network.
_C.MODEL.NAFNET.FFN_EXPAND = 2
# Discriminator architecture
_C.MODEL.NAFNET.ARCHITECTURE_D = "patchgan"
# Discriminator PATCHGAN
_C.MODEL.NAFNET.PATCHGAN = CN()
# Number of initial convolutional filters in the first layer of the discriminator.
_C.MODEL.NAFNET.PATCHGAN.BASE_FILTERS = 64
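# --- Illustrative sketch, not part of the diff: channel widths implied by NAFNET.WIDTH and the number of
# --- encoder levels, assuming the usual NAFNet behaviour of doubling the channel count at every
# --- downsampling step (an assumption stated here for orientation only).
width = 16                   # NAFNET.WIDTH
enc_blk_nums = [2, 2, 4, 8]  # NAFNET.ENC_BLK_NUMS, one entry per encoder level
level_widths = [width * (2 ** i) for i in range(len(enc_blk_nums) + 1)]
print(level_widths)          # [16, 32, 64, 128, 256]; the last entry is the bottleneck width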

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 6. Loss definition options
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1474,26 +1646,37 @@ def __init__(self, job_dir: str, job_identifier: str):
_C.LOSS.CONTRAST.PROJ_DIM = 256
_C.LOSS.CONTRAST.PIXEL_UPD_FREQ = 10

# Fine-grained GAN loss composition. Set any weight to 0.0 to disable that term.
# Used when LOSS.TYPE == "CYCLEGAN".
_C.LOSS.CYCLEGAN = CN()
# Weight for adversarial BCE term.
_C.LOSS.CYCLEGAN.LAMBDA_GAN = 1.0
# Weight for L1 reconstruction term.
_C.LOSS.CYCLEGAN.LAMBDA_RECON = 10.0
# Weight for MSE reconstruction term.
_C.LOSS.CYCLEGAN.DELTA_MSE = 0.0
# Weight for VGG perceptual term.
_C.LOSS.CYCLEGAN.ALPHA_PERCEPTUAL = 0.0
# Weight for SSIM term.
_C.LOSS.CYCLEGAN.GAMMA_SSIM = 1.0
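# --- Illustrative sketch, not part of the diff: one way the weights above could be combined into a single
# --- generator objective. `weights` stands for a node like LOSS.CYCLEGAN; the perceptual and SSIM terms are
# --- omitted and would be added in the same weighted fashion. This is not BiaPy's actual loss code.
import torch
import torch.nn.functional as F

def generator_loss_sketch(disc_pred_fake, reconstruction, target, weights):
    loss = torch.zeros((), device=reconstruction.device)
    if weights.LAMBDA_GAN > 0:    # adversarial term: push the discriminator towards "real" for fake outputs
        loss = loss + weights.LAMBDA_GAN * F.binary_cross_entropy_with_logits(
            disc_pred_fake, torch.ones_like(disc_pred_fake))
    if weights.LAMBDA_RECON > 0:  # L1 reconstruction term
        loss = loss + weights.LAMBDA_RECON * F.l1_loss(reconstruction, target)
    if weights.DELTA_MSE > 0:     # MSE reconstruction term
        loss = loss + weights.DELTA_MSE * F.mse_loss(reconstruction, target)
    return loss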

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 7. Training phase options
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
_C.TRAIN = CN()
_C.TRAIN.ENABLE = False
# Enable verbosity
_C.TRAIN.VERBOSE = False
# Optimizer to use. Possible values: "SGD", "ADAM" or "ADAMW"
_C.TRAIN.OPTIMIZER = "SGD"
# Learning rate
_C.TRAIN.LR = 1.0e-4
# Optimizer(s) to use. Possible values: "SGD", "ADAM" or "ADAMW".
_C.TRAIN.OPTIMIZER = ["SGD"]
# Learning rate(s).
_C.TRAIN.LR = [1.0e-4]
# Weight decay
_C.TRAIN.W_DECAY = 0.02
# Coefficients used for computing running averages of the gradient and its square. Used in ADAM and ADAMW optimizers
_C.TRAIN.OPT_BETAS = (0.9, 0.999)
_C.TRAIN.OPT_BETAS = [[0.9, 0.999]]
# Batch size
_C.TRAIN.BATCH_SIZE = 2
# If memory or the number of GPUs is limited, use this variable to maintain the effective batch size, which is
# batch_size (per gpu) * nodes * (gpus per node) * accum_iter.
_C.TRAIN.ACCUM_ITER = 1
# Number of epochs to train the model
_C.TRAIN.EPOCHS = 360
# Epochs to wait with no validation data improvement until the training is stopped
@@ -1509,6 +1692,9 @@ def __init__(self, job_dir: str, job_identifier: str):
# * Classification: 'accuracy', 'top-5-accuracy'
# * Image to image: "psnr", "mae", "mse", "ssim"
_C.TRAIN.METRICS = []

# Gradient clipping max norm applied per optimizer. 0 = disabled.
_C.TRAIN.GRADIENT_CLIP_NORM = 0.0
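# --- Illustrative sketch, not part of the diff: with the now list-valued training options, entries are
# --- matched by position (one per optimizer / parameter group). The two-optimizer setup below (e.g. a
# --- generator/discriminator pair) uses hypothetical values and a local CfgNode, not a full BiaPy config.
from yacs.config import CfgNode as CN

train_example = CN()
train_example.OPTIMIZER = ["ADAMW", "ADAM"]              # one optimizer per parameter group
train_example.LR = [1.0e-4, 4.0e-4]                      # matching learning rates, by position
train_example.OPT_BETAS = [[0.9, 0.999], [0.5, 0.999]]   # matching beta pairs (ADAM/ADAMW only)
train_example.GRADIENT_CLIP_NORM = 1.0                   # max norm applied per optimizer; 0.0 disables clipping
train_example.LR_SCHEDULER = CN()
train_example.LR_SCHEDULER.MIN_LR = [1.0e-6, 1.0e-6]     # lower LR bounds, also one entry per optimizer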

# Callbacks
# Value to monitor in order to decide which epoch's checkpoint is the best to save. Currently not used.
@@ -1526,7 +1712,7 @@ def __init__(self, job_dir: str, job_identifier: str):
_C.TRAIN.LR_SCHEDULER = CN()
_C.TRAIN.LR_SCHEDULER.NAME = "" # Possible options: 'warmupcosine', 'reduceonplateau', 'onecycle'
# Lower bound on the learning rate used in 'warmupcosine' and 'reduceonplateau'
_C.TRAIN.LR_SCHEDULER.MIN_LR = -1.0
_C.TRAIN.LR_SCHEDULER.MIN_LR = [-1.0]

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 7.1.1 Reduce on plateau options
7 changes: 3 additions & 4 deletions biapy/data/generators/__init__.py
@@ -251,7 +251,7 @@ def create_train_val_augmentors(
dic["zflip"] = cfg.AUGMENTOR.ZFLIP
if cfg.PROBLEM.TYPE == "INSTANCE_SEG":
dic["instance_problem"] = True
elif cfg.PROBLEM.TYPE == "DENOISING":
elif cfg.PROBLEM.TYPE == "DENOISING" and cfg.MODEL.ARCHITECTURE != 'nafnet':
dic["n2v"] = True
dic["n2v_perc_pix"] = cfg.PROBLEM.DENOISING.N2V_PERC_PIX
dic["n2v_manipulator"] = cfg.PROBLEM.DENOISING.N2V_MANIPULATOR
@@ -297,7 +297,7 @@ def create_train_val_augmentors(
)
if cfg.PROBLEM.TYPE == "INSTANCE_SEG":
dic["instance_problem"] = True
elif cfg.PROBLEM.TYPE == "DENOISING":
elif cfg.PROBLEM.TYPE == "DENOISING" and cfg.MODEL.ARCHITECTURE != 'nafnet':
dic["n2v"] = True
dic["n2v_perc_pix"] = cfg.PROBLEM.DENOISING.N2V_PERC_PIX
dic["n2v_manipulator"] = cfg.PROBLEM.DENOISING.N2V_MANIPULATOR
Expand All @@ -317,7 +317,7 @@ def create_train_val_augmentors(
)

# Training dataset
total_batch_size = cfg.TRAIN.BATCH_SIZE * get_world_size() * cfg.TRAIN.ACCUM_ITER
total_batch_size = cfg.TRAIN.BATCH_SIZE * get_world_size()
training_samples = len(train_generator)

# ---- Choose num_workers for this DataLoader ----
@@ -352,7 +352,6 @@ def worker_init_fn(worker_id):

num_training_steps_per_epoch = training_samples // total_batch_size
print(f"Train/val generators with {num_workers} workers")
print("Accumulate grad iterations: %d" % cfg.TRAIN.ACCUM_ITER)
print("Effective batch size: %d" % total_batch_size)
print("Sampler_train = %s" % str(sampler_train))
train_dataset = DataLoader(
101 changes: 57 additions & 44 deletions biapy/engine/__init__.py
@@ -21,7 +21,7 @@ def prepare_optimizer(
cfg: CN,
model_without_ddp: nn.Module | nn.parallel.DistributedDataParallel,
steps_per_epoch: int,
) -> Tuple[Optimizer, Scheduler | None]:
) -> Tuple[list[Optimizer], list[Scheduler | None]]:
"""
Create and configure the optimizer and learning rate scheduler for the given model.

@@ -40,50 +40,63 @@ def prepare_optimizer(

Returns
-------
optimizer : Optimizer
Configured optimizer for the model.
lr_scheduler : Scheduler or None
Configured learning rate scheduler, or None if not specified.
optimizers : List[Optimizer]
Configured optimizers for the models.
lr_schedulers : List[Scheduler | None]
Configured learning rate schedulers, or None if not specified.
"""
lr = cfg.TRAIN.LR if cfg.TRAIN.LR_SCHEDULER.NAME != "warmupcosine" else cfg.TRAIN.LR_SCHEDULER.MIN_LR
opt_args = {}
if cfg.TRAIN.OPTIMIZER in ["ADAM", "ADAMW"]:
opt_args["betas"] = cfg.TRAIN.OPT_BETAS
optimizer = timm.optim.create_optimizer_v2(
model_without_ddp,
opt=cfg.TRAIN.OPTIMIZER,
lr=lr,
weight_decay=cfg.TRAIN.W_DECAY,
**opt_args,
)
print(optimizer)

# Learning rate schedulers
lr_scheduler = None
if cfg.TRAIN.LR_SCHEDULER.NAME != "":
if cfg.TRAIN.LR_SCHEDULER.NAME == "reduceonplateau":
lr_scheduler = ReduceLROnPlateau(
optimizer,
patience=cfg.TRAIN.LR_SCHEDULER.REDUCEONPLATEAU_PATIENCE,
factor=cfg.TRAIN.LR_SCHEDULER.REDUCEONPLATEAU_FACTOR,
min_lr=cfg.TRAIN.LR_SCHEDULER.MIN_LR,
)
elif cfg.TRAIN.LR_SCHEDULER.NAME == "warmupcosine":
lr_scheduler = WarmUpCosineDecayScheduler(
lr=cfg.TRAIN.LR,
min_lr=cfg.TRAIN.LR_SCHEDULER.MIN_LR,
warmup_epochs=cfg.TRAIN.LR_SCHEDULER.WARMUP_COSINE_DECAY_EPOCHS,
epochs=cfg.TRAIN.EPOCHS,
)
elif cfg.TRAIN.LR_SCHEDULER.NAME == "onecycle":
lr_scheduler = OneCycleLR(
optimizer,
cfg.TRAIN.LR,
epochs=cfg.TRAIN.EPOCHS,
steps_per_epoch=steps_per_epoch,
)

return optimizer, lr_scheduler

optimizers = []
lr_schedulers = []

if hasattr(model_without_ddp, 'param_groups'):
param_groups = model_without_ddp.param_groups
else:
param_groups = [[p for p in model_without_ddp.parameters()]]

for i in range(len(cfg.TRAIN.OPTIMIZER)):
lr = cfg.TRAIN.LR if cfg.TRAIN.LR_SCHEDULER.NAME != "warmupcosine" else cfg.TRAIN.LR_SCHEDULER.MIN_LR
opt_args = {}
if cfg.TRAIN.OPTIMIZER[i] in ["ADAM", "ADAMW"]:
opt_args["betas"] = cfg.TRAIN.OPT_BETAS[i]
optimizer = timm.optim.create_optimizer_v2(
param_groups[i],
opt=cfg.TRAIN.OPTIMIZER[i],
lr=lr[i],
weight_decay=cfg.TRAIN.W_DECAY,
**opt_args,
)
print(optimizer)
optimizers.append(optimizer)

# Learning rate schedulers
lr_scheduler = None
if cfg.TRAIN.LR_SCHEDULER.NAME != "":
if cfg.TRAIN.LR_SCHEDULER.NAME == "reduceonplateau":
lr_scheduler = ReduceLROnPlateau(
optimizer,
patience=cfg.TRAIN.LR_SCHEDULER.REDUCEONPLATEAU_PATIENCE,
factor=cfg.TRAIN.LR_SCHEDULER.REDUCEONPLATEAU_FACTOR,
min_lr=cfg.TRAIN.LR_SCHEDULER.MIN_LR[i],
)
elif cfg.TRAIN.LR_SCHEDULER.NAME == "warmupcosine":
lr_scheduler = WarmUpCosineDecayScheduler(
lr=cfg.TRAIN.LR[i],
min_lr=cfg.TRAIN.LR_SCHEDULER.MIN_LR[i],
warmup_epochs=cfg.TRAIN.LR_SCHEDULER.WARMUP_COSINE_DECAY_EPOCHS,
epochs=cfg.TRAIN.EPOCHS,
)
elif cfg.TRAIN.LR_SCHEDULER.NAME == "onecycle":
lr_scheduler = OneCycleLR(
optimizer,
cfg.TRAIN.LR[i],
epochs=cfg.TRAIN.EPOCHS,
steps_per_epoch=steps_per_epoch,
)

lr_schedulers.append(lr_scheduler)

return optimizers, lr_schedulers
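# --- Illustrative sketch, not part of the diff: how a caller might consume the per-optimizer lists returned
# --- above. The helper below is hypothetical (the real BiaPy training loop lives elsewhere and is not shown
# --- in this PR); it only demonstrates the shape of the new API together with TRAIN.GRADIENT_CLIP_NORM.
# --- `nn` is the module's existing torch.nn import, as used in prepare_optimizer's signature.
def _example_training_step(cfg, optimizers, loss):
    """Hypothetical single training step using the lists returned by prepare_optimizer."""
    for opt in optimizers:
        opt.zero_grad()
    loss.backward()
    for opt in optimizers:
        if cfg.TRAIN.GRADIENT_CLIP_NORM > 0:  # new per-optimizer gradient clipping option
            params = [p for group in opt.param_groups for p in group["params"]]
            nn.utils.clip_grad_norm_(params, cfg.TRAIN.GRADIENT_CLIP_NORM)
        opt.step()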


def build_callbacks(cfg: CN) -> EarlyStopping | None: