configs/nuscenes/seg/camera-bev256d2.yaml

model:
  encoders:
    lidar: null
    camera:
      backbone:
        type: SwinTransformer
        embed_dims: 96
        depths: [2, 2, 6, 2]
        num_heads: [3, 6, 12, 24]
        window_size: 7
        mlp_ratio: 4
        qkv_bias: true
        qk_scale: null
        drop_rate: 0.
        attn_drop_rate: 0.
        drop_path_rate: 0.3
        patch_norm: true
        out_indices: [1, 2, 3]
        with_cp: false
        convert_weights: true
      neck:
        type: GeneralizedLSSFPN
        in_channels: [192, 384, 768]
        out_channels: 256
        start_level: 0
        num_outs: 3
        norm_cfg:
          type: BN2d
          requires_grad: true
        act_cfg:
          type: ReLU
          inplace: true
        upsample_cfg:
          mode: bilinear
          align_corners: false
      vtransform:
        type: LSSTransform
        in_channels: 256
        out_channels: 80
        image_size: ${image_size}
        feature_size: ${[image_size[0] // 8, image_size[1] // 8]}
        xbound: [-51.2, 51.2, 0.4]
        ybound: [-51.2, 51.2, 0.4]
        zbound: [-10.0, 10.0, 20.0]
        dbound: [1.0, 60.0, 0.5]
        downsample: 2
  fuser: null
  decoder:
    backbone:
      type: GeneralizedResNet
      in_channels: 80
      blocks:
        - [2, 160, 2]
        - [2, 320, 2]
        - [2, 640, 1]
    neck:
      type: LSSFPN
      in_indices: [-1, 0]
      in_channels: [640, 160]
      out_channels: 256
      scale_factor: 2

optimizer:
  type: AdamW
  lr: 1.0e-4