Skip to content

Models API

This page documents the model APIs in VisDet.

Detectors

Base Detector

BaseDetector

Bases: BaseModel

Base class for detectors.

Parameters:

Name Type Description Default
data_preprocessor dict or ConfigDict

The pre-process config of BaseDataPreprocessor. It usually includes pad_size_divisor, pad_value, mean and std.

None
init_cfg dict or ConfigDict

the config to control the initialization. Defaults to None.

None
Source code in visdet/models/detectors/base.py
class BaseDetector(BaseModel, metaclass=ABCMeta):
    """Abstract foundation shared by all detectors.

    Subclasses supply ``loss``, ``predict``, ``_forward`` and
    ``extract_feat``; this class contributes the mode-dispatching
    ``forward`` entry point, a few capability properties and a helper that
    attaches predictions to data samples.

    Args:
       data_preprocessor (dict or ConfigDict, optional): The pre-process
           config of :class:`BaseDataPreprocessor`. It usually includes
           ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
       init_cfg (dict or ConfigDict, optional): The config to control the
           initialization. Defaults to None.
    """

    def __init__(self, data_preprocessor: OptConfigType = None, init_cfg: OptMultiConfig = None):
        super().__init__(init_cfg=init_cfg, data_preprocessor=data_preprocessor)

    @property
    def with_neck(self) -> bool:
        """bool: whether the detector has a neck"""
        return getattr(self, "neck", None) is not None

    @property
    def with_shared_head(self) -> bool:
        """bool: whether the detector has a shared head in the RoI Head"""
        if not hasattr(self, "roi_head"):
            return False
        return self.roi_head.with_shared_head

    @property
    def with_bbox(self) -> bool:
        """bool: whether the detector has a bbox head"""
        # The RoI head's own flag wins when it is truthy; otherwise fall back
        # to a detector-level ``bbox_head`` attribute.
        if hasattr(self, "roi_head"):
            roi_flag = self.roi_head.with_bbox
            if roi_flag:
                return roi_flag
        return getattr(self, "bbox_head", None) is not None

    @property
    def with_mask(self) -> bool:
        """bool: whether the detector has a mask head"""
        # Same precedence as ``with_bbox``: RoI head first, then ``mask_head``.
        if hasattr(self, "roi_head"):
            roi_flag = self.roi_head.with_mask
            if roi_flag:
                return roi_flag
        return getattr(self, "mask_head", None) is not None

    def forward(
        self,
        inputs: torch.Tensor,
        data_samples: OptSampleList = None,
        mode: str = "tensor",
    ) -> ForwardResults:
        """Single entry point used for both training and testing.

        ``mode`` selects which of three routines handles the call:

        - "tensor": run :meth:`_forward` and return its raw output (a tensor
          or tuple of tensors) without any post-processing, like a plain
          ``nn.Module``.
        - "predict": run :meth:`predict` and return fully processed
          predictions as a list of :obj:`DetDataSample`.
        - "loss": run :meth:`loss` and return a dict of losses computed from
          the inputs and data samples.

        Neither back propagation nor parameter updates happen here; those
        are expected to be done in :meth:`train_step`.

        Args:
            inputs (torch.Tensor): The input tensor with shape
                (N, C, ...) in general.
            data_samples (list[:obj:`DetDataSample`], optional): A batch of
                data samples that contain annotations and predictions.
                Defaults to None.
            mode (str): Which kind of value to return. Defaults to 'tensor'.

        Returns:
            The return type depends on ``mode``.

            - If ``mode="tensor"``, return a tensor or a tuple of tensor.
            - If ``mode="predict"``, return a list of :obj:`DetDataSample`.
            - If ``mode="loss"``, return a dict of tensor.

        Raises:
            RuntimeError: If ``mode`` is none of the three supported values.
        """
        if mode == "loss":
            return self.loss(inputs, data_samples)
        if mode == "predict":
            return self.predict(inputs, data_samples)
        if mode == "tensor":
            return self._forward(inputs, data_samples)
        raise RuntimeError(f'Invalid mode "{mode}". Only supports loss, predict and tensor mode')

    @abstractmethod
    def loss(self, batch_inputs: Tensor, batch_data_samples: SampleList) -> dict | tuple:
        """Calculate losses from a batch of inputs and data samples."""

    @abstractmethod
    def predict(self, batch_inputs: Tensor, batch_data_samples: SampleList) -> SampleList:
        """Predict results from a batch of inputs and data samples with post-processing."""

    @abstractmethod
    def _forward(self, batch_inputs: Tensor, batch_data_samples: OptSampleList = None):
        """Network forward process.

        Usually includes backbone, neck and head forward without any
        post-processing.
        """

    @abstractmethod
    def extract_feat(self, batch_inputs: Tensor) -> tuple[Tensor]:
        """Extract features from images."""

    def add_pred_to_datasample(self, data_samples: SampleList, results_list: InstanceList) -> SampleList:
        """Attach per-image predictions to their `DetDataSample`.

        Each entry of ``results_list`` is stored as ``pred_instances`` on the
        data sample at the same position, after which the whole batch is
        passed through ``samplelist_boxtype2tensor``.

        Args:
            data_samples (list[:obj:`DetDataSample`], optional): A batch of
                data samples that contain annotations and predictions.
            results_list (list[:obj:`InstanceData`]): Detection results of
                each image.

        Returns:
            list[:obj:`DetDataSample`]: The same ``data_samples`` list, each
            item now carrying 'pred_instances'. The ``pred_instances``
            usually contains following keys.

                - scores (Tensor): Classification scores, has a shape
                    (num_instance, )
                - labels (Tensor): Labels of bboxes, has a shape
                    (num_instances, ).
                - bboxes (Tensor): Has a shape (num_instances, 4),
                    the last dimension 4 arrange as (x1, y1, x2, y2).
        """
        for sample, instances in zip(data_samples, results_list):
            sample.pred_instances = instances
        samplelist_boxtype2tensor(data_samples)
        return data_samples

Two-Stage Detectors

TwoStageDetector

Bases: BaseDetector

Base class for two-stage detectors.

Two-stage detectors typically consist of a region proposal network and a task-specific regression head.

Source code in visdet/models/detectors/two_stage.py
@MODELS.register_module()
class TwoStageDetector(BaseDetector):
    """Base class for two-stage detectors.

    Two-stage detectors typically consist of a region proposal network and a
    task-specific regression head.

    Every component argument may be either a config dict, which is built
    through ``MODELS.build``, or an already constructed object, which is
    stored unchanged. ``neck`` may additionally be a list of configs, in
    which case the built necks are kept in an ``nn.ModuleList`` and applied
    in order.

    Args:
        backbone: Backbone config or object.
        neck: Optional neck config, list of neck configs, or object.
        rpn_head: Optional RPN head config or object. When a config is
            given, its ``num_classes`` is forced to 1 and the ``rpn`` entries
            of ``train_cfg`` / ``test_cfg`` are injected into a copy of it.
        roi_head: Optional RoI head config or object. When a config is
            given, the ``rcnn`` (or, failing that, ``roi``) entries of
            ``train_cfg`` / ``test_cfg`` are injected into a copy of it.
        train_cfg: Optional training config (dict or attribute-style object).
        test_cfg: Optional testing config (dict or attribute-style object).
        data_preprocessor: Forwarded to :class:`BaseDetector`.
        init_cfg: Forwarded to :class:`BaseDetector`.
    """

    def __init__(
        self,
        backbone: ConfigType,
        neck: OptConfigType = None,
        rpn_head: OptConfigType = None,
        roi_head: OptConfigType = None,
        train_cfg: OptConfigType = None,
        test_cfg: OptConfigType = None,
        data_preprocessor: OptConfigType = None,
        init_cfg: OptMultiConfig = None,
    ) -> None:
        super().__init__(data_preprocessor=data_preprocessor, init_cfg=init_cfg)

        # Handle backbone as either config dict or direct object
        self.backbone = MODELS.build(backbone) if isinstance(backbone, dict) else backbone

        # Handle neck as either config dict(s) or direct object
        if neck is not None:
            if isinstance(neck, list):
                self.neck = nn.ModuleList([MODELS.build(neck_cfg) for neck_cfg in neck])
            elif isinstance(neck, dict):
                self.neck = MODELS.build(neck)
            else:
                self.neck = neck

        # Handle rpn_head as either config dict or direct object
        if rpn_head is not None:
            if isinstance(rpn_head, dict):
                # Work on a copy so the caller's config is left untouched.
                rpn_head_ = rpn_head.copy()
                rpn_head_.update(
                    train_cfg=self._get_sub_cfg(train_cfg, "rpn"),
                    test_cfg=self._get_sub_cfg(test_cfg, "rpn"),
                )
                rpn_head_num_classes = rpn_head_.get("num_classes", None)
                if rpn_head_num_classes is not None and rpn_head_num_classes != 1:
                    warnings.warn(
                        "The `num_classes` should be 1 in RPN, but get "
                        f"{rpn_head_num_classes}, please set "
                        "rpn_head.num_classes = 1 in your config file.",
                        stacklevel=2,
                    )
                # The RPN is always built with a single class.
                rpn_head_.update(num_classes=1)
                self.rpn_head = MODELS.build(rpn_head_)
            else:
                self.rpn_head = rpn_head

        # Handle roi_head as either config dict or direct object
        if roi_head is not None:
            if isinstance(roi_head, dict):
                # update train and test cfg here for now
                # TODO: refactor assigner & sampler
                # Copy first (as done for rpn_head) so that the caller's
                # config dict is not mutated and can be reused.
                roi_head_ = roi_head.copy()
                roi_head_.update(train_cfg=self._get_sub_cfg(train_cfg, "rcnn", "roi"))
                roi_head_.update(test_cfg=self._get_sub_cfg(test_cfg, "rcnn", "roi"))
                self.roi_head = MODELS.build(roi_head_)
            else:
                self.roi_head = roi_head

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

    @staticmethod
    def _get_sub_cfg(cfg, *names, default=None):
        """Return the first entry of ``names`` found in ``cfg``.

        ``cfg`` may be ``None``, a dict (looked up by key) or any other
        object (looked up by attribute). ``default`` is returned when ``cfg``
        is ``None`` or none of the names is present. A name that is present
        with value ``None`` is returned as ``None``.
        """
        if cfg is None:
            return default
        missing = object()
        for name in names:
            if isinstance(cfg, dict):
                value = cfg.get(name, missing)
            else:
                value = getattr(cfg, name, missing)
            if value is not missing:
                return value
        return default

    @staticmethod
    def _log_non_finite(logger, tensor, where: str) -> None:
        """Log an error if ``tensor`` contains any NaN or Inf value."""
        if torch.isnan(tensor).any():
            logger.error(f"[TwoStageDetector] NaN detected in {where}!")
        if torch.isinf(tensor).any():
            logger.error(f"[TwoStageDetector] Inf detected in {where}!")

    @classmethod
    def _log_losses(cls, logger, stage: str, loss_dict: dict) -> None:
        """Debug-log each tensor entry of ``loss_dict`` and flag NaN/Inf.

        Non-tensor entries (for example lists of tensors) are only listed by
        key, not inspected.
        """
        logger.debug(f"[TwoStageDetector] {stage} losses: {list(loss_dict.keys())}")
        for loss_name, loss_value in loss_dict.items():
            if isinstance(loss_value, torch.Tensor):
                logger.debug(f"[TwoStageDetector] {stage} {loss_name}: {loss_value.item()}")
                cls._log_non_finite(logger, loss_value, f"{stage} {loss_name}")

    def _load_from_state_dict(
        self,
        state_dict: dict,
        prefix: str,
        local_metadata: dict,
        strict: bool,
        missing_keys: list[str] | str,
        unexpected_keys: list[str] | str,
        error_msgs: list[str] | str,
    ) -> None:
        """Exchange bbox_head key to rpn_head key when loading single-stage
        weights into two-stage model.

        The renaming only happens when the state dict has ``bbox_head`` keys
        and no ``rpn_head`` keys under ``prefix``; ``state_dict`` is modified
        in place before delegating to the parent implementation.
        """
        # PyTorch passes a prefix that is either empty or already ends with
        # "." (e.g. "detector."). Only insert a separator when it is missing,
        # otherwise the composed prefix would contain ".." and never match.
        separator = "" if not prefix or prefix.endswith(".") else "."
        bbox_head_prefix = f"{prefix}{separator}bbox_head"
        rpn_head_prefix = f"{prefix}{separator}rpn_head"
        bbox_head_keys = [k for k in state_dict if k.startswith(bbox_head_prefix)]
        has_rpn_head_keys = any(k.startswith(rpn_head_prefix) for k in state_dict)
        if bbox_head_keys and not has_rpn_head_keys:
            for bbox_head_key in bbox_head_keys:
                rpn_head_key = rpn_head_prefix + bbox_head_key[len(bbox_head_prefix) :]
                state_dict[rpn_head_key] = state_dict.pop(bbox_head_key)
        super()._load_from_state_dict(
            state_dict,
            prefix,
            local_metadata,
            strict,
            missing_keys,
            unexpected_keys,
            error_msgs,
        )

    @property
    def with_rpn(self) -> bool:
        """bool: whether the detector has RPN"""
        return hasattr(self, "rpn_head") and self.rpn_head is not None

    @property
    def with_roi_head(self) -> bool:
        """bool: whether the detector has a RoI head"""
        return hasattr(self, "roi_head") and self.roi_head is not None

    def extract_feat(self, batch_inputs: Tensor) -> tuple[Tensor]:
        """Extract features.

        Runs the backbone and then, if present, the neck. A neck stored as an
        ``nn.ModuleList`` is applied one module after another.

        Args:
            batch_inputs (Tensor): Image tensor with shape (N, C, H ,W).

        Returns:
            tuple[Tensor]: Multi-level features that may have
            different resolutions.
        """
        x = self.backbone(batch_inputs)
        if self.with_neck:
            if isinstance(self.neck, nn.ModuleList):
                for neck in self.neck:
                    x = neck(x)
            else:
                x = self.neck(x)
        return x

    def _forward(self, batch_inputs: Tensor, batch_data_samples: SampleList) -> tuple:
        """Network forward process. Usually includes backbone, neck and head
        forward without any post-processing.

        Args:
            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
                the meta information of each image and corresponding
                annotations.

        Returns:
            tuple: A one-element tuple holding the output of
            ``roi_head.forward``.
        """
        x = self.extract_feat(batch_inputs)

        if self.with_rpn:
            rpn_results_list = self.rpn_head.predict(x, batch_data_samples, rescale=False)
        else:
            # Without an RPN the proposals must come with the data samples.
            assert getattr(batch_data_samples[0], "proposals", None) is not None
            rpn_results_list = [data_sample.proposals for data_sample in batch_data_samples]
        roi_outs = self.roi_head.forward(x, rpn_results_list, batch_data_samples)
        return (roi_outs,)

    def loss(self, batch_inputs: Tensor, batch_data_samples: SampleList) -> dict:
        """Calculate losses from a batch of inputs and data samples.

        Besides computing the losses, this method debug-logs shapes and loss
        values and logs an error whenever NaN/Inf values are found in the
        inputs, the extracted features or a tensor-valued loss.

        Args:
            batch_inputs (Tensor): Input images of shape (N, C, H, W).
                These should usually be mean centered and std scaled.
            batch_data_samples (List[:obj:`DetDataSample`]): The batch
                data samples. It usually includes information such
                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.

        Returns:
            dict: A dictionary of loss components
        """
        logger = MMLogger.get_current_instance()
        logger.debug(f"[TwoStageDetector] Starting loss calculation with batch size: {batch_inputs.shape[0]}")
        logger.debug(f"[TwoStageDetector] Input tensor shape: {batch_inputs.shape}, dtype: {batch_inputs.dtype}")

        # Check input for NaN/Inf
        self._log_non_finite(logger, batch_inputs, "input batch_inputs")

        x = self.extract_feat(batch_inputs)
        logger.debug(f"[TwoStageDetector] Extracted features: {len(x)} levels")
        for i, feat in enumerate(x):
            logger.debug(f"[TwoStageDetector] Feature level {i} shape: {feat.shape}")
            self._log_non_finite(logger, feat, f"feature level {i}")

        losses = {}

        # RPN forward and loss
        if self.with_rpn:
            logger.debug("[TwoStageDetector] Computing RPN losses...")
            # Prefer train_cfg.rpn_proposal, then test_cfg.rpn, then {}.
            # The lookup works for plain dicts as well as attribute-style
            # configs, matching what __init__ accepts.
            proposal_cfg = self._get_sub_cfg(
                self.train_cfg,
                "rpn_proposal",
                default=self._get_sub_cfg(self.test_cfg, "rpn", default={}),
            )
            # Deep copy so that zeroing the labels does not affect the
            # samples later handed to the RoI head.
            rpn_data_samples = copy.deepcopy(batch_data_samples)
            # set cat_id of gt_labels to 0 in RPN
            for data_sample in rpn_data_samples:
                data_sample.gt_instances.labels = torch.zeros_like(data_sample.gt_instances.labels)

            rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict(
                x, rpn_data_samples, proposal_cfg=proposal_cfg
            )

            self._log_losses(logger, "RPN", rpn_losses)

            # avoid get same name with roi_head loss
            for key in list(rpn_losses):
                if "loss" in key and "rpn" not in key:
                    rpn_losses[f"rpn_{key}"] = rpn_losses.pop(key)
            losses.update(rpn_losses)
        else:
            assert getattr(batch_data_samples[0], "proposals", None) is not None
            # use pre-defined proposals in InstanceData for the second stage
            # to extract ROI features.
            rpn_results_list = [data_sample.proposals for data_sample in batch_data_samples]

        logger.debug("[TwoStageDetector] Computing ROI head losses...")
        roi_losses = self.roi_head.loss(x, rpn_results_list, batch_data_samples)

        self._log_losses(logger, "ROI", roi_losses)

        losses.update(roi_losses)

        logger.debug(f"[TwoStageDetector] Total losses computed: {list(losses.keys())}")

        return losses

    def predict(self, batch_inputs: Tensor, batch_data_samples: SampleList, rescale: bool = True) -> SampleList:
        """Predict results from a batch of inputs and data samples with
        post-processing.

        Args:
            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
            batch_data_samples (List[:obj:`DetDataSample`]): The Data
                Samples. It usually includes information such as
                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
            rescale (bool): Whether to rescale the results.
                Defaults to True.

        Returns:
            list[:obj:`DetDataSample`]: Return the detection results of the
            input images. The returns value is DetDataSample,
            which usually contain 'pred_instances'. And the
            ``pred_instances`` usually contains following keys.

                - scores (Tensor): Classification scores, has a shape
                    (num_instance, )
                - labels (Tensor): Labels of bboxes, has a shape
                    (num_instances, ).
                - bboxes (Tensor): Has a shape (num_instances, 4),
                    the last dimension 4 arrange as (x1, y1, x2, y2).
                - masks (Tensor): Has a shape (num_instances, H, W).
        """

        assert self.with_bbox, "Bbox head must be implemented."
        x = self.extract_feat(batch_inputs)

        # If there are no pre-defined proposals, use RPN to get proposals
        if getattr(batch_data_samples[0], "proposals", None) is None:
            rpn_results_list = self.rpn_head.predict(x, batch_data_samples, rescale=False)
        else:
            rpn_results_list = [data_sample.proposals for data_sample in batch_data_samples]

        results_list = self.roi_head.predict(x, rpn_results_list, batch_data_samples, rescale=rescale)

        batch_data_samples = self.add_pred_to_datasample(batch_data_samples, results_list)
        return batch_data_samples

with_rpn property

bool: whether the detector has RPN

with_roi_head property

bool: whether the detector has a RoI head

extract_feat(batch_inputs)

Extract features.

Parameters:

Name Type Description Default
batch_inputs Tensor

Image tensor with shape (N, C, H ,W).

required

Returns:

Type Description
tuple[Tensor]

Multi-level features that may have different resolutions.

Source code in visdet/models/detectors/two_stage.py
def extract_feat(self, batch_inputs: Tensor) -> tuple[Tensor]:
    """Compute backbone features and refine them with the neck, if any.

    Args:
        batch_inputs (Tensor): Image tensor with shape (N, C, H ,W).

    Returns:
        tuple[Tensor]: Multi-level features that may have
        different resolutions.
    """
    feats = self.backbone(batch_inputs)
    if not self.with_neck:
        return feats

    # A ModuleList neck is a chain of stages; a single neck is a chain of one.
    stages = self.neck if isinstance(self.neck, nn.ModuleList) else (self.neck,)
    for stage in stages:
        feats = stage(feats)
    return feats

loss(batch_inputs, batch_data_samples)

Calculate losses from a batch of inputs and data samples.

Parameters:

Name Type Description Default
batch_inputs Tensor

Input images of shape (N, C, H, W). These should usually be mean centered and std scaled.

required
batch_data_samples list[DetDataSample]

The batch data samples. It usually includes information such as gt_instance, gt_panoptic_seg or gt_sem_seg.

required

Returns:

Name Type Description
dict dict

A dictionary of loss components

Source code in visdet/models/detectors/two_stage.py
def loss(self, batch_inputs: Tensor, batch_data_samples: SampleList) -> dict:
    """Calculate losses from a batch of inputs and data samples.

    Besides computing the losses, this method debug-logs shapes and loss
    values and logs an error whenever NaN/Inf values are found in the
    inputs, the extracted features or a tensor-valued loss.

    Args:
        batch_inputs (Tensor): Input images of shape (N, C, H, W).
            These should usually be mean centered and std scaled.
        batch_data_samples (List[:obj:`DetDataSample`]): The batch
            data samples. It usually includes information such
            as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.

    Returns:
        dict: A dictionary of loss components
    """
    logger = MMLogger.get_current_instance()

    def _sub_cfg(cfg, name, default):
        # Look ``name`` up by key for dicts and by attribute otherwise, so
        # that plain-dict configs are honoured just like attribute-style ones.
        if cfg is None:
            return default
        if isinstance(cfg, dict):
            return cfg.get(name, default)
        return getattr(cfg, name, default)

    def _log_non_finite(tensor, where):
        # Report NaN/Inf values without interrupting the computation.
        if torch.isnan(tensor).any():
            logger.error(f"[TwoStageDetector] NaN detected in {where}!")
        if torch.isinf(tensor).any():
            logger.error(f"[TwoStageDetector] Inf detected in {where}!")

    def _log_losses(stage, loss_dict):
        # Only tensor-valued entries are inspected; others are listed by key.
        logger.debug(f"[TwoStageDetector] {stage} losses: {list(loss_dict.keys())}")
        for loss_name, loss_value in loss_dict.items():
            if isinstance(loss_value, torch.Tensor):
                logger.debug(f"[TwoStageDetector] {stage} {loss_name}: {loss_value.item()}")
                _log_non_finite(loss_value, f"{stage} {loss_name}")

    logger.debug(f"[TwoStageDetector] Starting loss calculation with batch size: {batch_inputs.shape[0]}")
    logger.debug(f"[TwoStageDetector] Input tensor shape: {batch_inputs.shape}, dtype: {batch_inputs.dtype}")

    # Check input for NaN/Inf
    _log_non_finite(batch_inputs, "input batch_inputs")

    x = self.extract_feat(batch_inputs)
    logger.debug(f"[TwoStageDetector] Extracted features: {len(x)} levels")
    for i, feat in enumerate(x):
        logger.debug(f"[TwoStageDetector] Feature level {i} shape: {feat.shape}")
        _log_non_finite(feat, f"feature level {i}")

    losses = {}

    # RPN forward and loss
    if self.with_rpn:
        logger.debug("[TwoStageDetector] Computing RPN losses...")
        # Prefer train_cfg.rpn_proposal, then test_cfg.rpn, then {}.
        proposal_cfg = _sub_cfg(self.train_cfg, "rpn_proposal", _sub_cfg(self.test_cfg, "rpn", {}))
        # Deep copy so that zeroing the labels does not affect the samples
        # later handed to the RoI head.
        rpn_data_samples = copy.deepcopy(batch_data_samples)
        # set cat_id of gt_labels to 0 in RPN
        for data_sample in rpn_data_samples:
            data_sample.gt_instances.labels = torch.zeros_like(data_sample.gt_instances.labels)

        rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict(
            x, rpn_data_samples, proposal_cfg=proposal_cfg
        )

        _log_losses("RPN", rpn_losses)

        # avoid get same name with roi_head loss
        for key in list(rpn_losses):
            if "loss" in key and "rpn" not in key:
                rpn_losses[f"rpn_{key}"] = rpn_losses.pop(key)
        losses.update(rpn_losses)
    else:
        assert getattr(batch_data_samples[0], "proposals", None) is not None
        # use pre-defined proposals in InstanceData for the second stage
        # to extract ROI features.
        rpn_results_list = [data_sample.proposals for data_sample in batch_data_samples]

    logger.debug("[TwoStageDetector] Computing ROI head losses...")
    roi_losses = self.roi_head.loss(x, rpn_results_list, batch_data_samples)

    _log_losses("ROI", roi_losses)

    losses.update(roi_losses)

    logger.debug(f"[TwoStageDetector] Total losses computed: {list(losses.keys())}")

    return losses

predict(batch_inputs, batch_data_samples, rescale=True)

Predict results from a batch of inputs and data samples with post-processing.

Parameters:

Name Type Description Default
batch_inputs Tensor

Inputs with shape (N, C, H, W).

required
batch_data_samples list[DetDataSample]

The data samples. It usually includes information such as gt_instance, gt_panoptic_seg and gt_sem_seg.

required
rescale bool

Whether to rescale the results. Defaults to True.

True

Returns:

Type Description
SampleList

list[DetDataSample]: The detection results of the input images. Each returned value is a DetDataSample, which usually contains 'pred_instances'; pred_instances usually contains the following keys.

  • scores (Tensor): Classification scores, has a shape (num_instance, )
  • labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
  • bboxes (Tensor): Has a shape (num_instances, 4), the last dimension 4 arrange as (x1, y1, x2, y2).
  • masks (Tensor): Has a shape (num_instances, H, W).
Source code in visdet/models/detectors/two_stage.py
def predict(self, batch_inputs: Tensor, batch_data_samples: SampleList, rescale: bool = True) -> SampleList:
    """Run detection on a batch and attach the post-processed results.

    Proposals are taken from the data samples when the first sample carries
    a non-``None`` ``proposals`` attribute; otherwise they are produced by
    the RPN head.

    Args:
        batch_inputs (Tensor): Inputs with shape (N, C, H, W).
        batch_data_samples (List[:obj:`DetDataSample`]): The Data
            Samples. It usually includes information such as
            `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
        rescale (bool): Whether to rescale the results.
            Defaults to True.

    Returns:
        list[:obj:`DetDataSample`]: The detection results of the input
        images, as returned by ``add_pred_to_datasample``. Each
        DetDataSample usually contains 'pred_instances', and
        ``pred_instances`` usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
                (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape
                (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4),
                the last dimension 4 arrange as (x1, y1, x2, y2).
            - masks (Tensor): Has a shape (num_instances, H, W).
    """
    assert self.with_bbox, "Bbox head must be implemented."

    feats = self.extract_feat(batch_inputs)

    has_predefined_proposals = getattr(batch_data_samples[0], "proposals", None) is not None
    if has_predefined_proposals:
        proposals = [sample.proposals for sample in batch_data_samples]
    else:
        # No pre-defined proposals: let the RPN generate them.
        proposals = self.rpn_head.predict(feats, batch_data_samples, rescale=False)

    detections = self.roi_head.predict(feats, proposals, batch_data_samples, rescale=rescale)
    return self.add_pred_to_datasample(batch_data_samples, detections)

Mask R-CNN

MaskRCNN

Bases: TwoStageDetector

Implementation of Mask R-CNN (https://arxiv.org/abs/1703.06870).

Mask R-CNN extends Faster R-CNN by adding a branch for predicting segmentation masks on each Region of Interest (RoI), in parallel with the existing branch for classification and bounding box regression.

Parameters:

Name Type Description Default
backbone ConfigDict

Configuration for the backbone network.

required
rpn_head ConfigDict

Configuration for the RPN head.

required
roi_head ConfigDict

Configuration for the RoI head.

required
train_cfg ConfigDict

Training configuration.

required
test_cfg ConfigDict

Testing configuration.

required
neck OptConfigType

Configuration for the neck network. Default: None.

None
data_preprocessor OptConfigType

Configuration for the data preprocessor. Default: None.

None
init_cfg OptMultiConfig

Initialization configuration. Default: None.

None
Source code in visdet/models/detectors/mask_rcnn.py
@MODELS.register_module()
class MaskRCNN(TwoStageDetector):
    """Implementation of `Mask R-CNN <https://arxiv.org/abs/1703.06870>`_

    Mask R-CNN extends Faster R-CNN by adding a branch for predicting
    segmentation masks on each Region of Interest (RoI), in parallel
    with the existing branch for classification and bounding box regression.

    The class adds no logic of its own: every argument is handed unchanged
    to :class:`TwoStageDetector`. Each argument other than ``init_cfg`` can
    be a ConfigDict or a direct object.

    Args:
        backbone: Configuration (or object) for the backbone network.
        rpn_head: Configuration (or object) for the RPN head.
        roi_head: Configuration (or object) for the RoI head.
        train_cfg: Training configuration.
        test_cfg: Testing configuration.
        neck (optional): Configuration (or object) for the neck network.
            Default: None.
        data_preprocessor (optional): Configuration (or object) for the
            data preprocessor. Default: None.
        init_cfg (optional): Initialization configuration.
            Default: None.
    """

    def __init__(
        self,
        backbone,
        rpn_head,
        roi_head,
        train_cfg,
        test_cfg,
        neck=None,
        data_preprocessor=None,
        init_cfg=None,
    ) -> None:
        # Group the arguments by role before delegating to the parent.
        components = dict(backbone=backbone, neck=neck, rpn_head=rpn_head, roi_head=roi_head)
        settings = dict(
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            data_preprocessor=data_preprocessor,
            init_cfg=init_cfg,
        )
        super().__init__(**components, **settings)

Backbones

backbones

Necks

necks

Heads

dense_heads

See Also