
    .hd                        d dl mZ d dlZd dlmZ d dlmZmZ  G d dej                        Z G d dej                        Z	y)	    )annotationsN)nn)MLPLayerNorm2dc                       e Zd ZdZdej
                  ddf	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 d	dZ xZ	S )
MaskDecodera  
    Decoder module for generating masks and their associated quality scores using a transformer architecture.

    This class predicts masks given image and prompt embeddings, utilizing a transformer to process the inputs and
    generate mask predictions along with their quality scores.

    Attributes:
        transformer_dim (int): Channel dimension for the transformer module.
        transformer (nn.Module): Transformer module used for mask prediction.
        num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
        iou_token (nn.Embedding): Embedding for the IoU token.
        num_mask_tokens (int): Number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for the mask tokens.
        output_upscaling (nn.Sequential): Neural network sequence for upscaling the output.
        output_hypernetworks_mlps (nn.ModuleList): Hypernetwork MLPs for generating masks.
        iou_prediction_head (nn.Module): MLP for predicting mask quality.

    Methods:
        forward: Predict masks given image and prompt embeddings.
        predict_masks: Internal method for mask prediction.

    Examples:
        >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
        >>> masks, iou_pred = decoder(
        ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, multimask_output=True
        ... )
        >>> print(f"Predicted masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
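
    Notes:
        With multimask_output=True, forward returns the num_multimask_outputs masks from tokens 1 onward; with
        multimask_output=False it returns only the single mask from token 0 (see the slicing in forward below).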
    """

    def __init__(
        self,
        transformer_dim: int,
        transformer: nn.Module,
        num_multimask_outputs: int = 3,
        activation: type[nn.Module] = nn.GELU,
        iou_head_depth: int = 3,
        iou_head_hidden_dim: int = 256,
    ) -> None:
        """
        Initialize the MaskDecoder module for generating masks and their associated quality scores.

        Args:
            transformer_dim (int): Channel dimension for the transformer module.
            transformer (nn.Module): Transformer module used for mask prediction.
            num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
            activation (Type[nn.Module]): Type of activation to use when upscaling masks.
            iou_head_depth (int): Depth of the MLP used to predict mask quality.
            iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.

        Examples:
            >>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
            >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer)
            >>> print(decoder)
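            >>> decoder.num_mask_tokens  # one token per multimask output plus one for the single-mask output
            4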
        """
        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer
        self.num_multimask_outputs = num_multimask_outputs

        self.iou_token = nn.Embedding(1, transformer_dim)
        self.num_mask_tokens = num_multimask_outputs + 1
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)

        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
            activation(),
        )
        self.output_hypernetworks_mlps = nn.ModuleList(
            [MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for _ in range(self.num_mask_tokens)]
        )
        self.iou_prediction_head = MLP(transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth)

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder.
            image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings.
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes.
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs.
            multimask_output (bool): Whether to return multiple masks or a single mask.

        Returns:
            masks (torch.Tensor): Batched predicted masks.
            iou_pred (torch.Tensor): Batched predictions of mask quality.

        Examples:
            >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
            >>> image_emb = torch.rand(1, 256, 64, 64)
            >>> image_pe = torch.rand(1, 256, 64, 64)
            >>> sparse_emb = torch.rand(1, 2, 256)
            >>> dense_emb = torch.rand(1, 256, 64, 64)
            >>> masks, iou_pred = decoder(image_emb, image_pe, sparse_emb, dense_emb, multimask_output=True)
            >>> print(f"Masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
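
        Notes:
            The transformer predicts num_multimask_outputs + 1 candidate masks in a single pass; this method
            slices out either the single-mask output (token 0) or the multimask outputs (tokens 1 and up).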
        """
        masks, iou_pred = self.predict_masks(
            image_embeddings=image_embeddings,
            image_pe=image_pe,
            sparse_prompt_embeddings=sparse_prompt_embeddings,
            dense_prompt_embeddings=dense_prompt_embeddings,
        )

        # Select the correct mask or masks for output
        mask_slice = slice(1, None) if multimask_output else slice(0, 1)
        masks = masks[:, mask_slice, :, :]
        iou_pred = iou_pred[:, mask_slice]

        return masks, iou_pred

    def predict_masks(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Predict masks and quality scores using image and prompt embeddings via transformer architecture."""
        # Concatenate output tokens
        output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
        output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.shape[0], -1, -1)
        tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)

        # Expand per-image data in batch direction to be per-mask
        src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
        src = src + dense_prompt_embeddings
        pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
        b, c, h, w = src.shape

        # Run the transformer
        hs, src = self.transformer(src, pos_src, tokens)
        iou_token_out = hs[:, 0, :]
        mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]

        # Upscale mask embeddings and predict masks using the mask tokens
        src = src.transpose(1, 2).view(b, c, h, w)
        upscaled_embedding = self.output_upscaling(src)
        hyper_in_list = [
            self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
        ]
        hyper_in = torch.stack(hyper_in_list, dim=1)
        b, c, h, w = upscaled_embedding.shape
        masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)

        # Generate mask quality predictions
        iou_pred = self.iou_prediction_head(iou_token_out)

        return masks, iou_pred


class SAM2MaskDecoder(nn.Module):
    """
    Transformer-based decoder for predicting instance segmentation masks from image and prompt embeddings.

    This class extends the functionality of the MaskDecoder, incorporating additional features such as
    high-resolution feature processing, dynamic multimask output, and object score prediction.

    Attributes:
        transformer_dim (int): Channel dimension of the transformer.
        transformer (nn.Module): Transformer used to predict masks.
        num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
        iou_token (nn.Embedding): Embedding for IOU token.
        num_mask_tokens (int): Total number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for mask tokens.
        pred_obj_scores (bool): Whether to predict object scores.
        obj_score_token (nn.Embedding): Embedding for object score token.
        use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.
        output_upscaling (nn.Sequential): Upscaling layers for output.
        use_high_res_features (bool): Whether to use high-resolution features.
        conv_s0 (nn.Conv2d): Convolutional layer for high-resolution features (s0).
        conv_s1 (nn.Conv2d): Convolutional layer for high-resolution features (s1).
        output_hypernetworks_mlps (nn.ModuleList): List of MLPs for output hypernetworks.
        iou_prediction_head (MLP): MLP for IOU prediction.
        pred_obj_score_head (nn.Linear | MLP): Linear layer or MLP for object score prediction.
        dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
        dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
        dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.

    Methods:
        forward: Predict masks given image and prompt embeddings.
        predict_masks: Predict instance segmentation masks from image and prompt embeddings.
        _get_stability_scores: Compute mask stability scores based on IoU between thresholds.
        _dynamic_multimask_via_stability: Dynamically select the most stable mask output.

    Examples:
        >>> image_embeddings = torch.rand(1, 256, 64, 64)
        >>> image_pe = torch.rand(1, 256, 64, 64)
        >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
        >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
        >>> decoder = SAM2MaskDecoder(256, transformer)
        >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
        ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
        ... )
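
    Notes:
        When pred_obj_scores is True, an object-score token is prepended ahead of the IoU token, which shifts
        the IoU and mask token indices by one inside predict_masks.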
    """

    def __init__(
        self,
        transformer_dim: int,
        transformer: nn.Module,
        num_multimask_outputs: int = 3,
        activation: type[nn.Module] = nn.GELU,
        iou_head_depth: int = 3,
        iou_head_hidden_dim: int = 256,
        use_high_res_features: bool = False,
        iou_prediction_use_sigmoid: bool = False,
        dynamic_multimask_via_stability: bool = False,
        dynamic_multimask_stability_delta: float = 0.05,
        dynamic_multimask_stability_thresh: float = 0.98,
        pred_obj_scores: bool = False,
        pred_obj_scores_mlp: bool = False,
        use_multimask_token_for_obj_ptr: bool = False,
    ) -> None:
        """
        Initialize the SAM2MaskDecoder module for predicting instance segmentation masks.

        This decoder extends the functionality of MaskDecoder, incorporating additional features such as
        high-resolution feature processing, dynamic multimask output, and object score prediction.

        Args:
            transformer_dim (int): Channel dimension of the transformer.
            transformer (nn.Module): Transformer used to predict masks.
            num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
            activation (Type[nn.Module]): Type of activation to use when upscaling masks.
            iou_head_depth (int): Depth of the MLP used to predict mask quality.
            iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.
            use_high_res_features (bool): Whether to use high-resolution features.
            iou_prediction_use_sigmoid (bool): Whether to use sigmoid for IOU prediction.
            dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
            dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
            dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.
            pred_obj_scores (bool): Whether to predict object scores.
            pred_obj_scores_mlp (bool): Whether to use MLP for object score prediction.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.

        Examples:
            >>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
            >>> decoder = SAM2MaskDecoder(transformer_dim=256, transformer=transformer)
            >>> print(decoder)
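
        Notes:
            When use_high_res_features is True, predict_masks expects two high-resolution feature maps and adds
            them into the two upscaling stages (the second entry after the first transposed convolution, the
            first entry after the second).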
        """
        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer
        self.num_multimask_outputs = num_multimask_outputs

        self.iou_token = nn.Embedding(1, transformer_dim)
        self.num_mask_tokens = num_multimask_outputs + 1
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)

        self.pred_obj_scores = pred_obj_scores
        if self.pred_obj_scores:
            self.obj_score_token = nn.Embedding(1, transformer_dim)
        self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr

        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
            activation(),
        )
        self.use_high_res_features = use_high_res_features
        if use_high_res_features:
            self.conv_s0 = nn.Conv2d(transformer_dim, transformer_dim // 8, kernel_size=1, stride=1)
            self.conv_s1 = nn.Conv2d(transformer_dim, transformer_dim // 4, kernel_size=1, stride=1)

        self.output_hypernetworks_mlps = nn.ModuleList(
            [MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for _ in range(self.num_mask_tokens)]
        )
        self.iou_prediction_head = MLP(
            transformer_dim,
            iou_head_hidden_dim,
            self.num_mask_tokens,
            iou_head_depth,
            sigmoid=iou_prediction_use_sigmoid,
        )
        if self.pred_obj_scores:
            self.pred_obj_score_head = nn.Linear(transformer_dim, 1)
            if pred_obj_scores_mlp:
                self.pred_obj_score_head = MLP(transformer_dim, transformer_dim, 1, 3)

        # When outputting a single mask, optionally fall back to the best multimask output
        # if the single-mask stability score drops below a threshold
        self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
        self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
        self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
        repeat_image: bool,
        high_res_features: list[torch.Tensor] | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder with shape (B, C, H, W).
            image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings (B, C, H, W).
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes with shape (B, N, C).
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs with shape (B, C, H, W).
            multimask_output (bool): Whether to return multiple masks or a single mask.
            repeat_image (bool): Flag to repeat the image embeddings.
            high_res_features (list[torch.Tensor] | None, optional): Optional high-resolution features.

        Returns:
            masks (torch.Tensor): Batched predicted masks with shape (B, N, H, W).
            iou_pred (torch.Tensor): Batched predictions of mask quality with shape (B, N).
            sam_tokens_out (torch.Tensor): Batched SAM token for mask output with shape (B, N, C).
            object_score_logits (torch.Tensor): Batched object score logits with shape (B, 1).

        Examples:
            >>> image_embeddings = torch.rand(1, 256, 64, 64)
            >>> image_pe = torch.rand(1, 256, 64, 64)
            >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
            >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
            >>> decoder = SAM2MaskDecoder(256, transformer)
            >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
            ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
            ... )
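
        Notes:
            When multimask_output is False and dynamic_multimask_via_stability is enabled, inference may replace
            the single-mask output with the highest-scoring multimask output if its stability score is low (see
            _dynamic_multimask_via_stability).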
        )r+   r,   r-   r.   repeat_imagehigh_res_featuresNr   r   )r/   rp   training _dynamic_multimask_via_stabilityri   )r#   r+   r,   r-   r.   r1   rv   rw   r2   r3   rO   object_score_logitssam_tokens_outs                r)   r5   zSAM2MaskDecoder.forward8  s    L AE@R@R-%=$;%/ AS A
=x*= !QRA+&E12H11$--"CCE8TOE8!QqS!Q,'E1Q3'H D D,QU3N -Q!V4Nh0CCCr6   c           
        d}| j                   rYt        j                  | j                  j                  | j
                  j                  | j                  j                  gd      }d}nAt        j                  | j
                  j                  | j                  j                  gd      }|j                  d      j                  |j                  d   dd      }t        j                  ||fd      }	|r&t        j                  ||	j                  d   d      }
n#|j                  d   |	j                  d   k(  sJ |}
|
|z   }
|j                  d   dk(  sJ d       t        j                  ||	j                  d   d      }|
j                  \  }}}}| j                  |
||	      \  }}
|dd|ddf   }|dd|dz   |dz   | j                  z   ddf   }|
j                  dd      j                  ||||      }
| j                  r|| j!                  |
      }n?| j                   \  }}}}}|\  }} | | ||
      |z               } | ||      |z         }t#        | j                        D cg c]!  } | j$                  |   |dd|ddf         # }}t        j&                  |d      }|j                  \  }}}}||j                  ||||z        z  j                  |d||      }| j)                  |      }| j                   r#|dk(  sJ | j+                  |dddddf         } n"d|j-                  |j                  d   d      z  } |||| fS c c}w )	zYPredict instance segmentation masks from image and prompt embeddings using a transformer.r   r8   r   r:   z@image_pe should have size 1 in batch dim (from `get_dense_pe()`)Nr   g      $@)rg   r;   r<   rh   r=   r   r   r>   r?   r@   rA   r   r   rB   rC   rj   r   r    r!   rD   r"   ro   new_ones)!r#   r+   r,   r-   r.   rv   rw   srE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   dc1ln1act1dc2act2feat_s0feat_s1rQ   rR   rS   r2   r3   rz   s!                                    r)   r/   zSAM2MaskDecoder.predict_masks}  sd    !II((//NN))$$++
 M A!IIt~~'<'<d>N>N>U>U&V\]^M%//299:R:X:XYZ:[]_acdM+CD!L ))*:FLLOQRSC#))!,Q???"C++~~a A%i'ii%))(FLLOKYY
1a ""38C1a7QAQ1E1E)E FIJ mmAq!&&q!Q2))->-F!%!6!6s!;(,(=(=%CdC0GW!%c#c(W*<&=!>!%c*<&=&G!H RWW[WkWkQl-
LM-D**1-oaAg.FG-
 -
 ;;}!4'--
1a.33Aq!a%@@FFq"aQRS ++M:6M6"&":":2aAg;"G #'):):8>>!;La)P"Ph1DDD!-
s   -&Mc                   |j                  d      }| j                  }t        j                  ||kD  d      j	                         }t        j                  || kD  d      j	                         }t        j
                  |dkD  ||z  d      S )zNCompute mask stability scores based on IoU between upper and lower thresholds.r:   r8   r   g      ?)flattenrq   r;   sumfloatwhere)r#   mask_logitsstability_deltaarea_iarea_us        r)   _get_stability_scoresz%SAM2MaskDecoder._get_stability_scores  sw    !))"-@@;8bAGGI;/)99rBHHJ{{6A:v<<r6   c                B   |ddddddddf   }|ddddf   }t        j                  |d      }t        j                  |j                  d   |j                        }|||f   }|j                  d      }|||f   }|j                  d      }|ddddddddf   }	|ddddf   }
| j                  |	      }|| j                  k\  }t        j                  |d   j                  |	      |	|      }t        j                  |j                  |
      |
|      }||fS )a  
        Dynamically select the most stable mask output based on stability scores and IoU predictions.

        This method is used when outputting a single mask. If the stability score from the current single-mask
        output (based on output token 0) falls below a threshold, it instead selects from multi-mask outputs
        (based on output tokens 1-3) the mask with the highest predicted IoU score. This ensures a valid mask
        for both clicking and tracking scenarios.

        Args:
            all_mask_logits (torch.Tensor): Logits for all predicted masks, shape (B, N, H, W) where B is
                batch size, N is number of masks (typically 4), and H, W are mask dimensions.
            all_iou_scores (torch.Tensor): Predicted IoU scores for all masks, shape (B, N).

        Returns:
            mask_logits_out (torch.Tensor): Selected mask logits, shape (B, 1, H, W).
            iou_scores_out (torch.Tensor): Selected IoU scores, shape (B, 1).

        Examples:
            >>> decoder = SAM2MaskDecoder(...)
            >>> all_mask_logits = torch.rand(2, 4, 256, 256)  # 2 images, 4 masks each
            >>> all_iou_scores = torch.rand(2, 4)
            >>> mask_logits, iou_scores = decoder._dynamic_multimask_via_stability(all_mask_logits, all_iou_scores)
            >>> print(mask_logits.shape, iou_scores.shape)
            torch.Size([2, 1, 256, 256]) torch.Size([2, 1])
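
        Notes:
            Stability is measured by _get_stability_scores as the IoU between the mask binarized at logit
            thresholds +delta and -delta; batch entries at or above dynamic_multimask_stability_thresh keep
            the single-mask prediction.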
        """
        # The best mask from multimask output tokens (1~3)
        multimask_logits = all_mask_logits[:, 1:, :, :]
        multimask_iou_scores = all_iou_scores[:, 1:]
        best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1)
        batch_inds = torch.arange(multimask_iou_scores.shape[0], device=all_iou_scores.device)
        best_multimask_logits = multimask_logits[batch_inds, best_scores_inds].unsqueeze(1)
        best_multimask_iou_scores = multimask_iou_scores[batch_inds, best_scores_inds].unsqueeze(1)

        # The mask from singlemask output token 0 and its stability score
        singlemask_logits = all_mask_logits[:, 0:1, :, :]
        singlemask_iou_scores = all_iou_scores[:, 0:1]
        stability_scores = self._get_stability_scores(singlemask_logits)
        is_stable = stability_scores >= self.dynamic_multimask_stability_thresh

        # Dynamically fall back to the best multimask output upon low stability scores
        mask_logits_out = torch.where(
            is_stable[..., None, None].expand_as(singlemask_logits),
            singlemask_logits,
            best_multimask_logits,
        )
        iou_scores_out = torch.where(
            is_stable.expand_as(singlemask_iou_scores),
            singlemask_iou_scores,
            best_multimask_iou_scores,
        )
        return mask_logits_out, iou_scores_out