
from __future__ import annotations

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.init import trunc_normal_

from ultralytics.nn.modules import MLP
from ultralytics.utils import LOGGER

from .blocks import SAM2TwoWayTransformer
from .decoders import MaskDecoder, SAM2MaskDecoder
from .encoders import ImageEncoderViT, PromptEncoder
from .utils import get_1d_sine_pe, select_closest_cond_frames

# A large negative value used as a placeholder score for missing objects
NO_OBJ_SCORE = -1024.0


class SAMModel(nn.Module):
    """
    Segment Anything Model (SAM) for object segmentation tasks.

    This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images
    and input prompts.

    Attributes:
        mask_threshold (float): Threshold value for mask prediction.
        image_encoder (ImageEncoderViT): Backbone for encoding images into embeddings.
        prompt_encoder (PromptEncoder): Encoder for various types of input prompts.
        mask_decoder (MaskDecoder): Predicts object masks from image and prompt embeddings.
        pixel_mean (torch.Tensor): Mean values for normalizing pixels in the input image.
        pixel_std (torch.Tensor): Standard deviation values for normalizing pixels in the input image.

    Methods:
        set_imgsz: Set image size to make model compatible with different image sizes.

    Examples:
        >>> image_encoder = ImageEncoderViT(...)
        >>> prompt_encoder = PromptEncoder(...)
        >>> mask_decoder = MaskDecoder(...)
        >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
        >>> # Further usage depends on SAMPredictor class

    Notes:
        All forward() operations are implemented in the SAMPredictor class.
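        A minimal preprocessing sketch, assuming `img` is a float BCHW tensor already resized for the
        encoder (the buffer names below are the ones registered by this class):
        >>> # normalized = (img - sam_model.pixel_mean) / sam_model.pixel_std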
    """

    mask_threshold: float = 0.0

    def __init__(
        self,
        image_encoder: ImageEncoderViT,
        prompt_encoder: PromptEncoder,
        mask_decoder: MaskDecoder,
        pixel_mean: list[float] = (123.675, 116.28, 103.53),
        pixel_std: list[float] = (58.395, 57.12, 57.375),
    ) -> None:
        """
        Initialize the SAMModel class to predict object masks from an image and input prompts.

        Args:
            image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
            prompt_encoder (PromptEncoder): Encodes various types of input prompts.
            mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
            pixel_mean (list[float]): Mean values for normalizing pixels in the input image.
            pixel_std (list[float]): Standard deviation values for normalizing pixels in the input image.

        Examples:
            >>> image_encoder = ImageEncoderViT(...)
            >>> prompt_encoder = PromptEncoder(...)
            >>> mask_decoder = MaskDecoder(...)
            >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
            >>> # Further usage depends on SAMPredictor class

        Notes:
            All forward() operations moved to SAMPredictor.
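            The normalization statistics are registered as non-persistent buffers shaped for broadcasting;
            a hedged check of the resulting shape (it follows from `view(-1, 1, 1)` on a 3-value list):
            >>> # sam_model.pixel_mean.shape  # expected: torch.Size([3, 1, 1])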
        """
        super().__init__()
        self.image_encoder = image_encoder
        self.prompt_encoder = prompt_encoder
        self.mask_decoder = mask_decoder
        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

    def set_imgsz(self, imgsz):
        """Set image size to make model compatible with different image sizes."""
        if hasattr(self.image_encoder, "set_imgsz"):
            self.image_encoder.set_imgsz(imgsz)
        self.prompt_encoder.input_image_size = imgsz
        self.prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # 16 is the fixed ViT patch stride
        self.image_encoder.img_size = imgsz[0]


class SAM2Model(torch.nn.Module):
    """
    SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.

    This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms
    for temporal consistency and efficient tracking of objects across frames.

    Attributes:
        mask_threshold (float): Threshold value for mask prediction.
        image_encoder (ImageEncoderViT): Visual encoder for extracting image features.
        memory_attention (nn.Module): Module for attending to memory features.
        memory_encoder (nn.Module): Encoder for generating memory representations.
        num_maskmem (int): Number of accessible memory frames.
        image_size (int): Size of input images.
        backbone_stride (int): Stride of the backbone network output.
        sam_prompt_embed_dim (int): Dimension of SAM prompt embeddings.
        sam_image_embedding_size (int): Size of SAM image embeddings.
        sam_prompt_encoder (PromptEncoder): Encoder for processing input prompts.
        sam_mask_decoder (SAM2MaskDecoder): Decoder for generating object masks.
        obj_ptr_proj (nn.Module): Projection layer for object pointers.
        obj_ptr_tpos_proj (nn.Module): Projection for temporal positional encoding in object pointers.
        hidden_dim (int): Hidden dimension of the model.
        mem_dim (int): Memory dimension for encoding features.
        use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
        use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
        max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder cross-attention.
        add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers.
        proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
            encoding in object pointers.
        use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in temporal positional encoding.
        only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past during
            evaluation.
        pred_obj_scores (bool): Whether to predict if there is an object in the frame.
        pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
        fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
        soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
        use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
        no_obj_embed_spatial (torch.Tensor | None): No-object embedding for spatial frames.
        max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
        directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
            first frame.
        multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial
            conditioning frames.
        multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
        multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
        multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
        use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
        iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
        memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
        non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
            memory encoder during evaluation.
        sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
        sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
        binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
            with clicks during evaluation.
        use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
            prompt encoder and mask decoder on frames with mask input.
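        A hedged sketch of how the two sigmoid factors above are applied when building memory features
        (mirroring the scaling in `_encode_new_memory`; `pred_masks` here is illustrative):
        >>> # mask_for_mem = torch.sigmoid(pred_masks) * sigmoid_scale_for_mem_enc + sigmoid_bias_for_mem_enc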

    Methods:
        forward_image: Process image batch through encoder to extract multi-level features.
        track_step: Perform a single tracking step, updating object masks and memory features.
        set_binarize: Set binarize for VideoPredictor.
        set_imgsz: Set image size to make model compatible with different image sizes.

    Examples:
        >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
        >>> image_batch = torch.rand(1, 3, 512, 512)
        >>> features = model.forward_image(image_batch)
        >>> track_results = model.track_step(0, True, features, None, None, None, {})
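        A hedged sketch of a per-frame tracking loop (the dictionary keys follow this class's conventions;
        the real driver logic lives in the predictor classes, so this is illustrative only):
        >>> # output_dict = {"cond_frame_outputs": {}, "non_cond_frame_outputs": {}}
        >>> # for t in range(num_frames):
        >>> #     out = model.track_step(t, t == 0, feats, pos, sizes, points, None, output_dict, num_frames)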
    """

    mask_threshold: float = 0.0

    def __init__(
        self,
        image_encoder,
        memory_attention,
        memory_encoder,
        num_maskmem=7,
        image_size=512,
        backbone_stride=16,
        sigmoid_scale_for_mem_enc=1.0,
        sigmoid_bias_for_mem_enc=0.0,
        binarize_mask_from_pts_for_mem_enc=False,
        use_mask_input_as_output_without_sam=False,
        max_cond_frames_in_attn=-1,
        directly_add_no_mem_embed=False,
        use_high_res_features_in_sam=False,
        multimask_output_in_sam=False,
        multimask_min_pt_num=1,
        multimask_max_pt_num=1,
        multimask_output_for_tracking=False,
        use_multimask_token_for_obj_ptr=False,
        iou_prediction_use_sigmoid=False,
        memory_temporal_stride_for_eval=1,
        non_overlap_masks_for_mem_enc=False,
        use_obj_ptrs_in_encoder=False,
        max_obj_ptrs_in_encoder=16,
        add_tpos_enc_to_obj_ptrs=True,
        proj_tpos_enc_in_obj_ptrs=False,
        use_signed_tpos_enc_to_obj_ptrs=False,
        only_obj_ptrs_in_the_past_for_eval=False,
        pred_obj_scores: bool = False,
        pred_obj_scores_mlp: bool = False,
        fixed_no_obj_ptr: bool = False,
        soft_no_obj_ptr: bool = False,
        use_mlp_for_obj_ptr_proj: bool = False,
        no_obj_embed_spatial: bool = False,
        sam_mask_decoder_extra_args=None,
        compile_image_encoder: bool = False,
    ):
        """
        Initialize the SAM2Model for video object segmentation with memory-based tracking.

        Args:
            image_encoder (nn.Module): Visual encoder for extracting image features.
            memory_attention (nn.Module): Module for attending to memory features.
            memory_encoder (nn.Module): Encoder for generating memory representations.
            num_maskmem (int): Number of accessible memory frames.
            image_size (int): Size of input images.
            backbone_stride (int): Stride of the image backbone output.
            sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
            sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
            binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
                with clicks during evaluation.
            use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
                prompt encoder and mask decoder on frames with mask input.
            max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
            directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
                first frame.
            use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
            multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial
                conditioning frames.
            multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
            multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
            multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
            iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
            memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
            non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
                memory encoder during evaluation.
            use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
            max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder
                cross-attention.
            add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in
                the encoder.
            proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
                encoding in object pointers.
            use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in the temporal positional encoding
                in the object pointers.
            only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past
                during evaluation.
            pred_obj_scores (bool): Whether to predict if there is an object in the frame.
            pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
            fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
            soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
            use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
            no_obj_embed_spatial (bool): Whether to add a no-object embedding to spatial memory frames.
            sam_mask_decoder_extra_args (dict | None): Extra arguments for constructing the SAM mask decoder.
            compile_image_encoder (bool): Whether to compile the image encoder for faster inference.

        Examples:
            >>> image_encoder = ImageEncoderViT(...)
            >>> memory_attention = SAM2TwoWayTransformer(...)
            >>> memory_encoder = nn.Sequential(...)
            >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
            >>> image_batch = torch.rand(1, 3, 512, 512)
            >>> features = model.forward_image(image_batch)
            >>> track_results = model.track_step(0, True, features, None, None, None, {})
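            A hedged configuration sketch (all flag names are documented in Args above; the values are
            illustrative, not recommended defaults):
            >>> # model = SAM2Model(
            >>> #     image_encoder, memory_attention, memory_encoder,
            >>> #     use_high_res_features_in_sam=True, use_obj_ptrs_in_encoder=True, pred_obj_scores=True,
            >>> # )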
           r      )kernel_sizestrideout_projweightr   g{Gz?)stdNzFImage encoder compilation is enabled. First forward pass will be slow.zmax-autotuneTF)mode	fullgraphdynamic)<r   r   r   use_high_res_features_in_samnum_feature_levelsuse_obj_ptrs_in_encodermax_obj_ptrs_in_encoderr   r   Conv2dmask_downsampleadd_tpos_enc_to_obj_ptrsproj_tpos_enc_in_obj_ptrsuse_signed_tpos_enc_to_obj_ptrs"only_obj_ptrs_in_the_past_for_evalmemory_attentiond_model
hidden_dimmemory_encodermem_dimr*   rB   rC   shapenum_maskmem	Parameterzerosmaskmem_tpos_encr   no_mem_embedno_mem_pos_encdirectly_add_no_mem_embedsigmoid_scale_for_mem_encsigmoid_bias_for_mem_enc"binarize_mask_from_pts_for_mem_encnon_overlap_masks_for_mem_encmemory_temporal_stride_for_eval$use_mask_input_as_output_without_sammultimask_output_in_sammultimask_min_pt_nummultimask_max_pt_nummultimask_output_for_trackinguse_multimask_token_for_obj_ptriou_prediction_use_sigmoid
image_sizebackbone_stridesam_mask_decoder_extra_argspred_obj_scorespred_obj_scores_mlpfixed_no_obj_ptrsoft_no_obj_ptr
no_obj_ptruse_mlp_for_obj_ptr_projno_obj_embed_spatial_build_sam_headsmax_cond_frames_in_attnr   infocompileforward)%r"   r   rR   rU   rX   rk   rl   r_   r`   ra   rd   rv   r^   rH   re   rf   rg   rh   ri   rj   rc   rb   rJ   rK   rN   rO   rP   rQ   rn   ro   rp   rq   rs   rt   rm   compile_image_encoderr#   s%                                       r$   r   zSAM2Model.__init__   sm   B 	 +,H)'C!'>$'>$" $)88??1aQq?#QD (@%$+++)B&/N,2T/ !1*22 -4&&
3@S@S@\@\^f8g..77>>DDQGDL& % 2 25;;{AqRVR^R^3_ `d++6!HH..u{{1a/QR#hh00Q4??1STd''T2d))t4)B& *C&(@%2T/-J*/N, 5Y1'>$$8!$8!-J*/N,*D' %.+F(.#6  0.  ''''////D$@$@#hh00Q1PQDO$//t4(@%$(!(-(:(:5;;q$,,;W(XD%$33>'>$ !KK`a).""**#	*D& !r%   c                H    t        | j                               j                  S )z=Return the device on which the model's parameters are stored.)next
parametersdevicer"   s    r$   r~   zSAM2Model.devicel  s     DOO%&---r%   c                    t        d      )zWProcess image and prompt inputs to generate object masks and scores in video sequences.zPlease use the corresponding methods in SAM2VideoPredictor for inference.See notebooks/video_predictor_example.ipynb for an example.)NotImplementedError)r"   argskwargss      r$   ry   zSAM2Model.forwardq  s    !J
 	
r%   c                   | j                   | _        | j                  | j                  z  | _        t        | j                  | j                  | j                  f| j                  | j                  fd      | _        t        ddt        d| j                  dd      | j                  dd| j                  | j                  | j                  | j                  | j                  d	
| j                  xs i | _        | j                   rwt"        j$                  j'                  | j                   | j                         | _        | j*                  rUt-        | j                   | j                   | j                   d      | _        n#t"        j$                  j/                         | _        | j0                  r:t"        j$                  j'                  | j                   | j2                        | _        y
t"        j$                  j/                         | _        y
)zMBuild SAM-style prompt encoder and mask decoder for image segmentation tasks.r)   )	embed_dimr,   r+   mask_in_chansr>      i      )depthembedding_dimmlp_dim	num_heads   )
num_multimask_outputstransformertransformer_dimiou_head_depthiou_head_hidden_dimuse_high_res_featuresrj   rn   ro   ri   N )rT   sam_prompt_embed_dimrk   rl   sam_image_embedding_sizer   sam_prompt_encoderr   r	   rH   rj   rn   ro   ri   rm   sam_mask_decoderrJ   r   r   Linearobj_ptr_projrs   r   IdentityrO   rV   obj_ptr_tpos_projr   s    r$   ru   zSAM2Model._build_sam_headsx  s   $(OO!(,4;O;O(O% #0//----" #oot?#
 !0 !
"#-"77	 !55 #"&"C"C'+'F'F 00 $ 8 8,0,P,P!
  //52!!
$ '' % QD,,$'$//[\$]! % 1 1 3D)) &+XX__T__dll%SD"%*XX%6%6%8D"r%   c           	        |j                   d   }|j                  }|j                  d      | j                  k(  sJ |j                  d      | j                  k(  sJ |j                  d      | j                  k(  sJ |0|d   }|d   }	|j                   d   |k(  r|	j                   d   |k(  sNJ t        j                  |dd||j                        }t        j                  |dt
        j                  |	       }	|t        |j                         d
k(  r|j                   dd |dfk(  sJ |j                   dd | j                  j                  k7  r=t        j                  |j                         | j                  j                  ddd      }
n|}
nd}
| j                  ||	fd|
      \  }}| j!                  || j                  j#                         |||d|      \  }}}}| j$                  r(|dkD  }t        j&                  |ddddf   |t(              }t        j                  || j*                  | j*                  fdd      }|dddf   }|rvt        j,                  |d      }t        j.                  ||      }|||f   j1                  d      }|||f   j1                  d      }|j                  d      dkD  r|||f   }n||}}| j3                  |      }| j$                  r^| j4                  r|j7                         }nj9                  |j                        }| j:                  r||z  }|d|z
  | j<                  z  z   }|||||||fS )a{
  
        Forward pass through SAM prompt encoders and mask heads.

        This method processes image features and optional point/mask inputs to generate object masks and scores.

        Args:
            backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
            point_inputs (dict[str, torch.Tensor] | None): Dictionary containing point prompts.
                'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing absolute
                    pixel-unit coordinates in (x, y) format for P input points.
                'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks,
                    0 means negative clicks, and -1 means padding.
            mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the
                same spatial size as the image.
            high_res_features (list[torch.Tensor] | None): List of two feature maps with shapes
                (B, C, 4*H, 4*W) and (B, C, 2*H, 2*W) respectively, used as high-resolution feature maps
                for SAM decoder.
            multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False,
                output only 1 mask and its IoU estimate.
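            A hedged sketch of a valid point prompt for one image (label convention as described above:
            1 positive, 0 negative, -1 padding; coordinates are absolute pixels):
            >>> # point_inputs = {
            >>> #     "point_coords": torch.tensor([[[320.0, 240.0]]]),  # shape (1, 1, 2)
            >>> #     "point_labels": torch.tensor([[1]], dtype=torch.int32),  # shape (1, 1)
            >>> # }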

        Returns:
            low_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
            high_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*16, W*16) with upsampled mask logits.
            ious (torch.Tensor): Tensor of shape (B, M) with estimated IoU for each output mask.
            low_res_masks (torch.Tensor): Tensor of shape (B, 1, H*4, W*4) with the best low-resolution mask.
            high_res_masks (torch.Tensor): Tensor of shape (B, 1, H*16, W*16) with the best high-resolution mask.
            obj_ptr (torch.Tensor): Tensor of shape (B, C) with object pointer vector for the output mask.
            object_score_logits (torch.Tensor): Tensor of shape (B) with object score logits.

        Examples:
            >>> backbone_features = torch.rand(1, 256, 32, 32)
            >>> point_inputs = {"point_coords": torch.rand(1, 2, 2), "point_labels": torch.tensor([[1, 0]])}
            >>> mask_inputs = torch.rand(1, 1, 512, 512)
            >>> results = model._forward_sam_heads(backbone_features, point_inputs, mask_inputs)
            >>> (
            ...     low_res_multimasks,
            ...     high_res_multimasks,
            ...     ious,
            ...     low_res_masks,
            ...     high_res_masks,
            ...     obj_ptr,
            ...     object_score_logits,
            ... ) = results
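            With `multimask_output=True`, the single-mask outputs correspond to the candidate with the
            highest predicted IoU (a hedged restatement of the argmax-based selection in this method):
            >>> # best = ious.argmax(dim=-1)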
        """
        B = backbone_features.size(0)
        device = backbone_features.device
        assert backbone_features.size(1) == self.sam_prompt_embed_dim
        assert backbone_features.size(2) == self.sam_image_embedding_size
        assert backbone_features.size(3) == self.sam_image_embedding_size

        # a) Handle point prompts
        if point_inputs is not None:
            sam_point_coords = point_inputs["point_coords"]
            sam_point_labels = point_inputs["point_labels"]
            assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
        else:
            # If no points are provided, pad with an empty point (with label -1)
            sam_point_coords = torch.zeros(B, 1, 2, device=device)
            sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)

        # b) Handle mask prompts
        if mask_inputs is not None:
            # If mask_inputs is provided, downsize it to the low-res mask input size if needed
            # and feed it as a dense mask prompt into the SAM prompt encoder
            assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
            if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
                sam_mask_prompt = F.interpolate(
                    mask_inputs.float(),
                    size=self.sam_prompt_encoder.mask_input_size,
                    align_corners=False,
                    mode="bilinear",
                    antialias=True,  # use antialias for downsampling
                )
            else:
                sam_mask_prompt = mask_inputs
        else:
            # Otherwise, feed None (the prompt encoder adds a learned `no_mask_embed` in this case)
            sam_mask_prompt = None

        sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
            points=(sam_point_coords, sam_point_labels), boxes=None, masks=sam_mask_prompt
        )
        low_res_multimasks, ious, sam_output_tokens, object_score_logits = self.sam_mask_decoder(
            image_embeddings=backbone_features,
            image_pe=self.sam_prompt_encoder.get_dense_pe(),
            sparse_prompt_embeddings=sparse_embeddings,
            dense_prompt_embeddings=dense_embeddings,
            multimask_output=multimask_output,
            repeat_image=False,  # the image is already batched
            high_res_features=high_res_features,
        )
        if self.pred_obj_scores:
            is_obj_appearing = object_score_logits > 0
            # The spatial memory mask is a hard choice between object and no object,
            # consistent with the actual mask prediction
            low_res_multimasks = torch.where(is_obj_appearing[:, None, None], low_res_multimasks, NO_OBJ_SCORE)

        high_res_multimasks = F.interpolate(
            low_res_multimasks,
            size=(self.image_size, self.image_size),
            mode="bilinear",
            align_corners=False,
        )
        sam_output_token = sam_output_tokens[:, 0]
        if multimask_output:
            # Take the best mask prediction (with the highest IoU estimate)
            best_iou_inds = torch.argmax(ious, dim=-1)
            batch_inds = torch.arange(B, device=device)
            low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
            high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
            if sam_output_tokens.size(1) > 1:  # (batch_size, num_mask_tokens, hidden_dim)
                sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
        else:
            low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks

        # Extract the object pointer from the SAM output token (with occlusion handling)
        obj_ptr = self.obj_ptr_proj(sam_output_token)
        if self.pred_obj_scores:
            # Allow a soft no-object pointer, unlike for masks
            if self.soft_no_obj_ptr:
                lambda_is_obj_appearing = object_score_logits.sigmoid()
            else:
                lambda_is_obj_appearing = is_obj_appearing.float()
            if self.fixed_no_obj_ptr:
                obj_ptr = lambda_is_obj_appearing * obj_ptr
            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

        return (
            low_res_multimasks,
            high_res_multimasks,
            ious,
            low_res_masks,
            high_res_masks,
            obj_ptr,
            object_score_logits,
        )

    def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
        """Process mask inputs directly as output, bypassing SAM encoder/decoder."""
        # Use -10/+10 as logits for negative/positive pixels (close to 0/1 probability after sigmoid)
        out_scale, out_bias = 20.0, -10.0  # sigmoid(-10.0)=4.5398e-05
        mask_inputs_float = mask_inputs.float()
        high_res_masks = mask_inputs_float * out_scale + out_bias
        low_res_masks = F.interpolate(
            high_res_masks,
            size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4),
            align_corners=False,
            mode="bilinear",
            antialias=True,  # use antialias for downsampling
        )
        # A dummy IoU prediction of all 1's under mask input
        ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
        if not self.use_obj_ptrs_in_encoder:
            # All zeros as a dummy object pointer (of shape [B, C])
            obj_ptr = torch.zeros(mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device)
        else:
            # Produce an object pointer using the SAM decoder from the mask input
            _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
                backbone_features=backbone_features,
                mask_inputs=self.mask_downsample(mask_inputs_float),
                high_res_features=high_res_features,
            )
        # Since the mask input is treated as the output here, it is also used to decide whether the
        # object appears, instead of relying on the object scores from the SAM decoder
        is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1)
        is_obj_appearing = is_obj_appearing[..., None]
        lambda_is_obj_appearing = is_obj_appearing.float()
        object_score_logits = out_scale * lambda_is_obj_appearing + out_bias
        if self.pred_obj_scores:
            if self.fixed_no_obj_ptr:
                obj_ptr = lambda_is_obj_appearing * obj_ptr
            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
        return (
            low_res_masks,
            high_res_masks,
            ious,
            low_res_masks,
            high_res_masks,
            obj_ptr,
            object_score_logits,
        )

    def forward_image(self, img_batch: torch.Tensor):
        """Process image batch through encoder to extract multi-level features for SAM model."""
        backbone_out = self.image_encoder(img_batch)
        if self.use_high_res_features_in_sam:
            # Precompute projected level 0 and level 1 features in the SAM decoder
            # to avoid running them again on every SAM click
            backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(backbone_out["backbone_fpn"][0])
            backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(backbone_out["backbone_fpn"][1])
        return backbone_out

    def _prepare_backbone_features(self, backbone_out):
        """Prepare and flatten visual features from the image backbone output for further processing."""
        assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
        assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels

        feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
        vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]

        feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
        # Flatten NxCxHxW to HWxNxC
        vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
        vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]

        return backbone_out, vision_feats, vision_pos_embeds, feat_sizes

    def _prepare_memory_conditioned_features(
        self,
        frame_idx,
        is_init_cond_frame,
        current_vision_feats,
        current_vision_pos_embeds,
        feat_sizes,
        output_dict,
        num_frames,
        track_in_reverse=False,
    ):
        """Prepare memory-conditioned features by fusing current frame's visual features with previous memories."""
        B = current_vision_feats[-1].size(1)  # batch size on this frame
        C = self.hidden_dim
        H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
        device = current_vision_feats[-1].device
        # The case of `self.num_maskmem == 0` is primarily used for reproducing SAM on images;
        # in this case, skip the fusion with any memory
        if self.num_maskmem == 0:
            return current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
        num_obj_ptr_tokens = 0
        tpos_sign_mul = -1 if track_in_reverse else 1
        # Step 1: condition the visual features of the current frame on previous memories
        if not is_init_cond_frame:
            to_cat_memory, to_cat_memory_pos_embed = [], []
            # Add conditioning frames' outputs first (all cond frames have t_pos=0 below)
            assert len(output_dict["cond_frame_outputs"]) > 0
            cond_outputs = output_dict["cond_frame_outputs"]
            selected_cond_outputs, unselected_cond_outputs = select_closest_cond_frames(
                frame_idx, cond_outputs, self.max_cond_frames_in_attn
            )
            t_pos_and_prevs = [(0, out) for out in selected_cond_outputs.values()]
            # Add the last (num_maskmem - 1) frames before the current frame as non-conditioning memory,
            # optionally taken non-consecutively with a temporal stride during evaluation
            stride = 1 if self.training else self.memory_temporal_stride_for_eval
            for t_pos in range(1, self.num_maskmem):
                t_rel = self.num_maskmem - t_pos  # how many frames before the current frame
                if t_rel == 1:
                    # For t_rel == 1, take the last frame (regardless of the stride)
                    prev_frame_idx = frame_idx + t_rel if track_in_reverse else frame_idx - t_rel
                elif not track_in_reverse:
                    # Nearest frame among every stride-th frame before this frame, then seek further back
                    prev_frame_idx = ((frame_idx - 2) // stride) * stride
                    prev_frame_idx = prev_frame_idx - (t_rel - 2) * stride
                else:
                    # Nearest frame among every stride-th frame after this frame, then seek further forward
                    prev_frame_idx = -(-(frame_idx + 2) // stride) * stride
                    prev_frame_idx = prev_frame_idx + (t_rel - 2) * stride
                out = output_dict["non_cond_frame_outputs"].get(prev_frame_idx, None)
                if out is None:
                    # An unselected conditioning frame among the last frames is still attended to
                    # as if it were a non-conditioning frame
                    out = unselected_cond_outputs.get(prev_frame_idx, None)
                t_pos_and_prevs.append((t_pos, out))

            for t_pos, prev in t_pos_and_prevs:
                if prev is None:
                    continue  # skip padding frames
                # "maskmem_features" might have been offloaded to CPU in demo use cases,
                # so load it back to the inference device (a no-op if already there)
                feats = prev["maskmem_features"].to(device, non_blocking=device.type == "cuda")
                to_cat_memory.append(feats.flatten(2).permute(2, 0, 1))
                # Spatial positional encoding (possibly offloaded to CPU during eval)
                maskmem_enc = prev["maskmem_pos_enc"][-1].to(device)
                maskmem_enc = maskmem_enc.flatten(2).permute(2, 0, 1)
                # Temporal positional encoding
                maskmem_enc = maskmem_enc + self.maskmem_tpos_enc[self.num_maskmem - t_pos - 1]
                to_cat_memory_pos_embed.append(maskmem_enc)

            # Construct the list of past object pointers
            if self.use_obj_ptrs_in_encoder:
                max_obj_ptrs_in_encoder = min(num_frames, self.max_obj_ptrs_in_encoder)
                # First add object pointers from selected conditioning frames
                # (optionally, only those in the past during evaluation)
                if not self.training and self.only_obj_ptrs_in_the_past_for_eval:
                    ptr_cond_outputs = {
                        t: out
                        for t, out in selected_cond_outputs.items()
                        if (t >= frame_idx if track_in_reverse else t <= frame_idx)
                    }
                else:
                    ptr_cond_outputs = selected_cond_outputs
                pos_and_ptrs = [
                    # Temporal encoding records how far away each pointer is from the current frame
                    (
                        (frame_idx - t) * tpos_sign_mul
                        if self.use_signed_tpos_enc_to_obj_ptrs
                        else abs(frame_idx - t),
                        out["obj_ptr"],
                    )
                    for t, out in ptr_cond_outputs.items()
                ]
                # Add up to (max_obj_ptrs_in_encoder - 1) non-conditioning frames before the current frame
                for t_diff in range(1, max_obj_ptrs_in_encoder):
                    t = frame_idx + t_diff if track_in_reverse else frame_idx - t_diff
                    if t < 0 or (num_frames is not None and t >= num_frames):
                        break
                    out = output_dict["non_cond_frame_outputs"].get(t, unselected_cond_outputs.get(t, None))
                    if out is not None:
                        pos_and_ptrs.append((t_diff, out["obj_ptr"]))
                # If there is at least one object pointer, add them to the cross attention
                if pos_and_ptrs:
                    pos_list, ptrs_list = zip(*pos_and_ptrs)
                    # Stack object pointers along dim=0 into a (ptr_seq_len, B, C) tensor
                    obj_ptrs = torch.stack(ptrs_list, dim=0)
                    # Temporal positional embedding based on how far each pointer is from the current frame
                    if self.add_tpos_enc_to_obj_ptrs:
                        t_diff_max = max_obj_ptrs_in_encoder - 1
                        tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim
                        obj_pos = torch.tensor(pos_list, device=device)
                        obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim)
                        obj_pos = self.obj_ptr_tpos_proj(obj_pos)
                        obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim)
                    else:
                        obj_pos = obj_ptrs.new_zeros(len(pos_list), B, self.mem_dim)
                    if self.mem_dim < C:
                        # Split each pointer into (C // mem_dim) tokens when mem_dim < C
                        obj_ptrs = obj_ptrs.reshape(-1, B, C // self.mem_dim, self.mem_dim)
                        obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1)
                        obj_pos = obj_pos.repeat_interleave(C // self.mem_dim, dim=0)
                    to_cat_memory.append(obj_ptrs)
                    to_cat_memory_pos_embed.append(obj_pos)
                    num_obj_ptr_tokens = obj_ptrs.shape[0]
                else:
                    num_obj_ptr_tokens = 0
        else:
            # For initial conditioning frames, encode them without using any previous memory
            if self.directly_add_no_mem_embed:
                # Directly add the no-memory embedding (instead of using the transformer encoder)
                pix_feat_with_mem = current_vision_feats[-1] + self.no_mem_embed
                return pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)

            # Use a dummy token on the first frame (to avoid an empty memory input to the transformer)
            to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
            to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]

        # Step 2: concatenate the memories and forward through the memory attention module
        memory = torch.cat(to_cat_memory, dim=0)
        memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)

        pix_feat_with_mem = self.memory_attention(
            curr=current_vision_feats,
            curr_pos=current_vision_pos_embeds,
            memory=memory,
            memory_pos=memory_pos_embed,
            num_obj_ptr_tokens=num_obj_ptr_tokens,
        )
        # Reshape the output from (HW)BC to BCHW
        return pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)

    def _encode_new_memory(
        self,
        current_vision_feats,
        feat_sizes,
        pred_masks_high_res,
        object_score_logits,
        is_mask_from_pts,
    ):
        """Encode frame features and masks into a new memory representation for video segmentation."""
        B = current_vision_feats[-1].size(1)  # batch size on this frame
        C = self.hidden_dim
        H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
        # Top-level feature, (HW)BC => BCHW
        pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
        if self.non_overlap_masks_for_mem_enc and not self.training:
            # Optionally apply non-overlapping constraints to the masks (applied in the batch
            # dimension; only meaningful during eval when objects come from the same video)
            pred_masks_high_res = self._apply_non_overlapping_constraints(pred_masks_high_res)
        binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
        if binarize and not self.training:
            mask_for_mem = (pred_masks_high_res > 0).to(pred_masks_high_res.dtype)
        else:
            # Apply sigmoid to the raw mask logits to turn them into range (0, 1)
            mask_for_mem = torch.sigmoid(pred_masks_high_res)
        # Apply scale and bias terms to the sigmoid probabilities
        if self.sigmoid_scale_for_mem_enc != 1.0:
            mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
        if self.sigmoid_bias_for_mem_enc != 0.0:
            mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
        maskmem_out = self.memory_encoder(pix_feat, mask_for_mem, skip_mask_sigmoid=True)  # sigmoid already applied
        maskmem_features = maskmem_out["vision_features"]
        maskmem_pos_enc = maskmem_out["vision_pos_enc"]
        # Add a no-object embedding to the spatial memory to indicate the frame is predicted to be occluded
        if self.no_obj_embed_spatial is not None:
            is_obj_appearing = (object_score_logits > 0).float()
            maskmem_features += (1 - is_obj_appearing[..., None, None]) * self.no_obj_embed_spatial[
                ..., None, None
            ].expand(*maskmem_features.shape)

        return maskmem_features, maskmem_pos_enc

    def _track_step(
        self,
        frame_idx,
        is_init_cond_frame,
        current_vision_feats,
        current_vision_pos_embeds,
        feat_sizes,
        point_inputs,
        mask_inputs,
        output_dict,
        num_frames,
        track_in_reverse,
        prev_sam_mask_logits,
    ):
        """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
        # High-resolution feature maps for the SAM head, reshaped from (HW)BC to BCHW
        if len(current_vision_feats) > 1:
            high_res_features = [
                x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
                for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1])
            ]
        else:
            high_res_features = None
        if mask_inputs is not None and self.use_mask_input_as_output_without_sam:
            # When use_mask_input_as_output_without_sam=True, directly output the mask input
            # (treated as a GT mask) without using a SAM prompt encoder + mask decoder
            pix_feat = current_vision_feats[-1].permute(1, 2, 0)
            pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
            sam_outputs = self._use_mask_as_output(pix_feat, high_res_features, mask_inputs)
        else:
            # Fuse the visual features with previous memory features in the memory bank
            pix_feat = self._prepare_memory_conditioned_features(
                frame_idx=frame_idx,
                is_init_cond_frame=is_init_cond_frame,
                current_vision_feats=current_vision_feats[-1:],
                current_vision_pos_embeds=current_vision_pos_embeds[-1:],
                feat_sizes=feat_sizes[-1:],
                output_dict=output_dict,
                num_frames=num_frames,
                track_in_reverse=track_in_reverse,
            )
            # Optionally feed previously predicted SAM mask logits into the SAM mask decoder
            # (e.g. logits from earlier interactions instead of correction clicks)
            if prev_sam_mask_logits is not None:
                assert point_inputs is not None and mask_inputs is None
                mask_inputs = prev_sam_mask_logits
            multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
            sam_outputs = self._forward_sam_heads(
                backbone_features=pix_feat,
                point_inputs=point_inputs,
                mask_inputs=mask_inputs,
                high_res_features=high_res_features,
                multimask_output=multimask_output,
            )
        return sam_outputs, high_res_features, pix_feat

    def _encode_memory_in_output(
        self,
        current_vision_feats,
        feat_sizes,
        point_inputs,
        run_mem_encoder,
        high_res_masks,
        object_score_logits,
        current_out,
    ):
        """Run memory encoder on predicted mask to encode it into a new memory feature for future frames."""
        if run_mem_encoder and self.num_maskmem > 0:
            maskmem_features, maskmem_pos_enc = self._encode_new_memory(
                current_vision_feats=current_vision_feats,
                feat_sizes=feat_sizes,
                pred_masks_high_res=high_res_masks,
                object_score_logits=object_score_logits,
                is_mask_from_pts=point_inputs is not None,
            )
            current_out["maskmem_features"] = maskmem_features
            current_out["maskmem_pos_enc"] = maskmem_pos_enc
        else:
            current_out["maskmem_features"] = None
            current_out["maskmem_pos_enc"] = None

    def track_step(
        self,
        frame_idx,
        is_init_cond_frame,
        current_vision_feats,
        current_vision_pos_embeds,
        feat_sizes,
        point_inputs,
        mask_inputs,
        output_dict,
        num_frames,
        track_in_reverse=False,
        run_mem_encoder=True,
        prev_sam_mask_logits=None,
    ):
        """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
        sam_outputs, high_res_features, pix_feat = self._track_step(
            frame_idx,
            is_init_cond_frame,
            current_vision_feats,
            current_vision_pos_embeds,
            feat_sizes,
            point_inputs,
            mask_inputs,
            output_dict,
            num_frames,
            track_in_reverse,
            prev_sam_mask_logits,
        )
        _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs

        current_out = {"pred_masks": low_res_masks, "pred_masks_high_res": high_res_masks, "obj_ptr": obj_ptr}
        if not self.training:
            # Only added during inference (avoids unused parameters in activation checkpointing);
            # mainly used to encode spatial memories with consolidated masks
            current_out["object_score_logits"] = object_score_logits

        # Run the memory encoder on the predicted mask to encode it into a new memory feature for future frames
        self._encode_memory_in_output(
            current_vision_feats,
            feat_sizes,
            point_inputs,
            run_mem_encoder,
            high_res_masks,
            object_score_logits,
            current_out,
        )
        return current_out

    def _use_multimask(self, is_init_cond_frame, point_inputs):
        """Determine whether to use multiple mask outputs in the SAM head based on configuration and inputs."""
        num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
        return (
            self.multimask_output_in_sam
            and (is_init_cond_frame or self.multimask_output_for_tracking)
            and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num)
        )

    @staticmethod
    def _apply_non_overlapping_constraints(pred_masks):
        """Apply non-overlapping constraints to masks, keeping the highest scoring object per location."""
        batch_size = pred_masks.size(0)
        if batch_size == 1:
            return pred_masks

        device = pred_masks.device
        # "max_obj_inds": object index of the object with the highest score at each location
        max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)
        # "batch_obj_inds": object index of each object slice (along dim 0) in `pred_masks`
        batch_obj_inds = torch.arange(batch_size, device=device)[:, None, None, None]
        keep = max_obj_inds == batch_obj_inds
        # Suppress overlapping regions' scores below -10.0 so foreground regions don't overlap
        # (here sigmoid(-10.0)=4.5398e-05)
        return torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))

    def set_binarize(self, binarize=False):
        """Set binarize for VideoPredictor."""
        self.binarize_mask_from_pts_for_mem_enc = binarize

    def set_imgsz(self, imgsz):
        """Set image size to make model compatible with different image sizes."""
        self.image_size = imgsz[0]
        self.sam_prompt_encoder.input_image_size = imgsz
        self.sam_prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # 16 is the fixed ViT patch stride
        self.sam_image_embedding_size = self.image_size // self.backbone_stride