
from __future__ import annotations

import math

import torch
from torch import Tensor, nn

from ultralytics.nn.modules import MLPBlock


class TwoWayTransformer(nn.Module):
    """
    A Two-Way Transformer module for simultaneous attention to image and query points.

    This class implements a specialized transformer decoder that attends to an input image using queries with
    supplied positional embeddings. It's useful for tasks like object detection, image segmentation, and point
    cloud processing.

    Attributes:
        depth (int): Number of layers in the transformer.
        embedding_dim (int): Channel dimension for input embeddings.
        num_heads (int): Number of heads for multihead attention.
        mlp_dim (int): Internal channel dimension for the MLP block.
        layers (nn.ModuleList): List of TwoWayAttentionBlock layers composing the transformer.
        final_attn_token_to_image (Attention): Final attention layer from queries to image.
        norm_final_attn (nn.LayerNorm): Layer normalization applied to final queries.

    Methods:
        forward: Process image and point embeddings through the transformer.

    Examples:
        >>> transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
        >>> image_embedding = torch.randn(1, 256, 32, 32)
        >>> image_pe = torch.randn(1, 256, 32, 32)
        >>> point_embedding = torch.randn(1, 100, 256)
        >>> output_queries, output_image = transformer(image_embedding, image_pe, point_embedding)
        >>> print(output_queries.shape, output_image.shape)
    """

    def __init__(
        self,
        depth: int,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int,
        activation: type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
    ) -> None:
        """
        Initialize a Two-Way Transformer for simultaneous attention to image and query points.

        Args:
            depth (int): Number of layers in the transformer.
            embedding_dim (int): Channel dimension for input embeddings.
            num_heads (int): Number of heads for multihead attention. Must divide embedding_dim.
            mlp_dim (int): Internal channel dimension for the MLP block.
            activation (Type[nn.Module], optional): Activation function to use in the MLP block.
            attention_downsample_rate (int, optional): Downsampling rate for attention mechanism.
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim
        self.layers = nn.ModuleList()

        for i in range(depth):
            self.layers.append(
                TwoWayAttentionBlock(
                    embedding_dim=embedding_dim,
                    num_heads=num_heads,
                    mlp_dim=mlp_dim,
                    activation=activation,
                    attention_downsample_rate=attention_downsample_rate,
                    skip_first_layer_pe=(i == 0),
                )
            )

        self.final_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(
        self,
        image_embedding: torch.Tensor,
        image_pe: torch.Tensor,
        point_embedding: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Process image and point embeddings through the Two-Way Transformer.

        Args:
            image_embedding (torch.Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
            image_pe (torch.Tensor): Positional encoding to add to the image, with same shape as image_embedding.
            point_embedding (torch.Tensor): Embedding to add to query points, with shape (B, N_points, embedding_dim).

        Returns:
            queries (torch.Tensor): Processed point embeddings with shape (B, N_points, embedding_dim).
            keys (torch.Tensor): Processed image embeddings with shape (B, H*W, embedding_dim).
        """
        # Flatten image embeddings: (B, C, H, W) -> (B, H*W, C)
        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
        image_pe = image_pe.flatten(2).permute(0, 2, 1)

        # Prepare queries (point embeddings) and keys (image embeddings)
        queries = point_embedding
        keys = image_embedding

        # Apply the two-way attention blocks
        for layer in self.layers:
            queries, keys = layer(queries=queries, keys=keys, query_pe=point_embedding, key_pe=image_pe)

        # Apply the final attention layer from the points to the image
        q = queries + point_embedding
        k = keys + image_pe
        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm_final_attn(queries)

        return queries, keys


class TwoWayAttentionBlock(nn.Module):
    """
    A two-way attention block for simultaneous attention to image and query points.

    This class implements a specialized transformer block with four main layers: self-attention on sparse inputs,
    cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of dense
    inputs to sparse inputs.

    Attributes:
        self_attn (Attention): Self-attention layer for queries.
        norm1 (nn.LayerNorm): Layer normalization after self-attention.
        cross_attn_token_to_image (Attention): Cross-attention layer from queries to keys.
        norm2 (nn.LayerNorm): Layer normalization after token-to-image attention.
        mlp (MLPBlock): MLP block for transforming query embeddings.
        norm3 (nn.LayerNorm): Layer normalization after MLP block.
        norm4 (nn.LayerNorm): Layer normalization after image-to-token attention.
        cross_attn_image_to_token (Attention): Cross-attention layer from keys to queries.
        skip_first_layer_pe (bool): Whether to skip positional encoding in the first layer.

    Methods:
        forward: Apply self-attention and cross-attention to queries and keys.

    Examples:
        >>> embedding_dim, num_heads = 256, 8
        >>> block = TwoWayAttentionBlock(embedding_dim, num_heads)
        >>> queries = torch.randn(1, 100, embedding_dim)
        >>> keys = torch.randn(1, 1000, embedding_dim)
        >>> query_pe = torch.randn(1, 100, embedding_dim)
        >>> key_pe = torch.randn(1, 1000, embedding_dim)
        >>> processed_queries, processed_keys = block(queries, keys, query_pe, key_pe)
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        Initialize a TwoWayAttentionBlock for simultaneous attention to image and query points.

        This block implements a specialized transformer layer with four main components: self-attention on sparse
        inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention
        of dense inputs to sparse inputs.

        Args:
            embedding_dim (int): Channel dimension of the embeddings.
            num_heads (int): Number of attention heads in the attention layers.
            mlp_dim (int, optional): Hidden dimension of the MLP block.
            activation (Type[nn.Module], optional): Activation function for the MLP block.
            attention_downsample_rate (int, optional): Downsampling rate for the attention mechanism.
            skip_first_layer_pe (bool, optional): Whether to skip positional encoding in the first layer.
        """
        super().__init__()
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self, queries: torch.Tensor, keys: torch.Tensor, query_pe: torch.Tensor, key_pe: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Apply two-way attention to process query and key embeddings in a transformer block.

        Args:
            queries (torch.Tensor): Query embeddings with shape (B, N_queries, embedding_dim).
            keys (torch.Tensor): Key embeddings with shape (B, N_keys, embedding_dim).
            query_pe (torch.Tensor): Positional encodings for queries with same shape as queries.
            key_pe (torch.Tensor): Positional encodings for keys with same shape as keys.

        Returns:
            queries (torch.Tensor): Processed query embeddings with shape (B, N_queries, embedding_dim).
            keys (torch.Tensor): Processed key embeddings with shape (B, N_keys, embedding_dim).
        """
        # Self attention block
        if self.skip_first_layer_pe:
            queries = self.self_attn(q=queries, k=queries, v=queries)
        else:
            q = queries + query_pe
            attn_out = self.self_attn(q=q, k=q, v=queries)
            queries = queries + attn_out
        queries = self.norm1(queries)

        # Cross attention block, tokens attending to image embedding
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm2(queries)

        # MLP block
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.norm3(queries)

        # Cross attention block, image embedding attending to tokens
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
        keys = keys + attn_out
        keys = self.norm4(keys)

        return queries, keys


class Attention(nn.Module):
    """
    An attention layer with downscaling capability for embedding size after projection.

    This class implements a multi-head attention mechanism with the option to downsample the internal
    dimension of queries, keys, and values.

    Attributes:
        embedding_dim (int): Dimensionality of input embeddings.
        kv_in_dim (int): Dimensionality of key and value inputs.
        internal_dim (int): Internal dimension after downsampling.
        num_heads (int): Number of attention heads.
        q_proj (nn.Linear): Linear projection for queries.
        k_proj (nn.Linear): Linear projection for keys.
        v_proj (nn.Linear): Linear projection for values.
        out_proj (nn.Linear): Linear projection for output.

    Methods:
        _separate_heads: Separate input tensor into attention heads.
        _recombine_heads: Recombine separated attention heads.
        forward: Compute attention output for given query, key, and value tensors.

    Examples:
        >>> attn = Attention(embedding_dim=256, num_heads=8, downsample_rate=2)
        >>> q = torch.randn(1, 100, 256)
        >>> k = v = torch.randn(1, 50, 256)
        >>> output = attn(q, k, v)
        >>> print(output.shape)
        torch.Size([1, 100, 256])
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
        kv_in_dim: int | None = None,
    ) -> None:
        """
        Initialize the Attention module with specified dimensions and settings.

        Args:
            embedding_dim (int): Dimensionality of input embeddings.
            num_heads (int): Number of attention heads.
            downsample_rate (int, optional): Factor by which internal dimensions are downsampled.
            kv_in_dim (int | None, optional): Dimensionality of key and value inputs. If None, uses embedding_dim.

        Raises:
            AssertionError: If num_heads does not evenly divide the internal dim (embedding_dim / downsample_rate).
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.kv_in_dim = kv_in_dim if kv_in_dim is not None else embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."

        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
        self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    @staticmethod
    def _separate_heads(x: torch.Tensor, num_heads: int) -> torch.Tensor:
        """Separate the input tensor into the specified number of attention heads."""
        b, n, c = x.shape
        x = x.reshape(b, n, num_heads, c // num_heads)
        return x.transpose(1, 2)  # (B, N_heads, N_tokens, C_per_head)

    @staticmethod
    def _recombine_heads(x: Tensor) -> Tensor:
        """Recombine separated attention heads into a single tensor."""
        b, n_heads, n_tokens, c_per_head = x.shape
        x = x.transpose(1, 2)
        return x.reshape(b, n_tokens, n_heads * c_per_head)  # (B, N_tokens, C)

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        """
        Apply multi-head attention to query, key, and value tensors with optional downsampling.

        Args:
            q (torch.Tensor): Query tensor with shape (B, N_q, embedding_dim).
            k (torch.Tensor): Key tensor with shape (B, N_k, embedding_dim).
            v (torch.Tensor): Value tensor with shape (B, N_k, embedding_dim).

        Returns:
            (torch.Tensor): Output tensor after attention with shape (B, N_q, embedding_dim).
        """
        # Input projections
        q = self.q_proj(q)
        k = self.k_proj(k)
        v = self.v_proj(v)

        # Separate into heads
        q = self._separate_heads(q, self.num_heads)
        k = self._separate_heads(k, self.num_heads)
        v = self._separate_heads(v, self.num_heads)

        # Scaled dot-product attention
        _, _, _, c_per_head = q.shape
        attn = q @ k.permute(0, 1, 3, 2)  # (B, N_heads, N_q_tokens, N_k_tokens)
        attn = attn / math.sqrt(c_per_head)
        attn = torch.softmax(attn, dim=-1)

        # Get output
        out = attn @ v
        out = self._recombine_heads(out)
        return self.out_proj(out)
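

# The demo below is an illustrative usage sketch, not part of the original module: running this file directly
# exercises the three classes above with random tensors, mirroring the doctest examples in their docstrings.
# The tensor shapes (batch of 1, 256-dim embeddings, a 32x32 image grid, 100 query points) are assumptions
# chosen only for this demonstration.
if __name__ == "__main__":
    # Standalone attention with a 2x internal downsample: the output keeps the 256-dim embedding size.
    attn = Attention(embedding_dim=256, num_heads=8, downsample_rate=2)
    q = torch.randn(1, 100, 256)
    k = v = torch.randn(1, 50, 256)
    print(attn(q, k, v).shape)  # torch.Size([1, 100, 256])

    # One two-way block updates both the sparse queries and the dense keys.
    block = TwoWayAttentionBlock(embedding_dim=256, num_heads=8)
    queries, keys = block(
        torch.randn(1, 100, 256), torch.randn(1, 1024, 256), torch.randn(1, 100, 256), torch.randn(1, 1024, 256)
    )
    print(queries.shape, keys.shape)  # torch.Size([1, 100, 256]) torch.Size([1, 1024, 256])

    # Full two-way transformer: image embeddings are flattened from (B, C, H, W) to (B, H*W, C) internally.
    transformer = TwoWayTransformer(depth=2, embedding_dim=256, num_heads=8, mlp_dim=2048)
    out_queries, out_image = transformer(
        torch.randn(1, 256, 32, 32), torch.randn(1, 256, 32, 32), torch.randn(1, 100, 256)
    )
    print(out_queries.shape, out_image.shape)  # torch.Size([1, 100, 256]) torch.Size([1, 1024, 256])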