
    .hz                       d Z ddlmZ ddlZddlZddlmZ ddlmc mZ	 ddl
mZmZ ddlmZ ddlmZ ddlmZmZmZ d	Z G d
 dej,                        Z G d de      Z G d dej,                        Z G d dej,                        Z G d dej,                        Z G d dej,                        Z G d dej,                        Z G d dej,                        Z G d dej,                        Z G d dej,                        Z y)zTransformer modules.    )annotationsN)	constant_xavier_uniform_)
TORCH_1_11   )Conv)_get_clonesinverse_sigmoid#multi_scale_deformable_attn_pytorch)
TransformerEncoderLayerTransformerLayerTransformerBlockMLPBlockLayerNorm2dAIFIDeformableTransformerDecoder!DeformableTransformerDecoderLayerMSDeformAttnMLPc                       e Zd ZdZddd ej
                         df	 	 	 	 	 	 	 	 	 	 	 d fdZeddd       Z	 	 	 d	 	 	 	 	 	 	 	 	 ddZ		 	 	 d	 	 	 	 	 	 	 	 	 dd	Z
	 	 	 d	 	 	 	 	 	 	 	 	 dd
Z xZS )r   a  
    A single layer of the transformer encoder.

    This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
    supporting both pre-normalization and post-normalization configurations.

    Attributes:
        ma (nn.MultiheadAttention): Multi-head attention module.
        fc1 (nn.Linear): First linear layer in the feedforward network.
        fc2 (nn.Linear): Second linear layer in the feedforward network.
        norm1 (nn.LayerNorm): Layer normalization after attention.
        norm2 (nn.LayerNorm): Layer normalization after feedforward network.
        dropout (nn.Dropout): Dropout layer for the feedforward network.
        dropout1 (nn.Dropout): Dropout layer after attention.
        dropout2 (nn.Dropout): Dropout layer after feedforward network.
        act (nn.Module): Activation function.
        normalize_before (bool): Whether to apply normalization before attention and feedforward.
                  Fc                   t         |           ddlm} |st	        d      t        j                  |||d      | _        t        j                  ||      | _	        t        j                  ||      | _
        t        j                  |      | _        t        j                  |      | _        t        j                  |      | _        t        j                  |      | _        t        j                  |      | _        || _        || _        y)a  
        Initialize the TransformerEncoderLayer with specified parameters.

        Args:
            c1 (int): Input dimension.
            cm (int): Hidden dimension in the feedforward network.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            normalize_before (bool): Whether to apply normalization before attention and feedforward.
           )	TORCH_1_9z]TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).T)dropoutbatch_firstN)super__init__utils.torch_utilsr   ModuleNotFoundErrornnMultiheadAttentionmaLinearfc1fc2	LayerNormnorm1norm2Dropoutr   dropout1dropout2actnormalize_before)	selfc1cm	num_headsr   r/   r0   r   	__class__s	           `/var/www/html/ai-service/venv/lib/python3.12/site-packages/ultralytics/nn/modules/transformer.pyr    z TransformerEncoderLayer.__init__4   s    ( 	2%o  ''IwTXY99R$99R$\\"%
\\"%
zz'*

7+

7+ 0    c                    || S | |z   S )z2Add position embeddings to the tensor if provided. tensorposs     r6   with_pos_embedz&TransformerEncoderLayer.with_pos_embed]        v6&3,6r7   c           	     l   | j                  ||      x}}| j                  |||||      d   }|| j                  |      z   }| j                  |      }| j	                  | j                  | j                  | j                  |                        }|| j                  |      z   }| j                  |      S )a  
        Perform forward pass with post-normalization.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after attention and feedforward.
        value	attn_maskkey_padding_maskr   )
r=   r%   r-   r*   r(   r   r/   r'   r.   r+   )r1   srcsrc_masksrc_key_padding_maskr<   qksrc2s           r6   forward_postz$TransformerEncoderLayer.forward_postb   s    & ##C--Awwq!3(MawbcdeDMM$''jjoxxTXXdhhsm%<=>DMM$''zz#r7   c           	     l   | j                  |      }| j                  ||      x}}| j                  |||||      d   }|| j                  |      z   }| j	                  |      }| j                  | j                  | j                  | j                  |                        }|| j                  |      z   S )a  
        Perform forward pass with pre-normalization.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after attention and feedforward.
        r@   r   )
r*   r=   r%   r-   r+   r(   r   r/   r'   r.   )r1   rD   rE   rF   r<   rI   rG   rH   s           r6   forward_prez#TransformerEncoderLayer.forward_pre}   s    & zz###D#..Awwq!48NbwcdefDMM$''zz#xxTXXdhhtn%=>?T]]4(((r7   c                j    | j                   r| j                  ||||      S | j                  ||||      S )a  
        Forward propagate the input through the encoder module.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after transformer encoder layer.
        )r0   rL   rJ   )r1   rD   rE   rF   r<   s        r6   forwardzTransformerEncoderLayer.forward   s=    &   ##C3GMM  h0DcJJr7   r2   intr3   rP   r4   rP   r   floatr/   	nn.Moduler0   boolNr;   torch.Tensorr<   torch.Tensor | NonereturnrV   NNN)
rD   rV   rE   rW   rF   rW   r<   rW   rX   rV   )__name__
__module____qualname____doc__r#   GELUr    staticmethodr=   rJ   rL   rN   __classcell__r5   s   @r6   r   r       s;   ,  !&'1'1 '1 	'1
 '1 '1 '1R 7 7 )-48#' & 2	
 ! 
< )-48#')) &) 2	)
 !) 
)< )-48#'KK &K 2	K
 !K 
Kr7   r   c                       e Zd ZdZddd ej
                         df	 	 	 	 	 	 	 	 	 	 	 d	 fdZd
 fdZe	 d	 	 	 	 	 	 	 	 	 dd       Z	 xZ
S )r   z
    AIFI transformer layer for 2D data with positional embeddings.

    This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
    embeddings and handling the spatial dimensions appropriately.
    r   r   r   Fc                .    t         |   ||||||       y)a  
        Initialize the AIFI instance with specified parameters.

        Args:
            c1 (int): Input dimension.
            cm (int): Hidden dimension in the feedforward network.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            normalize_before (bool): Whether to apply normalization before attention and feedforward.
        N)r   r    )r1   r2   r3   r4   r   r/   r0   r5   s          r6   r    zAIFI.__init__   s    ( 	RGS:JKr7   c                d   |j                   dd \  }}}| j                  |||      }t        |   |j	                  d      j                  ddd      |j                  |j                  |j                              }|j                  ddd      j                  d|||g      j                         S )z
        Forward pass for the AIFI transformer layer.

        Args:
            x (torch.Tensor): Input tensor with shape [B, C, H, W].

        Returns:
            (torch.Tensor): Output tensor with shape [B, C, H, W].
        r   N   r   )devicedtype)r<   )shape"build_2d_sincos_position_embeddingr   rN   flattenpermutetorf   rg   view
contiguous)r1   xchw	pos_embedr5   s         r6   rN   zAIFI.forward   s     ''!"+1a;;Aq!D	GOAIIaL00Aq9y||STS[S[cdcjcj|?kOlyyAq!&&Aq!}5@@BBr7   c                   |dz  dk(  sJ d       t        j                  | t         j                        }t        j                  |t         j                        }t        rt        j                  ||d      nt        j                  ||      \  }}|dz  }t        j                  |t         j                        |z  }d||z  z  }|j                         d   |d	   z  }|j                         d   |d	   z  }	t        j                  t        j                  |      t        j                  |      t        j                  |	      t        j                  |	      gd
      d	   S )a  
        Build 2D sine-cosine position embedding.

        Args:
            w (int): Width of the feature map.
            h (int): Height of the feature map.
            embed_dim (int): Embedding dimension.
            temperature (float): Temperature for the sine/cosine functions.

        Returns:
            (torch.Tensor): Position embedding with shape [1, embed_dim, h*w].
           r   zHEmbed dimension must be divisible by 4 for 2D sin-cos position embeddingrg   ij)indexingg      ?.NNr   )	torcharangefloat32r   meshgridrk   catsincos)
rs   rr   	embed_dimtemperaturegrid_wgrid_hpos_dimomegaout_wout_hs
             r6   rj   z'AIFI.build_2d_sincos_position_embedding   s     1}!m#mm!au}}5au}}5JTFZ_ZhZhioqwZxq.WEMM:WD{E)* +eDk9 +eDk9yy%))E*EIIe,<eii>NPUPYPYZ_P`acdefjkkr7   rO   rp   rV   rX   rV   )   g     @)
rs   rP   rr   rP   r   rP   r   rQ   rX   rV   )rZ   r[   r\   r]   r#   r^   r    rN   r_   rj   r`   ra   s   @r6   r   r      s      !&LL L 	L
 L L L,C  CJlll#&l;@l	l lr7   r   c                  ,     e Zd ZdZd fdZddZ xZS )r   zeTransformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance).c                |   t         |           t        j                  ||d      | _        t        j                  ||d      | _        t        j                  ||d      | _        t        j                  ||      | _        t        j                  ||d      | _	        t        j                  ||d      | _
        y)z
        Initialize a self-attention mechanism using linear transformations and multi-head attention.

        Args:
            c (int): Input and output channel dimension.
            num_heads (int): Number of attention heads.
        F)bias)r   r4   N)r   r    r#   r&   rG   rH   vr$   r%   r'   r(   )r1   rq   r4   r5   s      r6   r    zTransformerLayer.__init__   s     	1ae,1ae,1ae,''!yI99Q.99Q.r7   c                    | j                  | j                  |      | j                  |      | j                  |            d   |z   }| j	                  | j                  |            |z   S )z
        Apply a transformer block to the input x and return the output.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after transformer layer.
        r   )r%   rG   rH   r   r(   r'   r1   rp   s     r6   rN   zTransformerLayer.forward  sT     GGDFF1Itvvay$&&)4Q7!;xx$q((r7   )rq   rP   r4   rP   r   rZ   r[   r\   r]   r    rN   r`   ra   s   @r6   r   r      s    o/ )r7   r   c                  ,     e Zd ZdZd fdZddZ xZS )r   a  
    Vision Transformer block based on https://arxiv.org/abs/2010.11929.

    This class implements a complete transformer block with optional convolution layer for channel adjustment,
    learnable position embedding, and multiple transformer layers.

    Attributes:
        conv (Conv, optional): Convolution layer if input and output channels differ.
        linear (nn.Linear): Learnable position embedding.
        tr (nn.Sequential): Sequential container of transformer layers.
        c2 (int): Output channel dimension.
    c                    t         |           d| _        |k7  rt        |      | _        t	        j
                        | _        t	        j                  fdt        |      D         | _	        | _
        y)aU  
        Initialize a Transformer module with position embedding and specified number of heads and layers.

        Args:
            c1 (int): Input channel dimension.
            c2 (int): Output channel dimension.
            num_heads (int): Number of attention heads.
            num_layers (int): Number of transformer layers.
        Nc              3  6   K   | ]  }t                y wrT   )r   ).0_c2r4   s     r6   	<genexpr>z,TransformerBlock.__init__.<locals>.<genexpr>:  s     !]a"22y"A!]s   )r   r    convr   r#   r&   linear
Sequentialrangetrr   )r1   r2   r   r4   
num_layersr5   s     `` r6   r    zTransformerBlock.__init__+  s`     		8RDIiiB'--!]5Q[K\!]^r7   c                B   | j                   | j                  |      }|j                  \  }}}}|j                  d      j                  ddd      }| j	                  || j                  |      z         j                  ddd      j                  || j                  ||      S )z
        Forward propagate the input through the transformer block.

        Args:
            x (torch.Tensor): Input tensor with shape [b, c1, w, h].

        Returns:
            (torch.Tensor): Output tensor with shape [b, c2, w, h].
        re   r   r   )r   ri   rk   rl   r   r   reshaper   )r1   rp   br   rs   rr   ps          r6   rN   zTransformerBlock.forward=  s     99 		!AWW
1aIIaL  Aq)wwq4;;q>)*221a;CCAtwwPQSTUUr7   )r2   rP   r   rP   r4   rP   r   rP   r   r   ra   s   @r6   r   r     s    $Vr7   r   c                  D     e Zd ZdZej
                  fd fdZddZ xZS )r   z+A single block of a multi-layer perceptron.c                    t         |           t        j                  ||      | _        t        j                  ||      | _         |       | _        y)a   
        Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.

        Args:
            embedding_dim (int): Input and output dimension.
            mlp_dim (int): Hidden dimension.
            act (nn.Module): Activation function.
        N)r   r    r#   r&   lin1lin2r/   )r1   embedding_dimmlp_dimr/   r5   s       r6   r    zMLPBlock.__init__Q  s=     	IImW5	IIg}5	5r7   c                `    | j                  | j                  | j                  |                  S )z
        Forward pass for the MLPBlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after MLP block.
        )r   r/   r   r   s     r6   rN   zMLPBlock.forward_  s$     yy$))A,/00r7   )r   rP   r   rP   r   )	rZ   r[   r\   r]   r#   r^   r    rN   r`   ra   s   @r6   r   r   N  s    5=?WW 
1r7   r   c                  X     e Zd ZdZej
                  df	 	 	 	 	 	 	 	 	 d fdZddZ xZS )r   a  
    A simple multi-layer perceptron (also called FFN).

    This class implements a configurable MLP with multiple linear layers, activation functions, and optional
    sigmoid output activation.

    Attributes:
        num_layers (int): Number of layers in the MLP.
        layers (nn.ModuleList): List of linear layers.
        sigmoid (bool): Whether to apply sigmoid to the output.
        act (nn.Module): Activation function.
    Fc                    t         |           || _        |g|dz
  z  }t        j                  d t        |g|z   ||gz         D              | _        || _         |       | _        y)a  
        Initialize the MLP with specified input, hidden, output dimensions and number of layers.

        Args:
            input_dim (int): Input dimension.
            hidden_dim (int): Hidden dimension.
            output_dim (int): Output dimension.
            num_layers (int): Number of layers.
            act (nn.Module): Activation function.
            sigmoid (bool): Whether to apply sigmoid to the output.
        r   c              3  N   K   | ]  \  }}t        j                  ||        y wrT   )r#   r&   )r   nrH   s      r6   r   zMLP.__init__.<locals>.<genexpr>  s     #g1BIIaO#gs   #%N)	r   r    r   r#   
ModuleListziplayerssigmoidr/   )	r1   	input_dim
hidden_dim
output_dimr   r/   r   rr   r5   s	           r6   r    zMLP.__init__z  se     	$LJN+mm#gYKRSOUVZdYeUe@f#gg5r7   c                   t        | j                        D ]J  \  }}|| j                  dz
  k  r+ t        | dt	        j
                                ||            n ||      }L t        | dd      r|j                         S |S )z
        Forward pass for the entire MLP.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after MLP.
        r   r/   r   F)	enumerater   r   getattrr#   ReLUr   )r1   rp   ilayers       r6   rN   zMLP.forward  sx     "$++. 	cHAu=>STAT=T/eRWWY/a9Z_`aZbA	c%dIu=qyy{D1Dr7   )
r   rP   r   rP   r   rP   r   rP   r   rS   r   )	rZ   r[   r\   r]   r#   r   r    rN   r`   ra   s   @r6   r   r   l  sK     VXU\U\ns*-;>LOgk*Er7   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   au  
    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.

    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension
    while preserving spatial dimensions.

    Attributes:
        weight (nn.Parameter): Learnable scale parameter.
        bias (nn.Parameter): Learnable bias parameter.
        eps (float): Small constant for numerical stability.

    References:
        https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
        https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py
    c                    t         |           t        j                  t	        j
                  |            | _        t        j                  t	        j                  |            | _        || _	        y)z
        Initialize LayerNorm2d with the given parameters.

        Args:
            num_channels (int): Number of channels in the input.
            eps (float): Small constant for numerical stability.
        N)
r   r    r#   	Parameterr{   onesweightzerosr   eps)r1   num_channelsr   r5   s      r6   r    zLayerNorm2d.__init__  sI     	ll5::l#;<LL\!:;	r7   c                   |j                  dd      }||z
  j                  d      j                  dd      }||z
  t        j                  || j                  z         z  }| j
                  ddddf   |z  | j                  ddddf   z   S )z
        Perform forward pass for 2D layer normalization.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Normalized output tensor.
        r   Tkeepdimre   N)meanpowr{   sqrtr   r   r   )r1   rp   uss       r6   rN   zLayerNorm2d.forward  s     FF1dF#UKKN40UejjTXX..{{1dD=)A-		!T4-0HHHr7   )gư>)r   rP   r   rQ   r   r   ra   s   @r6   r   r     s     Ir7   r   c                  N     e Zd ZdZdd fdZd Z	 d	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )	r   a  
    Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.

    This module implements multiscale deformable attention that can attend to features at multiple scales
    with learnable sampling locations and attention weights.

    Attributes:
        im2col_step (int): Step size for im2col operations.
        d_model (int): Model dimension.
        n_levels (int): Number of feature levels.
        n_heads (int): Number of attention heads.
        n_points (int): Number of sampling points per attention head per feature level.
        sampling_offsets (nn.Linear): Linear layer for generating sampling offsets.
        attention_weights (nn.Linear): Linear layer for generating attention weights.
        value_proj (nn.Linear): Linear layer for projecting values.
        output_proj (nn.Linear): Linear layer for projecting output.

    References:
        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
    c                   t         |           ||z  dk7  rt        d| d|       ||z  }||z  |k(  sJ d       d| _        || _        || _        || _        || _        t        j                  |||z  |z  dz        | _
        t        j                  |||z  |z        | _        t        j                  ||      | _        t        j                  ||      | _        | j                          y)aG  
        Initialize MSDeformAttn with the given parameters.

        Args:
            d_model (int): Model dimension.
            n_levels (int): Number of feature levels.
            n_heads (int): Number of attention heads.
            n_points (int): Number of sampling points per attention head per feature level.
        r   z.d_model must be divisible by n_heads, but got z and z(`d_model` must be divisible by `n_heads`@   re   N)r   r    
ValueErrorim2col_stepd_modeln_levelsn_headsn_pointsr#   r&   sampling_offsetsattention_weights
value_projoutput_proj_reset_parameters)r1   r   r   r   r   _d_per_headr5   s         r6   r    zMSDeformAttn.__init__  s     	W!MgYV[\c[deff(W$/[1[[/   "		'7X3E3PST3T U!#7Gh4F4Q!R))GW599Wg6 r7   c                H   t        | j                  j                  j                  d       t	        j
                  | j                  t        j                        dt        j                  z  | j                  z  z  }t	        j                  |j                         |j                         gd      }||j                         j                  dd      d   z  j                  | j                  ddd	      j!                  d| j"                  | j$                  d      }t'        | j$                        D ]  }|d
d
d
d
|d
d
fxx   |dz   z  cc<    t	        j(                         5  t+        j,                  |j                  d            | j                  _        d
d
d
       t        | j0                  j                  j                  d       t        | j0                  j.                  j                  d       t3        | j4                  j                  j                         t        | j4                  j.                  j                  d       t3        | j6                  j                  j                         t        | j6                  j.                  j                  d       y
# 1 sw Y   xY w)zReset module parameters.r   rw   g       @rh   Tr   r   r   re   N)r   r   r   datar{   r|   r   r}   mathpistackr   r   absmaxrn   repeatr   r   r   no_gradr#   r   r   r   r   r   r   )r1   thetas	grid_initr   s       r6   r   zMSDeformAttn._reset_parameters  s   $''..33S9dll%--@C$''MTXT`T`D`aKKvzz| <bA	,,R,>qAAT$,,1a(VAt}}dmmQ7 	
 t}}% 	+AaAqj!QU*!	+]]_ 	J)+innR6H)ID!!&	J$((//44c:$((--22C8..334$//&&++S1((//445$""'',,c2	J 	Js    4JJ!c           	        |j                   dd \  }}|j                   d   }t        d |D              |k(  sJ | j                  |      }||j                  |d   t	        d            }|j                  ||| j                  | j                  | j                  z        }| j                  |      j                  ||| j                  | j                  | j                  d      }	| j                  |      j                  ||| j                  | j                  | j                  z        }
t        j                  |
d      j                  ||| j                  | j                  | j                        }
|j                   d   }|dk(  rdt        j                  ||j                   |j"                        j%                  d      }|	|ddddddddf   z  }|ddddddddddf   |z   }nQ|d	k(  r=|	| j                  z  |ddddddddddf   z  d
z  }|ddddddddddf   |z   }nt'        d| d      t)        ||||
      }| j+                  |      S )a  
        Perform forward pass for multiscale deformable attention.

        Args:
            query (torch.Tensor): Query tensor with shape [bs, query_length, C].
            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2],
                range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area.
            value (torch.Tensor): Value tensor with shape [bs, value_length, C].
            value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
            value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
                elements, False for padding elements.

        Returns:
            (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].

        References:
            https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
        Nre   r   c              3  2   K   | ]  }|d    |d   z    yw)r   r   Nr9   )r   r   s     r6   r   z'MSDeformAttn.forward.<locals>.<genexpr>2  s     511Q4!A$;5s   rz   r   rh   )rg   rf   rv   g      ?z5Last dim of reference_points must be 2 or 4, but got .)ri   sumr   masked_fillrQ   rn   r   r   r   r   r   r   Fsoftmaxr{   	as_tensorrg   rf   flipr   r   r   )r1   query
refer_bboxrA   value_shapes
value_maskbslen_qlen_vr   r   
num_pointsoffset_normalizeraddsampling_locationsoutputs                   r6   rN   zMSDeformAttn.forward  sH   4 KKO	EA555>>>&!%%j&;U1XFE

2udllDLLDLL4PQ007<<RVZVcVceierertuv 2259>>r5$,,X\XeXehlhuhuXuvII&7<AA"eT\\[_[h[hjnjwjwx%%b)
? %EKKX]XdXd e j jkm n"%6tT4DRS7S%TTC!+Aq$4,B!Cc!I1_"T]]2Z1dAtUVUW@W5XX[^^C!+Aq$4!,C!Ds!JTU_T``abcc4ULJ\^op''r7   )r   rv   r   rv   )r   rP   r   rP   r   rP   r   rP   rT   )r   rV   r   rV   rA   rV   r   listr   rW   rX   rV   )rZ   r[   r\   r]   r    r   rN   r`   ra   s   @r6   r   r     sV    *!>36 +/1(1( !1( 	1(
 1( (1( 
1(r7   r   c                       e Zd ZdZdddd ej
                         ddf	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZedd       Zdd	Z		 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
Z
 xZS )r   a;  
    Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.

    This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
    attention, and a feedforward network.

    Attributes:
        self_attn (nn.MultiheadAttention): Self-attention module.
        dropout1 (nn.Dropout): Dropout after self-attention.
        norm1 (nn.LayerNorm): Layer normalization after self-attention.
        cross_attn (MSDeformAttn): Cross-attention module.
        dropout2 (nn.Dropout): Dropout after cross-attention.
        norm2 (nn.LayerNorm): Layer normalization after cross-attention.
        linear1 (nn.Linear): First linear layer in the feedforward network.
        act (nn.Module): Activation function.
        dropout3 (nn.Dropout): Dropout in the feedforward network.
        linear2 (nn.Linear): Second linear layer in the feedforward network.
        dropout4 (nn.Dropout): Dropout after the feedforward network.
        norm3 (nn.LayerNorm): Layer normalization after the feedforward network.

    References:
        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
    r   r   i   r   rv   c                h   t         |           t        j                  |||      | _        t        j
                  |      | _        t        j                  |      | _        t        ||||      | _
        t        j
                  |      | _        t        j                  |      | _        t        j                  ||      | _        || _        t        j
                  |      | _        t        j                  ||      | _        t        j
                  |      | _        t        j                  |      | _        y)a  
        Initialize the DeformableTransformerDecoderLayer with the given parameters.

        Args:
            d_model (int): Model dimension.
            n_heads (int): Number of attention heads.
            d_ffn (int): Dimension of the feedforward network.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            n_levels (int): Number of feature levels.
            n_points (int): Number of sampling points.
        )r   N)r   r    r#   r$   	self_attnr,   r-   r)   r*   r   
cross_attnr.   r+   r&   linear1r/   dropout3linear2dropout4norm3)	r1   r   r   d_ffnr   r/   r   r   r5   s	           r6   r    z*DeformableTransformerDecoderLayer.__init__d  s    , 	 ..wQ

7+\\'*
 'w'8L

7+\\'*
 yy%0

7+yy0

7+\\'*
r7   c                    || S | |z   S )z;Add positional embeddings to the input tensor, if provided.r9   r:   s     r6   r=   z0DeformableTransformerDecoderLayer.with_pos_embed  r>   r7   c           	         | j                  | j                  | j                  | j                  |                        }|| j	                  |      z   }| j                  |      S )z
        Perform forward pass through the Feed-Forward Network part of the layer.

        Args:
            tgt (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after FFN.
        )r  r  r/   r   r  r  )r1   tgttgt2s      r6   forward_ffnz-DeformableTransformerDecoderLayer.forward_ffn  sN     ||DMM$((4<<3D*EFGDMM$''zz#r7   c                   | j                  ||      x}}	| j                  |j                  dd      |	j                  dd      |j                  dd      |      d   j                  dd      }
|| j                  |
      z   }| j	                  |      }| j                  | j                  ||      |j                  d      |||      }
|| j                  |
      z   }| j                  |      }| j                  |      S )aH  
        Perform the forward pass through the entire decoder layer.

        Args:
            embed (torch.Tensor): Input embeddings.
            refer_bbox (torch.Tensor): Reference bounding boxes.
            feats (torch.Tensor): Feature maps.
            shapes (list): Feature shapes.
            padding_mask (torch.Tensor, optional): Padding mask.
            attn_mask (torch.Tensor, optional): Attention mask.
            query_pos (torch.Tensor, optional): Query position embeddings.

        Returns:
            (torch.Tensor): Output tensor after decoder layer.
        r   r   )rB   re   )
r=   r   	transposer-   r*   r   	unsqueezer.   r+   r
  )r1   embedr   featsshapespadding_maskrB   	query_posrG   rH   r  s              r6   rN   z)DeformableTransformerDecoderLayer.forward  s    4 ##E955AnnQ[[A.Aq0A5??STVWCXdmnn

)Aq/ 	 c**

5! ooy1:3G3G3JESY[g
 c**

5! &&r7   )r   rP   r   rP   r  rP   r   rQ   r/   rR   r   rP   r   rP   rU   )r  rV   rX   rV   rY   )r  rV   r   rV   r  rV   r  r   r  rW   rB   rW   r  rW   rX   rV   )rZ   r[   r\   r]   r#   r   r    r_   r=   r
  rN   r`   ra   s   @r6   r   r   J  s    6  (+(+ (+ 	(+
 (+ (+ (+ (+T 7 7( -1)-)-)')' !)' 	)'
 )' *)' ')' ')' 
)'r7   r   c                  V     e Zd ZdZdd fdZ	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )r   av  
    Deformable Transformer Decoder based on PaddleDetection implementation.

    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction
    heads for bounding box regression and classification.

    Attributes:
        layers (nn.ModuleList): List of decoder layers.
        num_layers (int): Number of decoder layers.
        hidden_dim (int): Hidden dimension.
        eval_idx (int): Index of the layer to use during evaluation.

    References:
        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    c                    t         |           t        ||      | _        || _        || _        |dk\  r|| _        y||z   | _        y)aU  
        Initialize the DeformableTransformerDecoder with the given parameters.

        Args:
            hidden_dim (int): Hidden dimension.
            decoder_layer (nn.Module): Decoder layer module.
            num_layers (int): Number of decoder layers.
            eval_idx (int): Index of the layer to use during evaluation.
        r   N)r   r    r	   r   r   r   eval_idx)r1   r   decoder_layerr   r  r5   s        r6   r    z%DeformableTransformerDecoder.__init__  sD     	!-<$$$,MzH7Lr7   c
                   |}
g }g }d}|j                         }t        | j                        D ]  \  }} ||
||||	| ||            }
 ||   |
      }t        j                   |t	        |      z         }| j
                  rb|j                   ||   |
             |dk(  r|j                  |       nm|j                  t        j                   |t	        |      z                n<|| j                  k(  r-|j                   ||   |
             |j                  |        n#|}| j
                  r|j                         n|} t        j                  |      t        j                  |      fS )a  
        Perform the forward pass through the entire decoder.

        Args:
            embed (torch.Tensor): Decoder embeddings.
            refer_bbox (torch.Tensor): Reference bounding boxes.
            feats (torch.Tensor): Image features.
            shapes (list): Feature shapes.
            bbox_head (nn.Module): Bounding box prediction head.
            score_head (nn.Module): Score prediction head.
            pos_mlp (nn.Module): Position MLP.
            attn_mask (torch.Tensor, optional): Attention mask.
            padding_mask (torch.Tensor, optional): Padding mask.

        Returns:
            dec_bboxes (torch.Tensor): Decoded bounding boxes.
            dec_cls (torch.Tensor): Decoded classification scores.
        Nr   )
r   r   r   r{   r
   trainingappendr  detachr   )r1   r  r   r  r  	bbox_head
score_headpos_mlprB   r  r   
dec_bboxesdec_clslast_refined_bboxr   r   bboxrefined_bboxs                     r6   rN   z$DeformableTransformerDecoder.forward  sN   < 
 '')
!$++. 	RHAu6:uflIW^_iWjkF9Q<'D ==
0K)KLL}}}z!}V456%%l3%%emmD?K\;]4]&^_dmm#}z!}V45!!,/ ,26--,,.\J%	R( {{:&G(<<<r7   )rh   )r   rP   r  rR   r   rP   r  rP   )NN)r  rV   r   rV   r  rV   r  r   r  rR   r  rR   r  rR   rB   rW   r  rW   r   ra   s   @r6   r   r     ss     M2 *.,07=7= !7= 	7=
 7= 7= 7= 7= '7= *7=r7   r   )!r]   
__future__r   r   r{   torch.nnr#   torch.nn.functional
functionalr   torch.nn.initr   r   ultralytics.utils.torch_utilsr   r   r   utilsr	   r
   r   __all__Moduler   r   r   r   r   r   r   r   r   r   r9   r7   r6   <module>r,     s     "      4 4  T TMKbii MK`Il" IlX)ryy )B.Vryy .Vb1ryy 1</E")) /Ed+I")) +I\{(299 {(|@'		 @'FX=299 X=r7   