
    .h                   J   d Z ddlmZ ddlZddlmZ ddlmc mZ ddl	m
Z
 ddlmZmZmZmZmZmZ ddlmZ dZ G d	 d
ej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d de      Z  G d d ej*                        Z! G d! d"e      Z" G d# d$e      Z# G d% d&ej*                        Z$ G d' d(ej*                        Z% G d) d*ej*                        Z& G d+ d,ej*                        Z' G d- d.ej*                        Z( G d/ d0ej*                        Z) G d1 d2ej*                        Z* G d3 d4ej*                        Z+ G d5 d6ej*                        Z, G d7 d8ej*                        Z- G d9 d:e%      Z. G d; d<e      Z/ G d= d>ej*                        Z0 G d? d@e0      Z1 G dA dBej*                        Z2 G dC dDej*                        Z3 G dE dFej*                        Z4 G dG dHej*                        Z5 G dI dJej*                        Z6 G dK dLej*                        Z7 G dM dNe      Z8 G dO dPe      Z9 G dQ dRej
                  j*                        Z: G dS dTej*                        Z; G dU dVe      Z< G dW dXej*                        Z= G dY dZej*                        Z> G d[ d\ej*                        Z? G d] d^ej*                        Z@ G d_ d`e      ZA G da dbej*                        ZB G dc ddej*                        ZC G de dfej*                        ZD G dg dhej*                        ZE G di djej*                        ZF G dk dlej*                        ZG G dm dnej*                        ZH G do dpej*                        ZIy)qzBlock modules.    )annotationsN)fuse_conv_and_bn   )ConvDWConv	GhostConv	LightConvRepConvautopad)TransformerBlock)'DFLHGBlockHGStemSPPSPPFC1C2C3C2fC2fAttnImagePoolingAttnContrastiveHeadBNContrastiveHeadC3xC3TRC3GhostGhostBottleneck
BottleneckBottleneckCSPProtoRepC3ResNetLayerRepNCSPELAN4ELAN1ADownAConvSPPELANCBFuseCBLinearC3k2C2fPSAC2PSARepVGGDWCIBC2fCIB	AttentionPSASCDownTorchVisionc                  .     e Zd ZdZdd fdZddZ xZS )r   z
    Integral module of Distribution Focal Loss (DFL).

    Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
    c                d   t         |           t        j                  |ddd      j	                  d      | _        t        j                  |t        j                        }t        j                  |j                  d|dd            | j
                  j                  j                  dd || _        y)z
        Initialize a convolutional layer with a given number of input channels.

        Args:
            c1 (int): Number of input channels.
        r   Fbias)dtypeN)super__init__nnConv2drequires_grad_convtorcharangefloat	Parameterviewweightdatac1)selfrF   x	__class__s      Z/var/www/html/ai-service/venv/lib/python3.12/site-packages/ultralytics/nn/modules/block.pyr:   zDFL.__init__A   s~     	IIb!QU3BB5I	LL5;;/#%<<q"a0C#D		a     c                    |j                   \  }}}| j                  |j                  |d| j                  |      j	                  dd      j                  d            j                  |d|      S )zCApply the DFL module to input tensor and return transformed output.      r   )shaper>   rC   rF   	transposesoftmax)rG   rH   b_as        rJ   forwardzDFL.forwardN   s]    ''1ayy1dggq1;;AqAII!LMRRSTVWYZ[[rK   )   )rF   intrH   torch.TensorreturnrY   __name__
__module____qualname____doc__r:   rU   __classcell__rI   s   @rJ   r   r   :   s    \rK   r   c                  .     e Zd ZdZdd fdZddZ xZS )r    zBUltralytics YOLO models mask Proto module for segmentation models.c                    t         |           t        ||d      | _        t	        j
                  ||dddd      | _        t        ||d      | _        t        ||      | _        y)a  
        Initialize the Ultralytics YOLO models mask Proto module with specified number of protos and masks.

        Args:
            c1 (int): Input channels.
            c_ (int): Intermediate channels.
            c2 (int): Output channels (number of protos).
           )krN   r   Tr6   N)	r9   r:   r   cv1r;   ConvTranspose2dupsamplecv2cv3)rG   rF   c_c2rI   s       rJ   r:   zProto.__init__X   sY     	B!$**2r1aFB!$B<rK   c           	     ~    | j                  | j                  | j                  | j                  |                        S )zEPerform a forward pass through layers using an upsampled input image.)rj   ri   rh   rf   rG   rH   s     rJ   rU   zProto.forwardg   s+    xxtxx{!;<==rK   )       )rF   rW   rk   rW   rl   rW   rX   r[   ra   s   @rJ   r    r    U   s    L >rK   r    c                  ,     e Zd ZdZd fdZddZ xZS )r   z
    StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    c           	        t         |           t        ||ddt        j                               | _        t        ||dz  dddt        j                               | _        t        |dz  |dddt        j                               | _        t        |dz  |ddt        j                               | _        t        ||ddt        j                               | _	        t        j                  dddd      | _        y)	z
        Initialize the StemBlock of PPHGNetV2.

        Args:
            c1 (int): Input channels.
            cm (int): Middle channels.
            c2 (int): Output channels.
        rd   rN   actr   r   T)kernel_sizestridepadding	ceil_modeN)r9   r:   r   r;   ReLUstem1stem2astem2bstem3stem4	MaxPool2dpool)rG   rF   cmrl   rI   s       rJ   r:   zHGStem.__init__s   s     	"b!QBGGI6
2rQw1aRWWY?27B1aRWWY?"q&"a	:
"b!QBGGI6
LLQq!tT	rK   c                d   | j                  |      }t        j                  |g d      }| j                  |      }t        j                  |g d      }| j	                  |      }| j                  |      }t        j                  ||gd      }| j                  |      }| j                  |      }|S )+Forward pass of a PPHGNetV2 backbone layer.)r   r   r   r   r   dim)
rz   Fpadr{   r|   r   r?   catr}   r~   )rG   rH   x2x1s       rJ   rU   zHGStem.forward   s    JJqMEE!\"[[^UU2|$[[_YYq\IIr2hA&JJqMJJqMrK   )rF   rW   r   rW   rl   rW   rX   r[   ra   s   @rJ   r   r   l   s    U"rK   r   c                  t     e Zd ZdZdddd ej
                         f	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZ xZS )	r   z
    HG_Block of PPHGNetV2 with 2 convolutions and LightConv.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    rd      Fc	                0  	 t         
|           |rt        nt        	t	        j
                  	fdt        |      D              | _        t        |z  z   |dz  dd      | _        t        |dz  |dd      | _	        |xr |k(  | _
        y)a  
        Initialize HGBlock with specified parameters.

        Args:
            c1 (int): Input channels.
            cm (int): Middle channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            n (int): Number of LightConv or Conv blocks.
            lightconv (bool): Whether to use LightConv.
            shortcut (bool): Whether to use shortcut connection.
            act (nn.Module): Activation function.
        c              3  D   K   | ]  } |d k(  rn        yw)r   re   rt   N ).0irt   blockrF   r   re   s     rJ   	<genexpr>z#HGBlock.__init__.<locals>.<genexpr>   s(     _QRu16Rr2LL_    rN   r   rs   N)r9   r:   r	   r   r;   
ModuleListrangemscecadd)rG   rF   r   rl   re   n	lightconvshortcutrt   r   rI   s    `` `   `@rJ   r:   zHGBlock.__init__   s~    0 	&	D_V[\]V^__rAF{B!GQs;rQwAqc2(brK   c                    |gj                  fd| j                  D               | j                  | j                  t	        j
                  d                  | j                  r|z   S S )r   c              3  4   K   | ]  } |d            ywNr   r   r   ys     rJ   r   z"HGBlock.forward.<locals>.<genexpr>        *a1R5*   r   )extendr   r   r   r?   r   r   rG   rH   r   s     @rJ   rU   zHGBlock.forward   sV    C	*466**GGDGGEIIaO,-q1u'a'rK   )rF   rW   r   rW   rl   rW   re   rW   r   rW   r   boolr   r   rt   	nn.ModulerX   )	r\   r]   r^   r_   r;   ry   r:   rU   r`   ra   s   @rJ   r   r      sy      )) ) 	)
 ) ) ) ) )>(rK   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   zDSpatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729.c                "   t         |           |dz  }t        ||dd      | _        t        |t	        |      dz   z  |dd      | _        t        j                  |D cg c]  }t        j                  |d|dz         c}      | _	        yc c}w )z
        Initialize the SPP layer with input/output channels and pooling kernel sizes.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (tuple): Kernel sizes for max pooling.
        rN   r   ru   rv   rw   N)
r9   r:   r   rf   lenri   r;   r   r   r   )rG   rF   rl   re   rk   rH   rI   s         rJ   r:   zSPP.__init__   s|     	1WB1%c!fqj)2q!4_`aZ[1aSTf Uabas   "Bc                    | j                  |      }| j                  t        j                  |g| j                  D cg c]
  } ||       c}z   d            S c c}w )zBForward pass of the SPP layer, performing spatial pyramid pooling.r   )rf   ri   r?   r   r   )rG   rH   r   s      rJ   rU   zSPP.forward   sF    HHQKxx		1#tvv(>!1(>">BCC(>s   A))   	      )rF   rW   rl   rW   re   tuple[int, ...]rX   r[   ra   s   @rJ   r   r      s    NcDrK   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   zGSpatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.c                    t         |           |dz  }t        ||dd      | _        t        |dz  |dd      | _        t        j                  |d|dz        | _        y)a'  
        Initialize the SPPF layer with given input/output channels and kernel size.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.

        Notes:
            This module is equivalent to SPP(k=(5, 9, 13)).
        rN   r   rM   r   N)r9   r:   r   rf   ri   r;   r   r   )rG   rF   rl   re   rk   rI   s        rJ   r:   zSPPF.__init__   sY     	1WB1%QAq)!AqAvFrK   c                      j                  |      gj                   fdt        d      D                j                  t	        j
                  d            S )zRApply sequential pooling operations to input and return concatenated feature maps.c              3  F   K   | ]  }j                  d            ywr   r   )r   rS   rG   r   s     rJ   r   zSPPF.forward.<locals>.<genexpr>   s     11"1s   !rd   r   )rf   r   r   ri   r?   r   r   s   ` @rJ   rU   zSPPF.forward   sA    XXa[M	1a11xx		!Q((rK   r   )rF   rW   rl   rW   re   rW   rX   r[   ra   s   @rJ   r   r      s    QG$)rK   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   z"CSP Bottleneck with 1 convolution.c                    t         |           t        |dd      | _        t	        j
                  fdt        |      D         | _        y)z
        Initialize the CSP Bottleneck with 1 convolution.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of convolutions.
        r   c              3  8   K   | ]  }t        d         yw)rd   N)r   )r   rS   rl   s     rJ   r   zC1.__init__.<locals>.<genexpr>  s      CQb"a Cs   N)r9   r:   r   rf   r;   
Sequentialr   r   )rG   rF   rl   r   rI   s     ` rJ   r:   zC1.__init__   s<     	B1% C%( CDrK   c                L    | j                  |      }| j                  |      |z   S )z:Apply convolution and residual connection to input tensor.)rf   r   r   s      rJ   rU   z
C1.forward  s!    HHQKvvay1}rK   r   )rF   rW   rl   rW   r   rW   rX   r[   ra   s   @rJ   r   r      s    ,ErK   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   z#CSP Bottleneck with 2 convolutions.c                "    t                    t        ||z         _        t	        |d j                  z  dd       _        t	        d j                  z  |d       _        t        j                   fdt        |      D          _
        y)ah  
        Initialize a CSP Bottleneck with 2 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rN   r   c           	   3  h   K   | ])  }t        j                  j                  d d       + yw)rd   rd   r         ?re   eNr   cr   rS   grG   r   s     rJ   r   zC2.__init__.<locals>.<genexpr>  s.      vhiDFFDFFHaK[_b!c!c v   /2Nr9   r:   rW   r   r   rf   ri   r;   r   r   r   rG   rF   rl   r   r   r   r   rI   s   `   `` rJ   r:   zC2.__init__  sn     	R!VAJ1-DFF
B* vmrstmu vwrK   c                    | j                  |      j                  dd      \  }}| j                  t        j                  | j                  |      |fd            S )z<Forward pass through the CSP bottleneck with 2 convolutions.rN   r   )rf   chunkri   r?   r   r   rG   rH   rT   rR   s       rJ   rU   z
C2.forward   sF    xx{  A&1xx		466!9a.!455rK   r   Tr         ?rF   rW   rl   rW   r   rW   r   r   r   rW   r   rA   rX   r[   ra   s   @rJ   r   r   
  s    -x&6rK   r   c                  6     e Zd ZdZdd fdZddZddZ xZS )r   <Faster Implementation of CSP Bottleneck with 2 convolutions.c                .    t                    t        ||z         _        t	        |d j                  z  dd       _        t	        d|z    j                  z  |d       _        t        j                   fdt        |      D               _
        y)ah  
        Initialize a CSP bottleneck with 2 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rN   r   c           	   3  h   K   | ])  }t        j                  j                  d d       + ywr   r   r   s     rJ   r   zC2f.__init__.<locals>.<genexpr>9  .     tfgz$&&$&&(AIY]`aatr   N)r9   r:   rW   r   r   rf   ri   r;   r   r   r   r   s   `   `` rJ   r:   zC2f.__init__)  ss     	R!VAJ1-Q$&&("a0tkpqrksttrK   c                    t        | j                  |      j                  dd            j                  fd| j                  D               | j                  t        j                  d            S )zForward pass through C2f layer.rN   r   c              3  4   K   | ]  } |d            ywr   r   r   s     rJ   r   zC2f.forward.<locals>.<genexpr>>  r   r   )listrf   r   r   r   ri   r?   r   r   s     @rJ   rU   zC2f.forward;  sQ    !""1a()	*466**xx		!Q((rK   c                   | j                  |      j                  | j                  | j                  fd      d   d   gj                  fd| j                  D               | j                  t        j                  d            S ).Forward pass using split() instead of chunk().r   r   c              3  4   K   | ]  } |d            ywr   r   r   s     rJ   r   z$C2f.forward_split.<locals>.<genexpr>E  r   r   )rf   splitr   r   r   ri   r?   r   r   s     @rJ   forward_splitzC2f.forward_splitA  sj    HHQKtvvtvv.2qT1Q4L	*466**xx		!Q((rK   r   Fr   r   r   rX   r\   r]   r^   r_   r:   rU   r   r`   ra   s   @rJ   r   r   &  s    Fu$))rK   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   z#CSP Bottleneck with 3 convolutions.c                   t         |           t        ||z        t        |dd      | _        t        |dd      | _        t        dz  |d      | _        t        j                  fdt        |      D         | _
        y)aj  
        Initialize the CSP Bottleneck with 3 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        r   rN   c           	   3  @   K   | ]  }t        d d        yw)))r   r   r   r   r   Nr   r   rS   rk   r   r   s     rJ   r   zC3.__init__.<locals>.<genexpr>]  s&      n`aBHaCSWZ![![ n   N)r9   r:   rW   r   rf   ri   rj   r;   r   r   r   	rG   rF   rl   r   r   r   r   rk   rI   s	       `` @rJ   r:   zC3.__init__L  sr     	a[B1%B1%BA& nejklem norK   c           	         | j                  t        j                  | j                  | j	                  |            | j                  |      fd            S )z<Forward pass through the CSP bottleneck with 3 convolutions.r   )rj   r?   r   r   rf   ri   rn   s     rJ   rU   z
C3.forward_  s:    xx		466$((1+#6"DaHIIrK   r   r   rX   r[   ra   s   @rJ   r   r   I  s    -p&JrK   r   c                  &     e Zd ZdZdd fdZ xZS )r   z"C3 module with cross-convolutions.c                     t            ||||       t        ||z         _        t	        j
                   fdt        |      D          _        y)ae  
        Initialize C3 module with cross-convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c           	   3  h   K   | ])  }t        j                  j                  d d       + yw)))r   rd   rd   r   r   r   N)r   rk   r   s     rJ   r   zC3x.__init__.<locals>.<genexpr>u  s.      vhiDGGTWWhM]ab!c!c vr   N)r9   r:   rW   rk   r;   r   r   r   r   s   `   `` rJ   r:   zC3x.__init__g  sH     	RHa3b1f+ vmrstmu vwrK   r   r   r\   r]   r^   r_   r:   r`   ra   s   @rJ   r   r   d  s    ,x xrK   r   c                  .     e Zd ZdZdd fdZddZ xZS )r!   zRep C3.c           	     h   t         |           t        ||z        }t        ||dd      | _        t        ||dd      | _        t        j                  t        |      D cg c]  }t        ||       c} | _
        ||k7  rt        ||dd      | _        yt        j                         | _        yc c}w )z
        Initialize CSP Bottleneck with a single convolution.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of RepConv blocks.
            e (float): Expansion ratio.
        r   N)r9   r:   rW   r   rf   ri   r;   r   r   r
   r   Identityrj   )rG   rF   rl   r   r   rk   rS   rI   s          rJ   r:   zRepC3.__init__{  s     	a[B1%B1%%( CQR CD)+r4B1%r{{} !Ds    B/c                    | j                  | j                  | j                  |            | j                  |      z         S )zForward pass of RepC3 module.)rj   r   rf   ri   rn   s     rJ   rU   zRepC3.forward  s/    xxtxx{+dhhqk9::rK   )rd   r   rF   rW   rl   rW   r   rW   r   rA   rX   r[   ra   s   @rJ   r!   r!   x  s    E";rK   r!   c                  &     e Zd ZdZdd fdZ xZS )r   z"C3 module with TransformerBlock().c                p    t         |   ||||||       t        ||z        }t        ||d|      | _        y)ad  
        Initialize C3 module with TransformerBlock.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Transformer blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rM   N)r9   r:   rW   r   r   r   s	           rJ   r:   zC3TR.__init__  s;     	RHa3a[!"b!Q/rK   r   r   r   ra   s   @rJ   r   r     s    ,0 0rK   r   c                  &     e Zd ZdZdd fdZ xZS )r   z!C3 module with GhostBottleneck().c                    t         |   ||||||       t        ||z        t        j                  fdt        |      D         | _        y)ah  
        Initialize C3 module with GhostBottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Ghost bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c              3  6   K   | ]  }t                y w)N)r   )r   rS   rk   s     rJ   r   z#C3Ghost.__init__.<locals>.<genexpr>  s      KQR!8 Ks   Nr9   r:   rW   r;   r   r   r   r   s	          @rJ   r:   zC3Ghost.__init__  sC     	RHa3a[ K%( KLrK   r   r   r   ra   s   @rJ   r   r     s    +M MrK   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   zGGhost Bottleneck https://github.com/huawei-noah/Efficient-AI-Backbones.c                   t         |           |dz  }t        j                  t	        ||dd      |dk(  rt        ||||d      nt        j                         t	        ||ddd            | _        |dk(  r8t        j                  t        ||||d      t        ||ddd            | _	        yt        j                         | _	        y)z
        Initialize Ghost Bottleneck module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            s (int): Stride.
        rN   r   Frs   N)
r9   r:   r;   r   r   r   r   r>   r   r   )rG   rF   rl   re   srk   rI   s         rJ   r:   zGhostBottleneck.__init__  s     	1WMMb"a#/0AvF2r1aU+2;;=b"a.
	 ^_bc]cBMM&RA594B1RW;XY 	ikititiv 	rK   c                H    | j                  |      | j                  |      z   S )z8Apply skip connection and concatenation to input tensor.)r>   r   rn   s     rJ   rU   zGhostBottleneck.forward  s    yy|dmmA...rK   r   rF   rW   rl   rW   re   rW   r   rW   rX   r[   ra   s   @rJ   r   r     s    Q
*/rK   r   c                  F     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 d fdZddZ xZS )r   zStandard bottleneck.c                    t         |           t        ||z        }t        |||d   d      | _        t        |||d   d|      | _        |xr ||k(  | _        y)ac  
        Initialize a standard bottleneck module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            g (int): Groups for convolutions.
            k (tuple): Kernel sizes for convolutions.
            e (float): Expansion ratio.
        r   r   r   N)r9   r:   rW   r   rf   ri   r   	rG   rF   rl   r   r   re   r   rk   rI   s	           rJ   r:   zBottleneck.__init__  s[     	a[B!a(B!a1-(brK   c                    | j                   r#|| j                  | j                  |            z   S | j                  | j                  |            S )z3Apply bottleneck with optional shortcut connection.)r   ri   rf   rn   s     rJ   rU   zBottleneck.forward  s:    ,0HHq488DHHQK((O$((488A;:OOrK   Tr   r   r   rF   rW   rl   rW   r   r   r   rW   re   ztuple[int, int]r   rA   rX   r[   ra   s   @rJ   r   r     sH     lo)))*.):=)FU)ch)(PrK   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   zGCSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.c                   t         |           t        ||z        t        |dd      | _        t        j                  |ddd      | _        t        j                  ddd      | _        t        dz  |dd      | _	        t        j                  dz        | _        t        j                         | _        t        j                  fdt        |      D         | _        y)aR  
        Initialize CSP Bottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        r   Fr6   rN   c              3  >   K   | ]  }t        d         ywr   r   Nr   r   s     rJ   r   z)BottleneckCSP.__init__.<locals>.<genexpr>	  s!      ZABHa3!G!G Z   N)r9   r:   rW   r   rf   r;   r<   ri   rj   cv4BatchNorm2dbnSiLUrt   r   r   r   r   s	       `` @rJ   r:   zBottleneckCSP.__init__  s     	a[B1%99RQ699RQ6BAq)..R(779 ZQVWXQY Z[rK   c           
        | j                  | j                  | j                  |                  }| j                  |      }| j	                  | j                  | j                  t        j                  ||fd                        S )z)Apply CSP bottleneck with 3 convolutions.r   )	rj   r   rf   ri   r  rt   r  r?   r   )rG   rH   y1y2s       rJ   rU   zBottleneckCSP.forward  s^    XXdffTXXa[)*XXa[xxB8Q)?!@ABBrK   r   r   rX   r[   ra   s   @rJ   r   r     s    Q\,CrK   r   c                  .     e Zd ZdZdd fdZddZ xZS )ResNetBlockz.ResNet block with standard convolution layers.c           	     B   t         |           ||z  }t        ||ddd      | _        t        ||d|dd      | _        t        ||dd      | _        |dk7  s||k7  r)t        j                  t        ||d|d            | _	        yt        j                         | _	        y)	z
        Initialize ResNet block.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            s (int): Stride.
            e (int): Expansion ratio.
        r   Tre   r   rt   rd   re   r   prt   Fr   N)
r9   r:   r   rf   ri   rj   r;   r   r   r   )rG   rF   rl   r   r   c3rI   s         rJ   r:   zResNetBlock.__init__  s     	VB!qd3B!qA48B!/LMQRFVX\^V^d2rQ!&GHdfdododqrK   c           	         t        j                  | j                  | j                  | j	                  |                  | j                  |      z         S )z&Forward pass through the ResNet block.)r   relurj   ri   rf   r   rn   s     rJ   rU   zResNetBlock.forward&  s9    vvdhhtxx45a8HHIIrK   )r   rM   )rF   rW   rl   rW   r   rW   r   rW   rX   r[   ra   s   @rJ   r  r    s    8r"JrK   r  c                  .     e Zd ZdZdd fdZddZ xZS )r"   z)ResNet layer with multiple ResNet blocks.c                   t         	|           || _        | j                  rAt        j                  t        ||dddd      t        j                  ddd            | _        y	t        ||||      g}|j                  t        |dz
        D cg c]  }t        ||z  |d|       c}       t        j                  | | _        y	c c}w )
a5  
        Initialize ResNet layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            s (int): Stride.
            is_first (bool): Whether this is the first layer.
            n (int): Number of ResNet blocks.
            e (int): Expansion ratio.
           rN   rd   Tr  r   r   r  N)r9   r:   is_firstr;   r   r   r   layerr  r   r   )
rG   rF   rl   r   r!  r   r   blocksrS   rI   s
            rJ   r:   zResNetLayer.__init__.  s     	 ==RqA5r||PQZ[ef7gDJ ""b!q12FMME!a%LQq;q2vr1:QR/DJ Rs   Cc                $    | j                  |      S )z&Forward pass through the ResNet layer.)r"  rn   s     rJ   rU   zResNetLayer.forwardF  s    zz!}rK   )r   Fr   rM   )rF   rW   rl   rW   r   rW   r!  r   r   rW   r   rW   rX   r[   ra   s   @rJ   r"   r"   +  s    300rK   r"   c                  .     e Zd ZdZdd fdZddZ xZS )MaxSigmoidAttnBlockzMax Sigmoid attention block.c                   t         |           || _        ||z  | _        ||k7  rt	        ||dd      nd| _        t        j                  ||      | _        t        j                  t        j                  |            | _        t	        ||ddd      | _        |r1t        j                  t        j                  d|dd            | _        yd| _        y)aH  
        Initialize MaxSigmoidAttnBlock.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            nh (int): Number of heads.
            ec (int): Embedding channels.
            gc (int): Guide channels.
            scale (bool): Whether to use learnable scale parameter.
        r   Fr   Nrd   r  r   )r9   r:   nhhcr   r   r;   LinearglrB   r?   zerosr7   	proj_convonesscale)rG   rF   rl   r(  r   gcr/  rI   s          rJ   r:   zMaxSigmoidAttnBlock.__init__N  s     	(24($r2.))B#LLR1	b"QE:>CR\\%**QAq"9:

rK   c                   |j                   \  }}}}| j                  |      }|j                  ||j                   d   | j                  | j                        }| j
                  | j                  |      n|}|j                  || j                  | j                  ||      }t        j                  d||      }|j                  d      d   }|| j                  dz  z  }|| j                  dddddf   z   }|j                         | j                  z  }| j                  |      }|j                  || j                  d||      }||j                  d      z  }|j                  |d||      S )	z
        Forward pass of MaxSigmoidAttnBlock.

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor.

        Returns:
            (torch.Tensor): Output tensor after attention.
        r   Nzbmchw,bnmc->bmhwnr   r   r   r   rN   )rO   r+  rC   r(  r)  r   r?   einsummaxr7   sigmoidr/  r-  	unsqueeze)	rG   rH   guidebsrS   hwembedaws	            rJ   rU   zMaxSigmoidAttnBlock.forwardc  s4    ggAq!

2u{{1~tww@"gg1
q

2twwA6\\-ue<VVV^A477C< $))D!T4/00ZZ\DJJ&NN1FF2twwAq)Qvvb"a##rK   )r         F)rF   rW   rl   rW   r(  rW   r   rW   r0  rW   r/  r   rH   rY   r6  rY   rZ   rY   r[   ra   s   @rJ   r&  r&  K  s    &M*$rK   r&  c                  f     e Zd ZdZ	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZddZ xZS )r   z*C2f module with an additional attn module.c
                    t         
           t        ||	z         _        t	        |d j                  z  dd       _        t	        d|z    j                  z  |d       _        t        j                   fdt        |      D               _
        t         j                   j                  |||       _        y)a  
        Initialize C2f module with attention mechanism.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            ec (int): Embedding channels for attention.
            nh (int): Number of heads for attention.
            gc (int): Guide channels for attention.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rN   r   rd   c           	   3  h   K   | ])  }t        j                  j                  d d       + ywr   r   r   s     rJ   r   z#C2fAttn.__init__.<locals>.<genexpr>  r   r   )r0  r   r(  N)r9   r:   rW   r   r   rf   ri   r;   r   r   r   r&  attn)rG   rF   rl   r   r   r(  r0  r   r   r   rI   s   `      `` rJ   r:   zC2fAttn.__init__  s    4 	R!VAJ1-Q$&&("a0tkpqrkstt'2"L	rK   c                2   t        | j                  |      j                  dd            j                  fd| j                  D               j                  | j                  d   |             | j                  t        j                  d            S )a  
        Forward pass through C2f layer with attention.

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor for attention.

        Returns:
            (torch.Tensor): Output tensor after processing.
        rN   r   c              3  4   K   | ]  } |d            ywr   r   r   s     rJ   r   z"C2fAttn.forward.<locals>.<genexpr>  r   r   r   )
r   rf   r   r   r   appendrB  ri   r?   r   rG   rH   r6  r   s      @rJ   rU   zC2fAttn.forward  sn     !""1a()	*466**	1R5%()xx		!Q((rK   c                ^   t        | j                  |      j                  | j                  | j                  fd            j	                  fd| j
                  D               j                  | j                  d   |             | j                  t        j                  d            S )a  
        Forward pass using split() instead of chunk().

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor for attention.

        Returns:
            (torch.Tensor): Output tensor after processing.
        r   c              3  4   K   | ]  } |d            ywr   r   r   s     rJ   r   z(C2fAttn.forward_split.<locals>.<genexpr>  r   r   r   )r   rf   r   r   r   r   rE  rB  ri   r?   r   rF  s      @rJ   r   zC2fAttn.forward_split  s{     !""DFFDFF#3Q78	*466**	1R5%()xx		!Q((rK   )r   r<  r   r=  Fr   r   )rF   rW   rl   rW   r   rW   r   rW   r(  rW   r0  rW   r   r   r   rW   r   rA   r>  r   ra   s   @rJ   r   r     s    4 MM M 	M
 M M M M M MB) )rK   r   c                  F     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 d fdZddZ xZS )r   zKImagePoolingAttn: Enhance the text embeddings with image-aware information.c           
        t         
|           t        |      }t        j                  t        j
                  |      t        j                  ||            | _        t        j                  t        j
                  |      t        j                  ||            | _        t        j                  t        j
                  |      t        j                  ||            | _	        t        j                  ||      | _
        |r+t        j                  t        j                  dg      d      nd| _        t        j                  |D cg c]  }t        j                   ||d       c}      | _        t        j                  t%        |      D 	cg c]  }	t        j&                  ||f       c}	      | _        || _        || _        || _        ||z  | _        || _        yc c}w c c}	w )a  
        Initialize ImagePoolingAttn module.

        Args:
            ec (int): Embedding channels.
            ch (tuple): Channel dimensions for feature maps.
            ct (int): Channel dimension for text embeddings.
            nh (int): Number of attention heads.
            k (int): Kernel size for pooling.
            scale (bool): Whether to use learnable scale parameter.
        g        Trequires_gradr   r   )ru   N)r9   r:   r   r;   r   	LayerNormr*  querykeyvalueprojrB   r?   tensorr/  r   r<   projectionsr   AdaptiveMaxPool2dim_poolsr   r(  nfr)  re   )rG   r   chctr(  re   r/  rV  in_channelsrS   rI   s             rJ   r:   zImagePoolingAttn.__init__  sJ    	W]]2<<#3RYYr25FG
==b!1299R3DE]]2<<#3RYYr25FG
IIb"%	NSR\\%,,u"5TJY\
==gi)jXc"))KQR*S)jkUSUY&Wr';';QF'C&WX( *k&Ws   G
G c           
        |d   j                   d   }t        |      | j                  k(  sJ | j                  dz  }t	        || j
                  | j                        D cg c]%  \  }}} | ||            j                  |d|      ' c}}}}t        j                  |d      j                  dd      }| j                  |      }| j                  |      }| j                  |      }	|j                  |d| j                  | j                         }|j                  |d| j                  | j                         }|	j                  |d| j                  | j                         }	t        j"                  d||      }
|
| j                   dz  z  }
t%        j&                  |
d      }
t        j"                  d|
|	      }| j)                  |j                  |d| j*                              }|| j,                  z  |z   S c c}}}w )	z
        Forward pass of ImagePoolingAttn.

        Args:
            x (list[torch.Tensor]): List of input feature maps.
            text (torch.Tensor): Text embeddings.

        Returns:
            (torch.Tensor): Enhanced text embeddings.
        r   rN   r   r   r   zbnmc,bkmc->bmnkr   zbmnk,bkmc->bnmc)rO   r   rV  re   ziprS  rU  rC   r?   r   rP   rN  rO  rP  reshaper(  r)  r2  r   rQ   rQ  r   r/  )rG   rH   textr7  num_patchesrQ  r   qre   vr;  s              rJ   rU   zImagePoolingAttn.forward  s    qTZZ]1v   ffaiLOPQSWScSceiererLstt!T4T$q']B4tIIaR **1a0JJtHHQKJJqM IIb"dggtww/IIb"dggtww/IIb"dggtww/\\+Q2477C< YYrr"LL*B2IIaiiB014::~$$# us   !*G7)ro   r   r=     rd   F)r   rW   rW  r   rX  rW   r(  rW   re   rW   r/  r   )rH   list[torch.Tensor]r]  rY   rZ   rY   r[   ra   s   @rJ   r   r     sG    U ns!0;>JMVYfj<%rK   r   c                  *     e Zd ZdZ fdZddZ xZS )r   zZImplements contrastive learning head for region-text similarity in vision-language models.c                    t         |           t        j                  t	        j
                  dg            | _        t        j                  t	        j                  g       t	        j
                  d      j                         z        | _	        y)zBInitialize ContrastiveHead with region-text similarity parameters.      $g$I$I,@N)
r9   r:   r;   rB   r?   rR  r7   r.  loglogit_scale)rG   rI   s    rJ   r:   zContrastiveHead.__init__  sY    LLug!67	<<

2h9O9S9S9U(UVrK   c                    t        j                  |dd      }t        j                  |dd      }t        j                  d||      }|| j                  j                         z  | j                  z   S )z
        Forward function of contrastive learning.

        Args:
            x (torch.Tensor): Image features.
            w (torch.Tensor): Text features.

        Returns:
            (torch.Tensor): Similarity scores.
        r   rN   r   r  r   bchw,bkc->bkhw)r   	normalizer?   r2  rg  expr7   rG   rH   r9  s      rJ   rU   zContrastiveHead.forward  s^     KKqA&KKrQ'LL)1a04##''))DII55rK   rH   rY   r9  rY   rZ   rY   r[   ra   s   @rJ   r   r   	  s    dW6rK   r   c                  :     e Zd ZdZd fdZd ZddZddZ xZS )r   z
    Batch Norm Contrastive Head using batch norm instead of l2-normalization.

    Args:
        embed_dims (int): Embed dimensions of text and image features.
    c                   t         |           t        j                  |      | _        t        j
                  t        j                  dg            | _        t        j
                  dt        j                  g       z        | _
        y)z
        Initialize BNContrastiveHead.

        Args:
            embed_dims (int): Embedding dimensions for features.
        re  g      N)r9   r:   r;   r  normrB   r?   rR  r7   r.  rg  )rG   
embed_dimsrI   s     rJ   r:   zBNContrastiveHead.__init__,  sY     	NN:.	LLug!67	<<uzz"~(=>rK   c                2    | ` | `| `| j                  | _        y)zCFuse the batch normalization layer in the BNContrastiveHead module.N)rq  r7   rg  forward_fuserU   )rG   s    rJ   fusezBNContrastiveHead.fuse:  s    II((rK   c                    |S )zPasses input out unchanged.r   rm  s      rJ   rt  zBNContrastiveHead.forward_fuseA  s    rK   c                    | j                  |      }t        j                  |dd      }t        j                  d||      }|| j
                  j                         z  | j                  z   S )z
        Forward function of contrastive learning with batch normalization.

        Args:
            x (torch.Tensor): Image features.
            w (torch.Tensor): Text features.

        Returns:
            (torch.Tensor): Similarity scores.
        r   rN   ri  rj  )rq  r   rk  r?   r2  rg  rl  r7   rm  s      rJ   rU   zBNContrastiveHead.forwardE  sY     IIaLKKrQ'LL)1a04##''))DII55rK   )rr  rW   rn  )	r\   r]   r^   r_   r:   ru  rt  rU   r`   ra   s   @rJ   r   r   $  s    ?)6rK   r   c                  >     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 d fdZ xZS )RepBottleneckzRep bottleneck.c                v    t         |   ||||||       t        ||z        }t        |||d   d      | _        y)aT  
        Initialize RepBottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            g (int): Groups for convolutions.
            k (tuple): Kernel sizes for convolutions.
            e (float): Expansion ratio.
        r   r   N)r9   r:   rW   r
   rf   r  s	           rJ   r:   zRepBottleneck.__init__Z  s?     	R1a3a[2r1Q4+rK   r  r  r   ra   s   @rJ   ry  ry  W  sG     lo,,,*.,:=,FU,ch, ,rK   ry  c                  &     e Zd ZdZdd fdZ xZS )RepCSPzXRepeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction.c                    t         |   ||||       t        ||z        t        j                  fdt        |      D         | _        y)aS  
        Initialize RepCSP layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of RepBottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c              3  >   K   | ]  }t        d         ywr  )ry  r   s     rJ   r   z"RepCSP.__init__.<locals>.<genexpr>~  s!      ]qr2xc!J!J ]r  Nr   r   s	       `` @rJ   r:   zRepCSP.__init__p  sF     	RHa3a[ ]TYZ[T\ ]^rK   r   r   r   ra   s   @rJ   r|  r|  m  s    b_ _rK   r|  c                  6     e Zd ZdZdd fdZddZddZ xZS )r#   z	CSP-ELAN.c           	     \   t         |           |dz  | _        t        ||dd      | _        t        j                  t        |dz  ||      t        ||dd            | _        t        j                  t        |||      t        ||dd            | _	        t        |d|z  z   |dd      | _
        y)a  
        Initialize CSP-ELAN layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            c4 (int): Intermediate channels for RepCSP.
            n (int): Number of RepCSP blocks.
        rN   r   rd   N)r9   r:   r   r   rf   r;   r   r|  ri   rj   r  )rG   rF   rl   r  c4r   rI   s         rJ   r:   zRepNCSPELAN4.__init__  s     	qB1%==aQ!7b"a9KL==B!2DRA4FGa"fr1a0rK   c                   t        | j                  |      j                  dd            j                  fd| j                  | j
                  fD               | j                  t        j                  d            S )z(Forward pass through RepNCSPELAN4 layer.rN   r   c              3  4   K   | ]  } |d            ywr   r   r   s     rJ   r   z'RepNCSPELAN4.forward.<locals>.<genexpr>  s     :!AbE(:r   )	r   rf   r   r   ri   rj   r  r?   r   r   s     @rJ   rU   zRepNCSPELAN4.forward  sZ    !""1a()	:dhh%9::xx		!Q((rK   c                .   t        | j                  |      j                  | j                  | j                  fd            j	                  fd| j
                  | j                  fD               | j                  t        j                  d            S )r   r   c              3  4   K   | ]  } |d            ywr   r   r   s     rJ   r   z-RepNCSPELAN4.forward_split.<locals>.<genexpr>  s     8a1R58r   )
r   rf   r   r   r   ri   rj   r  r?   r   r   s     @rJ   r   zRepNCSPELAN4.forward_split  sg    !""DFFDFF#3Q78	8DHHdhh#788xx		!Q((rK   r   )
rF   rW   rl   rW   r  rW   r  rW   r   rW   rX   r   ra   s   @rJ   r#   r#     s    1$))rK   r#   c                  $     e Zd ZdZd fdZ xZS )r$   z!ELAN1 module with 4 convolutions.c                    t         |   ||||       |dz  | _        t        ||dd      | _        t        |dz  |dd      | _        t        ||dd      | _        t        |d|z  z   |dd      | _        y)z
        Initialize ELAN1 layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            c4 (int): Intermediate channels for convolutions.
        rN   r   rd   N)r9   r:   r   r   rf   ri   rj   r  )rG   rF   rl   r  r  rI   s        rJ   r:   zELAN1.__init__  sw     	RR(qB1%aQ*B1%a"fr1a0rK   )rF   rW   rl   rW   r  rW   r  rW   r   ra   s   @rJ   r$   r$     s    +1 1rK   r$   c                  ,     e Zd ZdZd fdZddZ xZS )r&   zAConv.c                J    t         |           t        ||ddd      | _        y)z
        Initialize AConv module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
        rd   rN   r   N)r9   r:   r   rf   rG   rF   rl   rI   s      rJ   r:   zAConv.__init__  s$     	B1a(rK   c                    t         j                  j                  j                  |ddddd      }| j	                  |      S )z!Forward pass through AConv layer.rN   r   r   FT)r?   r;   
functional
avg_pool2drf   rn   s     rJ   rU   zAConv.forward  s4    HH**1aAudCxx{rK   rF   rW   rl   rW   rX   r[   ra   s   @rJ   r&   r&     s    	)rK   r&   c                  ,     e Zd ZdZd fdZddZ xZS )r%   zADown.c                    t         |           |dz  | _        t        |dz  | j                  ddd      | _        t        |dz  | j                  ddd      | _        y)z
        Initialize ADown module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
        rN   rd   r   r   N)r9   r:   r   r   rf   ri   r  s      rJ   r:   zADown.__init__  sS     	qaAq1aAq1rK   c                T   t         j                  j                  j                  |ddddd      }|j	                  dd      \  }}| j                  |      }t         j                  j                  j                  |ddd      }| j                  |      }t        j                  ||fd      S )z!Forward pass through ADown layer.rN   r   r   FTrd   )	r?   r;   r  r  r   rf   
max_pool2dri   r   )rG   rH   r   r   s       rJ   rU   zADown.forward  s    HH**1aAudCABXXb\XX  ++B1a8XXb\yy"b1%%rK   r  rX   r[   ra   s   @rJ   r%   r%     s    2&rK   r%   c                  .     e Zd ZdZdd fdZddZ xZS )r'   z	SPP-ELAN.c                B   t         |           || _        t        ||dd      | _        t        j                  |d|dz        | _        t        j                  |d|dz        | _        t        j                  |d|dz        | _	        t        d|z  |dd      | _
        y)z
        Initialize SPP-ELAN block.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            k (int): Kernel size for max pooling.
        r   rN   r   rM   N)r9   r:   r   r   rf   r;   r   ri   rj   r  cv5)rG   rF   rl   r  re   rI   s        rJ   r:   zSPPELAN.__init__  s     	B1%<<AaaH<<AaaH<<AaaHBAq)rK   c                    | j                  |      gj                  fd| j                  | j                  | j                  fD               | j                  t        j                  d            S )z#Forward pass through SPPELAN layer.c              3  4   K   | ]  } |d            ywr   r   r   s     rJ   r   z"SPPELAN.forward.<locals>.<genexpr>  s     Ba1R5Br   r   )rf   r   ri   rj   r  r  r?   r   r   s     @rJ   rU   zSPPELAN.forward  sP    XXa[M	BDHHdhh#ABBxx		!Q((rK   r   )rF   rW   rl   rW   r  rW   re   rW   rX   r[   ra   s   @rJ   r'   r'     s    *$)rK   r'   c                  .     e Zd ZdZdd fdZddZ xZS )r)   z	CBLinear.c           
         t         |           || _        t        j                  |t        |      ||t        ||      |d      | _        y)a  
        Initialize CBLinear module.

        Args:
            c1 (int): Input channels.
            c2s (list[int]): List of output channel sizes.
            k (int): Kernel size.
            s (int): Stride.
            p (int | None): Padding.
            g (int): Groups.
        T)groupsr7   N)r9   r:   c2sr;   r<   sumr   r>   )rG   rF   r  re   r   r  r   rI   s          rJ   r:   zCBLinear.__init__  s>     	IIb#c(Aq'!Q-PTU	rK   c                Z    | j                  |      j                  | j                  d      S )z$Forward pass through CBLinear layer.r   r   )r>   r   r  rn   s     rJ   rU   zCBLinear.forward  s$    yy|!!$((!22rK   )r   r   Nr   )rF   rW   r  	list[int]re   rW   r   rW   r  z
int | Noner   rW   )rH   rY   rZ   rb  r[   ra   s   @rJ   r)   r)     s    V 3rK   r)   c                  ,     e Zd ZdZd fdZddZ xZS )r(   zCBFuse.c                0    t         |           || _        y)zv
        Initialize CBFuse module.

        Args:
            idx (list[int]): Indices for feature selection.
        N)r9   r:   idx)rG   r  rI   s     rJ   r:   zCBFuse.__init__  s     	rK   c           	        |d   j                   dd }t        |dd       D cg c]-  \  }}t        j                  || j                  |      |d      / }}}t        j                  t        j                  ||dd z         d      S c c}}w )z
        Forward pass through CBFuse layer.

        Args:
            xs (list[torch.Tensor]): List of input tensors.

        Returns:
            (torch.Tensor): Fused output tensor.
        r   rN   Nnearest)sizemoder   r   )rO   	enumerater   interpolater  r?   r  stack)rG   xstarget_sizer   rH   ress         rJ   rU   zCBFuse.forward'  s     fll12&[deghkikel[mnSWSTVWq}}Qtxx{^+INnnyyS2bc7]3;; os   2B	)r  r  )r  rb  rZ   rY   r[   ra   s   @rJ   r(   r(     s    <rK   r(   c                  .     e Zd ZdZdd fdZddZ xZS )C3fr   c                   t         |           t        ||z        t        |dd      | _        t        |dd      | _        t        d|z   z  |d      | _        t        j                  fdt        |      D              | _
        y)an  
        Initialize CSP bottleneck layer with two convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        r   rN   c           	   3  @   K   | ]  }t        d d        ywr   r   r   s     rJ   r   zC3f.__init__.<locals>.<genexpr>J  s&     l^_z"b(AAQUXYYlr   N)r9   r:   rW   r   rf   ri   rj   r;   r   r   r   r   s	       `` @rJ   r:   zC3f.__init__9  sv     	a[B1%B1%Q"b!,lchijckllrK   c                    | j                  |      | j                  |      gj                  fd| j                  D               | j	                  t        j                  d            S )zForward pass through C3f layer.c              3  4   K   | ]  } |d            ywr   r   r   s     rJ   r   zC3f.forward.<locals>.<genexpr>O  r   r   r   )ri   rf   r   r   rj   r?   r   r   s     @rJ   rU   zC3f.forwardL  sL    XXa[$((1+&	*466**xx		!Q((rK   r   r   rX   r[   ra   s   @rJ   r  r  6  s    Fm&)rK   r  c                  B     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ xZS )r*   r   c                     t            ||||       t        j                   fdt	        |      D               _        y)aw  
        Initialize C3k2 module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of blocks.
            c3k (bool): Whether to use C3k blocks.
            e (float): Expansion ratio.
            g (int): Groups for convolutions.
            shortcut (bool): Whether to use shortcut connections.
        c              3     K   | ]K  }r#t        j                  j                  d       n!t        j                  j                         M yw)rN   N)C3kr   r   )r   rS   c3kr   rG   r   s     rJ   r   z C3k2.__init__.<locals>.<genexpr>f  sG      
hi3C8Q/JtvvtvvW_ab<cc
s   AANr9   r:   r;   r   r   r   )	rG   rF   rl   r   r  r   r   r   rI   s	   `   ` ``rJ   r:   zC3k2.__init__V  s?     	RHa3 
mrstmu
 
rK   )r   Fr   r   T)rF   rW   rl   rW   r   rW   r  r   r   rA   r   rW   r   r   r   ra   s   @rJ   r*   r*   S  sO    F mq


#&
15
BG
RU
ei
 
rK   r*   c                  &     e Zd ZdZdd fdZ xZS )r  zhC3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks.c                    t         	|   ||||       t        ||z        t        j                  fdt        |      D         | _        y)ap  
        Initialize C3k module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
            k (int): Kernel size.
        c           	   3  D   K   | ]  }t        fd         yw)r   r   Nr   )r   rS   rk   r   re   r   s     rJ   r   zC3k.__init__.<locals>.<genexpr>~  s(      dVWBHaAq6S!Q!Q dr   Nr   )
rG   rF   rl   r   r   r   r   re   rk   rI   s
       `` `@rJ   r:   zC3k.__init__n  sF     	RHa3a[ d[`ab[c derK   )r   Tr   r   rd   )rF   rW   rl   rW   r   rW   r   r   r   rW   r   rA   re   rW   r   ra   s   @rJ   r  r  k  s    rf frK   r  c                  b     e Zd ZdZd fdZddZddZ ej                         d        Z	 xZ
S )r-   zfRepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture.c           	         t         |           t        ||ddd|d      | _        t        ||ddd|d      | _        || _        t        j                         | _        y)zm
        Initialize RepVGGDW module.

        Args:
            ed (int): Input and output channels.
        r   r   rd   Fr   rt   N)	r9   r:   r   r>   conv1r   r;   r  rt   )rG   edrI   s     rJ   r:   zRepVGGDW.__init__  sT     	RAqBE:	"b!QRU;
779rK   c                f    | j                  | j                  |      | j                  |      z         S )z
        Perform a forward pass of the RepVGGDW block.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
        )rt   r>   r  rn   s     rJ   rU   zRepVGGDW.forward  s(     xx		!tzz!}455rK   c                B    | j                  | j                  |            S )a  
        Perform a forward pass of the RepVGGDW block without fusing the convolutions.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
        )rt   r>   rn   s     rJ   rt  zRepVGGDW.forward_fuse  s     xx		!%%rK   c                F   t        | j                  j                  | j                  j                        }t        | j                  j                  | j                  j                        }|j                  }|j
                  }|j                  }|j
                  }t        j                  j                  j                  |g d      }||z   }||z   }|j                  j                  j                  |       |j
                  j                  j                  |       || _        | `y)z
        Fuse the convolutional layers in the RepVGGDW block.

        This method fuses the convolutional layers and updates the weights and biases accordingly.
        )rN   rN   rN   rN   N)r   r>   r  r  rD   r7   r?   r;   r  r   rE   copy_)	rG   r>   r  conv_wconv_bconv1_wconv1_bfinal_conv_wfinal_conv_bs	            rJ   ru  zRepVGGDW.fuse  s      				= $**--@,,**((%%))'<@''|,		\*	JrK   )r  rW   rZ   NonerX   )r\   r]   r^   r_   r:   rU   rt  r?   no_gradru  r`   ra   s   @rJ   r-   r-     s1    p
6
& U]]_ rK   r-   c                  .     e Zd ZdZdd fdZddZ xZS )r.   a  
    Conditional Identity Block (CIB) module.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True.
        e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5.
        lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
    c                N   t         |           t        ||z        }t        j                  t        ||d|      t        |d|z  d      |rt        d|z        nt        d|z  d|z  dd|z        t        d|z  |d      t        ||d|            | _        |xr ||k(  | _        y)a!  
        Initialize the CIB module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            e (float): Expansion ratio.
            lk (bool): Whether to use RepVGGDW.
        rd   r  rN   r   N)	r9   r:   rW   r;   r   r   r-   rf   r   )rG   rF   rl   r   r   lkrk   rI   s          rJ   r:   zCIB.__init__  s     	a[==Rb!QVQ "HQVQVQVQ!b&(IRQRb!
 (brK   c                d    | j                   r|| j                  |      z   S | j                  |      S )z
        Forward pass of the CIB module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor.
        )r   rf   rn   s     rJ   rU   zCIB.forward  s)     #'((q488A;;;rK   )Tr   F)
rF   rW   rl   rW   r   r   r   rA   r  r   rX   r[   ra   s   @rJ   r.   r.     s    	).
<rK   r.   c                  B     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ xZS )r/   aQ  
    C2fCIB class represents a convolutional block with C2f and CIB modules.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        n (int, optional): Number of CIB modules to stack. Defaults to 1.
        shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
        lk (bool, optional): Whether to use local key connection. Defaults to False.
        g (int, optional): Number of groups for grouped convolution. Defaults to 1.
        e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
    c                     t            |||||       t        j                   fdt	        |      D               _        y)a  
        Initialize C2fCIB module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of CIB modules.
            shortcut (bool): Whether to use shortcut connection.
            lk (bool): Whether to use local key connection.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c              3  f   K   | ](  }t        j                  j                  d        * yw)r   )r   r  N)r.   r   )r   rS   r  rG   r   s     rJ   r   z"C2fCIB.__init__.<locals>.<genexpr>  s)     ]qs4664668srJJ]s   .1Nr  )	rG   rF   rl   r   r   r  r   r   rI   s	   `   ``  rJ   r:   zC2fCIB.__init__  s9     	RHa3]TYZ[T\]]rK   )r   FFr   r   )rF   rW   rl   rW   r   rW   r   r   r  r   r   rW   r   rA   r   ra   s   @rJ   r/   r/     sZ     nq^^^#&^6:^HL^Y\^ej^ ^rK   r/   c                  .     e Zd ZdZdd fdZddZ xZS )r0   a  
    Attention module that performs self-attention on the input tensor.

    Args:
        dim (int): The input tensor dimension.
        num_heads (int): The number of attention heads.
        attn_ratio (float): The ratio of the attention key dimension to the head dimension.

    Attributes:
        num_heads (int): The number of attention heads.
        head_dim (int): The dimension of each attention head.
        key_dim (int): The dimension of the attention key.
        scale (float): The scaling factor for the attention scores.
        qkv (Conv): Convolutional layer for computing the query, key, and value.
        proj (Conv): Convolutional layer for projecting the attended values.
        pe (Conv): Convolutional layer for positional encoding.
    c                P   t         |           || _        ||z  | _        t	        | j                  |z        | _        | j
                  dz  | _        | j
                  |z  }||dz  z   }t        ||dd      | _        t        ||dd      | _	        t        ||dd|d      | _
        y)	z
        Initialize multi-head attention module.

        Args:
            dim (int): Input dimension.
            num_heads (int): Number of attention heads.
            attn_ratio (float): Attention ratio for key dimension.
              rN   r   Frs   rd   r  N)r9   r:   	num_headshead_dimrW   key_dimr/  r   qkvrQ  pe)rG   r   r  
attn_rationh_kdr8  rI   s         rJ   r:   zAttention.__init__(  s     	"y(4==:56\\4'
y(%!)OQu-c1%0	sCA%8rK   c           	     P   |j                   \  }}}}||z  }| j                  |      }|j                  || j                  | j                  dz  | j
                  z   |      j                  | j                  | j                  | j
                  gd      \  }}	}
|j                  dd      |	z  | j                  z  }|j                  d      }|
|j                  dd      z  j                  ||||      | j                  |
j                  ||||            z   }| j                  |      }|S )z
        Forward pass of the Attention module.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            (torch.Tensor): The output tensor after self-attention.
        rN   r   r   )rO   r  rC   r  r  r  r   rP   r/  rQ   r  r\  rQ  )rG   rH   BCHWNr  r_  re   r`  rB  s               rJ   rU   zAttention.forward<  s    WW
1aEhhqk((1dnndllQ.>.NPQRXX\\4<<7Q Y 
1a B#a'4::5|||#B''--aAq9DGGAIIaQRTUWXDY<ZZIIaLrK   )ra  r   )r   rW   r  rW   r  rA   rX   r[   ra   s   @rJ   r0   r0     s    $9(rK   r0   c                  .     e Zd ZdZdd fdZddZ xZS )PSABlockaK  
    PSABlock class implementing a Position-Sensitive Attention block for neural networks.

    This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
    with optional shortcut connections.

    Attributes:
        attn (Attention): Multi-head attention module.
        ffn (nn.Sequential): Feed-forward neural network module.
        add (bool): Flag indicating whether to add shortcut connections.

    Methods:
        forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers.

    Examples:
        Create a PSABlock and perform a forward pass
        >>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
        >>> input_tensor = torch.randn(1, 128, 32, 32)
        >>> output_tensor = psablock(input_tensor)
    c           	         t         |           t        |||      | _        t	        j
                  t        ||dz  d      t        |dz  |dd            | _        || _        y)a&  
        Initialize the PSABlock.

        Args:
            c (int): Input and output channels.
            attn_ratio (float): Attention ratio for key dimension.
            num_heads (int): Number of attention heads.
            shortcut (bool): Whether to use shortcut connections.
        r  r  rN   r   Frs   N)	r9   r:   r0   rB  r;   r   r   ffnr   )rG   r   r  r  r   rI   s        rJ   r:   zPSABlock.__init__j  sU     	aJ)L	==aQ!2DQ1%4PQrK   c                    | j                   r|| j                  |      z   n| j                  |      }| j                   r|| j                  |      z   }|S | j                  |      }|S )z
        Execute a forward pass through PSABlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after attention and feed-forward processing.
        )r   rB  r  rn   s     rJ   rU   zPSABlock.forwardz  sV     !%A		!diil#xxAO .2XXa[rK   )r   rM   T)
r   rW   r  rA   r  rW   r   r   rZ   r  rX   r[   ra   s   @rJ   r  r  T  s    * rK   r  c                  .     e Zd ZdZdd fdZddZ xZS )r1   a  
    PSA class for implementing Position-Sensitive Attention in neural networks.

    This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
    input tensors, enhancing feature extraction and processing capabilities.

    Attributes:
        c (int): Number of hidden channels after applying the initial convolution.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        attn (Attention): Attention module for position-sensitive attention.
        ffn (nn.Sequential): Feed-forward network for further processing.

    Methods:
        forward: Applies position-sensitive attention and feed-forward network to the input tensor.

    Examples:
        Create a PSA module and apply it to an input tensor
        >>> psa = PSA(c1=128, c2=128, e=0.5)
        >>> input_tensor = torch.randn(1, 128, 64, 64)
        >>> output_tensor = psa.forward(input_tensor)
    c           	        t         |           ||k(  sJ t        ||z        | _        t	        |d| j                  z  dd      | _        t	        d| j                  z  |d      | _        t        | j                  d| j                  dz        | _        t        j                  t	        | j                  | j                  dz  d      t	        | j                  dz  | j                  dd            | _        y)	z
        Initialize PSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            e (float): Expansion ratio.
        rN   r   r   @   r  Frs   N)r9   r:   rW   r   r   rf   ri   r0   rB  r;   r   r  )rG   rF   rl   r   rI   s       rJ   r:   zPSA.__init__  s     	RxxR!VAJ1-DFF
B*dff"M	==dffdffqj!!<d466A:tvvWX^c>derK   c                   | j                  |      j                  | j                  | j                  fd      \  }}|| j                  |      z   }|| j	                  |      z   }| j                  t        j                  ||fd            S )z
        Execute forward pass in PSA module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after attention and feed-forward processing.
        r   r   )rf   r   r   rB  r  ri   r?   r   r   s       rJ   rU   zPSA.forward  ss     xx{  $&&$&&!1q 91		!Oxx		1a&!,--rK   )r   )rF   rW   rl   rW   r   rA   rX   r[   ra   s   @rJ   r1   r1     s    .f$.rK   r1   c                  .     e Zd ZdZdd fdZddZ xZS )r,   aL  
    C2PSA module with attention mechanism for enhanced feature extraction and processing.

    This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
    capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.

    Attributes:
        c (int): Number of hidden channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations.

    Methods:
        forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.

    Notes:
        This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.

    Examples:
        >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
        >>> input_tensor = torch.randn(1, 256, 64, 64)
        >>> output_tensor = c2psa(input_tensor)
    c                (    t                    ||k(  sJ t        ||z         _        t	        |d j                  z  dd       _        t	        d j                  z  |d       _        t        j                   fdt        |      D          _
        y)z
        Initialize C2PSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of PSABlock modules.
            e (float): Expansion ratio.
        rN   r   c              3  h   K   | ])  }t        j                  d j                  dz         + ywr   r  r  Nr  r   r   rS   rG   s     rJ   r   z!C2PSA.__init__.<locals>.<genexpr>  s+      l^_$&&SDFFVXL!Y!Y lr   Nr   rG   rF   rl   r   r   rI   s   `    rJ   r:   zC2PSA.__init__  sy     	RxxR!VAJ1-DFF
B* lchijck lmrK   c                    | j                  |      j                  | j                  | j                  fd      \  }}| j                  |      }| j	                  t        j                  ||fd            S )z
        Process the input tensor through a series of PSA blocks.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after processing.
        r   r   )rf   r   r   r   ri   r?   r   r   s       rJ   rU   zC2PSA.forward  s]     xx{  $&&$&&!1q 91FF1Ixx		1a&!,--rK   r   r   r   rX   r[   ra   s   @rJ   r,   r,     s    0n$.rK   r,   c                  &     e Zd ZdZdd fdZ xZS )r+   a  
    C2fPSA module with enhanced feature extraction using PSA blocks.

    This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction.

    Attributes:
        c (int): Number of hidden channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        m (nn.ModuleList): List of PSA blocks for feature extraction.

    Methods:
        forward: Performs a forward pass through the C2fPSA module.
        forward_split: Performs a forward pass using split() instead of chunk().

    Examples:
        >>> import torch
        >>> from ultralytics.models.common import C2fPSA
        >>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5)
        >>> x = torch.randn(1, 64, 128, 128)
        >>> output = model(x)
        >>> print(output.shape)
    c                     ||k(  sJ t            ||||       t        j                   fdt	        |      D               _        y)z
        Initialize C2fPSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of PSABlock modules.
            e (float): Expansion ratio.
        )r   r   c              3  h   K   | ])  }t        j                  d j                  dz         + ywr  r  r  s     rJ   r   z"C2fPSA.__init__.<locals>.<genexpr>"  s+     j\]x3$&&TV,WWjr   Nr  r  s   `    rJ   r:   zC2fPSA.__init__  sC     RxxR1*jafghaijjrK   r  r   r   ra   s   @rJ   r+   r+     s    0k krK   r+   c                  ,     e Zd ZdZd fdZddZ xZS )r2   a<  
    SCDown module for downsampling with separable convolutions.

    This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
    efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.

    Attributes:
        cv1 (Conv): Pointwise convolution layer that reduces the number of channels.
        cv2 (Conv): Depthwise convolution layer that performs spatial downsampling.

    Methods:
        forward: Applies the SCDown module to the input tensor.

    Examples:
        >>> import torch
        >>> from ultralytics import SCDown
        >>> model = SCDown(c1=64, c2=128, k=3, s=2)
        >>> x = torch.randn(1, 64, 128, 128)
        >>> y = model(x)
        >>> print(y.shape)
        torch.Size([1, 128, 64, 64])
    c                t    t         |           t        ||dd      | _        t        |||||d      | _        y)z
        Initialize SCDown module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            s (int): Stride.
        r   F)re   r   r   rt   N)r9   r:   r   rf   ri   )rG   rF   rl   re   r   rI   s        rJ   r:   zSCDown.__init__=  s8     	B1%B!qBE:rK   c                B    | j                  | j                  |            S )z
        Apply convolution and downsampling to the input tensor.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Downsampled output tensor.
        )ri   rf   rn   s     rJ   rU   zSCDown.forwardK  s     xx$$rK   r   rX   r[   ra   s   @rJ   r2   r2   %  s    .;
%rK   r2   c                  B     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 d fdZddZ xZS )r3   aZ  
    TorchVision module to allow loading any torchvision model.

    This class provides a way to load a model from the torchvision library, optionally load pre-trained weights, and customize the model by truncating or unwrapping layers.

    Attributes:
        m (nn.Module): The loaded torchvision model, possibly truncated and unwrapped.

    Args:
        model (str): Name of the torchvision model to load.
        weights (str, optional): Pre-trained weights to load. Default is "DEFAULT".
        unwrap (bool, optional): If True, unwraps the model to a sequential containing all but the last `truncate` layers. Default is True.
        truncate (int, optional): Number of layers to truncate from the end if `unwrap` is True. Default is 2.
        split (bool, optional): Returns output from intermediate child modules as list. Default is False.
    c                   ddl }t        | 	          t        |j                  d      r#|j                  j                  ||      | _        n. |j                  j                  |   t        |            | _        |rt        | j                  j                               }t        |d   t        j                        r#g t        |d   j                               |dd }t        j                  |r|d|  n| | _        || _        yd| _        t        j                         x| j                  _        | j                  _        y)an  
        Load the model and weights from torchvision.

        Args:
            model (str): Name of the torchvision model to load.
            weights (str): Pre-trained weights to load.
            unwrap (bool): Whether to unwrap the model.
            truncate (int): Number of layers to truncate.
            split (bool): Whether to split the output.
        r   N	get_model)weights)
pretrainedr   F)torchvisionr9   r:   hasattrmodelsr  r   __dict__r   r   children
isinstancer;   r   r   r   headheads)	rG   modelr   unwraptruncater   r  layersrI   s	           rJ   r:   zTorchVision.__init__i  s     	;%%{3 ''11%1IDF7[''0074=QDF$&&//+,F&)R]]3C4q	 2 2 45Cqr
C]]8VJhY%7QDFDJDJ)+6DFFK$&&,rK   c                    | j                   r)|gj                  fd| j                  D               S | j                  |      S )z
        Forward pass through the model.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor | list[torch.Tensor]): Output tensor or list of tensors.
        c              3  4   K   | ]  } |d            ywr   r   r   s     rJ   r   z&TorchVision.forward.<locals>.<genexpr>  s     .!QquX.r   )r   r   r   r   s     @rJ   rU   zTorchVision.forward  sD     ::AHH.tvv..  q	ArK   )DEFAULTTrN   F)
r
  strr   r  r  r   r  rW   r   r   rX   r[   ra   s   @rJ   r3   r3   X  sA    " kp77#&7<@7SV7cg7<rK   r3   c                  .     e Zd ZdZdd fdZddZ xZS )AAttna  
    Area-attention module for YOLO models, providing efficient attention mechanisms.

    This module implements an area-based attention mechanism that processes input features in a spatially-aware manner,
    making it particularly effective for object detection tasks.

    Attributes:
        area (int): Number of areas the feature map is divided.
        num_heads (int): Number of heads into which the attention mechanism is divided.
        head_dim (int): Dimension of each attention head.
        qkv (Conv): Convolution layer for computing query, key and value tensors.
        proj (Conv): Projection convolution layer.
        pe (Conv): Position encoding convolution layer.

    Methods:
        forward: Applies area-attention to input tensor.

    Examples:
        >>> attn = AAttn(dim=256, num_heads=8, area=4)
        >>> x = torch.randn(1, 256, 32, 32)
        >>> output = attn(x)
        >>> print(output.shape)
        torch.Size([1, 256, 32, 32])
    c           	         t         |           || _        || _        ||z  x| _        }|| j                  z  }t        ||dz  dd      | _        t        ||dd      | _        t        ||ddd|d      | _        y)a'  
        Initialize an Area-attention module for YOLO models.

        Args:
            dim (int): Number of hidden channels.
            num_heads (int): Number of heads into which the attention mechanism is divided.
            area (int): Number of areas the feature map is divided.
        rd   r   Frs   r   r  N)	r9   r:   arear  r  r   r  rQ  r  )rG   r   r  r  r  all_head_dimrI   s         rJ   r:   zAAttn.__init__  s~     		"#&)#33$..0\A-qe<sA59	|S!QSeDrK   c                   |j                   \  }}}}||z  }| j                  |      j                  d      j                  dd      }| j                  dkD  r@|j                  || j                  z  || j                  z  |dz        }|j                   \  }}}|j                  ||| j                  | j                  dz        j                  dddd      j                  | j                  | j                  | j                  gd      \  }	}
}|	j                  dd      |
z  | j                  dz  z  }|j                  d      }||j                  dd      z  }|j                  dddd      }|j                  dddd      }| j                  dkD  rj|j                  || j                  z  || j                  z  |      }|j                  || j                  z  || j                  z  |      }|j                   \  }}}|j                  ||||      j                  dddd      j                         }|j                  ||||      j                  dddd      j                         }|| j                  |      z   }| j                  |      S )	z
        Process the input tensor through the area-attention.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after area-attention.
        rN   r   rd   r   r   r  r   r  )rO   r  flattenrP   r  r\  rC   r  r  permuter   rQ   
contiguousr  rQ  )rG   rH   r  r  r  r  r  r  rS   r_  re   r`  rB  s                rJ   rU   zAAttn.forward  s)    WW
1aEhhqk!!!$..q!499q=++a$))mQ$))^QUCCiiGAq!HHQ4>>4==1+<=WQ1a UDMM4==$--@aUH 	1a
 B#a'DMM4,?@|||#r2&&IIaAq!IIaAq!99q=		!tyy.!dii-;A		!tyy.!dii-;AggGAq!IIaAq!))!Q15@@BIIaAq!))!Q15@@B
Nyy|rK   r   )r   rW   r  rW   r  rW   rX   r[   ra   s   @rJ   r  r    s    2E(%rK   r  c                  6     e Zd ZdZdd fdZddZddZ xZS )	ABlocka  
    Area-attention block module for efficient feature extraction in YOLO models.

    This module implements an area-attention mechanism combined with a feed-forward network for processing feature maps.
    It uses a novel area-based attention approach that is more efficient than traditional self-attention while
    maintaining effectiveness.

    Attributes:
        attn (AAttn): Area-attention module for processing spatial features.
        mlp (nn.Sequential): Multi-layer perceptron for feature transformation.

    Methods:
        _init_weights: Initializes module weights using truncated normal distribution.
        forward: Applies area-attention and feed-forward processing to input tensor.

    Examples:
        >>> block = ABlock(dim=256, num_heads=8, mlp_ratio=1.2, area=1)
        >>> x = torch.randn(1, 256, 32, 32)
        >>> output = block(x)
        >>> print(output.shape)
        torch.Size([1, 256, 32, 32])
    c           	         t         |           t        |||      | _        t	        ||z        }t        j                  t        ||d      t        ||dd            | _        | j                  | j                         y)ae  
        Initialize an Area-attention block module.

        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of heads into which the attention mechanism is divided.
            mlp_ratio (float): Expansion ratio for MLP hidden dimension.
            area (int): Number of areas the feature map is divided.
        )r  r  r   Frs   N)r9   r:   r  rB  rW   r;   r   r   mlpapply_init_weights)rG   r   r  	mlp_ratior  mlp_hidden_dimrI   s         rJ   r:   zABlock.__init__  si     	#>	S9_-==c>1!=tNTWYZ`e?fg

4%%&rK   c                    t        |t        j                        rct        j                  j	                  |j
                  d       |j                  +t        j                  j                  |j                  d       yyy)z
        Initialize weights using a truncated normal distribution.

        Args:
            m (nn.Module): Module to initialize.
        g{Gz?)stdNr   )r  r;   r<   inittrunc_normal_rD   r7   	constant_)rG   r   s     rJ   r   zABlock._init_weights  sY     a#GG!!!((!5vv!!!!&&!, " $rK   c                R    || j                  |      z   }|| j                  |      z   S )z
        Forward pass through ABlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after area-attention and feed-forward processing.
        )rB  r  rn   s     rJ   rU   zABlock.forward%  s(     		!488A;rK   )g333333?r   )r   rW   r  rW   r!  rA   r  rW   )r   r   rX   )r\   r]   r^   r_   r:   r   rU   r`   ra   s   @rJ   r  r    s    .'$
-rK   r  c                  d     e Zd ZdZ	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZ xZS )A2C2fa  
    Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.

    This module extends the C2f architecture by incorporating area-attention and ABlock layers for improved feature
    processing. It supports both area-attention and standard convolution modes.

    Attributes:
        cv1 (Conv): Initial 1x1 convolution layer that reduces input channels to hidden channels.
        cv2 (Conv): Final 1x1 convolution layer that processes concatenated features.
        gamma (nn.Parameter | None): Learnable parameter for residual scaling when using area attention.
        m (nn.ModuleList): List of either ABlock or C3k modules for feature processing.

    Methods:
        forward: Processes input through area-attention or standard convolution pathway.

    Examples:
        >>> m = A2C2f(512, 512, n=1, a2=True, area=1)
        >>> x = torch.randn(1, 512, 32, 32)
        >>> output = m(x)
        >>> print(output.shape)
        torch.Size([1, 512, 32, 32])
    c                  	
 t         |           t        ||z        dz  dk(  sJ d       t        |dd      | _        t        d|z   z  |d      | _        r/|r-t        j                  dt        j                  |      z  d      nd| _
        t        j                  	
fd	t        |      D              | _        y)
a  
        Initialize Area-Attention C2f module.

        Args:
            c1 (int): Number of input channels.
            c2 (int): Number of output channels.
            n (int): Number of ABlock or C3k modules to stack.
            a2 (bool): Whether to use area attention blocks. If False, uses C3k blocks instead.
            area (int): Number of areas the feature map is divided.
            residual (bool): Whether to use residual connections with learnable gamma parameter.
            mlp_ratio (float): Expansion ratio for MLP hidden dimension.
            e (float): Channel expansion ratio for hidden channels.
            g (int): Number of groups for grouped convolutions.
            shortcut (bool): Whether to use shortcut connections in C3k blocks.
        rp   r   z(Dimension of ABlock be a multiple of 32.r   g{Gz?TrK  Nc              3     K   | ];  }r&t        j                  fd t        d      D         nt        d       = yw)c              3  @   K   | ]  }t        d z          yw)rp   N)r  )r   rS   r  rk   r!  s     rJ   r   z+A2C2f.__init__.<locals>.<genexpr>.<genexpr>p  s      TaF2rRxDATr   rN   N)r;   r   r   r  )r   rS   a2r  rk   r   r!  r   s     rJ   r   z!A2C2f.__init__.<locals>.<genexpr>o  sI      
   MMT5QR8TURQ!,-
s   AA)r9   r:   rW   r   rf   ri   r;   rB   r?   r.  gammar   r   r   )rG   rF   rl   r   r.  r  residualr!  r   r   r   rk   rI   s       `` ` ``@rJ   r:   zA2C2f.__init__K  s    8 	a[Bw!|GGG|B1%Q"b!,PRW_R\\$B"7tLei
 
 
 1X	
 
rK   c                L   | j                  |      gj                  fd| j                  D               | j                  t	        j
                  d            | j                  ;|| j                  j                  d| j                  j                  d   dd      z  z   S S )z
        Forward pass through A2C2f layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after processing.
        c              3  4   K   | ]  } |d            ywr   r   r   s     rJ   r   z A2C2f.forward.<locals>.<genexpr>  r   r   r   r   r   )	rf   r   r   ri   r?   r   r/  rC   rO   r   s     @rJ   rU   zA2C2f.forwardv  s     XXa[M	*466**HHUYYq!_%::!tzzr4::+;+;A+>1EIIIrK   )r   Tr   Fg       @r   r   T)rF   rW   rl   rW   r   rW   r.  r   r  rW   r0  r   r!  rA   r   rA   r   rW   r   r   rX   r[   ra   s   @rJ   r*  r*  3  s    6 )
)
 )
 	)

 )
 )
 )
 )
 )
 )
 )
VrK   r*  c                  .     e Zd ZdZdd fdZddZ xZS )	SwiGLUFFNz@SwiGLU Feed-Forward Network for transformer-based architectures.c                    t         |           t        j                  |||z        | _        t        j                  ||z  dz  |      | _        y)z
        Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor.

        Args:
            gc (int): Guide channels.
            ec (int): Embedding channels.
            e (int): Expansion factor.
        rN   N)r9   r:   r;   r*  w12w3)rG   r0  r   r   rI   s       rJ   r:   zSwiGLUFFN.__init__  s@     	99RR())AFaK,rK   c                    | j                  |      }|j                  dd      \  }}t        j                  |      |z  }| j	                  |      S )z.Apply SwiGLU transformation to input features.rN   r   r   )r6  r   r   silur7  )rG   rH   x12r   r   hiddens         rJ   rU   zSwiGLUFFN.forward  sD    hhqk1"%BbwwvrK   )rM   )r0  rW   r   rW   r   rW   rZ   r  rX   r[   ra   s   @rJ   r4  r4    s    J-rK   r4  c                  ,     e Zd ZdZd fdZddZ xZS )Residualz7Residual connection wrapper for neural network modules.c                $   t         |           || _        t        j                  j                  | j                  j                  j                         t        j                  j                  | j                  j                  j                         y)z
        Initialize residual module with the wrapped module.

        Args:
            m (nn.Module): Module to wrap with residual connection.
        N)	r9   r:   r   r;   r%  zeros_r7  r7   rD   )rG   r   rI   s     rJ   r:   zResidual.__init__  sS     	
tvvyy~~& 	tvvyy''(rK   c                *    || j                  |      z   S )z,Apply residual connection to input features.r   rn   s     rJ   rU   zResidual.forward  s    466!9}rK   )r   r   rZ   r  rX   r[   ra   s   @rJ   r=  r=    s    A)rK   r=  c                  ,     e Zd ZdZd fdZddZ xZS )SAVPEzESpatial-Aware Visual Prompt Embedding module for feature enhancement.c           	        t         |           t        j                  fdt	        |      D              | _        t        j                  fdt	        |      D              | _        d| _        t        j                  dz  |d      | _	        t        j                  dz  | j                  dd      | _
        t        j                  d| j                  dd      | _        t        j                  t        d| j                  z  | j                  d      t        j                  | j                  | j                  dd            | _        y)	a  
        Initialize SAVPE module with channels, intermediate channels, and embedding dimension.

        Args:
            ch (list[int]): List of input channel dimensions.
            c3 (int): Intermediate channels.
            embed (int): Embedding dimension.
        c           	   3     K   | ]c  \  }}t        j                  t        |d       t        d       |dv rt        j                  |dz        nt        j                                e yw)rd      r   rN   rN   scale_factorNr;   r   r   Upsampler   r   r   rH   r  s      rJ   r   z!SAVPE.__init__.<locals>.<genexpr>  sa      !
 1 MMQARQTUY_T_!a%1Pegepeper!
s   A)A,c              3     K   | ]W  \  }}t        j                  t        |d       |dv rt        j                  |dz        nt        j                                Y yw)r   rE  rN   rF  NrH  rJ  s      rJ   r   z!SAVPE.__init__.<locals>.<genexpr>  sP      !
1 MM$q"a.QRX["++1q5*I^`^i^i^kl!
s   AA rV   rd   r   )rw   rN   N)r9   r:   r;   r   r  rf   ri   r   r<   rj   r  r  r   r   cv6)rG   rW  r  r:  rI   s     ` rJ   r:   zSAVPE.__init__  s     	== !
 ""	!
 
 == !
!"!
 

 99QVUA.99QVTVVQ:99Q15==a$&&j$&&!!<biiPTPVPVXYcd>efrK   c                   t        |      D cg c]  \  }} | j                  |   |       }}}| j                  t        j                  |d            }t        |      D cg c]  \  }} | j
                  |   |       }}}| j                  t        j                  |d            }|j                  \  }}}}	|j                  d   }
|j                  ||d      }|j                  |d| j                  ||	      j                  d|
ddd      j                  ||
z  | j                  ||	      }|j                  ||
d||	      j                  ||
z  d||	      }| j                  t        j                  || j                  |      fd            }|j                  ||
| j                  d      }|j                  ||
dd      }||z  t        j                  |      t        j                  |j                         j"                  z  z   }t%        j&                  |d      j)                  |j                         }|j+                  dd      |j                  || j                  || j                  z  d      j+                  dd      z  }t%        j,                  |j+                  dd      j                  ||
d      dd      S c c}}w c c}}w )zJProcess input features and visual prompts to generate enhanced embeddings.r   r   r   r  rN   ri  )r  ri   r  r?   r   rf   rj   rO   rC   r\  r   expandrL  r  logical_notfinfor8   minr   rQ   torP   rk  )rG   rH   vpr   xir   r  r  r  r  Qscore
aggregateds                rJ   rU   zSAVPE.forward  sI   *3A,7B[TXXa[_77HHUYYqa()*3A,7B[TXXa[_77HHUYYqa()WW
1aHHQKFF1aIIaDFFAq)00QBCKKAPQESWSYSY[\^_`ZZ1aA&..q1uaA>HHUYY488B<0a89IIaDFFB'ZZ1a$B**2.QWW1E1I1III		%R(++AGG4__R,qyyDFFAKQS/T/^/^_ace/ff
{{://B7??1bIrUVWW1 8 8s   K %K)rW  r  r  rW   r:  rW   )rH   rb  rT  rY   rZ   rY   r[   ra   s   @rJ   rB  rB    s    Og8XrK   rB  )Jr_   
__future__r   r?   torch.nnr;   torch.nn.functionalr  r   ultralytics.utils.torch_utilsr   r>   r   r   r   r	   r
   r   transformerr   __all__Moduler   r    r   r   r   r   r   r   r   r   r   r!   r   r   r   r   r   r  r"   r&  r   r   r   r   ry  r|  r#   r$   r&   r%   r'   r)   r(   r  r*   r  r-   r.   r/   r0   r  r1   r,   r+   r2   r3   r  r  r*  r4  r=  rB  r   rK   rJ   <module>r`     s    "     : F F )(V\")) \6>BII >.#RYY #L+(bii +(\D")) D0)299 )8 ,6 68 )"))  )FJ J6x" x(;BII ;202 0(Mb M(/bii /:P P8CBII C@J")) J2")) @3$")) 3$lB)bii B)J@%ryy @%F6bii 6606		 06f,J ,,_R _()299 )D1L 1*BII (&BII &4)bii )83ryy 30<RYY <8)")) ):
3 
0f" f,@uxx @F-<")) -<`^S ^B<		 <~2ryy 2j7.")) 7.t7.BII 7.t%kS %kP0%RYY 0%f>")) >BSBII SlARYY AHRBII Rj		 0ryy ,9XBII 9XrK   