
    rh              	          d Z ddlZddlZddlZddlmZ ddlmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZmZmZ ddlmZ ddl m!Z!  ejD                  e#      Z$e ed       G d de                    Z%e ed       G d de                    Z&e ed       G d de                    Z'e ed       G d de                    Z(d Z)d Z* G d d ejV                        Z, G d! d"ejV                        Z- G d# d$ejV                        Z.dJd%e
j^                  d&e0d'e1d(e
j^                  fd)Z2 G d* d+ejV                        Z3 G d, d-ejV                        Z4 G d. d/ejV                        Z5 G d0 d1ejV                        Z6 G d2 d3ejV                        Z7 G d4 d5ejV                        Z8 G d6 d7ejV                        Z9 G d8 d9e      Z: G d: d;ejV                        Z;e G d< d=e             Z<e G d> d?e<             Z= ed@       G dA dBe<             Z> edC       G dD dEe<             Z? edF       G dG dHe<e             Z@g dIZAy)KzPyTorch Swin Transformer model.    N)	dataclass)OptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BackboneOutput)PreTrainedModel) find_pruneable_heads_and_indicesmeshgridprune_linear_layer)ModelOutputauto_docstringlogging	torch_int)BackboneMixin   )
SwinConfigzN
    Swin encoder's outputs, with potential hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZeej                     ed<   dZ	ee
ej                  df      ed<   dZee
ej                  df      ed<   dZee
ej                  df      ed<   y)SwinEncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tupler   r        y/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/swin/modeling_swin.pyr   r   +   s}     6:x 1 129=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr&   r   zV
    Swin model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	SwinModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_output.r   r   r   )r   r   r   r    r   r   r!   r"   r#   r*   r   r$   r   r   r%   r&   r'   r)   r)   A   s    	 6:x 1 12915M8E--.5=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr&   r)   z*
    Swin masked image model outputs.
    c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   ed	        Zy)
SwinMaskedImageModelingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlossreconstruction.r   r   r   c                 N    t        j                  dt               | j                  S )Nzlogits attribute is deprecated and will be removed in version 5 of Transformers. Please use the reconstruction attribute to retrieve the final output instead.)warningswarnFutureWarningr.   selfs    r'   logitsz$SwinMaskedImageModelingOutput.logitst   s%    ]	

 """r&   )r   r   r   r    r-   r   r!   r"   r#   r.   r   r$   r   r   propertyr5   r%   r&   r'   r,   r,   Z   s     )-D(5$$
%,26NHU../6=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJ# #r&   r,   z0
    Swin outputs for image classification.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	SwinImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr-   r5   .r   r   r   )r   r   r   r    r-   r   r!   r"   r#   r5   r   r$   r   r   r%   r&   r'   r8   r8   ~   s     )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr&   r8   c                     | j                   \  }}}}| j                  |||z  |||z  ||      } | j                  dddddd      j                         j                  d|||      }|S )z2
    Partitions the given input into windows.
    r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowss          r'   window_partitionrJ      s}     /<.A.A+J|!&&Fk);8Lk[gM ##Aq!Q15@@BGGKYdfrsGNr&   c                     | j                   d   }| j                  d||z  ||z  |||      } | j                  dddddd      j                         j                  d|||      } | S )z?
    Merges windows to produce higher resolution features.
    r=   r   r   r   r:   r;   r<   r>   )rI   rD   rF   rG   rH   s        r'   window_reverserL      sn     ==$Lll2v4e{6JKYdfrsGooaAq!Q/::<AA"feUabGNr&   c            
            e Zd ZdZd fd	Zdej                  dededej                  fdZ	 	 dde	ej                     d	e	ej                     d
edeej                     fdZ xZS )SwinEmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    c                 ~   t         |           t        |      | _        | j                  j                  }| j                  j
                  | _        |r4t        j                  t        j                  dd|j                              nd | _        |j                  r=t        j                  t        j                  d|dz   |j                              | _        nd | _        t        j                  |j                        | _        t        j"                  |j$                        | _        |j(                  | _        || _        y )Nr   )super__init__SwinPatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr!   zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)r4   rc   use_mask_tokenrT   	__class__s       r'   rQ   zSwinEmbeddings.__init__   s     3F ;++77//99O]",,u{{1a9I9I'JKcg))')||EKK;QR?TZTdTd4e'fD$'+D$LL!1!12	zz&"<"<= ++r&   
embeddingsrF   rG   returnc                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  z  }	|| j
                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr=         ?r   r   r:   bicubicF)sizemodealign_cornersdim)r?   r\   r!   jit
is_tracingrb   r   reshaperA   r   
functionalinterpolater@   cat)r4   rf   rF   rG   rT   num_positionsclass_pos_embedpatch_pos_embedro   
new_height	new_widthsqrt_num_positionss               r'   interpolate_pos_encodingz'SwinEmbeddings.interpolate_pos_encoding   s`    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr&   pixel_valuesbool_masked_posr|   c                    |j                   \  }}}}| j                  |      \  }}	| j                  |      }|j                         \  }
}}|K| j                  j                  |
|d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  (|r|| j                  |||      z   }n|| j                  z   }| j                  |      }||	fS )Nr=         ?)r?   rS   r^   rk   rZ   expand	unsqueezetype_asr\   r|   ra   )r4   r}   r~   r|   _rH   rF   rG   rf   output_dimensionsrE   seq_lenmask_tokensmasks                 r'   forwardzSwinEmbeddings.forward   s     *6););&<(,(=(=l(K%
%YYz*
!+!2
GQ&//00WbIK",,R088ED#sTz2[45GGJ##/''$*G*G
TZ\a*bb
'$*B*BB
\\*-
,,,r&   )F)NF)r   r   r   r    rQ   r!   Tensorintr|   r   r"   
BoolTensorboolr$   r   __classcell__re   s   @r'   rN   rN      s    &&D5<< &D &DUX &D]b]i]i &DV 7;).	-u001- "%"2"23- #'	-
 
u||	-r&   rN   c                   v     e Zd ZdZ fdZd Zdeej                     de	ej                  e	e   f   fdZ xZS )rR   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        |d   |d   z  |d   |d   z  f| _        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)rP   rQ   
image_sizerb   rH   rY   
isinstancecollectionsabcIterablerT   rU   r   Conv2d
projection)r4   rc   r   rb   rH   hidden_sizerT   re   s          r'   rQ   zSwinPatchEmbeddings.__init__  s    !'!2!2F4E4EJ
$*$7$79I9Ik#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&$Q-:a=8*Q-:VW=:XY))L+:^hir&   c                 n   || j                   d   z  dk7  rDd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|| j                   d   z  dk7  rFddd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|S )Nr   r   )rb   r   rs   pad)r4   r}   rF   rG   
pad_valuess        r'   	maybe_padzSwinPatchEmbeddings.maybe_pad!  s    4??1%%*T__Q/%$//!:L2LLMJ==,,\:FLDOOA&&!+Q4??1#5QRAS8S#STJ==,,\:FLr&   r}   rg   c                     |j                   \  }}}}| j                  |||      }| j                  |      }|j                   \  }}}}||f}|j                  d      j	                  dd      }||fS )Nr:   r   )r?   r   r   flatten	transpose)r4   r}   r   rH   rF   rG   rf   r   s           r'   r   zSwinPatchEmbeddings.forward*  s}    )5););&<~~lFEB__\2
(..1fe#UO''*44Q:
,,,r&   )r   r   r   r    rQ   r   r   r!   r"   r$   r   r   r   r   r   s   @r'   rR   rR     sF    j	-HU->->$? 	-E%,,X]^aXbJbDc 	-r&   rR   c            	            e Zd ZdZej
                  fdee   dedej                  ddf fdZ	d Z
d	ej                  d
eeef   dej                  fdZ xZS )SwinPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    input_resolutionro   
norm_layerrg   Nc                     t         |           || _        || _        t	        j
                  d|z  d|z  d      | _         |d|z        | _        y )Nr;   r:   Fbias)rP   rQ   r   ro   r   Linear	reductionr^   )r4   r   ro   r   re   s       r'   rQ   zSwinPatchMerging.__init__C  sI     01s7AG%@q3w'	r&   c                     |dz  dk(  xs |dz  dk(  }|r.ddd|dz  d|dz  f}t         j                  j                  ||      }|S )Nr:   r   r   )r   rs   r   )r4   rC   rF   rG   
should_padr   s         r'   r   zSwinPatchMerging.maybe_padJ  sU    qjAo:519>
Q519a!<JMM--mZHMr&   rC   input_dimensionsc                    |\  }}|j                   \  }}}|j                  ||||      }| j                  |||      }|d d dd ddd dd d f   }|d d dd ddd dd d f   }	|d d dd ddd dd d f   }
|d d dd ddd dd d f   }t        j                  ||	|
|gd      }|j                  |dd|z        }| j                  |      }| j                  |      }|S )Nr   r:   r   r=   r;   )r?   r@   r   r!   ru   r^   r   )r4   rC   r   rF   rG   rE   ro   rH   input_feature_0input_feature_1input_feature_2input_feature_3s               r'   r   zSwinPatchMerging.forwardR  s   ((5(;(;%
C%**:vulS}feD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?O_Ve"fhjk%**:r1|;KL		-0}5r&   )r   r   r   r    r   r]   r$   r   ModulerQ   r   r!   r   r   r   r   s   @r'   r   r   6  sr    
 XZWcWc (s (# (299 (hl (U\\ U3PS8_ Y^YeYe r&   r   input	drop_probtrainingrg   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
            r   r   )r   dtypedevice)r?   ndimr!   randr   r   floor_div)r   r   r   	keep_probr?   random_tensoroutputs          r'   	drop_pathr   m  s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr&   c                   x     e Zd ZdZd	dee   ddf fdZdej                  dej                  fdZ	de
fdZ xZS )
SwinDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rg   c                 0    t         |           || _        y N)rP   rQ   r   )r4   r   re   s     r'   rQ   zSwinDropPath.__init__  s    "r&   r   c                 D    t        || j                  | j                        S r   )r   r   r   r4   r   s     r'   r   zSwinDropPath.forward  s    FFr&   c                      d| j                    S )Nzp=)r   r3   s    r'   
extra_reprzSwinDropPath.extra_repr  s    DNN#$$r&   r   )r   r   r   r    r   floatrQ   r!   r   r   strr   r   r   s   @r'   r   r     sG    b#(5/ #T #GU\\ Gell G%C %r&   r   c                        e Zd Z fdZ	 	 	 ddej
                  deej                     deej                     dee   de	ej
                     f
dZ
 xZS )	SwinSelfAttentionc                    t         |           ||z  dk7  rt        d| d| d      || _        t	        ||z        | _        | j                  | j
                  z  | _        t        |t        j                  j                        r|n||f| _        t        j                  t        j                  d| j                  d   z  dz
  d| j                  d   z  dz
  z  |            | _        t        j"                  | j                  d         }t        j"                  | j                  d         }t        j$                  t'        ||gd            }t        j(                  |d      }|d d d d d f   |d d d d d f   z
  }	|	j+                  ddd      j-                         }	|	d d d d dfxx   | j                  d   dz
  z  cc<   |	d d d d dfxx   | j                  d   dz
  z  cc<   |	d d d d dfxx   d| j                  d   z  dz
  z  cc<   |	j/                  d	      }
| j1                  d
|
       t        j2                  | j                  | j                  |j4                        | _        t        j2                  | j                  | j                  |j4                        | _        t        j2                  | j                  | j                  |j4                        | _        t        j<                  |j>                        | _         y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r:   r   ij)indexingr=   relative_position_indexr   )!rP   rQ   
ValueErrornum_attention_headsr   attention_head_sizeall_head_sizer   r   r   r   rD   r   rW   r!   rX   relative_position_bias_tablearangestackr   r   rA   rB   sumregister_bufferr   qkv_biasquerykeyvaluer_   attention_probs_dropout_probra   )r4   rc   ro   	num_headsrD   coords_hcoords_wcoordscoords_flattenrelative_coordsr   re   s              r'   rQ   zSwinSelfAttention.__init__  s   ?a#C5(^_h^iijk  $- #&sY#7 !558P8PP%k;??3K3KLKS^`kRl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
)
 << 0 0 34<< 0 0 34Xx&:TJKvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"968OPYYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr&   r   attention_mask	head_maskoutput_attentionsrg   c                    |j                   \  }}}||d| j                  f}| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }t        j                  |	|
j	                  dd            }|t        j                  | j                        z  }| j                  | j                  j                  d         }|j                  | j                  d   | j                  d   z  | j                  d   | j                  d   z  d      }|j                  ddd      j                         }||j!                  d      z   }|r|j                   d   }|j                  ||z  || j"                  ||      }||j!                  d      j!                  d      z   }|j                  d| j"                  ||      }t$        j&                  j)                  |d      }| j+                  |      }|||z  }t        j                  ||      }|j                  dddd      j                         }|j-                         d d | j.                  fz   }|j                  |      }|r||f}|S |f}|S )Nr=   r   r:   r   rn   r   )r?   r   r   r@   r   r   r   r!   matmulmathsqrtr   r   rD   rA   rB   r   r   r   rs   softmaxra   rk   r   )r4   r   r   r   r   rE   ro   rH   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                      r'   r   zSwinSelfAttention.forward  s    )6(;(;%
C"CT-E-EFjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<Y5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.D.N.Nq.QQ%'--a0J/44j(*d6N6NPSUX   0.2J2J12M2W2WXY2ZZ/44R9Q9QSVX[\ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r&   NNF)r   r   r   rQ   r!   r   r   r"   r   r$   r   r   r   s   @r'   r   r     sq    #GP 7;15,16||6 !!2!236 E--.	6
 $D>6 
u||	6r&   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )SwinSelfOutputc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y r   )rP   rQ   r   r   denser_   r   ra   r4   rc   ro   re   s      r'   rQ   zSwinSelfOutput.__init__  s6    YYsC(
zz&"E"EFr&   r   input_tensorrg   c                 J    | j                  |      }| j                  |      }|S r   r  ra   )r4   r   r  s      r'   r   zSwinSelfOutput.forward  s$    

=1]3r&   r   r   r   rQ   r!   r   r   r   r   s   @r'   r   r     s2    G
U\\  RWR^R^ r&   r   c                        e Zd Z fdZd Z	 	 	 d	dej                  deej                     deej                     dee	   de
ej                     f
dZ xZS )
SwinAttentionc                     t         |           t        ||||      | _        t	        ||      | _        t               | _        y r   )rP   rQ   r   r4   r   r   setpruned_heads)r4   rc   ro   r   rD   re   s        r'   rQ   zSwinAttention.__init__  s8    %fc9kJ	$VS1Er&   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   rn   )lenr   r4   r   r   r  r   r   r   r   r   r  r   union)r4   headsindexs      r'   prune_headszSwinAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r&   r   r   r   r   rg   c                 j    | j                  ||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r4   r   )r4   r   r   r   r   self_outputsattention_outputr   s           r'   r   zSwinAttention.forward  sG     yy	K\];;|AF#%QR(88r&   r   )r   r   r   rQ   r  r!   r   r   r"   r   r$   r   r   r   s   @r'   r	  r	    st    ";* 7;15,1
||
 !!2!23
 E--.	

 $D>
 
u||	
r&   r	  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )SwinIntermediatec                    t         |           t        j                  |t	        |j
                  |z              | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rP   rQ   r   r   r   	mlp_ratior  r   
hidden_actr   r   intermediate_act_fnr  s      r'   rQ   zSwinIntermediate.__init__#  sa    YYsC(8(83(>$?@
f''-'-f.?.?'@D$'-'8'8D$r&   r   rg   c                 J    | j                  |      }| j                  |      }|S r   )r  r  r   s     r'   r   zSwinIntermediate.forward+  s&    

=100?r&   r  r   s   @r'   r  r  "  s#    9U\\ ell r&   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
SwinOutputc                     t         |           t        j                  t	        |j
                  |z        |      | _        t        j                  |j                        | _	        y r   )
rP   rQ   r   r   r   r  r  r_   r`   ra   r  s      r'   rQ   zSwinOutput.__init__2  sF    YYs6#3#3c#9:C@
zz&"<"<=r&   r   rg   c                 J    | j                  |      }| j                  |      }|S r   r  r   s     r'   r   zSwinOutput.forward7  s$    

=1]3r&   r  r   s   @r'   r  r  1  s#    >
U\\ ell r&   r  c                        e Zd Zd fd	Zd Zd Zd Z	 	 	 ddej                  de	e
e
f   deej                     dee   d	ee   d
e	ej                  ej                  f   fdZ xZS )	SwinLayerc                    t         |           |j                  | _        || _        |j                  | _        || _        t        j                  ||j                        | _	        t        |||| j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t!        ||      | _        t%        ||      | _        y )Neps)rD   r   )rP   rQ   chunk_size_feed_forward
shift_sizerD   r   r   r]   layer_norm_epslayernorm_beforer	  	attentionr   Identityr   layernorm_afterr  intermediater  r   )r4   rc   ro   r   r   drop_path_rater'  re   s          r'   rQ   zSwinLayer.__init__>  s    '-'E'E$$!-- 0 "Sf6K6K L&vsI4K[K[\9G#9Mn5SUS^S^S`!||CV5J5JK,VS9 -r&   c                    t        |      | j                  k  rgt        d      | _        t        j
                  j                         r(t	        j                   t	        j                  |            n
t        |      | _        y y Nr   )minrD   r   r'  r!   rp   rq   tensor)r4   r   s     r'   set_shift_and_window_sizez#SwinLayer.set_shift_and_window_sizeK  s\     D$4$44'lDO=BYY=Q=Q=S		%,,'789Y\]mYn  5r&   c           	         | j                   dkD  rht        j                  d||df||      }t        d| j                         t        | j                   | j                          t        | j                    d       f}t        d| j                         t        | j                   | j                          t        | j                    d       f}d}|D ]  }	|D ]  }
||d d |	|
d d f<   |dz  }  t        || j                        }|j                  d| j                  | j                  z        }|j                  d      |j                  d      z
  }|j                  |dk7  d      j                  |dk(  d      }|S d }|S )Nr   r   r   r=   r:   g      Yr   )	r'  r!   rX   slicerD   rJ   r@   r   masked_fill)r4   rF   rG   r   r   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r'   get_attn_maskzSwinLayer.get_attn_maskS  s   ??Q{{Avua#8fUHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E - #/ K@EHQk1<=QJE
 ,Hd6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir&   c                     | j                   || j                   z  z
  | j                   z  }| j                   || j                   z  z
  | j                   z  }ddd|d|f}t        j                  j                  ||      }||fS r0  )rD   r   rs   r   )r4   r   rF   rG   	pad_right
pad_bottomr   s          r'   r   zSwinLayer.maybe_pado  s    %%0@0@(@@DDTDTT	&&$2B2B)BBdFVFVV
Ay!Z8
))-Dj((r&   r   r   r   r   always_partitionrg   c                    |s| j                  |       n	 |\  }}|j                         \  }}	}
|}| j                  |      }|j                  ||||
      }| j	                  |||      \  }}|j
                  \  }	}}}	| j                  dkD  r1t        j                  || j                   | j                   fd      }n|}t        || j                        }|j                  d| j                  | j                  z  |
      }| j                  |||j                  |j                        }| j                  ||||      }|d   }|j                  d| j                  | j                  |
      }t        || j                  ||      }| j                  dkD  r/t        j                  || j                  | j                  fd      }n|}|d   dkD  xs |d   dkD  }|r|d d d |d |d d f   j!                         }|j                  |||z  |
      }|| j#                  |      z   }| j%                  |      }| j'                  |      }|| j)                  |      z   }|r	||d	   f}|S |f}|S )
Nr   )r   r:   )shiftsdimsr=   r   )r   r   r<   r   )r3  rk   r)  r@   r   r?   r'  r!   rollrJ   rD   r?  r   r   r*  rL   rB   r   r,  r-  r   )r4   r   r   r   r   rC  rF   rG   rE   r   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsr>  attention_outputsr  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                            r'   r   zSwinLayer.forwardv  s     **+;<("/"4"4"6
Ax --m<%**:vuhO %)NN=&%$P!z&3&9&9#:y!??Q$)JJ}tFVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&	)<)<EZEaEa ' 
	 !NN!9iK\ + 
 -Q/,11"d6F6FHXHXZbc():D<L<LjZcd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:v~xX 4>>2C#DD++M:((6$t{{<'@@@Q'8';< YeWfr&   )r   r   NFF)r   r   r   rQ   r3  r?  r   r!   r   r$   r   r   r"   r   r   r   r   s   @r'   r"  r"  =  s    .8) 26,1+0A||A  S/A E--.	A
 $D>A #4.A 
u||U\\)	*Ar&   r"  c                        e Zd Z fdZ	 	 	 d	dej
                  deeef   deej                     dee
   dee
   deej
                     fdZ xZS )
	SwinStagec                 h   t         	|           || _        || _        t	        j
                  t        |      D cg c]-  }t        ||||||   |dz  dk(  rdn|j                  dz        / c}      | _	        |& |||t        j                        | _        d| _        y d | _        d| _        y c c}w )Nr:   r   )rc   ro   r   r   r.  r'  )ro   r   F)rP   rQ   rc   ro   r   
ModuleListranger"  rD   blocksr]   
downsamplepointing)
r4   rc   ro   r   depthr   r   r[  ire   s
            r'   rQ   zSwinStage.__init__  s    mm u
  !%5'#,Q<%&UaZqf6H6HA6M

 !()9sr||\DO  #DO'
s   2B/r   r   r   r   rC  rg   c                    |\  }}t        | j                        D ]  \  }}	|||   nd }
 |	|||
||      }|d   }! |}| j                  )|dz   dz  |dz   dz  }}||||f}| j                  ||      }n||||f}|||f}|r|dd  z  }|S )Nr   r   r:   )	enumeraterZ  r[  )r4   r   r   r   r   rC  rF   rG   r^  layer_modulelayer_head_maskrS  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledr   stage_outputss                    r'   r   zSwinStage.forward  s     )(5 	-OA|.7.CilO(/BSUeM *!,M	- -:)??&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_`M!' >&(IK\]]12..Mr&   rT  )r   r   r   rQ   r!   r   r$   r   r   r"   r   r   r   r   s   @r'   rV  rV    sz    < 26,1+0||  S/ E--.	
 $D> #4. 
u||	r&   rV  c                        e Zd Z fdZ	 	 	 	 	 	 ddej
                  deeef   deej                     dee
   dee
   dee
   dee
   d	ee
   d
eeef   fdZ xZS )SwinEncoderc                    t         |           t        |j                        | _        || _        t        j                  d|j                  t        |j                        d      D cg c]  }|j                          }}t        j                  t        | j                        D cg c]  }t        |t        |j                   d|z  z        |d   d|z  z  |d   d|z  z  f|j                  |   |j"                  |   |t        |j                  d |       t        |j                  d |dz           || j                  dz
  k  rt$        nd        c}      | _        d| _        y c c}w c c}w )Nr   cpu)r   r:   r   )rc   ro   r   r]  r   r   r[  F)rP   rQ   r  depths
num_layersrc   r!   linspacer.  r   itemr   rX  rY  rV  r   rY   r   r   layersgradient_checkpointing)r4   rc   rU   xdpri_layerre   s         r'   rQ   zSwinEncoder.__init__  sM   fmm,!&63H3H#fmmJ\ej!klAqvvxllmm  %T__5  !F,,q'z9:&/lq'z&BIaLUVX_U_D`%a --0$..w7!#fmmHW&=">V]]S`U\_`U`EaAbc4;dooPQ>Q4Q/X\
 ',#! ms   )E&(B*E+r   r   r   r   output_hidden_states(output_hidden_states_before_downsamplingrC  return_dictrg   c	                    |rdnd }	|rdnd }
|rdnd }|rE|j                   \  }}} |j                  |g|| }|j                  dddd      }|	|fz  }	|
|fz  }
t        | j                        D ]  \  }}|||   nd } ||||||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                   \  }}} |j                  |g|d   |d   f| }|j                  dddd      }|	|fz  }	|
|fz  }
nI|rG|sE|j                   \  }}} |j                  |g|| }|j                  dddd      }|	|fz  }	|
|fz  }
|s||dd  z  } |st        d ||	|fD              S t        ||	||
	      S )
Nr%   r   r   r   r:   r   r=   c              3   &   K   | ]	  }||  y wr   r%   ).0vs     r'   	<genexpr>z&SwinEncoder.forward.<locals>.<genexpr>G  s     mq_`_lms   )r   r   r   r   )r?   r@   rA   r`  ro  r$   r   )r4   r   r   r   r   rt  ru  rC  rv  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsrE   r   r   reshaped_hidden_stater^  ra  rb  rS  rc  r   s                         r'   r   zSwinEncoder.forward  sI    #7BD+?RT"$5b4)6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5  	9OA|.7.CilO(/BSUeM *!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF* #}QR'88#A 	9D m]4EGZ$[mmm ++*#=	
 	
r&   )NFFFFT)r   r   r   rQ   r!   r   r$   r   r   r"   r   r   r   r   r   r   s   @r'   rh  rh    s    ,4 26,1/4CH+0&*A
||A
  S/A
 E--.	A

 $D>A
 'tnA
 3;4.A
 #4.A
 d^A
 
u''	(A
r&   rh  c                   0    e Zd ZU eed<   dZdZdZdgZd Z	y)SwinPreTrainedModelrc   swinr}   TrV  c                 H   t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yt        |t              rb|j                  $|j                  j
                  j                          |j                  %|j                  j
                  j                          yyt        |t               r%|j"                  j
                  j                          yy)zInitialize the weightsr   )meanstdNr   )r   r   r   r   weightdatanormal_rc   initializer_ranger   zero_r]   fill_rN   rZ   r\   r   r   )r4   modules     r'   _init_weightsz!SwinPreTrainedModel._init_weightsY  s"   fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S)/  ,!!&&,,.))5**//557 6 12//44::< 3r&   N)
r   r   r   r   r#   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr  r%   r&   r'   r  r  Q  s&    $O&*#$=r&   r  c                        e Zd Zd fd	Zd Zd Ze	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee   dee   d	ed
ee   deeef   fd       Z xZS )	SwinModelc                    t         |   |       || _        t        |j                        | _        t        |j                  d| j
                  dz
  z  z        | _        t        ||      | _
        t        || j                  j                        | _        t        j                  | j                  |j                         | _        |rt        j$                  d      nd| _        | j)                          y)a  
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether or not to apply pooling layer.
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether or not to create and apply mask tokens in the embedding layer.
        r:   r   )rd   r$  N)rP   rQ   rc   r  rk  rl  r   rY   num_featuresrN   rf   rh  rV   encoderr   r]   r(  	layernormAdaptiveAvgPool1dpooler	post_init)r4   rc   add_pooling_layerrd   re   s       r'   rQ   zSwinModel.__init__o  s     	 fmm, 0 0119L3M MN(O"64??+E+EFd&7&7V=R=RS1Bb**1- 	r&   c                 .    | j                   j                  S r   rf   rS   r3   s    r'   get_input_embeddingszSwinModel.get_input_embeddings      ///r&   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  layerr*  r  )r4   heads_to_pruner  r  s       r'   _prune_headszSwinModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr&   r}   r~   r   r   rt  r|   rv  rg   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  |t        | j                   j                              }| j                  |||      \  }}	| j                  ||	||||      }
|
d   }| j                  |      }d}| j                  7| j                  |j                  dd            }t        j                  |d      }|s||f|
dd z   }|S t        |||
j                   |
j"                  |
j$                        S )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)r~   r|   )r   r   rt  rv  r   r   r:   )r   r*   r   r   r   )rc   r   rt  use_return_dictr   get_head_maskr  rk  rf   r  r  r  r   r!   r   r)   r   r   r   )r4   r}   r~   r   r   rt  r|   rv  embedding_outputr   encoder_outputssequence_outputpooled_outputr   s                 r'   r   zSwinModel.forward  sp    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@ &&y#dkk6H6H2IJ	-1__/Tl .= .
** ,,/!5# ' 
 *!,..9;;" KK(A(A!Q(GHM!MM-;M%}58KKFM-')77&11#2#I#I
 	
r&   )TFNNNNNFN)r   r   r   rQ   r  r  r   r   r!   r"   r   r   r   r$   r)   r   r   r   s   @r'   r  r  m  s    *0C  596:15,0/3).&*>
u001>
 "%"2"23>
 E--.	>

 $D>>
 'tn>
 #'>
 d^>
 
uo%	&>
 >
r&   r  ad  
    Swin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                        e Zd Z fdZe	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee	   dee	   de	dee	   d	e
eef   fd
       Z xZS )SwinForMaskedImageModelingc                    t         |   |       t        |dd      | _        t	        |j
                  d|j                  dz
  z  z        }t        j                  t        j                  ||j                  dz  |j                  z  d      t        j                  |j                              | _        | j                          y )NFT)r  rd   r:   r   )in_channelsout_channelsr   )rP   rQ   r  r  r   rY   rl  r   
Sequentialr   encoder_striderH   PixelShuffledecoderr  )r4   rc   r  re   s      r'   rQ   z#SwinForMaskedImageModeling.__init__  s     fdS	6++aF4E4E4I.JJK}}II(v7L7La7ORXReRe7est OOF112	
 	r&   r}   r~   r   r   rt  r|   rv  rg   c           	         ||n| j                   j                  }| j                  |||||||      }|d   }	|	j                  dd      }	|	j                  \  }
}}t        j                  |dz        x}}|	j                  |
|||      }	| j                  |	      }d}|| j                   j                  | j                   j                  z  }|j                  d||      }|j                  | j                   j                  d      j                  | j                   j                  d      j                  d      j                         }t        j                  j!                  ||d	      }||z  j#                         |j#                         d
z   z  | j                   j$                  z  }|s|f|dd z   }||f|z   S |S t'        |||j(                  |j*                  |j,                        S )a7  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, SwinForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-simmim-window6-192")
        >>> model = SwinForMaskedImageModeling.from_pretrained("microsoft/swin-base-simmim-window6-192")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```N)r~   r   r   rt  r|   rv  r   r   r:   ri   r=   none)r   gh㈵>)r-   r.   r   r   r   )rc   r  r  r   r?   r   floorrr   r  r   rb   repeat_interleaver   rB   r   rs   l1_lossr   rH   r,   r   r   r   )r4   r}   r~   r   r   rt  r|   rv  r   r  rE   rH   sequence_lengthrF   rG   reconstructed_pixel_valuesmasked_im_lossrk   r   reconstruction_lossr   s                        r'   r   z"SwinForMaskedImageModeling.forward  s   L &1%<k$++B]B]))+/!5%=#  
 "!*)33Aq94C4I4I1
L/OS$899)11*lFTYZ &*\\/%B"&;;))T[[-C-CCD-55b$EO11$++2H2H!L""4;;#9#91=1	  #%--"7"7F`lr"7"s1D8==?488:PTCTUX\XcXcXpXppN02WQR[@F3A3M^%.YSYY,5!//))#*#A#A
 	
r&   r  )r   r   r   rQ   r   r   r!   r"   r   r   r   r$   r,   r   r   r   s   @r'   r  r    s       596:15,0/3).&*R
u001R
 "%"2"23R
 E--.	R

 $D>R
 'tnR
 #'R
 d^R
 
u33	4R
 R
r&   r  a  
    Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune Swin on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                        e Zd Z fdZe	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee	   dee	   de	dee	   d	e
eef   fd
       Z xZS )SwinForImageClassificationc                 >   t         |   |       |j                  | _        t        |      | _        |j                  dkD  r4t        j                  | j                  j                  |j                        nt        j                         | _	        | j                          y r0  )rP   rQ   
num_labelsr  r  r   r   r  r+  
classifierr  )r4   rc   re   s     r'   rQ   z#SwinForImageClassification.__init__S  sx      ++f%	 EKDUDUXYDYBIIdii,,f.?.?@_a_j_j_l 	
 	r&   r}   r   labelsr   rt  r|   rv  rg   c                 \   ||n| j                   j                  }| j                  ||||||      }|d   }	| j                  |	      }
d}|| j	                  |
||
| j                         }|s|
f|dd z   }||f|z   S |S t        ||
|j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   rt  r|   rv  r   )r5   r  pooled_logitsrc   r:   )r-   r5   r   r   r   )	rc   r  r  r  loss_functionr8   r   r   r   )r4   r}   r   r  r   rt  r|   rv  r   r  r5   r-   r   s                r'   r   z"SwinForImageClassification.forwarda  s    " &1%<k$++B]B]))/!5%=#  
  
/%%VFRXaealal%mDY,F)-)9TGf$EvE(!//))#*#A#A
 	
r&   r  )r   r   r   rQ   r   r   r!   r"   
LongTensorr   r   r$   r8   r   r   r   s   @r'   r  r  D  s      5915-1,0/3).&*-
u001-
 E--.-
 ))*	-

 $D>-
 'tn-
 #'-
 d^-
 
u//	0-
 -
r&   r  zM
    Swin backbone, to be used with frameworks like DETR and MaskFormer.
    c                   t     e Zd Zdef fdZd Z	 	 	 d
dej                  dee	   dee	   dee	   de
f
d	Z xZS )SwinBackbonerc   c           	      >   t         |   |       t         | 	  |       |j                  gt	        t        |j                              D cg c]  }t        |j                  d|z  z         c}z   | _        t        |      | _
        t        || j                  j                        | _        i }t        | j                  | j                         D ]  \  }}t#        j$                  |      ||<    t#        j&                  |      | _        | j+                          y c c}w )Nr:   )rP   rQ   _init_backbonerY   rY  r  rk  r   r  rN   rf   rh  rV   r  zip_out_featuresrH  r   r]   
ModuleDicthidden_states_normsr  )r4   rc   r^  r  stagerH   re   s         r'   rQ   zSwinBackbone.__init__  s     v&#--.X]^abhbobo^pXq1rST#f6F6FA6M2N1rr(0"64??+E+EF !#&t'9'94==#I 	DE<)+l)C&	D#%==1D#E  	 2ss   "Dc                 .    | j                   j                  S r   r  r3   s    r'   r  z!SwinBackbone.get_input_embeddings  r  r&   r}   rt  r   rv  rg   c           
          ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      \  }}| j                  ||d|dddd      }|j                  }d}	t        | j                  |      D ]  \  }
}|
| j                  v s|j                  \  }}}}|j                  dddd      j                         }|j                  |||z  |      } | j                  |
   |      }|j                  ||||      }|j                  dddd      j                         }|	|fz  }	 |s|	f}|r||j                  fz  }|S t!        |	|r|j                  nd|j"                  	      S )
aK  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/swin-tiny-patch4-window7-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 7, 7]
        ```NT)r   r   rt  ru  rC  rv  r%   r   r:   r   r   )feature_mapsr   r   )rc   r  rt  r   rf   r  r   r  stage_namesout_featuresr?   rA   rB   r@   r  r   r
   r   )r4   r}   rt  r   rv  r  r   r   r   r  r  hidden_staterE   rH   rF   rG   r   s                    r'   r   zSwinBackbone.forward  s   @ &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq-1__\-J**,,/!%59!  	
  66#&t'7'7#G 	0E<))):F:L:L7
L&%+33Aq!Q?JJL+00Ve^\Z>t77>|L+00VULY+33Aq!Q?JJL/	0 "_F#70022M%3G'//T))
 	
r&   )NNN)r   r   r   r   rQ   r  r!   r   r   r   r
   r   r   r   s   @r'   r  r    sj    z "0 04,0&*J
llJ
 'tnJ
 $D>	J

 d^J
 
J
r&   r  )r  r  r  r  r  )r   F)Br    collections.abcr   r   r0   dataclassesr   typingr   r   r!   torch.utils.checkpointr   activationsr   modeling_layersr	   modeling_outputsr
   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   utils.backbone_utilsr   configuration_swinr   
get_loggerr   loggerr   r)   r,   r8   rJ   rL   r   rN   rR   r   r   r   r   r   r   r   r   r	  r  r  r"  rV  rh  r  r  r  r  r  __all__r%   r&   r'   <module>r     s   &    ! "    ! 9 . - [ [ D D 1 * 
		H	% 
K K K  
Kk K K& 
#K # #< 
K K K*	Y-RYY Y-x(-")) (-V3ryy 3nU\\ e T V[VbVb *%299 %\		 \~
RYY 
#BII #Lryy 	 	z		 zz9* 9xX
")) X
v =/ = =6 `
# `
 `
F 	d
!4 d
d
N =
!4 =
=
@ 
_
& _

_
Dr&   