"""PyTorch FocalNet model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.backbone_utils import BackboneMixin
from .configuration_focalnet import FocalNetConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    FocalNet encoder's outputs, with potential hidden states.
    """
)
class FocalNetEncoderOutput(ModelOutput):
    r"""
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    FocalNet model's outputs that also contains a pooling of the last hidden states.
    """
)
class FocalNetModelOutput(ModelOutput):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
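
    Example (an illustrative sketch; the shapes below assume the `microsoft/focalnet-tiny` configuration
    with a 224x224 input, and that checkpoint name is only a suggestion):

    ```python
    >>> import torch
    >>> from transformers import FocalNetModel

    >>> model = FocalNetModel.from_pretrained("microsoft/focalnet-tiny")
    >>> pixel_values = torch.randn(1, 3, 224, 224)
    >>> with torch.no_grad():
    ...     outputs = model(pixel_values, output_hidden_states=True)
    >>> # each reshaped hidden state keeps the spatial layout (batch_size, hidden_size, height, width)
    >>> outputs.reshaped_hidden_states[0].shape
    torch.Size([1, 96, 56, 56])
    ```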
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    FocalNet masked image model outputs.
    """
)
class FocalNetMaskedImageModelingOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    reconstruction: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    FocalNet outputs for image classification.
    """
)
class FocalNetImageClassifierOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None


class FocalNetEmbeddings(nn.Module):
    """
    Construct the patch embeddings and layernorm. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        self.patch_embeddings = FocalNetPatchEmbeddings(
            config=config,
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_channels=config.num_channels,
            embed_dim=config.embed_dim,
            use_conv_embed=config.use_conv_embed,
            is_stem=True,
        )
        self.patch_grid = self.patch_embeddings.grid_size
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None

        self.norm = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> tuple[torch.Tensor]:
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        embeddings = self.norm(embeddings)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        embeddings = self.dropout(embeddings)
        return embeddings, output_dimensions


class FocalNetPatchEmbeddings(nn.Module):
    def __init__(
        self,
        config,
        image_size,
        patch_size,
        num_channels,
        embed_dim,
        add_norm=False,
        use_conv_embed=False,
        is_stem=False,
    ):
        super().__init__()
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        if use_conv_embed:
            # if we choose to use conv embedding, then we treat the stem and non-stem layers differently
            if is_stem:
                kernel_size = 7
                padding = 2
                stride = 4
            else:
                kernel_size = 3
                padding = 1
                stride = 2
            self.projection = nn.Conv2d(
                num_channels, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
            )
        else:
            self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

        if add_norm:
            self.norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        else:
            self.norm = None

    def maybe_pad(self, pixel_values, height, width):
        if width % self.patch_size[1] != 0:
            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        if height % self.patch_size[0] != 0:
            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        return pixel_values

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> tuple[torch.Tensor, tuple[int]]:
        _, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # pad the input to be divisible by self.patch_size, if needed
        pixel_values = self.maybe_pad(pixel_values, height, width)
        embeddings = self.projection(pixel_values)
        _, _, height, width = embeddings.shape
        output_dimensions = (height, width)
        embeddings = embeddings.flatten(2).transpose(1, 2)

        if self.norm is not None:
            embeddings = self.norm(embeddings)

        return embeddings, output_dimensions


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class FocalNetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"


class FocalNetModulation(nn.Module):
    def __init__(self, config, index, dim, focal_factor=2, bias=True, projection_dropout=0.0):
        super().__init__()

        self.dim = dim
        self.focal_window = config.focal_windows[index]
        self.focal_level = config.focal_levels[index]
        self.focal_factor = focal_factor
        self.use_post_layernorm_in_modulation = config.use_post_layernorm_in_modulation
        self.normalize_modulator = config.normalize_modulator

        self.projection_in = nn.Linear(dim, 2 * dim + (self.focal_level + 1), bias=bias)
        self.projection_context = nn.Conv2d(dim, dim, kernel_size=1, stride=1, bias=bias)

        self.activation = nn.GELU()
        self.projection_out = nn.Linear(dim, dim)
        self.projection_dropout = nn.Dropout(projection_dropout)
        self.focal_layers = nn.ModuleList()

        self.kernel_sizes = []
        for k in range(self.focal_level):
            kernel_size = self.focal_factor * k + self.focal_window
            self.focal_layers.append(
                nn.Sequential(
                    nn.Conv2d(
                        dim, dim, kernel_size=kernel_size, stride=1, groups=dim, padding=kernel_size // 2, bias=False
                    ),
                    nn.GELU(),
                )
            )
            self.kernel_sizes.append(kernel_size)
        if self.use_post_layernorm_in_modulation:
            self.layernorm = nn.LayerNorm(dim, eps=config.layer_norm_eps)

    def forward(self, hidden_state):
        """
        Args:
            hidden_state:
                Input features with shape of (batch_size, height, width, num_channels)
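
        Shape sketch (illustrative only, not part of the public API): the input is first projected to
        `2 * dim + focal_level + 1` channels and then split into the query, the context, and the gates
        that weight each focal level, e.g.:

        ```python
        >>> import torch
        >>> batch_size, height, width, dim, focal_level = 2, 7, 7, 96, 3
        >>> projected = torch.randn(batch_size, 2 * dim + focal_level + 1, height, width)
        >>> q, ctx, gates = torch.split(projected, (dim, dim, focal_level + 1), 1)
        >>> q.shape, ctx.shape, gates.shape
        (torch.Size([2, 96, 7, 7]), torch.Size([2, 96, 7, 7]), torch.Size([2, 4, 7, 7]))
        ```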
        """
        num_channels = hidden_state.shape[-1]

        # pre linear projection
        x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous()
        q, ctx, gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1)

        # context aggregation
        ctx_all = 0
        for level in range(self.focal_level):
            ctx = self.focal_layers[level](ctx)
            ctx_all = ctx_all + ctx * gates[:, level : level + 1]
        ctx_global = self.activation(ctx.mean(dim=(2, 3), keepdim=True))
        ctx_all = ctx_all + ctx_global * gates[:, self.focal_level :]

        # normalize context
        if self.normalize_modulator:
            ctx_all = ctx_all / (self.focal_level + 1)

        # focal modulation
        modulator = self.projection_context(ctx_all)
        x_out = q * modulator
        x_out = x_out.permute(0, 2, 3, 1).contiguous()
        if self.use_post_layernorm_in_modulation:
            x_out = self.layernorm(x_out)

        # post linear projection
        x_out = self.projection_out(x_out)
        x_out = self.projection_dropout(x_out)
        return x_out


class FocalNetMlp(nn.Module):
    def __init__(self, config, in_features, hidden_features=None, out_features=None, drop=0.0):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.activation = ACT2FN[config.hidden_act]
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, hidden_state):
        hidden_state = self.fc1(hidden_state)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.drop(hidden_state)
        hidden_state = self.fc2(hidden_state)
        hidden_state = self.drop(hidden_state)
        return hidden_state


class FocalNetLayer(nn.Module):
    r"""Focal Modulation Network layer (block).

    Args:
        config (`FocalNetConfig`):
            Model config.
        index (`int`):
            Layer index.
        dim (`int`):
            Number of input channels.
        input_resolution (`tuple[int]`):
            Input resolution.
        drop_path (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate.
    """

    def __init__(self, config, index, dim, input_resolution, drop_path=0.0):
        super().__init__()

        self.config = config

        # layer-specific attributes
        self.dim = dim
        self.input_resolution = input_resolution

        # general attributes
        self.drop = config.hidden_dropout_prob
        self.use_post_layernorm = config.use_post_layernorm

        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.modulation = FocalNetModulation(
            config=config,
            index=index,
            dim=dim,
            projection_dropout=self.drop,
        )

        self.drop_path = FocalNetDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        mlp_hidden_dim = int(dim * config.mlp_ratio)
        self.mlp = FocalNetMlp(config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=self.drop)

        self.gamma_1 = 1.0
        self.gamma_2 = 1.0
        if config.use_layerscale:
            self.gamma_1 = nn.Parameter(config.layerscale_value * torch.ones(dim), requires_grad=True)
            self.gamma_2 = nn.Parameter(config.layerscale_value * torch.ones(dim), requires_grad=True)

    def forward(self, hidden_state, input_dimensions):
        height, width = input_dimensions
        batch_size, _, num_channels = hidden_state.shape
        shortcut = hidden_state

        # Focal Modulation
        hidden_state = hidden_state if self.use_post_layernorm else self.norm1(hidden_state)
        hidden_state = hidden_state.view(batch_size, height, width, num_channels)
        hidden_state = self.modulation(hidden_state).view(batch_size, height * width, num_channels)
        hidden_state = hidden_state if not self.use_post_layernorm else self.norm1(hidden_state)

        # FFN
        hidden_state = shortcut + self.drop_path(self.gamma_1 * hidden_state)
        hidden_state = hidden_state + self.drop_path(
            self.gamma_2
            * (self.norm2(self.mlp(hidden_state)) if self.use_post_layernorm else self.mlp(self.norm2(hidden_state)))
        )

        return hidden_state


class FocalNetStage(GradientCheckpointingLayer):
    def __init__(self, config, index, input_resolution):
        super().__init__()

        self.config = config
        self.num_stages = len(config.depths)

        embed_dim = [config.embed_dim * (2**i) for i in range(self.num_stages)]
        dim = embed_dim[index]
        out_dim = embed_dim[index + 1] if (index < self.num_stages - 1) else None
        downsample = FocalNetPatchEmbeddings if (index < self.num_stages - 1) else None

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
        drop_path = dpr[sum(config.depths[:index]) : sum(config.depths[: index + 1])]

        self.layers = nn.ModuleList(
            [
                FocalNetLayer(
                    config=config,
                    index=index,
                    dim=dim,
                    input_resolution=input_resolution,
                    drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                )
                for i in range(config.depths[index])
            ]
        )

        if downsample is not None:
            self.downsample = downsample(
                config=config,
                image_size=input_resolution,
                patch_size=2,
                num_channels=dim,
                embed_dim=out_dim,
                add_norm=True,
                use_conv_embed=config.use_conv_embed,
                is_stem=False,
            )
        else:
            self.downsample = None

        self.pointing = False

    def forward(self, hidden_states: torch.Tensor, input_dimensions: tuple[int, int]) -> tuple[torch.Tensor]:
        height, width = input_dimensions
        for layer_module in self.layers:
            hidden_states = layer_module(hidden_states, input_dimensions)

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            height, width = input_dimensions
            hidden_states = hidden_states_before_downsampling.transpose(1, 2).reshape(
                hidden_states_before_downsampling.shape[0], -1, height, width
            )
            hidden_states, output_dimensions = self.downsample(hidden_states)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        return stage_outputs


class FocalNetEncoder(nn.Module):
    def __init__(self, config, grid_size):
        super().__init__()
        self.num_stages = len(config.depths)
        self.config = config

        self.stages = nn.ModuleList(
            [
                FocalNetStage(
                    config=config,
                    index=i_layer,
                    input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                )
                for i_layer in range(self.num_stages)
            ]
        )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: tuple[int, int],
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple, FocalNetEncoderOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None

        if output_hidden_states:
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange b (h w) c -> b c h w
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, stage_module in enumerate(self.stages):
            stage_outputs = stage_module(hidden_states, input_dimensions)

            hidden_states = stage_outputs[0]
            hidden_states_before_downsampling = stage_outputs[1]
            output_dimensions = stage_outputs[2]

            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # rearrange b (h w) c -> b c h w, using the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                # rearrange b (h w) c -> b c h w
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_reshaped_hidden_states] if v is not None)

        return FocalNetEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )


@auto_docstring
class FocalNetPreTrainedModel(PreTrainedModel):
    config: FocalNetConfig
    base_model_prefix = "focalnet"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["FocalNetStage"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, FocalNetEmbeddings):
            if module.mask_token is not None:
                module.mask_token.data.zero_()
        elif isinstance(module, FocalNetLayer):
            if self.config.use_layerscale:
                module.gamma_1.data.fill_(self.config.layerscale_value)
                module.gamma_2.data.fill_(self.config.layerscale_value)


@auto_docstring
class FocalNetModel(FocalNetPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        """
        super().__init__(config)
        self.config = config
        self.num_stages = len(config.depths)
        self.num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))

        self.embeddings = FocalNetEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = FocalNetEncoder(config, self.embeddings.patch_grid)

        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, FocalNetModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
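
        Example (a minimal sketch; `microsoft/focalnet-tiny` is assumed as the checkpoint and the shapes
        correspond to its default configuration):

        ```python
        >>> import torch
        >>> from transformers import FocalNetModel

        >>> model = FocalNetModel.from_pretrained("microsoft/focalnet-tiny")
        >>> pixel_values = torch.randn(1, 3, 224, 224)
        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)
        >>> outputs.last_hidden_state.shape  # (batch_size, sequence_length, hidden_size)
        torch.Size([1, 49, 768])
        >>> outputs.pooler_output.shape  # average-pooled final hidden state
        torch.Size([1, 768])
        ```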
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        pooled_output = None
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]

            return output

        return FocalNetModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    FocalNet Model with a decoder on top for masked image modeling.

    This follows the same implementation as in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """
)
class FocalNetForMaskedImageModeling(FocalNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.focalnet = FocalNetModel(config, add_pooling_layer=False, use_mask_token=True)

        self.num_stages = len(config.depths)
        num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))
        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, FocalNetMaskedImageModelingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, FocalNetConfig, FocalNetForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-base-simmim-window6-192")
        >>> config = FocalNetConfig()
        >>> model = FocalNetForMaskedImageModeling(config)

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.focalnet(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        # Reshape to (batch_size, num_channels, height, width)
        sequence_output = sequence_output.transpose(1, 2)
        batch_size, num_channels, sequence_length = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        if not return_dict:
            output = (reconstructed_pixel_values,) + outputs[2:]
            return ((masked_im_loss,) + output) if masked_im_loss is not None else output

        return FocalNetMaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
    ImageNet.
    """
)
class FocalNetForImageClassification(FocalNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.focalnet = FocalNetModel(config)

        # Classifier head
        self.classifier = (
            nn.Linear(self.focalnet.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, FocalNetImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
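
        Example (a minimal sketch; `microsoft/focalnet-tiny` is an ImageNet-1k checkpoint and is assumed
        here only for illustration):

        ```python
        >>> import torch
        >>> from transformers import FocalNetForImageClassification

        >>> model = FocalNetForImageClassification.from_pretrained("microsoft/focalnet-tiny")
        >>> pixel_values = torch.randn(1, 3, 224, 224)
        >>> with torch.no_grad():
        ...     logits = model(pixel_values).logits
        >>> logits.shape  # (batch_size, config.num_labels)
        torch.Size([1, 1000])
        >>> predicted_class = model.config.id2label[logits.argmax(-1).item()]
        ```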
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.focalnet(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return FocalNetImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    FocalNet backbone, to be used with frameworks like X-Decoder.
    """
)
class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin):
    def __init__(self, config: FocalNetConfig):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.embed_dim] + config.hidden_sizes
        self.focalnet = FocalNetModel(config)

        # initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
        >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.focalnet(pixel_values, output_hidden_states=True, return_dict=True)

        hidden_states = outputs.reshaped_hidden_states

        feature_maps = ()
        for idx, stage in enumerate(self.stage_names):
            if stage in self.out_features:
                feature_maps += (hidden_states[idx],)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )


__all__ = [
    "FocalNetForImageClassification",
    "FocalNetForMaskedImageModeling",
    "FocalNetBackbone",
    "FocalNetModel",
    "FocalNetPreTrainedModel",
]