
    rh              	          d Z ddlZddlmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddlmZ  ej@                  e!      Z"d>de#de#dee#   de#fdZ$ G d dejJ                        Z& G d dejJ                        Z' G d dejJ                        Z( G d dejJ                        Z) G d dejJ                        Z* G d dejJ                        Z+ G d  d!ejJ                        Z, G d" d#ejJ                        Z- G d$ d%ejJ                        Z. G d& d'ejJ                        Z/ G d( d)e      Z0 G d* d+ejJ                        Z1e G d, d-e             Z2e G d. d/e2             Z3 ed01       G d2 d3e2             Z4 G d4 d5ejJ                        Z5 G d6 d7ejJ                        Z6 G d8 d9ejJ                        Z7 ed:1       G d; d<e2             Z8g d=Z9y)?zPyTorch MobileViT model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )MobileViTConfigvaluedivisor	min_valuereturnc                 |    ||}t        |t        | |dz  z         |z  |z        }|d| z  k  r||z  }t        |      S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
       g?)maxint)r   r   r   	new_values       /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler"   ,   sS     	Is57Q;#677BWLMI3;W	y>    c                        e Zd Z	 	 	 	 	 	 ddedededededededed	ed
eeef   ddf fdZde	j                  de	j                  fdZ xZS )MobileViTConvLayerconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 $   t         |           t        |dz
  dz        |z  }||z  dk7  rt        d| d| d      ||z  dk7  rt        d| d| d      t	        j
                  ||||||||d		      | _        |	r t	        j                  |d
ddd      | _        nd | _        |
rdt        |
t              rt        |
   | _        y t        |j                  t              rt        |j                     | _        y |j                  | _        y d | _        y )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r'   r(   r)   r*   paddingr-   r+   r,   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr
   
activation
hidden_act)selfr&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r2   	__class__s               r!   r:   zMobileViTConvLayer.__init__<   s*    	{Q!+,x71$/}<STZS[[cdee& A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#."("8F--s3"():):";"("3"3"DOr#   featuresc                     | j                  |      }| j                  | j                  |      }| j                  | j                  |      }|S N)r=   r?   rB   )rD   rF   s     r!   forwardzMobileViTConvLayer.forwardr   sK    ##H-)))(3H??&x0Hr#   )r   r   Fr   TT)__name__
__module____qualname__r   r   boolr   rA   r:   torchTensorrI   __classcell__rE   s   @r!   r%   r%   ;   s     "&+/4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4#l  r#   r%   c                   x     e Zd ZdZ	 ddedededededdf fd	Zd
ej                  dej                  fdZ	 xZ
S )MobileViTInvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r&   r'   r(   r*   r-   r   Nc           	      @   t         |           t        t        t	        ||j
                  z              d      }|dvrt        d| d      |dk(  xr ||k(  | _        t        |||d      | _	        t        |||d|||      | _
        t        |||dd	
      | _        y )N   )r   r   zInvalid stride .r   r'   r(   r)   r	   )r'   r(   r)   r*   r+   r-   Fr'   r(   r)   r/   )r9   r:   r"   r   roundexpand_ratior;   use_residualr%   
expand_1x1conv_3x3
reduce_1x1)rD   r&   r'   r(   r*   r-   expanded_channelsrE   s          r!   r:   z"MobileViTInvertedResidual.__init__   s     	*3u[6CVCV5V/W+XZ[\vha899#q[K{l/J,:KYZ
 +)*$
 -)% 
r#   rF   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  r||z   S |S rH   )r\   r]   r^   r[   )rD   rF   residuals      r!   rI   z!MobileViTInvertedResidual.forward   sI    ??8,==*??8,&*&7&7x("EXEr#   r   )rJ   rK   rL   __doc__r   r   r:   rN   rO   rI   rP   rQ   s   @r!   rS   rS   {   sc    
 jk
%
47
GJ
TW
cf
	
BF F Fr#   rS   c                   t     e Zd Z	 ddedededededdf fdZd	ej                  dej                  fd
Z xZ	S )MobileViTMobileNetLayerr&   r'   r(   r*   
num_stagesr   Nc                     t         |           t        j                         | _        t        |      D ]5  }t        ||||dk(  r|nd      }| j                  j                  |       |}7 y )Nr   r   )r'   r(   r*   )r9   r:   r   
ModuleListlayerrangerS   append)	rD   r&   r'   r(   r*   rf   iri   rE   s	           r!   r:   z MobileViTMobileNetLayer.__init__   sh     	]]_
z" 	'A-')!"avQ	E JJe$&K	'r#   rF   c                 8    | j                   D ]
  } ||      } |S rH   ri   )rD   rF   layer_modules      r!   rI   zMobileViTMobileNetLayer.forward   s$     JJ 	.L#H-H	.r#   )r   r   
rJ   rK   rL   r   r   r:   rN   rO   rI   rP   rQ   s   @r!   re   re      sV    op'%'47'GJ'TW'il'	'   r#   re   c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )MobileViTSelfAttentionr&   hidden_sizer   Nc                    t         |           ||j                  z  dk7  rt        d| d|j                   d      |j                  | _        t	        ||j                  z        | _        | j                  | j
                  z  | _        t        j                  || j                  |j                        | _
        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  |j                        | _        y )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rV   )r,   )r9   r:   num_attention_headsr;   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrD   r&   rs   rE   s      r!   r:   zMobileViTSelfAttention.__init__   s    333q8";- 0334A7 
 $*#=#= #&{V5O5O'O#P !558P8PPYY{D,>,>V__U
99[$*<*<6??SYY{D,>,>V__U
zz&"E"EFr#   hidden_statesc                    |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }t        j                  ||j                  dd            }|t        j                  | j                        z  }t        j                  j                  |d      }	| j                  |	      }	t        j                  |	|      }
|
j!                  dddd      j#                         }
|
j%                         d d | j&                  fz   } |
j                  | }
|
S )Nr   r   dimr   r	   )shaperz   viewru   rv   	transposer{   r   rN   matmulmathsqrtr   
functionalsoftmaxr~   permute
contiguoussizerw   )rD   r   
batch_size
seq_length_query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes               r!   rI   zMobileViTSelfAttention.forward   s   $1$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<Y5H5HR5PQ+dii8P8P.QQ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDr#   rp   rQ   s   @r!   rr   rr      s<    G GS GT G&"U\\ "ell "r#   rr   c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )MobileViTSelfOutputr&   rs   r   Nc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rH   r9   r:   r   rx   denser|   hidden_dropout_probr~   r   s      r!   r:   zMobileViTSelfOutput.__init__   s6    YY{K8
zz&"<"<=r#   r   c                 J    | j                  |      }| j                  |      }|S rH   r   r~   rD   r   s     r!   rI   zMobileViTSelfOutput.forward  s$    

=1]3r#   rp   rQ   s   @r!   r   r      s8    > >S >T >
U\\ ell r#   r   c                   z     e Zd Zdededdf fdZdee   ddfdZdej                  dej                  fd	Z
 xZS )
MobileViTAttentionr&   rs   r   Nc                     t         |           t        ||      | _        t	        ||      | _        t               | _        y rH   )r9   r:   rr   	attentionr   outputsetpruned_headsr   s      r!   r:   zMobileViTAttention.__init__  s4    /D)&+>Er#   headsc                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )lenr   r   ru   rv   r   r   rz   r{   r   r   r   rw   union)rD   r   indexs      r!   prune_headszMobileViTAttention.prune_heads  s   u:?74>>55t~~7Y7Y[_[l[l
u
  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r#   r   c                 J    | j                  |      }| j                  |      }|S rH   )r   r   )rD   r   self_outputsattention_outputs       r!   rI   zMobileViTAttention.forward   s%    ~~m4;;|4r#   )rJ   rK   rL   r   r   r:   r   r   rN   rO   rI   rP   rQ   s   @r!   r   r     sO    " "S "T ";S ;d ;$ U\\  ell  r#   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTIntermediater&   rs   intermediate_sizer   Nc                     t         |           t        j                  ||      | _        t        |j                  t              rt        |j                     | _	        y |j                  | _	        y rH   )
r9   r:   r   rx   r   r@   rC   rA   r
   intermediate_act_fnrD   r&   rs   r   rE   s       r!   r:   zMobileViTIntermediate.__init__'  sR    YY{,=>
f''-'-f.?.?'@D$'-'8'8D$r#   r   c                 J    | j                  |      }| j                  |      }|S rH   )r   r   r   s     r!   rI   zMobileViTIntermediate.forward/  s&    

=100?r#   rp   rQ   s   @r!   r   r   &  sA    9 9S 9UX 9]a 9U\\ ell r#   r   c                        e Zd Zdedededdf fdZdej                  dej                  dej                  fd	Z xZ	S )
MobileViTOutputr&   rs   r   r   Nc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rH   r   r   s       r!   r:   zMobileViTOutput.__init__6  s7    YY0+>
zz&"<"<=r#   r   input_tensorc                 T    | j                  |      }| j                  |      }||z   }|S rH   r   )rD   r   r   s      r!   rI   zMobileViTOutput.forward;  s.    

=1]3%4r#   rp   rQ   s   @r!   r   r   5  sO    > >S >UX >]a >
U\\  RWR^R^ r#   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTTransformerLayerr&   rs   r   r   Nc                 $   t         |           t        ||      | _        t	        |||      | _        t        |||      | _        t        j                  ||j                        | _        t        j                  ||j                        | _        y )Nr5   )r9   r:   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r!   r:   z"MobileViTTransformerLayer.__init__C  sq    +FK@1&+GXY%fk;LM "[f>S>S T!||KV=R=RSr#   r   c                     | j                  | j                  |            }||z   }| j                  |      }| j                  |      }| j	                  ||      }|S rH   )r   r   r   r   r   )rD   r   r   layer_outputs       r!   rI   z!MobileViTTransformerLayer.forwardK  s\    >>$*?*?*NO(=8++M:((6{{<?r#   rp   rQ   s   @r!   r   r   B  sF    T TS TUX T]a TU\\ ell r#   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTTransformerr&   rs   rf   r   Nc           	          t         |           t        j                         | _        t        |      D ]A  }t        ||t        ||j                  z              }| j                  j                  |       C y )N)rs   r   )
r9   r:   r   rh   ri   rj   r   r   	mlp_ratiork   )rD   r&   rs   rf   r   transformer_layerrE   s         r!   r:   zMobileViTTransformer.__init__V  sh    ]]_
z" 	1A 9'"%kF4D4D&D"E!
 JJ/0	1r#   r   c                 8    | j                   D ]
  } ||      } |S rH   rn   )rD   r   ro   s      r!   rI   zMobileViTTransformer.forwardb  s%     JJ 	8L(7M	8r#   rp   rQ   s   @r!   r   r   U  s@    
1 
1S 
1c 
1VZ 
1U\\ ell r#   r   c                        e Zd ZdZ	 ddededededededed	d
f fdZdej                  d	e	ej                  e
f   fdZdej                  de
d	ej                  fdZdej                  d	ej                  fdZ xZS )MobileViTLayerzC
    MobileViT block: https://huggingface.co/papers/2110.02178
    r&   r'   r(   r*   rs   rf   r-   r   Nc                    t         |           |j                  | _        |j                  | _        |dk(  r)t        ||||dk(  r|nd|dkD  r|dz  nd      | _        |}nd | _        t        ||||j                        | _	        t        |||ddd      | _
        t        |||      | _        t        j                  ||j                        | _        t        |||d      | _        t        |d|z  ||j                        | _        y )	Nr   r   )r'   r(   r*   r-   rW   F)r'   r(   r)   r.   r/   )rs   rf   r   )r9   r:   
patch_sizepatch_widthpatch_heightrS   downsampling_layerr%   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	rD   r&   r'   r(   r*   rs   rf   r-   rE   s	           r!   r:   zMobileViTLayer.__init__m  s    	!,,"--Q;&?')!)QvA*2Q,QA'D# 'K&*D#*#$//	
 +#$# 
 0#!
 kv7L7LM1+ST 
 )KkW]WnWn
r#   rF   c                 |   | j                   | j                  }}t        ||z        }|j                  \  }}}}t        j
                  j                         r$t        t	        j                  ||z        |z        n#t        t        j                  ||z        |z        }	t        j
                  j                         r$t        t	        j                  ||z        |z        n#t        t        j                  ||z        |z        }
d}|
|k7  s|	|k7  r't        j                  j                  ||	|
fdd      }d}|
|z  }|	|z  }||z  }|j                  ||z  |z  |||      }|j                  dd      }|j                  ||||      }|j                  dd      }|j                  ||z  |d      }||f||||||d	}||fS )
NFbilinearr   modealign_cornersTr   r   r	   r   )	orig_sizer   channelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r   r   rN   jit
is_tracingr   ceilr   r   r   r   reshaper   )rD   rF   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dicts                    r!   	unfoldingzMobileViTLayer.unfolding  s   $($4$4d6G6G\|34
8@5
Hk: yy##% ejj|!;<|KLTYY{\9:\IJ 	 yy##% ejjk!9:[HITYYzK78;FG 	 
"jK&?}}00
I6ZW\ 1 H K ${2%5&8 ""!$44lOU`
 ##Aq)//*hZP##Aq)//*z"9;K &z2$ &&!0"2
	 	!!r#   r   r   c                    | j                   | j                  }}t        ||z        }|d   }|d   }|d   }|d   }	|d   }
|j                         j	                  |||d      }|j                  dd      }|j                  ||z  |	z  |
||      }|j                  dd	      }|j                  |||	|z  |
|z        }|d
   r&t        j                  j                  ||d   dd      }|S )Nr   r   r   r   r   r   r   r	   r   r   r   r   Fr   )
r   r   r   r   r   r   r   r   r   r   )rD   r   r   r   r   r   r   r   r   r   r   rF   s               r!   foldingzMobileViTLayer.folding  s&   $($4$4d6G6G\|34
|,
Z(.$%9:#$78 %%',,Z[RTU%%a+##!$44o|U`
 %%a+##"2\"A?U`C`
 ]#}}00y5JV[ 1 H r#   c                    | j                   r| j                  |      }|}| j                  |      }| j                  |      }| j                  |      \  }}| j	                  |      }| j                  |      }| j                  ||      }| j                  |      }| j                  t        j                  ||fd            }|S Nr   r   )r   r   r   r   r   r   r   r   r   rN   cat)rD   rF   ra   r   r   s        r!   rI   zMobileViTLayer.forward  s    ""..x8H ==*==* "^^H5 ""7+..) <<3''1;;uyy(H)=1EFr#   rb   )rJ   rK   rL   rc   r   r   r:   rN   rO   tupledictr   r   rI   rP   rQ   s   @r!   r   r   h  s     8
8
 8
 	8

 8
 8
 8
 8
 
8
t1"%,, 1"5t9K3L 1"fu||   :  r#   r   c                   d     e Zd Zdeddf fdZ	 	 d	dej                  dededee	e
f   fdZ xZS )
MobileViTEncoderr&   r   Nc           	         t         
|           || _        t        j                         | _        d| _        dx}}|j                  dk(  rd}d}n|j                  dk(  rd}d}t        ||j                  d   |j                  d   dd      }| j
                  j                  |       t        ||j                  d   |j                  d   dd	      }| j
                  j                  |       t        ||j                  d   |j                  d	   d|j                  d   d
      }| j
                  j                  |       |r|dz  }t        ||j                  d	   |j                  d   d|j                  d   d|      }| j
                  j                  |       |r|dz  }t        ||j                  d   |j                  d   d|j                  d   d	|      }	| j
                  j                  |	       y )NFrU   T   r   r   )r'   r(   r*   rf   r   r	   )r'   r(   r*   rs   rf      )r'   r(   r*   rs   rf   r-      )r9   r:   r&   r   rh   ri   gradient_checkpointingoutput_stridere   neck_hidden_sizesrk   r   hidden_sizes)rD   r&   dilate_layer_4dilate_layer_5r-   layer_1layer_2layer_3layer_4layer_5rE   s             r!   r:   zMobileViTEncoder.__init__  s   ]]_
&+# +0/1$!N!N!!R'!N)00311!4
 	

'")00311!4
 	

'" 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"r#   r   output_hidden_statesreturn_dictc                     |rdnd }t        | j                        D ]  \  }} ||      }|s||fz   } |st        d ||fD              S t        ||      S )N c              3   &   K   | ]	  }||  y wrH   r  ).0vs     r!   	<genexpr>z+MobileViTEncoder.forward.<locals>.<genexpr>k  s     Xq!-Xs   )last_hidden_stater   )	enumerateri   r   r   )rD   r   r  r  all_hidden_statesrl   ro   s          r!   rI   zMobileViTEncoder.forward\  sq     #7BD(4 	IOA|(7M#$58H$H!		I X]4E$FXXX-]noor#   )FT)rJ   rK   rL   r   r:   rN   rO   rM   r   r   r   rI   rP   rQ   s   @r!   r   r     sa    H# H#4 H#Z &+ 	p||p #p 	p
 
u44	5pr#   r   c                   N    e Zd ZU eed<   dZdZdZdgZde	j                  ddfd	Zy)
MobileViTPreTrainedModelr&   	mobilevitpixel_valuesTr   moduler   Nc                    t        |t        j                  t        j                  t        j                  f      rm|j
                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yy)zInitialize the weightsg        )meanstdNg      ?)r@   r   rx   r<   r>   weightdatanormal_r&   initializer_ranger,   zero_r   fill_)rD   r  s     r!   _init_weightsz&MobileViTPreTrainedModel._init_weightsx  s    fryy"))R^^DE MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) .r#   )rJ   rK   rL   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   Moduler)  r  r#   r!   r  r  p  s9    #$O&*#)*
*BII 
*$ 
*r#   r  c                        e Zd Zd
dedef fdZd Ze	 	 	 ddee	j                     dee   dee   deeef   fd	       Z xZS )MobileViTModelr&   expand_outputc                 L   t         |   |       || _        || _        t	        ||j
                  |j                  d   dd      | _        t        |      | _	        | j                  r.t	        ||j                  d   |j                  d   d      | _
        | j                          y	)
aE  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
            1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
        r   r	   r   )r'   r(   r)   r*   r     r   rW   N)r9   r:   r&   r2  r%   num_channelsr  	conv_stemr   encoderconv_1x1_exp	post_init)rD   r&   r2  rE   s      r!   r:   zMobileViTModel.__init__  s     	 *+++11!4
 (/ 2"44Q7#55a8	!D 	r#   c                     |j                         D ]e  \  }}| j                  j                  |   }t        |t              s0|j
                  j                  D ]  }|j                  j                  |        g y)zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr7  ri   r@   r   r   r   r   )rD   heads_to_prunelayer_indexr   mobilevit_layerr   s         r!   _prune_headszMobileViTModel._prune_heads  ss     #1"6"6"8 	CK"ll00=O/>:)8)D)D)J)J C%%//;;EBC	Cr#   r  r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  |||      }| j                  r/| j                  |d         }t        j                  |ddgd      }n|d   }d }|s|||fn|f}||dd  z   S t        |||j                  	      S )
Nz You have to specify pixel_valuesr  r  r   r   r   F)r   keepdimr   )r  pooler_outputr   )r&   r  use_return_dictr;   r6  r7  r2  r8  rN   r!  r   r   )	rD   r  r  r  embedding_outputencoder_outputsr  pooled_outputr   s	            r!   rI   zMobileViTModel.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  $ 1 1/!2D E "JJ'8r2hPUVM / 2 M;H;T'7[lZnFOAB///7/')77
 	
r#   )T)NNN)rJ   rK   rL   r   rM   r:   r?  r   r   rN   rO   r   r   r   rI   rP   rQ   s   @r!   r1  r1    s}     t >C  04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 '
r#   r1  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                        e Zd Zdeddf fdZe	 	 	 	 d
deej                     dee	   deej                     dee	   de
eef   f
d	       Z xZS )MobileViTForImageClassificationr&   r   Nc                 |   t         |   |       |j                  | _        t        |      | _        t        j                  |j                  d      | _        |j                  dkD  r-t        j                  |j                  d   |j                        nt        j                         | _        | j                          y )NT)inplacer   r   )r9   r:   
num_labelsr1  r  r   r|   classifier_dropout_probr~   rx   r  Identity
classifierr9  rD   r&   rE   s     r!   r:   z(MobileViTForImageClassification.__init__  s      ++'/ zz&"@"@$OJPJ[J[^_J_BIIf..r2F4E4EFegepeper 	
 	r#   r  r  labelsr  c                 6   ||n| j                   j                  }| j                  |||      }|r|j                  n|d   }| j	                  | j                  |            }d}|| j                   j                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }	| j                  dk(  r& |	|j                         |j                               }n |	||      }n| j                   j                  dk(  r=t               }	 |	|j                  d| j                        |j                  d            }n,| j                   j                  dk(  rt!               }	 |	||      }|s|f|dd z   }
||f|
z   S |
S t#        |||j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrA  r   
regressionsingle_label_classificationmulti_label_classificationr   r   )losslogitsr   )r&   rD  r  rC  rP  r~   problem_typerM  dtyperN   longr   r   squeezer   r   r   r   r   )rD   r  r  rR  r  outputsrG  rX  rW  loss_fctr   s              r!   rI   z'MobileViTForImageClassification.forward  s    &1%<k$++B]B]..DXfq.r1<--'!*m!<={{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE3!//
 	
r#   NNNN)rJ   rK   rL   r   r:   r   r   rN   rO   rM   r   r   r   rI   rP   rQ   s   @r!   rJ  rJ    s     4   04/3)-&*4
u||,4
 'tn4
 &	4

 d^4
 
u::	;4
 4
r#   rJ  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTASPPPoolingr&   r'   r(   r   Nc           	          t         |           t        j                  d      | _        t        |||dddd      | _        y )Nr   )output_sizeTrelu)r'   r(   r)   r*   r.   r/   )r9   r:   r   AdaptiveAvgPool2dglobal_poolr%   r   )rD   r&   r'   r(   rE   s       r!   r:   zMobileViTASPPPooling.__init__*  sB    //A>*#%"!
r#   rF   c                     |j                   dd  }| j                  |      }| j                  |      }t        j                  j                  ||dd      }|S )Nr   r   Fr   )r   rf  r   r   r   r   )rD   rF   spatial_sizes      r!   rI   zMobileViTASPPPooling.forward9  sS    ~~bc*##H-==*==,,XLzin,or#   rp   rQ   s   @r!   ra  ra  )  sA    
 
S 
PS 
X\ 
  r#   ra  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r&   r   Nc                 ~   t         |           |j                  d   }|j                  }t	        |j
                        dk7  rt        d      t        j                         | _	        t        |||dd      }| j                  j                  |       | j                  j                  |j
                  D cg c]  }t        |||d|d       c}       t        |||      }| j                  j                  |       t        |d|z  |dd      | _        t        j                  |j                   	      | _        y c c}w )
Nr   r	   z"Expected 3 values for atrous_ratesr   rd  rX   )r'   r(   r)   r-   r/   r  )p)r9   r:   r  aspp_out_channelsr   atrous_ratesr;   r   rh   convsr%   rk   extendra  projectr|   aspp_dropout_probr~   )rD   r&   r'   r(   in_projectionrate
pool_layerrE   s          r!   r:   zMobileViTASPP.__init__F  s(   ..r2//v""#q(ABB]]_
*#%!
 	

-(

 #//
  # +!- !!#)
	
 *&+|L


*%)L 0|YZkq
 zzF$<$<=)
s   5D:rF   c                     g }| j                   D ]  }|j                   ||              t        j                  |d      }| j	                  |      }| j                  |      }|S r   )ro  rk   rN   r   rq  r~   )rD   rF   pyramidconvpooled_featuress        r!   rI   zMobileViTASPP.forwardq  s\    JJ 	+DNN4>*	+))G+,,w/,,7r#   
rJ   rK   rL   rc   r   r:   rN   rO   rI   rP   rQ   s   @r!   rj  rj  A  s7    )> )>4 )>V  r#   rj  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTDeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r&   r   Nc           	          t         |           t        |      | _        t	        j
                  |j                        | _        t        ||j                  |j                  dddd      | _        y )Nr   FT)r'   r(   r)   r.   r/   r,   )r9   r:   rj  asppr   	Dropout2drN  r~   r%   rm  rM  rP  rQ  s     r!   r:   zMobileViTDeepLabV3.__init__  s]    !&)	||F$B$BC,00**# 
r#   r   c                 r    | j                  |d         }| j                  |      }| j                  |      }|S )Nr   )r~  r~   rP  )rD   r   rF   s      r!   rI   zMobileViTDeepLabV3.forward  s6    99]2./<<)??8,r#   rz  rQ   s   @r!   r|  r|  |  s6    
 
4 
 U\\ ell r#   r|  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZe	 	 	 	 d
deej                     deej                     dee	   dee	   de
eef   f
d	       Z xZS ) MobileViTForSemanticSegmentationr&   r   Nc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y )NF)r2  )r9   r:   rM  r1  r  r|  segmentation_headr9  rQ  s     r!   r:   z)MobileViTForSemanticSegmentation.__init__  sD      ++'eD!3F!; 	r#   r  rR  r  r  c                 h   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |d|      }|r|j                  n|d   }| j                  |      }d}|Yt        j                  j                  ||j                  dd dd	      }	t        | j                   j                  
      }
 |
|	|      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t        |||r|j                  d      S dd      S )a{  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTrA  r   r   Fr   )ignore_indexr   )rW  rX  r   
attentions)r&   r  rD  rM  r;   r  r   r  r   r   r   r   r   semantic_loss_ignore_indexr   )rD   r  rR  r  r  r]  encoder_hidden_statesrX  rW  upsampled_logitsr^  r   s               r!   rI   z(MobileViTForSemanticSegmentation.forward  sq   H %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO..!%# ! 
 :E 5 5'RS*''(=>!}}88V\\"#.Zu  9   (T[[5[5[\H,f5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r#   r_  )rJ   rK   rL   r   r:   r   r   rN   rO   rM   r   r   r   rI   rP   rQ   s   @r!   r  r    s     4   04)-/3&*I
u||,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
r#   r  )rJ  r  r1  r  )rU   N):rc   r   typingr   r   rN   torch.utils.checkpointr   torch.nnr   r   r   activationsr
   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   configuration_mobilevitr   
get_loggerrJ   loggerr   r"   r/  r%   rS   re   rr   r   r   r   r   r   r   r   r   r  r1  rJ  ra  rj  r|  r  __all__r  r#   r!   <module>r     s  "   "    A A ! 9  . Q 7 7 4 
		H	%#  HSM UX = =@-F		 -F`bii .6RYY 6r	")) 	   >BII 
bii 
		 &299 &f/ fR\pryy \p~ * * *( R
- R
 R
j E
&> E
E
P299 08BII 8v 8 
U
'? U

U
pr#   