
    rh              	          d Z ddlmZmZ ddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZ  ej6                  e      Zd<dededee   defdZ e d       e d      fde de de de fdZ! G d dejD                        Z# G d dejD                        Z$ G d dejD                        Z% G d dejD                        Z& G d  d!ejD                        Z' G d" d#ejD                        Z( G d$ d%ejD                        Z) G d& d'e      Z* G d( d)ejD                        Z+e G d* d+e             Z,e G d, d-e,             Z- ed./       G d0 d1e,             Z. G d2 d3ejD                        Z/ G d4 d5ejD                        Z0 G d6 d7ejD                        Z1 ed8/       G d9 d:e,             Z2g d;Z3y)=zPyTorch MobileViTV2 model.    )OptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)auto_docstringlogging   )MobileViTV2Configvaluedivisor	min_valuereturnc                 |    ||}t        |t        | |dz  z         |z  |z        }|d| z  k  r||z  }t        |      S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
       g?)maxint)r   r   r   	new_values       /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.pymake_divisibler   +   sS     	Is57Q;#677BWLMI3;W	y>    z-infinfmin_valmax_valc                 .    t        |t        ||             S N)r   minr   r"   r#   s      r   clipr(   :   s    wGU+,,r    c                        e Zd Z	 	 	 	 	 	 ddedededededededed	ed
eeef   ddf fdZde	j                  de	j                  fdZ xZS )MobileViTV2ConvLayerconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 $   t         |           t        |dz
  dz        |z  }||z  dk7  rt        d| d| d      ||z  dk7  rt        d| d| d      t	        j
                  ||||||||d		      | _        |	r t	        j                  |d
ddd      | _        nd | _        |
rdt        |
t              rt        |
   | _        y t        |j                  t              rt        |j                     | _        y |j                  | _        y d | _        y )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r,   r-   r.   r/   paddingr2   r0   r1   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr
   
activation
hidden_act)selfr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r7   	__class__s               r   r?   zMobileViTV2ConvLayer.__init__@   s*    	{Q!+,x71$/}<STZS[[cdee& A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#."("8F--s3"():):";"("3"3"DOr    featuresc                     | j                  |      }| j                  | j                  |      }| j                  | j                  |      }|S r%   )rB   rD   rG   )rI   rK   s     r   forwardzMobileViTV2ConvLayer.forwardv   sK    ##H-)))(3H??&x0Hr    )r   r   Fr   TT)__name__
__module____qualname__r   r   boolr   rF   r?   torchTensorrM   __classcell__rJ   s   @r   r*   r*   ?   s     "&+/4#!4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4#l  r    r*   c                   x     e Zd ZdZ	 ddedededededdf fd	Zd
ej                  dej                  fdZ	 xZ
S )MobileViTV2InvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r+   r,   r-   r/   r2   r   Nc           	      @   t         |           t        t        t	        ||j
                  z              d      }|dvrt        d| d      |dk(  xr ||k(  | _        t        |||d      | _	        t        |||d|||      | _
        t        |||dd	
      | _        y )N   )r   r   zInvalid stride .r   )r,   r-   r.   r	   )r,   r-   r.   r/   r0   r2   Fr,   r-   r.   r4   )r>   r?   r   r   roundexpand_ratior@   use_residualr*   
expand_1x1conv_3x3
reduce_1x1)rI   r+   r,   r-   r/   r2   expanded_channelsrJ   s          r   r?   z$MobileViTV2InvertedResidual.__init__   s     	*3u[6CVCV5V/W+XZ[\vha899#q[K{l/J.:KYZ
 -)*$
 /)% 
r    rK   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  r||z   S |S r%   )r_   r`   ra   r^   )rI   rK   residuals      r   rM   z#MobileViTV2InvertedResidual.forward   sI    ??8,==*??8,&*&7&7x("EXEr    )r   rN   rO   rP   __doc__r   r   r?   rR   rS   rM   rT   rU   s   @r   rW   rW      sc    
 lm
'
69
IL
VY
eh
	
BF F Fr    rW   c                   t     e Zd Z	 ddedededededdf fdZd	ej                  dej                  fd
Z xZ	S )MobileViTV2MobileNetLayerr+   r,   r-   r/   
num_stagesr   Nc                     t         |           t        j                         | _        t        |      D ]5  }t        ||||dk(  r|nd      }| j                  j                  |       |}7 y )Nr   r   )r,   r-   r/   )r>   r?   r   
ModuleListlayerrangerW   append)	rI   r+   r,   r-   r/   ri   irl   rJ   s	           r   r?   z"MobileViTV2MobileNetLayer.__init__   sh     	]]_
z" 	'A/')!"avQ	E JJe$&K	'r    rK   c                 8    | j                   D ]
  } ||      } |S r%   rl   )rI   rK   layer_modules      r   rM   z!MobileViTV2MobileNetLayer.forward   s$     JJ 	.L#H-H	.r    )r   r   
rN   rO   rP   r   r   r?   rR   rS   rM   rT   rU   s   @r   rh   rh      sV    qr'''69'IL'VY'kn'	'   r    rh   c                   h     e Zd ZdZdededdf fdZdej                  dej                  fdZ	 xZ
S )	MobileViTV2LinearSelfAttentionay  
    This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper:
    https://huggingface.co/papers/2206.02680

    Args:
        config (`MobileVitv2Config`):
             Model configuration object
        embed_dim (`int`):
            `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)`
    r+   	embed_dimr   Nc           	          t         |           t        ||dd|z  z   dddd      | _        t	        j
                  |j                        | _        t        |||dddd      | _        || _        y )Nr   r   TF)r+   r,   r-   r1   r.   r3   r4   p)	r>   r?   r*   qkv_projr   Dropoutattn_dropoutout_projrv   )rI   r+   rv   rJ   s      r   r?   z'MobileViTV2LinearSelfAttention.__init__   s{    ,!a)m,# 
 JJ)<)<=,!"# 
 #r    hidden_statesc                    | j                  |      }t        j                  |d| j                  | j                  gd      \  }}}t        j                  j
                  j                  |d      }| j                  |      }||z  }t        j                  |dd      }t        j                  j
                  j                  |      |j                  |      z  }| j                  |      }|S )Nr   )split_size_or_sectionsdimr   Tr   keepdim)rz   rR   splitrv   r   
functionalsoftmaxr|   sumrelu	expand_asr}   )	rI   r~   qkvquerykeyr   context_scorescontext_vectorouts	            r   rM   z&MobileViTV2LinearSelfAttention.forward   s    mmM*
 "KKQX\XfXfDgmnosE ,,44U4C**>: ~->r4H hh!!&&u-0H0H0OOmmC 
r    re   rU   s   @r   ru   ru      s>    	#0 #S #T #2U\\ ell r    ru   c                   p     e Zd Z	 d
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )MobileViTV2FFNr+   rv   ffn_latent_dimffn_dropoutr   Nc           
          t         |           t        |||ddddd      | _        t	        j
                  |      | _        t        |||ddddd      | _        t	        j
                  |      | _        y )Nr   TF)r+   r,   r-   r.   r/   r1   r3   r4   )	r>   r?   r*   conv1r   r{   dropout1conv2dropout2)rI   r+   rv   r   r   rJ   s        r   r?   zMobileViTV2FFN.__init__  s|     	)!'#	

 

;/)&"# 	

 

;/r    r~   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r%   )r   r   r   r   )rI   r~   s     r   rM   zMobileViTV2FFN.forward(  s@    

=1m4

=1m4r            rN   rO   rP   r   r   floatr?   rR   rS   rM   rT   rU   s   @r   r   r     sY     !0!0 0 	0
 0 
0@U\\ ell r    r   c                   p     e Zd Z	 d
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )MobileViTV2TransformerLayerr+   rv   r   dropoutr   Nc                 P   t         |           t        j                  d||j                        | _        t        ||      | _        t        j                  |      | _	        t        j                  d||j                        | _
        t        ||||j                        | _        y )Nr   
num_groupsnum_channelsr:   rx   )r>   r?   r   	GroupNormlayer_norm_epslayernorm_beforeru   	attentionr{   r   layernorm_afterr   r   ffn)rI   r+   rv   r   r   rJ   s        r   r?   z$MobileViTV2TransformerLayer.__init__1  s~     	 "	W]WlWl m7	J

W-!||qyV\VkVkl!&)^VEWEWXr    r~   c                     | j                  |      }| j                  |      }||z   }| j                  |      }| j                  |      }||z   }|S r%   )r   r   r   r   )rI   r~   layernorm_1_outattention_outputlayer_outputs        r   rM   z#MobileViTV2TransformerLayer.forward?  sY    //>>>/:(=8++M:xx-#m3r    r   r   rU   s   @r   r   r   0  s^     Y!Y Y 	Y
 Y 
Y	U\\ 	ell 	r    r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTV2Transformerr+   n_layersd_modelr   Nc                 <   t         	|           |j                  }||z  g|z  }|D cg c]  }t        |dz  dz         }}t	        j
                         | _        t        |      D ].  }t        ||||         }| j                  j                  |       0 y c c}w )N   )rv   r   )
r>   r?   ffn_multiplierr   r   rk   rl   rm   r   rn   )
rI   r+   r   r   r   ffn_dimsd	block_idxtransformer_layerrJ   s
            r   r?   zMobileViTV2Transformer.__init__L  s    .."W,-8 2::ACbB'::]]_
x 	1I ;'(9:M! JJ/0		1 ;s   Br~   c                 8    | j                   D ]
  } ||      } |S r%   rq   )rI   r~   rr   s      r   rM   zMobileViTV2Transformer.forward]  s%     JJ 	8L(7M	8r    rs   rU   s   @r   r   r   K  sA    10 1C 1# 1RV 1"U\\ ell r    r   c                       e Zd ZdZ	 	 	 ddededededededed	d
f fdZdej                  d	e	ej                  e	eef   f   fdZ
dej                  de	eef   d	ej                  fdZdej                  d	ej                  fdZ xZS )MobileViTV2LayerzE
    MobileViTV2 layer: https://huggingface.co/papers/2206.02680
    r+   r,   r-   attn_unit_dimn_attn_blocksr2   r/   r   Nc                    t         	|           |j                  | _        |j                  | _        |}|dk(  r)t        ||||dk(  r|nd|dkD  r|dz  nd      | _        |}nd | _        t        ||||j                  |      | _	        t        |||ddd      | _
        t        |||      | _        t        j                  d||j                        | _        t        |||dd	d      | _        y )
Nr   r   )r,   r-   r/   r2   )r,   r-   r.   r0   F)r,   r-   r.   r3   r4   )r   r   r   T)r>   r?   
patch_sizepatch_widthpatch_heightrW   downsampling_layerr*   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projection)
rI   r+   r,   r-   r   r   r2   r/   cnn_out_dimrJ   s
            r   r?   zMobileViTV2Layer.__init__h  s    	!,,"--#Q;&A')!)QvA*2Q,QA'D# 'K&*D# -#$//
 -#$# 
 2&-Zgh TZTiTij  4#$"  
r    feature_mapc                 "   |j                   \  }}}}t        j                  j                  || j                  | j
                  f| j                  | j
                  f      }|j                  ||| j                  | j
                  z  d      }|||ffS )N)r.   r/   r   )shaper   r   unfoldr   r   reshape)rI   r   
batch_sizer,   
img_height	img_widthpatchess          r   	unfoldingzMobileViTV2Layer.unfolding  s    9D9J9J6
KY--&&**D,<,<=%%t'7'78 ' 

 //*k4;L;LtO_O_;_acdY///r    r   output_sizec                     |j                   \  }}}}|j                  |||z  |      }t        j                  j	                  ||| j
                  | j                  f| j
                  | j                  f      }|S )N)r   r.   r/   )r   r   r   r   foldr   r   )rI   r   r   r   in_dimr   	n_patchesr   s           r   foldingzMobileViTV2Layer.folding  sz    4;MM1
FJ	//*fz.A9Mmm((#**D,<,<=%%t'7'78	 ) 
 r    rK   c                 6   | j                   r| j                  |      }| j                  |      }| j                  |      }| j                  |      \  }}| j	                  |      }| j                  |      }| j                  ||      }| j                  |      }|S r%   )r   r   r   r   r   r   r   r   )rI   rK   r   r   s       r   rM   zMobileViTV2Layer.forward  s    ""..x8H ==*==*  $~~h7 ""7+..) <<5''1r    )r   r   r   )rN   rO   rP   rf   r   r   r?   rR   rS   tupler   r   rM   rT   rU   s   @r   r   r   c  s     ;
!;
 ;
 	;

 ;
 ;
 ;
 ;
 
;
z	0U\\ 	0eELL%PSUXPX/<Y6Z 	0u|| %S/ ell   r    r   c                   d     e Zd Zdeddf fdZ	 	 d	dej                  dededee	e
f   fdZ xZS )
MobileViTV2Encoderr+   r   Nc           	         t         |           || _        t        j                         | _        d| _        dx}}|j                  dk(  rd}d}n|j                  dk(  rd}d}t        t        d|j                  z  dd      dd	      }t        d|j                  z  d
      }t        d|j                  z  d
      }t        d|j                  z  d
      }t        d|j                  z  d
      }	t        d|j                  z  d
      }
t        |||dd      }| j
                  j                  |       t        |||dd      }| j
                  j                  |       t        |||t        |j                  d   |j                  z  d
      |j                  d         }| j
                  j                  |       |r|dz  }t        |||	t        |j                  d   |j                  z  d
      |j                  d   |      }| j
                  j                  |       |r|dz  }t        ||	|
t        |j                  d   |j                  z  d
      |j                  d   |      }| j
                  j                  |       y )NFrY   Tr   r       @   r'   r   r   r         i     )r,   r-   r/   ri   r   r   )r,   r-   r   r   )r,   r-   r   r   r2   )r>   r?   r+   r   rk   rl   gradient_checkpointingoutput_strider   r(   width_multiplierrh   rn   r   base_attn_unit_dimsr   )rI   r+   dilate_layer_4dilate_layer_5r2   layer_0_dimlayer_1_dimlayer_2_dimlayer_3_dimlayer_4_dimlayer_5_dimlayer_1layer_2layer_3layer_4layer_5rJ   s                   r   r?   zMobileViTV2Encoder.__init__  s|   ]]_
&+# +0/1$!N!N!!R'!N$rF333RLVWce
 %R&*A*A%A2N$S6+B+B%BAN$S6+B+B%BAN$S6+B+B%BAN$S6+B+B%BAN+#$
 	

'"+#$
 	

'""#$()C)CA)FI`I`)`jkl ..q1
 	

'"MH"#$()C)CA)FI`I`)`jkl ..q1
 	

'"MH"#$()C)CA)FI`I`)`jkl ..q1
 	

'"r    r~   output_hidden_statesreturn_dictc                     |rdnd }t        | j                        D ]  \  }} ||      }|s||fz   } |st        d ||fD              S t        ||      S )N c              3   &   K   | ]	  }||  y wr%   r   ).0vs     r   	<genexpr>z-MobileViTV2Encoder.forward.<locals>.<genexpr>6  s     Xq!-Xs   )last_hidden_stater~   )	enumeraterl   r   r   )rI   r~   r   r   all_hidden_statesro   rr   s          r   rM   zMobileViTV2Encoder.forward'  sq     #7BD(4 	IOA|(7M#$58H$H!		I X]4E$FXXX-]noor    )FT)rN   rO   rP   r   r?   rR   rS   rQ   r   r   r   rM   rT   rU   s   @r   r   r     sb    O#0 O#T O#h &+ 	p||p #p 	p
 
u44	5pr    r   c                   N    e Zd ZU eed<   dZdZdZdgZde	j                  ddfd	Zy)
MobileViTV2PreTrainedModelr+   mobilevitv2pixel_valuesTr   moduler   Nc                    t        |t        j                  t        j                  t        j                  f      rm|j
                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yy)zInitialize the weightsr   )meanstdNg      ?)rE   r   LinearrA   rC   weightdatanormal_r+   initializer_ranger1   zero_r   fill_)rI   r	  s     r   _init_weightsz(MobileViTV2PreTrainedModel._init_weightsC  s    fryy"))R^^DE MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) .r    )rN   rO   rP   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   Moduler  r   r    r   r  r  ;  s9    %$O&*#+,
*BII 
*$ 
*r    r  c                        e Zd Zd
dedef fdZd Ze	 	 	 ddee	j                     dee   dee   deeef   fd	       Z xZS )MobileViTV2Modelr+   expand_outputc           	         t         |   |       || _        || _        t	        t        d|j                  z  dd      dd      }t        ||j                  |ddd	d	
      | _	        t        |      | _        | j                          y)a  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model. If `True`, the model will output pooled features in addition to
            hidden states. If `False`, only the hidden states will be returned.
        r   r   r   r'   rY   r   r	   r   Tr,   r-   r.   r/   r3   r4   N)r>   r?   r+   r  r   r(   r   r*   r   	conv_stemr   encoder	post_init)rI   r+   r  r   rJ   s       r   r?   zMobileViTV2Model.__init__R  s     	 *$rF333RLVWce
 .++$"
 *&1 	r    c                     |j                         D ]e  \  }}| j                  j                  |   }t        |t              s0|j
                  j                  D ]  }|j                  j                  |        g y)zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr!  rl   rE   r   r   r   prune_heads)rI   heads_to_prunelayer_indexheadsmobilevitv2_layerr   s         r   _prune_headszMobileViTV2Model._prune_headsn  sv     #1"6"6"8 	CK $ 2 2; ?+-=>):)F)F)L)L C%%//;;EBC	Cr    r  r   r   r   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  |||      }| j                  r |d   }t        j                  |ddgd      }n|d   }d }|s|||fn|f}||dd  z   S t        |||j                  	      S )
Nz You have to specify pixel_valuesr   r   r   r   Fr   r   )r  pooler_outputr~   )r+   r   use_return_dictr@   r   r!  r  rR   r  r   r~   )	rI   r  r   r   embedding_outputencoder_outputsr  pooled_outputoutputs	            r   rM   zMobileViTV2Model.forwardx  s     %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  / 2 "JJ'8r2hPUVM / 2 M;H;T'7[lZnFOAB///7/')77
 	
r    )T)NNN)rN   rO   rP   r   rQ   r?   r*  r   r   rR   rS   r   r   r   rM   rT   rU   s   @r   r  r  P  s~    0  8C  04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 '
r    r  z
    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                        e Zd Zdeddf fdZe	 	 	 	 d
deej                     dee	   deej                     dee	   de
eef   f
d	       Z xZS )!MobileViTV2ForImageClassificationr+   r   Nc                 L   t         |   |       |j                  | _        t        |      | _        t        d|j                  z  d      }|j                  dkD  r!t        j                  ||j                        nt        j                         | _
        | j                          y )Nr   rY   r   r   )in_featuresout_features)r>   r?   
num_labelsr  r  r   r   r   r  Identity
classifierr"  )rI   r+   r-   rJ   s      r   r?   z*MobileViTV2ForImageClassification.__init__  s      +++F3%cF,C,C&CQO   1$ II,V=N=NO 	 	r    r  r   labelsr   c                    ||n| j                   j                  }| j                  |||      }|r|j                  n|d   }| j	                  |      }d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }	| j                  dk(  r& |	|j                         |j                               }n |	||      }n| j                   j
                  dk(  r=t               }	 |	|j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               }	 |	||      }|s|f|dd z   }
||f|
z   S |
S t!        |||j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr,  r   
regressionsingle_label_classificationmulti_label_classificationr   r   )losslogitsr~   )r+   r/  r  r.  r<  problem_typer:  dtyperR   longr   r   squeezer   viewr   r   r~   )rI   r  r   r=  r   outputsr2  rC  rB  loss_fctr3  s              r   rM   z)MobileViTV2ForImageClassification.forward  s    &1%<k$++B]B]""<FZhs"t1<--'!*/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE3!//
 	
r    NNNN)rN   rO   rP   r   r?   r   r   rR   rS   rQ   r   r   r   rM   rT   rU   s   @r   r6  r6    s    0 T "  04/3)-&*4
u||,4
 'tn4
 &	4

 d^4
 
u::	;4
 4
r    r6  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTV2ASPPPoolingr+   r,   r-   r   Nc           	          t         |           t        j                  d      | _        t        |||dddd      | _        y )Nr   )r   Tr   r  )r>   r?   r   AdaptiveAvgPool2dglobal_poolr*   r   )rI   r+   r,   r-   rJ   s       r   r?   zMobileViTV2ASPPPooling.__init__  sB    //A>,#%"!
r    rK   c                     |j                   dd  }| j                  |      }| j                  |      }t        j                  j                  ||dd      }|S )Nr-  bilinearFsizemodealign_corners)r   rP  r   r   r   interpolate)rI   rK   spatial_sizes      r   rM   zMobileViTV2ASPPPooling.forward  sS    ~~bc*##H-==*==,,XLzin,or    rs   rU   s   @r   rM  rM    sB    
0 
s 
RU 
Z^ 
  r    rM  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTV2ASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r+   r   Nc                    t         |           t        d|j                  z  d      }|}|j                  }t        |j                        dk7  rt        d      t        j                         | _
        t        |||dd      }| j                  j                  |       | j                  j                  |j                  D cg c]  }t        |||d|d	       c}       t        |||      }| j                  j                  |       t        |d
|z  |dd      | _        t        j                   |j"                        | _        y c c}w )Nr   rY   r   r	   z"Expected 3 values for atrous_ratesr   r   r[   )r,   r-   r.   r2   r4      rx   )r>   r?   r   r   aspp_out_channelslenatrous_ratesr@   r   rk   convsr*   rn   extendrM  projectr{   aspp_dropout_probr   )	rI   r+   encoder_out_channelsr,   r-   in_projectionrate
pool_layerrJ   s	           r   r?   zMobileViTV2ASPP.__init__  s6   -cF4K4K.KUVW*//v""#q(ABB]]_
,#%!
 	

-(

 #//
  % +!- !!#)
	
 ,FKN


*%+L 0|YZkq
 zzF$<$<=)
s   ErK   c                     g }| j                   D ]  }|j                   ||              t        j                  |d      }| j	                  |      }| j                  |      }|S )Nr   r   )r`  rn   rR   catrb  r   )rI   rK   pyramidconvpooled_featuress        r   rM   zMobileViTV2ASPP.forward=  s\    JJ 	+DNN4>*	+))G+,,w/,,7r    
rN   rO   rP   rf   r   r?   rR   rS   rM   rT   rU   s   @r   rZ  rZ    s8    *>0 *>T *>X  r    rZ  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTV2DeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r+   r   Nc           	          t         |           t        |      | _        t	        j
                  |j                        | _        t        ||j                  |j                  dddd      | _        y )Nr   FT)r,   r-   r.   r3   r4   r1   )r>   r?   rZ  asppr   	Dropout2dclassifier_dropout_probr   r*   r]  r:  r<  rI   r+   rJ   s     r   r?   zMobileViTV2DeepLabV3.__init__N  s]    #F+	||F$B$BC.00**# 
r    r~   c                 r    | j                  |d         }| j                  |      }| j                  |      }|S )Nr   )rq  r   r<  )rI   r~   rK   s      r   rM   zMobileViTV2DeepLabV3.forward^  s6    99]2./<<)??8,r    rm  rU   s   @r   ro  ro  I  s7    
0 
T 
 U\\ ell r    ro  zZ
    MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZe	 	 	 	 d
deej                     deej                     dee	   dee	   de
eef   f
d	       Z xZS )"MobileViTV2ForSemanticSegmentationr+   r   Nc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y )NF)r  )r>   r?   r:  r  r  ro  segmentation_headr"  rt  s     r   r?   z+MobileViTV2ForSemanticSegmentation.__init__k  sE      +++F%H!5f!= 	r    r  r=  r   r   c                 h   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |d|      }|r|j                  n|d   }| j                  |      }d}|Yt        j                  j                  ||j                  dd dd	      }	t        | j                   j                  
      }
 |
|	|      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t        |||r|j                  d      S dd      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
        >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr,  r-  rR  FrS  )ignore_indexr   )rB  rC  r~   
attentions)r+   r   r/  r:  r@   r  r~   ry  r   r   rW  r   r   semantic_loss_ignore_indexr   )rI   r  r=  r   r   rI  encoder_hidden_statesrC  rB  upsampled_logitsrJ  r3  s               r   rM   z*MobileViTV2ForSemanticSegmentation.forwardu  ss   H %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO""!%# # 
 :E 5 5'RS*''(=>!}}88V\\"#.Zu  9   (T[[5[5[\H,f5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r    rK  )rN   rO   rP   r   r?   r   r   rR   rS   rQ   r   r   r   rM   rT   rU   s   @r   rw  rw  e  s    0 T   04)-/3&*I
u||,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
r    rw  )r6  rw  r  r  )rY   N)4rf   typingr   r   rR   torch.utils.checkpointr   torch.nnr   r   r   activationsr
   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   configuration_mobilevitv2r   
get_loggerrN   loggerr   r   r   r(   r  r*   rW   rh   ru   r   r   r   r   r   r  r  r6  rM  rZ  ro  rw  __all__r   r    r   <module>r     s  " ! "    A A ! 9  . , 8 
		H	%#  HSM UX  ).fe - - - -Y^ -
=299 =B-F")) -Fb		 .<RYY <~&RYY &R")) 6RYY 0o1 odcp cpL * * *( O
1 O
 O
d G
(B G
G
VRYY 09bii 9z299 8 
U
)C U

U
pr    