
    rh                       d Z ddlmZ ddlZddlmZ ddlZddl	m
Z
 ddlmZmZmZmZmZmZmZ ddlmZmZ dd	lmZmZmZmZmZ d
dlmZ  ej<                  e      Z dZ!e G d de             Z" G d dejF                  jH                        Z% G d dejF                  jH                        Z& G d dejF                  jH                        Z' G d dejF                  jH                        Z( G d dejF                  jH                        Z) G d dejF                  jH                        Z* G d dejF                  jH                        Z+ G d dejF                  jH                        Z, G d d ejF                  jH                        Z- G d! d"ejF                  jH                        Z. G d# d$ejF                  jH                        Z/ G d% d&ejF                  jH                        Z0 G d' d(ejF                  jH                        Z1 G d) d*ejF                  jH                        Z2e G d+ d,ejF                  jH                               Z3 G d- d.e      Z4d/Z5d0Z6 ed1e5       G d2 d3e4             Z7 ed4e5       G d5 d6e4e             Z8g d7Z9y)8zTF 2.0 Cvt model.    )annotationsN)	dataclass   )&TFImageClassifierOutputWithNoAttention)TFModelInputTypeTFPreTrainedModelTFSequenceClassificationLossget_initializerkeraskeras_serializableunpack_inputs)
shape_liststable_softmax)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )	CvtConfigr   c                  <    e Zd ZU dZdZded<   dZded<   dZded<   y)TFBaseModelOutputWithCLSTokena2  
    Base class for model's outputs.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
            Classification token at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
            the initial embedding outputs.
    Ntf.Tensor | Nonelast_hidden_statecls_token_valueztuple[tf.Tensor, ...] | Nonehidden_states)__name__
__module____qualname____doc__r   __annotations__r   r        z/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_tf_cvt.pyr   r   3   s+     +/'.(,O%,26M/6r#   r   c                  .     e Zd ZdZd fdZdddZ xZS )TFCvtDropPathzDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    References:
        (1) github.com:rwightman/pytorch-image-models
    c                2    t        |   di | || _        y )Nr"   )super__init__	drop_prob)selfr*   kwargs	__class__s      r$   r)   zTFCvtDropPath.__init__N   s    "6""r#   c                \   | j                   dk(  s|s|S d| j                   z
  }t        j                  |      d   fdt        t        j                  |            dz
  z  z   }|t        j                  j                  |dd| j                        z   }t        j                  |      }||z  |z  S )N        r   r   )r   )dtype)r*   tfshapelenrandomuniformcompute_dtypefloor)r+   xtraining	keep_probr2   random_tensors         r$   callzTFCvtDropPath.callR   s    >>S H&	!Q!DC,<q,@$AA!BII$5$5eQI[I[$5$\\/I..r#   )r*   floatN)r8   	tf.Tensor)r   r   r   r    r)   r<   __classcell__r-   s   @r$   r&   r&   H   s    
#/ /r#   r&   c                  R     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZdddZddZ xZS )	TFCvtEmbeddingsz-Construct the Convolutional Token Embeddings.c           	         t        	|   di | t        ||||||d      | _        t        j
                  j                  |      | _        y )Nconvolution_embeddings)
patch_sizenum_channels	embed_dimstridepaddingnamer"   )r(   r)   TFCvtConvEmbeddingsrE   r   layersDropoutdropout)
r+   configrF   rG   rH   rI   rJ   dropout_rater,   r-   s
            r$   r)   zTFCvtEmbeddings.__init___   sO     	"6"&9!%)'
# ||++L9r#   c                N    | j                  |      }| j                  ||      }|S Nr9   )rE   rO   )r+   pixel_valuesr9   hidden_states       r$   r<   zTFCvtEmbeddings.callv   s*    22<@||L8|Dr#   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTrE   )builtgetattrr1   
name_scoperE   rK   buildr+   input_shapes     r$   r[   zTFCvtEmbeddings.build{   o    ::
4148Dt::??@ 8++11$78 8 E8 8   A11A:)rP   r   rF   intrG   r`   rH   r`   rI   r`   rJ   r`   rQ   r=   F)rU   r?   r9   boolreturnr?   r>   r   r   r   r    r)   r<   r[   r@   rA   s   @r$   rC   rC   \   sY    7:: : 	:
 : : : :.
8r#   rC   c                  L     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZddZ xZS )rL   zcImage to Convolution Embeddings. This convolutional operation aims to model local spatial contexts.c           
        t        |   d	i | t        j                  j	                  |      | _        t        |t        j                  j                        r|n||f| _
        t        j                  j                  |||ddt        |j                        d      | _        t        j                  j                  dd      | _        || _        || _        y )
NrJ   validchannels_last
projection)filterskernel_sizestridesrJ   data_formatkernel_initializerrK   h㈵>normalizationepsilonrK   r"   )r(   r)   r   rM   ZeroPadding2DrJ   
isinstancecollectionsabcIterablerF   Conv2Dr
   initializer_rangerj   LayerNormalizationrq   rG   rH   )	r+   rP   rF   rG   rH   rI   rJ   r,   r-   s	           r$   r)   zTFCvtConvEmbeddings.__init__   s     	"6"||11'1B(2:{?W?W(X*_iku^v,,--"'.v/G/GH . 
 #\\<<TP_<`("r#   c                &   t        |t              r|d   }| j                  | j                  |            }t	        |      \  }}}}||z  }t        j                  ||||f      }| j                  |      }t        j                  |||||f      }|S )NrU   r2   )ru   dictrj   rJ   r   r1   reshaperq   )r+   rU   
batch_sizeheightwidthrG   hidden_sizes          r$   r<   zTFCvtConvEmbeddings.call   s    lD)'7Lt||L'AB 3=\2J/
FE<unzz,z;P\6]^)),7 zz,z65R^6_`r#   c                   | j                   ry d| _         t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d d | j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   rxY w# 1 sw Y   y xY w)NTrj   rq   )
rX   rY   r1   rZ   rj   rK   r[   rG   rq   rH   r\   s     r$   r[   zTFCvtConvEmbeddings.build   s    ::
4t,8t334 M%%tT49J9J&KLM4$/;t11667 G""(($dnn)EFG G <M MG Gs   *C&3)C2&C/2C;)rP   r   rF   r`   rG   r`   rH   r`   rI   r`   rJ   r`   )rU   r?   rc   r?   r>   rd   rA   s   @r$   rL   rL      sP    m## # 	#
 # # #6 	Gr#   rL   c                  6     e Zd ZdZd fdZdddZddZ xZS )	 TFCvtSelfAttentionConvProjectionzConvolutional projection layer.c           
     H   t        |   d
i | t        j                  j	                  |      | _        t        j                  j                  ||t        |j                        d|dd|      | _	        t        j                  j                  ddd	      | _        || _        y )Nrg   rh   Fconvolution)rk   rl   ro   rJ   rm   use_biasrK   groupsrp   g?rq   )rs   momentumrK   r"   )r(   r)   r   rM   rt   rJ   ry   r
   rz   r   BatchNormalizationrq   rH   )r+   rP   rH   rl   rI   rJ   r,   r-   s          r$   r)   z)TFCvtSelfAttentionConvProjection.__init__   s    "6"||11'1B <<..#.v/G/GH / 	
 #\\<<TTW^m<n"r#   c                l    | j                  | j                  |            }| j                  ||      }|S rS   )r   rJ   rq   r+   rV   r9   s      r$   r<   z%TFCvtSelfAttentionConvProjection.call   s6    ''\(BC)),)Jr#   c                   | j                   ry d| _         t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d d | j                  g       d d d        t        | dd       \t        j                  | j                  j
                        5  | j                  j                  d d d | j                  g       d d d        y y # 1 sw Y   sxY w# 1 sw Y   y xY w)NTr   rq   )	rX   rY   r1   rZ   r   rK   r[   rH   rq   r\   s     r$   r[   z&TFCvtSelfAttentionConvProjection.build   s    ::
4-9t//445 K  &&dD$..'IJK4$/;t11667 M""(($dDNN)KLM M <K KM Ms   *C'3*C3'C03C<)
rP   r   rH   r`   rl   r`   rI   r`   rJ   r`   ra   rV   r?   r9   rb   rc   r?   r>   rd   rA   s   @r$   r   r      s    )#"
	Mr#   r   c                      e Zd ZdZddZy)"TFCvtSelfAttentionLinearProjectionz7Linear projection layer used to flatten tokens into 1D.c                d    t        |      \  }}}}||z  }t        j                  ||||f      }|S )Nr}   )r   r1   r   )r+   rV   r   r   r   rG   r   s          r$   r<   z'TFCvtSelfAttentionLinearProjection.call   s<    2<\2J/
FE<unzz,z;P\6]^r#   NrV   r?   rc   r?   )r   r   r   r    r<   r"   r#   r$   r   r      s
    Ar#   r   c                  P     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 d fdZdddZd	dZ xZS )
TFCvtSelfAttentionProjectionz'Convolutional Projection for Attention.c                x    t        |   di | |dk(  rt        |||||d      | _        t	               | _        y )Ndw_bnconvolution_projectionrK   r"   )r(   r)   r   r   r   linear_projection)	r+   rP   rH   rl   rI   rJ   projection_methodr,   r-   s	           r$   r)   z%TFCvtSelfAttentionProjection.__init__   sF     	"6"'*J	;F^+D' "D!Er#   c                N    | j                  ||      }| j                  |      }|S rS   )r   r   r   s      r$   r<   z!TFCvtSelfAttentionProjection.call  s-    22<(2S--l;r#   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTr   )rX   rY   r1   rZ   r   rK   r[   r\   s     r$   r[   z"TFCvtSelfAttentionProjection.build  r^   r_   )r   )rP   r   rH   r`   rl   r`   rI   r`   rJ   r`   r   strra   r   r>   rd   rA   s   @r$   r   r      s[    1 ")FF F 	F
 F F F"
8r#   r   c                  p     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZd	d
dZddZ xZS )TFCvtSelfAttentionz
    Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for
    query, key, and value embeddings.
    c           	        t        |   di | |dz  | _        || _        || _        || _        t        ||||||	dk(  rdn|	d      | _        t        ||||||	d      | _        t        ||||||	d      | _	        t        j                  j                  |t        |j                        |
dd	
      | _        t        j                  j                  |t        |j                        |
dd
      | _        t        j                  j                  |t        |j                        |
dd
      | _        t        j                  j%                  |      | _        y )Ng      avglinearconvolution_projection_query)r   rK   convolution_projection_keyconvolution_projection_valuezerosprojection_queryunitsro   r   bias_initializerrK   projection_keyprojection_valuer"   )r(   r)   scalewith_cls_tokenrH   	num_headsr   r   r   r   r   rM   Denser
   rz   r   r   r   rN   rO   )r+   rP   r   rH   rl   stride_q	stride_kv	padding_q
padding_kvqkv_projection_methodqkv_biasattention_drop_rater   r,   r-   s                 r$   r)   zTFCvtSelfAttention.__init__  sm     	"6"_
,"",H*?5*HhNc/-
) +G3-+
' -I3/-
) !& 2 2.v/G/GH$# !3 !
 $ll00.v/G/GH$! 1 
 !& 2 2.v/G/GH$# !3 !
 ||++,?@r#   c                    t        |      \  }}}| j                  | j                  z  }t        j                  |||| j                  |f      }t        j
                  |d      }|S )Nr}   r      r   r   perm)r   rH   r   r1   r   	transpose)r+   rV   r   r   _head_dims         r$   "rearrange_for_multi_head_attentionz5TFCvtSelfAttention.rearrange_for_multi_head_attention_  s\    %/%="
K>>T^^3zz,z;PTP^P^`h6ij||L|Dr#   c                   | j                   rt        j                  |d||z  gd      \  }}t        |      \  }}}t        j                  |||||f      }| j                  ||      }	| j                  ||      }
| j                  ||      }| j                   rKt        j                  |
fd      }
t        j                  ||	fd      }	t        j                  ||fd      }| j                  | j                  z  }| j                  | j                  |
            }
| j                  | j                  |	            }	| j                  | j                  |            }t        j                  |
|	d      | j                   z  }t#        |d      }| j%                  ||      }t        j                  ||      }t        |      \  }}}}t        j&                  |d	
      }t        j                  |||| j                  |z  f      }|S )Nr   r}   rT   axisT)transpose_b)logitsr   r   r   )r   r1   splitr   r   r   r   r   concatrH   r   r   r   r   r   matmulr   r   rO   r   )r+   rV   r   r   r9   	cls_tokenr   r   rG   keyqueryvaluer   attention_scoreattention_probscontextr   s                    r$   r<   zTFCvtSelfAttention.callf  s   &(hh|a%=PRS&T#I| 1;<0H-
Kzz,z65R^6_`--lX-N11,1R11,1RIIy%0q9E))Y,15CIIy%0q9E>>T^^3778M8Me8TU55d6I6I#6NO778M8Me8TU))E3DADJJN(bI,,,J))OU3)'21k1,,w\:**Wz;QY@Y&Z[r#   c                L   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   [xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTr   r   r   r   r   r   )rX   rY   r1   rZ   r   rK   r[   r   r   r   rH   r   r   r\   s     r$   r[   zTFCvtSelfAttention.build  s1   ::
47>Jt@@EEF >1177=>45t<Ht>>CCD <//55d;<47>Jt@@EEF >1177=>4+T2>t4499: J%%++T4,HIJ4)40<t22778 H##))4t~~*FGH4+T2>t4499: J%%++T4,HIJ J ?> >< <> >J JH HJ JsH   I%I'?I4)J )J')JI$'I14I>JJJ#T)rP   r   r   r`   rH   r`   rl   r`   r   r`   r   r`   r   r`   r   r`   r   r   r   rb   r   r=   r   rb   r   ra   
rV   r?   r   r`   r   r`   r9   rb   rc   r?   r>   )	r   r   r   r    r)   r   r<   r[   r@   rA   s   @r$   r   r     s    $  $GAGA GA 	GA
 GA GA GA GA GA  #GA GA #GA GAR DJr#   r   c                  6     e Zd ZdZd fdZdddZddZ xZS )	TFCvtSelfOutputzOutput of the Attention layer .c                    t        |   di | t        j                  j	                  |t        |j                        d      | _        t        j                  j                  |      | _	        || _
        y Ndense)r   ro   rK   r"   )r(   r)   r   rM   r   r
   rz   r   rN   rO   rH   )r+   rP   rH   	drop_rater,   r-   s        r$   r)   zTFCvtSelfOutput.__init__  s`    "6"\\''@X@X0Y`g ( 

 ||++I6"r#   c                P    | j                  |      }| j                  ||      }|S N)inputs)r   r9   r   rO   r   s      r$   r<   zTFCvtSelfOutput.call  s*    zzz6||<(|Kr#   c                   | j                   ry d| _         t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   y xY wNTr   rX   rY   r1   rZ   r   rK   r[   rH   r\   s     r$   r[   zTFCvtSelfOutput.build  r    ::
4$'3tzz/ ?

  $dnn!=>? ? 4? ?   )A>>B)rP   r   rH   r`   r   r=   ra   r   r>   rd   rA   s   @r$   r   r     s    )#
?r#   r   c                  r     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Zdd	dZd
dZ xZS )TFCvtAttentionzDAttention layer. First chunk of the convolutional transformer block.c                    t        |   di | t        |||||||||	|
||d      | _        t	        |||d      | _        y )N	attentionr   outputr"   )r(   r)   r   r   r   dense_output)r+   rP   r   rH   rl   r   r   r   r   r   r   r   r   r   r,   r-   s                  r$   r)   zTFCvtAttention.__init__  s]    " 	"6"+!
 ,FIyxXr#   c                    t         r>   )NotImplementedError)r+   headss     r$   prune_headszTFCvtAttention.prune_heads  s    !!r#   c                V    | j                  ||||      }| j                  ||      }|S rS   )r   r   )r+   rV   r   r   r9   self_outputattention_outputs          r$   r<   zTFCvtAttention.call  s4    nn\658nT,,[8,Lr#   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr   r   )rX   rY   r1   rZ   r   rK   r[   r   r\   s     r$   r[   zTFCvtAttention.build  s    ::
4d+7t~~223 +$$T*+4.:t00556 .!!''-. . ;+ +. .s   C%CCC r   )rP   r   r   r`   rH   r`   rl   r`   r   r`   r   r`   r   r`   r   r`   r   r   r   rb   r   r=   r   r=   r   rb   ra   )rV   r?   r   r`   r   r`   r9   rb   r>   )	r   r   r   r    r)   r   r<   r[   r@   rA   s   @r$   r   r     s    N   $!Y!Y !Y 	!Y
 !Y !Y !Y !Y !Y  #!Y !Y #!Y !Y !YF" 
	.r#   r   c                  4     e Zd ZdZd fdZddZddZ xZS )TFCvtIntermediatezNIntermediate dense layer. Second chunk of the convolutional transformer block.c                    t        |   di | t        j                  j	                  t        ||z        t        |j                        dd      | _        || _	        y )Ngelur   )r   ro   
activationrK   r"   )
r(   r)   r   rM   r   r`   r
   rz   r   rH   )r+   rP   rH   	mlp_ratior,   r-   s        r$   r)   zTFCvtIntermediate.__init__  sX    "6"\\''i)+,.v/G/GH	 ( 

 #r#   c                (    | j                  |      }|S r>   )r   )r+   rV   s     r$   r<   zTFCvtIntermediate.call   s    zz,/r#   c                   | j                   ry d| _         t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   y xY wr   r   r\   s     r$   r[   zTFCvtIntermediate.build  r   r   )rP   r   rH   r`   r   r`   r   r>   rd   rA   s   @r$   r   r     s    X#?r#   r   c                  6     e Zd ZdZd fdZdddZddZ xZS )	TFCvtOutputzu
    Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
    c                    t        |   di | t        j                  j	                  |t        |j                        d      | _        t        j                  j                  |      | _	        || _
        || _        y r   )r(   r)   r   rM   r   r
   rz   r   rN   rO   rH   r   )r+   rP   rH   r   r   r,   r-   s         r$   r)   zTFCvtOutput.__init__  sg    "6"\\''@X@X0Y`g ( 

 ||++I6""r#   c                Z    | j                  |      }| j                  ||      }||z   }|S r   r   )r+   rV   input_tensorr9   s       r$   r<   zTFCvtOutput.call  s4    zzz6||<(|K#l2r#   c           	     @   | j                   ry d| _         t        | dd       qt        j                  | j                  j
                        5  | j                  j                  d d t        | j                  | j                  z        g       d d d        y y # 1 sw Y   y xY wr   )
rX   rY   r1   rZ   r   rK   r[   r`   rH   r   r\   s     r$   r[   zTFCvtOutput.build!  s    ::
4$'3tzz/ U

  $c$..4>>2Q.R!STU U 4U Us   ?BB)rP   r   rH   r`   r   r`   r   r`   ra   )rV   r?   r   r?   r9   rb   rc   r?   r>   rd   rA   s   @r$   r   r     s    #Ur#   r   c                  t     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZdddZd	dZ xZS )

TFCvtLayera&  
    Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It
    consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the
    `Block` class in the original implementation.
    c                   t        |   di | t        |||||||||	|
|||d      | _        t	        |||d      | _        t        ||||d      | _        |dkD  rt        |d      n t        j                  j                  dd      | _        t        j                  j                  dd	
      | _        t        j                  j                  dd
      | _        || _        y )Nr   r   intermediater   r/   	drop_pathr   rp   layernorm_beforerr   layernorm_afterr"   )r(   r)   r   r   r   r  r   r   r&   r   rM   
Activationr  r{   r  r  rH   )r+   rP   r   rH   rl   r   r   r   r   r   r   r   r   r   drop_path_rater   r,   r-   s                    r$   r)   zTFCvtLayer.__init__1  s    & 	"6"'!
  .fiQ_`'	9iV^_ # .{;(((D 	 !& ? ?Se ? f$||>>tRc>d"r#   c                   | j                  | j                  |      |||      }| j                  ||      }||z   }| j                  |      }| j	                  |      }| j                  ||      }| j                  ||      }|S rS   )r   r  r  r  r  r   )r+   rV   r   r   r9   r   layer_outputs          r$   r<   zTFCvtLayer.callb  s    >>$*?*?*MvW\go>p>>*:X>N (,6 ++L9((6 ((|D~~lX~Fr#   c                2   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   NxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTr   r  r   r  r  r  )rX   rY   r1   rZ   r   rK   r[   r  r   r  r  rH   r  r\   s     r$   r[   zTFCvtLayer.builds  s   ::
4d+7t~~223 +$$T*+4.:t00556 .!!''-.4.:t00556 .!!''-.4d+7t~~223 +$$T*+4+T2>t4499: J%%++T4,HIJ4*D1=t33889 I$$**D$+GHI I >+ +. .. .+ +J JI IsH   I%I?I'I43)J)JII$'I14I>J
Jr   )rP   r   r   r`   rH   r`   rl   r`   r   r`   r   r`   r   r`   r   r`   r   r   r   rb   r   r=   r   r=   r   r=   r	  r=   r   rb   ra   r   r>   rd   rA   s   @r$   r  r  *  s    ,  $!/#/# /# 	/#
 /# /# /# /# /#  #/# /# #/# /# /# /#  !/#b"Ir#   r  c                  6     e Zd ZdZd fdZdddZddZ xZS )	
TFCvtStageaK  
    Cvt stage (encoder block). Each stage has 2 parts :
    - (1) A Convolutional Token Embedding layer
    - (2) A Convolutional Transformer Block (layer).
    The classification token is added only in the last stage.

    Args:
        config ([`CvtConfig`]): Model configuration class.
        stage (`int`): Stage number.
    c                   t        |   di | || _        || _        | j                  j                  | j                     rQ| j                  dd| j                  j                  d   ft        | j                  j                        dd      | _        t        | j                  |j                  | j                     | j                  dk(  r|j                  n|j                  | j                  dz
     |j                  | j                     |j                  | j                     |j                  | j                     |j                  | j                     d      | _        t!        j"                  d	|j$                  | j                     |j&                  |         }|D cg c]   }|j)                         j+                         " }}t-        |j&                  | j                           D cg c]X  }t/        |f|j0                  | j                     |j                  | j                     |j2                  | j                     |j4                  | j                     |j6                  | j                     |j8                  | j                     |j:                  | j                     |j<                  | j                     |j>                  | j                     |j@                  | j                     |j                  | j                     |jB                  | j                     || j                     |j                  | j                     d
| d[ c}| _"        y c c}w c c}w )Nr   r   Tzcvt.encoder.stages.2.cls_token)r2   initializer	trainablerK   r   	embedding)rF   rG   rI   rH   rJ   rQ   rK   r/   zlayers.)r   rH   rl   r   r   r   r   r   r   r   r   r   r	  r   rK   r"   )#r(   r)   rP   stager   
add_weightrH   r
   rz   rC   patch_sizesrG   patch_stridepatch_paddingr   r  r1   linspacer	  depthnumpyitemranger  r   
kernel_qkvr   r   r   r   r   r   r   r   rM   )r+   rP   r  r,   drop_path_ratesr8   jr-   s          r$   r)   zTFCvtStage.__init__  s   "6"
;;  ,!__!T[[22267+DKK,I,IJ5	 - DN )KK))$**504

a,,VEUEUVZV`V`cdVdEe&&tzz2&&tzz2((4))$**5	
 ++c6+@+@+Lfll[`Nab5DE1779>>+EE( 6<<

34'
& %  **4::6 **4::6"--djj94 **4::6 **4::6!,,TZZ8&,&B&B4::&N4$*$>$>tzz$J **4::6 **4::6.tzz:%//

;qc]!
 F
s   %M'EMc                D   d }| j                  ||      }t        |      \  }}}}||z  }t        j                  ||||f      }| j                  j
                  | j                     r;t        j                  | j
                  |d      }t        j                  ||fd      }| j                  D ]  }	 |	||||      }
|
} | j                  j
                  | j                     rt        j                  |d||z  gd      \  }}t        j                  |||||f      }||fS )Nr}   r   )repeatsr   r   r   rT   )r  r   r1   r   rP   r   r  repeatr   rM   r   )r+   rV   r9   r   r   r   r   rG   r   layerlayer_outputss              r$   r<   zTFCvtStage.call  s   	~~lH= 3=\2J/
FE<unzz,z;P\6]^;;  ,		$..*1MI99i%>QGL[[ 	)E!,QM(L	) ;;  ,&(hh|a%=PRS&T#I| zz,z65R^6_`Y&&r#   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       K| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = y y # 1 sw Y   bxY w# 1 sw Y   UxY w)NTr  rM   )rX   rY   r1   rZ   r  rK   r[   rM   r+   r]   r#  s      r$   r[   zTFCvtStage.build  s    ::
4d+7t~~223 +$$T*+44(4 &]]5::. &KK%& && 5+ +& &s   C*CCC	)rP   r   r  r`   ra   )rV   r?   r9   rb   r>   rd   rA   s   @r$   r  r    s    	-
^'0
&r#   r  c                  R     e Zd ZdZeZd fdZ	 	 	 d	 	 	 	 	 	 	 	 	 ddZddZ xZ	S )	TFCvtEncoderz
    Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
    (depth) being 1, 2 and 10.

    Args:
        config ([`CvtConfig`]): Model configuration class.
    c           	         t        |   di | || _        t        t	        |j
                              D cg c]  }t        ||d|        c}| _        y c c}w )Nzstages.r   r"   )r(   r)   rP   r  r3   r  r  stages)r+   rP   r,   	stage_idxr-   s       r$   r)   zTFCvtEncoder.__init__  sX    "6"W\]`agamam]nWo
JSJvy/DE
 
s   Ac           	        |rdnd }|}t        j                  |d      }d }t        | j                        D ]  \  }}	 |	||      \  }}|s||fz   } t        j                  |d      }|r.t	        |D 
cg c]  }
t        j                  |
d       c}
      }|st	        d |||fD              S t        |||      S c c}
w )Nr"   )r   r   r   r   r   rT   )r   r   r   r   c              3  &   K   | ]	  }||  y wr>   r"   ).0vs     r$   	<genexpr>z$TFCvtEncoder.call.<locals>.<genexpr>  s     bqTUTabs   r   r   r   )r1   r   	enumerater*  tupler   )r+   rU   output_hidden_statesreturn_dictr9   all_hidden_statesrV   r   r   stage_modulehss              r$   r<   zTFCvtEncoder.call  s     #7BD# ||L|D	!*4;;!7 	HA&2<(&S#L)#$5$G!	H ||L|D %Uf&grr||B\'J&g hb\9>O$Pbbb,*%+
 	
 'hs   7Cc                    | j                   ry d| _         t        | dd       K| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = y y # 1 sw Y   IxY w)NTr*  )rX   rY   r*  r1   rZ   rK   r[   r&  s      r$   r[   zTFCvtEncoder.build  sp    ::
44(4 &]]5::. &KK%& && 5& &s   A..A7	rP   r   )FTF)
rU   r   r4  bool | Noner5  r;  r9   r;  rc   0TFBaseModelOutputWithCLSToken | tuple[tf.Tensor]r>   )
r   r   r   r    r   config_classr)   r<   r[   r@   rA   s   @r$   r(  r(    sZ     L
 -2#' %
&
 *
 !	

 
 
:
B&r#   r(  c                  ^     e Zd ZdZeZd fdZe	 	 	 	 d	 	 	 	 	 	 	 	 	 dd       ZddZ	 xZ
S )	TFCvtMainLayerzConstruct the Cvt model.c                V    t        |   di | || _        t        |d      | _        y )Nencoderr   r"   )r(   r)   rP   r(  rA  )r+   rP   r,   r-   s      r$   r)   zTFCvtMainLayer.__init__.  s(    "6"#F;r#   c                    |t        d      | j                  ||||      }|d   }|s	|f|dd  z   S t        ||j                  |j                        S )N You have to specify pixel_valuesr4  r5  r9   r   r   r1  )
ValueErrorrA  r   r   r   )r+   rU   r4  r5  r9   encoder_outputssequence_outputs          r$   r<   zTFCvtMainLayer.call3  s{     ?@@,,!5#	 ' 
 *!,#%(;;;,-+;;)77
 	
r#   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTrA  )rX   rY   r1   rZ   rA  rK   r[   r\   s     r$   r[   zTFCvtMainLayer.buildP  si    ::
4D)5t||001 )""4() ) 6) )r_   r:  NNNF)
rU   zTFModelInputType | Noner4  r;  r5  r;  r9   r;  rc   r<  r>   )r   r   r   r    r   r=  r)   r   r<   r[   r@   rA   s   @r$   r?  r?  (  sg    "L<
  15,0#' %
-
 *
 !	

 
 
:
 
8)r#   r?  c                      e Zd ZdZeZdZdZy)TFCvtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    cvtrU   N)r   r   r   r    r   r=  base_model_prefixmain_input_namer"   r#   r$   rK  rK  Y  s    
 L$Or#   rK  a  

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TF 2.0 models accepts two formats as inputs:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional arguments.

    This second option is useful when using [`keras.Model.fit`] method which currently requires having all the
    tensors in the first argument of the model call function: `model(inputs)`.

    </Tip>

    Args:
        config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
al  
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False``):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
z]The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zd fdZe ee       eee	      	 	 	 	 d	 	 	 	 	 	 	 	 	 dd                     Z
ddZ xZS )	
TFCvtModelc                P    t        |   |g|i | t        |d      | _        y )NrL  r   )r(   r)   r?  rL  r+   rP   r   r,   r-   s       r$   r)   zTFCvtModel.__init__  s(    3&3F3!&u5r#   output_typer=  c                    |t        d      | j                  ||||      }|s|d   f|dd z   S t        |j                  |j                  |j
                        S )a  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFCvtModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
        >>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NrC  )rU   r4  r5  r9   r   r   r1  )rE  rL  r   r   r   r   )r+   rU   r4  r5  r9   outputss         r$   r<   zTFCvtModel.call  sy    > ?@@((%!5#	  
 AJ=712;..,%77#33!//
 	
r#   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTrL  )rX   rY   r1   rZ   rL  rK   r[   r\   s     r$   r[   zTFCvtModel.build  se    ::
4%1txx}}- %t$% % 2% %r_   r:  rI  )
rU   r   r4  r;  r5  r;  r9   r;  rc   r<  r>   )r   r   r   r)   r   r   TFCVT_INPUTS_DOCSTRINGr   r   _CONFIG_FOR_DOCr<   r[   r@   rA   s   @r$   rP  rP    s    
6
 *+AB+HWfg *.,0#' %-
&-
 *-
 !	-

 -
 
:-
 h C -
^%r#   rP  z
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                       e Zd Zd fdZe ee       eee	      	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 dd                     Z
ddZ xZS )	TFCvtForImageClassificationc                X   t        |   |g|i | |j                  | _        t        |d      | _        t
        j                  j                  dd      | _        t
        j                  j                  |j                  t        |j                        ddd	      | _        || _        y )
NrL  r   rp   	layernormrr   Tr   
classifierr   )r(   r)   
num_labelsr?  rL  r   rM   r{   r]  r   r
   rz   r^  rP   rR  s       r$   r)   z$TFCvtForImageClassification.__init__  s    3&3F3 ++!&u588K8X  ,,,,##.v/G/GH$ - 
 r#   rS  c                   | j                  ||||      }|d   }|d   }| j                  j                  d   r| j                  |      }nUt	        |      \  }	}
}}t        j                  ||	|
||z  f      }t        j                  |d      }| j                  |      }t        j                  |d      }| j                  |      }|d	n| j                  ||
      }|s|f|dd	 z   }||f|z   S |S t        |||j                        S )a+  
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFCvtForImageClassification
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
        >>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
        ```rD  r   r   r   r}   )r   r   r   r   r   N)labelsr   r   )lossr   r   )rL  rP   r   r]  r   r1   r   r   reduce_meanr^  hf_compute_lossr   r   )r+   rU   ra  r4  r5  r9   rV  rG  r   r   rG   r   r   sequence_output_meanr   rb  r   s                    r$   r<   z TFCvtForImageClassification.call  s(   R ((!5#	  
 "!*AJ	;;  $"nnY7O 7A6Q3Jfe jj\[adi[i@jkO ll?KO"nn_=O!~~oAF!56~t4+?+?vV\+?+]Y,F)-)9TGf$EvE54^e^s^sttr#   c                *   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       gt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  d   g       d d d        t        | dd       t        | j                  d      rht        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  d   g       d d d        y y y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTrL  r]  r   r^  rK   )rX   rY   r1   rZ   rL  rK   r[   r]  rP   rH   hasattrr^  r\   s     r$   r[   z!TFCvtForImageClassification.build7  sE   ::
4%1txx}}- %t$%4d+7t~~223 N$$dD$++2G2G2K%LMN4t,8t/]]4??#7#78 SOO))4t{{7L7LR7P*QRS S 0 9% %N NS Ss$   E0%6E=/6F	0E:=F	Fr:  )NNNNF)rU   r   ra  r   r4  r;  r5  r;  r9   r;  rc   z9TFImageClassifierOutputWithNoAttention | tuple[tf.Tensor]r>   )r   r   r   r)   r   r   rX  r   r   rY  r<   r[   r@   rA   s   @r$   r[  r[    s    $ *+AB+Q`op *.#',0#' %@u&@u !@u *	@u
 !@u @u 
C@u q C @uDSr#   r[  )r[  rP  rK  ):r    
__future__r   collections.abcrv   dataclassesr   
tensorflowr1   modeling_tf_outputsr   modeling_tf_utilsr   r   r	   r
   r   r   r   tf_utilsr   r   utilsr   r   r   r   r   configuration_cvtr   
get_loggerr   loggerrY  r   rM   Layerr&   rC   rL   r   r   r   r   r   r   r   r   r  r  r(  r?  rK  TFCVT_START_DOCSTRINGrX  rP  r[  __all__r"   r#   r$   <module>rv     sg    "  !  I   3  ) 
		H	%  7K 7 7(/ELL&& /(%8ell(( %8P7G%,,,, 7Gt"Mu||'9'9 "MJ);); 85<<#5#5 8DMJ++ MJ`?ell(( ?27.U\\'' 7.t?** ?4U%,,$$ U:^I## ^IB]&## ]&@:&5<<%% :&z -)U\\'' -) -)`%, % 8 & c>%% >%	>%B  eS"68T eSeSP Pr#   