
    rho                      d Z ddlmZ ddlZddlmZ ddlZddlZ	ddl
mZ ddlmZmZmZmZmZ ddlmZmZmZmZmZmZmZmZmZ dd	lmZmZmZ dd
l m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  e$jN                  e(      Z)dZ*dZ+ G d dejX                  jZ                        Z. G d dejX                  jZ                        Z/ G d dejX                  jZ                        Z0 G d dejX                  jZ                        Z1 G d dejX                  jZ                        Z2 G d dejX                  jZ                        Z3 G d dejX                  jZ                        Z4 G d dejX                  jZ                        Z5 G d d ejX                  jZ                        Z6 G d! d"ejX                  jZ                        Z7d# Z8d$ Z9d% Z:d& Z;d' Z< G d( d)ejX                  jZ                        Z= G d* d+ejX                  jZ                        Z> G d, d-ejX                  jZ                        Z? G d. d/ejX                  jZ                        Z@ G d0 d1ejX                  jZ                        ZA G d2 d3ejX                  jZ                        ZB G d4 d5e      ZCd6ZDd7ZE e"d8eD       G d9 d:eC             ZF e"d;eD       G d< d=eCe             ZG e"d>eD       G d? d@eCe             ZH e"dAeD       G dB dCeCe             ZI e"dDeD       G dE dFeCe             ZJg dGZKy)HzTF 2.0 DeBERTa model.    )annotationsN)Sequence   )get_tf_activation)TFBaseModelOutputTFMaskedLMOutputTFQuestionAnsweringModelOutputTFSequenceClassifierOutputTFTokenClassifierOutput)	TFMaskedLanguageModelingLossTFModelInputTypeTFPreTrainedModelTFQuestionAnsweringLossTFSequenceClassificationLossTFTokenClassificationLossget_initializerkerasunpack_inputs)check_embeddings_within_bounds
shape_liststable_softmax)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DebertaConfigr   zkamalkraj/deberta-basec                  D     e Zd Zd fdZdddZedd       Zd	dZ xZS )
TFDebertaContextPoolerc                    t        |   di | t        j                  j	                  |j
                  d      | _        t        |j                  d      | _	        || _
        y )Ndensenamedropout )super__init__r   layersDensepooler_hidden_sizer!   TFDebertaStableDropoutpooler_dropoutr$   configselfr-   kwargs	__class__s      /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/deberta/modeling_tf_deberta.pyr'   zTFDebertaContextPooler.__init__9   sO    "6"\\''(A(A'P
-f.C.C)T    c                    |d d df   }| j                  ||      }| j                  |      } t        | j                  j                        |      }|S )Nr   training)r$   r!   r   r-   pooler_hidden_act)r/   hidden_statesr6   context_tokenpooled_outputs        r2   callzTFDebertaContextPooler.call?   sT     &ad+]XF

=1H)$++*G*GHWr3   c                .    | j                   j                  S N)r-   hidden_sizer/   s    r2   
output_dimz!TFDebertaContextPooler.output_dimH   s    {{&&&r3   c                   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr!   r$   )
builtgetattrtf
name_scoper!   r#   buildr-   r*   r$   r/   input_shapes     r2   rF   zTFDebertaContextPooler.buildL   s    ::
4$'3tzz/ O

  $dkk.L.L!MNO4D)5t||001 )""4() ) 6O O) )s   3C"<C."C+.C7r-   r   Fr6   bool)returnintr=   )	__name__
__module____qualname__r'   r;   propertyr@   rF   __classcell__r1   s   @r2   r   r   8   s&     ' '	)r3   r   c                  ,     e Zd ZdZd fd	ZddZ xZS )TFDebertaXSoftmaxa>  
    Masked Softmax which is optimized for saving memory

    Args:
        input (`tf.Tensor`): The input tensor that will apply softmax.
        mask (`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax
    c                2    t        |   di | || _        y Nr%   )r&   r'   axis)r/   rY   r0   r1   s      r2   r'   zTFDebertaXSoftmax.__init__b   s    "6"	r3   c                   t        j                  t        j                  |t         j                              }t        j                  |t        j                  t        d      | j                        |      }t        t        j                  |t         j                        | j                        }t        j                  |d|      }|S )Nz-infdtype        )
rD   logical_notcastrL   wherefloatcompute_dtyper   float32rY   )r/   inputsmaskrmaskoutputs        r2   r;   zTFDebertaXSoftmax.callf   s}    rwwtRWW56%vd>P>P!QSYZbjj A499M%f-r3   ))rd   	tf.Tensorre   ri   )rO   rP   rQ   __doc__r'   r;   rS   rT   s   @r2   rV   rV   X   s    r3   rV   c                  P     e Zd ZdZ fdZej                  d        ZdddZ xZ	S )r+   z
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    c                2    t        |   di | || _        y rX   )r&   r'   	drop_prob)r/   rm   r0   r1   s      r2   r'   zTFDebertaStableDropout.__init__v   s    "6""r3   c                    t        j                  dt         j                  j                  j                  j                  d j                  z
        j                  t        |            z
  t         j                        t        j                  dd j                  z
  z   j                         j                  dkD  r9t        j                  t        j                  d j                        |      z  } fd}||fS )	z~
        Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob.
        r   g      ?)probs)sample_shaper[   r   r]   c                    j                   dkD  r9t        j                  t        j                  dj                        |       z  S | S )Nr   r]   r[   )rm   rD   r`   r_   rb   )upstreamre   scaler/   s    r2   gradz-TFDebertaStableDropout.xdropout.<locals>.grad   s>    ~~!xxbggc9K9K&LhWZ___r3   )rD   r_   compatv1distributions	Bernoullirm   sampler   rL   convert_to_tensorrb   r`   )r/   rd   rt   re   rs   s   `  @@r2   xdropoutzTFDebertaStableDropout.xdropoutz   s    
 wwiill((22t~~9M2NUUcmntcuUvwGG

 $$SA,>%?tGYGYZ>>AXXdBGGCt7I7I$JFSV[[F	  t|r3   c                ,    |r| j                  |      S |S r=   )r{   )r/   rd   r6   s      r2   r;   zTFDebertaStableDropout.call   s    ==((r3   rJ   )rd   ri   r6   ri   )
rO   rP   rQ   rj   r'   rD   custom_gradientr{   r;   rS   rT   s   @r2   r+   r+   n   s1    #  * r3   r+   c                  6     e Zd ZdZd fd	Z fdZddZ xZS )TFDebertaLayerNormzBLayerNorm module in the TF style (epsilon inside the square root).c                @    t        |   di | || _        || _        y rX   )r&   r'   sizeeps)r/   r   r   r0   r1   s       r2   r'   zTFDebertaLayerNorm.__init__   s!    "6"	r3   c                    | j                  | j                  gt        j                         d      | _        | j                  | j                  gt        j
                         d      | _        t        | !  |      S )Nweight)shapeinitializerr#   bias)	
add_weightr   rD   ones_initializergammazeros_initializerbetar&   rF   )r/   rH   r1   s     r2   rF   zTFDebertaLayerNorm.build   s^    __DII;BDWDWDY`h_i
OO499+2CWCWCY`fOg	w}[))r3   c                .   t        j                  |dgd      }t        j                  t        j                  ||z
        dgd      }t         j                  j	                  || j
                  z         }| j                  ||z
  z  |z  | j                  z   S )Nrh   T)rY   keepdims)rD   reduce_meansquaremathsqrtr   r   r   )r/   xmeanvariancestds        r2   r;   zTFDebertaLayerNorm.call   ss    ~~ardT:>>"))AH"5RD4Pggll8dhh./zzQX&,tyy88r3   )g-q=)r   ri   rM   ri   rO   rP   rQ   rj   r'   rF   r;   rS   rT   s   @r2   r   r      s    L
*
9r3   r   c                  2     e Zd Zd fdZdddZddZ xZS )TFDebertaSelfOutputc                *   t        |   di | t        j                  j	                  |j
                  d      | _        t        j                  j                  |j                  d      | _	        t        |j                  d      | _        || _        y )Nr!   r"   	LayerNormepsilonr#   r$   r%   )r&   r'   r   r(   r)   r>   r!   LayerNormalizationlayer_norm_epsr   r+   hidden_dropout_probr$   r-   r.   s      r2   r'   zTFDebertaSelfOutput.__init__   sq    "6"\\''(:(:'I
88AVAV]h8i-f.H.HyYr3   c                v    | j                  |      }| j                  ||      }| j                  ||z         }|S )Nr5   r!   r$   r   r/   r8   input_tensorr6   s       r2   r;   zTFDebertaSelfOutput.call   s;    

=1]XF}|'CDr3   c                   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   qxY w# 1 sw Y   y xY wNTr!   r   r$   )rB   rC   rD   rE   r!   r#   rF   r-   r>   r   r$   rG   s     r2   rF   zTFDebertaSelfOutput.build   s)   ::
4$'3tzz/ H

  $dkk.E.E!FGH4d+7t~~223 L$$dD$++2I2I%JKL4D)5t||001 )""4() ) 6H HL L) )$   3E<3E-E+EE(+E4rI   rJ   rK   r=   rO   rP   rQ   r'   r;   rF   rS   rT   s   @r2   r   r      s    )r3   r   c                  Z     e Zd Zd fdZ	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZ xZS )TFDebertaAttentionc                z    t        |   di | t        |d      | _        t	        |d      | _        || _        y )Nr/   r"   rg   r%   )r&   r'   "TFDebertaDisentangledSelfAttentionr/   r   dense_outputr-   r.   s      r2   r'   zTFDebertaAttention.__init__   s7    "6"6vFK	/XFr3   c           	     ~    | j                  |||||||      }||}| j                  |d   ||      }	|	f|dd  z   }
|
S )Nr8   attention_maskquery_statesrelative_posrel_embeddingsoutput_attentionsr6   r   r8   r   r6   r   )r/   r   )r/   r   r   r   r   r   r   r6   self_outputsattention_outputrg   s              r2   r;   zTFDebertaAttention.call   su     yy&)%%)/ ! 
 'L,,&q/x - 
 #$|AB'77r3   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr/   r   )rB   rC   rD   rE   r/   r#   rF   r   rG   s     r2   rF   zTFDebertaAttention.build   s    ::
4&2tyy~~. &		%&4.:t00556 .!!''-. . ;& &. .   C%CCC rI   NNNFF)r   ri   r   ri   r   tf.Tensor | Noner   r   r   r   r   rL   r6   rL   rM   tuple[tf.Tensor]r=   r   rT   s   @r2   r   r      sq     *.)-+/"' " '	
 ' )    
:	.r3   r   c                  0     e Zd Zd fdZddZddZ xZS )TFDebertaIntermediatec                T   t        |   di | t        j                  j	                  |j
                  t        |j                        d      | _        t        |j                  t              r"t        |j                        | _        || _        y |j                  | _        || _        y )Nr!   unitskernel_initializerr#   r%   )r&   r'   r   r(   r)   intermediate_sizer   initializer_ranger!   
isinstance
hidden_actstrr   intermediate_act_fnr-   r.   s      r2   r'   zTFDebertaIntermediate.__init__   s    "6"\\''**vOgOg?hov ( 

 f''-'89J9J'KD$  (.'8'8D$r3   c                L    | j                  |      }| j                  |      }|S Nrd   )r!   r   r/   r8   s     r2   r;   zTFDebertaIntermediate.call  s(    

-
800?r3   c                (   | j                   ry d| _         t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   y xY w)NTr!   )	rB   rC   rD   rE   r!   r#   rF   r-   r>   rG   s     r2   rF   zTFDebertaIntermediate.build  s}    ::
4$'3tzz/ H

  $dkk.E.E!FGH H 4H Hs   3BBrI   r8   ri   rM   ri   r=   r   rT   s   @r2   r   r      s    Hr3   r   c                  2     e Zd Zd fdZdddZddZ xZS )TFDebertaOutputc                R   t        |   di | t        j                  j	                  |j
                  t        |j                        d      | _        t        j                  j                  |j                  d      | _        t        |j                  d      | _        || _        y )Nr!   r   r   r   r$   r"   r%   )r&   r'   r   r(   r)   r>   r   r   r!   r   r   r   r+   r   r$   r-   r.   s      r2   r'   zTFDebertaOutput.__init__  s    "6"\\''$$IaIa9bip ( 

 88AVAV]h8i-f.H.HyYr3   c                x    | j                  |      }| j                  ||      }| j                  ||z         }|S )Nr   r5   r   r   s       r2   r;   zTFDebertaOutput.call  s=    

-
8]XF}|'CDr3   c                   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   qxY w# 1 sw Y   y xY wr   )rB   rC   rD   rE   r!   r#   rF   r-   r   r   r>   r$   rG   s     r2   rF   zTFDebertaOutput.build&  s)   ::
4$'3tzz/ N

  $dkk.K.K!LMN4d+7t~~223 L$$dD$++2I2I%JKL4D)5t||001 )""4() ) 6N NL L) )r   rI   rJ   )r8   ri   r   ri   r6   rL   rM   ri   r=   r   rT   s   @r2   r   r     s    )r3   r   c                  Z     e Zd Zd fdZ	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZ xZS )TFDebertaLayerc                    t        |   di | t        |d      | _        t	        |d      | _        t        |d      | _        y )N	attentionr"   intermediaterg   r%   )r&   r'   r   r   r   r   r   bert_outputr.   s      r2   r'   zTFDebertaLayer.__init__6  s?    "6"+FE1&~N*6Ar3   c           	         | j                  |||||||      }|d   }	| j                  |	      }
| j                  |
|	|      }|f|dd  z   }|S )N)r   r   r   r   r   r   r6   r   r8   r   r   )r   r   r   )r/   r8   r   r   r   r   r   r6   attention_outputsr   intermediate_outputlayer_outputoutputss                r2   r;   zTFDebertaLayer.call=  s     !NN&)%%)/ + 
 -Q/"//>N/O''-<LW_ ( 
  /$5ab$99r3   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   qxY w# 1 sw Y   y xY w)NTr   r   r   )	rB   rC   rD   rE   r   r#   rF   r   r   rG   s     r2   rF   zTFDebertaLayer.buildY  s	   ::
4d+7t~~223 +$$T*+4.:t00556 .!!''-.4-9t//445 -  &&t,- - :+ +. .- -s$   D%%D1?D=%D.1D:=ErI   r   r8   ri   r   ri   r   r   r   r   r   r   r   rL   r6   rL   rM   r   r=   r   rT   s   @r2   r   r   5  sr    B *.)-+/"'  " '	
 ' )    
8-r3   r   c                  t     e Zd Zd fdZddZd Zd Zd	dZ	 	 	 	 	 	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZ	S )TFDebertaEncoderc                L   t        |   di | t        |j                        D cg c]  }t	        |d|        c}| _        t        |dd      | _        || _        | j                  r4t        |dd      | _	        | j                  dk  r|j                  | _	        y y y c c}w )	Nzlayer_._r"   relative_attentionFmax_relative_positionsrh   r   r%   )r&   r'   rangenum_hidden_layersr   layerrC   r   r-   r   max_position_embeddings)r/   r-   r0   ir1   s       r2   r'   zTFDebertaEncoder.__init__i  s    "6"KPQWQiQiKjkanVHQC.Ak
")&2F"N""*1&:RTV*WD'**Q..4.L.L+ / # ls   B!c                   | j                   ry d| _         | j                  rY| j                  d| j                  dz  | j                  j
                  gt        | j                  j                              | _        t        | dd       K| j                  D ];  }t        j                  |j                        5  |j                  d        d d d        = y y # 1 sw Y   IxY w)NTzrel_embeddings.weight   r#   r   r   r   )rB   r   r   r   r-   r>   r   r   r   rC   r   rD   rE   r#   rF   )r/   rH   r   s      r2   rF   zTFDebertaEncoder.buildt  s    ::
"""&//,22Q68O8OP+DKK,I,IJ #2 #D
 4$'3 &]]5::. &KK%& && 4& &s   5CC	c                >    | j                   r| j                  }|S d }|S r=   )r   r   )r/   r   s     r2   get_rel_embeddingz"TFDebertaEncoder.get_rel_embedding  s*    040G0G,, NRr3   c                   t        t        |            dk  r}t        j                  t        j                  |d      d      }|t        j                  t        j                  |d      d      z  }t        j
                  |t        j                        }|S t        t        |            dk(  rt        j                  |d      }|S )Nr   r   rh   r   )lenr   rD   expand_dimssqueezer_   uint8)r/   r   extended_attention_masks      r2   get_attention_maskz#TFDebertaEncoder.get_attention_mask  s    z.)*a/&(nnR^^NTU5VXY&Z#4r~~bjjQhjlFmoq7rrNWW^RXX>N  N+,1^^NA>Nr3   c                    | j                   r8|6|t        |      d   nt        |      d   }t        |t        |      d         }|S )Nr   )r   r   build_relative_position)r/   r8   r   r   qs        r2   get_rel_poszTFDebertaEncoder.get_rel_pos  sO    ""|';0<0H
<(,jYfNghjNkA21j6OPR6STLr3   c	           
        |rdnd }	|rdnd }
| j                  |      }| j                  |||      }t        |t              r|d   }n|}| j	                         }t        | j                        D ]i  \  }}|r|	|fz   }	 ||||||||      }|d   }|8|}t        |t              r(|dz   t        | j                        k  r||dz      nd }n|}|sa|
|d   fz   }
k |r|	|fz   }	|st        d ||	|
fD              S t        ||	|
      S )Nr%   r   r   r   c              3  &   K   | ]	  }||  y wr=   r%   ).0vs     r2   	<genexpr>z(TFDebertaEncoder.call.<locals>.<genexpr>  s     hqZ[Zghs   last_hidden_stater8   
attentions)
r   r   r   r   r   	enumerater   r   tupler   )r/   r8   r   r   r   r   output_hidden_statesreturn_dictr6   all_hidden_statesall_attentionsnext_kvr   r   layer_modulelayer_outputss                   r2   r;   zTFDebertaEncoder.call  sT    #7BD0d00@''|\RmX.#A&G#G//1(4 	FOA|#$58H$H!(%-))-"3!M *!,M',mX667!ec$**o6MmAE2SWG' !/=3C2E!E/	F4   1]4D Dh]4E~$Vhhh +;LYg
 	
r3   rI   r=   )NN)NNFFTF)r8   ri   r   ri   r   r   r   r   r   rL   r  rL   r  rL   r6   rL   rM   $TFBaseModelOutput | tuple[tf.Tensor])
rO   rP   rQ   r'   rF   r   r   r   r;   rS   rT   s   @r2   r   r   h  s    	M& *.)-"'%* :
 :
 ":
 '	:

 ':
  :
 #:
 :
 :
 
.:
r3   r   c                   t        j                  | t         j                        }t        j                  |t         j                        }|dddf   t        j                  t        j                  |ddg      | dg      z
  }|d| ddf   }t        j
                  |d      }t        j                  |t         j                        S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `tf.Tensor`: A tensor with shape [1, query_size, key_size]

    r[   Nr   rh   r   rY   )rD   r   int32tilereshaper   r_   int64)
query_sizekey_sizeq_idsk_idsrel_pos_idss        r2   r   r     s      HHZrxx0EHHXRXX.E4.2772::eaW+E
TU#WWKkzk1n-K..15K77;))r3   c                    t        |      d   t        |      d   t        |      d   t        |      d   g}t        j                  | |      S )Nr   r   r   rh   r   rD   broadcast_to)c2p_posquery_layerr   shapess       r2   c2p_dynamic_expandr     sP    ;";";"< $	F ??7F++r3   c                    t        |      d   t        |      d   t        |      d   t        |      d   g}t        j                  | |      S )Nr   r   r   r  )r  r  	key_layerr  s       r2   p2c_dynamic_expandr#    sP    ;";"9b!9b!	F ??7F++r3   c                    t        |      d d t        |       d   t        |      d   gz   }t        j                  | |      S )Nr   r   r  )	pos_indexp2c_attr"  r  s       r2   pos_dynamic_expandr'     sC     !$
9(=b(A:iCXY[C\']]F??9f--r3   c                z   |dk  rt        j                  |       |z   }|t        j                  |       dz
  k7  rt        j                  |       dz
  |z
  }t        j                  t        j                  t        j                  |             |d      }t        j                  | |      } t        j                  ||      }nd}t        j
                  | dt        j                  |       d   f      }t        j
                  |dt        j                  |      d   f      }t        j                  ||d      }t        j
                  |t        j                  |            }|dk7  rVt        j                  t        j                  t        j                  |             | d      }t        j                  ||      }|S )Nr   r   r  permrh   )
batch_dims)rD   rankrollr   	transposer  r   gather)r   indicesgather_axispre_rollpermutationflat_xflat_indicesgathereds           r2   torch_gatherr7    s=   Qggaj;.bggaj1n$771:>K/ggbhhrwwqz2H1ELL-,,w[9ZZBB01F::gBHHW,=b,A'BCLyy!<Hzz(BHHW$56H1}ggbhhrwwqz2XIAF<<{;Or3   c                  l     e Zd ZdZd fdZddZd	dZ	 	 	 	 	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZd Z xZ	S )r   a  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    c                   t        |   di | |j                  |j                  z  dk7  r&t	        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  j                  | j                  dz  t        |j                        dd      | _        |j                  |j                  ng | _        t        |d	d      | _        t        |d
d      | _        | j"                  rt        j                  j                  | j                  t        |j                        dd      | _        t        j                  j                  | j                  t        |j                        dd      | _        t)        d      | _        | j                   rt        |dd      | _        | j,                  dk  r|j.                  | _        t1        |j2                  d      | _        d| j                  v rEt        j                  j                  | j                  t        |j                        dd      | _        d| j                  v rDt        j                  j                  | j                  t        |j                        d      | _        t1        |j:                  d      | _        || _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   in_projFr   r#   use_biasr   talking_headhead_logits_projhead_weights_projrh   r  r   r   pos_dropoutr"   c2ppos_projp2c
pos_q_proj)r   r#   r$   r%   ) r&   r'   r>   num_attention_heads
ValueErrorrN   attention_head_sizeall_head_sizer   r(   r)   r   r   r;  pos_att_typerC   r   r>  r?  r@  rV   softmaxr   r   r+   r   rA  rC  rE  attention_probs_dropout_probr$   r-   r.   s      r2   r'   z+TFDebertaDisentangledSelfAttention.__init__(  s   "6" : ::a?#F$6$6#7 8 445Q8  $*#=#= #&v'9'9F<V<V'V#W !558P8PP||))".v/G/GH	 * 
 4:3F3F3RF//XZ")&2F"N#FNEB$)LL$6$6((#263K3K#L'	 %7 %D! &+\\%7%7((#263K3K#L(	 &8 &D" )b1""*1&:RTV*WD'**Q..4.L.L+5f6P6PWdeD))) % 2 2&&'6v7O7O'P#"	 !3 ! )))"',,"4"4&&?6KcKc;dkw #5 # .f.Q.QXabr3   c                J   | j                   ry d| _         | j                  d| j                  t        j                  j                               | _        | j                  d| j                  t        j                  j                               | _        t        | dd       dt        j                  | j                  j                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Mt        j                  | j                   j                        5  | j                   j                  d        d d d        t        | dd       Mt        j                  | j"                  j                        5  | j"                  j                  d        d d d        t        | dd       Mt        j                  | j$                  j                        5  | j$                  j                  d        d d d        t        | d	d       Mt        j                  | j&                  j                        5  | j&                  j                  d        d d d        t        | d
d       bt        j                  | j(                  j                        5  | j(                  j                  | j                  j                  g       d d d        t        | dd       ct        j                  | j*                  j                        5  | j*                  j                  | j                  j                  g       d d d        y y # 1 sw Y   RxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   kxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTq_biasr   v_biasr;  r$   r?  r@  rA  rC  rE  )rB   r   rI  r   initializersZerosrN  rO  rC   rD   rE   r;  r#   rF   r-   r>   r$   r?  r@  rA  rC  rE  rG   s     r2   rF   z(TFDebertaDisentangledSelfAttention.builda  s   ::
oo$"4"45CUCUC[C[C] & 
 oo$"4"45CUCUC[C[C] & 
 4D)5t||001 J""D$0G0G#HIJ4D)5t||001 )""4()4+T2>t4499: 2%%++D124,d3?t55::; 3&&,,T234-9t//445 -  &&t,-4T*6t}}112 ?##T[[%<%<$=>?4t,8t334 A%%t{{'>'>&?@A A 9#J J) )2 23 3- -? ?A AsT   	3M:MM&.M3N "1N1NMM#&M03M= N
NN"c                    t        |      d d | j                  dgz   }t        j                  ||      }t        j                  |g d      S )Nrh   tensorr   r   r   r   r   r)  )r   rF  rD   r  r.  )r/   rT  r   s      r2   transpose_for_scoresz7TFDebertaDisentangledSelfAttention.transpose_for_scores  sF    6"3B'4+C+CR*HH67 ||F66r3   c           	        |>| j                  |      }t        j                  | j                  |      dd      \  }	}
}nd }t        j                  t        j                  | j                   j
                  d         | j                  dz  d      }t        j                  | j                  d      }t        j                  d      D ]  }t        j                  | j                  | j                        }t        j                  | j                        D ]  }|j                  |||dz  |z            } |j                  ||j                               } dgdz  } ||d   |d   |      } ||d   |d   |      } ||d	   |d	   |      }| j                  |      }	| j                  |      }
| j                  |      }|	| j                  | j                  ddddf         z   }	|| j                  | j                  ddddf         z   }d}dt        | j                        z   }t!        j"                  t%        |	      d   |z        }|	|z  }	t        j&                  |	t        j                  |
g d
            }| j(                  r(| j+                  ||      }| j-                  |	|
|||      }|||z   }| j.                  r=t        j                  | j1                  t        j                  |g d            g d      }| j3                  ||      }| j5                  ||      }| j.                  r=t        j                  | j7                  t        j                  |g d            g d      }t        j&                  ||      }t        j                  |g d      }t%        |      }|dd |d   |d   z  gz   }t        j8                  ||      }|r||f}|S |f}|S )a  
        Call the module

        Args:
            hidden_states (`tf.Tensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`tf.Tensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            return_att (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`tf.Tensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`tf.Tensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`tf.Tensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr   rh   )num_or_size_splitsrY   c                j    t        j                  || d      }||t        j                  |      z  }|S )NT)transpose_b)rD   matmulr.  )wbr   outs       r2   linearz7TFDebertaDisentangledSelfAttention.call.<locals>.linear  s0    ii1$7=2<<?*C
r3   r   )r\   r   r   r   r   r   r   r   r5   )r   r   r   r   )r   r   r   r   rU  r   )r;  rD   splitrV  r.  r   rF  TensorArrayr\   r   writeconcatrN  rO  r   rJ  r   r   r   r[  r   rA  disentangled_att_biasr>  r?  rK  r$   r@  r  )r/   r8   r   r   r   r   r   r6   qpr  r"  value_layerr_  wsqkvwkqkvw_insider   qkvbr   r   rel_attscale_factorrs   attention_scoresattention_probscontext_layercontext_layer_shapenew_context_layer_shaper   s                                 r2   r;   z'TFDebertaDisentangledSelfAttention.call  s   N m,B24(())"-!"3/KK
 T\\0034IaIadeIelmB >>

;DXXa[ ; nn4::DD\D\]$":":; FA"-"3"3Ar!a%!)}"EKFzz![%7%7%9:	;
 6A:DtAwQ6AtAwQ7AtAwQ7A33A6K11!4I33A6K!D$=$=dkk$PTVW->X$YY!D$=$=dkk$PTVW->X$YY3t0011		*[1"5DE!E)99[",,y,2WX""!--nx-PN00iWegstG/'9!||%%bll3C\&RSUa  ,,'7H,,,J ll&&r||O\'RSUaO 		/;?]LA(7
 #6cr":>QRT>UXklnXo>o=p"p

=2IJ6G=/2 O\M]r3   c           
     0   |&t        |      d   }t        |t        |      d         }t        |      }t        |      dk(  r+t        j                  t        j                  |d      d      }nJt        |      dk(  rt        j                  |d      }n%t        |      dk7  rt        dt        |             t        j                  t        j                  t        j                  t        |      d   t        |      d         | j                        t        j                        }t        j                  || j                  |z
  | j                  |z   d d f   d      }d}	d| j                  v r| j                  |      }
| j                  |
      }
t        j                  |t        j                  |
g d	            }t        j                   ||z   d|dz  dz
        }t#        |t%        |||      d
      }|	|z  }	d| j                  v r| j'                  |      }| j                  |      }|t        j(                  j+                  t        j                  t        |      d
   |z  | j,                              z  }t        |      d   t        |      d   k7  r%t        t        |      d   t        |      d         }n|}t        j                   | |z   d|dz  dz
        }t        j                  |t        j                  |g d	            }t        j                  t#        |t/        |||      d
      g d	      }t        |      d   t        |      d   k7  r;t        j                  |d d d d d d df   d
      }t#        |t1        |||      d      }|	|z  }	|	S )Nr   r   r   r   r      z2Relative position ids must be of dim 2 or 3 or 4. rB  r`  rh   rD  r[   )r   r   r   rD   r   rG  r_   minimummaximumr   r  rJ  rC  rV  r[  r.  clip_by_valuer7  r   rE  r   r   rb   r#  r'  )r/   r  r"  r   r   rn  r   shape_list_posatt_spanscorepos_key_layerc2p_attr  pos_query_layerr_posp2c_posr&  r%  s                     r2   re  z8TFDebertaDisentangledSelfAttention.disentangled_att_bias  s@   ;'+A21j6KB6OPL#L1~!#>>"..q*I1ML A%>>,:L A%QRUVdReQfghh77JJ

:k226
98Mb8QRTXToTo HH	
 466ADD_D_bjDjjlmmnpq
  D%%% MM.9M 55mDMiiR\\--VWG&&|h'>8a<RSCSTG"7,>wUa,bdfgGWE D%%%"oon=O"77HOrww||
?3B7,FdN`N`a  O +&r*j.CB.GG/
90Eb0I:V_K`acKde$&&v'8!X\A=MNGii	2<<+VWGllW&8+y&Y[]^`lG +&r*j.CB.GGNN<1a
+CRH	&w0B9gW`0acefWEr3   rI   r=   )rT  ri   rM   ri   r   r   )
rO   rP   rQ   rj   r'   rF   rV  r;   re  rS   rT   s   @r2   r   r     s    7rA@7 *.)-+/"'m m "m '	m
 'm )m  m m 
m^7r3   r   c                  Z     e Zd ZdZ fdZddZ	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )TFDebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                H   t        |   di | || _        t        |d|j                        | _        |j                  | _        |j                  | _        t        |dd      | _        |j                  | _        | j
                  |j                  k7  rEt        j                  j                  |j                  t        |j                        dd      | _        t        j                  j                  |j                  d      | _        t#        |j$                  d	
      | _        y )Nembedding_sizeposition_biased_inputT
embed_projFr<  r   r   r$   r"   r%   )r&   r'   r-   rC   r>   r  r   r  r   r   r(   r)   r   r  r   r   r   r+   r   r$   r.   s      r2   r'   zTFDebertaEmbeddings.__init__5  s    "6"%f.>@R@RS!--'-'E'E$%,V5Ld%S"!'!9!9&"4"44#ll00""#263K3K#L!	 1 DO 88AVAV]h8i-f.H.HyYr3   c                   t        j                  d      5  | j                  d| j                  j                  | j
                  gt        | j                              | _        d d d        t        j                  d      5  | j                  j                  dkD  rM| j                  d| j                  j                  | j
                  gt        | j                              | _
        nd | _
        d d d        t        j                  d      5  | j                  rC| j                  d| j                  | j                  gt        | j                              | _        nd | _        d d d        | j                  ry d| _        t!        | d	d       dt        j                  | j"                  j$                        5  | j"                  j'                  d d | j                  j                  g       d d d        t!        | d
d       Mt        j                  | j(                  j$                        5  | j(                  j'                  d        d d d        t!        | dd       [t        j                  | j*                  j$                        5  | j*                  j'                  d d | j
                  g       d d d        y y # 1 sw Y   QxY w# 1 sw Y   xY w# 1 sw Y   lxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)Nword_embeddingsr   r   token_type_embeddingsr   
embeddingsposition_embeddingsTr   r$   r  )rD   rE   r   r-   
vocab_sizer  r   r   r   type_vocab_sizer  r  r   r>   r  rB   rC   r   r#   rF   r$   r  rG   s     r2   rF   zTFDebertaEmbeddings.buildH  s|   ]],- 	//{{--t/B/BC+D,B,BC * DK	 ]]23 	2{{**Q.-1__%;;668K8KL /0F0F G .= .* .2*	2 ]]01 	0))+/??%779I9IJ /0F0F G ,; ,( ,0(	0 ::
4d+7t~~223 L$$dD$++2I2I%JKL4D)5t||001 )""4()4t,8t334 I%%tT43F3F&GHI I 9I	 		2 	2	0 	0L L) )I IsJ   AJ2 A.J?AK43K%K%?)K12J<?K	KK"%K.1K:c                .   ||t        d      |At        || j                  j                         t	        j
                  | j                  |      }t        |      dd }|t	        j                  |d      }|/t	        j                  t	        j                  d|d         d      }|}| j                  r&t	        j
                  | j                  |      }	||	z  }| j                  j                  dkD  r&t	        j
                  | j                  |      }
||
z  }| j                  | j                   k7  r| j#                  |      }| j%                  |      }|t'        t        |            t'        t        |            k7  ryt'        t        |            d	k(  r,t	        j(                  t	        j(                  |d
      d
      }t	        j*                  t	        j                  |d      | j,                        }||z  }| j/                  ||      }|S )z
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        Nz5Need to provide either `input_ids` or `input_embeds`.)paramsr0  rh   r   dimsvalue)startlimitr  ru  r   r   r[   r5   )rG  r   r-   r  rD   r/  r   r   fillr   r   r  r  r  r  r  r>   r  r   r   r   r_   rb   r$   )r/   	input_idsposition_idstoken_type_idsinputs_embedsre   r6   rH   final_embeddingsposition_embedstoken_type_embedss              r2   r;   zTFDebertaEmbeddings.callq  s    !6TUU *9dkk6L6LMIIT[[)LM /4!WW+Q?N>>"((+b/*RYZ[L(%% iit/G/GQ]^O/;;&&* "		1K1KUc d 11$"2"22#/?@>>*:;:d#$J7G,H(IIz$'(A-::bjjA&>QGDwwr~~d;4CUCUV/$6<<(88<Lr3   r=   )NNNNNF)r  r   r  r   r  r   r  r   re   r   r6   rL   rM   ri   r   rT   s   @r2   r  r  2  sp    QZ&'IV '+)-+/*.!%5 #5  '5  )	5 
 (5  5  5  
5 r3   r  c                  0     e Zd Zd fdZddZddZ xZS ) TFDebertaPredictionHeadTransformc                   t        |   di | t        |d|j                        | _        t
        j                  j                  | j                  t        |j                        d      | _
        t        |j                  t              rt        |j                        | _        n|j                  | _        t
        j                  j!                  |j"                  d      | _        || _        y )Nr  r!   r   r   r   r%   )r&   r'   rC   r>   r  r   r(   r)   r   r   r!   r   r   r   r   transform_act_fnr   r   r   r-   r.   s      r2   r'   z)TFDebertaPredictionHeadTransform.__init__  s    "6"%f.>@R@RS\\''%%.v/G/GH ( 

 f''-$5f6G6G$HD!$*$5$5D!88AVAV]h8ir3   c                n    | j                  |      }| j                  |      }| j                  |      }|S r   )r!   r  r   r   s     r2   r;   z%TFDebertaPredictionHeadTransform.call  s6    

-
8--m<}5r3   c                   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   rxY w# 1 sw Y   y xY w)NTr!   r   )rB   rC   rD   rE   r!   r#   rF   r-   r>   r   r  rG   s     r2   rF   z&TFDebertaPredictionHeadTransform.build  s    ::
4$'3tzz/ H

  $dkk.E.E!FGH4d+7t~~223 H$$dD$2E2E%FGH H 8H HH Hs   3C/<)C;/C8;DrI   r   r=   r   rT   s   @r2   r  r    s    $	Hr3   r  c                  P     e Zd Zd fdZd	dZd
dZddZddZddZddZ	 xZ
S )TFDebertaLMPredictionHeadc                    t        |   di | || _        t        |d|j                        | _        t        |d      | _        || _        y )Nr  	transformr"   r%   )	r&   r'   r-   rC   r>   r  r  r  input_embeddingsr/   r-   r  r0   r1   s       r2   r'   z"TFDebertaLMPredictionHead.__init__  sJ    "6"%f.>@R@RS9&{S !1r3   c                X   | j                  | j                  j                  fddd      | _        | j                  ry d| _        t        | dd       Nt        j                  | j                  j                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NzerosTr   )r   r   	trainabler#   r  )r   r-   r  r   rB   rC   rD   rE   r  r#   rF   rG   s     r2   rF   zTFDebertaLMPredictionHead.build  s    OO4;;+A+A*CQXdhouOv	::
4d+7t~~223 +$$T*+ + 8+ +s   :B  B)c                    | j                   S r=   )r  r?   s    r2   get_output_embeddingsz/TFDebertaLMPredictionHead.get_output_embeddings  s    $$$r3   c                `    || j                   _        t        |      d   | j                   _        y Nr   )r  r   r   r  r/   r  s     r2   set_output_embeddingsz/TFDebertaLMPredictionHead.set_output_embeddings  s(    ',$+5e+<Q+?(r3   c                    d| j                   iS )Nr   )r   r?   s    r2   get_biasz"TFDebertaLMPredictionHead.get_bias  s    		""r3   c                X    |d   | _         t        |d         d   | j                  _        y )Nr   r   )r   r   r-   r  r  s     r2   set_biasz"TFDebertaLMPredictionHead.set_bias  s'    &M	!+E&M!:1!=r3   c                   | j                  |      }t        |      d   }t        j                  |d| j                  g      }t        j
                  || j                  j                  d      }t        j                  |d|| j                  j                  g      }t        j                  j                  || j                        }|S )Nr   r   rh   rS  T)ar]  rZ  )r  r   )r  r   rD   r  r  r[  r  r   r-   r  nnbias_addr   )r/   r8   
seq_lengths      r2   r;   zTFDebertaLMPredictionHead.call  s    ]C.q1


-DDWDW?XY		MT5J5J5Q5Q_cd

-JPTP[P[PfPf?gh]Kr3   r-   r   r  keras.layers.Layerr=   rM   r  r  ztf.Variable)rM   zdict[str, tf.Variable]r   )rO   rP   rQ   r'   rF   r  r  r  r  r;   rS   rT   s   @r2   r  r    s'    
1+%@#>r3   r  c                  0     e Zd Zd fdZddZddZ xZS )TFDebertaOnlyMLMHeadc                J    t        |   di | t        ||d      | _        y )Npredictionsr"   r%   )r&   r'   r  r  r  s       r2   r'   zTFDebertaOnlyMLMHead.__init__   s&    "6"4V=MTabr3   c                *    | j                  |      }|S )Nr   )r  )r/   sequence_outputprediction_scoress      r2   r;   zTFDebertaOnlyMLMHead.call  s     ,,?,K  r3   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTr  )rB   rC   rD   rE   r  r#   rF   rG   s     r2   rF   zTFDebertaOnlyMLMHead.build	  sm    ::
4-9t//445 -  &&t,- - :- -   A11A:r  )r  ri   rM   ri   r=   r   rT   s   @r2   r  r    s    c!
-r3   r  c                       e Zd ZeZd fdZddZd	dZd Ze		 	 	 	 	 	 	 	 	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Z
ddZ xZS )TFDebertaMainLayerc                z    t        |   di | || _        t        |d      | _        t        |d      | _        y )Nr  r"   encoderr%   )r&   r'   r-   r  r  r   r  r.   s      r2   r'   zTFDebertaMainLayer.__init__  s6    "6"-f<H'Y?r3   c                    | j                   S r=   )r  r?   s    r2   get_input_embeddingsz'TFDebertaMainLayer.get_input_embeddings  s    r3   c                `    || j                   _        t        |      d   | j                   _        y r  )r  r   r   r  r  s     r2   set_input_embeddingsz'TFDebertaMainLayer.set_input_embeddings!  s$    !&%/%6q%9"r3   c                    t         )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        )NotImplementedError)r/   heads_to_prunes     r2   _prune_headszTFDebertaMainLayer._prune_heads%  s
    
 "!r3   c
                   ||t        d      |t        |      }
n|t        |      d d }
nt        d      |t        j                  |
d      }|t        j                  |
d      }| j	                  ||||||	      }| j                  ||||||	      }|d   }|s	|f|dd  z   S t        ||j                  |j                  	      S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timerh   z5You have to specify either input_ids or inputs_embedsr   r  r   )r  r  r  r  re   r6   )r8   r   r   r  r  r6   r  )	rG  r   rD   r  r  r  r   r8   r  )r/   r  r   r  r  r  r   r  r  r6   rH   embedding_outputencoder_outputsr  s                 r2   r;   zTFDebertaMainLayer.call,  s     ]%>cdd"$Y/K&$]3CR8KTUU!WW+Q?N!WW+Q?N??%)' + 
 ,,*)/!5# ' 
 *!,#%(;;; -)77&11
 	
r3   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr  r  )rB   rC   rD   rE   r  r#   rF   r  rG   s     r2   rF   zTFDebertaMainLayer.builde  s    ::
4t,8t334 ,%%d+,4D)5t||001 )""4() ) 6, ,) )r   rI   r  r  	NNNNNNNNF)r  TFModelInputType | Noner   np.ndarray | tf.Tensor | Noner  r  r  r  r  r  r   bool | Noner  r  r  r  r6   rL   rM   r  r=   )rO   rP   rQ   r   config_classr'   r  r  r  r   r;   rF   rS   rT   s   @r2   r  r    s     L@:"  .28<8<6:7;)-,0#'6
*6
 66
 6	6

 46
 56
 '6
 *6
 !6
 6
 
.6
 6
p	)r3   r  c                      e Zd ZdZeZdZy)TFDebertaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    debertaN)rO   rP   rQ   rj   r   r  base_model_prefixr%   r3   r2   r  r  q  s    
 !L!r3   r  a9
  
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://huggingface.co/papers/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a	  
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput``] instead of a plain tuple.
zaThe bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zd fdZe eej                  d             ee	e
e      	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd	dZ xZS )
TFDebertaModelc                P    t        |   |g|i | t        |d      | _        y )Nr  r"   )r&   r'   r  r  r/   r-   rd   r0   r1   s       r2   r'   zTFDebertaModel.__init__  s(    3&3F3)&yAr3   batch_size, sequence_length
checkpointoutput_typer  c
                :    | j                  |||||||||		      }
|
S )N	r  r   r  r  r  r   r  r  r6   )r  )r/   r  r   r  r  r  r   r  r  r6   r   s              r2   r;   zTFDebertaModel.call  s9    & ,,))%'/!5#  

 r3   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTr  )rB   rC   rD   rE   r  r#   rF   rG   s     r2   rF   zTFDebertaModel.build  si    ::
4D)5t||001 )""4() ) 6) )r  rI   r  )r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r6   r  rM   r  r=   )rO   rP   rQ   r'   r   r   DEBERTA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr;   rF   rS   rT   s   @r2   r  r    s    
B
 *+C+J+JKh+ij&%$ .28<8<6:7;)-,0#' %* 6 6	
 4 5 ' * !  
. k 4)r3   r  z5DeBERTa Model with a `language modeling` head on top.c                       e Zd Zd fdZddZe eej                  d             e	e
ee      	 	 	 	 	 	 	 	 	 	 d		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d
d                     ZddZ xZS )TFDebertaForMaskedLMc                    t        |   |g|i | |j                  rt        j	                  d       t        |d      | _        t        || j                  j                  d      | _	        y )NzpIf you want to use `TFDebertaForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.r  r"   cls)r  r#   )
r&   r'   
is_decoderloggerwarningr  r  r  r  mlmr  s       r2   r'   zTFDebertaForMaskedLM.__init__  s_    3&3F3NN1
 *&yA'AXAX_der3   c                .    | j                   j                  S r=   )r  r  r?   s    r2   get_lm_headz TFDebertaForMaskedLM.get_lm_head  s    xx###r3   r  r  c                   | j                  |||||||||
	      }|d   }| j                  ||
      }|	dn| j                  |	|      }|s|f|dd z   }||f|z   S |S t        |||j                  |j
                        S )a  
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        r  r   )r  r6   Nlabelslogitsr   lossr  r8   r  )r  r  hf_compute_lossr   r8   r  )r/   r  r   r  r  r  r   r  r  r  r6   r   r  r  r  rg   s                   r2   r;   zTFDebertaForMaskedLM.call  s    4 ,,))%'/!5#  

 "!* HH_xHX~t4+?+?vVg+?+h')GABK7F)-)9TGf$EvE$!//))	
 	
r3   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr  r  )rB   rC   rD   rE   r  r#   rF   r  rG   s     r2   rF   zTFDebertaForMaskedLM.buildJ  s    ::
4D)5t||001 )""4()4%1txx}}- %t$% % 2) )% %r   rI   r  
NNNNNNNNNF)r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r6   r  rM   z#TFMaskedLMOutput | tuple[tf.Tensor]r=   )rO   rP   rQ   r'   r  r   r   r  r  r   r  r   r  r;   rF   rS   rT   s   @r2   r  r    s    
f$ *+C+J+JKh+ij&$$ .28<8<6:7;)-,0#'04 %+
*+
 6+
 6	+

 4+
 5+
 '+
 *+
 !+
 .+
 +
 
-+
 k +
Z	%r3   r  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd Zd fdZe eej                  d             ee	e
e      	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd	dZ xZS )
"TFDebertaForSequenceClassificationc                   t        |   |g|i | |j                  | _        t        |d      | _        t        |d      | _        t        |dd       }|| j                  j                  n|}t        |d      | _        t        j                  j                  |j                  t        |j                         d      | _        | j                  j$                  | _        y )Nr  r"   poolercls_dropout
classifierr   )r&   r'   
num_labelsr  r  r   r  rC   r-   r   r+   r$   r   r(   r)   r   r   r  r@   )r/   r-   rd   r0   drop_outr1   s        r2   r'   z+TFDebertaForSequenceClassification.__init__^  s    3&3F3 ++)&yA,V(C6=$76>6F4;;22H-h]K,,,,##.v/G/GH - 

 ++00r3   r  r  c                L   | j                  |||||||||
	      }|d   }| j                  ||
      }| j                  ||
      }| j                  |      }|	dn| j	                  |	|      }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r  r   r5   Nr  r   r  )r  r  r$   r  r   r
   r8   r  )r/   r  r   r  r  r  r   r  r  r  r6   r   r  r:   r  r  rg   s                    r2   r;   z'TFDebertaForSequenceClassification.callp  s    4 ,,))%'/!5#  

 "!*OhG]XF/~t4+?+?vV\+?+]Y,F)-)9TGf$EvE)!//))	
 	
r3   c                z   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   'xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTr  r  r$   r  )rB   rC   rD   rE   r  r#   rF   r  r$   r  r@   rG   s     r2   rF   z(TFDebertaForSequenceClassification.build  s]   ::
4D)5t||001 )""4()44(4t{{//0 (!!$'(4D)5t||001 )""4()4t,8t334 E%%tT4??&CDE E 9) )( () )E Es0   F%F?F%)F1FF"%F.1F:rI   r  )r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r6   r  rM   z-TFSequenceClassifierOutput | tuple[tf.Tensor]r=   )rO   rP   rQ   r'   r   r   r  r  r   r  r
   r  r;   rF   rS   rT   s   @r2   r  r  V  s    1$ *+C+J+JKh+ij&.$ .28<8<6:7;)-,0#'04 %.
*.
 6.
 6	.

 4.
 5.
 '.
 *.
 !.
 ..
 .
 
7.
 k .
`Er3   r  z
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd Zd fdZe eej                  d             ee	e
e      	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd	dZ xZS )
TFDebertaForTokenClassificationc                f   t        |   |g|i | |j                  | _        t        |d      | _        t
        j                  j                  |j                        | _	        t
        j                  j                  |j                  t        |j                        d      | _        || _        y )Nr  r"   )rater  r   )r&   r'   r	  r  r  r   r(   Dropoutr   r$   r)   r   r   r  r-   r  s       r2   r'   z(TFDebertaForTokenClassification.__init__  s    3&3F3 ++)&yA||++1K1K+L,,,,##H`H`8aht - 
 r3   r  r  c                (   | j                  |||||||||
	      }|d   }| j                  ||
      }| j                  |      }|	dn| j                  |	|      }|s|f|dd z   }||f|z   S |S t	        |||j
                  |j                        S )	z
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        r  r   r5   r   Nr  r   r  )r  r$   r  r   r   r8   r  )r/   r  r   r  r  r  r   r  r  r  r6   r   r  r  r  rg   s                   r2   r;   z$TFDebertaForTokenClassification.call  s    0 ,,))%'/!5#  

 "!*,,,J8~t4+?+?vV\+?+]Y,F)-)9TGf$EvE&!//))	
 	
r3   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   |xY w# 1 sw Y   y xY w)NTr  r  )
rB   rC   rD   rE   r  r#   rF   r  r-   r>   rG   s     r2   rF   z%TFDebertaForTokenClassification.build       ::
4D)5t||001 )""4()4t,8t334 M%%tT4;;3J3J&KLM M 9) )M M   C"%3C."C+.C7rI   r  )r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r6   r  rM   z*TFTokenClassifierOutput | tuple[tf.Tensor]r=   )rO   rP   rQ   r'   r   r   r  r  r   r  r   r  r;   rF   rS   rT   s   @r2   r  r    s    
 *+C+J+JKh+ij&+$ .28<8<6:7;)-,0#'04 %*
**
 6*
 6	*

 4*
 5*
 '*
 **
 !*
 .*
 *
 
4*
 k *
X	Mr3   r  z
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Zd fdZe eej                  d             ee	e
e      	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd	dZ xZS )
TFDebertaForQuestionAnsweringc                   t        |   |g|i | |j                  | _        t        |d      | _        t
        j                  j                  |j                  t        |j                        d      | _
        || _        y )Nr  r"   
qa_outputsr   )r&   r'   r	  r  r  r   r(   r)   r   r   r  r-   r  s       r2   r'   z&TFDebertaForQuestionAnswering.__init__  sr    3&3F3 ++)&yA,,,,##H`H`8aht - 
 r3   r  r  c                   | j                  |||||||||	      }|d   }| j                  |      }t        j                  |dd      \  }}t        j                  |d      }t        j                  |d      }d}|	 |
d	|	i}|
|d
<   | j                  |||f      }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                        S )a  
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        r  r   r   r   rh   )r  rX  rY   )inputrY   Nstart_positionend_positionr  )r  start_logits
end_logitsr8   r  )	r  r  rD   ra  r   r   r	   r8   r  )r/   r  r   r  r  r  r   r  r  start_positionsend_positionsr6   r   r  r  r  r  r  r  rg   s                       r2   r;   z"TFDebertaForQuestionAnswering.call  s   > ,,))%'/!5#  

 "!*8#%88&QUW#X jzz2>ZZjr:
&=+D&8F%2F>"''v|Z>X'YD"J/'!"+=F)-)9TGf$EvE-%!!//))
 	
r3   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   |xY w# 1 sw Y   y xY w)NTr  r  )
rB   rC   rD   rE   r  r#   rF   r  r-   r>   rG   s     r2   rF   z#TFDebertaForQuestionAnswering.builda  r  r  rI   )NNNNNNNNNNF)r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r   r  r!  r  r6   r  rM   z1TFQuestionAnsweringModelOutput | tuple[tf.Tensor]r=   )rO   rP   rQ   r'   r   r   r  r  r   r  r	   r  r;   rF   rS   rT   s   @r2   r  r    s    	 *+C+J+JKh+ij&2$ .28<8<6:7;)-,0#'9=7; %9
*9
 69
 6	9

 49
 59
 '9
 *9
 !9
 79
 59
 9
 
;9
 k 9
v	Mr3   r  )r  r  r  r  r  r  )Lrj   
__future__r   r   collections.abcr   numpynp
tensorflowrD   activations_tfr   modeling_tf_outputsr   r   r	   r
   r   modeling_tf_utilsr   r   r   r   r   r   r   r   r   tf_utilsr   r   r   utilsr   r   r   r   configuration_debertar   
get_loggerrO   r  r  r  r(   Layerr   rV   r+   r   r   r   r   r   r   r   r   r   r#  r'  r7  r   r  r  r  r  r  r  DEBERTA_START_DOCSTRINGr  r  r  r  r  r  __all__r%   r3   r2   <module>r2     s    "  $   / 
 
 
 S R u u 0 
		H	% ". )U\\// )@** ,%U\\// %P9++ 9()%,,,, ):-.++ -.`HELL.. H:)ell(( )B0-U\\'' 0-fi
u||)) i
X*0,,.
0R);); Rjt %,,,, t n#Hu||'9'9 #HL- 2 2 -`-5<<-- -([)++ [)|"0 "( T) X g-)- -)	-)` QSjkM%35Q M% lM%`  YE)AC_ YEYEx  IM&>@Y IMIMX  WM$<>U WMWMtr3   