
    rh                     x   d Z ddlmZmZ ddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZ ddlmZ  ej8                  e      Z G d dej>                        Z  G d dej>                        Z!ejD                  jF                  d        Z$ejD                  jF                  d        Z%ejD                  jF                  d        Z&ejD                  jF                  d        Z'ejD                  jF                  dejP                  de)fd       Z*ejD                  jF                  dejP                  dejP                  fd       Z+ejD                  jF                  dejP                  dejP                  de)fd       Z,ejD                  jF                  dejP                  dejP                  fd       Z- G d dej>                        Z. G d  d!ej>                        Z/ G d" d#ej>                        Z0 G d$ d%ej>                        Z1 G d& d'ej>                        Z2 G d( d)e      Z3 G d* d+ej>                        Z4e G d, d-e             Z5e G d. d/e5             Z6 G d0 d1ej>                        Z7 G d2 d3ej>                        Z8 G d4 d5ej>                        Z9 G d6 d7ej>                        Z: G d8 d9ej>                        Z;e G d: d;e5             Z< G d< d=ej>                        Z= ed>?       G d@ dAe5             Z>e G dB dCe5             Z?e G dD dEe5             Z@g dFZAy)GzPyTorch DeBERTa model.    )OptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )DebertaConfigc                   *     e Zd ZdZd fd	Zd Z xZS )DebertaLayerNormzBLayerNorm module in the TF style (epsilon inside the square root).c                     t         |           t        j                  t	        j
                  |            | _        t        j                  t	        j                  |            | _        || _	        y N)
super__init__r   	Parametertorchonesweightzerosbiasvariance_epsilon)selfsizeeps	__class__s      /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/deberta/modeling_deberta.pyr   zDebertaLayerNorm.__init__,   sH    ll5::d#34LLT!23	 #    c                 X   |j                   }|j                         }|j                  dd      }||z
  j                  d      j                  dd      }||z
  t	        j
                  || j                  z         z  }|j                  |      }| j                  |z  | j                  z   }|S )NT)keepdim   )
dtypefloatmeanpowr   sqrtr"   tor   r!   )r#   hidden_states
input_typer/   varianceys         r'   forwardzDebertaLayerNorm.forward2   s    "((
%++-!!"d!3!D(--a055b$5G&-HtG\G\<\1]]%((4KK-'$))3r(   )g-q=__name__
__module____qualname____doc__r   r7   __classcell__r&   s   @r'   r   r   )   s    L$r(   r   c                   $     e Zd Z fdZd Z xZS )DebertaSelfOutputc                    t         |           t        j                  |j                  |j                        | _        t        |j                  |j                        | _        t        j                  |j                        | _        y r   )r   r   r   Linearhidden_sizedenser   layer_norm_eps	LayerNormDropouthidden_dropout_probdropoutr#   configr&   s     r'   r   zDebertaSelfOutput.__init__>   s\    YYv1163E3EF
)&*<*<f>S>STzz&"<"<=r(   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rD   rI   rF   r#   r3   input_tensors      r'   r7   zDebertaSelfOutput.forwardD   7    

=1]3}|'CDr(   r9   r:   r;   r   r7   r=   r>   s   @r'   r@   r@   =   s    >r(   r@   c                    | j                  d      }|j                  d      }t        j                  |t        j                  | j                        }t        j                  |t        j                  |j                        }|dddf   |j                  dd      j                  |d      z
  }|d|ddf   }|j                  d      }|S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    r-   deviceNr   r*   r   )r$   r   arangelongrU   viewrepeat	unsqueeze)query_layer	key_layer
query_sizekey_sizeq_idsk_idsrel_pos_idss          r'   build_relative_positionrb   K   s    $ !!"%J~~b!HLL5::k>P>PQELLI<L<LME4.5::a#4#;#;J#JJKkzk1n-K''*Kr(   c                     | j                  |j                  d      |j                  d      |j                  d      |j                  d      g      S )Nr   r   r,   r*   expandr$   )c2p_posr[   relative_poss      r'   c2p_dynamic_expandrh   h   sI    >>;++A.0@0@0C[EUEUVWEXZfZkZklnZopqqr(   c                     | j                  |j                  d      |j                  d      |j                  d      |j                  d      g      S )Nr   r   rS   rd   )rf   r[   r\   s      r'   p2c_dynamic_expandrj   m   sG    >>;++A.0@0@0CY^^TVEWYbYgYghjYklmmr(   c                     | j                  |j                         d d | j                  d      |j                  d      fz         S )Nr,   rS   rd   )	pos_indexp2c_attr\   s      r'   pos_dynamic_expandrn   r   s=    GLLN2A.)..2DinnUWFX1YYZZr(   r[   scale_factorc                     t        j                  t        j                  | j                  d      t         j                        |z        S )Nr*   r-   )r   r1   tensorr$   r.   )r[   ro   s     r'   scaled_size_sqrtrs   z   s0    ::ell;#3#3B#7u{{KlZ[[r(   r\   c                 d    | j                  d      |j                  d      k7  rt        | |      S |S NrS   )r$   rb   )r[   r\   rg   s      r'   
build_rposrv      s1    y~~b11&{I>>r(   max_relative_positionsc           
          t        j                  t        t        | j	                  d      |j	                  d            |            S ru   )r   rr   minmaxr$   )r[   r\   rw   s      r'   compute_attention_spanr{      s4    <<C 0 0 4innR6HIKabccr(   c           	          |j                  d      |j                  d      k7  rA|d d d d d d df   j                  d      }t        j                  | dt	        || |            S | S )NrS   r   r*   r,   dimindex)r$   rZ   r   gatherrn   )rm   r[   r\   rg   rl   s        r'   uneven_size_correctedr      s_    y~~b11 Aq!,66r:	||G2DYPWYb2cddr(   c                   p    e Zd ZdZ fdZd Z	 	 	 	 ddej                  dej                  dede	ej                     de	ej                     d	e	ej                     d
e
ej                  e	ej                     f   fdZdej                  dej                  dej                  d	ej                  def
dZ xZS )DisentangledSelfAttentiona  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    c                 Z   t         |           |j                  |j                  z  dk7  r&t	        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                  dz  d      | _
        t        j                  t        j                  | j                  t        j                              | _        t        j                  t        j                  | j                  t        j                              | _        |j"                  |j"                  ng | _        t%        |d	d      | _        t%        |d
d      | _        | j(                  rct        j                  |j                  |j                  d      | _        t        j                  |j                  |j                  d      | _        nd | _        d | _        | j&                  rt%        |dd      | _        | j.                  dk  r|j0                  | _        t        j2                  |j4                        | _        d| j"                  v r1t        j                  |j                  | j                  d      | _        d| j"                  v r/t        j                  |j                  | j                        | _        t        j2                  |j<                        | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r	   Fr!   rq   relative_attentiontalking_headrw   r*   r   c2pp2c) r   r   rC   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   rB   in_projr   r   r    r.   q_biasv_biaspos_att_typegetattrr   r   head_logits_projhead_weights_projrw   max_position_embeddingsrG   rH   pos_dropoutpos_proj
pos_q_projattention_probs_dropout_probrI   rJ   s     r'   r   z"DisentangledSelfAttention.__init__   sm    : ::a?#F$6$6#7 8 445Q8  $*#=#= #&v'9'9F<V<V'V#W !558P8PPyy!3!3T5G5G!5KRWXll5;;0B0B5;;#WXll5;;0B0B5;;#WX393F3F3RF//XZ")&2F"N#FNEB$&IIf.H.H&JdJdkp$qD!%'YYv/I/I6KeKelq%rD"$(D!%)D"""*1&:RTV*WD'**Q..4.L.L+!zz&*D*DED))) "		&*<*<d>P>PW\ ])))"$))F,>,>@R@R"Szz&"E"EFr(   c                     |j                         d d | j                  dfz   }|j                  |      }|j                  dddd      S )Nr*   r   r,   r   r	   )r$   r   rX   permute)r#   xnew_x_shapes      r'   transpose_for_scoresz.DisentangledSelfAttention.transpose_for_scores   sF    ffhsmt'?'?&DDFF;yyAq!$$r(   r3   attention_maskoutput_attentionsquery_statesrg   rel_embeddingsreturnc                 L   |9| j                  |      }| j                  |      j                  dd      \  }}	}
n| j                   j                  j                  | j                  dz  d      }t        d      D cg c]C  }t        j                  t        | j                        D cg c]  }||dz  |z       c}d      E }}}t        j                  |d   |j                         j                  |d   j                              }t        j                  |d   |j                         j                  |d   j                              }t        j                  |d   |j                         j                  |d   j                              }|||fD cg c]  }| j                  |       c}\  }}	}
|| j                  | j                  ddddf         z   }|
| j                  | j                  ddddf         z   }
d}dt        | j                        z   }t!        ||      }||j                  |j                        z  }t        j                  ||	j#                  dd	            }| j$                  r*|(|&| j'                  |      }| j)                  ||	|||      }|||z   }| j*                  5| j+                  |j-                  dddd            j-                  dddd      }|j/                         }|j1                  | t        j2                  |j                        j4                        }t6        j8                  j;                  |d      }| j=                  |      }| j>                  5| j?                  |j-                  dddd            j-                  dddd      }t        j                  ||
      }|j-                  dddd      jA                         }|jC                         dd	 d
z   }|jE                  |      }|s|dfS ||fS c c}w c c}}w c c}w )a  
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.BoolTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr	   r*   r~   r   rq   r   r,   rS   )r*   )#r   r   chunkr   r   ranger   catmatmultr2   r-   r   r   lenr   rs   	transposer   r   disentangled_att_biasr   r   boolmasked_fillfinfory   r   
functionalsoftmaxrI   r   
contiguousr$   rX   )r#   r3   r   r   r   rg   r   qpr[   r\   value_layerwskiqkvwqvr   rel_attro   scaleattention_scoresattention_probscontext_layernew_context_layer_shapes                            r'   r7   z!DisentangledSelfAttention.forward   s   L m,B262K2KB2O2U2UVW]_2U2`/KK$$**4+C+Ca+GQ*OBhmnohpqcdEIIeD<T<T6UVr!a%!)}V\]^qDqT!Wlnn&6&9&9Q&9&NOAT!Wmoo&7&:&:a&:&OPAT!Wmoo&7&:&:a&:&OPAZ[]^`aYb2cTU43L3LQ3O2c/KK!D$=$=dkk$PTVW->X$YY!D$=$=dkk$PTVW->X$YY3t0011 l;!EHH;3D3DH$EE <<Y5H5HR5PQ""~'AlF^!--n=N00iWegstG/'9   ,#445E5M5MaQRTUWX5YZbbcdfgijlmn',,.+77.8I5;;WbWhWhKiKmKmn--//0@b/I,,7!!-"44_5L5LQPQSTVW5XYaabcefhiklmO_kB%--aAq9DDF"/"4"4"6s";e"C%**+BC !4((//U Wq 3ds   >+P)P;PP!Pr[   r\   ro   c           	      $   |t        |||j                        }|j                         dk(  r!|j                  d      j                  d      }nT|j                         dk(  r|j                  d      }n/|j                         dk7  rt	        d|j                                t        ||| j                        }|j                         }|| j                  |z
  | j                  |z   d d f   j                  d      }d}d| j                  v r| j                  |      }| j                  |      }t        j                  ||j                  dd	            }	t        j                  ||z   d|dz  dz
        }
t        j                  |	dt!        |
||      
      }	||	z  }d| j                  v r| j#                  |      }| j                  |      }|t%        ||      z  }t'        |||      }t        j                  | |z   d|dz  dz
        }t        j                  ||j                  dd	      j)                  |j*                              }t        j                  |dt-        |||      
      j                  dd	      }t/        ||||      }||z  }|S )Nr,   r   r	   r      z2Relative position ids must be of dim 2 or 3 or 4. r   r*   rS   r}   r   rq   )rb   rU   r~   rZ   r   r{   rw   rW   r   r   r   r   r   r   clampr   rh   r   rs   rv   r2   r-   rj   r   )r#   r[   r\   rg   r   ro   att_spanscorepos_key_layerc2p_attrf   pos_query_layerr_posp2c_posrm   s                  r'   r   z/DisentangledSelfAttention.disentangled_att_bias%  s    2;	;K]K]^L"'11!4>>qAL1$'11!4L1$QR^RbRbRdQefgg)+y$B]B]^#((*'''(2T5P5PS[5[[]^^

)A, 	  D%%% MM.9M 55mDMll;0G0GB0OPGkk,"91hlQ>NOGll7:LWVaco:pqGWE D%%%"oon=O"77HO/NNOE
 kk5&8"3Q1q8HIGll9o.G.GB.O.R.RYbYhYh.R.ijGllR'9';PY'ZiB  ,G[)\ZGWEr(   FNNN)r9   r:   r;   r<   r   r   r   Tensorr   r   tupler7   r   r   r=   r>   s   @r'   r   r      s    $GL% #(/3/315U0||U0 U0  	U0
 u||,U0 u||,U0 !.U0 
u||Xell33	4U0n6\\6 <<6 ll	6
 6 6r(   r   c                   *     e Zd ZdZ fdZddZ xZS )DebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        |dd      }t        |d|j                        | _        t        j                  |j                  | j                  |      | _        t        |dd      | _	        | j                  sd | _
        n/t        j                  |j                  | j                        | _
        |j                  dkD  r0t        j                  |j                  | j                        | _        nd | _        | j                  |j                  k7  r2t        j                  | j                  |j                  d      | _        nd | _        t!        |j                  |j"                        | _        t        j&                  |j(                        | _        || _        | j/                  d	t1        j2                  |j                        j5                  d
      d       y )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFr   position_ids)r   r*   )
persistent)r   r   r   rC   r   r   	Embedding
vocab_sizeword_embeddingsr   position_embeddingsr   type_vocab_sizetoken_type_embeddingsrB   
embed_projr   rE   rF   rG   rH   rI   rK   register_bufferr   rV   re   )r#   rK   r   r&   s      r'   r   zDebertaEmbeddings.__init__a  sy   v~q9%f.>@R@RS!||F,=,=t?R?R`lm%,V5Ld%S"))'+D$')||F4R4RTXTgTg'hD$!!A%)+f6L6LdNaNa)bD&)-D&&"4"44 ii(;(;V=O=OV[\DO"DO)&*<*<f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
r(   c                    ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|:t        j                  |t        j                  | j                  j
                        }|| j                  |      }| j                   | j                  |j	                               }nt        j                  |      }|}	| j                  r|	|z   }	| j                  | j                  |      }
|	|
z   }	| j                  | j                  |	      }	| j                  |	      }	||j                         |	j                         k7  rD|j                         dk(  r |j                  d      j                  d      }|j                  d      }|j!                  |	j"                        }|	|z  }	| j%                  |	      }	|	S )Nr*   r   rT   r   r,   )r$   r   r   r    rW   rU   r   r   
zeros_liker   r   r   rF   r~   squeezerZ   r2   r-   rI   )r#   	input_idstoken_type_idsr   maskinputs_embedsinput_shape
seq_lengthr   
embeddingsr   s              r'   r7   zDebertaEmbeddings.forward  s    #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  00;M##/"&":":<;L;L;N"O"'"2"2="A"
%%#&99J%%1$($>$>~$N!#&;;J??&4J^^J/
xxzZ^^--88:?<<?2215D~~a(77:++,D#d*J\\*-
r(   )NNNNNr8   r>   s   @r'   r   r   ^  s    Q
>,r(   r   c                   p     e Zd Z fdZ	 	 	 	 ddedeej                  eej                     f   fdZ	 xZ
S )DebertaAttentionc                 p    t         |           t        |      | _        t	        |      | _        || _        y r   )r   r   r   r#   r@   outputrK   rJ   s     r'   r   zDebertaAttention.__init__  s-    -f5	'/r(   r   r   c                 v    | j                  ||||||      \  }}||}| j                  ||      }	|r|	|fS |	d fS )N)r   rg   r   )r#   r   )
r#   r3   r   r   r   rg   r   self_output
att_matrixattention_outputs
             r'   r7   zDebertaAttention.forward  se     #'))%%) #, #
Z (L;;{LA$j11$d++r(   r   r9   r:   r;   r   r   r   r   r   r   r7   r=   r>   s   @r'   r   r     sF     #(,  	, 
u||Xell33	4,r(   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DebertaIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r   r   r   rB   rC   intermediate_sizerD   
isinstance
hidden_actstrr
   intermediate_act_fnrJ   s     r'   r   zDebertaIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r(   r3   r   c                 J    | j                  |      }| j                  |      }|S r   )rD   r   r#   r3   s     r'   r7   zDebertaIntermediate.forward  s&    

=100?r(   r9   r:   r;   r   r   r   r7   r=   r>   s   @r'   r   r     s#    9U\\ ell r(   r   c                   $     e Zd Z fdZd Z xZS )DebertaOutputc                     t         |           t        j                  |j                  |j
                        | _        t        |j
                  |j                        | _	        t        j                  |j                        | _        || _        y r   )r   r   r   rB   r   rC   rD   r   rE   rF   rG   rH   rI   rK   rJ   s     r'   r   zDebertaOutput.__init__  sc    YYv779K9KL
)&*<*<f>S>STzz&"<"<=r(   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rM   rN   s      r'   r7   zDebertaOutput.forward  rP   r(   rQ   r>   s   @r'   r  r    s    r(   r  c                   p     e Zd Z fdZ	 	 	 	 ddedeej                  eej                     f   fdZ	 xZ
S )DebertaLayerc                     t         |           t        |      | _        t	        |      | _        t        |      | _        y r   )r   r   r   	attentionr   intermediater  r   rJ   s     r'   r   zDebertaLayer.__init__  s3    )&1/7#F+r(   r   r   c                     | j                  ||||||      \  }}| j                  |      }	| j                  |	|      }
|r|
|fS |
d fS )Nr   r   rg   r   )r  r  r   )r#   r3   r   r   rg   r   r   r   r   intermediate_outputlayer_outputs              r'   r7   zDebertaLayer.forward  sn     (,~~/%%) (6 (
$* #//0@A{{#68HI *-- $''r(   )NNNFr   r>   s   @r'   r  r    sF    , "'(  ( 
u||Xell33	4(r(   r  c                        e Zd ZdZ fdZd Zd ZddZ	 	 	 	 	 ddej                  dej                  de
d	e
d
e
f
dZ xZS )DebertaEncoderz8Modified BertEncoder with relative position bias supportc                    t         |           t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        t        |dd      | _	        | j                  rdt        |dd      | _
        | j                  dk  r|j                  | _
        t        j                  | j                  dz  |j                        | _        d| _        y c c}w )Nr   Frw   r*   r   r,   )r   r   r   
ModuleListr   num_hidden_layersr  layerr   r   rw   r   r   rC   r   gradient_checkpointing)r#   rK   _r&   s      r'   r   zDebertaEncoder.__init__  s    ]]%H`H`Ba#bQL$8#bc
")&2F"N""*1&:RTV*WD'**Q..4.L.L+"$,,t/J/JQ/NPVPbPb"cD&+# $cs   Cc                 R    | j                   r| j                  j                  }|S d }|S r   )r   r   r   )r#   r   s     r'   get_rel_embeddingz DebertaEncoder.get_rel_embedding  s0    7;7N7N,,33 UYr(   c                     |j                         dk  rE|j                  d      j                  d      }||j                  d      j                  d      z  }|S |j                         dk(  r|j                  d      }|S )Nr,   r   rS   r*   r	   )r~   rZ   r   )r#   r   extended_attention_masks      r'   get_attention_maskz!DebertaEncoder.get_attention_mask#  s    1$&4&>&>q&A&K&KA&N#47N7V7VWY7Z7d7deg7hhN  !Q&+55a8Nr(   c                 Z    | j                   r||t        ||      }|S t        ||      }|S r   )r   rb   )r#   r3   r   rg   s       r'   get_rel_poszDebertaEncoder.get_rel_pos,  s>    ""|';'6|]S   7}mTr(   r3   r   output_hidden_statesr   return_dictc           	      ^   | j                  |      }| j                  |||      }|r|fnd }|rdnd }	|}
| j                         }t        | j                        D ].  \  }} ||
|||||      \  }}|r||fz   }||}n|}
|s)|	|fz   }	0 |st        d |||	fD              S t        |||	      S )N )r   rg   r   r   c              3   &   K   | ]	  }||  y wr   r  ).0r   s     r'   	<genexpr>z)DebertaEncoder.forward.<locals>.<genexpr>]  s     hqZ[Zghs   last_hidden_stater3   
attentions)r  r  r  	enumerater  r   r   )r#   r3   r   r  r   r   rg   r  all_hidden_statesall_attentionsnext_kvr   r   layer_moduleatt_ms                  r'   r7   zDebertaEncoder.forward4  s     00@''|\ROcM;Kim0d//1(4 	;OA|#/))-"3$ M5 $$58H$H!',' !/5(!:'	;* h]4E~$Vhhh+;LYg
 	
r(   )NN)TFNNT)r9   r:   r;   r<   r   r  r  r  r   r   r   r7   r=   r>   s   @r'   r  r    sh    B	, &*"' ,
||,
 ,
 #	,

  ,
 ,
r(   r  c                   ,    e Zd ZU eed<   dZdgZdZd Zy)DebertaPreTrainedModelrK   debertar   Tc                 6   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                  t        f      rJ|j                  j                  j                  d       |j                  j                  j                          yt        |t              rI|j                   j                  j                          |j"                  j                  j                          yt        |t$        t&        f      r%|j                  j                  j                          yy)zInitialize the weights.g        )r/   stdNg      ?)r   r   rB   r   datanormal_rK   initializer_ranger!   zero_r   r   rF   r   fill_r   r   r   LegacyDebertaLMPredictionHeadDebertaLMPredictionHead)r#   modules     r'   _init_weightsz$DebertaPreTrainedModel._init_weightsj  sk   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> ./? @AMM$$S)KK""$ 9:MM$$&MM$$&!>@W XYKK""$ Zr(   N)	r9   r:   r;   r   __annotations__base_model_prefix"_keys_to_ignore_on_load_unexpectedsupports_gradient_checkpointingr9  r  r(   r'   r-  r-  c  s"    !*?)@&&*#%r(   r-  c                       e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee   dee   dee   deeef   fd       Z xZS )DebertaModelc                     t         |   |       t        |      | _        t	        |      | _        d| _        || _        | j                          y Nr   )	r   r   r   r   r  encoderz_stepsrK   	post_initrJ   s     r'   r   zDebertaModel.__init__  s@     +F3%f-r(   c                 .    | j                   j                  S r   r   r   r#   s    r'   get_input_embeddingsz!DebertaModel.get_input_embeddings  s    ...r(   c                 &    || j                   _        y r   rF  r#   new_embeddingss     r'   set_input_embeddingsz!DebertaModel.set_input_embeddings  s    *8'r(   c                     t        d      )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        z7The prune function is not implemented in DeBERTa model.)NotImplementedError)r#   heads_to_prunes     r'   _prune_headszDebertaModel._prune_heads  s    
 ""[\\r(   r   r   r   r   r   r   r  r  r   c	           	      |   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }	n!||j                         d d }	nt	        d      ||j                  n|j                  }
|t        j                  |	|
      }|&t        j                  |	t        j                  |
      }| j                  |||||      }| j                  ||d||      }|d	   }| j                  d	kD  r|d
   }t        | j                        D cg c]  }| j                  j                   d    }}|d   }| j                  j#                         }| j                  j%                  |      }| j                  j'                  |      }|d	d  D ]!  } |||d|||      }|j)                  |       # |d   }|s|f||rd	d  z   S dd  z   S t+        ||r|j,                  nd |j.                        S c c}w )NzDYou cannot specify both input_ids and inputs_embeds at the same timer*   z5You have to specify either input_ids or inputs_embeds)rU   rT   )r   r   r   r   r   T)r  r   r  r   rS   Fr
  r,   r#  )rK   r   r  use_return_dictr   %warn_if_padding_and_no_attention_maskr$   rU   r   r   r    rW   r   rB  rC  r   r  r  r  r  appendr   r3   r%  )r#   r   r   r   r   r   r   r  r  r   rU   embedding_outputencoder_outputsencoded_layersr3   r  layersr   r   rel_posr  sequence_outputs                         r'   r7   zDebertaModel.forward  sz    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU%.%:!!@T@T!"ZZFCN!"[[EJJvVN??)%' + 
 ,,!%/# ' 
 )+<<!*2.M6;DLL6IJdll((,JFJ)"-L!\\;;=N!\\<<^LNll../?@G 	4$!"&+!-!(#1  %%l3	4 ),#%>R8\(]]]XY8\(]]]-;O/77UY&11
 	
+ Ks    H9)NNNNNNNN)r9   r:   r;   r   rH  rL  rP  r   r   r   r   r   r   r   r   r7   r=   r>   s   @r'   r?  r?    s    /9]  -11515/304,0/3&*N
ELL)N
 !.N
 !.	N

 u||,N
  -N
 $D>N
 'tnN
 d^N
 
uo%	&N
 N
r(   r?  c                   $     e Zd Z fdZd Z xZS )$LegacyDebertaPredictionHeadTransformc                    t         |           t        |d|j                        | _        t        j                  |j                  | j                        | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  | j                  |j                        | _        y )Nr   )r%   )r   r   r   rC   r   r   rB   rD   r   r   r   r
   transform_act_fnrF   rE   rJ   s     r'   r   z-LegacyDebertaPredictionHeadTransform.__init__  s    %f.>@R@RSYYv1143F3FG
f''-$*6+<+<$=D!$*$5$5D!d&9&9v?T?TUr(   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rD   r^  rF   r   s     r'   r7   z,LegacyDebertaPredictionHeadTransform.forward  s4    

=1--m<}5r(   rQ   r>   s   @r'   r\  r\    s    	Vr(   r\  c                   *     e Zd Z fdZd Zd Z xZS )r6  c                    t         |           t        |      | _        t	        |d|j
                        | _        t        j                  | j                  |j                  d      | _
        t        j                  t        j                  |j                              | _        | j                  | j                  _        y )Nr   Fr   )r   r   r\  	transformr   rC   r   r   rB   r   decoderr   r   r    r!   rJ   s     r'   r   z&LegacyDebertaLMPredictionHead.__init__  s    =fE%f.>@R@RS yy!4!4f6G6GeTLLV->->!?@	 !IIr(   c                 :    | j                   | j                  _         y r   )r!   rc  rG  s    r'   _tie_weightsz*LegacyDebertaLMPredictionHead._tie_weights  s     IIr(   c                 J    | j                  |      }| j                  |      }|S r   )rb  rc  r   s     r'   r7   z%LegacyDebertaLMPredictionHead.forward  s$    }5]3r(   )r9   r:   r;   r   re  r7   r=   r>   s   @r'   r6  r6    s    &&r(   r6  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )LegacyDebertaOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r   r   r6  predictionsrJ   s     r'   r   z!LegacyDebertaOnlyMLMHead.__init__  s    8@r(   rZ  r   c                 (    | j                  |      }|S r   )rj  )r#   rZ  prediction_scoress      r'   r7   z LegacyDebertaOnlyMLMHead.forward  s     ,,_=  r(   r   r>   s   @r'   rh  rh    s$    A!u|| ! !r(   rh  c                   (     e Zd ZdZ fdZd Z xZS )r7  zMhttps://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270c                    t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                  d      | _        t        j                  t        j                  |j                               | _        y )NT)r%   elementwise_affine)r   r   r   rB   rC   rD   r   r   r   r
   r^  rF   rE   r   r   r    r   r!   rJ   s     r'   r   z DebertaLMPredictionHead.__init__$  s    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>ShlmLLV->->!?@	r(   c                     | j                  |      }| j                  |      }| j                  |      }t        j                  ||j
                  j                               | j                  z   }|S r   )rD   r^  rF   r   r   r   r   r!   )r#   r3   r   s      r'   r7   zDebertaLMPredictionHead.forward2  sd    

=1--m<
 ]O4J4J4L4L4NORVR[R[[r(   r8   r>   s   @r'   r7  r7  !  s    WAr(   r7  c                   $     e Zd Z fdZd Z xZS )DebertaOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r   r   r7  lm_headrJ   s     r'   r   zDebertaOnlyMLMHead.__init__=  s    .v6r(   c                 *    | j                  ||      }|S r   )rt  )r#   rZ  r   rl  s       r'   r7   zDebertaOnlyMLMHead.forwardB  s     LL/J  r(   rQ   r>   s   @r'   rr  rr  <  s    7
!r(   rr  c                   8    e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee   dee   dee   deeef   fd       Z xZS )DebertaForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t         |   |       |j                  | _        t        |      | _        | j                  rt        |      | _        nddg| _        t        |      | _	        | j                          y )Nzlm_predictions.lm_head.weightz)deberta.embeddings.word_embeddings.weight)r   r   legacyr?  r.  rh  cls_tied_weights_keysrr  lm_predictionsrD  rJ   s     r'   r   zDebertaForMaskedLM.__init__K  sa     mm#F+;;/7DH'FHs&tD#"4V"<D 	r(   c                     | j                   r | j                  j                  j                  S | j                  j
                  j                  S r   )ry  rz  rj  rc  r|  rt  rD   rG  s    r'   get_output_embeddingsz(DebertaForMaskedLM.get_output_embeddingsX  s7    ;;88''///&&..444r(   c                    | j                   rA|| j                  j                  _        |j                  | j                  j                  _        y || j
                  j                  _        |j                  | j
                  j                  _        y r   )ry  rz  rj  rc  r!   r|  rt  rD   rJ  s     r'   set_output_embeddingsz(DebertaForMaskedLM.set_output_embeddings^  sa    ;;+9DHH  ((6(;(;DHH  %0>D''-/=/B/BD'',r(   r   r   r   r   r   labelsr   r  r  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  r| j	                  |      }n0| j                  || j                  j                  j                        }d}|Ft               } ||j                  d| j                   j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r   r   r   r   r  r  r   r*   r   losslogitsr3   r%  )rK   rR  r.  ry  rz  r|  r   r   r   rX   r   r   r3   r%  )r#   r   r   r   r   r   r  r   r  r  outputsrZ  rl  masked_lm_lossloss_fctr   s                   r'   r7   zDebertaForMaskedLM.forwardf  s!   ( &1%<k$++B]B],,))%'/!5#  	
 "!*;; $ 9 $ 3 3OT\\E\E\ElEl m')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r(   	NNNNNNNNN)r9   r:   r;   r{  r   r~  r  r   r   r   r   r   r   r   r   r7   r=   r>   s   @r'   rw  rw  G  s    :<Z[5C  -11515/304)-,0/3&*4
ELL)4
 !.4
 !.	4

 u||,4
  -4
 &4
 $D>4
 'tn4
 d^4
 
un$	%4
 4
r(   rw  c                   4     e Zd Z fdZd Zed        Z xZS )ContextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        || _	        y r   )
r   r   r   rB   pooler_hidden_sizerD   rG   pooler_dropoutrI   rK   rJ   s     r'   r   zContextPooler.__init__  sI    YYv88&:S:ST
zz&"7"78r(   c                     |d d df   }| j                  |      }| j                  |      }t        | j                  j                     |      }|S rA  )rI   rD   r
   rK   pooler_hidden_act)r#   r3   context_tokenpooled_outputs       r'   r7   zContextPooler.forward  sM     &ad+]3

=1t{{<<=mLr(   c                 .    | j                   j                  S r   )rK   rC   rG  s    r'   
output_dimzContextPooler.output_dim  s    {{&&&r(   )r9   r:   r;   r   r7   propertyr  r=   r>   s   @r'   r  r    s!     ' 'r(   r  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   0    e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
ee
   dee
   dee
   deeef   fd       Z xZS ) DebertaForSequenceClassificationc                    t         |   |       t        |dd      }|| _        t	        |      | _        t        |      | _        | j                  j                  }t        j                  ||      | _        t        |dd       }|| j                  j                  n|}t        j                  |      | _        | j!                          y )N
num_labelsr,   cls_dropout)r   r   r   r  r?  r.  r  poolerr  r   rB   
classifierrK   rH   rG   rI   rD  )r#   rK   r  r  drop_outr&   s        r'   r   z)DebertaForSequenceClassification.__init__  s     V\15
$#F+#F+[[++
))J
;6=$76>6F4;;22Hzz(+ 	r(   c                 6    | j                   j                         S r   )r.  rH  rG  s    r'   rH  z5DebertaForSequenceClassification.get_input_embeddings  s    ||0022r(   c                 :    | j                   j                  |       y r   )r.  rL  rJ  s     r'   rL  z5DebertaForSequenceClassification.set_input_embeddings  s    )).9r(   r   r   r   r   r   r  r   r  r  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }| j	                  |      }| j                  |      }d}|| j                   j                  | j                  dk(  rXt        j                         }|j                  d      j                  |j                        } |||j                  d            }n_|j                         dk(  s|j                  d      dk(  r|dk\  j                         }|j!                         }|j                  d      dkD  rt#        j$                  |d|j'                  |j                  d      |j                  d                  }t#        j$                  |d|j                  d            }t)               } ||j                  d| j                        j+                         |j                  d            }nIt#        j,                  d      j                  |      }n#t        j.                  d      } ||      |z  j1                  d      j3                          }n| j                   j                  dk(  rIt               }| j                  dk(  r& ||j5                         |j5                               }n |||      }n| j                   j                  dk(  r=t)               } ||j                  d| j                        |j                  d            }n,| j                   j                  dk(  rt7               } |||      }|	s|f|
dd z   }||f|z   S |S t9        |||
j:                  |
j<                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r   r   r  r  r   r   r*   
regressionsingle_label_classificationmulti_label_classificationr  )rK   rR  r.  r  rI   r  problem_typer  r   r   rX   r2   r-   r~   r$   nonzerorW   r   r   re   r   r.   rr   
LogSoftmaxsumr/   r   r   r   r3   r%  )r#   r   r   r   r   r   r  r   r  r  r  encoder_layerr  r  r  loss_fnlabel_indexlabeled_logitsr  log_softmaxr   s                        r'   r7   z(DebertaForSequenceClassification.forward  s   & &1%<k$++B]B],,))%'/!5#  	
  
M2]3/{{''/??a' jjlG#[[_//=F"66;;r?;DZZ\Q&&++b/Q*>#)Q;"7"7"9K#[[]F"''*Q.)."A{'9'9+:J:J1:Mv{{[\~'^* "'fa9I9I"9M!N#3#5'(;(;B(P(V(V(XZ`ZeZefhZij$||A11&9"$--"3K)&1F:??CIIKKD))\9"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'fG4I4IV]VhVh
 	
r(   r  )r9   r:   r;   r   rH  rL  r   r   r   r   r   r   r   r   r7   r=   r>   s   @r'   r  r    s    $3:  -11515/304)-,0/3&*M
ELL)M
 !.M
 !.	M

 u||,M
  -M
 &M
 $D>M
 'tnM
 d^M
 
u..	/M
 M
r(   r  c                   $    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     dee   d	ee   d
ee   de	e
ef   fd       Z xZS )DebertaForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   )r   r   r  r?  r.  r   rG   rH   rI   rB   rC   r  rD  rJ   s     r'   r   z&DebertaForTokenClassification.__init__&  si      ++#F+zz&"<"<=))F$6$68I8IJ 	r(   r   r   r   r   r   r  r   r  r  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r*   r   r  )rK   rR  r.  rI   r  r   rX   r  r   r3   r%  )r#   r   r   r   r   r   r  r   r  r  r  rZ  r  r  r  r   s                   r'   r7   z%DebertaForTokenClassification.forward1  s    " &1%<k$++B]B],,))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$fG4I4IV]VhVh
 	
r(   r  )r9   r:   r;   r   r   r   r   r   r   r   r   r   r7   r=   r>   s   @r'   r  r  $  s    	  -11515/304)-,0/3&*-
ELL)-
 !.-
 !.	-

 u||,-
  --
 &-
 $D>-
 'tn-
 d^-
 
u++	,-
 -
r(   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ef   fd       Z xZS )DebertaForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r   r   r  r?  r.  r   rB   rC   
qa_outputsrD  rJ   s     r'   r   z$DebertaForQuestionAnswering.__init__d  sS      ++#F+))F$6$68I8IJ 	r(   r   r   r   r   r   start_positionsend_positionsr   r  r  r   c           
      &   |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r   r*   r   )ignore_indexr,   )r  start_logits
end_logitsr3   r%  )rK   rR  r.  r  splitr   r   r   r$   r   r   r   r3   r%  )r#   r   r   r   r   r   r  r  r   r  r  r  rZ  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                         r'   r7   z#DebertaForQuestionAnswering.forwardn  s    &1%<k$++B]B],,))%'/!5#  	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r(   )
NNNNNNNNNN)r9   r:   r;   r   r   r   r   r   r   r   r   r   r7   r=   r>   s   @r'   r  r  b  s      -11515/3042604,0/3&*<
ELL)<
 !.<
 !.	<

 u||,<
  -<
 "%,,/<
  -<
 $D><
 'tn<
 d^<
 
u22	3<
 <
r(   r  )rw  r  r  r  r?  r-  )Br<   typingr   r   r   torch.utils.checkpointr   torch.nnr   r   r   activationsr
   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_debertar   
get_loggerr9   loggerModuler   r@   jitscriptrb   rh   rj   rn   r   r   rs   rv   r{   r   r   r   r   r   r  r  r  r-  r?  r\  r6  rh  r7  rr  rw  r  r  r  r  __all__r  r(   r'   <module>r     sg    "    A A ! 9  . , 0 
		H	%ryy (		   8 r r n n [ [ \%,, \c \ \ ELL U\\   d d dgj d d    C		 CLN		 Nb,ryy ,F")) BII (- (BO
RYY O
d %_ % %8 g
) g
 g
T299 &BII 2!ryy !bii 6! ! S
/ S
 S
l'BII ', g
'= g
g
T :
$: :
 :
z H
"8 H
 H
Vr(   