
    rh7                        d Z ddlZddlmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)  e'jT                  e+      Z,dZ- G d dej\                        Z/ G d dej\                        Z0 G d dej\                        Z1de0iZ2 G d dej\                        Z3 G d dej\                        Z4 G d dej\                        Z5 G d d e      Z6 G d! d"ej\                        Z7 G d# d$ej\                        Z8e& G d% d&e              Z9e& G d' d(e9             Z: e&d)*       G d+ d,e9e             Z;e& G d- d.e9             Z< G d/ d0ej\                        Z= e&d1*       G d2 d3e9             Z>e& G d4 d5e9             Z?e& G d6 d7e9             Z@ G d8 d9ej\                        ZAe& G d: d;e9             ZBd>d<ZCg d=ZDy)?zPyTorch Data2VecText model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)CacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )Data2VecTextConfig   c                   2     e Zd ZdZ fdZ	 ddZd Z xZS )Data2VecTextForTextEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       |j                  | _        t        j                  |j                  |j
                  | j6                        | _	        y )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr'   register_buffertorcharangeexpandzerosr)   sizelongr$   selfconfig	__class__s     /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/data2vec/modeling_data2vec_text.pyr0   z&Data2VecTextForTextEmbeddings.__init__:   si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
     c                    |+|t        || j                  |      }n| j                  |      }||j                         }n|j                         d d }|d   }|st	        | d      r-| j
                  d d d |f   }|j                  |d   |      }	|	}n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j!                  |      }| j#                  |      }|S )Nr*   r   r,   r   r.   devicer(   )"create_position_ids_from_input_idsr$   &create_position_ids_from_inputs_embedsrE   hasattrr,   rC   rA   rD   rF   r)   rO   r5   r9   r'   r7   r:   r>   )rH   	input_idsr,   r)   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr9   
embeddingsr7   s                rK   forwardz%Data2VecTextForTextEmbeddings.forwardS   sR    $A)TM]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
rL   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr*   r   rN   r   )rE   rA   rB   r$   rF   rO   	unsqueezerC   )rH   rT   rV   sequence_lengthr)   s        rK   rQ   zDData2VecTextForTextEmbeddings.create_position_ids_from_inputs_embeds{   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<rL   )NNNNr   )__name__
__module____qualname____doc__r0   r[   rQ   __classcell__rJ   s   @rK   r"   r"   4   s    

4 rs&P=rL   r"   c                        e Zd Zd fd	Z	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     dee   dee	   deej
                     d	e
ej
                     fd
Z xZS )Data2VecTextSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        || _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r'   r(   relative_keyrelative_key_queryr    r   )r/   r0   r3   num_attention_headsrR   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer<   attention_probs_dropout_probr>   r?   r'   r6   r1   distance_embedding
is_decoder	layer_idxrH   rI   r'   rx   rJ   s       rK   r0   z"Data2VecTextSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++"rL   hidden_statesattention_mask	head_maskencoder_hidden_statespast_key_valueoutput_attentionscache_positionreturnc                 	   |j                   \  }}	}
| j                  |      }|j                  |d| j                  | j                        j                  dd      }|d u}|St        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|IrGj                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }|j                  |d| j                  | j                        j                  dd      }| j#                  |      }|j                  |d| j                  | j                        j                  dd      }|D|s|nd }j%                  ||| j                  d|i      \  }}|rd|j                  | j                  <   t'        j(                  ||j                  dd            }| j*                  dk(  s| j*                  dk(  r|j                   d   |j                   d   }}|Dt'        j,                  |dz
  t&        j.                  |j0                  	      j                  dd      }n@t'        j2                  |t&        j.                  |j0                  	      j                  dd      }t'        j2                  |t&        j.                  |j0                  	      j                  dd      }||z
  }| j5                  || j6                  z   dz
        }|j9                  |j:                  
      }| j*                  dk(  rt'        j<                  d||      }||z   }nE| j*                  dk(  r6t'        j<                  d||      }t'        j<                  d||      }||z   |z   }|t?        j@                  | j                        z  }|||z   }tB        jD                  jG                  |d      }| jI                  |      }|||z  }t'        j(                  ||      }|jK                  dddd      jM                         }|jO                         d d | jP                  fz   }|j                  |      }||fS )Nr*   r   r    r   Trj   rk   rN   r-   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r	   ))shaperr   viewrl   ro   	transpose
isinstancer   
is_updatedgetrx   cross_attention_cacheself_attention_cachelayerskeysvaluesrs   rt   updaterA   matmulr'   tensorrF   rO   rB   rv   r6   tor.   einsummathsqrtr   
functionalsoftmaxr>   permute
contiguousrE   rp   )rH   rz   r{   r|   r}   r~   r   r   
batch_sizerW   _query_layeris_cross_attentionr   curr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                  rK   r[   z!Data2VecTextSelfAttention.forward   sN    %2$7$7!
Jjj/!&&z2t7O7OQUQiQijttq
 3$>%.*=>+66::4>>J
%*8*N*N'*8*M*M'&4#2D.-."<+224>>BGGI-44T^^DKKK0I!z2t7O7OQUQiQijtt1I **^4K%**B 8 8$:R:Ri1o  )7It)<)C)C{DNN=M~<^*&	; &@DN--dnn= !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L)!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCo--rL   NNNNNNFN)r_   r`   ra   r0   rA   Tensorr   FloatTensorr   booltupler[   rc   rd   s   @rK   rf   rf      s    #< 7;15=A*.,115d.||d. !!2!23d. E--.	d.
  ((9(9:d. !d. $D>d. !.d. 
u||	d.rL   rf   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )Data2VecTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr%   )r/   r0   r   rq   r3   denser:   r;   r<   r=   r>   rG   s     rK   r0   zData2VecTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rL   rz   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S Nr   r>   r:   rH   rz   r   s      rK   r[   zData2VecTextSelfOutput.forward  7    

=1]3}|'CDrL   r_   r`   ra   r0   rA   r   r[   rc   rd   s   @rK   r   r     1    >U\\  RWR^R^ rL   r   eagerc                        e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     dee	   dee
   d	eej                     d
eej                     fdZ xZS )Data2VecTextAttentionc                     t         |           t        |j                     |||      | _        t        |      | _        t               | _        y )Nr'   rx   )	r/   r0   $DATA2VEC_TEXT_SELF_ATTENTION_CLASSES_attn_implementationrH   r   outputsetpruned_headsry   s       rK   r0   zData2VecTextAttention.__init__'  sF    89T9TU$;
	
 -V4ErL   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )lenr   rH   rl   ro   r   r   rr   rs   rt   r   r   rp   union)rH   headsindexs      rK   prune_headsz!Data2VecTextAttention.prune_heads1  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rL   rz   r{   r|   r}   r~   r   r   r   c           	      r    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S )Nr{   r|   r}   r~   r   r   r   r   )rH   r   )rH   rz   r{   r|   r}   r~   r   r   self_outputsattention_outputoutputss              rK   r[   zData2VecTextAttention.forwardC  s\     yy)"7)/) ! 
  ;;|AF#%QR(88rL   r   r   )r_   r`   ra   r0   r   rA   r   r   r   r   r   r   r[   rc   rd   s   @rK   r   r   &  s    ";* 7;15=A*.,115|| !!2!23 E--.	
  ((9(9: ! $D> !. 
u||	rL   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Data2VecTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r/   r0   r   rq   r3   intermediate_sizer   r   
hidden_actstrr
   intermediate_act_fnrG   s     rK   r0   z!Data2VecTextIntermediate.__init__]  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$rL   rz   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )rH   rz   s     rK   r[   z Data2VecTextIntermediate.forwarde  s&    

=100?rL   r   rd   s   @rK   r   r   \  s#    9U\\ ell rL   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )Data2VecTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r/   r0   r   rq   r   r3   r   r:   r;   r<   r=   r>   rG   s     rK   r0   zData2VecTextOutput.__init__m  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rL   rz   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      rK   r[   zData2VecTextOutput.forwards  r   rL   r   rd   s   @rK   r   r   l  r   rL   r   c                       e Zd Zd fd	Z	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     dee   dee	   d	eej
                     d
e
ej
                     fdZd Z xZS )Data2VecTextLayerc                 l   t         |           |j                  | _        d| _        t	        ||      | _        |j                  | _        |j                  | _        | j                  r-| j                  st        |  d      t	        |d|      | _	        t        |      | _        t        |      | _        y )Nr   rx   z> should be used as a decoder model if cross attention is addedr(   r   )r/   r0   chunk_size_feed_forwardseq_len_dimr   	attentionrw   add_cross_attentionrm   crossattentionr   intermediater   r   )rH   rI   rx   rJ   s      rK   r0   zData2VecTextLayer.__init__|  s    '-'E'E$.vK ++#)#=#= ##?? D6)g!hii"7
i#D 5V<(0rL   rz   r{   r|   r}   encoder_attention_maskr~   r   r   r   c	           	      H   | j                  ||||||      }	|	d   }
|	dd  }| j                  rB|@t        | d      st        d|  d      | j	                  |
||||||      }|d   }
||dd  z   }t        | j                  | j                  | j                  |
      }|f|z   }|S )N)r{   r|   r   r~   r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   rw   rR   rm   r   r   feed_forward_chunkr   r   )rH   rz   r{   r|   r}   r   r~   r   r   self_attention_outputsr   r   cross_attention_outputslayer_outputs                 rK   r[   zData2VecTextLayer.forward  s    "&)/)) "0 "
 2!4(,??4@4!12 =dV DD D 
 '+&9&9 5#&;-"3- ': '#  7q9 7 ;;G0##T%A%A4CSCSUe
  /G+rL   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )rH   r   intermediate_outputr   s       rK   r   z$Data2VecTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIrL   r   )NNNNNFN)r_   r`   ra   r0   rA   r   r   r   r   r   r   r[   r   rc   rd   s   @rK   r   r   {  s    1& 7;15=A>B*.,115.||. !!2!23. E--.	.
  ((9(9:. !)):): ;. !. $D>. !.. 
u||	.`rL   r   c                   f    e Zd Zd fd	Z	 	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	ee	   d
ee	   dee	   deej
                     de
eej
                     ef   fdZ xZS )Data2VecTextEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _	        y c c}w )Nr   F)
r/   r0   rI   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rH   rI   rx   irJ   s       rK   r0   zData2VecTextEncoder.__init__  sS    ]]TYZ`ZrZrTs#tq$5f$J#tu
&+# $us   A%rz   r{   r|   r}   r   past_key_values	use_cacher   output_hidden_statesreturn_dictr   r   c                    |	rdnd }|rdnd }|r| j                   j                  rdnd }| j                  r%| j                  r|rt        j                  d       d}d}|rR| j                   j                  r<t        |t              s,t        j                  d       d}t        j                  |      }t        | j                        D ]W  \  }}|	r||fz   }|||   nd } |||||||||      }|d   }|s/||d   fz   }| j                   j                  sO||d	   fz   }Y |	r||fz   }|r|j                         }|
st        d
 |||||fD              S t        |||||      S )N zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.T)r   r~   r   r   r   r   r    c              3   $   K   | ]  }|| 
 y wr   r  ).0vs     rK   	<genexpr>z.Data2VecTextEncoder.forward.<locals>.<genexpr>  s      
 = 
s   )last_hidden_stater   rz   
attentionscross_attentions)rI   r   r   trainingloggerwarning_oncerw   r   r   r   from_legacy_cache	enumerater   to_legacy_cacher   r   )rH   rz   r{   r|   r}   r   r   r  r   r  r  r   all_hidden_statesall_self_attentionsall_cross_attentionsreturn_legacy_cacher   layer_modulelayer_head_masklayer_outputss                       rK   r[   zData2VecTextEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#//
?TY8Z\
 #'1CCOTO(4 	VOA|#$58H$H!.7.CilO(%'=."3-	M *!,M &9]1=M<O&O#;;22+?=QRCSBU+U(+	V.   1]4D D-==?O 
 "#%'(
 
 
 9+++*1
 	
rL   r   )
NNNNNNFFTN)r_   r`   ra   r0   rA   r   r   r   r   r   r   r   r[   rc   rd   s   @rK   r   r     s"   , 7;15=A>BEI$(,1/4&*15R
||R
 !!2!23R
 E--.	R

  ((9(9:R
 !)):): ;R
 "%e.?.?(@"ABR
 D>R
 $D>R
 'tnR
 d^R
 !.R
 
uU\\"$MM	NR
rL   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Data2VecTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r/   r0   r   rq   r3   r   Tanh
activationrG   s     rK   r0   zData2VecTextPooler.__init__!  s9    YYv1163E3EF
'')rL   rz   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   r  )rH   rz   first_token_tensorpooled_outputs       rK   r[   zData2VecTextPooler.forward&  s6     +1a40

#566rL   r   rd   s   @rK   r  r     s#    $
U\\ ell rL   r  c                   .    e Zd ZU eed<   dZdZddgZd Zy)Data2VecTextPreTrainedModelrI   data2vec_textTr"   r   c                 f   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        r|t        |d      r0|j                  $|j                  j                  j                          t        |d      r3|j                  &|j                  j                  j                  d       yyyy)zInitialize the weightsg        )meanstdNbiasweightg      ?)r   r   rq   r*  datanormal_rI   initializer_ranger)  zero_r1   r$   r:   rR   fill_)rH   modules     rK   _init_weightsz)Data2VecTextPreTrainedModel._init_weights6  s5   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-vv&6;;+B  &&(vx(V]]-F""((- .G( .rL   N)	r_   r`   ra   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr1  r  rL   rK   r$  r$  /  s%    '&*#8:MN.rL   r$  c            "           e Zd ZdZd fd	Zd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e
j                     de	e
j                     de	ee
j                        de	e   de	e   de	e   de	e   de	e
j                     deee
j                     ef   fd       Z xZS )Data2VecTextModela2  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762

    c                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r/   r0   rI   r"   rZ   r   encoderr  pooler	post_init)rH   rI   add_pooling_layerrJ   s      rK   r0   zData2VecTextModel.__init__Z  sN    
 	 7?*624E(04 	rL   c                 .    | j                   j                  S r   rZ   r5   rH   s    rK   get_input_embeddingsz&Data2VecTextModel.get_input_embeddingsj  s    ...rL   c                 &    || j                   _        y r   r>  )rH   rt   s     rK   set_input_embeddingsz&Data2VecTextModel.set_input_embeddingsm  s    */'rL   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr9  r   r   r   )rH   heads_to_pruner   r   s       rK   _prune_headszData2VecTextModel._prune_headsp  sE    
 +002 	CLE5LLu%//;;EB	CrL   rS   r{   r,   r)   r|   rT   r}   r   r   r  r   r  r  r   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                   j                  r|
|
n| j                   j
                  }
nd}
||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}||j                  n|j                  }d}|	5t        |	t              s|	d   d   j                  d   n|	j                         }|t        j                  |||z   f|      }|pt!        | j"                  d      r4| j"                  j$                  d d d |f   }|j'                  ||      }|}n&t        j(                  |t        j*                  |	      }| j-                  ||      }| j                   j                  rE|C|j                         \  }}}||f}|t        j                  ||      }| j/                  |      }nd }| j1                  || j                   j2                        }| j#                  |||||
      }| j5                  ||||||	|
||||      }|d   }| j6                  | j7                  |      nd }|s
||f|dd  z   S t9        |||j:                  |j<                  |j>                  |j@                        S )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer*   z5You have to specify either input_ids or inputs_embedsr   r   )rO   r,   rN   )rS   r)   r,   rT   rU   )
r{   r|   r}   r   r   r  r   r  r  r   r   )r
  pooler_outputr   rz   r  r  )!rI   r   r  use_return_dictrw   r  rm   %warn_if_padding_and_no_attention_maskrE   rO   r   r   r   get_seq_lengthrA   onesrR   rZ   r,   rC   rD   rF   get_extended_attention_maskinvert_attention_maskget_head_maskr   r9  r:  r   r   rz   r  r  ) rH   rS   r{   r,   r)   r|   rT   r}   r   r   r  r   r  r  r   rV   r   rW   rO   rU   rX   rY   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputr"  s                                    rK   r[   zData2VecTextModel.forwardx  s4   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"& "/59  "1%++B/$335 # !"ZZ*jCY6Y)ZdjkN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y$++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5#) ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
rL   )T)NNNNNNNNNNNNNN)r_   r`   ra   rb   r0   r@  rB  rF  r   r   rA   r   listr   r   r   r   r   r[   rc   rd   s   @rK   r7  r7  I  s    /0C  -11515/3,0048<9==A$(,0/3&*15s
ELL)s
 !.s
 !.	s

 u||,s
 ELL)s
  -s
  (5s
 !) 6s
 "$u'8'8"9:s
 D>s
 $D>s
 'tns
 d^s
 !.s
  
uU\\"$PP	Q!s
 s
rL   r7  zX
    Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc            $           e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     dee	j                     deeee	j                           dee   dee   dee   dee   dee	j                     deeef   f d       Z xZS )Data2VecTextForCausalLMlm_head.decoder.weightlm_head.decoder.biasc                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzTIf you want to use `Data2VecTextLMHeadModel` as a standalone, add `is_decoder=True.`Fr<  
r/   r0   rw   r  warningr7  r%  Data2VecTextLMHeadlm_headr;  rG   s     rK   r0   z Data2VecTextForCausalLM.__init__  sM       NNqr.vO)&1 	rL   c                 .    | j                   j                  S r   rc  decoderr?  s    rK   get_output_embeddingsz-Data2VecTextForCausalLM.get_output_embeddings      ||###rL   c                 &    || j                   _        y r   re  rH   new_embeddingss     rK   set_output_embeddingsz-Data2VecTextForCausalLM.set_output_embeddings      -rL   rS   r{   r,   r)   r|   rT   r}   r   labelsr   r  r   r  r  r   r   c                    ||n| j                   j                  }|	d}| j                  |||||||||
|||||      }|d   }| j                  |      }d}|	* | j                  ||	fd| j                   j
                  i|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )aA  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Data2VecTextForCausalLM, Data2VecTextConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
        >>> config = Data2VecTextConfig.from_pretrained("facebook/data2vec-text-base")
        >>> config.is_decoder = True
        >>> model = Data2VecTextForCausalLM.from_pretrained("facebook/data2vec-text-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NF)r{   r,   r)   r|   rT   r}   r   r   r  r   r  r  r   r   r2   r    )losslogitsr   rz   r  r  )rI   rI  r%  rc  loss_functionr2   r   r   rz   r  r  )rH   rS   r{   r,   r)   r|   rT   r}   r   rn  r   r  r   r  r  r   kwargsr   rW  prediction_scoreslm_lossr   s                         rK   r[   zData2VecTextForCausalLM.forward	  s&   T &1%<k$++B]B]I$$))%'"7#9+/!5#) % 
" "!* LL9(d((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
rL   )NNNNNNNNNNNNNNN)r_   r`   ra   _tied_weights_keysr0   rg  rl  r   r   rA   
LongTensorr   r   r   r   r   r   r[   rc   rd   s   @rK   r[  r[    s    34JK
$.  156:59371559=A>B-1EI$(,0/3&*15!U
E,,-U
 !!2!23U
 !!1!12	U

 u//0U
 E--.U
   1 12U
  ((9(9:U
 !)):): ;U
 ))*U
 "%e.?.?(@"ABU
 D>U
 $D>U
 'tnU
 d^U
  !.!U
$ 
u77	8%U
 U
rL   r[  c                       e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee   dee   dee   deeef   fd       Z xZS )Data2VecTextForMaskedLMr\  r]  c                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzsIf you want to use `Data2VecTextForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr_  r`  rG   s     rK   r0   z Data2VecTextForMaskedLM.__init__f  sS     NN1
 /vO)&1 	rL   c                 .    | j                   j                  S r   re  r?  s    rK   rg  z-Data2VecTextForMaskedLM.get_output_embeddingsu  rh  rL   c                 &    || j                   _        y r   re  rj  s     rK   rl  z-Data2VecTextForMaskedLM.set_output_embeddingsx  rm  rL   rS   r{   r,   r)   r|   rT   r}   r   rn  r   r  r  r   c                    ||n| j                   j                  }| j                  |||||||||
||      }|d   }| j                  |      }d}|	at	               }|	j                  |j                        }	 ||j                  d| j                   j                        |	j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        N)
r{   r,   r)   r|   rT   r}   r   r   r  r  r   r*   r    rp  rq  rz   r  )rI   rI  r%  rc  r   r   rO   r   r2   r   rz   r  )rH   rS   r{   r,   r)   r|   rT   r}   r   rn  r   r  r  r   rW  rt  masked_lm_lossloss_fctr   s                      rK   r[   zData2VecTextForMaskedLM.forward{  s   , &1%<k$++B]B]$$))%'"7#9/!5# % 
 "!* LL9')HYY0778F%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rL   )NNNNNNNNNNNN)r_   r`   ra   rv  r0   rg  rl  r   r   rA   rw  r   r   r   r   r   r[   rc   rd   s   @rK   ry  ry  b  sQ   24JK$.  156:59371559=A>B-1,0/3&*7
E,,-7
 !!2!237
 !!1!12	7

 u//07
 E--.7
   1 127
  ((9(9:7
 !)):): ;7
 ))*7
 $D>7
 'tn7
 d^7
 
un$	%7
 7
rL   ry  c                   .     e Zd ZdZ fdZd Zd Z xZS )rb  z/Data2VecText Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _
        t        j                  t        j                  |j                              | _        | j                  | j                  _        y r   )r/   r0   r   rq   r3   r   r:   r;   
layer_normr2   rf  	ParameterrA   rD   r)  rG   s     rK   r0   zData2VecTextLMHead.__init__  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIrL   c                     | j                  |      }t        |      }| j                  |      }| j                  |      }|S r   )r   r   r  rf  rH   featuresrs  xs       rK   r[   zData2VecTextLMHead.forward  s;    JJx GOOA LLOrL   c                     | j                   j                  j                  j                  dk(  r| j                  | j                   _        y | j                   j                  | _        y )Nmeta)rf  r)  rO   typer?  s    rK   _tie_weightszData2VecTextLMHead._tie_weights  sC     <<##((F2 $		DLL))DIrL   )r_   r`   ra   rb   r0   r[   r  rc   rd   s   @rK   rb  rb    s    9&*rL   rb  z
    Data2VecText Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )%Data2VecTextForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |d      | _        t        |      | _        | j                          y NFr_  )	r/   r0   
num_labelsrI   r7  r%  Data2VecTextClassificationHead
classifierr;  rG   s     rK   r0   z.Data2VecTextForSequenceClassification.__init__  sK      ++.vO8@ 	rL   rS   r{   r,   r)   r|   rT   rn  r   r  r  r   c                 T   |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}||j	                  |j
                        }| j                   j                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j                        |j                  d            }n,| j                   j                  dk(  rt!               } |||      }|
s|f|d	d z   }||f|z   S |S t#        |||j$                  |j&                  
      S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr{   r,   r)   r|   rT   r   r  r  r   r   
regressionsingle_label_classificationmulti_label_classificationr*   r    r~  )rI   rI  r%  r  r   rO   problem_typer  r.   rA   rF   rn   r   squeezer   r   r   r   rz   r  rH   rS   r{   r,   r)   r|   rT   rn  r   r  r  r   rW  rq  rp  r  r   s                    rK   r[   z-Data2VecTextForSequenceClassification.forward  s   ( &1%<k$++B]B]$$))%'/!5# % 

 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rL   
NNNNNNNNNN)r_   r`   ra   r0   r   r   rA   rw  r   r   r   r   r   r[   rc   rd   s   @rK   r  r    s   	  156:59371559-1,0/3&*E
E,,-E
 !!2!23E
 !!1!12	E

 u//0E
 E--.E
   1 12E
 ))*E
 $D>E
 'tnE
 d^E
 
u..	/E
 E
rL   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )Data2VecTextForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr   )r/   r0   r7  r%  r   r<   r=   r>   rq   r3   r  r;  rG   s     rK   r0   z&Data2VecTextForMultipleChoice.__init__3  sW     .v6zz&"<"<=))F$6$6: 	rL   rS   r,   r{   rn  r)   r|   rT   r   r  r  r   c                    |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|.t               }|j                  |j                        } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr   r*   r   )r)   r,   r{   r|   rT   r   r  r  r    r~  )rI   rI  r   r   rE   r%  r>   r  r   r   rO   r   rz   r  )rH   rS   r,   r{   rn  r)   r|   rT   r   r  r  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r"  rq  reshaped_logitsrp  r  r   s                           rK   r[   z%Data2VecTextForMultipleChoice.forward=  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 $$*..,/!5# % 

  
]3/ ++b+6')HYY556FOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rL   r  )r_   r`   ra   r0   r   r   rA   rw  r   r   r   r   r   r[   rc   rd   s   @rK   r  r  1  s     15596:-1371559,0/3&*Y
E,,-Y
 !!1!12Y
 !!2!23	Y

 ))*Y
 u//0Y
 E--.Y
   1 12Y
 $D>Y
 'tnY
 d^Y
 
u//	0Y
 Y
rL   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )"Data2VecTextForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r  )r/   r0   r  r7  r%  classifier_dropoutr=   r   r<   r>   rq   r3   r  r;  rH   rI   r  rJ   s      rK   r0   z+Data2VecTextForTokenClassification.__init__  s      ++.vO)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rL   rS   r{   r,   r)   r|   rT   rn  r   r  r  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|Wt               }|j                  |j                        } ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r*   r    r~  )rI   rI  r%  r>   r  r   r   rO   r   r  r   rz   r  r  s                    rK   r[   z*Data2VecTextForTokenClassification.forward  s
   $ &1%<k$++B]B]$$))%'/!5# % 

 "!*,,71')HYYv}}-FFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rL   r  )r_   r`   ra   r0   r   r   rA   rw  r   r   r   r   r   r[   rc   rd   s   @rK   r  r    s     156:59371559-1,0/3&*4
E,,-4
 !!2!234
 !!1!12	4

 u//04
 E--.4
   1 124
 ))*4
 $D>4
 'tn4
 d^4
 
u++	,4
 4
rL   r  c                   (     e Zd ZdZ fdZd Z xZS )r  z-Head for sentence-level classification tasks.c                 Z   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        y r   )r/   r0   r   rq   r3   r   r  r=   r<   r>   r  out_projr  s      rK   r0   z'Data2VecTextClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrL   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r   )r>   r   rA   tanhr  r  s       rK   r[   z&Data2VecTextClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!rL   )r_   r`   ra   rb   r0   r[   rc   rd   s   @rK   r  r    s    7IrL   r  c                   d    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   de
eef   fd       Z xZS ) Data2VecTextForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r  )
r/   r0   r  r7  r%  r   rq   r3   
qa_outputsr;  rG   s     rK   r0   z)Data2VecTextForQuestionAnswering.__init__  sV      ++.vO))F$6$68I8IJ 	rL   rS   r{   r,   r)   r|   rT   start_positionsend_positionsr   r  r  r   c                 (   ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r   r*   r   )ignore_indexr    )rp  start_logits
end_logitsrz   r  )rI   rI  r%  r  splitr  r   r   rE   clampr   r   rz   r  )rH   rS   r{   r,   r)   r|   rT   r  r  r   r  r  r   rW  rq  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rK   r[   z(Data2VecTextForQuestionAnswering.forward  s    &1%<k$++B]B]$$))%'/!5# % 

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rL   )NNNNNNNNNNN)r_   r`   ra   r0   r   r   rA   rw  r   r   r   r   r   r[   rc   rd   s   @rK   r  r    s$     156:593715596:48,0/3&*>
E,,->
 !!2!23>
 !!1!12	>

 u//0>
 E--.>
   1 12>
 "%"2"23>
   0 01>
 $D>>
 'tn>
 d^>
 
u22	3>
 >
rL   r  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r   )nern   rA   cumsumtype_asrF   )rS   r$   rU   maskincremental_indicess        rK   rP   rP   G  sW     <<$((*D <<!4<<TBE[[_cc##%33rL   )r[  ry  r  r  r  r  r7  r$  )r   )Erb   r   typingr   r   rA   torch.utils.checkpointr   torch.nnr   r   r   activationsr
   r   cache_utilsr   r   
generationr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_data2vec_textr   
get_loggerr_   r  _HIDDEN_STATES_START_POSITIONModuler"   rf   r   r   r   r   r   r   r   r  r$  r7  r[  ry  rb  r  r  r  r  r  rP   __all__r  rL   rK   <module>r     sN   "  "    A A ' 5 ) 9	 	 	 . l l , ; 
		H	% !" V=BII V=t@.		 @.HRYY  &( $2BII 2lryy   D2 DPY
")) Y
z  ./ . .2 b
3 b
 b
J 
k
9? k

k
\ P
9 P
 P
h* *> R
,G R
R
j e
$? e
 e
P D
)D D
 D
PRYY , J
'B J
 J
Z4 	rL   